fix(search): filter forum/magazin/listing URLs from web search results
Blocks common non-recipe paths such as /forum/, /magazin/, /suche/ and /themen/, as well as Chefkoch's /rs/s\d+/ search URLs and /Rezepte.html listings.

Before: a search for 'ravioli' returned forum threads and listing pages, which triggered 'No schema.org/Recipe JSON-LD' on preview.
After: only real recipe URLs pass through.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -30,6 +30,53 @@ function hostnameFromUrl(url: string): string | null {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * URL path fragments that mark a page as something other than a single
 * recipe: forums, magazines, site search, user/author profiles, topic and
 * category listings, tag archives, and legal/about pages.
 *
 * Broad enough to cover forum/magazine/listing URLs seen in the wild across
 * chefkoch.de, emmikochteinfach.de, experimente-aus-meiner-kueche.de etc.
 *
 * Every pattern is case-insensitive and none carries the `g` flag, so
 * `RegExp.prototype.test` keeps no `lastIndex` state between calls and the
 * same instances can be reused for every URL.
 */
const NON_RECIPE_PATH_PATTERNS: RegExp[] = [
  // Community and editorial sections
  /\/forum\//i,
  /\/magazin\//i,
  /\/magazine\//i,

  // Site search (bare path, sub-path, or followed by a query string)
  /\/suche($|\/|\?)/i,
  /\/search($|\/|\?)/i,

  // User and author profiles
  /\/benutzer\//i,
  /\/profil\//i,
  /\/autoren\//i,

  // Topic, category and tag listings
  /\/themen\//i,
  /\/kategorie\//i,
  /\/kategorien\//i,
  /\/cook-and-style\//i,
  /\/tag\//i,
  /\/rezepte\/?$/i, // "/rezepte/" listing root
  /\/rezepte\/kategorien/i,
  /\/rezepte\/was-kocht/i,

  // Promotions, legal and about pages
  /\/gewinnspiel/i,
  /\/impressum/i,
  /\/datenschutz/i,
  /\/ueber-(uns|mich)/i,

  // Chefkoch-specific search/listing URLs
  /\/rs\/s\d+\//i,
  /\/rs\/s\d+$/i,
  /Rezepte\.html/i, // /rs/.../Rezepte.html is a listing
];
|
||||
|
||||
/**
 * Decides whether a search-result URL plausibly points at a single recipe
 * page rather than a forum thread, listing, search page or site root.
 *
 * @param url - URL string taken from a search result.
 * @returns `false` when the URL cannot be parsed, when its pathname is the
 *   site root (with or without a query string), or when its path matches one
 *   of `NON_RECIPE_PATH_PATTERNS`; `true` otherwise.
 */
function looksLikeRecipePage(url: string): boolean {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    // Relative or malformed input fails to parse; treat it as non-recipe.
    return false;
  }

  const { pathname, search } = parsed;

  // Landing pages carry no recipe. Compare the pathname alone so a root URL
  // with any query string (e.g. "/?utm_source=x") is rejected as well.
  if (pathname === '/' || pathname === '') return false;

  // Check the bare pathname first: `$`-anchored patterns such as
  // /\/rezepte\/?$/ would otherwise stop matching as soon as a query string
  // is appended ("/rezepte/?page=2"). The path+query form is still checked so
  // that URLs previously rejected because of their query stay rejected.
  const candidates =
    search === '' ? [pathname] : [pathname, pathname + search];

  return !NON_RECIPE_PATH_PATTERNS.some((rx) =>
    candidates.some((candidate) => rx.test(candidate)),
  );
}
|
||||
|
||||
export async function searchWeb(
|
||||
db: Database.Database,
|
||||
query: string,
|
||||
@@ -63,6 +110,7 @@ export async function searchWeb(
|
||||
for (const r of results) {
|
||||
const host = hostnameFromUrl(r.url);
|
||||
if (!host || !allowed.has(host)) continue;
|
||||
if (!looksLikeRecipePage(r.url)) continue;
|
||||
if (seen.has(r.url)) continue;
|
||||
seen.add(r.url);
|
||||
hits.push({
|
||||
|
||||
Reference in New Issue
Block a user