fix(search): filter forum/magazin/listing URLs from web search results

Blocks common non-recipe paths like /forum/, /magazin/, /suche/, /themen/,
Chefkoch's /rs/s\d+/ search URLs and /Rezepte.html listings.

Before: a 'ravioli' search returned forum threads and listing pages, which
then failed with 'No schema.org/Recipe JSON-LD' on preview.
After: only real recipe URLs pass through.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-17 15:47:28 +02:00
parent bce9e87095
commit 041ef12582
2 changed files with 64 additions and 0 deletions

View File

@@ -30,6 +30,53 @@ function hostnameFromUrl(url: string): string | null {
}
}
/**
 * Pathname patterns that identify pages which are clearly NOT a single recipe
 * page. Broad enough to cover forum/magazine/listing URLs seen in the wild
 * across chefkoch.de, emmikochteinfach.de, experimente-aus-meiner-kueche.de etc.
 *
 * Every pattern is matched against `URL.pathname` only (never the query
 * string), so `$` reliably means "end of the path". None of the patterns use
 * the `g`/`y` flags, which keeps `RegExp.prototype.test` stateless.
 */
const NON_RECIPE_PATH_PATTERNS: readonly RegExp[] = [
  /\/forum\//i,
  /\/magazin\//i,
  /\/magazine\//i,
  /\/suche(\/|$)/i,
  /\/search(\/|$)/i,
  /\/benutzer\//i,
  /\/profil\//i,
  /\/autoren\//i,
  /\/themen\//i,
  /\/kategorie\//i,
  /\/kategorien\//i,
  /\/cook-and-style\//i,
  /\/tag\//i,
  /\/rezepte\/?$/i, // "/rezepte/" listing root
  /\/rezepte\/kategorien/i,
  /\/rezepte\/was-kocht/i,
  /\/gewinnspiel/i,
  /\/impressum/i,
  /\/datenschutz/i,
  /\/ueber-(uns|mich)/i,
  // Chefkoch-specific search/listing URLs
  /\/rs\/s\d+(\/|$)/i,
  /Rezepte\.html/i // /rs/.../Rezepte.html is a listing
];

/**
 * Heuristically decides whether a search-result URL could be a single recipe
 * page, as opposed to a forum thread, magazine article, search/listing page
 * or a site root.
 *
 * Only the URL's pathname is inspected. The query string is deliberately
 * ignored so that
 *  - end-anchored patterns still match listing URLs that carry parameters
 *    (e.g. `/rezepte/?page=2`),
 *  - a blocked segment that merely appears inside a query value
 *    (e.g. `?ref=/forum/`) does not reject an otherwise valid recipe URL, and
 *  - a site root with a query (e.g. `/?s=ravioli`) is still treated as a root.
 *
 * @param url Absolute URL of a search hit.
 * @returns `false` if `url` cannot be parsed, points at the site root, or its
 *          pathname matches one of `NON_RECIPE_PATH_PATTERNS`; `true` otherwise.
 */
function looksLikeRecipePage(url: string): boolean {
  let pathname: string;
  try {
    ({ pathname } = new URL(url));
  } catch {
    // Relative or otherwise malformed URLs can never be fetched as a recipe.
    return false;
  }

  // The bare site root is a landing page, not a recipe.
  if (pathname === '' || pathname === '/') return false;

  return !NON_RECIPE_PATH_PATTERNS.some((pattern) => pattern.test(pathname));
}
export async function searchWeb(
db: Database.Database,
query: string,
@@ -63,6 +110,7 @@ export async function searchWeb(
for (const r of results) {
const host = hostnameFromUrl(r.url);
if (!host || !allowed.has(host)) continue;
if (!looksLikeRecipePage(r.url)) continue;
if (seen.has(r.url)) continue;
seen.add(r.url);
hits.push({

View File

@@ -71,4 +71,20 @@ describe('searchWeb', () => {
const hits = await searchWeb(db, ' ', { searxngUrl: baseUrl });
expect(hits).toEqual([]);
});
it('filters out forum/magazine/listing URLs', async () => {
  const db = openInMemoryForTest();
  addDomain(db, 'chefkoch.de');

  // The single result that is expected to survive the path filter.
  const recipeResult = {
    url: 'https://www.chefkoch.de/rezepte/123/Ravioli.html',
    title: 'Ravioli'
  };
  // Forum, magazine, search, topic and listing-root pages: all expected to be dropped.
  const nonRecipeResults = [
    { url: 'https://www.chefkoch.de/forum/2,17,89865/ravioli.html', title: 'Forum Ravioli' },
    { url: 'https://www.chefkoch.de/magazin/artikel/x.html', title: 'Magazin' },
    { url: 'https://www.chefkoch.de/suche/ravioli', title: 'Suche' },
    { url: 'https://www.chefkoch.de/themen/ravioli/', title: 'Themen' },
    { url: 'https://www.chefkoch.de/rezepte/', title: 'Rezepte Übersicht' }
  ];
  respondWith([recipeResult, ...nonRecipeResults]);

  const hits = await searchWeb(db, 'ravioli', { searxngUrl: baseUrl });

  expect(hits.length).toBe(1);
  expect(hits[0].title).toBe('Ravioli');
});
});