fix(search): filter forum/magazin/listing URLs from web search results
Blocks common non-recipe paths such as /forum/, /magazin/, /suche/ and /themen/, as well as Chefkoch's /rs/s\d+/ search URLs and /Rezepte.html listings.
Before: a search for 'ravioli' returned forum threads and listing pages, which triggered 'No schema.org/Recipe JSON-LD' on preview.
After: only URLs that look like real recipe pages pass through.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -30,6 +30,53 @@ function hostnameFromUrl(url: string): string | null {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * URL patterns for pages that are clearly NOT a single recipe page on
 * common domains (forum threads, magazine articles, search results,
 * category/tag listings, legal pages, ...).
 *
 * Broad enough to cover forum/magazine/listing URLs seen in the wild across
 * chefkoch.de, emmikochteinfach.de, experimente-aus-meiner-kueche.de etc.
 *
 * The patterns are tested against `pathname + search`, so any pattern that
 * anchors at the end of the path must also accept a following `?`;
 * otherwise a query string such as `?page=2` would defeat the anchor.
 * None of the patterns use the `g` flag, so `test()` is stateless.
 */
const NON_RECIPE_PATH_PATTERNS: RegExp[] = [
  /\/forum\//i,
  /\/magazin\//i,
  /\/magazine\//i,
  /\/suche($|\/|\?)/i,
  /\/search($|\/|\?)/i,
  /\/benutzer\//i,
  /\/profil\//i,
  /\/autoren\//i,
  /\/themen\//i,
  /\/kategorie\//i,
  /\/kategorien\//i,
  /\/cook-and-style\//i,
  /\/tag\//i,
  /\/rezepte\/?($|\?)/i, // "/rezepte/" listing root, with or without a query string
  /\/rezepte\/kategorien/i,
  /\/rezepte\/was-kocht/i,
  /\/gewinnspiel/i,
  /\/impressum/i,
  /\/datenschutz/i,
  /\/ueber-(uns|mich)/i,
  // Chefkoch-specific search/listing URLs: "/rs/s0", "/rs/s0/...", "/rs/s0?..."
  /\/rs\/s\d+($|\/|\?)/i,
  /Rezepte\.html/i // /rs/.../Rezepte.html is a listing
];
|
||||||
|
|
||||||
|
/**
 * Heuristically decides whether a URL could point at a single recipe page.
 *
 * The URL's path plus query string is checked against every entry of
 * NON_RECIPE_PATH_PATTERNS; any match rejects the URL. A bare root path
 * ("/" without a query string) or an empty path is rejected as well,
 * since that is a landing page rather than a recipe.
 *
 * @param url Absolute URL to classify.
 * @returns `false` for URLs that cannot be parsed, match a non-recipe
 *          pattern, or are just the site root; `true` otherwise.
 */
function looksLikeRecipePage(url: string): boolean {
  try {
    const parsed = new URL(url);
    const target = parsed.pathname + parsed.search;
    const isKnownNonRecipe = NON_RECIPE_PATH_PATTERNS.some((pattern) => pattern.test(target));
    if (isKnownNonRecipe) {
      return false;
    }
    // Only the bare root (or an empty path) is treated as a landing page here.
    return target !== '/' && target !== '';
  } catch {
    // Anything the URL parser rejects is not a usable recipe link.
    return false;
  }
}
|
||||||
|
|
||||||
export async function searchWeb(
|
export async function searchWeb(
|
||||||
db: Database.Database,
|
db: Database.Database,
|
||||||
query: string,
|
query: string,
|
||||||
@@ -63,6 +110,7 @@ export async function searchWeb(
|
|||||||
for (const r of results) {
|
for (const r of results) {
|
||||||
const host = hostnameFromUrl(r.url);
|
const host = hostnameFromUrl(r.url);
|
||||||
if (!host || !allowed.has(host)) continue;
|
if (!host || !allowed.has(host)) continue;
|
||||||
|
if (!looksLikeRecipePage(r.url)) continue;
|
||||||
if (seen.has(r.url)) continue;
|
if (seen.has(r.url)) continue;
|
||||||
seen.add(r.url);
|
seen.add(r.url);
|
||||||
hits.push({
|
hits.push({
|
||||||
|
|||||||
@@ -71,4 +71,20 @@ describe('searchWeb', () => {
|
|||||||
const hits = await searchWeb(db, ' ', { searxngUrl: baseUrl });
|
const hits = await searchWeb(db, ' ', { searxngUrl: baseUrl });
|
||||||
expect(hits).toEqual([]);
|
expect(hits).toEqual([]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('filters out forum/magazine/listing URLs', async () => {
|
||||||
|
const db = openInMemoryForTest();
|
||||||
|
addDomain(db, 'chefkoch.de');
|
||||||
|
respondWith([
|
||||||
|
{ url: 'https://www.chefkoch.de/rezepte/123/Ravioli.html', title: 'Ravioli' },
|
||||||
|
{ url: 'https://www.chefkoch.de/forum/2,17,89865/ravioli.html', title: 'Forum Ravioli' },
|
||||||
|
{ url: 'https://www.chefkoch.de/magazin/artikel/x.html', title: 'Magazin' },
|
||||||
|
{ url: 'https://www.chefkoch.de/suche/ravioli', title: 'Suche' },
|
||||||
|
{ url: 'https://www.chefkoch.de/themen/ravioli/', title: 'Themen' },
|
||||||
|
{ url: 'https://www.chefkoch.de/rezepte/', title: 'Rezepte Übersicht' }
|
||||||
|
]);
|
||||||
|
const hits = await searchWeb(db, 'ravioli', { searxngUrl: baseUrl });
|
||||||
|
expect(hits.length).toBe(1);
|
||||||
|
expect(hits[0].title).toBe('Ravioli');
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|||||||
Reference in New Issue
Block a user