From 041ef125822f4289d800d6ed499b50a2aaba8694 Mon Sep 17 00:00:00 2001
From: Hendrik
Date: Fri, 17 Apr 2026 15:47:28 +0200
Subject: [PATCH] fix(search): filter forum/magazin/listing URLs from web search results

Blocks common non-recipe paths like /forum/, /magazin/, /suche/, /themen/,
Chefkoch's /rs/s\d+/ search URLs and /Rezepte.html listings.

Before: 'ravioli' search returned forum threads and listing pages that
triggered 'No schema.org/Recipe JSON-LD' on preview.
After: only real recipe URLs pass through.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/lib/server/search/searxng.ts  | 48 +++++++++++++++++++++++++++++++
 tests/integration/searxng.test.ts | 16 +++++++++++
 2 files changed, 64 insertions(+)

diff --git a/src/lib/server/search/searxng.ts b/src/lib/server/search/searxng.ts
index fb412c2..08cb973 100644
--- a/src/lib/server/search/searxng.ts
+++ b/src/lib/server/search/searxng.ts
@@ -30,6 +30,53 @@ function hostnameFromUrl(url: string): string | null {
   }
 }
 
+// Paths that are clearly NOT a single recipe page on common domains.
+// Broad enough to cover forum/magazine/listing URLs we've seen in the wild
+// across chefkoch.de, emmikochteinfach.de, experimente-aus-meiner-kueche.de etc.
+const NON_RECIPE_PATH_PATTERNS: RegExp[] = [
+  /\/forum\//i,
+  /\/magazin\//i,
+  /\/magazine\//i,
+  /\/suche($|\/|\?)/i,
+  /\/search($|\/|\?)/i,
+  /\/benutzer\//i,
+  /\/profil\//i,
+  /\/autoren\//i,
+  /\/themen\//i,
+  /\/kategorie\//i,
+  /\/kategorien\//i,
+  /\/cook-and-style\//i,
+  /\/tag\//i,
+  /\/rezepte\/?$/i, // "/rezepte/" listing root
+  /\/rezepte\/kategorien/i,
+  /\/rezepte\/was-kocht/i,
+  /\/gewinnspiel/i,
+  /\/impressum/i,
+  /\/datenschutz/i,
+  /\/ueber-(uns|mich)/i,
+  // Chefkoch-specific search-/listing-URLs
+  /\/rs\/s\d+\//i,
+  /\/rs\/s\d+$/i,
+  /Rezepte\.html/i // /rs/.../Rezepte.html is a listing
+];
+
+/** True unless `url` is unparseable, a bare site root, or a known non-recipe path. */
+function looksLikeRecipePage(url: string): boolean {
+  try {
+    // Match the pathname only. With the query string appended, `$`-anchored
+    // patterns missed "/rezepte/?page=2" and the root check missed "/?s=ravioli".
+    const { pathname } = new URL(url);
+    if (pathname === '/' || pathname === '') return false;
+    for (const rx of NON_RECIPE_PATH_PATTERNS) {
+      if (rx.test(pathname)) return false;
+    }
+    return true;
+  } catch {
+    // new URL() throws on malformed input; such a hit cannot be a recipe page.
+    return false;
+  }
+}
+
 export async function searchWeb(
   db: Database.Database,
   query: string,
@@ -63,6 +110,7 @@ export async function searchWeb(
   for (const r of results) {
     const host = hostnameFromUrl(r.url);
     if (!host || !allowed.has(host)) continue;
+    if (!looksLikeRecipePage(r.url)) continue;
     if (seen.has(r.url)) continue;
     seen.add(r.url);
     hits.push({
diff --git a/tests/integration/searxng.test.ts b/tests/integration/searxng.test.ts
index 5dd9ba3..1d47ac0 100644
--- a/tests/integration/searxng.test.ts
+++ b/tests/integration/searxng.test.ts
@@ -71,4 +71,20 @@ describe('searchWeb', () => {
     const hits = await searchWeb(db, ' ', { searxngUrl: baseUrl });
     expect(hits).toEqual([]);
   });
+
+  it('filters out forum/magazine/listing URLs', async () => {
+    const db = openInMemoryForTest();
+    addDomain(db, 'chefkoch.de');
+    respondWith([
+      { url: 'https://www.chefkoch.de/rezepte/123/Ravioli.html', title: 'Ravioli' },
+      { url: 'https://www.chefkoch.de/forum/2,17,89865/ravioli.html', title: 'Forum Ravioli' },
+      { url: 'https://www.chefkoch.de/magazin/artikel/x.html', title: 'Magazin' },
+      { url: 'https://www.chefkoch.de/suche/ravioli', title: 'Suche' },
+      { url: 'https://www.chefkoch.de/themen/ravioli/', title: 'Themen' },
+      { url: 'https://www.chefkoch.de/rezepte/', title: 'Rezepte Übersicht' }
+    ]);
+    const hits = await searchWeb(db, 'ravioli', { searxngUrl: baseUrl });
+    expect(hits.length).toBe(1);
+    expect(hits[0].title).toBe('Ravioli');
+  });
 });