From 342ea0efc89d115bb8db8b77f1b255d2eeb42696 Mon Sep 17 00:00:00 2001 From: hsiegeln <37154749+hsiegeln@users.noreply.github.com> Date: Fri, 17 Apr 2026 22:20:22 +0200 Subject: [PATCH] feat(search): Treffer ohne Recipe-JSON-LD rausfiltern MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wir fetchen die Trefferseite sowieso schon fürs Thumbnail — prüfen jetzt in der gleichen HTML-Parse-Runde, ob überhaupt ein schema.org/Recipe JSON-LD vorhanden ist. Fehlt es, wird der Treffer aus der Liste entfernt, weil der Importer auf dieser Seite später sowieso mit „Diese Seite enthält kein Rezept" scheitern würde. - Migration 007: thumbnail_cache.has_recipe (NULL=unbekannt, 0=nein, 1=ja). - Fetch-Fehler hinterlassen NULL → Treffer bleibt konservativ sichtbar. - Neue export `hasRecipeJsonLd(html)` in json-ld-recipe.ts. - Alle Cache-Reads/Writes nehmen den neuen Wert mit. Tests: +2 für Filter/Failover, bestehende Thumbnail-Tests mit Recipe-JSON-LD-Stub ergänzt, damit sie nicht selber rausgefiltert werden. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../007_thumbnail_cache_has_recipe.sql | 11 +++ src/lib/server/parsers/json-ld-recipe.ts | 8 ++ src/lib/server/search/searxng.ts | 78 ++++++++++++------- tests/integration/searxng.test.ts | 61 +++++++++++++-- 4 files changed, 125 insertions(+), 33 deletions(-) create mode 100644 src/lib/server/db/migrations/007_thumbnail_cache_has_recipe.sql diff --git a/src/lib/server/db/migrations/007_thumbnail_cache_has_recipe.sql b/src/lib/server/db/migrations/007_thumbnail_cache_has_recipe.sql new file mode 100644 index 0000000..2110675 --- /dev/null +++ b/src/lib/server/db/migrations/007_thumbnail_cache_has_recipe.sql @@ -0,0 +1,11 @@ +-- Erweitert thumbnail_cache um ein has_recipe-Flag. Beim Thumbnail- +-- Enrichment checken wir, ob die Seite überhaupt ein schema.org/Recipe +-- JSON-LD enthält — sonst kann der Importer das Rezept später sowieso +-- nicht extrahieren, und der User sieht nur die „Diese Seite enthält +-- kein Rezept"-Fehlermeldung. +-- +-- NULL = unbekannt (vor dieser Migration gecached oder Fetch schlug fehl, +-- dann behalten wir den Treffer konservativ); +-- 0 = gesicherter Nicht-Treffer (ausblenden); +-- 1 = Rezept vorhanden. +ALTER TABLE thumbnail_cache ADD COLUMN has_recipe INTEGER; diff --git a/src/lib/server/parsers/json-ld-recipe.ts b/src/lib/server/parsers/json-ld-recipe.ts index 0ffeed1..1827bb0 100644 --- a/src/lib/server/parsers/json-ld-recipe.ts +++ b/src/lib/server/parsers/json-ld-recipe.ts @@ -106,6 +106,14 @@ function findRecipeNode(html: string): JsonLdNode | null { return null; } +export function hasRecipeJsonLd(html: string): boolean { + try { + return findRecipeNode(html) !== null; + } catch { + return false; + } +} + export function extractRecipeFromHtml(html: string): Recipe | null { const node = findRecipeNode(html); if (!node) return null; diff --git a/src/lib/server/search/searxng.ts b/src/lib/server/search/searxng.ts index 09b2062..7d5da30 100644 --- a/src/lib/server/search/searxng.ts +++ b/src/lib/server/search/searxng.ts @@ -2,6 +2,7 @@ import type Database from 'better-sqlite3'; import { parseHTML } from 'linkedom'; import { listDomains, normalizeDomain } from '../domains/repository'; import { fetchText } from '../http'; +import { hasRecipeJsonLd } from '../parsers/json-ld-recipe'; export type WebHit = { url: string; @@ -182,70 +183,91 @@ function extractPageImage(html: string, baseUrl: string): string | null { const THUMB_TTL_DAYS = Number(process.env.KOCHWAS_THUMB_TTL_DAYS ?? 30); const THUMB_TTL_MS = THUMB_TTL_DAYS * 24 * 60 * 60 * 1000; -function readCachedThumbnail( +type PageMeta = { + image: string | null; + hasRecipe: 0 | 1 | null; +}; + +function readCachedPageMeta( db: Database.Database, url: string -): { image: string | null } | null { +): PageMeta | null { const row = db - .prepare<[string, string], { image: string | null }>( - "SELECT image FROM thumbnail_cache WHERE url = ? AND expires_at > ?" + .prepare< + [string, string], + { image: string | null; has_recipe: 0 | 1 | null } + >( + 'SELECT image, has_recipe FROM thumbnail_cache WHERE url = ? AND expires_at > ?' ) .get(url, new Date().toISOString()); - return row ?? null; + if (!row) return null; + return { image: row.image, hasRecipe: row.has_recipe }; } -function writeCachedThumbnail( +function writeCachedPageMeta( db: Database.Database, url: string, - image: string | null + meta: PageMeta ): void { const expiresAt = new Date(Date.now() + THUMB_TTL_MS).toISOString(); db.prepare( - 'INSERT OR REPLACE INTO thumbnail_cache (url, image, expires_at) VALUES (?, ?, ?)' - ).run(url, image, expiresAt); + 'INSERT OR REPLACE INTO thumbnail_cache (url, image, expires_at, has_recipe) VALUES (?, ?, ?, ?)' + ).run(url, meta.image, expiresAt, meta.hasRecipe); } -async function enrichThumbnail( +async function enrichPageMeta( db: Database.Database, url: string -): Promise { - const cached = readCachedThumbnail(db, url); - if (cached) return cached.image; - let image: string | null = null; +): Promise { + const cached = readCachedPageMeta(db, url); + if (cached) return cached; + let meta: PageMeta = { image: null, hasRecipe: null }; try { const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 512 * 1024 }); - image = extractPageImage(html, url); + meta = { + image: extractPageImage(html, url), + hasRecipe: hasRecipeJsonLd(html) ? 1 : 0 + }; } catch { - image = null; + // Fetch failed — leave hasRecipe null (unknown) so we don't permanently + // hide a temporary-network-error URL. } - writeCachedThumbnail(db, url, image); - return image; + writeCachedPageMeta(db, url, meta); + return meta; } -async function enrichAllThumbnails( +async function enrichAndFilterHits( db: Database.Database, hits: WebHit[] -): Promise { - // Always fetch the page image even when SearXNG gave us a thumbnail — - // the search engine's thumbnail is typically 150-200px, while og:image - // / JSON-LD image on the page is the full-resolution recipe photo. - // The thumbnail_cache table (default 30-day TTL) makes repeat searches instant. - if (hits.length === 0) return; +): Promise { + // Always fetch the page even when SearXNG gave us a thumbnail — we need + // the HTML anyway for the high-res og:image AND to confirm a Recipe + // JSON-LD actually exists. The thumbnail_cache table (default 30-day TTL) + // makes repeat searches instant. + if (hits.length === 0) return hits; // Lazy cleanup of expired entries — O(log n) index scan, cheap. db.prepare('DELETE FROM thumbnail_cache WHERE expires_at <= ?').run( new Date().toISOString() ); + const metas = new Map(); const queue = [...hits]; const LIMIT = 6; const workers = Array.from({ length: Math.min(LIMIT, queue.length) }, async () => { while (queue.length > 0) { const h = queue.shift(); if (!h) break; - const image = await enrichThumbnail(db, h.url); - if (image) h.thumbnail = image; + metas.set(h.url, await enrichPageMeta(db, h.url)); } }); await Promise.all(workers); + // Drop confirmed-non-recipe pages (hasRecipe === 0). Keep unknown (null) + // and confirmed recipes (1). + return hits + .filter((h) => metas.get(h.url)?.hasRecipe !== 0) + .map((h) => { + const image = metas.get(h.url)?.image; + return image ? { ...h, thumbnail: image } : h; + }); } export async function searchWeb( @@ -310,7 +332,7 @@ export async function searchWeb( if (hits.length >= limit) break; } if (opts.enrichThumbnails !== false) { - await enrichAllThumbnails(db, hits); + return await enrichAndFilterHits(db, hits); } return hits; } diff --git a/tests/integration/searxng.test.ts b/tests/integration/searxng.test.ts index 476c1bd..f8bc071 100644 --- a/tests/integration/searxng.test.ts +++ b/tests/integration/searxng.test.ts @@ -100,11 +100,62 @@ describe('searchWeb', () => { expect(receivedPageno).toBe(null); }); + it('drops hits whose page lacks a Recipe JSON-LD', async () => { + const pageServer = createServer((req, res) => { + res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' }); + if (req.url === '/with-recipe') { + res.end(` + + `); + } else { + // forum page: no Recipe JSON-LD + res.end('ForumDiskussion'); + } + }); + await new Promise((r) => pageServer.listen(0, '127.0.0.1', r)); + const addr = pageServer.address() as AddressInfo; + try { + const db = openInMemoryForTest(); + addDomain(db, '127.0.0.1'); + respondWith([ + { url: `http://127.0.0.1:${addr.port}/with-recipe`, title: 'Recipe', content: '' }, + { url: `http://127.0.0.1:${addr.port}/forum-thread`, title: 'Forum', content: '' } + ]); + const hits = await searchWeb(db, 'x', { searxngUrl: baseUrl }); + expect(hits.length).toBe(1); + expect(hits[0].url.endsWith('/with-recipe')).toBe(true); + } finally { + await new Promise((r) => pageServer.close(() => r())); + } + }); + + it('keeps hit when page fetch fails (unknown recipe status)', async () => { + const db = openInMemoryForTest(); + addDomain(db, '127.0.0.1'); + // URL points to a port nobody listens on → fetch fails + respondWith([ + { url: 'http://127.0.0.1:1/unreachable', title: 'Unreachable', content: '' } + ]); + const hits = await searchWeb(db, 'x', { searxngUrl: baseUrl }); + expect(hits.length).toBe(1); + }); + + // Minimal Recipe-JSON-LD stub so enrichAndFilterHits doesn't drop test hits + // as non-recipe pages. Used in tests that focus on thumbnail extraction. + const RECIPE_LD = ``; + it('enriches missing thumbnails from og:image', async () => { const pageServer = createServer((_req, res) => { res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' }); res.end( - '' + `${RECIPE_LD}` ); }); await new Promise((r) => pageServer.listen(0, '127.0.0.1', r)); @@ -151,7 +202,7 @@ describe('searchWeb', () => { const pageServer = createServer((_req, res) => { res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' }); res.end( - '
' + `${RECIPE_LD}
` ); }); await new Promise((r) => pageServer.listen(0, '127.0.0.1', r)); @@ -172,7 +223,7 @@ describe('searchWeb', () => { const pageServer = createServer((_req, res) => { res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' }); res.end( - '' + `${RECIPE_LD}` ); }); await new Promise((r) => pageServer.listen(0, '127.0.0.1', r)); @@ -194,7 +245,7 @@ describe('searchWeb', () => { it('keeps SearXNG thumbnail when page has no image', async () => { const pageServer = createServer((_req, res) => { res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' }); - res.end('no images here'); + res.end(`${RECIPE_LD}no images here`); }); await new Promise((r) => pageServer.listen(0, '127.0.0.1', r)); const addr = pageServer.address() as AddressInfo; @@ -217,7 +268,7 @@ describe('searchWeb', () => { const pageServer = createServer((_req, res) => { pageHits += 1; res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' }); - res.end(''); + res.end(`${RECIPE_LD}`); }); await new Promise((r) => pageServer.listen(0, '127.0.0.1', r)); const addr = pageServer.address() as AddressInfo;