feat(search): Treffer ohne Recipe-JSON-LD rausfiltern
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m17s

Wir fetchen die Trefferseite sowieso schon fürs Thumbnail und prüfen
jetzt in der gleichen HTML-Parse-Runde, ob überhaupt ein
schema.org/Recipe JSON-LD vorhanden ist. Fehlt es, wird der Treffer
aus der Liste entfernt, weil der Importer auf dieser Seite später
sowieso mit „Diese Seite enthält kein Rezept“ scheitern würde.

- Migration 007: thumbnail_cache.has_recipe (NULL=unbekannt, 0=nein, 1=ja).
- Fetch-Fehler hinterlassen NULL → Treffer bleibt konservativ sichtbar.
- Neuer Export `hasRecipeJsonLd(html)` in json-ld-recipe.ts.
- Alle Cache-Reads/Writes nehmen den neuen Wert mit.

Tests: +2 für Filter/Failover, bestehende Thumbnail-Tests mit
Recipe-JSON-LD-Stub ergänzt, damit sie nicht selber rausgefiltert
werden.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
hsiegeln
2026-04-17 22:20:22 +02:00
parent dbc9646caa
commit 342ea0efc8
4 changed files with 125 additions and 33 deletions

View File

@@ -0,0 +1,11 @@
-- Adds a has_recipe flag to thumbnail_cache. During thumbnail enrichment
-- we check whether the page contains a schema.org/Recipe JSON-LD at all —
-- otherwise the importer cannot extract the recipe later anyway, and the
-- user only sees the "Diese Seite enthält kein Rezept" ("this page
-- contains no recipe") error message.
--
-- NULL = unknown (cached before this migration, or the fetch failed;
-- in that case the hit is conservatively kept);
-- 0 = confirmed non-recipe page (hide it);
-- 1 = recipe present.
ALTER TABLE thumbnail_cache ADD COLUMN has_recipe INTEGER;

View File

@@ -106,6 +106,14 @@ function findRecipeNode(html: string): JsonLdNode | null {
return null; return null;
} }
/**
 * Reports whether `findRecipeNode` can locate a recipe node in the given HTML.
 *
 * This is a boolean-only probe: it never throws. Any exception raised while
 * searching is treated the same as "no recipe node found".
 *
 * @param html - Raw HTML of the page to inspect.
 * @returns `true` when `findRecipeNode` yields a non-null node; `false` when
 *   it yields `null` or throws.
 */
export function hasRecipeJsonLd(html: string): boolean {
  let recipeFound = false;
  try {
    const node = findRecipeNode(html);
    recipeFound = node !== null;
  } catch {
    // A failure while searching counts as "no recipe", not as an error.
    recipeFound = false;
  }
  return recipeFound;
}
export function extractRecipeFromHtml(html: string): Recipe | null { export function extractRecipeFromHtml(html: string): Recipe | null {
const node = findRecipeNode(html); const node = findRecipeNode(html);
if (!node) return null; if (!node) return null;

View File

@@ -2,6 +2,7 @@ import type Database from 'better-sqlite3';
import { parseHTML } from 'linkedom'; import { parseHTML } from 'linkedom';
import { listDomains, normalizeDomain } from '../domains/repository'; import { listDomains, normalizeDomain } from '../domains/repository';
import { fetchText } from '../http'; import { fetchText } from '../http';
import { hasRecipeJsonLd } from '../parsers/json-ld-recipe';
export type WebHit = { export type WebHit = {
url: string; url: string;
@@ -182,70 +183,91 @@ function extractPageImage(html: string, baseUrl: string): string | null {
const THUMB_TTL_DAYS = Number(process.env.KOCHWAS_THUMB_TTL_DAYS ?? 30); const THUMB_TTL_DAYS = Number(process.env.KOCHWAS_THUMB_TTL_DAYS ?? 30);
const THUMB_TTL_MS = THUMB_TTL_DAYS * 24 * 60 * 60 * 1000; const THUMB_TTL_MS = THUMB_TTL_DAYS * 24 * 60 * 60 * 1000;
function readCachedThumbnail( type PageMeta = {
image: string | null;
hasRecipe: 0 | 1 | null;
};
function readCachedPageMeta(
db: Database.Database, db: Database.Database,
url: string url: string
): { image: string | null } | null { ): PageMeta | null {
const row = db const row = db
.prepare<[string, string], { image: string | null }>( .prepare<
"SELECT image FROM thumbnail_cache WHERE url = ? AND expires_at > ?" [string, string],
{ image: string | null; has_recipe: 0 | 1 | null }
>(
'SELECT image, has_recipe FROM thumbnail_cache WHERE url = ? AND expires_at > ?'
) )
.get(url, new Date().toISOString()); .get(url, new Date().toISOString());
return row ?? null; if (!row) return null;
return { image: row.image, hasRecipe: row.has_recipe };
} }
function writeCachedThumbnail( function writeCachedPageMeta(
db: Database.Database, db: Database.Database,
url: string, url: string,
image: string | null meta: PageMeta
): void { ): void {
const expiresAt = new Date(Date.now() + THUMB_TTL_MS).toISOString(); const expiresAt = new Date(Date.now() + THUMB_TTL_MS).toISOString();
db.prepare( db.prepare(
'INSERT OR REPLACE INTO thumbnail_cache (url, image, expires_at) VALUES (?, ?, ?)' 'INSERT OR REPLACE INTO thumbnail_cache (url, image, expires_at, has_recipe) VALUES (?, ?, ?, ?)'
).run(url, image, expiresAt); ).run(url, meta.image, expiresAt, meta.hasRecipe);
} }
async function enrichThumbnail( async function enrichPageMeta(
db: Database.Database, db: Database.Database,
url: string url: string
): Promise<string | null> { ): Promise<PageMeta> {
const cached = readCachedThumbnail(db, url); const cached = readCachedPageMeta(db, url);
if (cached) return cached.image; if (cached) return cached;
let image: string | null = null; let meta: PageMeta = { image: null, hasRecipe: null };
try { try {
const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 512 * 1024 }); const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 512 * 1024 });
image = extractPageImage(html, url); meta = {
image: extractPageImage(html, url),
hasRecipe: hasRecipeJsonLd(html) ? 1 : 0
};
} catch { } catch {
image = null; // Fetch failed — leave hasRecipe null (unknown) so we don't permanently
// hide a temporary-network-error URL.
} }
writeCachedThumbnail(db, url, image); writeCachedPageMeta(db, url, meta);
return image; return meta;
} }
async function enrichAllThumbnails( async function enrichAndFilterHits(
db: Database.Database, db: Database.Database,
hits: WebHit[] hits: WebHit[]
): Promise<void> { ): Promise<WebHit[]> {
// Always fetch the page image even when SearXNG gave us a thumbnail — // Always fetch the page even when SearXNG gave us a thumbnail — we need
// the search engine's thumbnail is typically 150-200px, while og:image // the HTML anyway for the high-res og:image AND to confirm a Recipe
// / JSON-LD image on the page is the full-resolution recipe photo. // JSON-LD actually exists. The thumbnail_cache table (default 30-day TTL)
// The thumbnail_cache table (default 30-day TTL) makes repeat searches instant. // makes repeat searches instant.
if (hits.length === 0) return; if (hits.length === 0) return hits;
// Lazy cleanup of expired entries — O(log n) index scan, cheap. // Lazy cleanup of expired entries — O(log n) index scan, cheap.
db.prepare('DELETE FROM thumbnail_cache WHERE expires_at <= ?').run( db.prepare('DELETE FROM thumbnail_cache WHERE expires_at <= ?').run(
new Date().toISOString() new Date().toISOString()
); );
const metas = new Map<string, PageMeta>();
const queue = [...hits]; const queue = [...hits];
const LIMIT = 6; const LIMIT = 6;
const workers = Array.from({ length: Math.min(LIMIT, queue.length) }, async () => { const workers = Array.from({ length: Math.min(LIMIT, queue.length) }, async () => {
while (queue.length > 0) { while (queue.length > 0) {
const h = queue.shift(); const h = queue.shift();
if (!h) break; if (!h) break;
const image = await enrichThumbnail(db, h.url); metas.set(h.url, await enrichPageMeta(db, h.url));
if (image) h.thumbnail = image;
} }
}); });
await Promise.all(workers); await Promise.all(workers);
// Drop confirmed-non-recipe pages (hasRecipe === 0). Keep unknown (null)
// and confirmed recipes (1).
return hits
.filter((h) => metas.get(h.url)?.hasRecipe !== 0)
.map((h) => {
const image = metas.get(h.url)?.image;
return image ? { ...h, thumbnail: image } : h;
});
} }
export async function searchWeb( export async function searchWeb(
@@ -310,7 +332,7 @@ export async function searchWeb(
if (hits.length >= limit) break; if (hits.length >= limit) break;
} }
if (opts.enrichThumbnails !== false) { if (opts.enrichThumbnails !== false) {
await enrichAllThumbnails(db, hits); return await enrichAndFilterHits(db, hits);
} }
return hits; return hits;
} }

View File

@@ -100,11 +100,62 @@ describe('searchWeb', () => {
expect(receivedPageno).toBe(null); expect(receivedPageno).toBe(null);
}); });
// Verifies that searchWeb removes a hit whose fetched page has no Recipe
// JSON-LD while keeping a hit whose page does contain one.
it('drops hits whose page lacks a Recipe JSON-LD', async () => {
// Local HTTP server standing in for the result pages: '/with-recipe' serves
// a page with a Recipe JSON-LD script, every other path serves a plain page.
const pageServer = createServer((req, res) => {
res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' });
if (req.url === '/with-recipe') {
res.end(`<html><head>
<script type="application/ld+json">${JSON.stringify({
'@type': 'Recipe',
name: 'Pie',
image: 'https://cdn.example/pie.jpg'
})}</script>
</head></html>`);
} else {
// forum page: no Recipe JSON-LD
res.end('<html><head><title>Forum</title></head><body>Diskussion</body></html>');
}
});
// Listen on port 0; the actual port is read back from address() below.
await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r));
const addr = pageServer.address() as AddressInfo;
try {
const db = openInMemoryForTest();
// NOTE(review): presumably registers the host so its hits are accepted —
// confirm against addDomain.
addDomain(db, '127.0.0.1');
// NOTE(review): presumably stubs the search backend's result list —
// confirm against respondWith.
respondWith([
{ url: `http://127.0.0.1:${addr.port}/with-recipe`, title: 'Recipe', content: '' },
{ url: `http://127.0.0.1:${addr.port}/forum-thread`, title: 'Forum', content: '' }
]);
const hits = await searchWeb(db, 'x', { searxngUrl: baseUrl });
// Only the recipe page may survive; the forum page must be filtered out.
expect(hits.length).toBe(1);
expect(hits[0].url.endsWith('/with-recipe')).toBe(true);
} finally {
// Always shut the page server down, even when an assertion above fails.
await new Promise<void>((r) => pageServer.close(() => r()));
}
});
// Verifies that a hit stays in the result list when its page cannot be
// fetched, i.e. when the recipe status is unknown rather than confirmed absent.
it('keeps hit when page fetch fails (unknown recipe status)', async () => {
const db = openInMemoryForTest();
// NOTE(review): presumably registers the host so its hits are accepted —
// confirm against addDomain.
addDomain(db, '127.0.0.1');
// URL points to a port nobody listens on → fetch fails
respondWith([
{ url: 'http://127.0.0.1:1/unreachable', title: 'Unreachable', content: '' }
]);
const hits = await searchWeb(db, 'x', { searxngUrl: baseUrl });
// The single unreachable hit must still be returned.
expect(hits.length).toBe(1);
});
// Minimal Recipe JSON-LD stub so enrichAndFilterHits doesn't drop test hits
// as non-recipe pages. Used in tests that focus on thumbnail extraction.
// It carries only '@type' and 'name' — no image — so it does not itself
// supply an image to the pages it is embedded in.
const RECIPE_LD = `<script type="application/ld+json">${JSON.stringify({
'@type': 'Recipe',
name: 'stub'
})}</script>`;
it('enriches missing thumbnails from og:image', async () => { it('enriches missing thumbnails from og:image', async () => {
const pageServer = createServer((_req, res) => { const pageServer = createServer((_req, res) => {
res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' }); res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' });
res.end( res.end(
'<html><head><meta property="og:image" content="https://cdn.example/foo.jpg" /></head><body></body></html>' `<html><head><meta property="og:image" content="https://cdn.example/foo.jpg" />${RECIPE_LD}</head><body></body></html>`
); );
}); });
await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r)); await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r));
@@ -151,7 +202,7 @@ describe('searchWeb', () => {
const pageServer = createServer((_req, res) => { const pageServer = createServer((_req, res) => {
res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' }); res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' });
res.end( res.end(
'<html><body><article><img src="/uploads/dish.jpg" alt=""></article></body></html>' `<html><head>${RECIPE_LD}</head><body><article><img src="/uploads/dish.jpg" alt=""></article></body></html>`
); );
}); });
await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r)); await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r));
@@ -172,7 +223,7 @@ describe('searchWeb', () => {
const pageServer = createServer((_req, res) => { const pageServer = createServer((_req, res) => {
res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' }); res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' });
res.end( res.end(
'<html><head><meta property="og:image" content="https://cdn.example/hq.jpg" /></head></html>' `<html><head><meta property="og:image" content="https://cdn.example/hq.jpg" />${RECIPE_LD}</head></html>`
); );
}); });
await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r)); await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r));
@@ -194,7 +245,7 @@ describe('searchWeb', () => {
it('keeps SearXNG thumbnail when page has no image', async () => { it('keeps SearXNG thumbnail when page has no image', async () => {
const pageServer = createServer((_req, res) => { const pageServer = createServer((_req, res) => {
res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' }); res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' });
res.end('<html><head></head><body>no images here</body></html>'); res.end(`<html><head>${RECIPE_LD}</head><body>no images here</body></html>`);
}); });
await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r)); await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r));
const addr = pageServer.address() as AddressInfo; const addr = pageServer.address() as AddressInfo;
@@ -217,7 +268,7 @@ describe('searchWeb', () => {
const pageServer = createServer((_req, res) => { const pageServer = createServer((_req, res) => {
pageHits += 1; pageHits += 1;
res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' }); res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' });
res.end('<html><head><meta property="og:image" content="https://cdn.example/c.jpg"></head></html>'); res.end(`<html><head><meta property="og:image" content="https://cdn.example/c.jpg">${RECIPE_LD}</head></html>`);
}); });
await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r)); await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r));
const addr = pageServer.address() as AddressInfo; const addr = pageServer.address() as AddressInfo;