feat(search): Treffer ohne Recipe-JSON-LD rausfiltern
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m17s

Wir fetchen die Trefferseite ohnehin schon fürs Thumbnail und prüfen
jetzt in der gleichen HTML-Parse-Runde, ob überhaupt ein
schema.org/Recipe-JSON-LD vorhanden ist. Fehlt es, wird der Treffer
aus der Liste entfernt, weil der Importer auf dieser Seite später
sowieso mit „Diese Seite enthält kein Rezept“ scheitern würde.

- Migration 007: thumbnail_cache.has_recipe (NULL=unbekannt, 0=nein, 1=ja).
- Fetch-Fehler hinterlassen NULL → Treffer bleibt konservativ sichtbar.
- Neuer Export `hasRecipeJsonLd(html)` in json-ld-recipe.ts.
- Alle Cache-Reads/Writes nehmen den neuen Wert mit.

Tests: +2 neue Tests für Filter/Failover; bestehende Thumbnail-Tests mit
Recipe-JSON-LD-Stub ergänzt, damit ihre Treffer nicht selbst
herausgefiltert werden.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
hsiegeln
2026-04-17 22:20:22 +02:00
parent dbc9646caa
commit 342ea0efc8
4 changed files with 125 additions and 33 deletions

View File

@@ -2,6 +2,7 @@ import type Database from 'better-sqlite3';
import { parseHTML } from 'linkedom';
import { listDomains, normalizeDomain } from '../domains/repository';
import { fetchText } from '../http';
import { hasRecipeJsonLd } from '../parsers/json-ld-recipe';
export type WebHit = {
url: string;
@@ -182,70 +183,91 @@ function extractPageImage(html: string, baseUrl: string): string | null {
/**
 * Parses the thumbnail-cache TTL (in days) from its raw environment value.
 *
 * @param raw - Raw value of the environment variable; `undefined` when unset.
 * @param fallbackDays - TTL used when `raw` is unset or not a finite number.
 * @returns The TTL in days. Never NaN or ±Infinity, so a Date computed from
 *   it can always be serialised with `toISOString()`.
 */
function parseThumbTtlDays(raw: string | undefined, fallbackDays = 30): number {
  if (raw === undefined) return fallbackDays;
  const days = Number(raw);
  // A typo such as "30d" parses to NaN; `new Date(NaN).toISOString()` would
  // then throw a RangeError wherever an expiry timestamp is computed.
  return Number.isFinite(days) ? days : fallbackDays;
}

// Lifetime of thumbnail_cache rows, configurable via KOCHWAS_THUMB_TTL_DAYS
// (default: 30 days).
const THUMB_TTL_DAYS = parseThumbTtlDays(process.env.KOCHWAS_THUMB_TTL_DAYS);
// The same lifetime in milliseconds, ready to be added to Date.now().
const THUMB_TTL_MS = THUMB_TTL_DAYS * 24 * 60 * 60 * 1000;
// NOTE(review): this span is a diff rendered without +/- markers. Lines of the
// removed readCachedThumbnail and of the added PageMeta type and
// readCachedPageMeta are interleaved below.
function readCachedThumbnail(
// Per-URL page metadata as stored in the thumbnail_cache table.
type PageMeta = {
// Image URL stored for the page, or null when there is none.
image: string | null;
// Mirrors the has_recipe column: 1 / 0 record the result of hasRecipeJsonLd
// on the fetched HTML; null means unknown (e.g. the fetch failed).
hasRecipe: 0 | 1 | null;
};
// Looks up the cache row for `url` whose expires_at is still in the future.
// Returns null when no such row exists.
function readCachedPageMeta(
db: Database.Database,
url: string
): { image: string | null } | null {
): PageMeta | null {
const row = db
.prepare<[string, string], { image: string | null }>(
"SELECT image FROM thumbnail_cache WHERE url = ? AND expires_at > ?"
.prepare<
[string, string],
{ image: string | null; has_recipe: 0 | 1 | null }
>(
'SELECT image, has_recipe FROM thumbnail_cache WHERE url = ? AND expires_at > ?'
)
// expires_at is compared against the current time as an ISO-8601 string.
.get(url, new Date().toISOString());
return row ?? null;
if (!row) return null;
// Map the snake_case column name onto the camelCase PageMeta field.
return { image: row.image, hasRecipe: row.has_recipe };
}
// NOTE(review): diff rendered without +/- markers. Lines of the removed
// writeCachedThumbnail and of the added writeCachedPageMeta are interleaved.
// Inserts or replaces the thumbnail_cache row for `url`, with an expiry of
// now + THUMB_TTL_MS.
function writeCachedThumbnail(
function writeCachedPageMeta(
db: Database.Database,
url: string,
image: string | null
meta: PageMeta
): void {
// Stored as an ISO-8601 string, the same format the cache reads compare against.
const expiresAt = new Date(Date.now() + THUMB_TTL_MS).toISOString();
db.prepare(
'INSERT OR REPLACE INTO thumbnail_cache (url, image, expires_at) VALUES (?, ?, ?)'
).run(url, image, expiresAt);
'INSERT OR REPLACE INTO thumbnail_cache (url, image, expires_at, has_recipe) VALUES (?, ?, ?, ?)'
// meta.hasRecipe may be null, which is written as-is (unknown).
).run(url, meta.image, expiresAt, meta.hasRecipe);
}
// NOTE(review): diff rendered without +/- markers. Lines of the removed
// enrichThumbnail and of the added enrichPageMeta are interleaved.
// Returns the page metadata for `url`: the cached entry when one exists,
// otherwise the result of fetching the page and inspecting its HTML. The
// result is written to the cache before returning, including after a failed
// fetch.
async function enrichThumbnail(
async function enrichPageMeta(
db: Database.Database,
url: string
): Promise<string | null> {
const cached = readCachedThumbnail(db, url);
if (cached) return cached.image;
let image: string | null = null;
): Promise<PageMeta> {
const cached = readCachedPageMeta(db, url);
// Cache hit: no network request is made.
if (cached) return cached;
// Default used when the fetch fails: no image, recipe status unknown.
let meta: PageMeta = { image: null, hasRecipe: null };
try {
// The fetch is limited to a 4 s timeout and 512 KiB of response data.
const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 512 * 1024 });
image = extractPageImage(html, url);
meta = {
image: extractPageImage(html, url),
// Convert the boolean into the 0 / 1 encoding used by the has_recipe column.
hasRecipe: hasRecipeJsonLd(html) ? 1 : 0
};
} catch {
image = null;
// Fetch failed — leave hasRecipe null (unknown) so we don't permanently
// hide a temporary-network-error URL.
}
writeCachedThumbnail(db, url, image);
return image;
// The failure default is cached too, so a failing URL is not refetched until
// its cache entry expires.
writeCachedPageMeta(db, url, meta);
return meta;
}
// NOTE(review): diff rendered without +/- markers. Lines of the removed
// enrichAllThumbnails and of the added enrichAndFilterHits are interleaved.
// The removed version mutated each hit's `thumbnail` in place and returned
// nothing. The added version returns a new array in which hits with
// hasRecipe === 0 are dropped and hits with a page image are copied with
// `thumbnail` replaced.
async function enrichAllThumbnails(
async function enrichAndFilterHits(
db: Database.Database,
hits: WebHit[]
): Promise<void> {
// Always fetch the page image even when SearXNG gave us a thumbnail —
// the search engine's thumbnail is typically 150-200px, while og:image
// / JSON-LD image on the page is the full-resolution recipe photo.
// The thumbnail_cache table (default 30-day TTL) makes repeat searches instant.
if (hits.length === 0) return;
): Promise<WebHit[]> {
// Always fetch the page even when SearXNG gave us a thumbnail — we need
// the HTML anyway for the high-res og:image AND to confirm a Recipe
// JSON-LD actually exists. The thumbnail_cache table (default 30-day TTL)
// makes repeat searches instant.
if (hits.length === 0) return hits;
// Lazy cleanup of expired entries — O(log n) index scan, cheap.
// NOTE(review): the cost claim above presumes an index on expires_at, which
// is not visible here — confirm against the migrations.
db.prepare('DELETE FROM thumbnail_cache WHERE expires_at <= ?').run(
new Date().toISOString()
);
// Metadata collected per hit URL; read back after all workers have finished.
const metas = new Map<string, PageMeta>();
// Copy of the hits used as a shared work queue, so `hits` itself is not consumed.
const queue = [...hits];
// Upper bound on the number of concurrently running workers.
const LIMIT = 6;
// Each worker takes hits off the front of the shared queue until it is empty.
const workers = Array.from({ length: Math.min(LIMIT, queue.length) }, async () => {
while (queue.length > 0) {
const h = queue.shift();
if (!h) break;
const image = await enrichThumbnail(db, h.url);
if (image) h.thumbnail = image;
metas.set(h.url, await enrichPageMeta(db, h.url));
}
});
await Promise.all(workers);
// Drop confirmed-non-recipe pages (hasRecipe === 0). Keep unknown (null)
// and confirmed recipes (1).
return hits
// A hit with no entry in `metas` yields undefined here and is kept as well.
.filter((h) => metas.get(h.url)?.hasRecipe !== 0)
.map((h) => {
const image = metas.get(h.url)?.image;
// Copy the hit instead of mutating it; hits without a page image are
// returned unchanged.
return image ? { ...h, thumbnail: image } : h;
});
}
export async function searchWeb(
@@ -310,7 +332,7 @@ export async function searchWeb(
if (hits.length >= limit) break;
}
if (opts.enrichThumbnails !== false) {
await enrichAllThumbnails(db, hits);
return await enrichAndFilterHits(db, hits);
}
return hits;
}