feat(search): Treffer ohne Recipe-JSON-LD rausfiltern
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m17s
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m17s
Wir fetchen die Trefferseite sowieso schon fürs Thumbnail — prüfen jetzt in der gleichen HTML-Parse-Runde, ob überhaupt ein schema.org/Recipe JSON-LD vorhanden ist. Fehlt es, wird der Treffer aus der Liste entfernt, weil der Importer auf dieser Seite später sowieso mit „Diese Seite enthält kein Rezept“ scheitern würde.

- Migration 007: thumbnail_cache.has_recipe (NULL=unbekannt, 0=nein, 1=ja).
- Fetch-Fehler hinterlassen NULL → Treffer bleibt konservativ sichtbar.
- Neuer Export `hasRecipeJsonLd(html)` in json-ld-recipe.ts.
- Alle Cache-Reads/Writes nehmen den neuen Wert mit.

Tests: +2 für Filter/Failover, bestehende Thumbnail-Tests mit Recipe-JSON-LD-Stub ergänzt, damit sie nicht selber rausgefiltert werden.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2,6 +2,7 @@ import type Database from 'better-sqlite3';
|
||||
import { parseHTML } from 'linkedom';
|
||||
import { listDomains, normalizeDomain } from '../domains/repository';
|
||||
import { fetchText } from '../http';
|
||||
import { hasRecipeJsonLd } from '../parsers/json-ld-recipe';
|
||||
|
||||
export type WebHit = {
|
||||
url: string;
|
||||
@@ -182,70 +183,91 @@ function extractPageImage(html: string, baseUrl: string): string | null {
|
||||
const THUMB_TTL_DAYS = Number(process.env.KOCHWAS_THUMB_TTL_DAYS ?? 30);
|
||||
const THUMB_TTL_MS = THUMB_TTL_DAYS * 24 * 60 * 60 * 1000;
|
||||
|
||||
function readCachedThumbnail(
|
||||
type PageMeta = {
|
||||
image: string | null;
|
||||
hasRecipe: 0 | 1 | null;
|
||||
};
|
||||
|
||||
function readCachedPageMeta(
|
||||
db: Database.Database,
|
||||
url: string
|
||||
): { image: string | null } | null {
|
||||
): PageMeta | null {
|
||||
const row = db
|
||||
.prepare<[string, string], { image: string | null }>(
|
||||
"SELECT image FROM thumbnail_cache WHERE url = ? AND expires_at > ?"
|
||||
.prepare<
|
||||
[string, string],
|
||||
{ image: string | null; has_recipe: 0 | 1 | null }
|
||||
>(
|
||||
'SELECT image, has_recipe FROM thumbnail_cache WHERE url = ? AND expires_at > ?'
|
||||
)
|
||||
.get(url, new Date().toISOString());
|
||||
return row ?? null;
|
||||
if (!row) return null;
|
||||
return { image: row.image, hasRecipe: row.has_recipe };
|
||||
}
|
||||
|
||||
function writeCachedThumbnail(
|
||||
function writeCachedPageMeta(
|
||||
db: Database.Database,
|
||||
url: string,
|
||||
image: string | null
|
||||
meta: PageMeta
|
||||
): void {
|
||||
const expiresAt = new Date(Date.now() + THUMB_TTL_MS).toISOString();
|
||||
db.prepare(
|
||||
'INSERT OR REPLACE INTO thumbnail_cache (url, image, expires_at) VALUES (?, ?, ?)'
|
||||
).run(url, image, expiresAt);
|
||||
'INSERT OR REPLACE INTO thumbnail_cache (url, image, expires_at, has_recipe) VALUES (?, ?, ?, ?)'
|
||||
).run(url, meta.image, expiresAt, meta.hasRecipe);
|
||||
}
|
||||
|
||||
async function enrichThumbnail(
|
||||
async function enrichPageMeta(
|
||||
db: Database.Database,
|
||||
url: string
|
||||
): Promise<string | null> {
|
||||
const cached = readCachedThumbnail(db, url);
|
||||
if (cached) return cached.image;
|
||||
let image: string | null = null;
|
||||
): Promise<PageMeta> {
|
||||
const cached = readCachedPageMeta(db, url);
|
||||
if (cached) return cached;
|
||||
let meta: PageMeta = { image: null, hasRecipe: null };
|
||||
try {
|
||||
const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 512 * 1024 });
|
||||
image = extractPageImage(html, url);
|
||||
meta = {
|
||||
image: extractPageImage(html, url),
|
||||
hasRecipe: hasRecipeJsonLd(html) ? 1 : 0
|
||||
};
|
||||
} catch {
|
||||
image = null;
|
||||
// Fetch failed — leave hasRecipe null (unknown) so we don't permanently
|
||||
// hide a temporary-network-error URL.
|
||||
}
|
||||
writeCachedThumbnail(db, url, image);
|
||||
return image;
|
||||
writeCachedPageMeta(db, url, meta);
|
||||
return meta;
|
||||
}
|
||||
|
||||
async function enrichAllThumbnails(
|
||||
async function enrichAndFilterHits(
|
||||
db: Database.Database,
|
||||
hits: WebHit[]
|
||||
): Promise<void> {
|
||||
// Always fetch the page image even when SearXNG gave us a thumbnail —
|
||||
// the search engine's thumbnail is typically 150-200px, while og:image
|
||||
// / JSON-LD image on the page is the full-resolution recipe photo.
|
||||
// The thumbnail_cache table (default 30-day TTL) makes repeat searches instant.
|
||||
if (hits.length === 0) return;
|
||||
): Promise<WebHit[]> {
|
||||
// Always fetch the page even when SearXNG gave us a thumbnail — we need
|
||||
// the HTML anyway for the high-res og:image AND to confirm a Recipe
|
||||
// JSON-LD actually exists. The thumbnail_cache table (default 30-day TTL)
|
||||
// makes repeat searches instant.
|
||||
if (hits.length === 0) return hits;
|
||||
// Lazy cleanup of expired entries — O(log n) index scan, cheap.
|
||||
db.prepare('DELETE FROM thumbnail_cache WHERE expires_at <= ?').run(
|
||||
new Date().toISOString()
|
||||
);
|
||||
const metas = new Map<string, PageMeta>();
|
||||
const queue = [...hits];
|
||||
const LIMIT = 6;
|
||||
const workers = Array.from({ length: Math.min(LIMIT, queue.length) }, async () => {
|
||||
while (queue.length > 0) {
|
||||
const h = queue.shift();
|
||||
if (!h) break;
|
||||
const image = await enrichThumbnail(db, h.url);
|
||||
if (image) h.thumbnail = image;
|
||||
metas.set(h.url, await enrichPageMeta(db, h.url));
|
||||
}
|
||||
});
|
||||
await Promise.all(workers);
|
||||
// Drop confirmed-non-recipe pages (hasRecipe === 0). Keep unknown (null)
|
||||
// and confirmed recipes (1).
|
||||
return hits
|
||||
.filter((h) => metas.get(h.url)?.hasRecipe !== 0)
|
||||
.map((h) => {
|
||||
const image = metas.get(h.url)?.image;
|
||||
return image ? { ...h, thumbnail: image } : h;
|
||||
});
|
||||
}
|
||||
|
||||
export async function searchWeb(
|
||||
@@ -310,7 +332,7 @@ export async function searchWeb(
|
||||
if (hits.length >= limit) break;
|
||||
}
|
||||
if (opts.enrichThumbnails !== false) {
|
||||
await enrichAllThumbnails(db, hits);
|
||||
return await enrichAndFilterHits(db, hits);
|
||||
}
|
||||
return hits;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user