feat(search): Treffer ohne Recipe-JSON-LD rausfiltern
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m17s
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m17s
Wir fetchen die Trefferseite sowieso schon fürs Thumbnail und prüfen jetzt in der gleichen HTML-Parse-Runde, ob überhaupt ein schema.org/Recipe JSON-LD vorhanden ist. Fehlt es, wird der Treffer aus der Liste entfernt, weil der Importer auf dieser Seite später sowieso mit „Diese Seite enthält kein Rezept“ scheitern würde.

- Migration 007: thumbnail_cache.has_recipe (NULL=unbekannt, 0=nein, 1=ja).
- Fetch-Fehler hinterlassen NULL → Treffer bleibt konservativ sichtbar.
- Neuer Export `hasRecipeJsonLd(html)` in json-ld-recipe.ts.
- Alle Cache-Reads/Writes nehmen den neuen Wert mit.

Tests: +2 für Filter/Failover, bestehende Thumbnail-Tests mit Recipe-JSON-LD-Stub ergänzt, damit sie nicht selber rausgefiltert werden.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,11 @@
|
|||||||
|
-- Extends thumbnail_cache with a has_recipe flag. During thumbnail
|
||||||
|
-- enrichment we check whether the page contains a schema.org/Recipe
|
||||||
|
-- JSON-LD at all — otherwise the importer cannot extract the recipe
|
||||||
|
-- later anyway, and the user only sees the "Diese Seite enthält
|
||||||
|
-- kein Rezept" ("This page contains no recipe") error message.
|
||||||
|
--
|
||||||
|
-- NULL = unknown (cached before this migration, or the fetch failed;
|
||||||
|
--        in that case we conservatively keep the hit);
|
||||||
|
-- 0    = confirmed non-recipe page (hide the hit);
|
||||||
|
-- 1    = recipe present.
|
||||||
|
ALTER TABLE thumbnail_cache ADD COLUMN has_recipe INTEGER;
|
||||||
@@ -106,6 +106,14 @@ function findRecipeNode(html: string): JsonLdNode | null {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reports whether `html` contains a Recipe node, as located by
 * `findRecipeNode`.
 *
 * Never throws: any error raised during the lookup is treated as
 * "no recipe found".
 *
 * @param html - Raw HTML of the page to inspect.
 * @returns `true` if `findRecipeNode` yields a non-null node, otherwise `false`.
 */
export function hasRecipeJsonLd(html: string): boolean {
  let recipeFound = false;
  try {
    const node = findRecipeNode(html);
    recipeFound = node !== null;
  } catch {
    // Lookup failed — keep the default answer of `false`.
  }
  return recipeFound;
}
|
||||||
|
|
||||||
export function extractRecipeFromHtml(html: string): Recipe | null {
|
export function extractRecipeFromHtml(html: string): Recipe | null {
|
||||||
const node = findRecipeNode(html);
|
const node = findRecipeNode(html);
|
||||||
if (!node) return null;
|
if (!node) return null;
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ import type Database from 'better-sqlite3';
|
|||||||
import { parseHTML } from 'linkedom';
|
import { parseHTML } from 'linkedom';
|
||||||
import { listDomains, normalizeDomain } from '../domains/repository';
|
import { listDomains, normalizeDomain } from '../domains/repository';
|
||||||
import { fetchText } from '../http';
|
import { fetchText } from '../http';
|
||||||
|
import { hasRecipeJsonLd } from '../parsers/json-ld-recipe';
|
||||||
|
|
||||||
export type WebHit = {
|
export type WebHit = {
|
||||||
url: string;
|
url: string;
|
||||||
@@ -182,70 +183,91 @@ function extractPageImage(html: string, baseUrl: string): string | null {
|
|||||||
const THUMB_TTL_DAYS = Number(process.env.KOCHWAS_THUMB_TTL_DAYS ?? 30);
|
const THUMB_TTL_DAYS = Number(process.env.KOCHWAS_THUMB_TTL_DAYS ?? 30);
|
||||||
const THUMB_TTL_MS = THUMB_TTL_DAYS * 24 * 60 * 60 * 1000;
|
const THUMB_TTL_MS = THUMB_TTL_DAYS * 24 * 60 * 60 * 1000;
|
||||||
|
|
||||||
function readCachedThumbnail(
|
type PageMeta = {
|
||||||
|
image: string | null;
|
||||||
|
hasRecipe: 0 | 1 | null;
|
||||||
|
};
|
||||||
|
|
||||||
|
function readCachedPageMeta(
|
||||||
db: Database.Database,
|
db: Database.Database,
|
||||||
url: string
|
url: string
|
||||||
): { image: string | null } | null {
|
): PageMeta | null {
|
||||||
const row = db
|
const row = db
|
||||||
.prepare<[string, string], { image: string | null }>(
|
.prepare<
|
||||||
"SELECT image FROM thumbnail_cache WHERE url = ? AND expires_at > ?"
|
[string, string],
|
||||||
|
{ image: string | null; has_recipe: 0 | 1 | null }
|
||||||
|
>(
|
||||||
|
'SELECT image, has_recipe FROM thumbnail_cache WHERE url = ? AND expires_at > ?'
|
||||||
)
|
)
|
||||||
.get(url, new Date().toISOString());
|
.get(url, new Date().toISOString());
|
||||||
return row ?? null;
|
if (!row) return null;
|
||||||
|
return { image: row.image, hasRecipe: row.has_recipe };
|
||||||
}
|
}
|
||||||
|
|
||||||
function writeCachedThumbnail(
|
function writeCachedPageMeta(
|
||||||
db: Database.Database,
|
db: Database.Database,
|
||||||
url: string,
|
url: string,
|
||||||
image: string | null
|
meta: PageMeta
|
||||||
): void {
|
): void {
|
||||||
const expiresAt = new Date(Date.now() + THUMB_TTL_MS).toISOString();
|
const expiresAt = new Date(Date.now() + THUMB_TTL_MS).toISOString();
|
||||||
db.prepare(
|
db.prepare(
|
||||||
'INSERT OR REPLACE INTO thumbnail_cache (url, image, expires_at) VALUES (?, ?, ?)'
|
'INSERT OR REPLACE INTO thumbnail_cache (url, image, expires_at, has_recipe) VALUES (?, ?, ?, ?)'
|
||||||
).run(url, image, expiresAt);
|
).run(url, meta.image, expiresAt, meta.hasRecipe);
|
||||||
}
|
}
|
||||||
|
|
||||||
async function enrichThumbnail(
|
async function enrichPageMeta(
|
||||||
db: Database.Database,
|
db: Database.Database,
|
||||||
url: string
|
url: string
|
||||||
): Promise<string | null> {
|
): Promise<PageMeta> {
|
||||||
const cached = readCachedThumbnail(db, url);
|
const cached = readCachedPageMeta(db, url);
|
||||||
if (cached) return cached.image;
|
if (cached) return cached;
|
||||||
let image: string | null = null;
|
let meta: PageMeta = { image: null, hasRecipe: null };
|
||||||
try {
|
try {
|
||||||
const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 512 * 1024 });
|
const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 512 * 1024 });
|
||||||
image = extractPageImage(html, url);
|
meta = {
|
||||||
|
image: extractPageImage(html, url),
|
||||||
|
hasRecipe: hasRecipeJsonLd(html) ? 1 : 0
|
||||||
|
};
|
||||||
} catch {
|
} catch {
|
||||||
image = null;
|
// Fetch failed — leave hasRecipe null (unknown) so we don't permanently
|
||||||
|
// hide a temporary-network-error URL.
|
||||||
}
|
}
|
||||||
writeCachedThumbnail(db, url, image);
|
writeCachedPageMeta(db, url, meta);
|
||||||
return image;
|
return meta;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function enrichAllThumbnails(
|
async function enrichAndFilterHits(
|
||||||
db: Database.Database,
|
db: Database.Database,
|
||||||
hits: WebHit[]
|
hits: WebHit[]
|
||||||
): Promise<void> {
|
): Promise<WebHit[]> {
|
||||||
// Always fetch the page image even when SearXNG gave us a thumbnail —
|
// Always fetch the page even when SearXNG gave us a thumbnail — we need
|
||||||
// the search engine's thumbnail is typically 150-200px, while og:image
|
// the HTML anyway for the high-res og:image AND to confirm a Recipe
|
||||||
// / JSON-LD image on the page is the full-resolution recipe photo.
|
// JSON-LD actually exists. The thumbnail_cache table (default 30-day TTL)
|
||||||
// The thumbnail_cache table (default 30-day TTL) makes repeat searches instant.
|
// makes repeat searches instant.
|
||||||
if (hits.length === 0) return;
|
if (hits.length === 0) return hits;
|
||||||
// Lazy cleanup of expired entries — O(log n) index scan, cheap.
|
// Lazy cleanup of expired entries — O(log n) index scan, cheap.
|
||||||
db.prepare('DELETE FROM thumbnail_cache WHERE expires_at <= ?').run(
|
db.prepare('DELETE FROM thumbnail_cache WHERE expires_at <= ?').run(
|
||||||
new Date().toISOString()
|
new Date().toISOString()
|
||||||
);
|
);
|
||||||
|
const metas = new Map<string, PageMeta>();
|
||||||
const queue = [...hits];
|
const queue = [...hits];
|
||||||
const LIMIT = 6;
|
const LIMIT = 6;
|
||||||
const workers = Array.from({ length: Math.min(LIMIT, queue.length) }, async () => {
|
const workers = Array.from({ length: Math.min(LIMIT, queue.length) }, async () => {
|
||||||
while (queue.length > 0) {
|
while (queue.length > 0) {
|
||||||
const h = queue.shift();
|
const h = queue.shift();
|
||||||
if (!h) break;
|
if (!h) break;
|
||||||
const image = await enrichThumbnail(db, h.url);
|
metas.set(h.url, await enrichPageMeta(db, h.url));
|
||||||
if (image) h.thumbnail = image;
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
await Promise.all(workers);
|
await Promise.all(workers);
|
||||||
|
// Drop confirmed-non-recipe pages (hasRecipe === 0). Keep unknown (null)
|
||||||
|
// and confirmed recipes (1).
|
||||||
|
return hits
|
||||||
|
.filter((h) => metas.get(h.url)?.hasRecipe !== 0)
|
||||||
|
.map((h) => {
|
||||||
|
const image = metas.get(h.url)?.image;
|
||||||
|
return image ? { ...h, thumbnail: image } : h;
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function searchWeb(
|
export async function searchWeb(
|
||||||
@@ -310,7 +332,7 @@ export async function searchWeb(
|
|||||||
if (hits.length >= limit) break;
|
if (hits.length >= limit) break;
|
||||||
}
|
}
|
||||||
if (opts.enrichThumbnails !== false) {
|
if (opts.enrichThumbnails !== false) {
|
||||||
await enrichAllThumbnails(db, hits);
|
return await enrichAndFilterHits(db, hits);
|
||||||
}
|
}
|
||||||
return hits;
|
return hits;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -100,11 +100,62 @@ describe('searchWeb', () => {
|
|||||||
expect(receivedPageno).toBe(null);
|
expect(receivedPageno).toBe(null);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('drops hits whose page lacks a Recipe JSON-LD', async () => {
|
||||||
|
const pageServer = createServer((req, res) => {
|
||||||
|
res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' });
|
||||||
|
if (req.url === '/with-recipe') {
|
||||||
|
res.end(`<html><head>
|
||||||
|
<script type="application/ld+json">${JSON.stringify({
|
||||||
|
'@type': 'Recipe',
|
||||||
|
name: 'Pie',
|
||||||
|
image: 'https://cdn.example/pie.jpg'
|
||||||
|
})}</script>
|
||||||
|
</head></html>`);
|
||||||
|
} else {
|
||||||
|
// forum page: no Recipe JSON-LD
|
||||||
|
res.end('<html><head><title>Forum</title></head><body>Diskussion</body></html>');
|
||||||
|
}
|
||||||
|
});
|
||||||
|
await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r));
|
||||||
|
const addr = pageServer.address() as AddressInfo;
|
||||||
|
try {
|
||||||
|
const db = openInMemoryForTest();
|
||||||
|
addDomain(db, '127.0.0.1');
|
||||||
|
respondWith([
|
||||||
|
{ url: `http://127.0.0.1:${addr.port}/with-recipe`, title: 'Recipe', content: '' },
|
||||||
|
{ url: `http://127.0.0.1:${addr.port}/forum-thread`, title: 'Forum', content: '' }
|
||||||
|
]);
|
||||||
|
const hits = await searchWeb(db, 'x', { searxngUrl: baseUrl });
|
||||||
|
expect(hits.length).toBe(1);
|
||||||
|
expect(hits[0].url.endsWith('/with-recipe')).toBe(true);
|
||||||
|
} finally {
|
||||||
|
await new Promise<void>((r) => pageServer.close(() => r()));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
it('keeps hit when page fetch fails (unknown recipe status)', async () => {
|
||||||
|
const db = openInMemoryForTest();
|
||||||
|
addDomain(db, '127.0.0.1');
|
||||||
|
// URL points to a port nobody listens on → fetch fails
|
||||||
|
respondWith([
|
||||||
|
{ url: 'http://127.0.0.1:1/unreachable', title: 'Unreachable', content: '' }
|
||||||
|
]);
|
||||||
|
const hits = await searchWeb(db, 'x', { searxngUrl: baseUrl });
|
||||||
|
expect(hits.length).toBe(1);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Minimal Recipe-JSON-LD stub so enrichAndFilterHits doesn't drop test hits
|
||||||
|
// as non-recipe pages. Used in tests that focus on thumbnail extraction.
|
||||||
|
const RECIPE_LD = `<script type="application/ld+json">${JSON.stringify({
|
||||||
|
'@type': 'Recipe',
|
||||||
|
name: 'stub'
|
||||||
|
})}</script>`;
|
||||||
|
|
||||||
it('enriches missing thumbnails from og:image', async () => {
|
it('enriches missing thumbnails from og:image', async () => {
|
||||||
const pageServer = createServer((_req, res) => {
|
const pageServer = createServer((_req, res) => {
|
||||||
res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' });
|
res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' });
|
||||||
res.end(
|
res.end(
|
||||||
'<html><head><meta property="og:image" content="https://cdn.example/foo.jpg" /></head><body></body></html>'
|
`<html><head><meta property="og:image" content="https://cdn.example/foo.jpg" />${RECIPE_LD}</head><body></body></html>`
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r));
|
await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r));
|
||||||
@@ -151,7 +202,7 @@ describe('searchWeb', () => {
|
|||||||
const pageServer = createServer((_req, res) => {
|
const pageServer = createServer((_req, res) => {
|
||||||
res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' });
|
res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' });
|
||||||
res.end(
|
res.end(
|
||||||
'<html><body><article><img src="/uploads/dish.jpg" alt=""></article></body></html>'
|
`<html><head>${RECIPE_LD}</head><body><article><img src="/uploads/dish.jpg" alt=""></article></body></html>`
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r));
|
await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r));
|
||||||
@@ -172,7 +223,7 @@ describe('searchWeb', () => {
|
|||||||
const pageServer = createServer((_req, res) => {
|
const pageServer = createServer((_req, res) => {
|
||||||
res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' });
|
res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' });
|
||||||
res.end(
|
res.end(
|
||||||
'<html><head><meta property="og:image" content="https://cdn.example/hq.jpg" /></head></html>'
|
`<html><head><meta property="og:image" content="https://cdn.example/hq.jpg" />${RECIPE_LD}</head></html>`
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r));
|
await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r));
|
||||||
@@ -194,7 +245,7 @@ describe('searchWeb', () => {
|
|||||||
it('keeps SearXNG thumbnail when page has no image', async () => {
|
it('keeps SearXNG thumbnail when page has no image', async () => {
|
||||||
const pageServer = createServer((_req, res) => {
|
const pageServer = createServer((_req, res) => {
|
||||||
res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' });
|
res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' });
|
||||||
res.end('<html><head></head><body>no images here</body></html>');
|
res.end(`<html><head>${RECIPE_LD}</head><body>no images here</body></html>`);
|
||||||
});
|
});
|
||||||
await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r));
|
await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r));
|
||||||
const addr = pageServer.address() as AddressInfo;
|
const addr = pageServer.address() as AddressInfo;
|
||||||
@@ -217,7 +268,7 @@ describe('searchWeb', () => {
|
|||||||
const pageServer = createServer((_req, res) => {
|
const pageServer = createServer((_req, res) => {
|
||||||
pageHits += 1;
|
pageHits += 1;
|
||||||
res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' });
|
res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' });
|
||||||
res.end('<html><head><meta property="og:image" content="https://cdn.example/c.jpg"></head></html>');
|
res.end(`<html><head><meta property="og:image" content="https://cdn.example/c.jpg">${RECIPE_LD}</head></html>`);
|
||||||
});
|
});
|
||||||
await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r));
|
await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r));
|
||||||
const addr = pageServer.address() as AddressInfo;
|
const addr = pageServer.address() as AddressInfo;
|
||||||
|
|||||||
Reference in New Issue
Block a user