From 6a784488f5709bfe1c7b2d847663fdae322d9451 Mon Sep 17 00:00:00 2001 From: hsiegeln <37154749+hsiegeln@users.noreply.github.com> Date: Fri, 17 Apr 2026 17:55:53 +0200 Subject: [PATCH] fix(search): enrich missing SearXNG thumbnails with og:image MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SearXNG liefert je nach Seite mal ein thumbnail/img_src mit, mal nicht — bei Chefkoch-Treffern hatten deshalb zufällig die Hälfte der Kacheln einen Platzhalter, obwohl die Vorschau dann sehr wohl ein Bild fand. searchWeb() holt jetzt für jeden Treffer ohne Thumbnail parallel (max. 6 gleichzeitig, 4 s Timeout pro Request) die Seite und extrahiert das og:image- oder twitter:image-Meta-Tag. Ergebnis wird 30 min in-memory gecacht, damit wiederholte Suchen nicht wieder die gleichen Seiten laden. Tests: - Neuer Test: Treffer ohne Thumbnail wird via og:image angereichert. - Neuer Test: Treffer mit Thumbnail bleibt unverändert (kein Fetch). - Bestehende Tests deaktivieren Enrichment via enrichThumbnails:false, damit sie keine echten Chefkoch-URLs aufrufen. --- src/lib/server/search/searxng.ts | 56 ++++++++++++++++++++++++++++++- tests/integration/searxng.test.ts | 47 +++++++++++++++++++++++--- 2 files changed, 97 insertions(+), 6 deletions(-) diff --git a/src/lib/server/search/searxng.ts b/src/lib/server/search/searxng.ts index ca74508..9ec6a93 100644 --- a/src/lib/server/search/searxng.ts +++ b/src/lib/server/search/searxng.ts @@ -77,10 +77,61 @@ function looksLikeRecipePage(url: string): boolean { } } +const OG_IMAGE_RE = + /]+(?:property|name)=["']og:image(?::url)?["'][^>]+content=["']([^"']+)["']/i; +const OG_IMAGE_RE_REV = + /]+content=["']([^"']+)["'][^>]+(?:property|name)=["']og:image(?::url)?["']/i; +const TWITTER_IMAGE_RE = + /]+(?:property|name)=["']twitter:image["'][^>]+content=["']([^"']+)["']/i; + +function extractOgImage(html: string): string | null { + const m = OG_IMAGE_RE.exec(html) ?? OG_IMAGE_RE_REV.exec(html) ?? 
TWITTER_IMAGE_RE.exec(html); + if (!m) return null; + try { + return new URL(m[1]).toString(); + } catch { + return null; + } +} + +type ThumbCacheEntry = { image: string | null; expires: number }; +const thumbCache = new Map(); +const THUMB_TTL_MS = 30 * 60 * 1000; + +async function enrichThumbnail(url: string): Promise { + const now = Date.now(); + const cached = thumbCache.get(url); + if (cached && cached.expires > now) return cached.image; + let image: string | null = null; + try { + const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 256 * 1024 }); + image = extractOgImage(html); + } catch { + image = null; + } + thumbCache.set(url, { image, expires: now + THUMB_TTL_MS }); + return image; +} + +async function enrichMissingThumbnails(hits: WebHit[]): Promise { + const queue = hits.filter((h) => !h.thumbnail); + if (queue.length === 0) return; + const LIMIT = 6; + const workers = Array.from({ length: Math.min(LIMIT, queue.length) }, async () => { + while (queue.length > 0) { + const h = queue.shift(); + if (!h) break; + const image = await enrichThumbnail(h.url); + if (image) h.thumbnail = image; + } + }); + await Promise.all(workers); +} + export async function searchWeb( db: Database.Database, query: string, - opts: { searxngUrl?: string; limit?: number } = {} + opts: { searxngUrl?: string; limit?: number; enrichThumbnails?: boolean } = {} ): Promise { const trimmed = query.trim(); if (!trimmed) return []; @@ -131,5 +182,8 @@ export async function searchWeb( }); if (hits.length >= limit) break; } + if (opts.enrichThumbnails !== false) { + await enrichMissingThumbnails(hits); + } return hits; } diff --git a/tests/integration/searxng.test.ts b/tests/integration/searxng.test.ts index 1d47ac0..2e164fc 100644 --- a/tests/integration/searxng.test.ts +++ b/tests/integration/searxng.test.ts @@ -42,7 +42,7 @@ describe('searchWeb', () => { content: 'blocked' } ]); - const hits = await searchWeb(db, 'carbonara', { searxngUrl: baseUrl }); + const hits = await 
searchWeb(db, 'carbonara', { searxngUrl: baseUrl, enrichThumbnails: false }); expect(hits.length).toBe(1); expect(hits[0].domain).toBe('chefkoch.de'); expect(hits[0].title).toBe('Carbonara'); @@ -55,23 +55,60 @@ describe('searchWeb', () => { { url: 'https://www.chefkoch.de/a', title: 'A', content: '' }, { url: 'https://www.chefkoch.de/a', title: 'A dup', content: '' } ]); - const hits = await searchWeb(db, 'a', { searxngUrl: baseUrl }); + const hits = await searchWeb(db, 'a', { searxngUrl: baseUrl, enrichThumbnails: false }); expect(hits.length).toBe(1); }); it('returns empty list when no domains configured', async () => { const db = openInMemoryForTest(); - const hits = await searchWeb(db, 'x', { searxngUrl: baseUrl }); + const hits = await searchWeb(db, 'x', { searxngUrl: baseUrl, enrichThumbnails: false }); expect(hits).toEqual([]); }); it('returns empty for empty query', async () => { const db = openInMemoryForTest(); addDomain(db, 'chefkoch.de'); - const hits = await searchWeb(db, ' ', { searxngUrl: baseUrl }); + const hits = await searchWeb(db, ' ', { searxngUrl: baseUrl, enrichThumbnails: false }); expect(hits).toEqual([]); }); + it('enriches missing thumbnails from og:image', async () => { + const pageServer = createServer((_req, res) => { + res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' }); + res.end( + '' + ); + }); + await new Promise((r) => pageServer.listen(0, '127.0.0.1', r)); + const addr = pageServer.address() as AddressInfo; + const pageUrl = `http://127.0.0.1:${addr.port}/rezept`; + try { + const db = openInMemoryForTest(); + addDomain(db, '127.0.0.1'); + respondWith([{ url: pageUrl, title: 'Kuchen', content: '' }]); + const hits = await searchWeb(db, 'kuchen', { searxngUrl: baseUrl }); + expect(hits.length).toBe(1); + expect(hits[0].thumbnail).toBe('https://cdn.example/foo.jpg'); + } finally { + await new Promise((r) => pageServer.close(() => r())); + } + }); + + it('leaves existing thumbnails untouched (no enrichment fetch)', 
async () => { + const db = openInMemoryForTest(); + addDomain(db, 'chefkoch.de'); + respondWith([ + { + url: 'https://www.chefkoch.de/rezepte/1/x.html', + title: 'X', + thumbnail: 'https://cdn.chefkoch/x.jpg' + } + ]); + // enrichment enabled, but thumbnail is set → no fetch expected, no hang + const hits = await searchWeb(db, 'x', { searxngUrl: baseUrl }); + expect(hits[0].thumbnail).toBe('https://cdn.chefkoch/x.jpg'); + }); + it('filters out forum/magazine/listing URLs', async () => { const db = openInMemoryForTest(); addDomain(db, 'chefkoch.de'); @@ -83,7 +120,7 @@ describe('searchWeb', () => { { url: 'https://www.chefkoch.de/themen/ravioli/', title: 'Themen' }, { url: 'https://www.chefkoch.de/rezepte/', title: 'Rezepte Übersicht' } ]); - const hits = await searchWeb(db, 'ravioli', { searxngUrl: baseUrl }); + const hits = await searchWeb(db, 'ravioli', { searxngUrl: baseUrl, enrichThumbnails: false }); expect(hits.length).toBe(1); expect(hits[0].title).toBe('Ravioli'); });