From 4d90d515012a3156ee05703aeb5d507331d19bb6 Mon Sep 17 00:00:00 2001
From: hsiegeln <37154749+hsiegeln@users.noreply.github.com>
Date: Fri, 17 Apr 2026 18:34:29 +0200
Subject: [PATCH] feat(search): persistenter Thumbnail-Cache in SQLite,
 Default-TTL 30 Tage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Vorher: In-Memory-Map, TTL 30 Minuten. Container-Neustart verwarf den
kompletten Cache, also musste nach jedem Deploy jede Suche wieder alle
Seiten laden.

Jetzt:
- Neue Tabelle thumbnail_cache (url PK, image, expires_at)
- Default-TTL 30 Tage, per Env KOCHWAS_THUMB_TTL_DAYS konfigurierbar
  (7, 365, was der User will — ist alles ok laut Nutzer)
- KOCHWAS_THUMB_TTL_DAYS wird validiert: leer, nicht numerisch oder <= 0
  fällt auf 30 Tage zurück, Obergrenze 36500 Tage
- Negative Cache: Seiten ohne Bild werden mit image=NULL gespeichert,
  damit wir nicht jede Suche die gleiche kaputte Seite wieder laden
- Fehlgeschlagene Fetches (Timeout, Netzwerkfehler) werden nur 30 Minuten
  negativ gecacht, damit ein kurzer Ausfall nicht 30 Tage lang Bilder
  unterdrückt
- Lazy-Cleanup: pro searchWeb-Aufruf werden abgelaufene Zeilen via
  DELETE ... WHERE expires_at <= now() weggeräumt (Index-Scan, billig)

Migration 003_thumbnail_cache.sql: nicht-destruktiv, nur neue Tabelle.
Bestehende DB bekommt sie beim nächsten Start automatisch dazu.

Tests (99/99):
- Neuer Cache-Test: zweiter searchWeb für dieselbe URL macht keinen
  Page-Fetch mehr und liest die image-Spalte aus SQLite.
---
 .../db/migrations/003_thumbnail_cache.sql     |  10 ++
 src/lib/server/search/searxng.ts              | 108 ++++++++++++++++--
 tests/integration/searxng.test.ts             |  28 +++++
 3 files changed, 134 insertions(+), 12 deletions(-)
 create mode 100644 src/lib/server/db/migrations/003_thumbnail_cache.sql

diff --git a/src/lib/server/db/migrations/003_thumbnail_cache.sql b/src/lib/server/db/migrations/003_thumbnail_cache.sql
new file mode 100644
index 0000000..c5e3f5b
--- /dev/null
+++ b/src/lib/server/db/migrations/003_thumbnail_cache.sql
@@ -0,0 +1,10 @@
+-- Long-term cache for page → image URL mappings extracted via og:image,
+-- JSON-LD, or first content <img>. Fetching every recipe page on every
+-- search is expensive; store the mapping with a 30-day default TTL.
+CREATE TABLE thumbnail_cache (
+  url TEXT PRIMARY KEY,
+  image TEXT,              -- NULL = page has no image (cache the negative too)
+  expires_at TEXT NOT NULL -- ISO-8601 UTC
+);
+
+CREATE INDEX idx_thumbnail_cache_expires ON thumbnail_cache(expires_at);
diff --git a/src/lib/server/search/searxng.ts b/src/lib/server/search/searxng.ts
index e88dfdb..f171e28 100644
--- a/src/lib/server/search/searxng.ts
+++ b/src/lib/server/search/searxng.ts
@@ -179,14 +179,84 @@ function extractPageImage(html: string, baseUrl: string): string | null {
   }
 }
 
-type ThumbCacheEntry = { image: string | null; expires: number };
-const thumbCache = new Map<string, ThumbCacheEntry>();
-const THUMB_TTL_MS = 30 * 60 * 1000;
+/** Cache lifetime in days used when KOCHWAS_THUMB_TTL_DAYS is unset or unusable. */
+const THUMB_TTL_DAYS_DEFAULT = 30;
+/**
+ * Upper bound (~100 years) for the configured TTL. It keeps expires_at within
+ * four-digit years, so Date#toISOString() cannot throw and the string
+ * comparisons on the TEXT column remain chronological.
+ */
+const THUMB_TTL_DAYS_MAX = 36_500;
+/** Lifetime of a negative entry written after a failed page fetch. */
+const THUMB_FAILURE_TTL_MS = 30 * 60 * 1000;
+
+/**
+ * Parses KOCHWAS_THUMB_TTL_DAYS. Missing, empty, non-numeric, zero and
+ * negative values fall back to the default: Number('') is 0 (every row would
+ * expire immediately) and Number('abc') is NaN (toISOString() would throw a
+ * RangeError on every cache write).
+ */
+function resolveThumbTtlDays(raw: string | undefined): number {
+  const trimmed = raw?.trim();
+  const days = trimmed ? Number(trimmed) : Number.NaN;
+  if (!Number.isFinite(days) || days <= 0) return THUMB_TTL_DAYS_DEFAULT;
+  return Math.min(days, THUMB_TTL_DAYS_MAX);
+}
+
+const THUMB_TTL_DAYS = resolveThumbTtlDays(process.env.KOCHWAS_THUMB_TTL_DAYS);
+const THUMB_TTL_MS = THUMB_TTL_DAYS * 24 * 60 * 60 * 1000;
 
-async function enrichThumbnail(url: string): Promise<string | null> {
-  const now = Date.now();
-  const cached = thumbCache.get(url);
-  if (cached && cached.expires > now) return cached.image;
+/**
+ * Looks up the non-expired cache row for `url`.
+ *
+ * @returns `null` on a cache miss (no row, or the row has expired); otherwise
+ *   an object whose `image` is the cached image URL, or `null` for a cached
+ *   negative result.
+ */
+function readCachedThumbnail(
+  db: Database.Database,
+  url: string
+): { image: string | null } | null {
+  const row = db
+    .prepare<[string, string], { image: string | null }>(
+      "SELECT image FROM thumbnail_cache WHERE url = ? AND expires_at > ?"
+    )
+    .get(url, new Date().toISOString());
+  return row ?? null;
+}
+
+/**
+ * Inserts or replaces the cache row for `url`.
+ *
+ * @param image Image URL to store, or `null` for a negative result.
+ * @param ttlMs Lifetime of the row in milliseconds; defaults to the configured
+ *   long-term TTL.
+ */
+function writeCachedThumbnail(
+  db: Database.Database,
+  url: string,
+  image: string | null,
+  ttlMs = THUMB_TTL_MS
+): void {
+  const expiresAt = new Date(Date.now() + ttlMs).toISOString();
+  db.prepare(
+    'INSERT OR REPLACE INTO thumbnail_cache (url, image, expires_at) VALUES (?, ?, ?)'
+  ).run(url, image, expiresAt);
+}
+
+/**
+ * Resolves the page image for `url`: answered from thumbnail_cache while a
+ * fresh row exists, otherwise the page is fetched and the outcome is cached.
+ *
+ * @returns The image URL, or `null` when none was found or loading failed.
+ */
+async function enrichThumbnail(
+  db: Database.Database,
+  url: string
+): Promise<string | null> {
+  const cached = readCachedThumbnail(db, url);
+  if (cached) return cached.image;
+  let ttlMs = THUMB_TTL_MS;
   let image: string | null = null;
   try {
     const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 512 * 1024 });
@@ -194,23 +264,37 @@ async function enrichThumbnail(url: string): Promise<string | null> {
   } catch {
     image = null;
+    // Whatever failed above (timeout, network error, ...) may be transient, so
+    // keep this negative entry only briefly instead of for the full TTL.
+    ttlMs = THUMB_FAILURE_TTL_MS;
   }
-  thumbCache.set(url, { image, expires: now + THUMB_TTL_MS });
+  writeCachedThumbnail(db, url, image, ttlMs);
   return image;
 }
 
-async function enrichAllThumbnails(hits: WebHit[]): Promise<void> {
+/**
+ * Replaces `thumbnail` on each hit, in place, with the image resolved for the
+ * hit's URL; hits for which no image is found keep their current thumbnail.
+ */
+async function enrichAllThumbnails(
+  db: Database.Database,
+  hits: WebHit[]
+): Promise<void> {
   // Always fetch the page image even when SearXNG gave us a thumbnail —
   // the search engine's thumbnail is typically 150-200px, while og:image
   // / JSON-LD image on the page is the full-resolution recipe photo.
-  // The 30-min URL cache makes repeat searches instant.
+  // The thumbnail_cache table (default 30-day TTL) makes repeat searches instant.
   if (hits.length === 0) return;
+  // Lazy cleanup of expired entries — a range delete that idx_thumbnail_cache_expires can serve.
+  db.prepare('DELETE FROM thumbnail_cache WHERE expires_at <= ?').run(
+    new Date().toISOString()
+  );
   const queue = [...hits];
   const LIMIT = 6;
   const workers = Array.from({ length: Math.min(LIMIT, queue.length) }, async () => {
     while (queue.length > 0) {
       const h = queue.shift();
       if (!h) break;
-      const image = await enrichThumbnail(h.url);
+      const image = await enrichThumbnail(db, h.url);
       if (image) h.thumbnail = image;
     }
   });
@@ -272,7 +356,7 @@ export async function searchWeb(
     if (hits.length >= limit) break;
   }
   if (opts.enrichThumbnails !== false) {
-    await enrichAllThumbnails(hits);
+    await enrichAllThumbnails(db, hits);
   }
   return hits;
 }
diff --git a/tests/integration/searxng.test.ts b/tests/integration/searxng.test.ts
index ccbc069..b7d3016 100644
--- a/tests/integration/searxng.test.ts
+++ b/tests/integration/searxng.test.ts
@@ -184,6 +184,34 @@ describe('searchWeb', () => {
     }
   });
 
+  it('SQLite cache: second search does not re-fetch the page', async () => {
+    let pageHits = 0;
+    const pageServer = createServer((_req, res) => {
+      pageHits += 1;
+      res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' });
+      res.end('<html><head><meta property="og:image" content="https://cdn.example/c.jpg" /></head></html>');
+    });
+    await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r));
+    const addr = pageServer.address() as AddressInfo;
+    const pageUrl = `http://127.0.0.1:${addr.port}/cached`;
+    try {
+      const db = openInMemoryForTest();
+      addDomain(db, '127.0.0.1');
+      respondWith([{ url: pageUrl, title: 'C', content: '' }]);
+      const first = await searchWeb(db, 'c', { searxngUrl: baseUrl });
+      const second = await searchWeb(db, 'c', { searxngUrl: baseUrl });
+      expect(first[0].thumbnail).toBe('https://cdn.example/c.jpg');
+      expect(second[0].thumbnail).toBe('https://cdn.example/c.jpg');
+      expect(pageHits).toBe(1); // second call read from SQLite cache
+      const row = db
+        .prepare('SELECT image FROM thumbnail_cache WHERE url = ?')
+        .get(pageUrl) as { image: string };
+      expect(row.image).toBe('https://cdn.example/c.jpg');
+    } finally {
+      await new Promise<void>((r) => pageServer.close(() => r()));
+    }
+  });
+
   it('filters out forum/magazine/listing URLs', async () => {
     const db = openInMemoryForTest();
     addDomain(db, 'chefkoch.de');