feat(search): persistenter Thumbnail-Cache in SQLite, Default-TTL 30 Tage
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 54s
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 54s
Vorher: In-Memory-Map, TTL 30 Minuten. Container-Neustart verwarf den kompletten Cache, also musste nach jedem Deploy jede Suche wieder alle Seiten laden.

Jetzt:
- Neue Tabelle thumbnail_cache (url PK, image, expires_at)
- Default-TTL 30 Tage, per Env KOCHWAS_THUMB_TTL_DAYS konfigurierbar (7, 365, was der User will — ist alles ok laut Nutzer)
- Negative Cache: Seiten ohne Bild werden mit image=NULL gespeichert, damit wir nicht bei jeder Suche die gleiche kaputte Seite wieder laden
- Lazy-Cleanup: pro searchWeb-Aufruf werden abgelaufene Zeilen via DELETE ... WHERE expires_at <= now() weggeräumt (Index-Scan, billig)

Migration 003_thumbnail_cache.sql: nicht-destruktiv, nur neue Tabelle. Bestehende DB bekommt sie beim nächsten Start automatisch dazu.

Tests (99/99):
- Neuer Cache-Test: zweiter searchWeb für dieselbe URL macht keinen Page-Fetch mehr und liest die image-Spalte aus SQLite.
This commit is contained in:
10
src/lib/server/db/migrations/003_thumbnail_cache.sql
Normal file
10
src/lib/server/db/migrations/003_thumbnail_cache.sql
Normal file
@@ -0,0 +1,10 @@
|
||||
-- Long-term cache for page → image URL mappings extracted via og:image,
|
||||
-- JSON-LD, or first content <img>. Fetching every recipe page on every
|
||||
-- search is expensive; store the mapping with a 30-day default TTL.
|
||||
CREATE TABLE thumbnail_cache (
|
||||
url TEXT PRIMARY KEY,
|
||||
image TEXT, -- NULL = page has no image (cache the negative too)
|
||||
expires_at TEXT NOT NULL -- ISO-8601 UTC
|
||||
);
|
||||
|
||||
CREATE INDEX idx_thumbnail_cache_expires ON thumbnail_cache(expires_at);
|
||||
@@ -179,14 +179,38 @@ function extractPageImage(html: string, baseUrl: string): string | null {
|
||||
}
|
||||
}
|
||||
|
||||
type ThumbCacheEntry = { image: string | null; expires: number };
|
||||
const thumbCache = new Map<string, ThumbCacheEntry>();
|
||||
const THUMB_TTL_MS = 30 * 60 * 1000;
|
||||
/** Cache lifetime in days used when KOCHWAS_THUMB_TTL_DAYS is unset or unusable. */
const DEFAULT_THUMB_TTL_DAYS = 30;

/**
 * Upper bound for the configured lifetime (~100 years). Keeps
 * `Date.now() + THUMB_TTL_MS` inside the range a `Date` can represent, so
 * `toISOString()` cannot throw a RangeError when the expiry is computed.
 */
const MAX_THUMB_TTL_DAYS = 36_500;

/**
 * Parses the raw KOCHWAS_THUMB_TTL_DAYS value into a usable number of days.
 *
 * @param raw - The environment variable's value, or `undefined` when unset.
 * @returns The configured number of days, clamped to MAX_THUMB_TTL_DAYS.
 *   Falls back to DEFAULT_THUMB_TTL_DAYS when the value is missing, blank,
 *   not a finite number, or negative. An explicit `0` is kept as-is; entries
 *   then expire immediately.
 */
function parseThumbTtlDays(raw: string | undefined): number {
  const trimmed = raw?.trim() ?? '';
  // Number('') is 0, which would silently turn a blank variable into "no caching".
  if (trimmed === '') return DEFAULT_THUMB_TTL_DAYS;
  const days = Number(trimmed);
  // NaN / ±Infinity would make the expiry timestamp an invalid Date.
  if (!Number.isFinite(days) || days < 0) return DEFAULT_THUMB_TTL_DAYS;
  return Math.min(days, MAX_THUMB_TTL_DAYS);
}

const THUMB_TTL_DAYS = parseThumbTtlDays(process.env.KOCHWAS_THUMB_TTL_DAYS);
const THUMB_TTL_MS = THUMB_TTL_DAYS * 24 * 60 * 60 * 1000;
|
||||
|
||||
async function enrichThumbnail(url: string): Promise<string | null> {
|
||||
const now = Date.now();
|
||||
const cached = thumbCache.get(url);
|
||||
if (cached && cached.expires > now) return cached.image;
|
||||
/**
 * Looks up a still-valid thumbnail_cache row for a page URL.
 *
 * @param db - Open database handle containing the thumbnail_cache table.
 * @param url - Page URL used as the cache key.
 * @returns `{ image }` when an unexpired row exists (`image` may itself be
 *   `null`, which is the stored "page has no image" marker), or `null` when
 *   there is no row or it has expired.
 */
function readCachedThumbnail(
  db: Database.Database,
  url: string
): { image: string | null } | null {
  const lookup = db.prepare<[string, string], { image: string | null }>(
    "SELECT image FROM thumbnail_cache WHERE url = ? AND expires_at > ?"
  );
  // expires_at is stored as an ISO-8601 UTC string, so it is compared
  // against the current time in the same textual format.
  const nowIso = new Date().toISOString();
  const hit = lookup.get(url, nowIso);
  if (hit == null) {
    return null;
  }
  return hit;
}
|
||||
|
||||
/**
 * Stores (or overwrites) the cached image for a page URL.
 *
 * @param db - Open database handle containing the thumbnail_cache table.
 * @param url - Page URL used as the cache key (primary key of the table).
 * @param image - Image URL found on the page, or `null` to record that the
 *   page yielded no image.
 */
function writeCachedThumbnail(
  db: Database.Database,
  url: string,
  image: string | null
): void {
  // Expiry is "now + THUMB_TTL_MS", serialised as an ISO-8601 UTC string.
  const expiryMs = Date.now() + THUMB_TTL_MS;
  const expiresAt = new Date(expiryMs).toISOString();
  const upsert = db.prepare(
    'INSERT OR REPLACE INTO thumbnail_cache (url, image, expires_at) VALUES (?, ?, ?)'
  );
  upsert.run(url, image, expiresAt);
}
|
||||
|
||||
async function enrichThumbnail(
|
||||
db: Database.Database,
|
||||
url: string
|
||||
): Promise<string | null> {
|
||||
const cached = readCachedThumbnail(db, url);
|
||||
if (cached) return cached.image;
|
||||
let image: string | null = null;
|
||||
try {
|
||||
const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 512 * 1024 });
|
||||
@@ -194,23 +218,30 @@ async function enrichThumbnail(url: string): Promise<string | null> {
|
||||
} catch {
|
||||
image = null;
|
||||
}
|
||||
thumbCache.set(url, { image, expires: now + THUMB_TTL_MS });
|
||||
writeCachedThumbnail(db, url, image);
|
||||
return image;
|
||||
}
|
||||
|
||||
async function enrichAllThumbnails(hits: WebHit[]): Promise<void> {
|
||||
async function enrichAllThumbnails(
|
||||
db: Database.Database,
|
||||
hits: WebHit[]
|
||||
): Promise<void> {
|
||||
// Always fetch the page image even when SearXNG gave us a thumbnail —
|
||||
// the search engine's thumbnail is typically 150-200px, while og:image
|
||||
// / JSON-LD image on the page is the full-resolution recipe photo.
|
||||
// The 30-min URL cache makes repeat searches instant.
|
||||
// The thumbnail_cache table (default 30-day TTL) makes repeat searches instant.
|
||||
if (hits.length === 0) return;
|
||||
// Lazy cleanup of expired entries — the index on expires_at lets SQLite find them without a full table scan, so this stays cheap.
|
||||
db.prepare('DELETE FROM thumbnail_cache WHERE expires_at <= ?').run(
|
||||
new Date().toISOString()
|
||||
);
|
||||
const queue = [...hits];
|
||||
const LIMIT = 6;
|
||||
const workers = Array.from({ length: Math.min(LIMIT, queue.length) }, async () => {
|
||||
while (queue.length > 0) {
|
||||
const h = queue.shift();
|
||||
if (!h) break;
|
||||
const image = await enrichThumbnail(h.url);
|
||||
const image = await enrichThumbnail(db, h.url);
|
||||
if (image) h.thumbnail = image;
|
||||
}
|
||||
});
|
||||
@@ -272,7 +303,7 @@ export async function searchWeb(
|
||||
if (hits.length >= limit) break;
|
||||
}
|
||||
if (opts.enrichThumbnails !== false) {
|
||||
await enrichAllThumbnails(hits);
|
||||
await enrichAllThumbnails(db, hits);
|
||||
}
|
||||
return hits;
|
||||
}
|
||||
|
||||
@@ -184,6 +184,34 @@ describe('searchWeb', () => {
|
||||
}
|
||||
});
|
||||
|
||||
// Two identical searches against the same DB: the page server must be hit
// exactly once, and the extracted og:image must be persisted in thumbnail_cache.
it('SQLite cache: second search does not re-fetch the page', async () => {
  let pageHits = 0;
  // Local HTTP server standing in for a recipe page; counts every request.
  const pageServer = createServer((_req, res) => {
    pageHits += 1;
    res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' });
    res.end('<html><head><meta property="og:image" content="https://cdn.example/c.jpg"></head></html>');
  });
  await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r));
  const addr = pageServer.address() as AddressInfo;
  const pageUrl = `http://127.0.0.1:${addr.port}/cached`;
  try {
    const db = openInMemoryForTest();
    addDomain(db, '127.0.0.1');
    respondWith([{ url: pageUrl, title: 'C', content: '' }]);
    const first = await searchWeb(db, 'c', { searxngUrl: baseUrl });
    const second = await searchWeb(db, 'c', { searxngUrl: baseUrl });
    expect(first[0].thumbnail).toBe('https://cdn.example/c.jpg');
    expect(second[0].thumbnail).toBe('https://cdn.example/c.jpg');
    expect(pageHits).toBe(1); // second call read from SQLite cache
    // Typed statement instead of an `as` cast: `get` yields `undefined` when
    // no row exists and the column is nullable, so a missing row fails the
    // assertion below cleanly instead of throwing a TypeError.
    const row = db
      .prepare<[string], { image: string | null }>(
        'SELECT image FROM thumbnail_cache WHERE url = ?'
      )
      .get(pageUrl);
    expect(row?.image).toBe('https://cdn.example/c.jpg');
  } finally {
    await new Promise<void>((r) => pageServer.close(() => r()));
  }
});
|
||||
|
||||
it('filters out forum/magazine/listing URLs', async () => {
|
||||
const db = openInMemoryForTest();
|
||||
addDomain(db, 'chefkoch.de');
|
||||
|
||||
Reference in New Issue
Block a user