fix(search): enrich missing SearXNG thumbnails with og:image
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 55s
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 55s
SearXNG liefert je nach Seite mal ein thumbnail/img_src mit, mal nicht — bei Chefkoch-Treffern hatte deshalb zufällig die Hälfte der Kacheln einen Platzhalter, obwohl die Vorschau dann sehr wohl ein Bild fand.

searchWeb() holt jetzt für jeden Treffer ohne Thumbnail parallel (max. 6 gleichzeitig, 4 s Timeout pro Request) die Seite und extrahiert das og:image- oder twitter:image-Meta-Tag. Das Ergebnis wird 30 min in-memory gecacht, damit wiederholte Suchen nicht wieder die gleichen Seiten laden.

Tests:
- Neuer Test: Treffer ohne Thumbnail wird via og:image angereichert.
- Neuer Test: Treffer mit Thumbnail bleibt unverändert (kein Fetch).
- Bestehende Tests deaktivieren das Enrichment via enrichThumbnails:false, damit sie keine echten Chefkoch-URLs aufrufen.
This commit is contained in:
@@ -77,10 +77,61 @@ function looksLikeRecipePage(url: string): boolean {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Matches a `<meta>` tag whose `property`/`name` is `og:image`, `og:image:url`
 * or `og:image:secure_url` and that lists `content` AFTER it.
 * Capture group 1 is the raw `content` value.
 */
const OG_IMAGE_RE =
  /<meta[^>]+(?:property|name)=["']og:image(?::(?:secure_)?url)?["'][^>]+content=["']([^"']+)["']/i;

/** Same as OG_IMAGE_RE, for tags that list `content` BEFORE `property`/`name`. */
const OG_IMAGE_RE_REV =
  /<meta[^>]+content=["']([^"']+)["'][^>]+(?:property|name)=["']og:image(?::(?:secure_)?url)?["']/i;

/** Matches `twitter:image` / `twitter:image:src` with `content` AFTER the key attribute. */
const TWITTER_IMAGE_RE =
  /<meta[^>]+(?:property|name)=["']twitter:image(?::src)?["'][^>]+content=["']([^"']+)["']/i;

/** Same as TWITTER_IMAGE_RE, for tags that list `content` BEFORE `property`/`name`. */
const TWITTER_IMAGE_RE_REV =
  /<meta[^>]+content=["']([^"']+)["'][^>]+(?:property|name)=["']twitter:image(?::src)?["']/i;

/**
 * Extracts a preview image URL from a page's `<meta>` tags.
 *
 * Lookup order: Open Graph image (either attribute order) first, then the
 * Twitter card image (either attribute order). The first pattern that matches
 * wins; if its value is unusable the function returns null rather than trying
 * the next pattern.
 *
 * @param html    Raw HTML of the page (only the `<meta>` tags are inspected).
 * @param baseUrl Optional URL of the page itself. When given, relative or
 *                protocol-relative image references are resolved against it;
 *                when omitted, only absolute URLs are accepted.
 * @returns The normalized absolute http(s) image URL, or null when no tag is
 *          found or its value is not a usable http(s) URL.
 */
function extractOgImage(html: string, baseUrl?: string): string | null {
  const m =
    OG_IMAGE_RE.exec(html) ??
    OG_IMAGE_RE_REV.exec(html) ??
    TWITTER_IMAGE_RE.exec(html) ??
    TWITTER_IMAGE_RE_REV.exec(html);
  const raw = m?.[1];
  if (!raw) return null;

  // Attribute values are HTML-escaped: a query string such as `?w=1&amp;h=2`
  // must be un-escaped, otherwise the parameter name becomes `amp;h`.
  const href = raw.replace(/&(?:amp|#38|#x26);/gi, '&');

  try {
    const resolved = new URL(href, baseUrl);
    // The value comes from an arbitrary third-party page; only hand out
    // schemes that make sense as an image source.
    if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') return null;
    return resolved.toString();
  } catch {
    // Not parseable as a URL (e.g. relative path without a baseUrl).
    return null;
  }
}
|
||||||
|
|
||||||
|
/** Cached lookup result for one page URL; `image` is null when no image was found or the fetch failed. */
type ThumbCacheEntry = { image: string | null; expires: number };

/** In-memory cache keyed by page URL. */
const thumbCache = new Map<string, ThumbCacheEntry>();

/** How long a lookup result (positive or negative) stays valid: 30 minutes. */
const THUMB_TTL_MS = 30 * 60 * 1000;

/** Upper bound on cached URLs so a long-running process cannot grow the cache without limit. */
const THUMB_CACHE_MAX_ENTRIES = 2_000;

/**
 * Looks up a preview image for a page by fetching it and reading its
 * og:image / twitter:image meta tag (via `extractOgImage`).
 *
 * Results, including "nothing found" and fetch failures, are cached per URL
 * for THUMB_TTL_MS so repeated searches do not reload the same pages.
 * Never rejects: any error from the fetch is treated as "no image".
 *
 * @param url Page URL to inspect.
 * @returns The image URL, or null when none could be determined.
 */
async function enrichThumbnail(url: string): Promise<string | null> {
  const now = Date.now();
  const cached = thumbCache.get(url);
  if (cached) {
    if (cached.expires > now) return cached.image;
    // Stale: remove it so the fresh result is re-inserted at the end of the
    // Map's insertion order (Map.set on an existing key keeps its old position).
    thumbCache.delete(url);
  }

  let image: string | null = null;
  try {
    // fetchText is called with a 4 s timeout and a 256 KiB size limit.
    const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 256 * 1024 });
    image = extractOgImage(html);
  } catch {
    // Best effort: a missing thumbnail must never fail the search.
    image = null;
  }

  if (thumbCache.size >= THUMB_CACHE_MAX_ENTRIES) {
    // First drop everything that has already expired ...
    for (const [key, entry] of thumbCache) {
      if (entry.expires <= now) thumbCache.delete(key);
    }
    // ... and if that was not enough, evict the oldest insertions.
    for (const key of thumbCache.keys()) {
      if (thumbCache.size < THUMB_CACHE_MAX_ENTRIES) break;
      thumbCache.delete(key);
    }
  }

  thumbCache.set(url, { image, expires: now + THUMB_TTL_MS });
  return image;
}
|
||||||
|
|
||||||
|
/**
 * Fills in `thumbnail` on every hit that does not have one yet, in place.
 *
 * Hits that already carry a thumbnail are not touched and cause no lookup.
 * At most six lookups run at the same time; each worker pulls the next
 * pending hit as soon as its previous lookup has settled. A hit whose lookup
 * yields nothing keeps its missing thumbnail.
 *
 * @param hits Search hits to enrich (mutated).
 */
async function enrichMissingThumbnails(hits: WebHit[]): Promise<void> {
  const pending = hits.filter((hit) => !hit.thumbnail);
  if (pending.length === 0) return;

  const MAX_PARALLEL = 6;
  // One iterator shared by all workers: every `next()` hands out a different
  // hit, so each pending hit is processed exactly once, in order.
  const remaining = pending.values();

  const drain = async (): Promise<void> => {
    for (const hit of remaining) {
      const found = await enrichThumbnail(hit.url);
      if (found) hit.thumbnail = found;
    }
  };

  const workerCount = Math.min(MAX_PARALLEL, pending.length);
  await Promise.all(Array.from({ length: workerCount }, () => drain()));
}
|
||||||
|
|
||||||
export async function searchWeb(
|
export async function searchWeb(
|
||||||
db: Database.Database,
|
db: Database.Database,
|
||||||
query: string,
|
query: string,
|
||||||
opts: { searxngUrl?: string; limit?: number } = {}
|
opts: { searxngUrl?: string; limit?: number; enrichThumbnails?: boolean } = {}
|
||||||
): Promise<WebHit[]> {
|
): Promise<WebHit[]> {
|
||||||
const trimmed = query.trim();
|
const trimmed = query.trim();
|
||||||
if (!trimmed) return [];
|
if (!trimmed) return [];
|
||||||
@@ -131,5 +182,8 @@ export async function searchWeb(
|
|||||||
});
|
});
|
||||||
if (hits.length >= limit) break;
|
if (hits.length >= limit) break;
|
||||||
}
|
}
|
||||||
|
if (opts.enrichThumbnails !== false) {
|
||||||
|
await enrichMissingThumbnails(hits);
|
||||||
|
}
|
||||||
return hits;
|
return hits;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ describe('searchWeb', () => {
|
|||||||
content: 'blocked'
|
content: 'blocked'
|
||||||
}
|
}
|
||||||
]);
|
]);
|
||||||
const hits = await searchWeb(db, 'carbonara', { searxngUrl: baseUrl });
|
const hits = await searchWeb(db, 'carbonara', { searxngUrl: baseUrl, enrichThumbnails: false });
|
||||||
expect(hits.length).toBe(1);
|
expect(hits.length).toBe(1);
|
||||||
expect(hits[0].domain).toBe('chefkoch.de');
|
expect(hits[0].domain).toBe('chefkoch.de');
|
||||||
expect(hits[0].title).toBe('Carbonara');
|
expect(hits[0].title).toBe('Carbonara');
|
||||||
@@ -55,23 +55,60 @@ describe('searchWeb', () => {
|
|||||||
{ url: 'https://www.chefkoch.de/a', title: 'A', content: '' },
|
{ url: 'https://www.chefkoch.de/a', title: 'A', content: '' },
|
||||||
{ url: 'https://www.chefkoch.de/a', title: 'A dup', content: '' }
|
{ url: 'https://www.chefkoch.de/a', title: 'A dup', content: '' }
|
||||||
]);
|
]);
|
||||||
const hits = await searchWeb(db, 'a', { searxngUrl: baseUrl });
|
const hits = await searchWeb(db, 'a', { searxngUrl: baseUrl, enrichThumbnails: false });
|
||||||
expect(hits.length).toBe(1);
|
expect(hits.length).toBe(1);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('returns empty list when no domains configured', async () => {
|
it('returns empty list when no domains configured', async () => {
|
||||||
const db = openInMemoryForTest();
|
const db = openInMemoryForTest();
|
||||||
const hits = await searchWeb(db, 'x', { searxngUrl: baseUrl });
|
const hits = await searchWeb(db, 'x', { searxngUrl: baseUrl, enrichThumbnails: false });
|
||||||
expect(hits).toEqual([]);
|
expect(hits).toEqual([]);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('returns empty for empty query', async () => {
|
it('returns empty for empty query', async () => {
|
||||||
const db = openInMemoryForTest();
|
const db = openInMemoryForTest();
|
||||||
addDomain(db, 'chefkoch.de');
|
addDomain(db, 'chefkoch.de');
|
||||||
const hits = await searchWeb(db, ' ', { searxngUrl: baseUrl });
|
const hits = await searchWeb(db, ' ', { searxngUrl: baseUrl, enrichThumbnails: false });
|
||||||
expect(hits).toEqual([]);
|
expect(hits).toEqual([]);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('enriches missing thumbnails from og:image', async () => {
|
||||||
|
const pageServer = createServer((_req, res) => {
|
||||||
|
res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' });
|
||||||
|
res.end(
|
||||||
|
'<html><head><meta property="og:image" content="https://cdn.example/foo.jpg" /></head><body></body></html>'
|
||||||
|
);
|
||||||
|
});
|
||||||
|
await new Promise<void>((r) => pageServer.listen(0, '127.0.0.1', r));
|
||||||
|
const addr = pageServer.address() as AddressInfo;
|
||||||
|
const pageUrl = `http://127.0.0.1:${addr.port}/rezept`;
|
||||||
|
try {
|
||||||
|
const db = openInMemoryForTest();
|
||||||
|
addDomain(db, '127.0.0.1');
|
||||||
|
respondWith([{ url: pageUrl, title: 'Kuchen', content: '' }]);
|
||||||
|
const hits = await searchWeb(db, 'kuchen', { searxngUrl: baseUrl });
|
||||||
|
expect(hits.length).toBe(1);
|
||||||
|
expect(hits[0].thumbnail).toBe('https://cdn.example/foo.jpg');
|
||||||
|
} finally {
|
||||||
|
await new Promise<void>((r) => pageServer.close(() => r()));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
it('leaves existing thumbnails untouched (no enrichment fetch)', async () => {
|
||||||
|
const db = openInMemoryForTest();
|
||||||
|
addDomain(db, 'chefkoch.de');
|
||||||
|
respondWith([
|
||||||
|
{
|
||||||
|
url: 'https://www.chefkoch.de/rezepte/1/x.html',
|
||||||
|
title: 'X',
|
||||||
|
thumbnail: 'https://cdn.chefkoch/x.jpg'
|
||||||
|
}
|
||||||
|
]);
|
||||||
|
// enrichment enabled, but thumbnail is set → no fetch expected, no hang
|
||||||
|
const hits = await searchWeb(db, 'x', { searxngUrl: baseUrl });
|
||||||
|
expect(hits[0].thumbnail).toBe('https://cdn.chefkoch/x.jpg');
|
||||||
|
});
|
||||||
|
|
||||||
it('filters out forum/magazine/listing URLs', async () => {
|
it('filters out forum/magazine/listing URLs', async () => {
|
||||||
const db = openInMemoryForTest();
|
const db = openInMemoryForTest();
|
||||||
addDomain(db, 'chefkoch.de');
|
addDomain(db, 'chefkoch.de');
|
||||||
@@ -83,7 +120,7 @@ describe('searchWeb', () => {
|
|||||||
{ url: 'https://www.chefkoch.de/themen/ravioli/', title: 'Themen' },
|
{ url: 'https://www.chefkoch.de/themen/ravioli/', title: 'Themen' },
|
||||||
{ url: 'https://www.chefkoch.de/rezepte/', title: 'Rezepte Übersicht' }
|
{ url: 'https://www.chefkoch.de/rezepte/', title: 'Rezepte Übersicht' }
|
||||||
]);
|
]);
|
||||||
const hits = await searchWeb(db, 'ravioli', { searxngUrl: baseUrl });
|
const hits = await searchWeb(db, 'ravioli', { searxngUrl: baseUrl, enrichThumbnails: false });
|
||||||
expect(hits.length).toBe(1);
|
expect(hits.length).toBe(1);
|
||||||
expect(hits[0].title).toBe('Ravioli');
|
expect(hits[0].title).toBe('Ravioli');
|
||||||
});
|
});
|
||||||
|
|||||||
Reference in New Issue
Block a user