diff --git a/src/lib/server/search/searxng.ts b/src/lib/server/search/searxng.ts index 9ec6a93..dbea01e 100644 --- a/src/lib/server/search/searxng.ts +++ b/src/lib/server/search/searxng.ts @@ -1,4 +1,5 @@ import type Database from 'better-sqlite3'; +import { parseHTML } from 'linkedom'; import { listDomains, normalizeDomain } from '../domains/repository'; import { fetchText } from '../http'; @@ -77,18 +78,102 @@ function looksLikeRecipePage(url: string): boolean { } } -const OG_IMAGE_RE = - /]+(?:property|name)=["']og:image(?::url)?["'][^>]+content=["']([^"']+)["']/i; -const OG_IMAGE_RE_REV = - /]+content=["']([^"']+)["'][^>]+(?:property|name)=["']og:image(?::url)?["']/i; -const TWITTER_IMAGE_RE = - /]+(?:property|name)=["']twitter:image["'][^>]+content=["']([^"']+)["']/i; - -function extractOgImage(html: string): string | null { - const m = OG_IMAGE_RE.exec(html) ?? OG_IMAGE_RE_REV.exec(html) ?? TWITTER_IMAGE_RE.exec(html); - if (!m) return null; +function resolveUrl(href: string, baseUrl: string): string | null { try { - return new URL(m[1]).toString(); + return new URL(href, baseUrl).toString(); + } catch { + return null; + } +} + +function imageFromJsonLd(data: unknown): string | null { + if (!data) return null; + if (Array.isArray(data)) { + for (const d of data) { + const img = imageFromJsonLd(d); + if (img) return img; + } + return null; + } + if (typeof data !== 'object') return null; + const node = data as Record; + if (Array.isArray(node['@graph'])) { + for (const d of node['@graph']) { + const img = imageFromJsonLd(d); + if (img) return img; + } + } + const image = node.image; + if (typeof image === 'string') return image; + if (Array.isArray(image) && image.length > 0) { + const first = image[0]; + if (typeof first === 'string') return first; + if (first && typeof first === 'object' && 'url' in first) { + const url = (first as Record).url; + if (typeof url === 'string') return url; + } + } + if (image && typeof image === 'object' && 'url' in image) { + const url = (image as Record).url; + if (typeof url === 'string') return url; + } + return null; +} + +const META_IMAGE_KEYS = new Set([ + 'og:image', + 'og:image:url', + 'og:image:secure_url', + 'twitter:image', + 'twitter:image:src' +]); + +function extractPageImage(html: string, baseUrl: string): string | null { + try { + const { document } = parseHTML(html); + // 1. OpenGraph / Twitter meta tags + for (const m of Array.from(document.querySelectorAll('meta'))) { + const key = (m.getAttribute('property') ?? m.getAttribute('name') ?? '').toLowerCase(); + if (!META_IMAGE_KEYS.has(key)) continue; + const content = m.getAttribute('content'); + if (!content) continue; + const resolved = resolveUrl(content, baseUrl); + if (resolved) return resolved; + } + // 2. + const link = document.querySelector('link[rel="image_src"]'); + if (link) { + const href = link.getAttribute('href'); + if (href) { + const resolved = resolveUrl(href, baseUrl); + if (resolved) return resolved; + } + } + // 3. JSON-LD image (Recipe schema etc.) + for (const s of Array.from(document.querySelectorAll('script[type="application/ld+json"]'))) { + try { + const data = JSON.parse(s.textContent ?? ''); + const img = imageFromJsonLd(data); + if (img) { + const resolved = resolveUrl(img, baseUrl); + if (resolved) return resolved; + } + } catch { + // malformed JSON-LD — skip + } + } + // 4. First content image in article/main + const contentImg = document.querySelector( + 'article img[src], main img[src], .entry-content img[src], .post-content img[src], figure img[src]' + ); + if (contentImg) { + const src = contentImg.getAttribute('src') ?? contentImg.getAttribute('data-src'); + if (src) { + const resolved = resolveUrl(src, baseUrl); + if (resolved) return resolved; + } + } + return null; } catch { return null; } @@ -104,8 +189,8 @@ async function enrichThumbnail(url: string): Promise { if (cached && cached.expires > now) return cached.image; let image: string | null = null; try { - const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 256 * 1024 }); - image = extractOgImage(html); + const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 512 * 1024 }); + image = extractPageImage(html, url); } catch { image = null; } diff --git a/src/routes/+page.svelte b/src/routes/+page.svelte index 107c9e5..8f6503c 100644 --- a/src/routes/+page.svelte +++ b/src/routes/+page.svelte @@ -1,6 +1,5 @@ + `); + }); + await new Promise((r) => pageServer.listen(0, '127.0.0.1', r)); + const addr = pageServer.address() as AddressInfo; + const pageUrl = `http://127.0.0.1:${addr.port}/pie`; + try { + const db = openInMemoryForTest(); + addDomain(db, '127.0.0.1'); + respondWith([{ url: pageUrl, title: 'Pie', content: '' }]); + const hits = await searchWeb(db, 'pie', { searxngUrl: baseUrl }); + expect(hits[0].thumbnail).toBe('https://cdn.example/pie.jpg'); + } finally { + await new Promise((r) => pageServer.close(() => r())); + } + }); + + it('falls back to first content image when no meta/JSON-LD image', async () => { + const pageServer = createServer((_req, res) => { + res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' }); + res.end( + '
' + ); + }); + await new Promise((r) => pageServer.listen(0, '127.0.0.1', r)); + const addr = pageServer.address() as AddressInfo; + const pageUrl = `http://127.0.0.1:${addr.port}/article`; + try { + const db = openInMemoryForTest(); + addDomain(db, '127.0.0.1'); + respondWith([{ url: pageUrl, title: 'Dish', content: '' }]); + const hits = await searchWeb(db, 'dish', { searxngUrl: baseUrl }); + expect(hits[0].thumbnail).toBe(`http://127.0.0.1:${addr.port}/uploads/dish.jpg`); + } finally { + await new Promise((r) => pageServer.close(() => r())); + } + }); + it('leaves existing thumbnails untouched (no enrichment fetch)', async () => { const db = openInMemoryForTest(); addDomain(db, 'chefkoch.de');