From 211d58ebecd56032c4057346d389725576869cb6 Mon Sep 17 00:00:00 2001 From: hsiegeln <37154749+hsiegeln@users.noreply.github.com> Date: Fri, 17 Apr 2026 18:04:59 +0200 Subject: [PATCH] feat(search): Enter bleibt auf Seite + robustere Thumbnail-Erkennung MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Startseite: - Enter/Return löst die Suche jetzt sofort aus (cancelt den Debounce), navigiert aber NICHT mehr auf /search. Der Anwender bleibt auf der gleichen Seite mit Inline-Ergebnissen. Thumbnail-Enrichment (searxng.ts): - Regex-basierte og:image-Extraktion durch linkedom-parseHTML ersetzt. - Neue Fallback-Kette (in dieser Reihenfolge): 1. OpenGraph-/Twitter-Meta-Tags (og:image, og:image:url, og:image:secure_url, twitter:image, twitter:image:src) 2. link[rel="image_src"] 3. JSON-LD image (auch tief in @graph; "image" als String, Array, Objekt-mit-url) 4. Erstes img[src] in article/main/.entry-content/.post-content/figure - Relative URLs werden gegen die Seiten-URL zu absoluten aufgelöst (z.B. /uploads/foo.jpg → http://host/uploads/foo.jpg). - maxBytes von 256 KB auf 512 KB angehoben, damit JSON-LD-lastige Recipe-Seiten nicht mitten im Script abgeschnitten werden. Tests (97/97): - Neu: JSON-LD-Image-Fallback-Test. - Neu: Content-img-Fallback-Test mit relativer URL, die zur absoluten aufgelöst wird. 
--- src/lib/server/search/searxng.ts | 111 ++++++++++++++++++++++++++---- src/routes/+page.svelte | 63 +++++++++-------- tests/integration/searxng.test.ts | 46 +++++++++++++ 3 files changed, 178 insertions(+), 42 deletions(-) diff --git a/src/lib/server/search/searxng.ts b/src/lib/server/search/searxng.ts index 9ec6a93..dbea01e 100644 --- a/src/lib/server/search/searxng.ts +++ b/src/lib/server/search/searxng.ts @@ -1,4 +1,5 @@ import type Database from 'better-sqlite3'; +import { parseHTML } from 'linkedom'; import { listDomains, normalizeDomain } from '../domains/repository'; import { fetchText } from '../http'; @@ -77,18 +78,102 @@ function looksLikeRecipePage(url: string): boolean { } } -const OG_IMAGE_RE = - /]+(?:property|name)=["']og:image(?::url)?["'][^>]+content=["']([^"']+)["']/i; -const OG_IMAGE_RE_REV = - /]+content=["']([^"']+)["'][^>]+(?:property|name)=["']og:image(?::url)?["']/i; -const TWITTER_IMAGE_RE = - /]+(?:property|name)=["']twitter:image["'][^>]+content=["']([^"']+)["']/i; - -function extractOgImage(html: string): string | null { - const m = OG_IMAGE_RE.exec(html) ?? OG_IMAGE_RE_REV.exec(html) ?? 
TWITTER_IMAGE_RE.exec(html); - if (!m) return null; +function resolveUrl(href: string, baseUrl: string): string | null { try { - return new URL(m[1]).toString(); + return new URL(href, baseUrl).toString(); + } catch { + return null; + } +} + +function imageFromJsonLd(data: unknown): string | null { + if (!data) return null; + if (Array.isArray(data)) { + for (const d of data) { + const img = imageFromJsonLd(d); + if (img) return img; + } + return null; + } + if (typeof data !== 'object') return null; + const node = data as Record; + if (Array.isArray(node['@graph'])) { + for (const d of node['@graph']) { + const img = imageFromJsonLd(d); + if (img) return img; + } + } + const image = node.image; + if (typeof image === 'string') return image; + if (Array.isArray(image) && image.length > 0) { + const first = image[0]; + if (typeof first === 'string') return first; + if (first && typeof first === 'object' && 'url' in first) { + const url = (first as Record).url; + if (typeof url === 'string') return url; + } + } + if (image && typeof image === 'object' && 'url' in image) { + const url = (image as Record).url; + if (typeof url === 'string') return url; + } + return null; +} + +const META_IMAGE_KEYS = new Set([ + 'og:image', + 'og:image:url', + 'og:image:secure_url', + 'twitter:image', + 'twitter:image:src' +]); + +function extractPageImage(html: string, baseUrl: string): string | null { + try { + const { document } = parseHTML(html); + // 1. OpenGraph / Twitter meta tags + for (const m of Array.from(document.querySelectorAll('meta'))) { + const key = (m.getAttribute('property') ?? m.getAttribute('name') ?? '').toLowerCase(); + if (!META_IMAGE_KEYS.has(key)) continue; + const content = m.getAttribute('content'); + if (!content) continue; + const resolved = resolveUrl(content, baseUrl); + if (resolved) return resolved; + } + // 2. 
+ const link = document.querySelector('link[rel="image_src"]'); + if (link) { + const href = link.getAttribute('href'); + if (href) { + const resolved = resolveUrl(href, baseUrl); + if (resolved) return resolved; + } + } + // 3. JSON-LD image (Recipe schema etc.) + for (const s of Array.from(document.querySelectorAll('script[type="application/ld+json"]'))) { + try { + const data = JSON.parse(s.textContent ?? ''); + const img = imageFromJsonLd(data); + if (img) { + const resolved = resolveUrl(img, baseUrl); + if (resolved) return resolved; + } + } catch { + // malformed JSON-LD — skip + } + } + // 4. First content image in article/main + const contentImg = document.querySelector( + 'article img[src], main img[src], .entry-content img[src], .post-content img[src], figure img[src]' + ); + if (contentImg) { + const src = contentImg.getAttribute('src') ?? contentImg.getAttribute('data-src'); + if (src) { + const resolved = resolveUrl(src, baseUrl); + if (resolved) return resolved; + } + } + return null; } catch { return null; } @@ -104,8 +189,8 @@ async function enrichThumbnail(url: string): Promise { if (cached && cached.expires > now) return cached.image; let image: string | null = null; try { - const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 256 * 1024 }); - image = extractOgImage(html); + const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 512 * 1024 }); + image = extractPageImage(html, url); } catch { image = null; } diff --git a/src/routes/+page.svelte b/src/routes/+page.svelte index 107c9e5..8f6503c 100644 --- a/src/routes/+page.svelte +++ b/src/routes/+page.svelte @@ -1,6 +1,5 @@ + `); + }); + await new Promise((r) => pageServer.listen(0, '127.0.0.1', r)); + const addr = pageServer.address() as AddressInfo; + const pageUrl = `http://127.0.0.1:${addr.port}/pie`; + try { + const db = openInMemoryForTest(); + addDomain(db, '127.0.0.1'); + respondWith([{ url: pageUrl, title: 'Pie', content: '' }]); + const hits = await searchWeb(db, 
'pie', { searxngUrl: baseUrl }); + expect(hits[0].thumbnail).toBe('https://cdn.example/pie.jpg'); + } finally { + await new Promise((r) => pageServer.close(() => r())); + } + }); + + it('falls back to first content image when no meta/JSON-LD image', async () => { + const pageServer = createServer((_req, res) => { + res.writeHead(200, { 'content-type': 'text/html; charset=utf-8' }); + res.end( + '
' + ); + }); + await new Promise((r) => pageServer.listen(0, '127.0.0.1', r)); + const addr = pageServer.address() as AddressInfo; + const pageUrl = `http://127.0.0.1:${addr.port}/article`; + try { + const db = openInMemoryForTest(); + addDomain(db, '127.0.0.1'); + respondWith([{ url: pageUrl, title: 'Dish', content: '' }]); + const hits = await searchWeb(db, 'dish', { searxngUrl: baseUrl }); + expect(hits[0].thumbnail).toBe(`http://127.0.0.1:${addr.port}/uploads/dish.jpg`); + } finally { + await new Promise((r) => pageServer.close(() => r())); + } + }); + it('leaves existing thumbnails untouched (no enrichment fetch)', async () => { const db = openInMemoryForTest(); addDomain(db, 'chefkoch.de');