feat(search): Enter bleibt auf Seite + robustere Thumbnail-Erkennung
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 55s

Startseite:
- Enter/Return löst die Suche jetzt sofort aus (cancelt den Debounce),
  navigiert aber NICHT mehr auf /search. Der Anwender bleibt auf der
  gleichen Seite mit Inline-Ergebnissen.

Thumbnail-Enrichment (searxng.ts):
- Regex-basierte og:image-Extraktion durch linkedom-parseHTML ersetzt.
- Neue Fallback-Kette (in dieser Reihenfolge):
    1. <meta property/name = og:image | og:image:url | og:image:secure_url
                           | twitter:image | twitter:image:src>
    2. <link rel="image_src" href="...">
    3. JSON-LD image (auch tief in @graph; "image" als String, Array,
       Objekt-mit-url)
    4. Erstes <img> in article/main/.entry-content/.post-content/figure
- Relative URLs werden gegen die Seiten-URL zu absoluten aufgelöst
  (z.B. /uploads/foo.jpg → http://host/uploads/foo.jpg).
- maxBytes von 256 KB auf 512 KB angehoben, damit JSON-LD-lastige
  Recipe-Seiten nicht mitten im Script abgeschnitten werden.

Tests (97/97):
- Neu: JSON-LD-Image-Fallback-Test.
- Neu: Content-<img>-Fallback-Test mit relativer URL, die zur
  absoluten aufgelöst wird.
This commit is contained in:
hsiegeln
2026-04-17 18:04:59 +02:00
parent 9bc4465061
commit 211d58ebec
3 changed files with 178 additions and 42 deletions

View File

@@ -1,4 +1,5 @@
import type Database from 'better-sqlite3';
import { parseHTML } from 'linkedom';
import { listDomains, normalizeDomain } from '../domains/repository';
import { fetchText } from '../http';
@@ -77,18 +78,102 @@ function looksLikeRecipePage(url: string): boolean {
}
}
// Matches a <meta> tag whose property/name attribute is og:image or og:image:url
// and whose content attribute appears AFTER it. Capture group 1 is the content
// value. Case-insensitive.
const OG_IMAGE_RE =
/<meta[^>]+(?:property|name)=["']og:image(?::url)?["'][^>]+content=["']([^"']+)["']/i;
// Same tag as OG_IMAGE_RE, but for markup where content appears BEFORE
// property/name. Capture group 1 is again the content value.
const OG_IMAGE_RE_REV =
/<meta[^>]+content=["']([^"']+)["'][^>]+(?:property|name)=["']og:image(?::url)?["']/i;
// Matches a <meta> tag whose property/name attribute is twitter:image, followed
// by a content attribute. Capture group 1 is the content value. Case-insensitive.
const TWITTER_IMAGE_RE =
/<meta[^>]+(?:property|name)=["']twitter:image["'][^>]+content=["']([^"']+)["']/i;
function extractOgImage(html: string): string | null {
const m = OG_IMAGE_RE.exec(html) ?? OG_IMAGE_RE_REV.exec(html) ?? TWITTER_IMAGE_RE.exec(html);
if (!m) return null;
/**
 * Resolves `href` against `baseUrl` with the URL constructor and returns the
 * resulting absolute URL as a string. Any exception raised inside the try
 * block yields null instead.
 *
 * @param href    URL or path to resolve (may be relative).
 * @param baseUrl URL that relative `href` values are resolved against.
 * @returns The absolute URL string, or null when resolution throws.
 */
function resolveUrl(href: string, baseUrl: string): string | null {
try {
// NOTE(review): `m` is neither a parameter nor a local of this function. This
// line looks like the pre-change return statement of the old regex-based
// extractor, shown next to its replacement by the diff view — confirm it is
// not present in the actual file.
return new URL(m[1]).toString();
return new URL(href, baseUrl).toString();
} catch {
return null;
}
}
/**
 * Extracts a URL string from a single JSON-LD `image` entry.
 *
 * Accepts either a plain string or an object carrying the URL in `url` or
 * `contentUrl` (`url` is preferred). Blank strings are treated as absent
 * because they do not carry a URL.
 *
 * @param candidate One element of an `image` property (or the property itself).
 * @returns The URL string exactly as written in the document, or null.
 */
function jsonLdImageCandidateUrl(candidate: unknown): string | null {
  if (typeof candidate === 'string') {
    return candidate.trim() !== '' ? candidate : null;
  }
  if (!candidate || typeof candidate !== 'object' || Array.isArray(candidate)) {
    return null;
  }
  const obj = candidate as Record<string, unknown>;
  for (const key of ['url', 'contentUrl']) {
    const value = obj[key];
    if (typeof value === 'string' && value.trim() !== '') return value;
  }
  return null;
}

/**
 * Finds the first usable image URL inside a parsed JSON-LD value.
 *
 * Lookup order:
 *   1. If `data` is an array, each element is searched recursively.
 *   2. If `data` is an object with an `@graph` array, each graph node is
 *      searched recursively.
 *   3. The object's own `image` property, which may be a string, an object
 *      with `url`/`contentUrl`, or an array of either. Every array element is
 *      tried in order, so an unusable first entry no longer hides a valid
 *      later one.
 *
 * The returned value is NOT resolved or validated as a URL; callers are
 * expected to do that.
 *
 * @param data Result of `JSON.parse` on a JSON-LD script body.
 * @returns The raw image URL string, or null when none is found.
 */
function imageFromJsonLd(data: unknown): string | null {
  if (!data) return null;

  if (Array.isArray(data)) {
    for (const entry of data) {
      const found = imageFromJsonLd(entry);
      if (found) return found;
    }
    return null;
  }

  if (typeof data !== 'object') return null;
  const node = data as Record<string, unknown>;

  // Nodes nested in @graph take precedence over the container's own image.
  const graph = node['@graph'];
  if (Array.isArray(graph)) {
    for (const entry of graph) {
      const found = imageFromJsonLd(entry);
      if (found) return found;
    }
  }

  const rawImage = node.image;
  const candidates: unknown[] = Array.isArray(rawImage) ? rawImage : [rawImage];
  for (const candidate of candidates) {
    const url = jsonLdImageCandidateUrl(candidate);
    if (url !== null) return url;
  }
  return null;
}
/**
 * Meta-tag keys (all lower-case) whose `content` attribute is accepted as a
 * page image: the OpenGraph image variants and the Twitter card image variants.
 */
const META_IMAGE_KEYS = new Set<string>(
  ['og:image', 'og:image:url', 'og:image:secure_url', 'twitter:image', 'twitter:image:src']
);
/**
 * Parses `html` with linkedom's parseHTML and returns the first image URL
 * found by four lookups, tried in order. Every candidate is resolved against
 * `baseUrl` via resolveUrl, and a candidate that fails to resolve is skipped.
 *
 * @param html    Raw HTML of the page.
 * @param baseUrl URL of the page, used to resolve relative image URLs.
 * @returns The resolved image URL, or null when no lookup yields one or when
 *          anything inside the outer try block throws.
 */
function extractPageImage(html: string, baseUrl: string): string | null {
try {
const { document } = parseHTML(html);
// 1. OpenGraph / Twitter meta tags.
// All <meta> elements are walked in the order querySelectorAll returns them;
// the first one whose key is in META_IMAGE_KEYS and whose content resolves wins.
for (const m of Array.from(document.querySelectorAll('meta'))) {
// Key is the `property` attribute, falling back to `name`, lower-cased before lookup.
const key = (m.getAttribute('property') ?? m.getAttribute('name') ?? '').toLowerCase();
if (!META_IMAGE_KEYS.has(key)) continue;
const content = m.getAttribute('content');
if (!content) continue;
const resolved = resolveUrl(content, baseUrl);
if (resolved) return resolved;
}
// 2. <link rel="image_src"> — only the first element matching the selector is inspected.
const link = document.querySelector('link[rel="image_src"]');
if (link) {
const href = link.getAttribute('href');
if (href) {
const resolved = resolveUrl(href, baseUrl);
if (resolved) return resolved;
}
}
// 3. JSON-LD image (Recipe schema etc.)
// Each ld+json script is parsed on its own, so one broken script does not
// prevent later scripts from being examined.
for (const s of Array.from(document.querySelectorAll('script[type="application/ld+json"]'))) {
try {
const data = JSON.parse(s.textContent ?? '');
const img = imageFromJsonLd(data);
if (img) {
const resolved = resolveUrl(img, baseUrl);
if (resolved) return resolved;
}
} catch {
// malformed JSON-LD — skip
}
}
// 4. First content image inside article / main / .entry-content / .post-content / figure
const contentImg = document.querySelector(
'article img[src], main img[src], .entry-content img[src], .post-content img[src], figure img[src]'
);
if (contentImg) {
// NOTE(review): every selector above requires a `src` attribute, so
// getAttribute('src') should never be null here and the `data-src` fallback
// looks unreachable; images that carry only `data-src` are not selected at
// all. Confirm whether `img[data-src]` should be part of the selector.
const src = contentImg.getAttribute('src') ?? contentImg.getAttribute('data-src');
if (src) {
const resolved = resolveUrl(src, baseUrl);
if (resolved) return resolved;
}
}
return null;
} catch {
// Any failure inside the try block above is reported as "no image".
return null;
}
@@ -104,8 +189,8 @@ async function enrichThumbnail(url: string): Promise<string | null> {
if (cached && cached.expires > now) return cached.image;
let image: string | null = null;
try {
const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 256 * 1024 });
image = extractOgImage(html);
const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 512 * 1024 });
image = extractPageImage(html, url);
} catch {
image = null;
}