Files
kochwas/src/lib/server/search/searxng.ts
hsiegeln 211d58ebec
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 55s
feat(search): Enter bleibt auf Seite + robustere Thumbnail-Erkennung
Startseite:
- Enter/Return löst die Suche jetzt sofort aus (cancelt den Debounce),
  navigiert aber NICHT mehr auf /search. Der Anwender bleibt auf der
  gleichen Seite mit Inline-Ergebnissen.

Thumbnail-Enrichment (searxng.ts):
- Regex-basierte og:image-Extraktion durch linkedom-parseHTML ersetzt.
- Neue Fallback-Kette (in dieser Reihenfolge):
    1. <meta property/name = og:image | og:image:url | og:image:secure_url
                           | twitter:image | twitter:image:src>
    2. <link rel="image_src" href="...">
    3. JSON-LD image (auch tief in @graph; "image" als String, Array,
       Objekt-mit-url)
    4. Erstes <img> in article/main/.entry-content/.post-content/figure
- Relative URLs werden gegen die Seiten-URL zu absoluten aufgelöst
  (z.B. /uploads/foo.jpg → http://host/uploads/foo.jpg).
- maxBytes von 256 KB auf 512 KB angehoben, damit JSON-LD-lastige
  Recipe-Seiten nicht mitten im Script abgeschnitten werden.

Tests (97/97):
- Neu: JSON-LD-Image-Fallback-Test.
- Neu: Content-<img>-Fallback-Test mit relativer URL, die zur
  absoluten aufgelöst wird.
2026-04-17 18:04:59 +02:00

275 lines
8.0 KiB
TypeScript

import type Database from 'better-sqlite3';
import { parseHTML } from 'linkedom';
import { listDomains, normalizeDomain } from '../domains/repository';
import { fetchText } from '../http';
/** One filtered web search result as returned by `searchWeb`. */
export type WebHit = {
/** Result URL exactly as reported by SearXNG. */
url: string;
/** Result title as reported by SearXNG. */
title: string;
/** Hostname of `url` after `normalizeDomain`; always one of the allowed domains. */
domain: string;
/** SearXNG's `content` text for the result, or null when none was provided. */
snippet: string | null;
/** Image URL from SearXNG (`thumbnail` / `img_src`) or from page enrichment; null if none was found. */
thumbnail: string | null;
};
/** The subset of a SearXNG JSON result entry that this module reads. */
type SearxngResult = {
url: string;
title: string;
/** Text excerpt; mapped to `WebHit.snippet`. */
content?: string;
/** Preferred thumbnail source for `WebHit.thumbnail`. */
thumbnail?: string;
/** Secondary thumbnail source, used when `thumbnail` is absent. */
img_src?: string;
};
/** Top-level shape of the SearXNG JSON response as far as this module uses it. */
type SearxngResponse = {
/** Result entries; treated as an empty list when missing. */
results?: SearxngResult[];
};
/**
 * Extracts the hostname of `url` and passes it through `normalizeDomain`.
 *
 * @param url Absolute URL string.
 * @returns The normalized hostname, or null when the URL cannot be parsed
 *          (or normalization throws).
 */
function hostnameFromUrl(url: string): string | null {
  try {
    const { hostname } = new URL(url);
    return normalizeDomain(hostname);
  } catch {
    // Unparseable input is simply "no host" for the caller.
    return null;
  }
}
// URL fragments that mark a page as something other than a single recipe:
// forums, magazines, search and listing pages, profiles, legal pages, etc.
// Collected from URLs seen in the wild on chefkoch.de, emmikochteinfach.de,
// experimente-aus-meiner-kueche.de and similar sites.
const NON_RECIPE_PATH_PATTERNS: RegExp[] = [
  /\/forum\//i,
  /\/magazin\//i,
  /\/magazine\//i,
  /\/suche($|\/|\?)/i,
  /\/search($|\/|\?)/i,
  /\/benutzer\//i,
  /\/profil\//i,
  /\/autoren\//i,
  /\/themen\//i,
  /\/kategorie\//i,
  /\/kategorien\//i,
  /\/cook-and-style\//i,
  /\/tag\//i,
  /\/rezepte\/?$/i, // the "/rezepte/" listing root
  /\/rezepte\/kategorien/i,
  /\/rezepte\/was-kocht/i,
  /\/gewinnspiel/i,
  /\/impressum/i,
  /\/datenschutz/i,
  /\/ueber-(uns|mich)/i,
  // Chefkoch-specific search and listing URLs
  /\/rs\/s\d+\//i,
  /\/rs\/s\d+$/i,
  /Rezepte\.html/i // /rs/.../Rezepte.html is a listing page
];

/**
 * Heuristically decides whether `url` points at an individual recipe page.
 *
 * The pathname plus query string is matched against
 * NON_RECIPE_PATH_PATTERNS; any match rejects the URL. The bare site root
 * ("/" with no query string) is rejected as a landing page.
 *
 * @param url Absolute URL string.
 * @returns false for unparseable URLs, blocklisted paths and the bare root;
 *          true otherwise.
 */
function looksLikeRecipePage(url: string): boolean {
  let target: URL;
  try {
    target = new URL(url);
  } catch {
    return false;
  }

  const pathAndQuery = `${target.pathname}${target.search}`;
  const isBlocked = NON_RECIPE_PATH_PATTERNS.some((pattern) => pattern.test(pathAndQuery));
  if (isBlocked) return false;

  return pathAndQuery !== '/' && pathAndQuery !== '';
}
/**
 * Resolves a possibly relative reference against a base URL.
 *
 * @param href    Absolute or relative URL reference.
 * @param baseUrl Absolute URL used as the base for relative references.
 * @returns The serialized absolute URL, or null when the combination cannot
 *          be parsed as a URL.
 */
function resolveUrl(href: string, baseUrl: string): string | null {
  let absolute: URL;
  try {
    absolute = new URL(href, baseUrl);
  } catch {
    return null;
  }
  return absolute.href;
}
/**
 * Reads an image URL from a single JSON-LD `image` value.
 *
 * Accepts a plain string or an object carrying the URL in `url` or
 * `contentUrl`. Blank strings count as missing.
 */
function jsonLdImageUrl(value: unknown): string | null {
  if (typeof value === 'string') {
    return value.trim() ? value : null;
  }
  if (value && typeof value === 'object' && !Array.isArray(value)) {
    const obj = value as Record<string, unknown>;
    for (const key of ['url', 'contentUrl']) {
      const candidate = obj[key];
      if (typeof candidate === 'string' && candidate.trim()) return candidate;
    }
  }
  return null;
}

/**
 * Finds the first image URL in a parsed JSON-LD document.
 *
 * Search order:
 *   - top-level arrays are scanned element by element (recursively);
 *   - on an object, entries of `@graph` are searched first (recursively);
 *   - then the object's own `image` property is used, which may be a string,
 *     an object with `url` / `contentUrl`, or an array of either. Every
 *     array element is tried in order, not only the first one.
 *
 * @param data Result of `JSON.parse` on a JSON-LD script body.
 * @returns The image reference as written in the document (it may still be
 *          relative), or null when none is present.
 */
function imageFromJsonLd(data: unknown): string | null {
  if (!data) return null;

  if (Array.isArray(data)) {
    for (const entry of data) {
      const found = imageFromJsonLd(entry);
      if (found) return found;
    }
    return null;
  }

  if (typeof data !== 'object') return null;
  const node = data as Record<string, unknown>;

  const graph = node['@graph'];
  if (Array.isArray(graph)) {
    for (const entry of graph) {
      const found = imageFromJsonLd(entry);
      if (found) return found;
    }
  }

  const image = node.image;
  const candidates: unknown[] = Array.isArray(image) ? image : [image];
  for (const candidate of candidates) {
    const url = jsonLdImageUrl(candidate);
    if (url) return url;
  }
  return null;
}
/**
 * Lower-cased `property` / `name` values of <meta> tags whose `content` is
 * treated as a page image URL.
 */
const META_IMAGE_KEYS = new Set<string>([
  // OpenGraph
  'og:image',
  'og:image:url',
  'og:image:secure_url',
  // Twitter cards
  'twitter:image',
  'twitter:image:src'
]);
/**
 * Picks a representative image URL from a page's HTML.
 *
 * Sources are tried in this order; the first one that yields a resolvable
 * URL wins:
 *   1. <meta> tags whose property/name is listed in META_IMAGE_KEYS
 *   2. <link rel="image_src">
 *   3. JSON-LD `image` (via imageFromJsonLd)
 *   4. The first usable <img> inside article / main / .entry-content /
 *      .post-content / figure
 *
 * @param html    Raw HTML of the page.
 * @param baseUrl URL of the page; relative image references are resolved
 *                against it.
 * @returns An absolute image URL, or null when nothing usable was found or
 *          parsing threw.
 */
function extractPageImage(html: string, baseUrl: string): string | null {
  try {
    const { document } = parseHTML(html);

    // 1. OpenGraph / Twitter meta tags
    for (const m of Array.from(document.querySelectorAll('meta'))) {
      const key = (m.getAttribute('property') ?? m.getAttribute('name') ?? '').toLowerCase();
      if (!META_IMAGE_KEYS.has(key)) continue;
      const content = m.getAttribute('content');
      if (!content) continue;
      const resolved = resolveUrl(content, baseUrl);
      if (resolved) return resolved;
    }

    // 2. <link rel="image_src">
    const link = document.querySelector('link[rel="image_src"]');
    if (link) {
      const href = link.getAttribute('href');
      if (href) {
        const resolved = resolveUrl(href, baseUrl);
        if (resolved) return resolved;
      }
    }

    // 3. JSON-LD image (Recipe schema etc.)
    for (const s of Array.from(document.querySelectorAll('script[type="application/ld+json"]'))) {
      try {
        const data: unknown = JSON.parse(s.textContent ?? '');
        const img = imageFromJsonLd(data);
        if (img) {
          const resolved = resolveUrl(img, baseUrl);
          if (resolved) return resolved;
        }
      } catch {
        // malformed JSON-LD — skip this script and try the next one
      }
    }

    // 4. First usable content image.
    // The selector deliberately does not require a `src` attribute:
    // lazy-loading markup often leaves `src` empty (or set to an inline
    // `data:` placeholder) and carries the real URL in `data-src`.
    const isInlineData = (value: string): boolean =>
      value.trim().toLowerCase().startsWith('data:');
    const contentImages = document.querySelectorAll(
      'article img, main img, .entry-content img, .post-content img, figure img'
    );
    for (const img of Array.from(contentImages)) {
      const src = img.getAttribute('src');
      const candidate = src && !isInlineData(src) ? src : img.getAttribute('data-src');
      // Skip images with no usable reference; an inline placeholder is not a thumbnail.
      if (!candidate || isInlineData(candidate)) continue;
      const resolved = resolveUrl(candidate, baseUrl);
      if (resolved) return resolved;
    }

    return null;
  } catch {
    // Thumbnail extraction is best-effort; a parser failure means "no image".
    return null;
  }
}
/** Cached outcome of one thumbnail lookup; `image` is null when none was found. */
type ThumbCacheEntry = { image: string | null; expires: number };

// Module-level cache of thumbnail lookups, keyed by page URL.
// Negative results (null) are cached too, so a page without an image is not
// re-fetched on every search.
const thumbCache = new Map<string, ThumbCacheEntry>();
// Lifetime of a cache entry: 30 minutes.
const THUMB_TTL_MS = 30 * 60 * 1000;
// Upper bound on cached entries so a long-running process cannot grow the
// cache without limit.
const THUMB_CACHE_MAX_ENTRIES = 1000;

/**
 * Makes room in `thumbCache`: drops every expired entry, then, if the cache
 * is still full, drops the oldest entries until one slot is free.
 */
function evictThumbCache(now: number): void {
  for (const [key, entry] of thumbCache) {
    if (entry.expires <= now) thumbCache.delete(key);
  }
  // Map iterates in insertion order, so the first keys are the oldest.
  for (const key of thumbCache.keys()) {
    if (thumbCache.size < THUMB_CACHE_MAX_ENTRIES) break;
    thumbCache.delete(key);
  }
}

/**
 * Fetches `url` and extracts a thumbnail image from its HTML, with caching.
 *
 * A non-expired cache entry is returned without fetching. Otherwise the page
 * is fetched (4 s timeout, at most 512 KiB) and passed to extractPageImage.
 * Any failure is treated as "no image"; this function does not reject.
 *
 * @param url Page URL to inspect.
 * @returns Absolute image URL, or null when none could be determined.
 */
async function enrichThumbnail(url: string): Promise<string | null> {
  const now = Date.now();
  const cached = thumbCache.get(url);
  if (cached && cached.expires > now) return cached.image;

  let image: string | null = null;
  try {
    const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 512 * 1024 });
    image = extractPageImage(html, url);
  } catch {
    // Best-effort enrichment: network or parse errors just mean no thumbnail.
    image = null;
  }

  // Remove a stale entry first so the refreshed one moves to the "newest" end
  // of the insertion order used for eviction.
  thumbCache.delete(url);
  if (thumbCache.size >= THUMB_CACHE_MAX_ENTRIES) evictThumbCache(now);
  thumbCache.set(url, { image, expires: now + THUMB_TTL_MS });
  return image;
}
/**
 * Fills in `thumbnail` on every hit that lacks one by inspecting the hit's
 * page via enrichThumbnail. Hits are mutated in place.
 *
 * At most six lookups are in flight at once; each worker pulls the next
 * pending hit as soon as its previous lookup finishes.
 *
 * @param hits Search hits; those with a truthy `thumbnail` are left untouched.
 */
async function enrichMissingThumbnails(hits: WebHit[]): Promise<void> {
  const pending = hits.filter((hit) => !hit.thumbnail);
  if (pending.length === 0) return;

  const LIMIT = 6;
  let cursor = 0;

  // Each worker claims the next unprocessed hit until none remain.
  const drain = async (): Promise<void> => {
    while (cursor < pending.length) {
      const hit = pending[cursor++];
      if (!hit) break;
      const image = await enrichThumbnail(hit.url);
      if (image) hit.thumbnail = image;
    }
  };

  const workerCount = Math.min(LIMIT, pending.length);
  const workers: Promise<void>[] = [];
  for (let i = 0; i < workerCount; i++) {
    workers.push(drain());
  }
  await Promise.all(workers);
}
/**
 * Searches the web through SearXNG, restricted to the domains stored in the
 * database, and returns the hits that look like individual recipe pages.
 *
 * The query is sent as `<query> (site:a OR site:b ...)`. Results are then
 * filtered again locally: the host must be one of the allowed domains, the
 * URL must pass looksLikeRecipePage, and duplicate URLs are dropped.
 *
 * @param db    Database handle used to list the allowed domains.
 * @param query User search text; blank input yields an empty result.
 * @param opts.searxngUrl       Base URL of the SearXNG instance. Defaults to
 *                              `process.env.SEARXNG_URL`, then
 *                              `http://localhost:8888`.
 * @param opts.limit            Maximum number of hits to return (default 20).
 * @param opts.enrichThumbnails Set to false to skip fetching result pages for
 *                              missing thumbnails (enabled by default).
 * @returns Up to `limit` hits in SearXNG's result order. Empty when the query
 *          is blank or no domains are configured.
 * @throws Error when the SearXNG response body is not valid JSON. Errors from
 *         the SearXNG request itself are not caught here.
 */
export async function searchWeb(
  db: Database.Database,
  query: string,
  opts: { searxngUrl?: string; limit?: number; enrichThumbnails?: boolean } = {}
): Promise<WebHit[]> {
  const trimmed = query.trim();
  if (!trimmed) return [];

  const domains = listDomains(db).map((d) => d.domain);
  if (domains.length === 0) return [];

  const searxngUrl = opts.searxngUrl ?? process.env.SEARXNG_URL ?? 'http://localhost:8888';
  const limit = opts.limit ?? 20;

  const siteFilter = domains.map((d) => `site:${d}`).join(' OR ');
  const q = `${trimmed} (${siteFilter})`;
  const endpoint = new URL('/search', searxngUrl);
  endpoint.searchParams.set('q', q);
  endpoint.searchParams.set('format', 'json');
  endpoint.searchParams.set('language', 'de');

  const body = await fetchText(endpoint.toString(), {
    timeoutMs: 15_000,
    // SearXNG's bot detection complains without these; we are the only caller
    // and we're not a bot, so satisfy the check deterministically.
    extraHeaders: {
      'X-Forwarded-For': '127.0.0.1',
      'X-Real-IP': '127.0.0.1',
      Accept: 'application/json'
    }
  });

  let parsed: unknown;
  try {
    parsed = JSON.parse(body);
  } catch {
    throw new Error('SearXNG did not return JSON');
  }

  // Valid JSON is not necessarily the expected shape (e.g. `null`, or a
  // non-array `results`); treat anything unexpected as "no results".
  const rawResults = (parsed as SearxngResponse | null)?.results;
  const results: SearxngResult[] = Array.isArray(rawResults) ? rawResults : [];

  const allowed = new Set(domains);
  const seen = new Set<string>();
  const hits: WebHit[] = [];
  for (const r of results) {
    const host = hostnameFromUrl(r.url);
    // The site: filter is only a hint to the engines; enforce it locally.
    if (!host || !allowed.has(host)) continue;
    if (!looksLikeRecipePage(r.url)) continue;
    if (seen.has(r.url)) continue;
    seen.add(r.url);
    hits.push({
      url: r.url,
      title: r.title,
      domain: host,
      snippet: r.content ?? null,
      // `||` rather than `??`: an empty-string thumbnail must not mask
      // img_src, and must become null instead of leaking "" to callers.
      thumbnail: r.thumbnail || r.img_src || null
    });
    if (hits.length >= limit) break;
  }

  if (opts.enrichThumbnails !== false) {
    await enrichMissingThumbnails(hits);
  }
  return hits;
}