import type Database from 'better-sqlite3';
import { parseHTML } from 'linkedom';
import { listDomains, normalizeDomain } from '../domains/repository';
import { fetchText } from '../http';
import { hasRecipeJsonLd } from '../parsers/json-ld-recipe';

/** One search result that survived the domain whitelist and URL heuristics. */
export type WebHit = {
  url: string;
  title: string;
  domain: string;
  snippet: string | null;
  thumbnail: string | null;
};

/** Fields this module reads from a single SearXNG JSON result. */
type SearxngResult = {
  url: string;
  title: string;
  content?: string;
  thumbnail?: string;
  img_src?: string;
};

/** Top-level shape of the SearXNG JSON response, as far as it is used here. */
type SearxngResponse = {
  results?: SearxngResult[];
};

/**
 * Extracts the hostname of `url` and passes it through `normalizeDomain`.
 *
 * @returns the normalized hostname, or `null` when `url` cannot be parsed.
 */
function hostnameFromUrl(url: string): string | null {
  try {
    return normalizeDomain(new URL(url).hostname);
  } catch {
    return null;
  }
}

// Paths that are clearly NOT a single recipe page on common domains.
// Broad enough to cover forum/magazine/listing URLs we've seen in the wild
// across chefkoch.de, emmikochteinfach.de, experimente-aus-meiner-kueche.de etc.
const NON_RECIPE_PATH_PATTERNS: RegExp[] = [
  /\/forum\//i,
  /\/magazin\//i,
  /\/magazine\//i,
  /\/suche($|\/|\?)/i,
  /\/search($|\/|\?)/i,
  /\/benutzer\//i,
  /\/profil\//i,
  /\/autoren\//i,
  /\/themen\//i,
  /\/kategorie\//i,
  /\/kategorien\//i,
  /\/cook-and-style\//i,
  /\/tag\//i,
  /\/rezepte\/?$/i, // "/rezepte/" listing root
  /\/rezepte\/kategorien/i,
  /\/rezepte\/was-kocht/i,
  /\/gewinnspiel/i,
  /\/impressum/i,
  /\/datenschutz/i,
  /\/ueber-(uns|mich)/i,
  // Chefkoch-specific search-/listing-URLs
  /\/rs\/s\d+\//i,
  /\/rs\/s\d+$/i,
  /Rezepte\.html/i // /rs/.../Rezepte.html is a listing
];

/**
 * URL-only heuristic deciding whether `url` may point at a single recipe.
 *
 * Rejects unparseable URLs, anything whose path + query matches one of
 * `NON_RECIPE_PATH_PATTERNS`, and the bare site root.
 */
function looksLikeRecipePage(url: string): boolean {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return false;
  }
  const path = parsed.pathname + parsed.search;
  if (NON_RECIPE_PATH_PATTERNS.some((rx) => rx.test(path))) return false;
  // The bare root is a landing page, never a recipe. A root URL that carries
  // a query string is deliberately still accepted.
  return path !== '/' && path !== '';
}

/**
 * Resolves `href` against `baseUrl`.
 *
 * @returns the absolute URL string, or `null` when it cannot be resolved.
 */
function resolveUrl(href: string, baseUrl: string): string | null {
  try {
    return new URL(href, baseUrl).toString();
  } catch {
    return null;
  }
}

/** Returns `value.url` when `value` is an object whose `url` is a string. */
function urlProperty(value: unknown): string | null {
  if (!value || typeof value !== 'object' || !('url' in value)) return null;
  const url = (value as Record<string, unknown>).url;
  return typeof url === 'string' ? url : null;
}

/**
 * Finds the first image reference in a parsed JSON-LD value.
 *
 * Arrays and `@graph` members are searched recursively, depth first. On a
 * node, `image` may be a string, an object with a string `url`, or an array
 * whose first element is one of those two forms.
 *
 * @returns the raw (possibly relative) image reference, or `null`.
 */
function imageFromJsonLd(data: unknown): string | null {
  if (!data) return null;
  if (Array.isArray(data)) {
    for (const entry of data) {
      const found = imageFromJsonLd(entry);
      if (found) return found;
    }
    return null;
  }
  if (typeof data !== 'object') return null;

  const node = data as Record<string, unknown>;
  const graph = node['@graph'];
  if (Array.isArray(graph)) {
    for (const entry of graph) {
      const found = imageFromJsonLd(entry);
      if (found) return found;
    }
  }

  const image = node.image;
  if (typeof image === 'string') return image;
  if (Array.isArray(image)) {
    // Only the first array element is considered.
    const first: unknown = image[0];
    return typeof first === 'string' ? first : urlProperty(first);
  }
  return urlProperty(image);
}

/** Lower-cased `property`/`name` values of meta tags that carry an image URL. */
const META_IMAGE_KEYS = new Set<string>([
  'og:image',
  'og:image:url',
  'og:image:secure_url',
  'twitter:image',
  'twitter:image:src'
]);

/**
 * Picks a representative image URL from a page's HTML.
 *
 * Sources are tried in order; the first one that resolves against `baseUrl`
 * wins: (1) OpenGraph/Twitter meta tags, (2) `<link rel="image_src">`,
 * (3) JSON-LD `image`, (4) the first content image.
 *
 * @returns an absolute URL, or `null` when nothing usable is found or the
 *          HTML cannot be processed.
 */
function extractPageImage(html: string, baseUrl: string): string | null {
  try {
    const { document } = parseHTML(html);

    // 1. OpenGraph / Twitter meta tags
    for (const meta of Array.from(document.querySelectorAll('meta'))) {
      const key = (
        meta.getAttribute('property') ??
        meta.getAttribute('name') ??
        ''
      ).toLowerCase();
      if (!META_IMAGE_KEYS.has(key)) continue;
      const content = meta.getAttribute('content');
      if (!content) continue;
      const resolved = resolveUrl(content, baseUrl);
      if (resolved) return resolved;
    }

    // 2. <link rel="image_src">
    const link = document.querySelector('link[rel="image_src"]');
    const href = link?.getAttribute('href');
    if (href) {
      const resolved = resolveUrl(href, baseUrl);
      if (resolved) return resolved;
    }

    // 3. JSON-LD image (Recipe schema etc.)
    const ldScripts = Array.from(
      document.querySelectorAll('script[type="application/ld+json"]')
    );
    for (const script of ldScripts) {
      try {
        const data: unknown = JSON.parse(script.textContent ?? '');
        const img = imageFromJsonLd(data);
        if (img) {
          const resolved = resolveUrl(img, baseUrl);
          if (resolved) return resolved;
        }
      } catch {
        // malformed JSON-LD — skip
      }
    }

    // 4. First content image in article/main
    const contentImg = document.querySelector(
      'article img[src], main img[src], .entry-content img[src], .post-content img[src], figure img[src]'
    );
    if (contentImg) {
      const src = contentImg.getAttribute('src');
      const lazySrc = contentImg.getAttribute('data-src');
      // The selector guarantees a `src` attribute, so a plain `??` fallback
      // to `data-src` could never fire. Prefer `data-src` when `src` is empty
      // or an inline `data:` URI; fall back to `src` when no `data-src` exists.
      const candidate = src && !src.startsWith('data:') ? src : lazySrc || src;
      if (candidate) {
        const resolved = resolveUrl(candidate, baseUrl);
        if (resolved) return resolved;
      }
    }
    return null;
  } catch {
    return null;
  }
}

const DEFAULT_THUMB_TTL_DAYS = 30;
// Upper bound keeps the computed expiry a plain four-digit-year ISO string,
// which the string comparison against `expires_at` in SQL depends on.
const MAX_THUMB_TTL_DAYS = 36_500;

/**
 * Parses the cache TTL (in days) from its environment-variable value.
 *
 * Non-numeric or non-finite input falls back to the default instead of
 * producing NaN, which would make `toISOString()` throw on every cache
 * write. Finite values are clamped to `[0, MAX_THUMB_TTL_DAYS]`.
 */
function resolveThumbTtlDays(raw: string | undefined): number {
  const parsed = Number(raw ?? DEFAULT_THUMB_TTL_DAYS);
  if (!Number.isFinite(parsed)) return DEFAULT_THUMB_TTL_DAYS;
  return Math.min(Math.max(parsed, 0), MAX_THUMB_TTL_DAYS);
}

const THUMB_TTL_DAYS = resolveThumbTtlDays(process.env.KOCHWAS_THUMB_TTL_DAYS);
const THUMB_TTL_MS = THUMB_TTL_DAYS * 24 * 60 * 60 * 1000;

/**
 * Per-URL metadata derived from fetching the page.
 * `hasRecipe`: 1 = recipe JSON-LD found, 0 = not found, null = unknown.
 */
type PageMeta = {
  image: string | null;
  hasRecipe: 0 | 1 | null;
};

/**
 * Looks up a non-expired `thumbnail_cache` row for `url`.
 *
 * @returns the cached metadata, or `null` when there is no unexpired row.
 */
function readCachedPageMeta(
  db: Database.Database,
  url: string
): PageMeta | null {
  const row = db
    .prepare<
      [string, string],
      { image: string | null; has_recipe: 0 | 1 | null }
    >(
      'SELECT image, has_recipe FROM thumbnail_cache WHERE url = ? AND expires_at > ?'
    )
    .get(url, new Date().toISOString());
  return row ? { image: row.image, hasRecipe: row.has_recipe } : null;
}

/**
 * Inserts or replaces the `thumbnail_cache` row for `url`, expiring
 * `THUMB_TTL_MS` from now.
 */
function writeCachedPageMeta(
  db: Database.Database,
  url: string,
  meta: PageMeta
): void {
  const expiresAt = new Date(Date.now() + THUMB_TTL_MS).toISOString();
  db.prepare(
    'INSERT OR REPLACE INTO thumbnail_cache (url, image, expires_at, has_recipe) VALUES (?, ?, ?, ?)'
  ).run(url, meta.image, expiresAt, meta.hasRecipe);
}

/**
 * Returns image and recipe metadata for `url`, from the cache when possible,
 * otherwise by fetching the page. The result is cached in either case.
 */
async function enrichPageMeta(
  db: Database.Database,
  url: string
): Promise<PageMeta> {
  const cached = readCachedPageMeta(db, url);
  if (cached) return cached;

  let meta: PageMeta = { image: null, hasRecipe: null };
  try {
    // allowTruncate: modern recipe pages are often >1 MB (embedded bundles,
    // base64 images). The og:image and JSON-LD are practically always in the
    // <head>, which easily fits into the first 512 KB. Previously fetchText
    // threw when the limit was exceeded and hasRecipe stayed NULL, so
    // non-recipe pages slipped through incorrectly.
    const html = await fetchText(url, {
      timeoutMs: 8_000,
      maxBytes: 512 * 1024,
      allowTruncate: true
    });
    meta = {
      image: extractPageImage(html, url),
      hasRecipe: hasRecipeJsonLd(html) ? 1 : 0
    };
  } catch {
    // Fetch failed — leave hasRecipe null (unknown) so we don't permanently
    // hide a temporary-network-error URL.
    // NOTE(review): the "unknown" result is still cached for the full TTL,
    // so the page is not re-fetched until that row expires — confirm this
    // is intended.
  }
  writeCachedPageMeta(db, url, meta);
  return meta;
}

/**
 * Fetches page metadata for every hit, drops hits confirmed to have no
 * recipe JSON-LD, and replaces thumbnails with the page image when found.
 *
 * @returns the surviving hits in their original order.
 */
async function enrichAndFilterHits(
  db: Database.Database,
  hits: WebHit[]
): Promise<WebHit[]> {
  // Always fetch the page even when SearXNG gave us a thumbnail — we need
  // the HTML anyway for the high-res og:image AND to confirm a Recipe
  // JSON-LD actually exists. The thumbnail_cache table (default 30-day TTL)
  // makes repeat searches instant.
  if (hits.length === 0) return hits;

  // Lazy cleanup of expired entries (presumably backed by an index on
  // expires_at; the schema is not visible from this file).
  db.prepare('DELETE FROM thumbnail_cache WHERE expires_at <= ?').run(
    new Date().toISOString()
  );

  const metas = new Map<string, PageMeta>();
  const queue = [...hits];
  const LIMIT = 6;
  // Up to LIMIT workers drain the shared queue concurrently.
  const workers = Array.from(
    { length: Math.min(LIMIT, queue.length) },
    async () => {
      for (let hit = queue.shift(); hit; hit = queue.shift()) {
        metas.set(hit.url, await enrichPageMeta(db, hit.url));
      }
    }
  );
  await Promise.all(workers);

  // Drop confirmed-non-recipe pages (hasRecipe === 0). Keep unknown (null)
  // and confirmed recipes (1).
  const enriched: WebHit[] = [];
  for (const hit of hits) {
    const meta = metas.get(hit.url);
    if (meta?.hasRecipe === 0) continue;
    enriched.push(meta?.image ? { ...hit, thumbnail: meta.image } : hit);
  }
  return enriched;
}

/** Returns `value` when it is a string, otherwise `null`. */
function stringOrNull(value: unknown): string | null {
  return typeof value === 'string' ? value : null;
}

/**
 * Parses the SearXNG response body and returns its `results` array.
 *
 * A body that parses to a non-object, or whose `results` is missing or not
 * an array, is treated as "no results" instead of failing during iteration.
 *
 * @throws Error when `body` is not valid JSON.
 */
function parseSearxngResults(body: string): unknown[] {
  let parsed: unknown;
  try {
    parsed = JSON.parse(body);
  } catch {
    throw new Error('SearXNG did not return JSON');
  }
  if (!parsed || typeof parsed !== 'object') return [];
  const results: unknown = (parsed as SearxngResponse).results;
  return Array.isArray(results) ? results : [];
}

/**
 * Searches SearXNG for `query`, restricted to the whitelisted domains.
 *
 * @param db database handle used for the domain whitelist and the
 *           `thumbnail_cache` table.
 * @param query free-text query; a blank query yields `[]`.
 * @param opts.searxngUrl SearXNG base URL (default: `SEARXNG_URL` env var,
 *           then `http://localhost:8888`).
 * @param opts.limit maximum number of hits collected before enrichment
 *           (default 20); enrichment may drop some of them.
 * @param opts.enrichThumbnails set to `false` to skip fetching result pages.
 * @param opts.pageno 1-based result page (default 1).
 * @param opts.domains optional subset of whitelisted domains to search;
 *           entries outside the whitelist are ignored, and an empty
 *           intersection falls back to the full whitelist.
 * @returns matching hits; `[]` when the query is blank or no domains are
 *           configured.
 * @throws Error when SearXNG does not return JSON; errors from `fetchText`
 *           on the search request are not caught here.
 */
export async function searchWeb(
  db: Database.Database,
  query: string,
  opts: {
    searxngUrl?: string;
    limit?: number;
    enrichThumbnails?: boolean;
    pageno?: number;
    domains?: string[];
  } = {}
): Promise<WebHit[]> {
  const trimmed = query.trim();
  if (!trimmed) return [];

  const allDomains = listDomains(db).map((d) => d.domain);
  if (allDomains.length === 0) return [];

  // Optional domain filter: intersect with the whitelist so the filter never
  // searches outside the allowed domains.
  const whitelist = new Set(allDomains);
  const filtered = opts.domains?.filter((d) => whitelist.has(d)) ?? [];
  const domains = filtered.length > 0 ? filtered : allDomains;

  const searxngUrl =
    opts.searxngUrl ?? process.env.SEARXNG_URL ?? 'http://localhost:8888';
  const limit = opts.limit ?? 20;
  const pageno = Math.max(1, opts.pageno ?? 1);

  const siteFilter = domains.map((d) => `site:${d}`).join(' OR ');
  const q = `${trimmed} (${siteFilter})`;

  const endpoint = new URL('/search', searxngUrl);
  endpoint.searchParams.set('q', q);
  endpoint.searchParams.set('format', 'json');
  endpoint.searchParams.set('language', 'de');
  if (pageno > 1) endpoint.searchParams.set('pageno', String(pageno));

  const body = await fetchText(endpoint.toString(), {
    timeoutMs: 15_000,
    // SearXNG's bot detection complains without these; we are the only caller
    // and we're not a bot, so satisfy the check deterministically.
    extraHeaders: {
      'X-Forwarded-For': '127.0.0.1',
      'X-Real-IP': '127.0.0.1',
      Accept: 'application/json'
    }
  });

  const results = parseSearxngResults(body);
  const allowed = new Set(domains);
  const seen = new Set<string>();
  const hits: WebHit[] = [];

  for (const raw of results) {
    // Checked before pushing so that a limit of 0 (or less) yields no hits.
    if (hits.length >= limit) break;

    // Results come from untrusted JSON: skip anything without a string URL.
    if (!raw || typeof raw !== 'object') continue;
    const r = raw as Partial<Record<keyof SearxngResult, unknown>>;
    if (typeof r.url !== 'string') continue;

    const host = hostnameFromUrl(r.url);
    if (!host || !allowed.has(host)) continue;
    if (!looksLikeRecipePage(r.url)) continue;
    if (seen.has(r.url)) continue;
    seen.add(r.url);

    hits.push({
      url: r.url,
      title: typeof r.title === 'string' ? r.title : '',
      domain: host,
      snippet: stringOrNull(r.content),
      thumbnail: stringOrNull(r.thumbnail) ?? stringOrNull(r.img_src)
    });
  }

  if (opts.enrichThumbnails !== false) {
    return await enrichAndFilterHits(db, hits);
  }
  return hits;
}