import type Database from 'better-sqlite3';
import { listDomains, normalizeDomain } from '../domains/repository';
import { fetchText } from '../http';

/** One filtered web search result, as returned by {@link searchWeb}. */
export type WebHit = {
  url: string;
  title: string;
  /** Host of `url` after passing through `normalizeDomain`. */
  domain: string;
  /** The `content` field reported by SearXNG, or `null` when absent. */
  snippet: string | null;
  /** Image URL reported by SearXNG or scraped from the page's meta tags, or `null`. */
  thumbnail: string | null;
};

/** Subset of a SearXNG JSON result entry that this module reads. */
type SearxngResult = {
  url: string;
  title: string;
  content?: string;
  thumbnail?: string;
  img_src?: string;
};

/** Subset of the SearXNG JSON response body that this module reads. */
type SearxngResponse = {
  results?: SearxngResult[];
};

/**
 * Extracts the normalized hostname of `url`.
 *
 * @returns The result of `normalizeDomain` on the URL's hostname, or `null`
 *   when `url` cannot be parsed (or `normalizeDomain` throws).
 */
function hostnameFromUrl(url: string): string | null {
  try {
    return normalizeDomain(new URL(url).hostname);
  } catch {
    return null;
  }
}

// Paths that are clearly NOT a single recipe page on common domains.
// Broad enough to cover forum/magazine/listing URLs we've seen in the wild
// across chefkoch.de, emmikochteinfach.de, experimente-aus-meiner-kueche.de etc.
const NON_RECIPE_PATH_PATTERNS: RegExp[] = [
  /\/forum\//i,
  /\/magazin\//i,
  /\/magazine\//i,
  /\/suche($|\/|\?)/i,
  /\/search($|\/|\?)/i,
  /\/benutzer\//i,
  /\/profil\//i,
  /\/autoren\//i,
  /\/themen\//i,
  /\/kategorie\//i,
  /\/kategorien\//i,
  /\/cook-and-style\//i,
  /\/tag\//i,
  /\/rezepte\/?$/i, // "/rezepte/" listing root
  /\/rezepte\/kategorien/i,
  /\/rezepte\/was-kocht/i,
  /\/gewinnspiel/i,
  /\/impressum/i,
  /\/datenschutz/i,
  /\/ueber-(uns|mich)/i,
  // Chefkoch-specific search-/listing-URLs
  /\/rs\/s\d+\//i,
  /\/rs\/s\d+$/i,
  /Rezepte\.html/i // /rs/.../Rezepte.html is a listing
];

/**
 * Heuristically decides whether `url` could point at a single recipe page.
 *
 * The URL's path plus query string is tested against
 * `NON_RECIPE_PATH_PATTERNS`; any match rejects the URL. The bare site root
 * is rejected as well. Everything else is accepted.
 *
 * @returns `false` for unparseable URLs, listing/forum/legal pages and the
 *   site root; `true` otherwise.
 */
function looksLikeRecipePage(url: string): boolean {
  try {
    const u = new URL(url);
    const path = u.pathname + u.search;
    if (NON_RECIPE_PATH_PATTERNS.some((rx) => rx.test(path))) return false;
    // A bare root is a landing page, never a recipe. Single-segment paths are
    // deliberately still allowed because many blogs publish recipes at "/slug".
    if (path === '/' || path === '') return false;
    return true;
  } catch {
    return false;
  }
}

// <meta property="og:image" content="..."> with the attributes in either
// order, plus the twitter:image fallback. Capture group 1 is the raw content value.
const OG_IMAGE_RE =
  /<meta[^>]+(?:property|name)=["']og:image(?::url)?["'][^>]+content=["']([^"']+)["']/i;
const OG_IMAGE_RE_REV =
  /<meta[^>]+content=["']([^"']+)["'][^>]+(?:property|name)=["']og:image(?::url)?["']/i;
const TWITTER_IMAGE_RE =
  /<meta[^>]+(?:property|name)=["']twitter:image["'][^>]+content=["']([^"']+)["']/i;

/**
 * Pulls a preview image URL out of an HTML document's meta tags.
 *
 * Tries `og:image` / `og:image:url` (either attribute order) first, then
 * `twitter:image`.
 *
 * @param html - Raw HTML (a truncated prefix of the document is fine).
 * @param baseUrl - Optional page URL used to resolve relative image URLs.
 *   Without it, relative values yield `null`.
 * @returns The image URL in serialized absolute form, or `null` when no tag
 *   matched or the value is not a valid URL.
 */
function extractOgImage(html: string, baseUrl?: string): string | null {
  const match = OG_IMAGE_RE.exec(html) ?? OG_IMAGE_RE_REV.exec(html) ?? TWITTER_IMAGE_RE.exec(html);
  const raw = match?.[1];
  if (!raw) return null;
  try {
    // Attribute values are HTML-escaped; un-escape "&amp;" so query strings
    // such as "?w=600&amp;h=400" survive as real parameters.
    return new URL(raw.replace(/&amp;/g, '&'), baseUrl).toString();
  } catch {
    return null;
  }
}

/** Cached outcome of one thumbnail lookup; `image` is `null` for a cached miss. */
type ThumbCacheEntry = { image: string | null; expires: number };

// In-process cache keyed by page URL. Misses are cached too, so a page without
// an og:image is not re-fetched on every search.
// NOTE(review): expired entries are overwritten on the next lookup of the same
// URL but never evicted otherwise, so the map grows with the number of distinct
// URLs seen during the process lifetime.
const thumbCache = new Map<string, ThumbCacheEntry>();
const THUMB_TTL_MS = 30 * 60 * 1000;

/**
 * Looks up a preview image for the page at `url`, using the in-process cache.
 *
 * On a cache miss (or expired entry) the page is fetched with `fetchText`
 * (4 s timeout, 256 KiB cap) and scanned with `extractOgImage`. Fetch errors
 * are treated as "no image" and cached like any other miss.
 *
 * @returns The image URL, or `null` when none could be determined.
 */
async function enrichThumbnail(url: string): Promise<string | null> {
  const now = Date.now();
  const cached = thumbCache.get(url);
  if (cached && cached.expires > now) return cached.image;

  let image: string | null = null;
  try {
    const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 256 * 1024 });
    image = extractOgImage(html, url);
  } catch {
    // Best effort: a page we cannot fetch simply has no thumbnail.
    image = null;
  }
  thumbCache.set(url, { image, expires: now + THUMB_TTL_MS });
  return image;
}

/**
 * Fills in `thumbnail` on every hit that lacks one, mutating `hits` in place.
 *
 * At most six lookups run concurrently; each worker pulls the next pending
 * hit from a shared queue until the queue is empty.
 */
async function enrichMissingThumbnails(hits: WebHit[]): Promise<void> {
  const queue = hits.filter((h) => !h.thumbnail);
  if (queue.length === 0) return;

  const LIMIT = 6;
  const workers = Array.from({ length: Math.min(LIMIT, queue.length) }, async () => {
    while (queue.length > 0) {
      const h = queue.shift();
      if (!h) break;
      const image = await enrichThumbnail(h.url);
      if (image) h.thumbnail = image;
    }
  });
  await Promise.all(workers);
}

/**
 * Searches the configured recipe domains through a SearXNG instance.
 *
 * The query is restricted with a `site:a OR site:b ...` filter built from
 * `listDomains(db)`. Results are then filtered again locally: the host must
 * be one of the configured domains, the URL must pass `looksLikeRecipePage`,
 * and duplicate URLs are dropped.
 *
 * @param db - Database handle passed to `listDomains`.
 * @param query - Free-text query; surrounding whitespace is ignored.
 * @param opts.searxngUrl - SearXNG base URL. Defaults to the `SEARXNG_URL`
 *   environment variable, then `http://localhost:8888`.
 * @param opts.limit - Maximum number of hits to return (default 20). Values
 *   `<= 0` yield an empty list.
 * @param opts.enrichThumbnails - When not `false`, hits without a thumbnail
 *   are enriched by fetching the page and reading its meta tags.
 * @returns The filtered hits in SearXNG's order; `[]` for an empty query or
 *   when no domains are configured.
 * @throws Error `'SearXNG did not return JSON'` when the response body is not
 *   parseable JSON. Errors raised by `fetchText` for the search request
 *   propagate unchanged.
 */
export async function searchWeb(
  db: Database.Database,
  query: string,
  opts: { searxngUrl?: string; limit?: number; enrichThumbnails?: boolean } = {}
): Promise<WebHit[]> {
  const trimmed = query.trim();
  if (!trimmed) return [];

  const limit = opts.limit ?? 20;
  // Without this guard a limit of 0 would still yield one hit, because the
  // limit check below only runs after a hit has been pushed.
  if (limit <= 0) return [];

  const domains = listDomains(db).map((d) => d.domain);
  if (domains.length === 0) return [];

  const searxngUrl = opts.searxngUrl ?? process.env.SEARXNG_URL ?? 'http://localhost:8888';
  const siteFilter = domains.map((d) => `site:${d}`).join(' OR ');
  const q = `${trimmed} (${siteFilter})`;

  const endpoint = new URL('/search', searxngUrl);
  endpoint.searchParams.set('q', q);
  endpoint.searchParams.set('format', 'json');
  endpoint.searchParams.set('language', 'de');

  const body = await fetchText(endpoint.toString(), {
    timeoutMs: 15_000,
    // SearXNG's bot detection complains without these; we are the only caller
    // and we're not a bot, so satisfy the check deterministically.
    extraHeaders: {
      'X-Forwarded-For': '127.0.0.1',
      'X-Real-IP': '127.0.0.1',
      Accept: 'application/json'
    }
  });

  let parsed: SearxngResponse | null;
  try {
    parsed = JSON.parse(body) as SearxngResponse | null;
  } catch {
    throw new Error('SearXNG did not return JSON');
  }

  // Valid JSON is not necessarily the expected shape; treat anything without
  // a `results` array as "no results" instead of crashing in the loop.
  const rawResults = parsed?.results;
  const results: SearxngResult[] = Array.isArray(rawResults) ? rawResults : [];

  // The site: filter is only a hint to the upstream engines, so re-check the
  // host locally. Assumes the stored domains are already in the form
  // `normalizeDomain` produces — TODO confirm against the repository module.
  const allowed = new Set(domains);
  const seen = new Set<string>();
  const hits: WebHit[] = [];

  for (const r of results) {
    const host = hostnameFromUrl(r.url);
    if (!host || !allowed.has(host)) continue;
    if (!looksLikeRecipePage(r.url)) continue;
    if (seen.has(r.url)) continue;
    seen.add(r.url);

    hits.push({
      url: r.url,
      title: r.title,
      domain: host,
      snippet: r.content ?? null,
      thumbnail: r.thumbnail ?? r.img_src ?? null
    });
    if (hits.length >= limit) break;
  }

  if (opts.enrichThumbnails !== false) {
    await enrichMissingThumbnails(hits);
  }
  return hits;
}