import type Database from 'better-sqlite3';
import { parseHTML } from 'linkedom';
import { listDomains, normalizeDomain } from '../domains/repository';
import { fetchText } from '../http';

/** A single search result on one of the allowed domains, as returned by `searchWeb`. */
export type WebHit = {
  url: string;
  title: string;
  domain: string;
  snippet: string | null;
  thumbnail: string | null;
};

/** The subset of a SearXNG JSON result entry that this module reads. */
type SearxngResult = {
  url: string;
  title: string;
  content?: string;
  thumbnail?: string;
  img_src?: string;
};

/** The subset of the SearXNG JSON response body that this module reads. */
type SearxngResponse = {
  results?: SearxngResult[];
};

/**
 * Extracts the hostname of `url` and passes it through `normalizeDomain`.
 *
 * @returns The normalized hostname, or `null` when `url` cannot be parsed.
 */
function hostnameFromUrl(url: string): string | null {
  try {
    return normalizeDomain(new URL(url).hostname);
  } catch {
    return null;
  }
}

// Paths that are clearly NOT a single recipe page on common domains.
// Broad enough to cover forum/magazine/listing URLs we've seen in the wild
// across chefkoch.de, emmikochteinfach.de, experimente-aus-meiner-kueche.de etc.
const NON_RECIPE_PATH_PATTERNS: RegExp[] = [
  /\/forum\//i,
  /\/magazin\//i,
  /\/magazine\//i,
  /\/suche($|\/|\?)/i,
  /\/search($|\/|\?)/i,
  /\/benutzer\//i,
  /\/profil\//i,
  /\/autoren\//i,
  /\/themen\//i,
  /\/kategorie\//i,
  /\/kategorien\//i,
  /\/cook-and-style\//i,
  /\/tag\//i,
  /\/rezepte\/?$/i, // "/rezepte/" listing root
  /\/rezepte\/kategorien/i,
  /\/rezepte\/was-kocht/i,
  /\/gewinnspiel/i,
  /\/impressum/i,
  /\/datenschutz/i,
  /\/ueber-(uns|mich)/i,
  // Chefkoch-specific search-/listing-URLs
  /\/rs\/s\d+\//i,
  /\/rs\/s\d+$/i,
  /Rezepte\.html/i // /rs/.../Rezepte.html is a listing
];

/**
 * Heuristic filter that rejects URLs which are obviously not a single recipe page.
 *
 * @param url Absolute URL of a search result.
 * @returns `false` when the URL is unparsable, matches a deny-list pattern, or
 *   points at the bare site root; `true` otherwise.
 */
function looksLikeRecipePage(url: string): boolean {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return false;
  }

  const pathname = parsed.pathname;
  const pathWithQuery = pathname + parsed.search;

  for (const rx of NON_RECIPE_PATH_PATTERNS) {
    // Test the bare pathname as well as pathname + query: `$`-anchored patterns
    // such as /\/rezepte\/?$/ would otherwise be defeated by any query string
    // (e.g. "/rezepte/?page=2").
    if (rx.test(pathname) || rx.test(pathWithQuery)) return false;
  }

  // The bare site root is a landing page, never a recipe. Anything else that
  // survived the deny-list above is accepted.
  if (pathWithQuery === '/' || pathWithQuery === '') return false;
  return true;
}

/**
 * Resolves `href` against `baseUrl`.
 *
 * @returns The absolute URL as a string, or `null` when it cannot be resolved.
 */
function resolveUrl(href: string, baseUrl: string): string | null {
  try {
    return new URL(href, baseUrl).toString();
  } catch {
    return null;
  }
}

/**
 * Reads an image URL from a single JSON-LD `image` entry, which may be either
 * a plain string or an object carrying a string `url` property.
 */
function imageUrlFromJsonLdValue(value: unknown): string | null {
  if (typeof value === 'string') return value;
  if (value && typeof value === 'object' && 'url' in value) {
    const url = (value as Record<string, unknown>).url;
    if (typeof url === 'string') return url;
  }
  return null;
}

/**
 * Recursively searches parsed JSON-LD data for an image URL.
 *
 * Arrays and `@graph` members are searched depth-first; on a node, `image` may
 * be a string, an object with `url`, or an array of which only the first entry
 * is considered.
 *
 * @param data Parsed JSON-LD of unknown shape.
 * @returns The first image URL found (possibly relative), or `null`.
 */
function imageFromJsonLd(data: unknown): string | null {
  if (!data) return null;

  if (Array.isArray(data)) {
    for (const entry of data) {
      const img = imageFromJsonLd(entry);
      if (img) return img;
    }
    return null;
  }

  if (typeof data !== 'object') return null;
  const node = data as Record<string, unknown>;

  const graph = node['@graph'];
  if (Array.isArray(graph)) {
    for (const entry of graph) {
      const img = imageFromJsonLd(entry);
      if (img) return img;
    }
  }

  const image = node.image;
  return imageUrlFromJsonLdValue(Array.isArray(image) ? image[0] : image);
}

// <meta> property/name values (lower-cased) that carry a page image URL.
const META_IMAGE_KEYS = new Set([
  'og:image',
  'og:image:url',
  'og:image:secure_url',
  'twitter:image',
  'twitter:image:src'
]);

/**
 * Extracts the most representative image URL from an HTML document.
 *
 * Sources are tried in order: OpenGraph/Twitter meta tags, `<link rel="image_src">`,
 * JSON-LD `image`, and finally the first image inside typical content containers.
 *
 * @param html Raw HTML of the page.
 * @param baseUrl URL the page was fetched from; used to resolve relative URLs.
 * @returns An absolute image URL, or `null` when none is found or parsing fails.
 */
function extractPageImage(html: string, baseUrl: string): string | null {
  try {
    const { document } = parseHTML(html);

    // 1. OpenGraph / Twitter meta tags
    for (const m of Array.from(document.querySelectorAll('meta'))) {
      const key = (m.getAttribute('property') ?? m.getAttribute('name') ?? '').toLowerCase();
      if (!META_IMAGE_KEYS.has(key)) continue;
      const content = m.getAttribute('content');
      if (!content) continue;
      const resolved = resolveUrl(content, baseUrl);
      if (resolved) return resolved;
    }

    // 2. <link rel="image_src">
    const link = document.querySelector('link[rel="image_src"]');
    if (link) {
      const href = link.getAttribute('href');
      if (href) {
        const resolved = resolveUrl(href, baseUrl);
        if (resolved) return resolved;
      }
    }

    // 3. JSON-LD image (Recipe schema etc.)
    for (const s of Array.from(document.querySelectorAll('script[type="application/ld+json"]'))) {
      try {
        const data: unknown = JSON.parse(s.textContent ?? '');
        const img = imageFromJsonLd(data);
        if (img) {
          const resolved = resolveUrl(img, baseUrl);
          if (resolved) return resolved;
        }
      } catch {
        // malformed JSON-LD — skip
      }
    }

    // 4. First content image in article/main
    const contentImg = document.querySelector(
      'article img[src], main img[src], .entry-content img[src], .post-content img[src], figure img[src]'
    );
    if (contentImg) {
      // `||` rather than `??`: the selector guarantees a `src` attribute, so the
      // `data-src` fallback is only reachable when `src` is the empty string.
      const src = contentImg.getAttribute('src') || contentImg.getAttribute('data-src');
      if (src) {
        const resolved = resolveUrl(src, baseUrl);
        if (resolved) return resolved;
      }
    }

    return null;
  } catch {
    return null;
  }
}

const DEFAULT_THUMB_TTL_DAYS = 30;
const configuredThumbTtlDays = Number(
  process.env.KOCHWAS_THUMB_TTL_DAYS ?? DEFAULT_THUMB_TTL_DAYS
);
// A non-numeric env value yields NaN, which would make
// `new Date(NaN).toISOString()` throw when writing the cache; fall back instead.
const THUMB_TTL_DAYS = Number.isFinite(configuredThumbTtlDays)
  ? configuredThumbTtlDays
  : DEFAULT_THUMB_TTL_DAYS;
const THUMB_TTL_MS = THUMB_TTL_DAYS * 24 * 60 * 60 * 1000;

/**
 * Looks up a non-expired `thumbnail_cache` row for `url`.
 *
 * @returns The cached row (whose `image` may be `null`, i.e. a cached miss), or
 *   `null` when there is no unexpired entry.
 */
function readCachedThumbnail(
  db: Database.Database,
  url: string
): { image: string | null } | null {
  const row = db
    .prepare<[string, string], { image: string | null }>(
      "SELECT image FROM thumbnail_cache WHERE url = ? AND expires_at > ?"
    )
    .get(url, new Date().toISOString());
  return row ?? null;
}

/**
 * Inserts or replaces the `thumbnail_cache` row for `url`, expiring
 * `THUMB_TTL_MS` milliseconds from now. A `null` image is stored as well.
 */
function writeCachedThumbnail(
  db: Database.Database,
  url: string,
  image: string | null
): void {
  const expiresAt = new Date(Date.now() + THUMB_TTL_MS).toISOString();
  db.prepare(
    'INSERT OR REPLACE INTO thumbnail_cache (url, image, expires_at) VALUES (?, ?, ?)'
  ).run(url, image, expiresAt);
}

/**
 * Returns the page image for `url`, served from the cache when possible.
 *
 * On a cache miss the page is fetched (4 s timeout, 512 KiB cap) and the result,
 * including a `null` result after a failed fetch, is written to the cache.
 *
 * @returns The image URL, or `null` when none could be determined.
 */
async function enrichThumbnail(
  db: Database.Database,
  url: string
): Promise<string | null> {
  const cached = readCachedThumbnail(db, url);
  if (cached) return cached.image;

  let image: string | null = null;
  try {
    const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 512 * 1024 });
    image = extractPageImage(html, url);
  } catch {
    // Best effort: an unreachable page simply has no thumbnail.
    image = null;
  }
  writeCachedThumbnail(db, url, image);
  return image;
}

/**
 * Replaces `thumbnail` on each hit, in place, with the page's own image where
 * one can be found. Runs at most 6 lookups concurrently.
 *
 * @param hits Hits to enrich; mutated in place.
 */
async function enrichAllThumbnails(
  db: Database.Database,
  hits: WebHit[]
): Promise<void> {
  // Always fetch the page image even when SearXNG gave us a thumbnail —
  // the search engine's thumbnail is typically 150-200px, while og:image
  // / JSON-LD image on the page is the full-resolution recipe photo.
  // The thumbnail_cache table (default 30-day TTL) makes repeat searches instant.
  if (hits.length === 0) return;

  // Lazy cleanup of expired entries — O(log n) index scan, cheap.
  db.prepare('DELETE FROM thumbnail_cache WHERE expires_at <= ?').run(
    new Date().toISOString()
  );

  const queue = [...hits];
  const LIMIT = 6;
  // Each worker drains the shared queue until it is empty.
  const workers = Array.from({ length: Math.min(LIMIT, queue.length) }, async () => {
    while (queue.length > 0) {
      const h = queue.shift();
      if (!h) break;
      const image = await enrichThumbnail(db, h.url);
      if (image) h.thumbnail = image;
    }
  });
  await Promise.all(workers);
}

/**
 * Searches the configured domains for `query` through a SearXNG instance.
 *
 * Results are restricted to the domains returned by `listDomains`, filtered
 * through `looksLikeRecipePage`, de-duplicated by URL and capped at `limit`.
 *
 * @param db Database handle used for the domain list and the thumbnail cache.
 * @param query Free-text search query; surrounding whitespace is ignored.
 * @param opts.searxngUrl Base URL of SearXNG; defaults to `SEARXNG_URL` or
 *   `http://localhost:8888`.
 * @param opts.limit Maximum number of hits; defaults to 20.
 * @param opts.enrichThumbnails Set to `false` to skip fetching page images.
 * @returns The matching hits; empty when the query is blank or no domains are
 *   configured.
 * @throws Error when the SearXNG response body is not valid JSON. Errors thrown
 *   by `fetchText` for the search request are not caught here.
 */
export async function searchWeb(
  db: Database.Database,
  query: string,
  opts: { searxngUrl?: string; limit?: number; enrichThumbnails?: boolean } = {}
): Promise<WebHit[]> {
  const trimmed = query.trim();
  if (!trimmed) return [];

  const domains = listDomains(db).map((d) => d.domain);
  if (domains.length === 0) return [];

  const searxngUrl = opts.searxngUrl ?? process.env.SEARXNG_URL ?? 'http://localhost:8888';
  const limit = opts.limit ?? 20;

  const siteFilter = domains.map((d) => `site:${d}`).join(' OR ');
  const q = `${trimmed} (${siteFilter})`;

  const endpoint = new URL('/search', searxngUrl);
  endpoint.searchParams.set('q', q);
  endpoint.searchParams.set('format', 'json');
  endpoint.searchParams.set('language', 'de');

  const body = await fetchText(endpoint.toString(), {
    timeoutMs: 15_000,
    // SearXNG's bot detection complains without these; we are the only caller
    // and we're not a bot, so satisfy the check deterministically.
    extraHeaders: {
      'X-Forwarded-For': '127.0.0.1',
      'X-Real-IP': '127.0.0.1',
      Accept: 'application/json'
    }
  });

  let parsed: unknown;
  try {
    parsed = JSON.parse(body);
  } catch {
    throw new Error('SearXNG did not return JSON');
  }

  // The payload is external input: tolerate `null`, non-objects and a
  // non-array `results` instead of throwing a TypeError further down.
  const rawResults =
    parsed && typeof parsed === 'object' ? (parsed as SearxngResponse).results : undefined;
  const results: SearxngResult[] = Array.isArray(rawResults) ? rawResults : [];

  const allowed = new Set(domains);
  const seen = new Set<string>();
  const hits: WebHit[] = [];

  for (const r of results) {
    // Checked before adding so that a limit of 0 (or less) yields no hits.
    if (hits.length >= limit) break;
    if (!r || typeof r.url !== 'string') continue;

    const host = hostnameFromUrl(r.url);
    if (!host || !allowed.has(host)) continue;
    if (!looksLikeRecipePage(r.url)) continue;
    if (seen.has(r.url)) continue;
    seen.add(r.url);

    hits.push({
      url: r.url,
      // Keep `title` a string even when the engine omits it.
      title: typeof r.title === 'string' ? r.title : r.url,
      domain: host,
      snippet: r.content ?? null,
      thumbnail: r.thumbnail ?? r.img_src ?? null
    });
  }

  if (opts.enrichThumbnails !== false) {
    await enrichAllThumbnails(db, hits);
  }
  return hits;
}