// src/lib/server/search/searxng.ts

import type Database from 'better-sqlite3';
import { parseHTML } from 'linkedom';
import { listDomains, normalizeDomain } from '../domains/repository';
import { fetchText } from '../http';
import { hasRecipeJsonLd } from '../parsers/json-ld-recipe';

/** A single web search result as returned by `searchWeb`. */
export type WebHit = {
  /** Result URL exactly as delivered by SearXNG. */
  url: string;
  /** Result title as delivered by SearXNG. */
  title: string;
  /** Normalized hostname of `url`; always one of the domains searched. */
  domain: string;
  /** SearXNG's `content` field, or `null` when the result had none. */
  snippet: string | null;
  /**
   * Image URL for the hit: SearXNG's `thumbnail` / `img_src`, replaced by the
   * image extracted from the page itself when enrichment finds one.
   */
  thumbnail: string | null;
};

/** The subset of a SearXNG JSON result entry that this module reads. */
type SearxngResult = {
  url: string;
  title: string;
  /** Text snippet; mapped to `WebHit.snippet`. */
  content?: string;
  /** Preferred thumbnail source when present. */
  thumbnail?: string;
  /** Fallback thumbnail source when `thumbnail` is absent. */
  img_src?: string;
};

/** The subset of the SearXNG JSON response body that this module reads. */
type SearxngResponse = {
  /** Result list; a missing list is treated as "no results". */
  results?: SearxngResult[];
};

/**
 * Extracts the hostname of `url` and runs it through `normalizeDomain`.
 *
 * @param url - Absolute URL string.
 * @returns The normalized hostname, or `null` when the URL cannot be parsed
 *   (or normalization throws).
 */
function hostnameFromUrl(url: string): string | null {
  try {
    const { hostname } = new URL(url);
    return normalizeDomain(hostname);
  } catch {
    // Unparseable input is simply "no host" for the caller.
    return null;
  }
}

// Paths that are clearly NOT a single recipe page on common domains.
// Broad enough to cover forum/magazine/listing URLs we've seen in the wild
// across chefkoch.de, emmikochteinfach.de, experimente-aus-meiner-kueche.de etc.
//
// The patterns are matched against pathname + query string, so a pattern that
// means "the path ends here" must accept either end-of-string or a following
// "?"; a bare `$` would let "/rezepte/?page=2" slip through.
const NON_RECIPE_PATH_PATTERNS: RegExp[] = [
  /\/forum\//i,
  /\/magazin\//i,
  /\/magazine\//i,
  /\/suche($|\/|\?)/i,
  /\/search($|\/|\?)/i,
  /\/benutzer\//i,
  /\/profil\//i,
  /\/autoren\//i,
  /\/themen\//i,
  /\/kategorie\//i,
  /\/kategorien\//i,
  /\/cook-and-style\//i,
  /\/tag\//i,
  /\/rezepte\/?(?:$|\?)/i, // "/rezepte/" listing root, with or without query
  /\/rezepte\/kategorien/i,
  /\/rezepte\/was-kocht/i,
  /\/gewinnspiel/i,
  /\/impressum/i,
  /\/datenschutz/i,
  /\/ueber-(uns|mich)/i,
  // Chefkoch-specific search-/listing-URLs ("/rs/s0", "/rs/s0/...", "/rs/s0?...")
  /\/rs\/s\d+(?:$|\/|\?)/i,
  /Rezepte\.html/i // /rs/.../Rezepte.html is a listing
];

/**
 * Cheap URL-only heuristic for "could this be a single recipe page?".
 *
 * The pathname plus query string is rejected when it is the bare site root
 * or when it matches any entry of `NON_RECIPE_PATH_PATTERNS`; everything
 * else is accepted.
 *
 * @param url - Absolute URL string.
 * @returns `false` for the site root, for known non-recipe paths, and for
 *   URLs that cannot be parsed; `true` otherwise.
 */
function looksLikeRecipePage(url: string): boolean {
  try {
    const { pathname, search } = new URL(url);
    const target = pathname + search;
    // The bare root is a landing page, never a recipe.
    if (target === '' || target === '/') return false;
    return !NON_RECIPE_PATH_PATTERNS.some((pattern) => pattern.test(target));
  } catch {
    return false;
  }
}

/**
 * Resolves a possibly relative `href` against `baseUrl`.
 *
 * @param href - Absolute, protocol-relative or relative reference.
 * @param baseUrl - Absolute URL the reference is relative to.
 * @returns The serialized absolute URL, or `null` when it cannot be resolved.
 */
function resolveUrl(href: string, baseUrl: string): string | null {
  try {
    const absolute = new URL(href, baseUrl);
    return absolute.href;
  } catch {
    return null;
  }
}

/**
 * Reads a URL out of a single JSON-LD `image` entry.
 *
 * Accepts a plain string, or an object carrying the URL in `url` or — as a
 * fallback — `contentUrl` (both are ImageObject properties in schema.org).
 * Empty strings count as "no URL".
 */
function imageUrlFromJsonLdEntry(entry: unknown): string | null {
  if (typeof entry === 'string') return entry.length > 0 ? entry : null;
  if (entry === null || typeof entry !== 'object') return null;
  const obj = entry as Record<string, unknown>;
  for (const key of ['url', 'contentUrl']) {
    const value = obj[key];
    if (typeof value === 'string' && value.length > 0) return value;
  }
  return null;
}

/**
 * Finds the first image URL in parsed JSON-LD data.
 *
 * Search order: for a top-level array, each element in turn; for an object,
 * first the nodes of its `@graph` array (recursively), then its own `image`
 * property. `image` may be a string, an image object, or an array of either;
 * for an array, every entry is tried in order so that an unusable first entry
 * does not hide a usable later one.
 *
 * @param data - Result of `JSON.parse` on a JSON-LD script body.
 * @returns The image URL as written in the data (not resolved against any
 *   base URL), or `null` when none is found.
 */
function imageFromJsonLd(data: unknown): string | null {
  if (!data) return null;
  if (Array.isArray(data)) {
    for (const d of data) {
      const img = imageFromJsonLd(d);
      if (img) return img;
    }
    return null;
  }
  if (typeof data !== 'object') return null;
  const node = data as Record<string, unknown>;
  const graph = node['@graph'];
  if (Array.isArray(graph)) {
    for (const d of graph) {
      const img = imageFromJsonLd(d);
      if (img) return img;
    }
  }
  const image = node.image;
  const candidates: unknown[] = Array.isArray(image) ? image : [image];
  for (const candidate of candidates) {
    const url = imageUrlFromJsonLdEntry(candidate);
    if (url) return url;
  }
  return null;
}

/**
 * Lower-case `<meta property|name>` keys whose `content` is treated as the
 * page's preview image.
 */
const META_IMAGE_KEYS = new Set<string>([
  // Open Graph
  'og:image',
  'og:image:url',
  'og:image:secure_url',
  // Twitter cards
  'twitter:image',
  'twitter:image:src'
]);

/**
 * Picks a representative image URL from a page's HTML.
 *
 * Sources are tried in this order and the first one that resolves wins:
 *   1. OpenGraph / Twitter `<meta>` tags (keys in `META_IMAGE_KEYS`)
 *   2. `<link rel="image_src">`
 *   3. the `image` of any JSON-LD script
 *   4. the first usable `<img>` inside article / main / content containers
 *
 * @param html - Page markup (may be truncated).
 * @param baseUrl - URL the page was fetched from; used to resolve relative
 *   image references.
 * @returns An absolute image URL, or `null` when nothing usable is found or
 *   parsing fails.
 */
function extractPageImage(html: string, baseUrl: string): string | null {
  try {
    const { document } = parseHTML(html);
    // 1. OpenGraph / Twitter meta tags
    for (const m of Array.from(document.querySelectorAll('meta'))) {
      const key = (m.getAttribute('property') ?? m.getAttribute('name') ?? '').toLowerCase();
      if (!META_IMAGE_KEYS.has(key)) continue;
      const content = m.getAttribute('content');
      if (!content) continue;
      const resolved = resolveUrl(content, baseUrl);
      if (resolved) return resolved;
    }
    // 2. <link rel="image_src">
    const link = document.querySelector('link[rel="image_src"]');
    if (link) {
      const href = link.getAttribute('href');
      if (href) {
        const resolved = resolveUrl(href, baseUrl);
        if (resolved) return resolved;
      }
    }
    // 3. JSON-LD image (Recipe schema etc.)
    for (const s of Array.from(document.querySelectorAll('script[type="application/ld+json"]'))) {
      try {
        const data: unknown = JSON.parse(s.textContent ?? '');
        const img = imageFromJsonLd(data);
        if (img) {
          const resolved = resolveUrl(img, baseUrl);
          if (resolved) return resolved;
        }
      } catch {
        // malformed JSON-LD — skip
      }
    }
    // 4. First usable content image in article/main.
    // Lazy-loading markup keeps the real URL in `data-src` and puts nothing
    // (or an inline `data:` placeholder) in `src`, so the selector must not
    // require `src`, and `data-src` is preferred whenever `src` is missing or
    // only a placeholder. A placeholder `src` is still used as a last resort.
    const contentImgs = document.querySelectorAll(
      'article img, main img, .entry-content img, .post-content img, figure img'
    );
    for (const img of Array.from(contentImgs)) {
      const src = img.getAttribute('src');
      const lazySrc = img.getAttribute('data-src');
      const srcIsReal = !!src && !/^\s*data:/i.test(src);
      const candidate = srcIsReal ? src : lazySrc || src;
      if (!candidate) continue;
      const resolved = resolveUrl(candidate, baseUrl);
      if (resolved) return resolved;
    }
    return null;
  } catch {
    return null;
  }
}

/** Cache lifetime in days used when the env override is missing or unusable. */
const DEFAULT_THUMB_TTL_DAYS = 30;

/**
 * Parses the `KOCHWAS_THUMB_TTL_DAYS` override.
 *
 * Unset, blank, non-numeric, non-finite and negative values fall back to
 * `DEFAULT_THUMB_TTL_DAYS`. Without this guard a typo in the variable would
 * produce NaN, and building the expiry timestamp from it
 * (`new Date(NaN).toISOString()`) throws a RangeError. `0` is kept as-is.
 *
 * @param raw - Raw environment value.
 * @returns A finite, non-negative number of days.
 */
function parseThumbTtlDays(raw: string | undefined): number {
  if (raw === undefined || raw.trim() === '') return DEFAULT_THUMB_TTL_DAYS;
  const days = Number(raw);
  return Number.isFinite(days) && days >= 0 ? days : DEFAULT_THUMB_TTL_DAYS;
}

const THUMB_TTL_DAYS = parseThumbTtlDays(process.env.KOCHWAS_THUMB_TTL_DAYS);
const THUMB_TTL_MS = THUMB_TTL_DAYS * 24 * 60 * 60 * 1000;

/** Per-URL metadata derived from a page fetch and stored in `thumbnail_cache`. */
type PageMeta = {
  /** Absolute URL of the image extracted from the page, or `null` if none. */
  image: string | null;
  /**
   * Whether the page contains Recipe JSON-LD: `1` = yes, `0` = no,
   * `null` = unknown (e.g. the page could not be fetched).
   */
  hasRecipe: 0 | 1 | null;
};

/**
 * Looks up unexpired page metadata for `url` in `thumbnail_cache`.
 *
 * @param db - Database handle.
 * @param url - Page URL used as the cache key.
 * @returns The cached metadata, or `null` when there is no row whose
 *   `expires_at` lies after the current time.
 */
function readCachedPageMeta(
  db: Database.Database,
  url: string
): PageMeta | null {
  type CacheRow = { image: string | null; has_recipe: 0 | 1 | null };
  const lookup = db.prepare<[string, string], CacheRow>(
    'SELECT image, has_recipe FROM thumbnail_cache WHERE url = ? AND expires_at > ?'
  );
  // ISO-8601 timestamp of "now" for the expiry comparison.
  const nowIso = new Date().toISOString();
  const cachedRow = lookup.get(url, nowIso);
  return cachedRow ? { image: cachedRow.image, hasRecipe: cachedRow.has_recipe } : null;
}

/**
 * Stores page metadata for `url` in `thumbnail_cache`, replacing any existing
 * row. The row expires `THUMB_TTL_MS` milliseconds from now.
 *
 * @param db - Database handle.
 * @param url - Page URL used as the cache key.
 * @param meta - Metadata to persist.
 */
function writeCachedPageMeta(
  db: Database.Database,
  url: string,
  meta: PageMeta
): void {
  const { image, hasRecipe } = meta;
  const expiryIso = new Date(Date.now() + THUMB_TTL_MS).toISOString();
  const upsert = db.prepare(
    'INSERT OR REPLACE INTO thumbnail_cache (url, image, expires_at, has_recipe) VALUES (?, ?, ?, ?)'
  );
  upsert.run(url, image, expiryIso, hasRecipe);
}

/**
 * Returns the page image and recipe flag for `url`, served from the cache
 * when an unexpired entry exists and otherwise derived from a fresh fetch.
 *
 * The result is written to the cache in every case, including a failed
 * fetch, which is stored as `{ image: null, hasRecipe: null }`.
 *
 * @param db - Database handle for the page-meta cache.
 * @param url - Page URL to inspect.
 * @returns Metadata for the page; `hasRecipe` is `null` when the page could
 *   not be fetched or analysed.
 */
async function enrichPageMeta(
  db: Database.Database,
  url: string
): Promise<PageMeta> {
  const fromCache = readCachedPageMeta(db, url);
  if (fromCache !== null) return fromCache;

  // Start from "unknown"; only overwritten when fetch and analysis succeed.
  const fresh: PageMeta = { image: null, hasRecipe: null };
  try {
    // allowTruncate: modern recipe pages are often >1 MB (embedded bundles,
    // base64 images). The og:image and JSON-LD practically always sit in
    // <head>, which easily fits into the first 512 KB. Previously fetchText
    // threw when the limit was exceeded and hasRecipe stayed NULL, so
    // non-recipe pages wrongly passed the filter.
    const html = await fetchText(url, {
      timeoutMs: 8_000,
      maxBytes: 512 * 1024,
      allowTruncate: true
    });
    const image = extractPageImage(html, url);
    const hasRecipe = hasRecipeJsonLd(html) ? 1 : 0;
    fresh.image = image;
    fresh.hasRecipe = hasRecipe;
  } catch {
    // Fetch failed — leave hasRecipe null (unknown) so we don't permanently
    // hide a temporary-network-error URL.
  }
  writeCachedPageMeta(db, url, fresh);
  return fresh;
}

/**
 * Enriches hits with page-derived images and removes pages that were
 * confirmed to contain no Recipe JSON-LD.
 *
 * Every hit's page metadata is looked up via `enrichPageMeta` — even when
 * SearXNG already supplied a thumbnail — because the page is needed both for
 * its own image and for the recipe check. At most six lookups run at once.
 * Expired cache rows are deleted before the lookups start.
 *
 * @param db - Database handle for the page-meta cache.
 * @param hits - Hits to enrich; returned as-is when empty.
 * @returns Hits whose `hasRecipe` is `1` or unknown, in the original order,
 *   with `thumbnail` replaced by the page image when one was found.
 */
async function enrichAndFilterHits(
  db: Database.Database,
  hits: WebHit[]
): Promise<WebHit[]> {
  if (hits.length === 0) return hits;

  // Lazy cleanup of expired cache entries.
  const nowIso = new Date().toISOString();
  db.prepare('DELETE FROM thumbnail_cache WHERE expires_at <= ?').run(nowIso);

  const MAX_PARALLEL = 6;
  const metaByUrl = new Map<string, PageMeta>();
  let next = 0;

  // Each worker pulls the next unprocessed hit until none are left.
  const drain = async (): Promise<void> => {
    while (next < hits.length) {
      const hit = hits[next];
      next += 1;
      if (!hit) break;
      metaByUrl.set(hit.url, await enrichPageMeta(db, hit.url));
    }
  };

  const workerCount = Math.min(MAX_PARALLEL, hits.length);
  const pool: Promise<void>[] = [];
  for (let i = 0; i < workerCount; i += 1) {
    pool.push(drain());
  }
  await Promise.all(pool);

  // Drop confirmed-non-recipe pages (hasRecipe === 0). Keep unknown (null)
  // and confirmed recipes (1).
  const kept: WebHit[] = [];
  for (const hit of hits) {
    const meta = metaByUrl.get(hit.url);
    if (meta?.hasRecipe === 0) continue;
    const image = meta?.image;
    kept.push(image ? { ...hit, thumbnail: image } : hit);
  }
  return kept;
}

/**
 * Searches SearXNG for recipe pages on the whitelisted domains.
 *
 * The query is restricted with a `site:` filter built from the domain
 * whitelist (or the whitelisted subset of `opts.domains`). Results are then
 * filtered by host, by the URL heuristics of `looksLikeRecipePage`, and for
 * duplicates. Unless `opts.enrichThumbnails` is `false`, each remaining hit
 * is passed through `enrichAndFilterHits`.
 *
 * @param db - Database handle for the domain whitelist and page-meta cache.
 * @param query - User query; surrounding whitespace is ignored.
 * @param opts - Optional settings:
 *   - `searxngUrl`: SearXNG base URL (default: `SEARXNG_URL` env, then
 *     `http://localhost:8888`)
 *   - `limit`: maximum hits collected before enrichment (default 20);
 *     values `<= 0` yield no hits
 *   - `enrichThumbnails`: set to `false` to skip the per-page fetch
 *   - `pageno`: 1-based result page (values below 1 are treated as 1)
 *   - `domains`: restrict the search to these domains; entries not on the
 *     whitelist are ignored, and if none remain the full whitelist is used
 * @returns The hits in SearXNG order; empty for a blank query or an empty
 *   whitelist.
 * @throws Error "SearXNG did not return JSON" when the response body is not
 *   valid JSON. Rejections from the SearXNG request itself propagate.
 */
export async function searchWeb(
  db: Database.Database,
  query: string,
  opts: {
    searxngUrl?: string;
    limit?: number;
    enrichThumbnails?: boolean;
    pageno?: number;
    domains?: string[];
  } = {}
): Promise<WebHit[]> {
  const trimmed = query.trim();
  if (!trimmed) return [];
  const allDomains = listDomains(db).map((d) => d.domain);
  if (allDomains.length === 0) return [];
  // Optional domain filter: intersect with the whitelist so the filter can
  // never search outside the allowed domains.
  const whitelist = new Set(allDomains);
  const filtered = opts.domains?.filter((d) => whitelist.has(d)) ?? [];
  const domains = filtered.length > 0 ? filtered : allDomains;

  const searxngUrl = opts.searxngUrl ?? process.env.SEARXNG_URL ?? 'http://localhost:8888';
  const limit = opts.limit ?? 20;
  const pageno = Math.max(1, opts.pageno ?? 1);
  const siteFilter = domains.map((d) => `site:${d}`).join(' OR ');
  const q = `${trimmed} (${siteFilter})`;
  const endpoint = new URL('/search', searxngUrl);
  endpoint.searchParams.set('q', q);
  endpoint.searchParams.set('format', 'json');
  endpoint.searchParams.set('language', 'de');
  if (pageno > 1) endpoint.searchParams.set('pageno', String(pageno));

  const body = await fetchText(endpoint.toString(), {
    timeoutMs: 15_000,
    // SearXNG's bot detection complains without these; we are the only caller
    // and we're not a bot, so satisfy the check deterministically.
    extraHeaders: {
      'X-Forwarded-For': '127.0.0.1',
      'X-Real-IP': '127.0.0.1',
      Accept: 'application/json'
    }
  });
  let parsed: unknown;
  try {
    parsed = JSON.parse(body);
  } catch {
    throw new Error('SearXNG did not return JSON');
  }
  // Valid JSON is not necessarily a SearXNG response: tolerate `null`,
  // scalars and a missing or non-array `results` by treating them as
  // "no results" instead of crashing on property access or iteration.
  const rawResults =
    parsed !== null && typeof parsed === 'object'
      ? (parsed as SearxngResponse).results
      : undefined;
  const results: SearxngResult[] = Array.isArray(rawResults) ? rawResults : [];
  const allowed = new Set(domains);
  const seen = new Set<string>();
  const hits: WebHit[] = [];
  let dropNonWhitelist = 0;
  let dropNonRecipeUrl = 0;
  let dropDup = 0;
  for (const r of results) {
    // Checked before processing (not after pushing) so that limit <= 0
    // yields zero hits rather than one.
    if (hits.length >= limit) break;
    const host = hostnameFromUrl(r.url);
    if (!host || !allowed.has(host)) {
      dropNonWhitelist += 1;
      continue;
    }
    if (!looksLikeRecipePage(r.url)) {
      dropNonRecipeUrl += 1;
      continue;
    }
    if (seen.has(r.url)) {
      dropDup += 1;
      continue;
    }
    seen.add(r.url);
    hits.push({
      url: r.url,
      title: r.title,
      domain: host,
      snippet: r.content ?? null,
      thumbnail: r.thumbnail ?? r.img_src ?? null
    });
  }
  console.log(
    `[searxng] q=${JSON.stringify(trimmed)} pageno=${pageno} domains=${domains.length} raw=${results.length} non_whitelist=${dropNonWhitelist} non_recipe_url=${dropNonRecipeUrl} dup=${dropDup} kept_pre_enrich=${hits.length}`
  );
  if (opts.enrichThumbnails !== false) {
    const enriched = await enrichAndFilterHits(db, hits);
    console.log(
      `[searxng] q=${JSON.stringify(trimmed)} pageno=${pageno} enrich=${hits.length} dropped_non_recipe=${hits.length - enriched.length} final=${enriched.length}`
    );
    return enriched;
  }
  return hits;
}