2026-04-17 15:33:21 +02:00
|
|
|
import type Database from 'better-sqlite3';
|
|
|
|
|
import { listDomains, normalizeDomain } from '../domains/repository';
|
|
|
|
|
import { fetchText } from '../http';
|
|
|
|
|
|
|
|
|
|
/**
 * One search hit returned by `searchWeb`.
 */
export type WebHit = {
  /** Result URL as reported by the search backend. */
  url: string;
  /** Result title as reported by the search backend. */
  title: string;
  /** Normalized hostname of `url`; always one of the configured domains. */
  domain: string;
  /** Text excerpt for the result, or `null` when the backend sent none. */
  snippet: string | null;
  /** Preview image URL, or `null` when the backend sent none. */
  thumbnail: string | null;
};
|
|
|
|
|
|
|
|
|
|
/**
 * The subset of a SearXNG JSON result entry that this module reads.
 */
type SearxngResult = {
  /** Address of the result page. */
  url: string;
  /** Title of the result page. */
  title: string;
  /** Text excerpt; surfaced as the hit's snippet. */
  content?: string;
  /** Preferred preview image for the hit. */
  thumbnail?: string;
  /** Alternative image field, used when `thumbnail` is absent. */
  img_src?: string;
};
|
|
|
|
|
|
|
|
|
|
/**
 * Top-level shape of the SearXNG JSON response, limited to the field
 * this module consumes.
 */
type SearxngResponse = {
  /** Result entries; treated as an empty list when absent. */
  results?: Array<SearxngResult>;
};
|
|
|
|
|
|
|
|
|
|
/**
 * Extracts the hostname of `url` and runs it through `normalizeDomain`.
 *
 * @param url - URL string to parse.
 * @returns The normalized hostname, or `null` when parsing or
 *   normalization throws.
 */
function hostnameFromUrl(url: string): string | null {
  let host: string | null;
  try {
    const { hostname } = new URL(url);
    // Normalization stays inside the try so that a throw from it also yields null.
    host = normalizeDomain(hostname);
  } catch {
    host = null;
  }
  return host;
}
|
|
|
|
|
|
2026-04-17 15:47:28 +02:00
|
|
|
// URL patterns that clearly do NOT identify a single recipe page on common
// domains. Broad enough to cover the forum/magazine/listing URLs seen in the
// wild across chefkoch.de, emmikochteinfach.de,
// experimente-aus-meiner-kueche.de etc.
const NON_RECIPE_PATH_PATTERNS: RegExp[] = [
  /\/forum\//i,
  /\/magazin\//i,
  /\/magazine\//i,
  /\/suche($|\/|\?)/i,
  /\/search($|\/|\?)/i,
  /\/benutzer\//i,
  /\/profil\//i,
  /\/autoren\//i,
  /\/themen\//i,
  /\/kategorie\//i,
  /\/kategorien\//i,
  /\/cook-and-style\//i,
  /\/tag\//i,
  /\/rezepte\/?$/i, // "/rezepte/" listing root
  /\/rezepte\/kategorien/i,
  /\/rezepte\/was-kocht/i,
  /\/gewinnspiel/i,
  /\/impressum/i,
  /\/datenschutz/i,
  /\/ueber-(uns|mich)/i,
  // Chefkoch-specific search/listing URLs
  /\/rs\/s\d+\//i,
  /\/rs\/s\d+$/i,
  /Rezepte\.html/i // /rs/.../Rezepte.html is a listing
];

/**
 * Heuristically decides whether `url` points at an individual recipe page
 * rather than a landing, listing, search, forum or legal page.
 *
 * @param url - Absolute URL string to classify.
 * @returns `false` when the URL cannot be parsed, when its path is the site
 *   root, or when it matches one of `NON_RECIPE_PATH_PATTERNS`; `true` otherwise.
 */
function looksLikeRecipePage(url: string): boolean {
  let target: URL;
  try {
    target = new URL(url);
  } catch {
    return false;
  }

  const { pathname, search } = target;

  // The site root is a landing page no matter which query string is attached
  // (e.g. "/?utm_source=newsletter"), so only the pathname is inspected here.
  if (pathname === '' || pathname === '/') return false;

  // Each pattern is tried against the bare pathname, so that "$"-anchored
  // patterns still match when a query string follows ("/rezepte/?page=2"),
  // and against pathname + query, so that patterns written with the query in
  // mind ("/suche?q=...") keep matching exactly as before.
  const pathWithQuery = pathname + search;
  return !NON_RECIPE_PATH_PATTERNS.some((rx) => rx.test(pathname) || rx.test(pathWithQuery));
}
|
|
|
|
|
|
2026-04-17 15:33:21 +02:00
|
|
|
/**
 * Queries a SearXNG instance for `query`, restricted to the domains stored in
 * the database, and returns the results that look like single recipe pages.
 *
 * @param db - Database handle passed to `listDomains` to load the allowed domains.
 * @param query - Free-text search terms; surrounding whitespace is ignored.
 * @param opts - `searxngUrl` overrides the `SEARXNG_URL` environment variable
 *   (final fallback `http://localhost:8888`); `limit` caps the number of
 *   returned hits (default 20).
 * @returns At most `limit` hits, de-duplicated by URL, in the order SearXNG
 *   returned them. Empty when the query is blank, no domains are configured,
 *   or `limit` is not positive.
 * @throws Error when the response body is not valid JSON. Errors raised by
 *   `fetchText` propagate unchanged.
 */
export async function searchWeb(
  db: Database.Database,
  query: string,
  opts: { searxngUrl?: string; limit?: number } = {}
): Promise<WebHit[]> {
  const trimmed = query.trim();
  if (!trimmed) return [];
  const domains = listDomains(db).map((d) => d.domain);
  if (domains.length === 0) return [];

  const limit = opts.limit ?? 20;
  // The limit is only checked after a hit has been pushed, so without this
  // guard a limit of 0 (or less) would still yield one hit. It also avoids a
  // pointless network round trip.
  if (limit <= 0) return [];

  const searxngUrl = opts.searxngUrl ?? process.env.SEARXNG_URL ?? 'http://localhost:8888';
  const siteFilter = domains.map((d) => `site:${d}`).join(' OR ');
  const q = `${trimmed} (${siteFilter})`;
  // '/search' is an absolute path: it replaces any path present in `searxngUrl`.
  const endpoint = new URL('/search', searxngUrl);
  endpoint.searchParams.set('q', q);
  endpoint.searchParams.set('format', 'json');
  endpoint.searchParams.set('language', 'de');

  const body = await fetchText(endpoint.toString(), { timeoutMs: 15_000 });
  let parsed: unknown;
  try {
    parsed = JSON.parse(body);
  } catch {
    throw new Error('SearXNG did not return JSON');
  }

  // The payload is untrusted JSON: a non-object body (e.g. `null`) or a
  // non-array `results` field is treated as "no results" instead of raising a
  // TypeError further down.
  const payload: SearxngResponse =
    typeof parsed === 'object' && parsed !== null ? (parsed as SearxngResponse) : {};
  const results = Array.isArray(payload.results) ? payload.results : [];

  const allowed = new Set(domains);
  const seen = new Set<string>();
  const hits: WebHit[] = [];
  for (const r of results) {
    // Skip malformed entries (null, primitives, missing or non-string url).
    if (typeof r?.url !== 'string') continue;
    // The site: filter is only a hint to the search engines, so the host is
    // re-checked against the configured domains.
    const host = hostnameFromUrl(r.url);
    if (!host || !allowed.has(host)) continue;
    if (!looksLikeRecipePage(r.url)) continue;
    if (seen.has(r.url)) continue;
    seen.add(r.url);
    hits.push({
      url: r.url,
      // WebHit.title is a required string; fall back to the URL when the
      // entry carries no usable title.
      title: typeof r.title === 'string' ? r.title : r.url,
      domain: host,
      snippet: r.content ?? null,
      thumbnail: r.thumbnail ?? r.img_src ?? null
    });
    if (hits.length >= limit) break;
  }
  return hits;
}
|