fix(search): Filter zuverlässiger durch allowTruncate

Vorher warf fetchText einen Fehler, sobald eine Seite größer als 512 KB war — bei modernen Rezeptseiten (eingebettete Bundles, base64-Bilder) wird dieses Limit praktisch immer überschritten. Der Catch-Block ließ hasRecipe dann auf NULL, und der Treffer ging ungefiltert durch.

Neu: FetchOptions.allowTruncate. Mit `allowTruncate: true` bekommen wir die ersten 512 KB (das reicht für den <head> mit og:image und JSON-LD), statt dass fetchText wirft. Der Timeout wurde von 4 s auf 8 s erhöht, weil der Pi manchmal langsamer ist.

Migration 008 räumt alte Cache-Einträge mit has_recipe IS NULL ab, damit sie bei der nächsten Suche frisch klassifiziert werden, statt weitere 30 Tage falsch gecacht zu bleiben.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-17 22:33:55 +02:00
parent d3c9bc5619
commit 0992e51a5d
4 changed files with 61 additions and 9 deletions
--- a/src/lib/server/db/migrations/008_thumbnail_cache_drop_unknown.sql
+++ b/src/lib/server/db/migrations/008_thumbnail_cache_drop_unknown.sql
@@ -0,0 +1,7 @@
+-- Bei Migration 007 war `allowTruncate` in fetchText noch nicht implementiert,
+-- weshalb Seiten >512 KB einen Fehler warfen und hasRecipe als NULL (unbekannt)
+-- gespeichert wurde. Diese Einträge würden weitere 30 Tage nicht revalidiert
+-- und Treffer ohne schema.org/Recipe-Markup fälschlich durchlassen. Wir
+-- räumen sie jetzt einmalig ab, damit sie beim nächsten Fetch korrekt
+-- klassifiziert werden. Ein reines Cache-Flush, keine User-Daten betroffen.
+DELETE FROM thumbnail_cache WHERE has_recipe IS NULL;
--- a/src/lib/server/http.ts
+++ b/src/lib/server/http.ts
@@ -3,9 +3,16 @@ export type FetchOptions = {
  timeoutMs?: number;
  userAgent?: string;
  extraHeaders?: Record<string, string>;
+  /**
+   * When true, return the data read up to `maxBytes` instead of throwing.
+   * Useful when we only care about the page head (og:image, JSON-LD) — most
+   * recipe sites are >1 MB today because of inlined bundles, but the head is
+   * usually well under 512 KB.
+   */
+  allowTruncate?: boolean;
 };

-const DEFAULTS: Required<Omit<FetchOptions, 'extraHeaders'>> = {
+const DEFAULTS: Required<Omit<FetchOptions, 'extraHeaders' | 'allowTruncate'>> = {
  maxBytes: 10 * 1024 * 1024,
  timeoutMs: 10_000,
  userAgent: 'Kochwas/0.1'
@@ -25,16 +32,23 @@ function assertSafeUrl(url: string): void {

 async function readBody(
  response: Response,
-  maxBytes: number
-): Promise<{ data: Uint8Array; total: number }> {
+  maxBytes: number,
+  allowTruncate: boolean
+): Promise<{ data: Uint8Array; total: number; truncated: boolean }> {
  const reader = response.body?.getReader();
  if (!reader) {
    const buf = new Uint8Array(await response.arrayBuffer());
-    if (buf.byteLength > maxBytes) throw new Error(`Response exceeds ${maxBytes} bytes`);
-    return { data: buf, total: buf.byteLength };
+    if (buf.byteLength > maxBytes) {
+      if (allowTruncate) {
+        return { data: buf.slice(0, maxBytes), total: maxBytes, truncated: true };
+      }
+      throw new Error(`Response exceeds ${maxBytes} bytes`);
+    }
+    return { data: buf, total: buf.byteLength, truncated: false };
  }
  const chunks: Uint8Array[] = [];
  let total = 0;
+  let truncated = false;
  for (;;) {
    const { value, done } = await reader.read();
    if (done) break;
@@ -42,6 +56,14 @@ async function readBody(
      total += value.byteLength;
      if (total > maxBytes) {
        await reader.cancel();
+        if (allowTruncate) {
+          // keep only the part of this chunk that still fits, so exactly maxBytes are returned; good enough for HTML head
+          const keep = value.byteLength - (total - maxBytes);
+          if (keep > 0) chunks.push(value.slice(0, keep));
+          total = maxBytes;
+          truncated = true;
+          break;
+        }
        throw new Error(`Response exceeds ${maxBytes} bytes`);
      }
      chunks.push(value);
@@ -53,7 +75,7 @@ async function readBody(
    merged.set(c, offset);
    offset += c.byteLength;
  }
-  return { data: merged, total };
+  return { data: merged, total, truncated };
 }

 async function doFetch(url: string, opts: FetchOptions): Promise<Response> {
@@ -82,7 +104,7 @@ async function doFetch(url: string, opts: FetchOptions): Promise<Response> {
 export async function fetchText(url: string, opts: FetchOptions = {}): Promise<string> {
  const maxBytes = opts.maxBytes ?? DEFAULTS.maxBytes;
  const res = await doFetch(url, opts);
-  const { data } = await readBody(res, maxBytes);
+  const { data } = await readBody(res, maxBytes, opts.allowTruncate ?? false);
  return new TextDecoder('utf-8').decode(data);
 }

@@ -92,6 +114,6 @@ export async function fetchBuffer(
 ): Promise<{ data: Uint8Array; contentType: string | null }> {
  const maxBytes = opts.maxBytes ?? DEFAULTS.maxBytes;
  const res = await doFetch(url, opts);
-  const { data } = await readBody(res, maxBytes);
+  const { data } = await readBody(res, maxBytes, opts.allowTruncate ?? false);
  return { data, contentType: res.headers.get('content-type') };
 }
--- a/src/lib/server/search/searxng.ts
+++ b/src/lib/server/search/searxng.ts
@@ -223,7 +223,16 @@ async function enrichPageMeta(
  if (cached) return cached;
  let meta: PageMeta = { image: null, hasRecipe: null };
  try {
-    const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 512 * 1024 });
+    // allowTruncate: moderne Rezeptseiten sind oft >1 MB (eingebettete
+    // Bundles, base64-Bilder). og:image und JSON-LD stehen praktisch
+    // immer im <head>, der locker in die ersten 512 KB passt. Früher
+    // warf fetchText auf Überschreitung und hasRecipe blieb NULL, sodass
+    // Nicht-Rezept-Seiten fälschlich durchgingen.
+    const html = await fetchText(url, {
+      timeoutMs: 8_000,
+      maxBytes: 512 * 1024,
+      allowTruncate: true
+    });
    meta = {
      image: extractPageImage(html, url),
      hasRecipe: hasRecipeJsonLd(html) ? 1 : 0
--- a/tests/integration/http.test.ts
+++ b/tests/integration/http.test.ts
@@ -45,6 +45,20 @@ describe('fetchText', () => {
    });
    await expect(fetchText(`${baseUrl}/`, { timeoutMs: 150 })).rejects.toThrow();
  });
+
+  it('allowTruncate returns first maxBytes instead of throwing', async () => {
+    const head = '<html><head><title>hi</title></head>';
+    const filler = 'x'.repeat(2000);
+    server.on('request', (_req, res) => {
+      res.writeHead(200, { 'content-type': 'text/html' });
+      res.end(head + filler);
+    });
+    const text = await fetchText(`${baseUrl}/`, { maxBytes: 100, allowTruncate: true });
+    // First 100 bytes of body — should contain the <head> opening at least
+    expect(text.length).toBeLessThanOrEqual(2048); // loose bound only: readBody trims the last chunk, so at most maxBytes (100 here) come back
+    expect(text).toContain('<html>');
+    expect(text).toContain('<head>');
+  });
 });

 describe('fetchBuffer', () => {