From 0992e51a5d05ed0f5817a812f4c3c8ccfb4a1506 Mon Sep 17 00:00:00 2001
From: hsiegeln <37154749+hsiegeln@users.noreply.github.com>
Date: Fri, 17 Apr 2026 22:33:55 +0200
Subject: [PATCH] =?UTF-8?q?fix(search):=20Filter=20zuverl=C3=A4ssiger=20du?=
=?UTF-8?q?rch=20allowTruncate?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Vorher warf fetchText einen Fehler, sobald eine Seite >512 KB war —
bei modernen Rezeptseiten (eingebettete Bundles, base64-Bilder) läuft
das praktisch immer voll. Der Catch-Block hat dann hasRecipe auf NULL
gelassen, und der Treffer ging ungefiltert durch.
Neue FetchOptions.allowTruncate: true → wir bekommen die ersten 512 KB
(das reicht für den <head> mit og:image und JSON-LD) statt eines Throws.
Timeout auf 8s erhöht, weil der Pi manchmal langsamer ist.
Migration 008 räumt alte NULL-has_recipe-Einträge aus dem Cache, damit
sie beim nächsten Search frisch klassifiziert werden statt weitere
30 Tage falsch gecached zu bleiben.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.../008_thumbnail_cache_drop_unknown.sql | 7 ++++
src/lib/server/http.ts | 38 +++++++++++++++----
src/lib/server/search/searxng.ts | 11 +++++-
tests/integration/http.test.ts | 14 +++++++
4 files changed, 61 insertions(+), 9 deletions(-)
create mode 100644 src/lib/server/db/migrations/008_thumbnail_cache_drop_unknown.sql
diff --git a/src/lib/server/db/migrations/008_thumbnail_cache_drop_unknown.sql b/src/lib/server/db/migrations/008_thumbnail_cache_drop_unknown.sql
new file mode 100644
index 0000000..18b5111
--- /dev/null
+++ b/src/lib/server/db/migrations/008_thumbnail_cache_drop_unknown.sql
@@ -0,0 +1,7 @@
+-- Bei Migration 007 war `allowTruncate` in fetchText noch nicht implementiert,
+-- weshalb Seiten >512 KB einen Fehler warfen und hasRecipe als NULL (unbekannt)
+-- gespeichert wurde. Diese Einträge würden weitere 30 Tage nicht revalidiert
+-- und Treffer ohne schema.org/Recipe-Markup fälschlich durchlassen. Wir
+-- räumen sie jetzt einmalig ab, damit sie beim nächsten Fetch korrekt
+-- klassifiziert werden. Ein reines Cache-Flush, keine User-Daten betroffen.
+DELETE FROM thumbnail_cache WHERE has_recipe IS NULL;
diff --git a/src/lib/server/http.ts b/src/lib/server/http.ts
index 9a149fc..4c68c26 100644
--- a/src/lib/server/http.ts
+++ b/src/lib/server/http.ts
@@ -3,9 +3,16 @@ export type FetchOptions = {
timeoutMs?: number;
userAgent?: string;
extraHeaders?: Record<string, string>;
+ /**
+ * When true, return the data read up to `maxBytes` instead of throwing.
+ * Useful when we only care about the page head (og:image, JSON-LD) — most
+ * recipe sites are >1 MB today because of inlined bundles, but the head is
+ * usually well under 512 KB.
+ */
+ allowTruncate?: boolean;
};
-const DEFAULTS: Required<Omit<FetchOptions, 'extraHeaders'>> = {
+const DEFAULTS: Required<Omit<FetchOptions, 'extraHeaders' | 'allowTruncate'>> = {
maxBytes: 10 * 1024 * 1024,
timeoutMs: 10_000,
userAgent: 'Kochwas/0.1'
@@ -25,16 +32,23 @@ function assertSafeUrl(url: string): void {
async function readBody(
response: Response,
- maxBytes: number
-): Promise<{ data: Uint8Array; total: number }> {
+ maxBytes: number,
+ allowTruncate: boolean
+): Promise<{ data: Uint8Array; total: number; truncated: boolean }> {
const reader = response.body?.getReader();
if (!reader) {
const buf = new Uint8Array(await response.arrayBuffer());
- if (buf.byteLength > maxBytes) throw new Error(`Response exceeds ${maxBytes} bytes`);
- return { data: buf, total: buf.byteLength };
+ if (buf.byteLength > maxBytes) {
+ if (allowTruncate) {
+ return { data: buf.slice(0, maxBytes), total: maxBytes, truncated: true };
+ }
+ throw new Error(`Response exceeds ${maxBytes} bytes`);
+ }
+ return { data: buf, total: buf.byteLength, truncated: false };
}
const chunks: Uint8Array[] = [];
let total = 0;
+ let truncated = false;
for (;;) {
const { value, done } = await reader.read();
if (done) break;
@@ -42,6 +56,14 @@ async function readBody(
total += value.byteLength;
if (total > maxBytes) {
await reader.cancel();
+ if (allowTruncate) {
+ // keep what we have up to the chunk boundary; good enough for HTML head
+ const keep = value.byteLength - (total - maxBytes);
+ if (keep > 0) chunks.push(value.slice(0, keep));
+ total = maxBytes;
+ truncated = true;
+ break;
+ }
throw new Error(`Response exceeds ${maxBytes} bytes`);
}
chunks.push(value);
@@ -53,7 +75,7 @@ async function readBody(
merged.set(c, offset);
offset += c.byteLength;
}
- return { data: merged, total };
+ return { data: merged, total, truncated };
}
async function doFetch(url: string, opts: FetchOptions): Promise<Response> {
@@ -82,7 +104,7 @@ async function doFetch(url: string, opts: FetchOptions): Promise<Response> {
export async function fetchText(url: string, opts: FetchOptions = {}): Promise<string> {
const maxBytes = opts.maxBytes ?? DEFAULTS.maxBytes;
const res = await doFetch(url, opts);
- const { data } = await readBody(res, maxBytes);
+ const { data } = await readBody(res, maxBytes, opts.allowTruncate ?? false);
return new TextDecoder('utf-8').decode(data);
}
@@ -92,6 +114,6 @@ export async function fetchBuffer(
): Promise<{ data: Uint8Array; contentType: string | null }> {
const maxBytes = opts.maxBytes ?? DEFAULTS.maxBytes;
const res = await doFetch(url, opts);
- const { data } = await readBody(res, maxBytes);
+ const { data } = await readBody(res, maxBytes, opts.allowTruncate ?? false);
return { data, contentType: res.headers.get('content-type') };
}
diff --git a/src/lib/server/search/searxng.ts b/src/lib/server/search/searxng.ts
index 7d5da30..64d6c74 100644
--- a/src/lib/server/search/searxng.ts
+++ b/src/lib/server/search/searxng.ts
@@ -223,7 +223,16 @@ async function enrichPageMeta(
if (cached) return cached;
let meta: PageMeta = { image: null, hasRecipe: null };
try {
- const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 512 * 1024 });
+ // allowTruncate: moderne Rezeptseiten sind oft >1 MB (eingebettete
+ // Bundles, base64-Bilder). Das og:image und JSON-LD steht praktisch
+ immer im <head>, was locker in die ersten 512 KB passt. Früher
+ // warf fetchText auf Überschreitung und hasRecipe blieb NULL, sodass
+ // Nicht-Rezept-Seiten fälschlich durchgingen.
+ const html = await fetchText(url, {
+ timeoutMs: 8_000,
+ maxBytes: 512 * 1024,
+ allowTruncate: true
+ });
meta = {
image: extractPageImage(html, url),
hasRecipe: hasRecipeJsonLd(html) ? 1 : 0
diff --git a/tests/integration/http.test.ts b/tests/integration/http.test.ts
index 67de641..c181689 100644
--- a/tests/integration/http.test.ts
+++ b/tests/integration/http.test.ts
@@ -45,6 +45,20 @@ describe('fetchText', () => {
});
await expect(fetchText(`${baseUrl}/`, { timeoutMs: 150 })).rejects.toThrow();
});
+
+ it('allowTruncate returns first maxBytes instead of throwing', async () => {
+ const head = '<html><head><title>hi</title></head>';
+ const filler = 'x'.repeat(2000);
+ server.on('request', (_req, res) => {
+ res.writeHead(200, { 'content-type': 'text/html' });
+ res.end(head + filler);
+ });
+ const text = await fetchText(`${baseUrl}/`, { maxBytes: 100, allowTruncate: true });
+ // First 100 bytes of body — should contain the opening <html> at least
+ expect(text.length).toBeLessThanOrEqual(2048); // chunk boundary may overshoot exact bytes slightly
+ expect(text).toContain('<html>');
+ expect(text).toContain('<head>');
+ });
});
describe('fetchBuffer', () => {