From 0992e51a5d05ed0f5817a812f4c3c8ccfb4a1506 Mon Sep 17 00:00:00 2001
From: hsiegeln <37154749+hsiegeln@users.noreply.github.com>
Date: Fri, 17 Apr 2026 22:33:55 +0200
Subject: [PATCH] =?UTF-8?q?fix(search):=20Filter=20zuverl=C3=A4ssiger=20du?=
=?UTF-8?q?rch=20allowTruncate?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Vorher warf fetchText einen Fehler, sobald eine Seite >512 KB war —
bei modernen Rezeptseiten (eingebettete Bundles, base64-Bilder) läuft
das praktisch immer voll. Der Catch-Block hat dann hasRecipe auf NULL
gelassen, und der Treffer ging ungefiltert durch.
Neue FetchOptions.allowTruncate: true → wir bekommen die ersten 512 KB
(das reicht für den <head> mit og:image und JSON-LD) statt eines Throws.
Timeout auf 8s erhöht, weil der Pi manchmal langsamer ist.
Migration 008 räumt alte NULL-has_recipe-Einträge aus dem Cache, damit
sie beim nächsten Search frisch klassifiziert werden statt weitere
30 Tage falsch gecached zu bleiben.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.../008_thumbnail_cache_drop_unknown.sql | 7 ++++
src/lib/server/http.ts | 38 +++++++++++++++----
src/lib/server/search/searxng.ts | 11 +++++-
tests/integration/http.test.ts | 14 +++++++
4 files changed, 61 insertions(+), 9 deletions(-)
create mode 100644 src/lib/server/db/migrations/008_thumbnail_cache_drop_unknown.sql
diff --git a/src/lib/server/db/migrations/008_thumbnail_cache_drop_unknown.sql b/src/lib/server/db/migrations/008_thumbnail_cache_drop_unknown.sql
new file mode 100644
index 0000000..18b5111
--- /dev/null
+++ b/src/lib/server/db/migrations/008_thumbnail_cache_drop_unknown.sql
@@ -0,0 +1,7 @@
+-- Bei Migration 007 war `allowTruncate` in fetchText noch nicht implementiert,
+-- weshalb Seiten >512 KB einen Fehler warfen und hasRecipe als NULL (unbekannt)
+-- gespeichert wurde. Diese Einträge würden weitere 30 Tage nicht revalidiert
+-- und Treffer ohne schema.org/Recipe-Markup fälschlich durchlassen. Wir
+-- räumen sie jetzt einmalig ab, damit sie beim nächsten Fetch korrekt
+-- klassifiziert werden. Ein reines Cache-Flush, keine User-Daten betroffen.
+DELETE FROM thumbnail_cache WHERE has_recipe IS NULL;
diff --git a/src/lib/server/http.ts b/src/lib/server/http.ts
index 9a149fc..4c68c26 100644
--- a/src/lib/server/http.ts
+++ b/src/lib/server/http.ts
@@ -3,9 +3,16 @@ export type FetchOptions = {
timeoutMs?: number;
userAgent?: string;
extraHeaders?: Record<string, string>;
+ /**
+ * When true, return the data read up to `maxBytes` instead of throwing.
+ * Useful when we only care about the page head (og:image, JSON-LD) — most
+ * recipe sites are >1 MB today because of inlined bundles, but the head is
+ * usually well under 512 KB.
+ */
+ allowTruncate?: boolean;
};
-const DEFAULTS: Required<Omit<FetchOptions, 'extraHeaders'>> = {
+const DEFAULTS: Required<Omit<FetchOptions, 'extraHeaders' | 'allowTruncate'>> = {
maxBytes: 10 * 1024 * 1024,
timeoutMs: 10_000,
userAgent: 'Kochwas/0.1'
@@ -25,16 +32,23 @@ function assertSafeUrl(url: string): void {
async function readBody(
response: Response,
- maxBytes: number
-): Promise<{ data: Uint8Array; total: number }> {
+ maxBytes: number,
+ allowTruncate: boolean
+): Promise<{ data: Uint8Array; total: number; truncated: boolean }> {
const reader = response.body?.getReader();
if (!reader) {
const buf = new Uint8Array(await response.arrayBuffer());
- if (buf.byteLength > maxBytes) throw new Error(`Response exceeds ${maxBytes} bytes`);
- return { data: buf, total: buf.byteLength };
+ if (buf.byteLength > maxBytes) {
+ if (allowTruncate) {
+ return { data: buf.slice(0, maxBytes), total: maxBytes, truncated: true };
+ }
+ throw new Error(`Response exceeds ${maxBytes} bytes`);
+ }
+ return { data: buf, total: buf.byteLength, truncated: false };
}
const chunks: Uint8Array[] = [];
let total = 0;
+ let truncated = false;
for (;;) {
const { value, done } = await reader.read();
if (done) break;
@@ -42,6 +56,14 @@ async function readBody(
total += value.byteLength;
if (total > maxBytes) {
await reader.cancel();
+ if (allowTruncate) {
+ // keep what we have up to the chunk boundary; good enough for HTML head
+ const keep = value.byteLength - (total - maxBytes);
+ if (keep > 0) chunks.push(value.slice(0, keep));
+ total = maxBytes;
+ truncated = true;
+ break;
+ }
throw new Error(`Response exceeds ${maxBytes} bytes`);
}
chunks.push(value);
@@ -53,7 +75,7 @@ async function readBody(
merged.set(c, offset);
offset += c.byteLength;
}
- return { data: merged, total };
+ return { data: merged, total, truncated };
}
async function doFetch(url: string, opts: FetchOptions): Promise<Response> {
@@ -82,7 +104,7 @@ async function doFetch(url: string, opts: FetchOptions): Promise<Response> {
export async function fetchText(url: string, opts: FetchOptions = {}): Promise<string> {
const maxBytes = opts.maxBytes ?? DEFAULTS.maxBytes;
const res = await doFetch(url, opts);
- const { data } = await readBody(res, maxBytes);
+ const { data } = await readBody(res, maxBytes, opts.allowTruncate ?? false);
return new TextDecoder('utf-8').decode(data);
}
@@ -92,6 +114,6 @@ export async function fetchBuffer(
): Promise<{ data: Uint8Array; contentType: string | null }> {
const maxBytes = opts.maxBytes ?? DEFAULTS.maxBytes;
const res = await doFetch(url, opts);
- const { data } = await readBody(res, maxBytes);
+ const { data } = await readBody(res, maxBytes, opts.allowTruncate ?? false);
return { data, contentType: res.headers.get('content-type') };
}
diff --git a/src/lib/server/search/searxng.ts b/src/lib/server/search/searxng.ts
index 7d5da30..64d6c74 100644
--- a/src/lib/server/search/searxng.ts
+++ b/src/lib/server/search/searxng.ts
@@ -223,7 +223,16 @@ async function enrichPageMeta(
if (cached) return cached;
let meta: PageMeta = { image: null, hasRecipe: null };
try {
- const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 512 * 1024 });
+ // allowTruncate: moderne Rezeptseiten sind oft >1 MB (eingebettete
+ // Bundles, base64-Bilder). Das og:image und JSON-LD steht praktisch
+ immer im <head>, was locker in die ersten 512 KB passt. Früher
+ // warf fetchText auf Überschreitung und hasRecipe blieb NULL, sodass
+ // Nicht-Rezept-Seiten fälschlich durchgingen.
+ const html = await fetchText(url, {
+ timeoutMs: 8_000,
+ maxBytes: 512 * 1024,
+ allowTruncate: true
+ });
meta = {
image: extractPageImage(html, url),
hasRecipe: hasRecipeJsonLd(html) ? 1 : 0
diff --git a/tests/integration/http.test.ts b/tests/integration/http.test.ts
index 67de641..c181689 100644
--- a/tests/integration/http.test.ts
+++ b/tests/integration/http.test.ts
@@ -45,6 +45,20 @@ describe('fetchText', () => {
});
await expect(fetchText(`${baseUrl}/`, { timeoutMs: 150 })).rejects.toThrow();
});
+
+ it('allowTruncate returns first maxBytes instead of throwing', async () => {
+ const head = '<html><head><title>hi</title></head>';
+ const filler = 'x'.repeat(2000);
+ server.on('request', (_req, res) => {
+ res.writeHead(200, { 'content-type': 'text/html' });
+ res.end(head + filler);
+ });
+ const text = await fetchText(`${baseUrl}/`, { maxBytes: 100, allowTruncate: true });
+ // First 100 bytes of body — should contain the opening <html> at least
+ expect(text.length).toBeLessThanOrEqual(2048); // chunk boundary may overshoot exact bytes slightly
+ expect(text).toContain('<html>');
+ expect(text).toContain('<head>');
+ });
});
describe('fetchBuffer', () => {