diff --git a/src/lib/server/db/migrations/008_thumbnail_cache_drop_unknown.sql b/src/lib/server/db/migrations/008_thumbnail_cache_drop_unknown.sql
new file mode 100644
index 0000000..18b5111
--- /dev/null
+++ b/src/lib/server/db/migrations/008_thumbnail_cache_drop_unknown.sql
@@ -0,0 +1,7 @@
+-- Bei Migration 007 war `allowTruncate` in fetchText noch nicht implementiert,
+-- weshalb Seiten >512 KB einen Fehler warfen und hasRecipe als NULL (unbekannt)
+-- gespeichert wurde. Diese Einträge würden weitere 30 Tage nicht revalidiert
+-- und Treffer ohne schema.org/Recipe-Markup fälschlich durchlassen. Wir
+-- räumen sie jetzt einmalig ab, damit sie beim nächsten Fetch korrekt
+-- klassifiziert werden. Ein reines Cache-Flush, keine User-Daten betroffen.
+DELETE FROM thumbnail_cache WHERE has_recipe IS NULL;
diff --git a/src/lib/server/http.ts b/src/lib/server/http.ts
index 9a149fc..4c68c26 100644
--- a/src/lib/server/http.ts
+++ b/src/lib/server/http.ts
@@ -3,9 +3,16 @@ export type FetchOptions = {
 	timeoutMs?: number;
 	userAgent?: string;
 	extraHeaders?: Record<string, string>;
+	/**
+	 * When true, return the data read up to `maxBytes` instead of throwing.
+	 * Useful when we only care about the page head (og:image, JSON-LD) — most
+	 * recipe sites are >1 MB today because of inlined bundles, but the head is
+	 * usually well under 512 KB.
+	 */
+	allowTruncate?: boolean;
 };
 
-const DEFAULTS: Required<Omit<FetchOptions, 'extraHeaders'>> = {
+const DEFAULTS: Required<Omit<FetchOptions, 'extraHeaders' | 'allowTruncate'>> = {
 	maxBytes: 10 * 1024 * 1024,
 	timeoutMs: 10_000,
 	userAgent: 'Kochwas/0.1'
@@ -25,16 +32,23 @@ function assertSafeUrl(url: string): void {
 
 async function readBody(
 	response: Response,
-	maxBytes: number
-): Promise<{ data: Uint8Array; total: number }> {
+	maxBytes: number,
+	allowTruncate: boolean
+): Promise<{ data: Uint8Array; total: number; truncated: boolean }> {
 	const reader = response.body?.getReader();
 	if (!reader) {
 		const buf = new Uint8Array(await response.arrayBuffer());
-		if (buf.byteLength > maxBytes) throw new Error(`Response exceeds ${maxBytes} bytes`);
-		return { data: buf, total: buf.byteLength };
+		if (buf.byteLength > maxBytes) {
+			if (allowTruncate) {
+				return { data: buf.slice(0, maxBytes), total: maxBytes, truncated: true };
+			}
+			throw new Error(`Response exceeds ${maxBytes} bytes`);
+		}
+		return { data: buf, total: buf.byteLength, truncated: false };
 	}
 	const chunks: Uint8Array[] = [];
 	let total = 0;
+	let truncated = false;
 	for (;;) {
 		const { value, done } = await reader.read();
 		if (done) break;
@@ -42,6 +56,14 @@ async function readBody(
 		total += value.byteLength;
 		if (total > maxBytes) {
 			await reader.cancel();
+			if (allowTruncate) {
+				// keep what we have up to the chunk boundary; good enough for HTML head
+				const keep = value.byteLength - (total - maxBytes);
+				if (keep > 0) chunks.push(value.slice(0, keep));
+				total = maxBytes;
+				truncated = true;
+				break;
+			}
 			throw new Error(`Response exceeds ${maxBytes} bytes`);
 		}
 		chunks.push(value);
@@ -53,7 +75,7 @@ async function readBody(
 		merged.set(c, offset);
 		offset += c.byteLength;
 	}
-	return { data: merged, total };
+	return { data: merged, total, truncated };
 }
 
 async function doFetch(url: string, opts: FetchOptions): Promise<Response> {
@@ -82,7 +104,7 @@ async function doFetch(url: string, opts: FetchOptions): Promise<Response> {
 
 export async function fetchText(url: string, opts: FetchOptions = {}): Promise<string> {
 	const maxBytes = opts.maxBytes ?? DEFAULTS.maxBytes;
 	const res = await doFetch(url, opts);
-	const { data } = await readBody(res, maxBytes);
+	const { data } = await readBody(res, maxBytes, opts.allowTruncate ?? false);
 	return new TextDecoder('utf-8').decode(data);
 }
 
@@ -92,6 +114,6 @@ export async function fetchBuffer(
 ): Promise<{ data: Uint8Array; contentType: string | null }> {
 	const maxBytes = opts.maxBytes ?? DEFAULTS.maxBytes;
 	const res = await doFetch(url, opts);
-	const { data } = await readBody(res, maxBytes);
+	const { data } = await readBody(res, maxBytes, opts.allowTruncate ?? false);
 	return { data, contentType: res.headers.get('content-type') };
 }
diff --git a/src/lib/server/search/searxng.ts b/src/lib/server/search/searxng.ts
index 7d5da30..64d6c74 100644
--- a/src/lib/server/search/searxng.ts
+++ b/src/lib/server/search/searxng.ts
@@ -223,7 +223,16 @@ async function enrichPageMeta(
 	if (cached) return cached;
 	let meta: PageMeta = { image: null, hasRecipe: null };
 	try {
-		const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 512 * 1024 });
+		// allowTruncate: moderne Rezeptseiten sind oft >1 MB (eingebettete
+		// Bundles, base64-Bilder). Das og:image und JSON-LD steht praktisch
+		// immer im <head>, was locker in die ersten 512 KB passt. Früher
+		// warf fetchText auf Überschreitung und hasRecipe blieb NULL, sodass
+		// Nicht-Rezept-Seiten fälschlich durchgingen.
+		const html = await fetchText(url, {
+			timeoutMs: 8_000,
+			maxBytes: 512 * 1024,
+			allowTruncate: true
+		});
 		meta = {
 			image: extractPageImage(html, url),
 			hasRecipe: hasRecipeJsonLd(html) ? 1 : 0
diff --git a/tests/integration/http.test.ts b/tests/integration/http.test.ts
index 67de641..c181689 100644
--- a/tests/integration/http.test.ts
+++ b/tests/integration/http.test.ts
@@ -45,6 +45,20 @@ describe('fetchText', () => {
 		});
 		await expect(fetchText(`${baseUrl}/`, { timeoutMs: 150 })).rejects.toThrow();
 	});
+
+	it('allowTruncate returns first maxBytes instead of throwing', async () => {
+		const head = '<html><head>hi</head></html>';
+		const filler = 'x'.repeat(2000);
+		server.on('request', (_req, res) => {
+			res.writeHead(200, { 'content-type': 'text/html' });
+			res.end(head + filler);
+		});
+		const text = await fetchText(`${baseUrl}/`, { maxBytes: 100, allowTruncate: true });
+		// First 100 bytes of body — should contain the opening <head> at least
+		expect(text.length).toBeLessThanOrEqual(2048); // chunk boundary may overshoot exact bytes slightly
+		expect(text).toContain('<html>');
+		expect(text).toContain('<head>');
+	});
 });
 
 describe('fetchBuffer', () => {