fix(search): filter more reliably via allowTruncate
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m16s
Previously, fetchText threw an error as soon as a page exceeded 512 KB, and with modern recipe sites (inlined bundles, base64 images) that limit is hit practically every time. The catch block then left hasRecipe at NULL, and the result passed the filter unchecked.

With the new FetchOptions.allowTruncate: true we get the first 512 KB (enough for the <head> with og:image and JSON-LD) instead of a throw. The timeout goes up to 8s because the Pi is sometimes slower.

Migration 008 flushes the old NULL-has_recipe entries from the cache so they are classified fresh on the next search instead of staying wrongly cached for another 30 days.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
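For context, a minimal sketch of the new call-site semantics (function and option names are taken from the diff below; the URL and import path are placeholders):

// Before: a page over maxBytes rejected with "Response exceeds ..." and the
// caller's catch left hasRecipe at NULL. Now the same call resolves with the
// first 512 KB, which is enough to scan <head> for og:image and JSON-LD.
import { fetchText } from './fetch'; // import path is an assumption

const html = await fetchText('https://example.com/some-recipe', {
  timeoutMs: 8_000,
  maxBytes: 512 * 1024,
  allowTruncate: true // opt-in; without it, overflow still throws
});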
@@ -0,0 +1,7 @@
+-- In migration 007, `allowTruncate` was not yet implemented in fetchText,
+-- so pages >512 KB threw an error and hasRecipe was stored as NULL
+-- (unknown). These entries would go unrevalidated for another 30 days
+-- and wrongly let results without schema.org/Recipe markup through. We
+-- purge them once here so they are classified correctly on the next
+-- fetch. A pure cache flush; no user data is affected.
+DELETE FROM thumbnail_cache WHERE has_recipe IS NULL;
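For a one-off manual run of the same flush, a sketch using better-sqlite3 (the driver and the database path are assumptions; the project's actual migration runner is not shown in this commit):

import Database from 'better-sqlite3';

const db = new Database('kochwas.db'); // hypothetical path
// Same statement as migration 008; `changes` reports how many stale
// NULL-has_recipe rows were removed from the thumbnail cache.
const { changes } = db
  .prepare('DELETE FROM thumbnail_cache WHERE has_recipe IS NULL')
  .run();
console.log(`flushed ${changes} unclassified cache rows`);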
@@ -3,9 +3,16 @@ export type FetchOptions = {
   timeoutMs?: number;
   userAgent?: string;
   extraHeaders?: Record<string, string>;
+  /**
+   * When true, return the data read up to `maxBytes` instead of throwing.
+   * Useful when we only care about the page head (og:image, JSON-LD) — most
+   * recipe sites are >1 MB today because of inlined bundles, but the head is
+   * usually well under 512 KB.
+   */
+  allowTruncate?: boolean;
 };

-const DEFAULTS: Required<Omit<FetchOptions, 'extraHeaders'>> = {
+const DEFAULTS: Required<Omit<FetchOptions, 'extraHeaders' | 'allowTruncate'>> = {
   maxBytes: 10 * 1024 * 1024,
   timeoutMs: 10_000,
   userAgent: 'Kochwas/0.1'
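A note on the DEFAULTS change above: allowTruncate is deliberately excluded from the Required<Omit<...>> because it has no global default; it is resolved per call as opts.allowTruncate ?? false (see fetchText further down). Spelled out, the resolved shape is:

// Illustration only: what Required<Omit<FetchOptions, 'extraHeaders' | 'allowTruncate'>>
// evaluates to after the change.
type ResolvedDefaults = {
  maxBytes: number;
  timeoutMs: number;
  userAgent: string;
};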
@@ -25,16 +32,23 @@ function assertSafeUrl(url: string): void {

 async function readBody(
   response: Response,
-  maxBytes: number
-): Promise<{ data: Uint8Array; total: number }> {
+  maxBytes: number,
+  allowTruncate: boolean
+): Promise<{ data: Uint8Array; total: number; truncated: boolean }> {
   const reader = response.body?.getReader();
   if (!reader) {
     const buf = new Uint8Array(await response.arrayBuffer());
-    if (buf.byteLength > maxBytes) throw new Error(`Response exceeds ${maxBytes} bytes`);
-    return { data: buf, total: buf.byteLength };
+    if (buf.byteLength > maxBytes) {
+      if (allowTruncate) {
+        return { data: buf.slice(0, maxBytes), total: maxBytes, truncated: true };
+      }
+      throw new Error(`Response exceeds ${maxBytes} bytes`);
+    }
+    return { data: buf, total: buf.byteLength, truncated: false };
   }
   const chunks: Uint8Array[] = [];
   let total = 0;
+  let truncated = false;
   for (;;) {
     const { value, done } = await reader.read();
     if (done) break;
@@ -42,6 +56,14 @@ async function readBody(
     total += value.byteLength;
     if (total > maxBytes) {
       await reader.cancel();
+      if (allowTruncate) {
+        // keep what we have up to the chunk boundary; good enough for HTML head
+        const keep = value.byteLength - (total - maxBytes);
+        if (keep > 0) chunks.push(value.slice(0, keep));
+        total = maxBytes;
+        truncated = true;
+        break;
+      }
       throw new Error(`Response exceeds ${maxBytes} bytes`);
     }
     chunks.push(value);
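The keep arithmetic above is easy to get off by one, so here is a worked check with standalone numbers (not project code):

// Suppose maxBytes = 100 and 80 bytes are already buffered when a 50-byte
// chunk arrives: total = 130, overshoot = total - maxBytes = 30, and we keep
// the first 50 - 30 = 20 bytes of the chunk, landing on exactly 100 bytes.
const maxBytes = 100;
const total = 80 + 50;                    // after counting the new chunk
const keep = 50 - (total - maxBytes);     // value.byteLength - overshoot
console.assert(keep === 20 && 80 + keep === maxBytes);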
@@ -53,7 +75,7 @@ async function readBody(
     merged.set(c, offset);
     offset += c.byteLength;
   }
-  return { data: merged, total };
+  return { data: merged, total, truncated };
 }

 async function doFetch(url: string, opts: FetchOptions): Promise<Response> {
@@ -82,7 +104,7 @@ async function doFetch(url: string, opts: FetchOptions): Promise<Response> {
 export async function fetchText(url: string, opts: FetchOptions = {}): Promise<string> {
   const maxBytes = opts.maxBytes ?? DEFAULTS.maxBytes;
   const res = await doFetch(url, opts);
-  const { data } = await readBody(res, maxBytes);
+  const { data } = await readBody(res, maxBytes, opts.allowTruncate ?? false);
   return new TextDecoder('utf-8').decode(data);
 }

@@ -92,6 +114,6 @@ export async function fetchBuffer(
 ): Promise<{ data: Uint8Array; contentType: string | null }> {
   const maxBytes = opts.maxBytes ?? DEFAULTS.maxBytes;
   const res = await doFetch(url, opts);
-  const { data } = await readBody(res, maxBytes);
+  const { data } = await readBody(res, maxBytes, opts.allowTruncate ?? false);
   return { data, contentType: res.headers.get('content-type') };
 }
@@ -223,7 +223,16 @@ async function enrichPageMeta(
   if (cached) return cached;
   let meta: PageMeta = { image: null, hasRecipe: null };
   try {
-    const html = await fetchText(url, { timeoutMs: 4_000, maxBytes: 512 * 1024 });
+    // allowTruncate: modern recipe pages are often >1 MB (inlined bundles,
+    // base64 images). The og:image and JSON-LD practically always sit in
+    // the <head>, which fits comfortably into the first 512 KB. Previously
+    // fetchText threw when the limit was exceeded and hasRecipe stayed
+    // NULL, so non-recipe pages slipped through the filter.
+    const html = await fetchText(url, {
+      timeoutMs: 8_000,
+      maxBytes: 512 * 1024,
+      allowTruncate: true
+    });
     meta = {
       image: extractPageImage(html, url),
       hasRecipe: hasRecipeJsonLd(html) ? 1 : 0
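hasRecipeJsonLd itself is not part of this diff; purely as an illustration, a detector along these lines would work on truncated HTML too, since JSON-LD blocks normally sit in <head> well inside the first 512 KB (the regex approach is an assumption, not the project's implementation):

// Hypothetical sketch of a schema.org/Recipe check over raw HTML.
function hasRecipeJsonLdSketch(html: string): boolean {
  const blocks =
    html.match(/<script[^>]*application\/ld\+json[^>]*>[\s\S]*?<\/script>/gi) ?? [];
  // Matches "@type": "Recipe" as well as "@type": ["Recipe", ...].
  return blocks.some((b) => /"@type"\s*:\s*("Recipe"|\[[^\]]*"Recipe")/.test(b));
}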
@@ -45,6 +45,20 @@ describe('fetchText', () => {
     });
     await expect(fetchText(`${baseUrl}/`, { timeoutMs: 150 })).rejects.toThrow();
   });
+
+  it('allowTruncate returns first maxBytes instead of throwing', async () => {
+    const head = '<html><head><title>hi</title></head>';
+    const filler = 'x'.repeat(2000);
+    server.on('request', (_req, res) => {
+      res.writeHead(200, { 'content-type': 'text/html' });
+      res.end(head + filler);
+    });
+    const text = await fetchText(`${baseUrl}/`, { maxBytes: 100, allowTruncate: true });
+    // First 100 bytes of the body; enough to contain the opening of <head>
+    expect(text.length).toBeLessThanOrEqual(100); // truncation slices at exactly maxBytes
+    expect(text).toContain('<html>');
+    expect(text).toContain('<head>');
+  });
 });

 describe('fetchBuffer', () => {
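The flag is also threaded through fetchBuffer, but only fetchText gets a test in this commit. A matching sketch, assuming the same server/baseUrl fixtures as the suite above:

it('fetchBuffer truncates oversized bodies when allowTruncate is set', async () => {
  server.on('request', (_req, res) => {
    res.writeHead(200, { 'content-type': 'application/octet-stream' });
    res.end(Buffer.alloc(5000, 0xab));
  });
  const { data } = await fetchBuffer(`${baseUrl}/`, { maxBytes: 100, allowTruncate: true });
  expect(data.byteLength).toBe(100); // sliced at exactly maxBytes
});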