diff --git a/src/lib/server/parsers/ingredient.ts b/src/lib/server/parsers/ingredient.ts
index 1e2ae43..51230fc 100644
--- a/src/lib/server/parsers/ingredient.ts
+++ b/src/lib/server/parsers/ingredient.ts
@@ -28,6 +28,42 @@ const FRACTION_MAP: Record<string, number> = {
   '3/4': 0.75
 };
 
+// Vulgar-fraction code points — these show up regularly in German-language
+// recipe sources (Chefkoch et al. emit them occasionally, but they are more
+// common from Apple's Food app, Fork, etc.).
+const UNICODE_FRACTION_MAP: Record<string, number> = {
+  '\u00BD': 0.5, // ½
+  '\u00BC': 0.25, // ¼
+  '\u00BE': 0.75, // ¾
+  '\u2150': 1 / 7, // ⅐
+  '\u2151': 1 / 9, // ⅑
+  '\u2152': 1 / 10, // ⅒
+  '\u2153': 1 / 3, // ⅓
+  '\u2154': 2 / 3, // ⅔
+  '\u2155': 0.2, // ⅕
+  '\u2156': 0.4, // ⅖
+  '\u2157': 0.6, // ⅗
+  '\u2158': 0.8, // ⅘
+  '\u2159': 1 / 6, // ⅙
+  '\u215A': 5 / 6, // ⅚
+  '\u215B': 0.125, // ⅛
+  '\u215C': 0.375, // ⅜
+  '\u215D': 0.625, // ⅝
+  '\u215E': 0.875 // ⅞
+};
+
+// Quantities outside this range are almost certainly parse garbage
+// (e.g. a microformat date or a phone number in a JSON-LD quantity
+// field). We return null; raw_text is kept for the UI.
+const MAX_REASONABLE_QTY = 10000;
+
+function clampQuantity(n: number | null): number | null {
+  if (n === null || !Number.isFinite(n)) return null;
+  if (n <= 0) return null;
+  if (n > MAX_REASONABLE_QTY) return null;
+  return n;
+}
+
 function parseQuantity(raw: string): number | null {
   const trimmed = raw.trim();
   if (FRACTION_MAP[trimmed] !== undefined) return FRACTION_MAP[trimmed];
@@ -39,6 +75,16 @@ function parseQuantity(raw: string): number | null {
   return Number.isFinite(num) ? num : null;
 }
 
+// Splits "TL Salz" → unit "TL", name "Salz"; "Zitrone" → unit null, name "Zitrone".
+function splitUnitAndName(rest: string): { unit: string | null; name: string } {
+  const trimmed = rest.trim();
+  const firstTokenMatch = /^(\S+)\s+(.+)$/.exec(trimmed);
+  if (firstTokenMatch && UNITS.has(firstTokenMatch[1])) {
+    return { unit: firstTokenMatch[1], name: firstTokenMatch[2].trim() };
+  }
+  return { unit: null, name: trimmed };
+}
+
 export function parseIngredient(raw: string, position = 0): Ingredient {
   const rawText = raw.trim();
   let working = rawText;
@@ -51,18 +97,24 @@ export function parseIngredient(raw: string, position = 0): Ingredient {
     ).trim();
   }
 
+  // Leading Unicode fraction? Then take that single character as the quantity
+  // and split the remainder into unit + name as usual.
+  const firstChar = working.charAt(0);
+  if (UNICODE_FRACTION_MAP[firstChar] !== undefined) {
+    const tail = working.slice(1).trimStart();
+    if (tail.length > 0) {
+      const quantity = clampQuantity(UNICODE_FRACTION_MAP[firstChar]);
+      const { unit, name } = splitUnitAndName(tail);
+      return { position, quantity, unit, name, note, raw_text: rawText };
+    }
+  }
+
   const qtyPattern = /^((?:\d+[.,]?\d*(?:\s*[-–]\s*\d+[.,]?\d*)?)|(?:\d+\/\d+))\s+(.+)$/;
   const qtyMatch = qtyPattern.exec(working);
   if (!qtyMatch) {
     return { position, quantity: null, unit: null, name: working, note, raw_text: rawText };
   }
-  const quantity = parseQuantity(qtyMatch[1]);
-  let rest = qtyMatch[2].trim();
-  let unit: string | null = null;
-  const firstTokenMatch = /^(\S+)\s+(.+)$/.exec(rest);
-  if (firstTokenMatch && UNITS.has(firstTokenMatch[1])) {
-    unit = firstTokenMatch[1];
-    rest = firstTokenMatch[2].trim();
-  }
-  return { position, quantity, unit, name: rest, note, raw_text: rawText };
+  const quantity = clampQuantity(parseQuantity(qtyMatch[1]));
+  const { unit, name } = splitUnitAndName(qtyMatch[2]);
+  return { position, quantity, unit, name, note, raw_text: rawText };
 }
diff --git a/tests/unit/ingredient.test.ts b/tests/unit/ingredient.test.ts
index 4fbc73a..cc3495c 100644
--- a/tests/unit/ingredient.test.ts
+++ b/tests/unit/ingredient.test.ts
@@ -39,4 +39,66 @@ describe('parseIngredient', () => {
     expect(p.quantity).toBe(2);
     expect(p.name).toBe('Tomaten');
   });
+
+  describe('Unicode-Bruchzeichen', () => {
+    it.each([
+      ['½ TL Salz', 0.5, 'TL', 'Salz'],
+      ['¼ kg Zucker', 0.25, 'kg', 'Zucker'],
+      ['¾ l Milch', 0.75, 'l', 'Milch'],
+      ['⅓ Tasse Mehl', 1 / 3, 'Tasse', 'Mehl'],
+      ['⅔ TL Pfeffer', 2 / 3, 'TL', 'Pfeffer'],
+      ['⅛ TL Muskat', 0.125, 'TL', 'Muskat']
+    ] as const)('%s', (input, qty, unit, name) => {
+      const p = parseIngredient(input);
+      expect(p.quantity).toBeCloseTo(qty, 5);
+      expect(p.unit).toBe(unit);
+      expect(p.name).toBe(name);
+    });
+
+    it('Unicode-Bruch ohne Unit', () => {
+      const p = parseIngredient('½ Zitrone');
+      expect(p.quantity).toBeCloseTo(0.5, 5);
+      expect(p.unit).toBe(null);
+      expect(p.name).toBe('Zitrone');
+    });
+  });
+
+  describe('Mengen-Plausibilitaet (Bounds)', () => {
+    it('weist 0 als Menge ab → quantity null', () => {
+      const p = parseIngredient('0 g Mehl');
+      expect(p.quantity).toBe(null);
+      // name stays whatever follows the "0" — the importer does not have to
+      // reconstruct it perfectly; raw_text is preserved.
+      expect(p.raw_text).toBe('0 g Mehl');
+    });
+
+    it('weist negative Menge ab', () => {
+      // "-1 EL Öl" — the minus sends the regex straight to the fallback (no \d at
+      // the start), so name stays the full text.
+      const p = parseIngredient('-1 EL Öl');
+      expect(p.quantity).toBe(null);
+    });
+
+    it('weist Menge > 10000 ab', () => {
+      const p = parseIngredient('99999 g Hokuspokus');
+      expect(p.quantity).toBe(null);
+    });
+
+    it('akzeptiert die Obergrenze 10000 selbst', () => {
+      const p = parseIngredient('10000 g Mehl');
+      expect(p.quantity).toBe(10000);
+    });
+
+    it('akzeptiert führende Null bei Dezimalbrüchen', () => {
+      const p = parseIngredient('0.5 kg Salz');
+      expect(p.quantity).toBe(0.5);
+      expect(p.unit).toBe('kg');
+    });
+
+    it('akzeptiert deutsche führende Null', () => {
+      const p = parseIngredient('0,25 l Wasser');
+      expect(p.quantity).toBe(0.25);
+      expect(p.unit).toBe('l');
+    });
+  });
 });