kochwas/src/lib/server/parsers/ingredient.ts

import type { Ingredient } from '$lib/types';

// Unit tokens recognised as the first word after a quantity. Lookup is an
// exact, case-sensitive Set membership test, so "TL" matches but "tl" does not.
const UNITS = new Set<string>([
  // Metric mass and volume
  'g',
  'kg',
  'ml',
  'l',
  'cl',
  'dl',
  // Spoon measures: Teelöffel (teaspoon), Esslöffel (tablespoon)
  'TL',
  'EL',
  // Pinch
  'Prise',
  // Packet, in two common abbreviations
  'Pck.',
  'Pkg',
  // Tub / cup-shaped container
  'Becher',
  // Piece, abbreviated and spelled out
  'Stk',
  'Stück',
  // Bunch, cup, can
  'Bund',
  'Tasse',
  'Dose'
]);

// ASCII fraction spellings with a pre-computed value, keyed by the exact
// token text (no surrounding whitespace).
const FRACTION_MAP: Record<string, number> = {
  '1/2': 1 / 2,
  '1/3': 1 / 3,
  '2/3': 2 / 3,
  '1/4': 1 / 4,
  '3/4': 3 / 4
};

// Vulgar-fraction code points. They show up regularly in German-language
// recipe sources (Chefkoch et al. deliver them occasionally, but they are
// more common with Apple's Food app, Fork, etc.).
// Keys are single characters; values are the fraction each glyph denotes.
const UNICODE_FRACTION_MAP: Record<string, number> = {
  // Latin-1 Supplement
  '\u00BD': 1 / 2, // ½
  '\u00BC': 1 / 4, // ¼
  '\u00BE': 3 / 4, // ¾
  // Number Forms
  '\u2150': 1 / 7, // ⅐
  '\u2151': 1 / 9, // ⅑
  '\u2152': 1 / 10, // ⅒
  '\u2153': 1 / 3, // ⅓
  '\u2154': 2 / 3, // ⅔
  '\u2155': 1 / 5, // ⅕
  '\u2156': 2 / 5, // ⅖
  '\u2157': 3 / 5, // ⅗
  '\u2158': 4 / 5, // ⅘
  '\u2159': 1 / 6, // ⅙
  '\u215A': 5 / 6, // ⅚
  '\u215B': 1 / 8, // ⅛
  '\u215C': 3 / 8, // ⅜
  '\u215D': 5 / 8, // ⅝
  '\u215E': 7 / 8 // ⅞
};

// Quantities outside this range are almost certainly parse garbage (e.g. a
// microformat date or a phone number that ended up in a JSON-LD quantity
// field). Such values are turned into null; the caller still keeps the raw
// text for display.
const MAX_REASONABLE_QTY = 10_000;

/**
 * Filters a parsed quantity down to plausible values.
 *
 * @param n - Parsed quantity, or null when nothing could be parsed.
 * @returns `n` unchanged when it is a finite number in the half-open range
 *   (0, MAX_REASONABLE_QTY]; otherwise null.
 */
function clampQuantity(n: number | null): number | null {
  const plausible =
    n !== null && Number.isFinite(n) && n > 0 && n <= MAX_REASONABLE_QTY;
  return plausible ? n : null;
}

/**
 * Converts a single quantity token into a number.
 *
 * Understood forms:
 * - the spellings listed in FRACTION_MAP ("1/2", "3/4", ...),
 * - any other `a/b` fraction ("1/8", "5/6"),
 * - ranges such as "2-3" or "1,5 – 2", for which the lower bound is returned,
 * - plain numbers with "." or "," as the decimal separator.
 *
 * @param raw - Quantity token; surrounding whitespace is ignored.
 * @returns The numeric value, or null when the token does not yield a finite
 *   number (this includes fractions with a zero denominator).
 */
function parseQuantity(raw: string): number | null {
  const trimmed = raw.trim();

  // typeof guard instead of `!== undefined`: a plain object also answers for
  // inherited keys such as "constructor", which must not be returned.
  const known = FRACTION_MAP[trimmed];
  if (typeof known === 'number') return known;

  // Fractions that are not in the table. Without this branch parseFloat would
  // stop at the slash and report "1/8" as 1.
  const fractionMatch = /^(\d+)\/(\d+)$/.exec(trimmed);
  if (fractionMatch) {
    const denominator = Number(fractionMatch[2]);
    return denominator === 0 ? null : Number(fractionMatch[1]) / denominator;
  }

  // Range: only the lower bound is used.
  const rangeMatch = /^(\d+[.,]?\d*)\s*[-–]\s*\d+[.,]?\d*$/.exec(trimmed);
  if (rangeMatch) {
    return parseFloat(rangeMatch[1].replace(',', '.'));
  }

  const num = parseFloat(trimmed.replace(',', '.'));
  return Number.isFinite(num) ? num : null;
}

/**
 * Separates an optional leading unit from the ingredient name.
 *
 * "TL Salz" yields unit "TL" and name "Salz"; "Zitrone" yields unit null and
 * name "Zitrone". The first token only counts as a unit when it is a member
 * of UNITS and at least one more token follows it.
 *
 * @param rest - Ingredient text with the quantity already removed.
 * @returns The recognised unit (or null) and the trimmed name.
 */
function splitUnitAndName(rest: string): { unit: string | null; name: string } {
  const text = rest.trim();
  const parts = text.match(/^(\S+)\s+(.+)$/);
  if (parts === null) {
    return { unit: null, name: text };
  }

  const [, candidate, remainder] = parts;
  return UNITS.has(candidate)
    ? { unit: candidate, name: remainder.trim() }
    : { unit: null, name: text };
}

/**
 * Parses one free-text ingredient line into its structured parts.
 *
 * Steps:
 * 1. The first "(...)" group is lifted out as `note`.
 * 2. A leading quantity is recognised in one of these shapes:
 *    a vulgar-fraction glyph, optionally preceded by a whole number
 *    ("½", "1½", "1 ½"); an ASCII mixed number ("1 1/2"); a plain number,
 *    range or fraction ("200", "1,5", "2-3", "3/4").
 * 3. The remainder is split into unit and name.
 *
 * Lines without a recognisable quantity come back with `quantity` and `unit`
 * set to null and the whole (note-free) text as `name`. Implausible
 * quantities are reported as null as well.
 *
 * @param raw - The ingredient line as found in the source.
 * @param position - Index of the ingredient within its list; copied through.
 * @returns The parsed ingredient; `raw_text` is the trimmed input.
 */
export function parseIngredient(raw: string, position = 0): Ingredient {
  const rawText = raw.trim();
  let working = rawText;
  let note: string | null = null;

  const noteMatch = /\(([^)]+)\)/.exec(working);
  if (noteMatch) {
    note = noteMatch[1].trim();
    const before = working.slice(0, noteMatch.index);
    const after = working.slice(noteMatch.index + noteMatch[0].length);
    // A note in the middle of the line is surrounded by whitespace on both
    // sides; joining the halves verbatim would leave a double space in the name.
    const leavesGap = /\s$/.test(before) && /^\s/.test(after);
    working = (leavesGap ? `${before.trimEnd()} ${after.trimStart()}` : before + after).trim();
  }

  // Shared tail of every successful quantity branch.
  const build = (quantity: number | null, rest: string): Ingredient => {
    const { unit, name } = splitUnitAndName(rest);
    return { position, quantity: clampQuantity(quantity), unit, name, note, raw_text: rawText };
  };

  // Vulgar-fraction glyph, either on its own ("½ TL") or as the fractional
  // part of a mixed number ("1½ TL", "1 ½ TL").
  const wholeMatch = /^(\d+)\s*/.exec(working);
  const glyphIndex = wholeMatch ? wholeMatch[0].length : 0;
  const glyphValue = UNICODE_FRACTION_MAP[working.charAt(glyphIndex)];
  if (glyphValue !== undefined) {
    const tail = working.slice(glyphIndex + 1).trimStart();
    // A glyph with nothing after it is not treated as a quantity.
    if (tail.length > 0) {
      const whole = wholeMatch ? Number(wholeMatch[1]) : 0;
      return build(whole + glyphValue, tail);
    }
  }

  // ASCII mixed number ("1 1/2 TL Zucker"). Only proper fractions are
  // accepted, which also rules out a zero denominator.
  const mixedMatch = /^(\d+)\s+(\d+)\/(\d+)\s+(.+)$/.exec(working);
  if (mixedMatch) {
    const numerator = Number(mixedMatch[2]);
    const denominator = Number(mixedMatch[3]);
    if (numerator > 0 && numerator < denominator) {
      return build(Number(mixedMatch[1]) + numerator / denominator, mixedMatch[4]);
    }
  }

  const qtyPattern = /^((?:\d+[.,]?\d*(?:\s*[-–]\s*\d+[.,]?\d*)?)|(?:\d+\/\d+))\s+(.+)$/;
  const qtyMatch = qtyPattern.exec(working);
  if (!qtyMatch) {
    return { position, quantity: null, unit: null, name: working, note, raw_text: rawText };
  }
  return build(parseQuantity(qtyMatch[1]), qtyMatch[2]);
}