Files
kochwas/src/lib/server/parsers/json-ld-recipe.ts
hsiegeln 3e3afc0102
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m19s
fix(importer): Microdata-Steps bei HowToSection + mehrfach-Schritten
Rezeptwelt lieferte Zubereitungs-Steps immer als einen einzigen Treffer,
oft mit vermischtem Icon-alt-Text. Zwei Ursachen, beide in der
generischen Microdata-Logik — kein rezeptwelt-spezifischer Parser nötig.

1. HowToSection wrappt HowToSteps als itemListElement, unser Parser sah
   nur das erste. Jetzt: recipeInstructions-Container mit itemtype=
   HowToSection werden abgestiegen, jedes itemListElement wird ein Step.

2. Ein einzelner HowToStep kann intern "1. …<br>2. …<br>3. …" enthalten.
   Neuer textWithLineBreaks(el) konvertiert <br>/Block-Grenzen zu \n und
   ignoriert <img>/<script>/<style>. splitStepText(raw) erkennt
   nummerierte Zeilen und erzeugt einen eigenen Step pro Nummer; Fort-
   setzungszeilen ohne Nummer hängen an den aktuellen Step an.

3 neue Tests: HowToSection-Kette, inline-nummerierter Multi-Step,
<img>-alt-Unterdrückung.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 09:03:50 +02:00

403 lines
13 KiB
TypeScript

import { parseHTML } from 'linkedom';
import { parseIso8601Duration } from './iso8601-duration';
import { parseIngredient } from './ingredient';
import type { Recipe, Step } from '$lib/types';
/** A single JSON-LD node: an object with arbitrary, not-yet-validated values. */
type JsonLdNode = Record<string, unknown>;

/**
 * Flattens a parsed JSON-LD payload into a list of plain nodes.
 *
 * Arrays are flattened recursively, objects carrying a truthy `@graph` are
 * replaced by the contents of that graph, and every other object is returned
 * as-is. Anything that is not an object or array contributes nothing.
 *
 * @param node - Parsed JSON-LD value of unknown shape.
 * @returns All object nodes found, in document order.
 */
function unwrapGraph(node: unknown): JsonLdNode[] {
  if (Array.isArray(node)) {
    const collected: JsonLdNode[] = [];
    for (const entry of node) {
      collected.push(...unwrapGraph(entry));
    }
    return collected;
  }
  if (node === null || typeof node !== 'object') return [];
  const record = node as JsonLdNode;
  const graph = record['@graph'];
  return graph ? unwrapGraph(graph) : [record];
}
/**
 * Tells whether a JSON-LD `@type` value denotes a Recipe.
 *
 * A string qualifies when it is exactly `Recipe` or ends with `/Recipe`
 * (e.g. a full schema.org URL). An array qualifies when any entry does.
 *
 * @param t - The raw `@type` value.
 * @returns `true` if the value names the Recipe type.
 */
function isRecipeType(t: unknown): boolean {
  if (Array.isArray(t)) {
    for (const candidate of t) {
      if (isRecipeType(candidate)) return true;
    }
    return false;
  }
  return typeof t === 'string' && (t === 'Recipe' || t.endsWith('/Recipe'));
}
/**
 * Extracts a trimmed, non-empty text from a loosely typed JSON-LD value.
 *
 * - Strings are trimmed.
 * - Non-empty arrays are represented by their first element.
 * - Objects contribute their `name`, or else their `text`, whichever is the
 *   first non-blank string.
 *
 * Blank results are always reported as `null`, never as an empty string, so
 * callers can rely on `null` meaning "no usable text".
 *
 * @param v - Value of unknown shape.
 * @returns The trimmed text, or `null` when nothing usable is present.
 */
function toText(v: unknown): string | null {
  if (typeof v === 'string') return v.trim() || null;
  if (Array.isArray(v) && v.length > 0) return toText(v[0]);
  if (v && typeof v === 'object') {
    const o = v as Record<string, unknown>;
    // Prefer `name`, but fall through to `text` when `name` is missing or blank.
    for (const key of ['name', 'text'] as const) {
      const candidate = o[key];
      if (typeof candidate !== 'string') continue;
      const trimmed = candidate.trim();
      if (trimmed) return trimmed;
    }
  }
  return null;
}
/**
 * Resolves a JSON-LD `image` value to a URL string.
 *
 * Accepts a plain string, an object with a string `url` property, or an
 * array of either (only the first element is considered).
 *
 * @param v - Raw image value.
 * @returns The URL as given (untrimmed), or `null` if none can be found.
 */
function toImageUrl(v: unknown): string | null {
  let current: unknown = v;
  // Peel off array wrappers, always following the first element.
  while (Array.isArray(current) && current.length > 0) {
    current = current[0];
  }
  if (typeof current === 'string') return current;
  if (current !== null && typeof current === 'object') {
    const url = (current as Record<string, unknown>).url;
    return typeof url === 'string' ? url : null;
  }
  return null;
}
/**
 * Normalises a JSON-LD value into a list of strings.
 *
 * A string is treated as a comma-separated list whose entries are trimmed and
 * stripped of blanks. An array is mapped element-wise through `toText`,
 * dropping entries for which `toText` yields `null`. Everything else gives
 * an empty list.
 *
 * @param v - Raw value (string, array, or anything else).
 * @returns The extracted strings.
 */
function toStringArray(v: unknown): string[] {
  if (typeof v === 'string') {
    const parts: string[] = [];
    for (const piece of v.split(',')) {
      const trimmed = piece.trim();
      if (trimmed) parts.push(trimmed);
    }
    return parts;
  }
  if (!Array.isArray(v)) return [];
  const texts: string[] = [];
  for (const entry of v) {
    const text = toText(entry);
    if (text !== null) texts.push(text);
  }
  return texts;
}
/**
 * Converts a JSON-LD `recipeInstructions` value into an ordered step list.
 *
 * Handled shapes:
 * - plain strings (one step each, blank strings are ignored),
 * - arrays of any supported shape (walked in order),
 * - `HowToSection` objects, whose `itemListElement` is descended into; the
 *   `@type` may be a string or an array of strings,
 * - any object with a non-blank string `text` (e.g. `HowToStep`),
 * - text-less wrapper objects that carry an `itemListElement`, which is
 *   descended into instead of dropping the nested steps.
 *
 * @param v - Raw `recipeInstructions` value.
 * @returns Steps with 1-based, gap-free `position` values.
 */
function toSteps(v: unknown): Step[] {
  const out: Step[] = [];

  // Appends a step unless the text is blank; positions follow insertion order.
  const pushText = (raw: string): void => {
    const text = raw.trim();
    if (text) out.push({ position: out.length + 1, text });
  };

  // `@type` may legally be a single string or an array of strings.
  const hasType = (node: Record<string, unknown>, wanted: string): boolean => {
    const t = node['@type'];
    return t === wanted || (Array.isArray(t) && t.includes(wanted));
  };

  const walk = (x: unknown): void => {
    if (Array.isArray(x)) {
      for (const item of x) walk(item);
      return;
    }
    if (typeof x === 'string') {
      pushText(x);
      return;
    }
    if (!x || typeof x !== 'object') return;

    const obj = x as Record<string, unknown>;
    if (hasType(obj, 'HowToSection') && obj.itemListElement) {
      walk(obj.itemListElement);
      return;
    }
    if (typeof obj.text === 'string' && obj.text.trim()) {
      pushText(obj.text);
      return;
    }
    // No usable text of its own: fall back to nested list elements, if any.
    if (obj.itemListElement) walk(obj.itemListElement);
  };

  walk(v);
  return out;
}
/**
 * Derives an integer serving count from a `recipeYield`-style value.
 *
 * Finite numbers are truncated toward zero; strings yield their first run of
 * digits; arrays are represented by their first element.
 *
 * @param v - Raw yield value.
 * @returns The serving count, or `null` when no number can be extracted.
 */
function toServings(v: unknown): number | null {
  let candidate: unknown = v;
  // Arrays are reduced to their first element, however deeply wrapped.
  while (Array.isArray(candidate) && candidate.length > 0) {
    candidate = candidate[0];
  }
  if (typeof candidate === 'number') {
    return Number.isFinite(candidate) ? Math.trunc(candidate) : null;
  }
  if (typeof candidate === 'string') {
    const digits = candidate.match(/\d+/);
    return digits ? parseInt(digits[0], 10) : null;
  }
  return null;
}
/**
 * Searches an HTML document for a JSON-LD Recipe node.
 *
 * Every `<script type="application/ld+json">` block is parsed in document
 * order; the first node whose `@type` passes `isRecipeType` wins. Blocks that
 * are empty or contain malformed JSON are skipped.
 *
 * @param html - Raw HTML of the page.
 * @returns The first Recipe node, or `null` if the page has none.
 */
function findRecipeNode(html: string): JsonLdNode | null {
  const { document } = parseHTML(html);
  const blocks = Array.from(
    document.querySelectorAll('script[type="application/ld+json"]')
  );
  for (const block of blocks) {
    const payload = block.textContent;
    if (!payload) continue;
    try {
      const hit = unwrapGraph(JSON.parse(payload)).find((candidate) =>
        isRecipeType(candidate['@type'])
      );
      if (hit) return hit;
    } catch {
      // Malformed JSON-LD in this block; continue with the next one.
    }
  }
  return null;
}
// Microdata alternative to JSON-LD: many SSR sites (including rezeptwelt.de)
// use <div itemtype="https://schema.org/Recipe"> instead of application/ld+json.
// A simple regex is enough — only the presence flag is needed, not the data.
// Matches a quoted itemtype attribute (http or https), case-insensitively.
const MICRODATA_RECIPE = /itemtype\s*=\s*["']https?:\/\/schema\.org\/Recipe["']/i;
/**
 * Reports whether a page carries recipe markup of either supported kind.
 *
 * The cheap microdata regex is tried first; only if it does not match is the
 * HTML parsed to look for a JSON-LD Recipe node. Any error raised while
 * looking for JSON-LD is treated as "no markup".
 *
 * @param html - Raw HTML of the page.
 * @returns `true` if microdata or JSON-LD recipe markup was detected.
 */
export function hasRecipeMarkup(html: string): boolean {
  let found = MICRODATA_RECIPE.test(html);
  if (!found) {
    try {
      found = findRecipeNode(html) !== null;
    } catch {
      found = false;
    }
  }
  return found;
}
/**
 * Legacy alias that simply forwards to {@link hasRecipeMarkup}.
 *
 * @deprecated Use `hasRecipeMarkup` instead.
 * @param html - Raw HTML of the page.
 * @returns Whatever `hasRecipeMarkup` returns for the same input.
 */
export function hasRecipeJsonLd(html: string): boolean {
  const detected = hasRecipeMarkup(html);
  return detected;
}
/**
 * Reads the value of a microdata property element.
 *
 * An explicit `content` attribute on any element takes precedence. Otherwise
 * the value comes from the attribute associated with the tag (`href`, `src`,
 * `data`, `value`, or `datetime` with a text fallback for `<time>`), and from
 * the element's text content for all remaining tags. The result is trimmed.
 *
 * @param el - Element carrying an `itemprop`.
 * @returns The trimmed value; an empty string when nothing is present.
 */
function microdataValueOf(el: Element): string {
  const attr = (name: string): string => (el.getAttribute(name) ?? '').trim();

  if (el.hasAttribute('content')) return attr('content');

  switch (el.tagName.toLowerCase()) {
    case 'meta':
      return attr('content');
    case 'a':
    case 'link':
    case 'area':
      return attr('href');
    case 'img':
    case 'source':
    case 'video':
    case 'audio':
    case 'embed':
    case 'iframe':
    case 'track':
      return attr('src');
    case 'object':
      return attr('data');
    case 'data':
    case 'meter':
      return attr('value');
    case 'time':
      // Machine-readable datetime wins; visible text is the fallback.
      return (el.getAttribute('datetime') ?? el.textContent ?? '').trim();
    default:
      return (el.textContent ?? '').trim();
  }
}
/** Property name → elements declaring that `itemprop`, in document order. */
type MicroProps = Map<string, Element[]>;

/**
 * Collects the microdata properties that belong directly to `scope`.
 *
 * All descendants with an `itemprop` are recorded under each of their
 * whitespace-separated property names. Elements that open their own
 * `itemscope` are recorded (if they have an `itemprop`) but not descended
 * into, so properties of nested items (e.g. the `text` of a HowToStep) do not
 * leak into the outer scope.
 *
 * @param scope - Root element of the item; the root itself is not recorded.
 * @returns Map from property name to declaring elements.
 */
function gatherMicrodataProps(scope: Element): MicroProps {
  const collected: MicroProps = new Map();

  // Explicit stack, filled in reverse so elements pop in document order.
  const pending: Element[] = [];
  const enqueueChildren = (parent: Element): void => {
    const kids = Array.from(parent.children) as Element[];
    for (let i = kids.length - 1; i >= 0; i--) pending.push(kids[i]);
  };

  enqueueChildren(scope);
  while (pending.length > 0) {
    const current = pending.pop() as Element;

    if (current.hasAttribute('itemprop')) {
      const tokens = (current.getAttribute('itemprop') ?? '').split(/\s+/);
      for (const token of tokens) {
        if (!token) continue;
        const bucket = collected.get(token);
        if (bucket) bucket.push(current);
        else collected.set(token, [current]);
      }
    }

    // A nested itemscope owns its descendants; stop at its boundary.
    if (!current.hasAttribute('itemscope')) enqueueChildren(current);
  }
  return collected;
}
/**
 * Returns the value of the first element recorded for a property.
 *
 * @param map - Properties gathered for an item.
 * @param name - Property name to look up.
 * @returns The value, or `null` if the property is absent or its value empty.
 */
function microText(map: MicroProps, name: string): string | null {
  const first = map.get(name)?.[0];
  if (first === undefined) return null;
  const value = microdataValueOf(first);
  return value === '' ? null : value;
}
/**
 * Returns the non-empty values of every element recorded for a property.
 *
 * @param map - Properties gathered for an item.
 * @param name - Property name to look up.
 * @returns Values in document order; empty when the property is absent.
 */
function microAllTexts(map: MicroProps, name: string): string[] {
  const values: string[] = [];
  for (const el of map.get(name) ?? []) {
    const value = microdataValueOf(el);
    if (value !== '') values.push(value);
  }
  return values;
}
/**
 * Extracts an element's text while preserving line structure.
 *
 * `<br>` becomes a newline, and block-level elements (`<p>`, `<div>`, `<li>`,
 * headings, `<tr>`) are fenced by newlines. `<img>`, `<script>`, `<style>` and
 * `<noscript>` are skipped entirely so that alt attributes and other non-text
 * content do not bleed into the recipe text. Text nodes are copied verbatim;
 * node types other than text and element are ignored.
 *
 * @param el - Root element to read.
 * @returns The collected text with `\n` separators (not trimmed).
 */
function textWithLineBreaks(el: Element): string {
  const TEXT_NODE = 3;
  const ELEMENT_NODE = 1;
  const ignored = new Set(['script', 'style', 'img', 'noscript']);
  const blockLevel = new Set(['p', 'div', 'li', 'br', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'tr']);

  let buffer = '';

  // Starts a new line unless the buffer is empty or already at a line start.
  const breakLine = (): void => {
    if (buffer !== '' && !buffer.endsWith('\n')) buffer += '\n';
  };

  const visit = (node: Node): void => {
    switch (node.nodeType) {
      case TEXT_NODE:
        buffer += node.nodeValue ?? '';
        return;
      case ELEMENT_NODE:
        break;
      default:
        return;
    }

    const name = (node as Element).tagName.toLowerCase();
    if (ignored.has(name)) return;
    if (name === 'br') {
      // Explicit break: always emit, even directly after another newline.
      buffer += '\n';
      return;
    }

    const isBlock = blockLevel.has(name);
    if (isBlock) breakLine();
    const kids = Array.from(node.childNodes);
    for (let i = 0; i < kids.length; i++) visit(kids[i]);
    if (isBlock) breakLine();
  };

  visit(el);
  return buffer;
}
/**
 * Splits extracted instruction text into individual steps.
 *
 * Rezeptwelt and other SSR sites often ship a single HowToStep block that
 * internally bundles several steps as "1. …<br>2. …<br>3. …". The text is
 * split into lines, whitespace is collapsed and blank lines dropped. If at
 * least two lines start with a number followed by `.` or `)`, each numbered
 * line opens a new step (number prefix removed) and unnumbered lines are
 * attached to the step currently being built. Otherwise all lines are joined
 * into one step.
 *
 * @param raw - Text with `\n` line separators.
 * @returns The step texts; empty when `raw` contains no visible text.
 */
function splitStepText(raw: string): string[] {
  const numberedLine = /^(\d+)[.)]\s+(.+)$/;

  const cleaned: string[] = [];
  for (const piece of raw.split(/\n+/)) {
    const squashed = piece.replace(/\s+/g, ' ').trim();
    if (squashed) cleaned.push(squashed);
  }
  if (cleaned.length === 0) return [];

  let hits = 0;
  for (const line of cleaned) {
    if (numberedLine.test(line)) hits += 1;
  }
  // A single numbered line is not treated as an enumeration.
  if (hits < 2) return [cleaned.join(' ')];

  const steps: string[] = [];
  let pending = '';
  for (const line of cleaned) {
    const match = numberedLine.exec(line);
    if (match === null) {
      // Continuation line: append to the step being built.
      pending = pending ? `${pending} ${line}` : line;
      continue;
    }
    if (pending) steps.push(pending);
    pending = match[2];
  }
  if (pending) steps.push(pending);
  return steps;
}
/**
 * Extracts the step texts contained in a single step-like element.
 *
 * If the element has a descendant declaring the `text` property, only that
 * descendant is read; otherwise the element itself is used. `itemprop` is a
 * space-separated token list, so the lookup uses the token-aware `~=`
 * selector and also finds e.g. `itemprop="text description"`.
 *
 * @param el - Step element (typically a HowToStep).
 * @returns One or more step texts, as produced by `splitStepText`.
 */
function stepsFromElement(el: Element): string[] {
  const textEl = el.querySelector('[itemprop~="text"]') ?? el;
  return splitStepText(textWithLineBreaks(textEl));
}
/**
 * Builds the ordered step list from microdata `recipeInstructions` elements.
 *
 * Each container is handled according to its `itemtype`:
 * - HowToSection: every descendant declaring `itemListElement` is a step.
 * - HowToStep, or any other element opening its own `itemscope`: the element
 *   itself is read as a step.
 * - Plain container: each `<li>` is a step; without any `<li>` the whole
 *   container text is used.
 *
 * `itemprop` is a space-separated token list, so containers and list elements
 * are located with the token-aware `~=` selector; elements declaring several
 * property names at once are therefore found as well.
 *
 * @param scope - Root element of the Recipe item.
 * @returns Steps with 1-based, gap-free `position` values.
 */
function microSteps(scope: Element): Step[] {
  const out: Step[] = [];

  // Appends texts as consecutive steps; positions follow insertion order.
  const append = (texts: string[]): void => {
    for (const text of texts) out.push({ position: out.length + 1, text });
  };

  const containers = Array.from(
    scope.querySelectorAll('[itemprop~="recipeInstructions"]')
  );
  for (const el of containers) {
    const itemtype = (el.getAttribute('itemtype') ?? '').toLowerCase();

    if (itemtype.includes('howtosection')) {
      // A HowToSection wraps its HowToSteps as itemListElement children.
      const steps = Array.from(el.querySelectorAll('[itemprop~="itemListElement"]'));
      for (const step of steps) append(stepsFromElement(step));
    } else if (itemtype.includes('howtostep') || el.hasAttribute('itemscope')) {
      // Either a HowToStep or some other, unknown scope: try its text anyway.
      append(stepsFromElement(el));
    } else {
      const listItems = Array.from(el.querySelectorAll('li'));
      const sources = listItems.length > 0 ? listItems : [el];
      for (const source of sources) append(splitStepText(textWithLineBreaks(source)));
    }
  }
  return out;
}
/**
 * Extracts a recipe from schema.org microdata markup.
 *
 * The first element whose `itemtype` contains `schema.org/Recipe`
 * (case-insensitive) is used as the item scope. A recipe without a `name`
 * property is rejected.
 *
 * Tags are the de-duplicated union of `recipeCategory`, `recipeCuisine` and
 * `keywords`. Because `keywords` is conventionally one comma-separated
 * string, each keyword value is split on commas, matching what the JSON-LD
 * path does via `toStringArray`.
 *
 * @param html - Raw HTML of the page.
 * @returns The extracted recipe, or `null` if the HTML cannot be parsed, no
 *   Recipe scope exists, or the recipe has no title.
 */
export function extractRecipeFromMicrodata(html: string): Recipe | null {
  let document: Document;
  try {
    ({ document } = parseHTML(html));
  } catch {
    return null;
  }
  const scope = document.querySelector('[itemtype*="schema.org/Recipe" i]');
  if (!scope) return null;

  const props = gatherMicrodataProps(scope);
  const title = microText(props, 'name');
  if (!title) return null;

  const ingredients = microAllTexts(props, 'recipeIngredient')
    .map((raw, i) => parseIngredient(raw, i + 1))
    .filter((x): x is NonNullable<typeof x> => x !== null);
  const steps = microSteps(scope);

  const prep = parseIso8601Duration(microText(props, 'prepTime') ?? undefined);
  const cook = parseIso8601Duration(microText(props, 'cookTime') ?? undefined);
  const total = parseIso8601Duration(microText(props, 'totalTime') ?? undefined);

  const tags = new Set<string>([
    ...microAllTexts(props, 'recipeCategory'),
    ...microAllTexts(props, 'recipeCuisine'),
    // "a, b, c" in a single keywords element must yield three tags, not one.
    ...microAllTexts(props, 'keywords').flatMap((entry) => toStringArray(entry))
  ]);

  return {
    id: null,
    title,
    description: microText(props, 'description'),
    source_url: microText(props, 'url'),
    source_domain: null,
    image_path: microText(props, 'image'),
    servings_default: toServings(microText(props, 'recipeYield')),
    servings_unit: null,
    prep_time_min: prep,
    cook_time_min: cook,
    total_time_min: total,
    cuisine: microText(props, 'recipeCuisine'),
    category: microText(props, 'recipeCategory'),
    ingredients,
    steps,
    tags: [...tags]
  };
}
/**
 * Extracts a recipe from a page, preferring JSON-LD over microdata.
 *
 * When no JSON-LD Recipe node is present, extraction falls back to microdata
 * (rezeptwelt.de and other SSR sites use it instead of application/ld+json).
 * A JSON-LD recipe without a usable `name` is rejected.
 *
 * @param html - Raw HTML of the page.
 * @returns The extracted recipe, or `null` if none could be read.
 */
export function extractRecipeFromHtml(html: string): Recipe | null {
  const node = findRecipeNode(html);
  if (node === null) return extractRecipeFromMicrodata(html);

  const title = toText(node.name);
  if (!title) return null;

  // Only string entries of an array-valued recipeIngredient are considered;
  // positions reflect the index in the original array.
  const rawIngredients: unknown[] = Array.isArray(node.recipeIngredient)
    ? node.recipeIngredient
    : [];
  const ingredients: NonNullable<ReturnType<typeof parseIngredient>>[] = [];
  rawIngredients.forEach((entry, index) => {
    if (typeof entry !== 'string') return;
    const parsed = parseIngredient(entry, index + 1);
    if (parsed !== null) ingredients.push(parsed);
  });

  const steps = toSteps(node.recipeInstructions);
  const imageUrl = toImageUrl(node.image);

  // Durations are only parsed when given as strings.
  const minutes = (value: unknown) =>
    parseIso8601Duration(typeof value === 'string' ? value : undefined);
  const prep = minutes(node.prepTime);
  const cook = minutes(node.cookTime);
  const total = minutes(node.totalTime);

  const tags = new Set<string>();
  for (const source of [node.recipeCategory, node.recipeCuisine, node.keywords]) {
    for (const tag of toStringArray(source)) tags.add(tag);
  }

  const sourceUrl = typeof node.url === 'string' ? node.url : null;

  return {
    id: null,
    title,
    description: toText(node.description),
    source_url: sourceUrl,
    source_domain: null,
    image_path: imageUrl,
    servings_default: toServings(node.recipeYield),
    servings_unit: null,
    prep_time_min: prep,
    cook_time_min: cook,
    total_time_min: total,
    cuisine: toText(node.recipeCuisine),
    category: toText(node.recipeCategory),
    ingredients,
    steps,
    tags: Array.from(tags)
  };
}