All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m19s
Rezeptwelt lieferte Zubereitungs-Steps immer als einen einzigen Treffer, oft mit vermischtem Icon-alt-Text. Zwei Ursachen, beide in der generischen Microdata-Logik — kein rezeptwelt-spezifischer Parser nötig. 1. HowToSection wrappt HowToSteps als itemListElement, unser Parser sah nur das erste. Jetzt: recipeInstructions-Container mit itemtype=HowToSection werden abgestiegen, jedes itemListElement wird ein Step. 2. Ein einzelner HowToStep kann intern "1. …<br>2. …<br>3. …" enthalten. Neuer textWithLineBreaks(el) konvertiert <br>/Block-Grenzen zu \n und ignoriert <img>/<script>/<style>. splitStepText(raw) erkennt nummerierte Zeilen und erzeugt einen eigenen Step pro Nummer; Fortsetzungszeilen ohne Nummer hängen an den aktuellen Step an. 3 neue Tests: HowToSection-Kette, inline-nummerierter Multi-Step, <img>-alt-Unterdrückung. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
403 lines
13 KiB
TypeScript
403 lines
13 KiB
TypeScript
import { parseHTML } from 'linkedom';
|
|
import { parseIso8601Duration } from './iso8601-duration';
|
|
import { parseIngredient } from './ingredient';
|
|
import type { Recipe, Step } from '$lib/types';
|
|
|
|
/** A parsed JSON-LD object whose property values have not been validated yet. */
type JsonLdNode = Record<string, unknown>;

/**
 * Flattens a parsed JSON-LD payload into a flat list of object nodes.
 *
 * Arrays are flattened recursively, an object carrying a truthy `@graph` is
 * replaced by the nodes of that graph, and any other object is returned as a
 * single-element list. Primitives and `null` contribute nothing.
 *
 * @param node - Any value produced by `JSON.parse`.
 * @returns The object nodes found in `node`, in encounter order.
 */
function unwrapGraph(node: unknown): JsonLdNode[] {
  if (Array.isArray(node)) {
    const collected: JsonLdNode[] = [];
    for (const entry of node) collected.push(...unwrapGraph(entry));
    return collected;
  }
  if (typeof node !== 'object' || node === null) return [];

  const record = node as JsonLdNode;
  const graph = record['@graph'];
  return graph ? unwrapGraph(graph) : [record];
}
|
|
|
|
/**
 * Tells whether a JSON-LD `@type` value denotes a Recipe.
 *
 * A string matches when it is exactly `Recipe` or ends in `/Recipe`
 * (URL-qualified type). An array matches when any of its entries matches.
 *
 * @param t - The raw `@type` value.
 * @returns `true` if the value names the Recipe type.
 */
function isRecipeType(t: unknown): boolean {
  if (Array.isArray(t)) {
    for (const candidate of t) {
      if (isRecipeType(candidate)) return true;
    }
    return false;
  }
  return typeof t === 'string' && (t === 'Recipe' || t.endsWith('/Recipe'));
}
|
|
|
|
/**
 * Extracts a human-readable text from a loosely typed JSON-LD value.
 *
 * - A string is trimmed; a blank string yields `null`.
 * - An array is represented by its first element.
 * - An object is represented by its `name`, or failing that its `text`.
 *   Blank values are skipped, so a blank `name` no longer hides a usable
 *   `text`, and the function never returns an empty string.
 *
 * @param v - The raw property value.
 * @returns The trimmed, non-empty text, or `null` if none is available.
 */
function toText(v: unknown): string | null {
  if (typeof v === 'string') return v.trim() || null;
  if (Array.isArray(v)) return v.length > 0 ? toText(v[0]) : null;
  if (v && typeof v === 'object') {
    const o = v as Record<string, unknown>;
    for (const key of ['name', 'text']) {
      const candidate = o[key];
      if (typeof candidate === 'string') {
        const trimmed = candidate.trim();
        if (trimmed) return trimmed;
      }
    }
  }
  return null;
}
|
|
|
|
/**
 * Resolves a JSON-LD `image` value to a URL string.
 *
 * A string is returned unchanged, a non-empty array is represented by its
 * first element, and an object is represented by its string `url` property.
 *
 * @param v - The raw `image` value.
 * @returns The URL, or `null` when none can be derived.
 */
function toImageUrl(v: unknown): string | null {
  switch (typeof v) {
    case 'string':
      return v;
    case 'object': {
      if (v === null) return null;
      if (Array.isArray(v) && v.length > 0) return toImageUrl(v[0]);
      const url = (v as Record<string, unknown>).url;
      return typeof url === 'string' ? url : null;
    }
    default:
      return null;
  }
}
|
|
|
|
/**
 * Converts a JSON-LD value into a list of non-empty strings.
 *
 * A string is treated as a comma-separated list whose parts are trimmed and
 * blank parts dropped. An array is mapped through `toText`, discarding
 * entries for which `toText` yields `null`. Anything else gives `[]`.
 *
 * @param v - The raw property value (e.g. `keywords`).
 * @returns The extracted strings in their original order.
 */
function toStringArray(v: unknown): string[] {
  if (typeof v === 'string') {
    const parts: string[] = [];
    for (const piece of v.split(',')) {
      const trimmed = piece.trim();
      if (trimmed) parts.push(trimmed);
    }
    return parts;
  }
  if (!Array.isArray(v)) return [];

  const texts: string[] = [];
  for (const entry of v) {
    const text = toText(entry);
    if (text !== null) texts.push(text);
  }
  return texts;
}
|
|
|
|
/**
 * Normalises a JSON-LD `recipeInstructions` value into a flat, numbered list
 * of steps.
 *
 * Accepted shapes, applied recursively:
 * - a string: one step (blank strings are ignored);
 * - an array: each entry is processed in order;
 * - a HowToSection with `itemListElement`: its children are processed. The
 *   `@type` may be the plain name, a URL-qualified name, or an array of
 *   types;
 * - any object with a non-blank string `text`: one step;
 * - any other object with `itemListElement` (e.g. an ItemList, or a step
 *   that only carries sub-directions): its children are processed.
 *
 * @param v - The raw `recipeInstructions` value.
 * @returns Steps with 1-based, gap-free `position` values.
 */
function toSteps(v: unknown): Step[] {
  const out: Step[] = [];

  const isSection = (type: unknown): boolean => {
    if (typeof type === 'string') {
      return type === 'HowToSection' || type.endsWith('/HowToSection');
    }
    return Array.isArray(type) && type.some(isSection);
  };

  const pushText = (text: string): void => {
    const trimmed = text.trim();
    if (trimmed) out.push({ position: out.length + 1, text: trimmed });
  };

  const walk = (x: unknown): void => {
    if (Array.isArray(x)) {
      for (const item of x) walk(item);
      return;
    }
    if (typeof x === 'string') {
      pushText(x);
      return;
    }
    if (!x || typeof x !== 'object') return;

    const obj = x as Record<string, unknown>;
    const children = obj.itemListElement;
    if (children && isSection(obj['@type'])) {
      walk(children);
      return;
    }
    if (typeof obj.text === 'string' && obj.text.trim()) {
      // An own text wins over nested elements so a step is not split up.
      pushText(obj.text);
      return;
    }
    // No usable text of its own: fall back to whatever list it wraps.
    if (children) walk(children);
  };

  walk(v);
  return out;
}
|
|
|
|
/**
 * Derives an integer serving count from a JSON-LD `recipeYield` value.
 *
 * A finite number is truncated toward zero, a string contributes its first
 * run of decimal digits, and a non-empty array is represented by its first
 * element.
 *
 * @param v - The raw yield value.
 * @returns The serving count, or `null` when no number can be found.
 */
function toServings(v: unknown): number | null {
  if (Array.isArray(v)) return v.length === 0 ? null : toServings(v[0]);
  if (typeof v === 'string') {
    const digits = v.match(/\d+/);
    return digits === null ? null : Number.parseInt(digits[0], 10);
  }
  if (typeof v === 'number') return Number.isFinite(v) ? Math.trunc(v) : null;
  return null;
}
|
|
|
|
/**
 * Looks for the first JSON-LD Recipe node embedded in an HTML document.
 *
 * Every `<script type="application/ld+json">` element is parsed as JSON,
 * flattened with `unwrapGraph`, and searched for a node whose `@type`
 * satisfies `isRecipeType`. Scripts that are empty or fail to parse are
 * skipped.
 *
 * @param html - The raw HTML source.
 * @returns The first Recipe node found, or `null` if there is none.
 */
function findRecipeNode(html: string): JsonLdNode | null {
  const { document } = parseHTML(html);
  for (const script of document.querySelectorAll('script[type="application/ld+json"]')) {
    const payload = script.textContent;
    if (!payload) continue;
    try {
      const recipe = unwrapGraph(JSON.parse(payload)).find((candidate) =>
        isRecipeType(candidate['@type'])
      );
      if (recipe) return recipe;
    } catch {
      // Malformed JSON-LD in this script; keep scanning the remaining ones.
    }
  }
  return null;
}
|
|
|
|
// Microdata alternative to JSON-LD: many server-rendered sites (including
// rezeptwelt.de) use <div itemtype="https://schema.org/Recipe"> instead of
// application/ld+json. A simple regex is enough here because only a yes/no
// flag is needed, not the data itself.
const MICRODATA_RECIPE = /itemtype\s*=\s*["']https?:\/\/schema\.org\/Recipe["']/i;

/**
 * Reports whether an HTML document carries recipe markup, either as
 * schema.org microdata or as a JSON-LD Recipe node.
 *
 * The regex test runs first; only when it does not match is the document
 * searched for JSON-LD. Any error raised by that search counts as
 * "no markup".
 *
 * @param html - The raw HTML source.
 * @returns `true` if recipe markup was detected.
 */
export function hasRecipeMarkup(html: string): boolean {
  let found = MICRODATA_RECIPE.test(html);
  if (!found) {
    try {
      found = findRecipeNode(html) !== null;
    } catch {
      found = false;
    }
  }
  return found;
}
|
|
|
|
/**
 * Legacy alias that forwards to `hasRecipeMarkup`.
 *
 * @deprecated Use `hasRecipeMarkup` instead.
 * @param html - The raw HTML source.
 * @returns Whatever `hasRecipeMarkup` returns for `html`.
 */
export function hasRecipeJsonLd(html: string): boolean {
  const detected = hasRecipeMarkup(html);
  return detected;
}
|
|
|
|
/**
 * Reads the microdata property value of an element as a trimmed string.
 *
 * An explicit `content` attribute wins on any element. Otherwise the value
 * comes from the attribute associated with the tag (`href`, `src`, `data`,
 * `value`, `datetime`), and from the text content for all remaining tags.
 * A `<time>` without `datetime` also falls back to its text content.
 *
 * @param el - The element carrying an `itemprop`.
 * @returns The trimmed value; an empty string when nothing is available.
 */
function microdataValueOf(el: Element): string {
  const attr = (name: string): string => (el.getAttribute(name) ?? '').trim();

  if (el.hasAttribute('content')) return attr('content');

  switch (el.tagName.toLowerCase()) {
    case 'meta':
      return attr('content');
    case 'a':
    case 'link':
    case 'area':
      return attr('href');
    case 'img':
    case 'source':
    case 'video':
    case 'audio':
    case 'embed':
    case 'iframe':
    case 'track':
      return attr('src');
    case 'object':
      return attr('data');
    case 'data':
    case 'meter':
      return attr('value');
    case 'time':
      return (el.getAttribute('datetime') ?? el.textContent ?? '').trim();
    default:
      return (el.textContent ?? '').trim();
  }
}
|
|
|
|
/** Microdata property name mapped to the elements that declare it. */
type MicroProps = Map<string, Element[]>;

/**
 * Collects the `itemprop` elements that belong to a microdata scope.
 *
 * All descendants of `scope` are visited depth-first. An element with
 * `itemprop` is recorded under each whitespace-separated name it lists. The
 * walk does not descend into elements that open their own `itemscope`;
 * otherwise nested values (e.g. the `text` of a HowToStep) would end up in
 * the outer scope. The `scope` element itself is never recorded.
 *
 * @param scope - The root element of the microdata item.
 * @returns A map from property name to its declaring elements.
 */
function gatherMicrodataProps(scope: Element): MicroProps {
  const collected: MicroProps = new Map();

  const visit = (parent: Element): void => {
    for (const node of Array.from(parent.children) as Element[]) {
      if (node.hasAttribute('itemprop')) {
        const tokens = (node.getAttribute('itemprop') ?? '').split(/\s+/);
        for (const token of tokens) {
          if (!token) continue;
          const bucket = collected.get(token);
          if (bucket) bucket.push(node);
          else collected.set(token, [node]);
        }
      }
      // A nested itemscope owns its descendants; leave them alone.
      if (!node.hasAttribute('itemscope')) visit(node);
    }
  };

  visit(scope);
  return collected;
}
|
|
|
|
/**
 * Returns the value of the first element recorded for a microdata property.
 *
 * @param map - Properties collected for the current scope.
 * @param name - The property name to look up.
 * @returns The value from `microdataValueOf`, or `null` when the property is
 *   absent or its value is empty.
 */
function microText(map: MicroProps, name: string): string | null {
  const first = map.get(name)?.[0];
  if (first === undefined) return null;
  return microdataValueOf(first) || null;
}
|
|
|
|
/**
 * Returns the values of every element recorded for a microdata property.
 *
 * @param map - Properties collected for the current scope.
 * @param name - The property name to look up.
 * @returns All non-empty values in recorded order; `[]` when the property is
 *   absent.
 */
function microAllTexts(map: MicroProps, name: string): string[] {
  const values: string[] = [];
  for (const el of map.get(name) ?? []) {
    const value = microdataValueOf(el);
    if (value !== '') values.push(value);
  }
  return values;
}
|
|
|
|
/**
 * Extracts the text of an element while keeping its line structure.
 *
 * `<br>` becomes a newline, and block-level elements (`<p>`, `<div>`, `<li>`,
 * headings, `<tr>`) are delimited by newlines without producing doubled
 * breaks. `<img>`, `<script>`, `<style>` and `<noscript>` are skipped
 * entirely so their content never reaches the recipe text.
 *
 * @param el - The element to read.
 * @returns The concatenated text; whitespace is not normalised here.
 */
function textWithLineBreaks(el: Element): string {
  const ignoredTags = ['script', 'style', 'img', 'noscript'];
  const blockTags = ['p', 'div', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'tr'];
  let text = '';

  // Starts a new line unless the text is empty or already ends with one.
  const breakLine = (): void => {
    if (text && !text.endsWith('\n')) text += '\n';
  };

  const visit = (node: Node): void => {
    switch (node.nodeType) {
      case 3: // text node
        text += node.nodeValue ?? '';
        return;
      case 1: // element node
        break;
      default:
        return;
    }

    const tag = (node as Element).tagName.toLowerCase();
    if (ignoredTags.includes(tag)) return;
    if (tag === 'br') {
      text += '\n';
      return;
    }

    const isBlock = blockTags.includes(tag);
    if (isBlock) breakLine();
    for (const child of Array.from(node.childNodes)) visit(child);
    if (isBlock) breakLine();
  };

  visit(el);
  return text;
}
|
|
|
|
/**
 * Splits extracted instruction text into individual steps.
 *
 * Rezeptwelt and other server-rendered sites often deliver a single
 * HowToStep block that internally bundles several steps as
 * "1. …<br>2. …<br>3. …".
 *
 * The text is split on newlines; each line has its whitespace collapsed and
 * blank lines are dropped. If at least two lines start with a number
 * followed by `.` or `)`, every numbered line opens a new step (with the
 * number stripped) and unnumbered lines are appended to the step in
 * progress. Otherwise all lines are joined into one step.
 *
 * @param raw - Text with newline-separated lines.
 * @returns The step texts; `[]` when `raw` holds no visible text.
 */
function splitStepText(raw: string): string[] {
  const numberedLine = /^(\d+)[.)]\s+(.+)$/;

  const lines: string[] = [];
  for (const piece of raw.split(/\n+/)) {
    const normalized = piece.replace(/\s+/g, ' ').trim();
    if (normalized) lines.push(normalized);
  }
  if (lines.length === 0) return [];

  const hits = lines.map((line) => numberedLine.exec(line));
  const numberedTotal = hits.reduce((total, hit) => total + (hit ? 1 : 0), 0);
  if (numberedTotal < 2) return [lines.join(' ')];

  // Several numbered lines: each starts its own step, and unnumbered
  // follow-up lines belong to the step before them.
  const steps: string[] = [];
  let pending = '';
  lines.forEach((line, index) => {
    const hit = hits[index];
    if (hit) {
      if (pending) steps.push(pending);
      pending = hit[2];
    } else {
      pending = pending ? pending + ' ' + line : line;
    }
  });
  if (pending) steps.push(pending);
  return steps;
}
|
|
|
|
/**
 * Extracts the step texts contained in one instruction element.
 *
 * The text is read from the first descendant that declares the `text`
 * property, or from `el` itself when there is none, and is then split with
 * `splitStepText`.
 *
 * `itemprop` is a whitespace-separated token list, so the lookup uses the
 * `~=` selector; an exact-match selector would miss elements such as
 * `itemprop="text description"`.
 *
 * @param el - A HowToStep-like element or any instruction container.
 * @returns The step texts found in the element.
 */
function stepsFromElement(el: Element): string[] {
  const textEl = el.querySelector('[itemprop~="text"]') ?? el;
  return splitStepText(textWithLineBreaks(textEl));
}
|
|
|
|
/**
 * Collects the preparation steps declared as microdata inside a recipe
 * scope.
 *
 * Each `recipeInstructions` element is handled according to its shape:
 * - a HowToSection: every `itemListElement` inside it is read as a step;
 * - a HowToStep, or any other element opening an `itemscope`: the element
 *   is read as a step;
 * - a plain container: each `<li>` is read as a step, or the container as a
 *   whole when it has no `<li>`.
 *
 * One element may yield several steps, because its text is passed through
 * `splitStepText`.
 *
 * `itemprop` is a whitespace-separated token list, so the selectors use
 * `~=` rather than exact matching; elements that list several property
 * names are therefore found too.
 *
 * @param scope - The recipe's microdata root element.
 * @returns Steps with 1-based, gap-free `position` values.
 */
function microSteps(scope: Element): Step[] {
  const out: Step[] = [];
  const add = (texts: string[]): void => {
    for (const text of texts) out.push({ position: out.length + 1, text });
  };

  const containers = Array.from(scope.querySelectorAll('[itemprop~="recipeInstructions"]'));
  for (const el of containers) {
    const itemtype = (el.getAttribute('itemtype') ?? '').toLowerCase();

    if (itemtype.includes('howtosection')) {
      // A HowToSection wraps its HowToStep children as itemListElement.
      for (const step of Array.from(el.querySelectorAll('[itemprop~="itemListElement"]'))) {
        add(stepsFromElement(step));
      }
    } else if (itemtype.includes('howtostep') || el.hasAttribute('itemscope')) {
      // A HowToStep, or some other unknown scope: still try to read its text.
      add(stepsFromElement(el));
    } else {
      const lis = Array.from(el.querySelectorAll('li'));
      const sources: Element[] = lis.length > 0 ? lis : [el];
      for (const source of sources) add(splitStepText(textWithLineBreaks(source)));
    }
  }
  return out;
}
|
|
|
|
/**
 * Builds a recipe from schema.org microdata embedded in an HTML document.
 *
 * The first element whose `itemtype` contains `schema.org/Recipe`
 * (case-insensitive) is used as the scope. Scalar fields take the first
 * value of their property; tags are the de-duplicated union of
 * `recipeCategory`, `recipeCuisine` and `keywords`.
 *
 * @param html - The raw HTML source.
 * @returns The extracted recipe, or `null` when parsing throws, no recipe
 *   scope exists, or the scope has no `name`.
 */
export function extractRecipeFromMicrodata(html: string): Recipe | null {
  let document: Document;
  try {
    ({ document } = parseHTML(html));
  } catch {
    return null;
  }

  const scope = document.querySelector('[itemtype*="schema.org/Recipe" i]');
  if (scope === null) return null;

  const props = gatherMicrodataProps(scope);
  const first = (name: string): string | null => microText(props, name);
  const minutes = (name: string) => parseIso8601Duration(first(name) ?? undefined);

  const title = first('name');
  if (!title) return null;

  const ingredients = microAllTexts(props, 'recipeIngredient')
    .map((raw, i) => parseIngredient(raw, i + 1))
    .filter((x): x is NonNullable<typeof x> => x !== null);

  const steps = microSteps(scope);
  const prepMinutes = minutes('prepTime');
  const cookMinutes = minutes('cookTime');
  const totalMinutes = minutes('totalTime');

  const tags = new Set<string>();
  for (const prop of ['recipeCategory', 'recipeCuisine', 'keywords']) {
    for (const tag of microAllTexts(props, prop)) tags.add(tag);
  }

  return {
    id: null,
    title,
    description: first('description'),
    source_url: first('url'),
    source_domain: null,
    image_path: first('image'),
    servings_default: toServings(first('recipeYield')),
    servings_unit: null,
    prep_time_min: prepMinutes,
    cook_time_min: cookMinutes,
    total_time_min: totalMinutes,
    cuisine: first('recipeCuisine'),
    category: first('recipeCategory'),
    ingredients,
    steps,
    tags: Array.from(tags)
  };
}
|
|
|
|
/**
 * Extracts a recipe from an HTML document.
 *
 * A JSON-LD Recipe node is preferred. When the document has none, the
 * function falls back to microdata, which rezeptwelt.de and other
 * server-rendered sites use instead of application/ld+json.
 *
 * @param html - The raw HTML source.
 * @returns The extracted recipe, or `null` when the JSON-LD recipe has no
 *   usable name or the microdata fallback finds nothing.
 */
export function extractRecipeFromHtml(html: string): Recipe | null {
  const node = findRecipeNode(html);
  if (node === null) return extractRecipeFromMicrodata(html);

  const title = toText(node.name);
  if (!title) return null;

  // Only string entries are parsed; positions follow the original array index.
  const rawIngredients: unknown[] = Array.isArray(node.recipeIngredient)
    ? node.recipeIngredient
    : [];
  const ingredients = rawIngredients
    .map((entry, index) => (typeof entry === 'string' ? parseIngredient(entry, index + 1) : null))
    .filter((x): x is NonNullable<typeof x> => x !== null);

  const steps = toSteps(node.recipeInstructions);
  const imageUrl = toImageUrl(node.image);

  const minutes = (value: unknown) =>
    parseIso8601Duration(typeof value === 'string' ? value : undefined);
  const prepMinutes = minutes(node.prepTime);
  const cookMinutes = minutes(node.cookTime);
  const totalMinutes = minutes(node.totalTime);

  const tags = new Set<string>();
  for (const source of [node.recipeCategory, node.recipeCuisine, node.keywords]) {
    for (const tag of toStringArray(source)) tags.add(tag);
  }

  return {
    id: null,
    title,
    description: toText(node.description),
    source_url: typeof node.url === 'string' ? node.url : null,
    source_domain: null,
    image_path: imageUrl,
    servings_default: toServings(node.recipeYield),
    servings_unit: null,
    prep_time_min: prepMinutes,
    cook_time_min: cookMinutes,
    total_time_min: totalMinutes,
    cuisine: toText(node.recipeCuisine),
    category: toText(node.recipeCategory),
    ingredients,
    steps,
    tags: Array.from(tags)
  };
}
|