Files
kochwas/src/lib/server/parsers/json-ld-recipe.ts
hsiegeln aad3ad689d
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m17s
feat(importer): Microdata-Fallback für Seiten ohne JSON-LD
Bisher scheiterte der Import auf Seiten wie rezeptwelt.de mit „Diese Seite
enthält kein Rezept", obwohl unser Such-Filter die Treffer durchließ
(Microdata wird seit dem vorherigen Commit erkannt). Jetzt kann der
Importer die Daten auch tatsächlich extrahieren:

- extractRecipeFromMicrodata(html): parst [itemtype=schema.org/Recipe]-
  Scopes per linkedom, sammelt itemprop-Werte unter Beachtung der
  verschachtelten itemscope-Grenzen (HowToStep-Texts landen nicht im
  Haupt-Scope).
- Übernimmt Content-Attribute auf <meta>/<time> (z.B. prepTime="PT20M"),
  src auf <img>, textContent als Fallback — die Standard-Microdata-
  Value-Regeln.
- Behandelt HowToStep-Items UND einfache <li>/<ol>-Listen als
  recipeInstructions.
- extractRecipeFromHtml ruft JSON-LD zuerst, fällt nur bei null auf
  Microdata zurück — damit bleibt bestehendes Verhalten stabil.

Tests: Königsberger-Klopse-Fixture mit HowToSteps, einfache ol/li-
Variante und Priorität-JSON-LD-über-Microdata-Check.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 08:52:00 +02:00

322 lines
9.9 KiB
TypeScript

import { parseHTML } from 'linkedom';
import { parseIso8601Duration } from './iso8601-duration';
import { parseIngredient } from './ingredient';
import type { Recipe, Step } from '$lib/types';
/** Loosely typed JSON-LD node: arbitrary string keys with unvalidated values. */
type JsonLdNode = Record<string, unknown>;

/**
 * Flattens a parsed JSON-LD value into a flat list of candidate nodes.
 *
 * Arrays are flattened recursively, an object carrying a truthy `@graph` is
 * replaced by the flattened content of that graph, and any other object is
 * returned as a single node. Primitives and `null` yield an empty list.
 *
 * @param node - Any value, typically the result of `JSON.parse`.
 * @returns The object nodes found, in encounter order.
 */
function unwrapGraph(node: unknown): JsonLdNode[] {
  if (Array.isArray(node)) {
    const collected: JsonLdNode[] = [];
    for (const entry of node) {
      for (const inner of unwrapGraph(entry)) collected.push(inner);
    }
    return collected;
  }
  if (typeof node !== 'object' || node === null) return [];

  const candidate = node as JsonLdNode;
  const graph = candidate['@graph'];
  return graph ? unwrapGraph(graph) : [candidate];
}
/**
 * Tells whether a JSON-LD `@type` value designates a recipe.
 *
 * A string matches when it is exactly `Recipe` or ends in `/Recipe`; an array
 * matches when any of its entries (recursively) matches. Every other value is
 * rejected.
 */
function isRecipeType(t: unknown): boolean {
  if (Array.isArray(t)) {
    for (const entry of t) {
      if (isRecipeType(entry)) return true;
    }
    return false;
  }
  return typeof t === 'string' && (t === 'Recipe' || t.endsWith('/Recipe'));
}
/**
 * Extracts a human-readable string from a loosely typed JSON-LD value.
 *
 * - Strings are trimmed; a blank string counts as "no text".
 * - Arrays yield the first element that produces text.
 * - Objects yield their first non-blank string among `name` and `text`.
 *
 * Blank results are always reported as `null`, never as `''`, so callers can
 * rely on a single "missing" representation regardless of the input shape.
 *
 * @param v - Any JSON-LD value.
 * @returns The trimmed text, or `null` when none is available.
 */
function toText(v: unknown): string | null {
  if (typeof v === 'string') return v.trim() || null;

  if (Array.isArray(v)) {
    for (const entry of v) {
      const text = toText(entry);
      if (text !== null) return text;
    }
    return null;
  }

  if (v && typeof v === 'object') {
    const node = v as Record<string, unknown>;
    for (const key of ['name', 'text'] as const) {
      const candidate = node[key];
      if (typeof candidate === 'string') {
        const trimmed = candidate.trim();
        // A blank `name` must not mask a usable `text`.
        if (trimmed) return trimmed;
      }
    }
  }
  return null;
}
/**
 * Resolves the `image` property of a JSON-LD node to a single URL string.
 *
 * - Strings are trimmed; a blank string counts as "no image".
 * - Arrays yield the first element that resolves to a URL.
 * - Objects yield their first non-blank string among `url` and `contentUrl`
 *   (both are used by schema.org `ImageObject`).
 *
 * @param v - Any JSON-LD value.
 * @returns The URL, or `null` when none can be determined.
 */
function toImageUrl(v: unknown): string | null {
  if (typeof v === 'string') return v.trim() || null;

  if (Array.isArray(v)) {
    for (const entry of v) {
      const url = toImageUrl(entry);
      if (url !== null) return url;
    }
    return null;
  }

  if (v && typeof v === 'object') {
    const node = v as Record<string, unknown>;
    for (const key of ['url', 'contentUrl'] as const) {
      const candidate = node[key];
      if (typeof candidate === 'string' && candidate.trim()) return candidate.trim();
    }
  }
  return null;
}
/**
 * Normalises a JSON-LD value into a list of strings.
 *
 * A string is split on commas, with each piece trimmed and empty pieces
 * dropped. An array is mapped through `toText`, discarding entries for which
 * it returns `null`. Any other value produces an empty list.
 */
function toStringArray(v: unknown): string[] {
  if (typeof v === 'string') {
    const pieces: string[] = [];
    for (const piece of v.split(',')) {
      const cleaned = piece.trim();
      if (cleaned) pieces.push(cleaned);
    }
    return pieces;
  }
  if (!Array.isArray(v)) return [];

  const texts: string[] = [];
  for (const entry of v) {
    const text = toText(entry);
    if (text !== null) texts.push(text);
  }
  return texts;
}
/**
 * Converts a JSON-LD `recipeInstructions` value into an ordered list of steps.
 *
 * Accepted shapes (arbitrarily nested):
 * - a string: one step, if non-blank;
 * - an array: each element is processed in order;
 * - a `HowToSection` with `itemListElement`: its children are processed and the
 *   section itself contributes no step; `@type` may be a string or an array of
 *   strings, mirroring how recipe types are matched elsewhere in this file;
 * - any object with a non-blank string `text`: one step;
 * - any other object that has an `itemListElement` (for example a `HowToStep`
 *   made of `HowToDirection` items): its children are processed instead of
 *   being dropped silently.
 *
 * @param v - The raw `recipeInstructions` value.
 * @returns Steps numbered consecutively starting at 1.
 */
function toSteps(v: unknown): Step[] {
  const steps: Step[] = [];

  const hasType = (node: Record<string, unknown>, wanted: string): boolean => {
    const declared = node['@type'];
    return Array.isArray(declared) ? declared.includes(wanted) : declared === wanted;
  };

  const addStep = (raw: string): void => {
    const text = raw.trim();
    if (text) steps.push({ position: steps.length + 1, text });
  };

  const walk = (x: unknown): void => {
    if (Array.isArray(x)) {
      for (const item of x) walk(item);
      return;
    }
    if (typeof x === 'string') {
      addStep(x);
      return;
    }
    if (!x || typeof x !== 'object') return;

    const node = x as Record<string, unknown>;
    const children = node.itemListElement;

    // Sections are pure containers: descend, never emit the section itself.
    if (children && hasType(node, 'HowToSection')) {
      walk(children);
      return;
    }
    if (typeof node.text === 'string' && node.text.trim()) {
      addStep(node.text);
      return;
    }
    // No usable text of its own: fall back to nested list items, if any.
    if (children) walk(children);
  };

  walk(v);
  return steps;
}
/**
 * Derives a serving count from a JSON-LD `recipeYield`-style value.
 *
 * - Finite numbers are truncated towards zero.
 * - Strings contribute their first run of decimal digits, parsed base 10.
 * - Non-empty arrays are resolved through their first element only.
 *
 * @returns The count, or `null` when nothing numeric can be extracted.
 */
function toServings(v: unknown): number | null {
  if (Array.isArray(v)) return v.length > 0 ? toServings(v[0]) : null;

  switch (typeof v) {
    case 'number':
      return Number.isFinite(v) ? Math.trunc(v) : null;
    case 'string': {
      const digits = v.match(/(\d+)/);
      return digits ? parseInt(digits[1], 10) : null;
    }
    default:
      return null;
  }
}
/**
 * Locates the first JSON-LD node typed as a recipe inside an HTML document.
 *
 * Every `<script type="application/ld+json">` element is examined in document
 * order. Its text is parsed as JSON, flattened with `unwrapGraph`, and the
 * first node whose `@type` satisfies `isRecipeType` is returned. Scripts that
 * are empty or fail to parse are skipped.
 *
 * @param html - Raw HTML source.
 * @returns The recipe node, or `null` if no script contains one.
 */
function findRecipeNode(html: string): JsonLdNode | null {
  const { document } = parseHTML(html);
  const ldScripts = document.querySelectorAll('script[type="application/ld+json"]');

  for (const ldScript of ldScripts) {
    const payload = ldScript.textContent;
    if (!payload) continue;

    try {
      const recipe = unwrapGraph(JSON.parse(payload)).find((candidate) =>
        isRecipeType(candidate['@type'])
      );
      if (recipe) return recipe;
    } catch {
      // Malformed JSON-LD in this script; keep scanning the remaining ones.
    }
  }
  return null;
}
// Microdata alternative to JSON-LD: many server-rendered sites (including
// rezeptwelt.de) use <div itemtype="https://schema.org/Recipe"> instead of
// application/ld+json. A regex is enough here because only the flag is needed,
// not the data.
//
// The pattern accepts double-quoted, single-quoted and unquoted attribute
// values (minifiers commonly strip the quotes) and also finds the Recipe URL
// inside a space-separated list of types. The trailing lookahead prevents
// longer type names such as ".../RecipeFoo" from matching.
const MICRODATA_RECIPE =
  /itemtype\s*=\s*(?:"[^"]*?|'[^']*?|[^\s"'>]*?)\bschema\.org\/Recipe(?=[\s"'>]|$)/i;

/**
 * Cheap check whether an HTML document carries recipe markup of any supported
 * kind: a microdata `itemtype` naming schema.org's Recipe, or a JSON-LD node
 * typed as a recipe.
 *
 * @param html - Raw HTML source.
 * @returns `true` when recipe markup is detected; `false` otherwise, including
 *   when the JSON-LD scan throws.
 */
export function hasRecipeMarkup(html: string): boolean {
  if (MICRODATA_RECIPE.test(html)) return true;
  try {
    return findRecipeNode(html) !== null;
  } catch {
    // Detection is best-effort: a parser failure means "no markup found".
    return false;
  }
}
/**
 * Legacy entry point kept for existing callers; it forwards the HTML to
 * `hasRecipeMarkup` and returns that result unchanged.
 *
 * @deprecated use hasRecipeMarkup
 */
export function hasRecipeJsonLd(html: string): boolean {
return hasRecipeMarkup(html);
}
/**
 * Reads the microdata value of an element carrying `itemprop`.
 *
 * A `content` attribute on any element wins. Otherwise the value comes from the
 * tag-specific attribute (`href`, `src`, `data`, `value`, `datetime`) and, for
 * all remaining tags, from the text content. `<time>` falls back to its text
 * content when `datetime` is absent. The result is always trimmed; a missing
 * attribute yields `''`.
 */
function microdataValueOf(el: Element): string {
  const attr = (name: string): string => (el.getAttribute(name) ?? '').trim();

  if (el.hasAttribute('content')) return attr('content');

  switch (el.tagName.toLowerCase()) {
    case 'meta':
      return attr('content');
    case 'a':
    case 'link':
    case 'area':
      return attr('href');
    case 'img':
    case 'source':
    case 'video':
    case 'audio':
    case 'embed':
    case 'iframe':
    case 'track':
      return attr('src');
    case 'object':
      return attr('data');
    case 'data':
    case 'meter':
      return attr('value');
    case 'time':
      return (el.getAttribute('datetime') ?? el.textContent ?? '').trim();
    default:
      return (el.textContent ?? '').trim();
  }
}
/** Property name mapped to the elements declaring it, in document order. */
type MicroProps = Map<string, Element[]>;

/**
 * Collects the `itemprop` elements that belong to a microdata scope.
 *
 * All descendants of `scope` are visited in document order. An element with an
 * `itemprop` attribute is registered under each whitespace-separated name it
 * lists. The traversal does not descend into elements that open their own
 * `itemscope`, so properties of nested items (for example the `text` of a
 * HowToStep) do not end up in the outer scope. The `scope` element itself is
 * never registered.
 *
 * @param scope - Root element of the item.
 * @returns The collected properties.
 */
function gatherMicrodataProps(scope: Element): MicroProps {
  const collected: MicroProps = new Map();

  // Explicit stack instead of recursion; children are pushed in reverse so
  // that popping visits them in document order.
  const pending: Element[] = (Array.from(scope.children) as Element[]).reverse();

  for (let node = pending.pop(); node !== undefined; node = pending.pop()) {
    if (node.hasAttribute('itemprop')) {
      const propNames = (node.getAttribute('itemprop') ?? '').split(/\s+/).filter(Boolean);
      for (const propName of propNames) {
        const bucket = collected.get(propName);
        if (bucket) bucket.push(node);
        else collected.set(propName, [node]);
      }
    }
    if (node.hasAttribute('itemscope')) continue;
    pending.push(...(Array.from(node.children) as Element[]).reverse());
  }
  return collected;
}
/**
 * Returns the microdata value of the first element registered under `name`.
 *
 * @param map - Properties gathered for a scope.
 * @param name - Property name to look up.
 * @returns The value, or `null` when the property is absent or its value is
 *   the empty string.
 */
function microText(map: MicroProps, name: string): string | null {
  const matches = map.get(name) ?? [];
  if (matches.length === 0) return null;
  const value = microdataValueOf(matches[0]);
  return value === '' ? null : value;
}
/**
 * Returns the microdata values of every element registered under `name`, in
 * the order they were gathered, leaving out empty values.
 *
 * @param map - Properties gathered for a scope.
 * @param name - Property name to look up.
 */
function microAllTexts(map: MicroProps, name: string): string[] {
  const values: string[] = [];
  for (const el of map.get(name) ?? []) {
    const value = microdataValueOf(el);
    if (value !== '') values.push(value);
  }
  return values;
}
/**
 * Extracts preparation steps from the `recipeInstructions` elements of a
 * microdata recipe scope.
 *
 * Each matching element is handled according to its shape:
 * - with `itemscope` (e.g. a HowToStep item): the value of its first `text`
 *   property is used, read through `microdataValueOf` so that
 *   `<meta itemprop="text" content="…">` works; if that value is empty or the
 *   property is missing, the element's own text content is used instead;
 * - without `itemscope` but containing `<li>` elements: one step per non-blank
 *   list item;
 * - otherwise: the element's text content as a single step.
 *
 * `itemprop` is matched as a whitespace-separated token list (`~=`), in line
 * with how `gatherMicrodataProps` splits the attribute, so an element declaring
 * several property names is still found.
 *
 * @param scope - Root element of the recipe item.
 * @returns Steps numbered consecutively starting at 1.
 */
function microSteps(scope: Element): Step[] {
  const steps: Step[] = [];
  const addStep = (raw: string | null | undefined): void => {
    const text = (raw ?? '').trim();
    if (text) steps.push({ position: steps.length + 1, text });
  };

  const holders = Array.from(scope.querySelectorAll('[itemprop~="recipeInstructions"]'));
  for (const holder of holders) {
    if (holder.hasAttribute('itemscope')) {
      const textEl = holder.querySelector('[itemprop~="text"]');
      const declared = textEl ? microdataValueOf(textEl) : '';
      // An empty `text` property must not suppress the step entirely.
      addStep(declared || holder.textContent);
      continue;
    }

    const listItems = Array.from(holder.querySelectorAll('li'));
    if (listItems.length === 0) {
      addStep(holder.textContent);
      continue;
    }
    for (const item of listItems) addStep(item.textContent);
  }
  return steps;
}
/**
 * Builds a recipe from schema.org microdata embedded in an HTML document.
 *
 * The first element whose `itemtype` contains `schema.org/Recipe`
 * (case-insensitive) is taken as the recipe scope. Its properties are gathered
 * without entering nested item scopes, then mapped onto the `Recipe` shape.
 *
 * Tags are the de-duplicated union of `recipeCategory`, `recipeCuisine` and
 * `keywords`. Each value is split on commas through `toStringArray`, exactly as
 * the JSON-LD path in this file does for string values, so
 * `keywords="vegan, quick"` produces two tags instead of one.
 *
 * @param html - Raw HTML source.
 * @returns The recipe, or `null` when the HTML cannot be parsed, no recipe
 *   scope exists, or the scope has no non-empty `name`.
 */
export function extractRecipeFromMicrodata(html: string): Recipe | null {
  let document: Document;
  try {
    ({ document } = parseHTML(html));
  } catch {
    return null;
  }

  const scope = document.querySelector('[itemtype*="schema.org/Recipe" i]');
  if (!scope) return null;

  const props = gatherMicrodataProps(scope);
  const title = microText(props, 'name');
  if (!title) return null;

  const ingredients = microAllTexts(props, 'recipeIngredient')
    .map((raw, i) => parseIngredient(raw, i + 1))
    .filter((x): x is NonNullable<typeof x> => x !== null);
  const steps = microSteps(scope);

  const prep = parseIso8601Duration(microText(props, 'prepTime') ?? undefined);
  const cook = parseIso8601Duration(microText(props, 'cookTime') ?? undefined);
  const total = parseIso8601Duration(microText(props, 'totalTime') ?? undefined);

  // Comma-split every value so tags match what the JSON-LD path produces.
  const tags = new Set<string>(
    ['recipeCategory', 'recipeCuisine', 'keywords'].flatMap((prop) =>
      microAllTexts(props, prop).flatMap((value) => toStringArray(value))
    )
  );

  return {
    id: null,
    title,
    description: microText(props, 'description'),
    source_url: microText(props, 'url'),
    source_domain: null,
    image_path: microText(props, 'image'),
    servings_default: toServings(microText(props, 'recipeYield')),
    servings_unit: null,
    prep_time_min: prep,
    cook_time_min: cook,
    total_time_min: total,
    cuisine: microText(props, 'recipeCuisine'),
    category: microText(props, 'recipeCategory'),
    ingredients,
    steps,
    tags: [...tags]
  };
}
/**
 * Extracts a recipe from an HTML document, preferring JSON-LD over microdata.
 *
 * When no JSON-LD recipe node is found, the result of
 * `extractRecipeFromMicrodata` is returned. When a JSON-LD node is found it is
 * authoritative: a node without a usable `name` yields `null` and microdata is
 * not consulted.
 *
 * @param html - Raw HTML source.
 * @returns The recipe, or `null` when none can be extracted.
 */
export function extractRecipeFromHtml(html: string): Recipe | null {
  const node = findRecipeNode(html);
  if (node === null) {
    // Fall back to microdata: rezeptwelt.de and other server-rendered sites
    // use it instead of application/ld+json.
    return extractRecipeFromMicrodata(html);
  }

  const title = toText(node.name) ?? '';
  if (title === '') return null;

  // Non-string entries are skipped; positions follow the original 1-based index.
  const rawIngredients: unknown[] = Array.isArray(node.recipeIngredient)
    ? node.recipeIngredient
    : [];
  const ingredients = rawIngredients
    .map((entry, index) => (typeof entry === 'string' ? parseIngredient(entry, index + 1) : null))
    .filter((parsed): parsed is NonNullable<typeof parsed> => parsed !== null);

  const steps = toSteps(node.recipeInstructions);
  const imageUrl = toImageUrl(node.image);

  // Durations are only parsed when given as strings.
  const minutes = (value: unknown) =>
    parseIso8601Duration(typeof value === 'string' ? value : undefined);
  const prep = minutes(node.prepTime);
  const cook = minutes(node.cookTime);
  const total = minutes(node.totalTime);

  const tags = new Set<string>();
  for (const source of [node.recipeCategory, node.recipeCuisine, node.keywords]) {
    for (const tag of toStringArray(source)) tags.add(tag);
  }

  return {
    id: null,
    title,
    description: toText(node.description),
    source_url: typeof node.url === 'string' ? node.url : null,
    source_domain: null,
    image_path: imageUrl,
    servings_default: toServings(node.recipeYield),
    servings_unit: null,
    prep_time_min: prep,
    cook_time_min: cook,
    total_time_min: total,
    cuisine: toText(node.recipeCuisine),
    category: toText(node.recipeCategory),
    ingredients,
    steps,
    tags: Array.from(tags)
  };
}