All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m17s
Bisher scheiterte der Import auf Seiten wie rezeptwelt.de mit „Diese Seite enthält kein Rezept", obwohl unser Such-Filter die Treffer durchließ (Microdata wird seit dem vorherigen Commit erkannt). Jetzt kann der Importer die Daten auch tatsächlich extrahieren: - extractRecipeFromMicrodata(html): parst [itemtype=schema.org/Recipe]- Scopes per linkedom, sammelt itemprop-Werte unter Beachtung der verschachtelten itemscope-Grenzen (HowToStep-Texts landen nicht im Haupt-Scope). - Übernimmt Content-Attribute auf <meta>/<time> (z.B. prepTime="PT20M"), src auf <img>, textContent als Fallback — die Standard-Microdata- Value-Regeln. - Behandelt HowToStep-Items UND einfache <li>/<ol>-Listen als recipeInstructions. - extractRecipeFromHtml ruft JSON-LD zuerst, fällt nur bei null auf Microdata zurück — damit bleibt bestehendes Verhalten stabil. Tests: Königsberger-Klopse-Fixture mit HowToSteps, einfache ol/li- Variante und Priorität-JSON-LD-über-Microdata-Check. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
322 lines
9.9 KiB
TypeScript
322 lines
9.9 KiB
TypeScript
import { parseHTML } from 'linkedom';
|
|
import { parseIso8601Duration } from './iso8601-duration';
|
|
import { parseIngredient } from './ingredient';
|
|
import type { Recipe, Step } from '$lib/types';
|
|
|
|
/** A parsed JSON-LD object: string keys whose values must be narrowed before use. */
type JsonLdNode = Record<string, unknown>;
|
|
|
|
/**
 * Flattens a parsed JSON-LD payload into a plain list of node objects.
 *
 * Arrays are flattened recursively, an object with a truthy `@graph` is
 * replaced by the nodes of that graph, and any other object becomes a
 * single-element list. Primitives and `null` produce an empty list.
 *
 * @param node - Any value obtained from parsing a JSON-LD script.
 * @returns The node objects found, in encounter order.
 */
function unwrapGraph(node: unknown): JsonLdNode[] {
  if (Array.isArray(node)) {
    const collected: JsonLdNode[] = [];
    for (const entry of node) collected.push(...unwrapGraph(entry));
    return collected;
  }
  if (typeof node !== 'object' || node === null) return [];

  const record = node as JsonLdNode;
  const graph = record['@graph'];
  return graph ? unwrapGraph(graph) : [record];
}
|
|
|
|
/**
 * Tells whether a JSON-LD `@type` value denotes a Recipe.
 *
 * A string matches when it is exactly `Recipe` or ends with `/Recipe`
 * (e.g. a full schema.org URL). An array matches when any entry matches.
 * Every other value is rejected.
 */
function isRecipeType(t: unknown): boolean {
  if (Array.isArray(t)) {
    for (const entry of t) {
      if (isRecipeType(entry)) return true;
    }
    return false;
  }
  return typeof t === 'string' && (t === 'Recipe' || t.endsWith('/Recipe'));
}
|
|
|
|
/**
 * Reduces a JSON-LD value to a single trimmed, non-empty string.
 *
 * - A string is trimmed.
 * - An array is represented by its first element.
 * - An object contributes its `name`, or failing that its `text`.
 *
 * Blank candidates are never returned: a whitespace-only `name` falls through
 * to `text`, and the result is `null` when nothing usable remains. This keeps
 * the object branch consistent with the plain-string branch.
 *
 * @param v - Any JSON-LD value.
 * @returns The trimmed text, or `null` when no non-empty text is available.
 */
function toText(v: unknown): string | null {
  if (typeof v === 'string') return v.trim() || null;
  if (Array.isArray(v)) return v.length > 0 ? toText(v[0]) : null;
  if (v && typeof v === 'object') {
    const o = v as JsonLdNode;
    for (const key of ['name', 'text'] as const) {
      const candidate = o[key];
      if (typeof candidate !== 'string') continue;
      const trimmed = candidate.trim();
      if (trimmed) return trimmed;
    }
  }
  return null;
}
|
|
|
|
/**
 * Extracts an image URL from a JSON-LD `image` value.
 *
 * Non-empty arrays are represented by their first element (repeatedly, for
 * nested arrays). A string is returned as-is; an object yields its `url`
 * when that is a string.
 *
 * @returns The URL string, or `null` when none can be determined.
 */
function toImageUrl(v: unknown): string | null {
  let current: unknown = v;
  while (Array.isArray(current) && current.length > 0) current = current[0];

  if (typeof current === 'string') return current;
  if (current !== null && typeof current === 'object') {
    const { url } = current as JsonLdNode;
    if (typeof url === 'string') return url;
  }
  return null;
}
|
|
|
|
/**
 * Normalizes a JSON-LD value into a list of strings.
 *
 * A string is split on commas, with each piece trimmed and empty pieces
 * dropped. An array is mapped element-wise through `toText`, dropping
 * elements for which it returns `null`. Anything else gives an empty list.
 */
function toStringArray(v: unknown): string[] {
  if (typeof v === 'string') {
    const pieces: string[] = [];
    for (const piece of v.split(',')) {
      const cleaned = piece.trim();
      if (cleaned) pieces.push(cleaned);
    }
    return pieces;
  }
  if (!Array.isArray(v)) return [];

  const texts: string[] = [];
  for (const entry of v) {
    const text = toText(entry);
    if (text !== null) texts.push(text);
  }
  return texts;
}
|
|
|
|
/**
 * Converts a JSON-LD `recipeInstructions` value into numbered steps.
 *
 * Accepts plain strings, objects carrying a string `text`, `HowToSection`
 * objects (whose `itemListElement` is walked), and arbitrarily nested arrays
 * of these. Blank texts are skipped; positions start at 1 and follow the
 * order in which steps are encountered.
 */
function toSteps(v: unknown): Step[] {
  const steps: Step[] = [];

  const append = (raw: string): void => {
    const text = raw.trim();
    if (text) steps.push({ position: steps.length + 1, text });
  };

  const visit = (value: unknown): void => {
    if (Array.isArray(value)) {
      value.forEach((entry) => visit(entry));
      return;
    }
    if (typeof value === 'string') {
      append(value);
      return;
    }
    if (typeof value !== 'object' || value === null) return;

    const node = value as JsonLdNode;
    if (node['@type'] === 'HowToSection' && node.itemListElement) {
      visit(node.itemListElement);
      return;
    }
    // HowToStep and untyped objects are handled alike: only a string `text` counts.
    if (typeof node.text === 'string') append(node.text);
  };

  visit(v);
  return steps;
}
|
|
|
|
/**
 * Derives a serving count from a JSON-LD `recipeYield` value.
 *
 * Finite numbers are truncated toward zero. Strings yield their first run of
 * digits, parsed as a base-10 integer. A non-empty array is represented by
 * its first element.
 *
 * @returns The serving count, or `null` when none can be derived.
 */
function toServings(v: unknown): number | null {
  switch (typeof v) {
    case 'number':
      return Number.isFinite(v) ? Math.trunc(v) : null;
    case 'string': {
      const digits = v.match(/\d+/);
      return digits ? parseInt(digits[0], 10) : null;
    }
    default:
      return Array.isArray(v) && v.length > 0 ? toServings(v[0]) : null;
  }
}
|
|
|
|
/**
 * Finds the first JSON-LD node typed as a Recipe in an HTML document.
 *
 * Every `<script type="application/ld+json">` element is parsed in document
 * order. Scripts with empty or malformed content are skipped so that one
 * broken block does not hide a valid one further down.
 *
 * @param html - Raw HTML of the page.
 * @returns The first Recipe node, or `null` when none is found.
 */
function findRecipeNode(html: string): JsonLdNode | null {
  const { document } = parseHTML(html);

  for (const block of document.querySelectorAll('script[type="application/ld+json"]')) {
    const payload = block.textContent;
    if (!payload) continue;

    try {
      const match = unwrapGraph(JSON.parse(payload)).find((candidate) =>
        isRecipeType(candidate['@type'])
      );
      if (match) return match;
    } catch {
      // malformed JSON-LD, keep scanning
    }
  }
  return null;
}
|
|
|
|
// Microdata alternative to JSON-LD: many SSR sites (incl. rezeptwelt.de) use
// <div itemtype="https://schema.org/Recipe"> instead of application/ld+json.
// A plain regex is enough here because only a yes/no flag is needed, not the data.
// The pattern accepts any quoted itemtype value that contains "schema.org/Recipe",
// mirroring the `[itemtype*="schema.org/Recipe" i]` selector used for extraction,
// so values such as "https://www.schema.org/Recipe" or a space-separated list
// of several types are detected as well.
const MICRODATA_RECIPE = /itemtype\s*=\s*["'][^"']*schema\.org\/Recipe[^"']*["']/i;
|
|
|
|
/**
 * Reports whether a page carries recipe markup of either supported kind.
 *
 * The microdata regex is checked first. Only if it does not match is the
 * page searched for a JSON-LD Recipe node. Any error thrown during that
 * search counts as "no recipe markup".
 *
 * @param html - Raw HTML of the page.
 */
export function hasRecipeMarkup(html: string): boolean {
  let found = MICRODATA_RECIPE.test(html);
  if (!found) {
    try {
      found = findRecipeNode(html) !== null;
    } catch {
      found = false;
    }
  }
  return found;
}
|
|
|
|
/**
 * Alias that forwards to `hasRecipeMarkup` and returns its result unchanged.
 *
 * @deprecated use hasRecipeMarkup
 */
export function hasRecipeJsonLd(html: string): boolean {
  const detected = hasRecipeMarkup(html);
  return detected;
}
|
|
|
|
/**
 * Reads the microdata value of a single element as a trimmed string.
 *
 * A `content` attribute on any element takes precedence. Otherwise the value
 * comes from the attribute associated with the tag: `href` for a/link/area,
 * `src` for media and embed elements, `data` for object, `value` for
 * data/meter, and `datetime` (falling back to the text content) for time.
 * Every other element contributes its text content.
 *
 * @returns The trimmed value; an empty string when the source is missing.
 */
function microdataValueOf(el: Element): string {
  const read = (attribute: string): string => (el.getAttribute(attribute) ?? '').trim();

  if (el.hasAttribute('content')) return read('content');

  switch (el.tagName.toLowerCase()) {
    case 'meta':
      return read('content');
    case 'a':
    case 'link':
    case 'area':
      return read('href');
    case 'img':
    case 'source':
    case 'video':
    case 'audio':
    case 'embed':
    case 'iframe':
    case 'track':
      return read('src');
    case 'object':
      return read('data');
    case 'data':
    case 'meter':
      return read('value');
    case 'time':
      return (el.getAttribute('datetime') ?? el.textContent ?? '').trim();
    default:
      return (el.textContent ?? '').trim();
  }
}
|
|
|
|
/** Maps an itemprop name to the elements that carry that name. */
type MicroProps = Map<string, Element[]>;
|
|
|
|
/**
 * Collects the itemprop elements that belong to one microdata scope.
 *
 * Descendants are visited in document (pre-order) sequence. An element with
 * an `itemprop` attribute is registered under each whitespace-separated name
 * it lists. Elements that open their own `itemscope` are registered but not
 * descended into, so properties of nested items (e.g. the `text` of a
 * HowToStep) do not leak into the outer scope.
 *
 * @param scope - The element that opens the item scope.
 * @returns Property name to elements, each list in document order.
 */
function gatherMicrodataProps(scope: Element): MicroProps {
  const collected: MicroProps = new Map();

  // Explicit stack; children are pushed in reverse so they pop in document order.
  const pending: Element[] = (Array.from(scope.children) as Element[]).reverse();

  for (let current = pending.pop(); current !== undefined; current = pending.pop()) {
    if (current.hasAttribute('itemprop')) {
      const declared = current.getAttribute('itemprop') ?? '';
      for (const name of declared.split(/\s+/)) {
        if (!name) continue;
        const bucket = collected.get(name);
        if (bucket) bucket.push(current);
        else collected.set(name, [current]);
      }
    }

    if (current.hasAttribute('itemscope')) continue;

    const kids = Array.from(current.children) as Element[];
    for (let i = kids.length - 1; i >= 0; i--) pending.push(kids[i]);
  }

  return collected;
}
|
|
|
|
/**
 * Returns the value of the first element registered under a property name.
 *
 * @param map - Properties collected for one microdata scope.
 * @param name - The itemprop name to look up.
 * @returns The element's microdata value, or `null` when the property is
 *   absent or its value is empty.
 */
function microText(map: MicroProps, name: string): string | null {
  const first = map.get(name)?.[0];
  if (first === undefined) return null;

  const value = microdataValueOf(first);
  return value === '' ? null : value;
}
|
|
|
|
/**
 * Returns the non-empty values of every element registered under a property
 * name, in the order the elements were collected.
 *
 * @param map - Properties collected for one microdata scope.
 * @param name - The itemprop name to look up.
 */
function microAllTexts(map: MicroProps, name: string): string[] {
  const values: string[] = [];
  for (const el of map.get(name) ?? []) {
    const value = microdataValueOf(el);
    if (value !== '') values.push(value);
  }
  return values;
}
|
|
|
|
/**
 * Extracts the preparation steps from a microdata Recipe scope.
 *
 * Each element carrying the `recipeInstructions` property is handled by shape:
 * - a nested item (`itemscope`, e.g. a HowToStep) contributes the text of its
 *   `text` property, or its whole text content when that is missing or blank;
 * - a plain container with `<li>` descendants contributes one step per
 *   non-blank list item;
 * - any other element contributes its text content as a single step.
 *
 * Property names are matched as whitespace-separated tokens (`~=`), so an
 * element declaring several itemprop names at once is still recognized.
 *
 * @param scope - The element that opens the Recipe item scope.
 * @returns Steps numbered from 1 in document order; blank texts are skipped.
 */
function microSteps(scope: Element): Step[] {
  const steps: Step[] = [];

  const textOf = (el: Element | null): string => (el?.textContent ?? '').trim();
  const push = (text: string): void => {
    if (text) steps.push({ position: steps.length + 1, text });
  };

  const holders = Array.from(scope.querySelectorAll('[itemprop~="recipeInstructions"]'));
  for (const el of holders) {
    if (el.hasAttribute('itemscope')) {
      // `||` instead of `??`: an existing but empty text element must not
      // suppress the fallback to the item's own text content.
      push(textOf(el.querySelector('[itemprop~="text"]')) || textOf(el));
      continue;
    }

    const items = Array.from(el.querySelectorAll('li'));
    if (items.length === 0) {
      push(textOf(el));
      continue;
    }
    for (const li of items) push(textOf(li));
  }

  return steps;
}
|
|
|
|
/**
 * Builds a recipe from schema.org Recipe microdata embedded in an HTML page.
 *
 * The first element whose `itemtype` contains "schema.org/Recipe"
 * (case-insensitive) is taken as the item scope. Its properties are gathered
 * without descending into nested item scopes, then mapped onto the recipe
 * fields. Durations are passed through `parseIso8601Duration`, ingredient
 * lines through `parseIngredient`.
 *
 * Tags are the de-duplicated union of category, cuisine and keywords. A
 * `keywords` value is split on commas, matching how the JSON-LD path treats a
 * keywords string, so `content="a, b"` yields two tags instead of one.
 *
 * @param html - Raw HTML of the page.
 * @returns The extracted recipe, or `null` when parsing the HTML throws, no
 *   Recipe scope exists, or the scope has no non-empty `name`.
 */
export function extractRecipeFromMicrodata(html: string): Recipe | null {
  let document: Document;
  try {
    ({ document } = parseHTML(html));
  } catch {
    return null;
  }

  const scope = document.querySelector('[itemtype*="schema.org/Recipe" i]');
  if (!scope) return null;
  const props = gatherMicrodataProps(scope);

  // A recipe without a title is treated as unusable.
  const title = microText(props, 'name');
  if (!title) return null;

  const ingredients = microAllTexts(props, 'recipeIngredient')
    .map((raw, i) => parseIngredient(raw, i + 1))
    .filter((x): x is NonNullable<typeof x> => x !== null);

  const steps = microSteps(scope);
  const prep = parseIso8601Duration(microText(props, 'prepTime') ?? undefined);
  const cook = parseIso8601Duration(microText(props, 'cookTime') ?? undefined);
  const total = parseIso8601Duration(microText(props, 'totalTime') ?? undefined);

  // Keywords usually arrive as one comma-separated value; split each value so
  // every keyword becomes its own tag.
  const keywords = microAllTexts(props, 'keywords').flatMap((value) => toStringArray(value));

  const tags = new Set<string>([
    ...microAllTexts(props, 'recipeCategory'),
    ...microAllTexts(props, 'recipeCuisine'),
    ...keywords
  ]);

  return {
    id: null,
    title,
    description: microText(props, 'description'),
    source_url: microText(props, 'url'),
    source_domain: null,
    image_path: microText(props, 'image'),
    servings_default: toServings(microText(props, 'recipeYield')),
    servings_unit: null,
    prep_time_min: prep,
    cook_time_min: cook,
    total_time_min: total,
    cuisine: microText(props, 'recipeCuisine'),
    category: microText(props, 'recipeCategory'),
    ingredients,
    steps,
    tags: [...tags]
  };
}
|
|
|
|
/**
 * Extracts a recipe from an HTML page, preferring JSON-LD over microdata.
 *
 * When the page has a JSON-LD Recipe node, the recipe is built from it alone.
 * Microdata extraction is attempted only when no such node exists.
 *
 * @param html - Raw HTML of the page.
 * @returns The extracted recipe, or `null` when the JSON-LD node has no usable
 *   name or, on the fallback path, microdata extraction yields nothing.
 */
export function extractRecipeFromHtml(html: string): Recipe | null {
  const node = findRecipeNode(html);

  // Fall back to microdata: rezeptwelt.de and other SSR sites use it
  // instead of application/ld+json.
  if (node === null) return extractRecipeFromMicrodata(html);

  const title = toText(node.name);
  if (!title) return null;

  // Only string entries are parsed; positions reflect the original index.
  const rawIngredients: unknown[] = Array.isArray(node.recipeIngredient)
    ? node.recipeIngredient
    : [];
  const ingredients = rawIngredients
    .map((entry, index) => (typeof entry === 'string' ? parseIngredient(entry, index + 1) : null))
    .filter((entry): entry is NonNullable<typeof entry> => entry !== null);

  // Durations are only parsed when given as strings.
  const minutes = (value: unknown) =>
    parseIso8601Duration(typeof value === 'string' ? value : undefined);

  const tagSources = [node.recipeCategory, node.recipeCuisine, node.keywords];
  const tags = [...new Set<string>(tagSources.flatMap((source) => toStringArray(source)))];

  return {
    id: null,
    title,
    description: toText(node.description),
    source_url: typeof node.url === 'string' ? node.url : null,
    source_domain: null,
    image_path: toImageUrl(node.image),
    servings_default: toServings(node.recipeYield),
    servings_unit: null,
    prep_time_min: minutes(node.prepTime),
    cook_time_min: minutes(node.cookTime),
    total_time_min: minutes(node.totalTime),
    cuisine: toText(node.recipeCuisine),
    category: toText(node.recipeCategory),
    ingredients,
    steps: toSteps(node.recipeInstructions),
    tags
  };
}
|