feat(importer): Microdata-Fallback für Seiten ohne JSON-LD
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m17s
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m17s
Bisher scheiterte der Import auf Seiten wie rezeptwelt.de mit „Diese Seite enthält kein Rezept“, obwohl unser Such-Filter die Treffer durchließ (Microdata wird seit dem vorherigen Commit erkannt). Jetzt kann der Importer die Daten auch tatsächlich extrahieren:
- extractRecipeFromMicrodata(html): parst [itemtype=schema.org/Recipe]-Scopes per linkedom, sammelt itemprop-Werte unter Beachtung der verschachtelten itemscope-Grenzen (HowToStep-Texts landen nicht im Haupt-Scope).
- Übernimmt Content-Attribute auf <meta>/<time> (z.B. prepTime="PT20M"), src auf <img>, textContent als Fallback — die Standard-Microdata-Value-Regeln.
- Behandelt HowToStep-Items UND einfache <li>/<ol>-Listen als recipeInstructions.
- extractRecipeFromHtml ruft JSON-LD zuerst, fällt nur bei null auf Microdata zurück — damit bleibt bestehendes Verhalten stabil.
Tests: Königsberger-Klopse-Fixture mit HowToSteps, einfache ol/li-Variante und Priorität-JSON-LD-über-Microdata-Check.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -125,9 +125,152 @@ export function hasRecipeJsonLd(html: string): boolean {
|
||||
return hasRecipeMarkup(html);
|
||||
}
|
||||
|
||||
/**
 * Returns the string value of an element that carries an `itemprop`.
 *
 * Lookup order:
 *  1. a `content` attribute on any element (lenient; many sites put it on
 *     non-`<meta>` elements),
 *  2. the tag-specific attribute: `href` (a/link/area), `src` (img/source/
 *     video/audio/embed/iframe/track), `data` (object), `value` (data/meter),
 *     `datetime` (time, falling back to its text),
 *  3. the element's text content.
 *
 * Attribute values are trimmed only. Text content is additionally
 * whitespace-normalised: runs of whitespace (newlines and indentation coming
 * from the HTML source formatting) are collapsed to a single space so that
 * e.g. "200 g\n      Mehl" becomes "200 g Mehl".
 *
 * @param el Element holding the property.
 * @returns The value, or an empty string when nothing usable is present.
 */
function microdataValueOf(el: Element): string {
  const attr = (name: string): string => (el.getAttribute(name) ?? '').trim();
  // textContent keeps source formatting; collapse it like a renderer would.
  const text = (): string => (el.textContent ?? '').replace(/\s+/g, ' ').trim();

  if (el.hasAttribute('content')) return attr('content');

  switch (el.tagName.toLowerCase()) {
    case 'meta':
      // Only reached without a `content` attribute, so this yields ''.
      return attr('content');
    case 'a':
    case 'link':
    case 'area':
      return attr('href');
    case 'img':
    case 'source':
    case 'video':
    case 'audio':
    case 'embed':
    case 'iframe':
    case 'track':
      return attr('src');
    case 'object':
      return attr('data');
    case 'data':
    case 'meter':
      return attr('value');
    case 'time': {
      // A present `datetime` attribute wins even when it is empty.
      const datetime = el.getAttribute('datetime');
      return datetime == null ? text() : datetime.trim();
    }
    default:
      return text();
  }
}
|
||||
|
||||
/** Property name mapped to the elements that declare it, in document order. */
type MicroProps = Map<string, Element[]>;

/**
 * Collects every `itemprop` descendant of `scope`, keyed by property name.
 *
 * An element whose `itemprop` lists several whitespace-separated names is
 * registered under each of them. The walk does not descend into elements that
 * carry `itemscope`: such an element is still registered for its own
 * `itemprop`, but its descendants belong to the nested item (otherwise e.g.
 * HowToStep.text would end up in the main scope).
 *
 * @param scope Root element of the item; the root itself is not registered.
 * @returns Map from property name to matching elements.
 */
function gatherMicrodataProps(scope: Element): MicroProps {
  const collected: MicroProps = new Map();

  const register = (node: Element): void => {
    const rawNames = node.getAttribute('itemprop') ?? '';
    for (const propName of rawNames.split(/\s+/)) {
      if (!propName) continue;
      const bucket = collected.get(propName);
      if (bucket) {
        bucket.push(node);
      } else {
        collected.set(propName, [node]);
      }
    }
  };

  const descend = (parent: Element): void => {
    (Array.from(parent.children) as Element[]).forEach((node) => {
      if (node.hasAttribute('itemprop')) register(node);
      // A nested itemscope owns its descendants; stop here.
      if (node.hasAttribute('itemscope')) return;
      descend(node);
    });
  };

  descend(scope);
  return collected;
}
|
||||
|
||||
/**
 * Reads a single-valued property from a collected property map.
 *
 * Only the first element registered under `name` is consulted.
 *
 * @param map Property map for one item.
 * @param name Property name to look up.
 * @returns The first element's value, or null when the property is absent or
 *          that value is empty.
 */
function microText(map: MicroProps, name: string): string | null {
  const [first] = map.get(name) ?? [];
  if (!first) return null;
  const value = microdataValueOf(first);
  return value === '' ? null : value;
}
|
||||
|
||||
/**
 * Reads a multi-valued property from a collected property map.
 *
 * @param map Property map for one item.
 * @param name Property name to look up.
 * @returns The non-empty values of all elements registered under `name`, in
 *          their stored order; an empty array when the property is absent.
 */
function microAllTexts(map: MicroProps, name: string): string[] {
  const values: string[] = [];
  for (const el of map.get(name) ?? []) {
    const value = microdataValueOf(el);
    if (value !== '') values.push(value);
  }
  return values;
}
|
||||
|
||||
/**
 * Extracts the preparation steps from a recipe scope.
 *
 * Every element whose `itemprop` token list contains `recipeInstructions`
 * is handled according to its shape:
 *  - with `itemscope` (e.g. HowToStep, or a HowToSection wrapping several
 *    steps): one step per nested element whose `itemprop` contains `text`;
 *    when none of them yields text, the element's whole text is used,
 *  - without `itemscope` but containing `<li>` elements: one step per `<li>`,
 *  - otherwise: the element's text as a single step.
 *
 * The `~=` selectors match a token inside a space-separated `itemprop`
 * value, so elements declaring several property names are found as well.
 * Texts are trimmed, empty ones are dropped, and positions are numbered
 * consecutively starting at 1 across all matched elements.
 *
 * @param scope Root element of the recipe item.
 * @returns Steps in document order.
 */
function microSteps(scope: Element): Step[] {
  const clean = (node: { textContent: string | null }): string =>
    (node.textContent ?? '').trim();

  const texts: string[] = [];
  const instructionEls = Array.from(
    scope.querySelectorAll('[itemprop~="recipeInstructions"]')
  );

  for (const el of instructionEls) {
    if (el.hasAttribute('itemscope')) {
      // Take every nested text property, not just the first one, so a
      // section containing several steps keeps all of them.
      const nested = Array.from(el.querySelectorAll('[itemprop~="text"]'))
        .map((node) => clean(node))
        .filter((t) => t !== '');
      if (nested.length > 0) {
        texts.push(...nested);
      } else {
        texts.push(clean(el));
      }
      continue;
    }

    const items = Array.from(el.querySelectorAll('li'));
    if (items.length > 0) {
      texts.push(...items.map((li) => clean(li)));
    } else {
      texts.push(clean(el));
    }
  }

  return texts
    .filter((t) => t !== '')
    .map((text, index) => ({ position: index + 1, text }));
}
|
||||
|
||||
/**
 * Builds a Recipe from schema.org/Recipe microdata embedded in an HTML page.
 *
 * The first element whose `itemtype` contains "schema.org/Recipe"
 * (case-insensitive) is used as the recipe scope.
 *
 * @param html Raw HTML of the page.
 * @returns The extracted recipe, or null when parsing throws, no recipe
 *          scope exists, or the scope has no non-empty `name`.
 */
export function extractRecipeFromMicrodata(html: string): Recipe | null {
  let doc: Document;
  try {
    doc = parseHTML(html).document;
  } catch {
    return null;
  }

  const recipeScope = doc.querySelector('[itemtype*="schema.org/Recipe" i]');
  if (!recipeScope) return null;

  const props = gatherMicrodataProps(recipeScope);
  const single = (prop: string): string | null => microText(props, prop);
  const minutes = (prop: string) =>
    parseIso8601Duration(single(prop) ?? undefined);

  const title = single('name');
  if (!title) return null;

  // Ingredient positions are 1-based indexes into the non-empty raw texts.
  const ingredients = microAllTexts(props, 'recipeIngredient')
    .map((rawText, index) => parseIngredient(rawText, index + 1))
    .filter((parsed): parsed is NonNullable<typeof parsed> => parsed !== null);

  const steps = microSteps(recipeScope);
  const prepMinutes = minutes('prepTime');
  const cookMinutes = minutes('cookTime');
  const totalMinutes = minutes('totalTime');

  // Category, cuisine and keywords together form the de-duplicated tag list.
  const tagSet = new Set<string>();
  for (const prop of ['recipeCategory', 'recipeCuisine', 'keywords']) {
    for (const value of microAllTexts(props, prop)) {
      tagSet.add(value);
    }
  }

  return {
    id: null,
    title,
    description: single('description'),
    source_url: single('url'),
    source_domain: null,
    image_path: single('image'),
    servings_default: toServings(single('recipeYield')),
    servings_unit: null,
    prep_time_min: prepMinutes,
    cook_time_min: cookMinutes,
    total_time_min: totalMinutes,
    cuisine: single('recipeCuisine'),
    category: single('recipeCategory'),
    ingredients,
    steps,
    tags: Array.from(tagSet)
  };
}
|
||||
|
||||
export function extractRecipeFromHtml(html: string): Recipe | null {
|
||||
const node = findRecipeNode(html);
|
||||
if (!node) return null;
|
||||
if (!node) {
|
||||
// Fallback auf Microdata — rezeptwelt.de & andere SSR-Sites nutzen das
|
||||
// anstatt application/ld+json.
|
||||
return extractRecipeFromMicrodata(html);
|
||||
}
|
||||
|
||||
const title = toText(node.name) ?? '';
|
||||
if (!title) return null;
|
||||
|
||||
Reference in New Issue
Block a user