feat(importer): Microdata-Fallback für Seiten ohne JSON-LD
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m17s
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m17s
Bisher scheiterte der Import auf Seiten wie rezeptwelt.de mit „Diese Seite enthält kein Rezept“, obwohl unser Such-Filter die Treffer durchließ (Microdata wird seit dem vorherigen Commit erkannt). Jetzt kann der Importer die Daten auch tatsächlich extrahieren: - extractRecipeFromMicrodata(html): parst [itemtype=schema.org/Recipe]-Scopes per linkedom und sammelt itemprop-Werte unter Beachtung der verschachtelten itemscope-Grenzen (HowToStep-Texte landen nicht im Haupt-Scope). - Übernimmt content-Attribute auf <meta>/<time> (z.B. <meta itemprop="prepTime" content="PT20M">), src auf <img>, textContent als Fallback — die Standard-Microdata-Value-Regeln. - Behandelt HowToStep-Items UND einfache <ol>/<li>-Listen als recipeInstructions. - extractRecipeFromHtml versucht zuerst JSON-LD und fällt nur bei null auf Microdata zurück — damit bleibt bestehendes Verhalten stabil. Tests: Königsberger-Klopse-Fixture mit HowToSteps, einfache ol/li-Variante und Priorität-JSON-LD-über-Microdata-Check. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -125,9 +125,152 @@ export function hasRecipeJsonLd(html: string): boolean {
|
|||||||
return hasRecipeMarkup(html);
|
return hasRecipeMarkup(html);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Attribute that carries the microdata value for tags whose value is not
 * their text. `time` is handled separately because it falls back to text.
 */
const MICRODATA_VALUE_ATTRIBUTE: ReadonlyMap<string, string> = new Map([
  ['meta', 'content'],
  ['a', 'href'],
  ['link', 'href'],
  ['area', 'href'],
  ['img', 'src'],
  ['source', 'src'],
  ['video', 'src'],
  ['audio', 'src'],
  ['embed', 'src'],
  ['iframe', 'src'],
  ['track', 'src'],
  ['object', 'data'],
  ['data', 'value'],
  ['meter', 'value']
]);

/**
 * Returns the trimmed microdata value of a single `itemprop` element.
 *
 * Precedence: an explicit `content` attribute on any element wins; otherwise
 * the tag-specific attribute (href/src/data/value) is used; `<time>` prefers
 * `datetime` and falls back to its text; everything else yields its text.
 *
 * @param el Element carrying the property.
 * @returns The trimmed value, or `''` when the relevant attribute/text is missing.
 */
function microdataValueOf(el: Element): string {
  const trimmedAttr = (attrName: string): string =>
    (el.getAttribute(attrName) ?? '').trim();

  if (el.hasAttribute('content')) {
    return trimmedAttr('content');
  }

  const tagName = el.tagName.toLowerCase();
  const valueAttr = MICRODATA_VALUE_ATTRIBUTE.get(tagName);
  if (valueAttr !== undefined) {
    return trimmedAttr(valueAttr);
  }

  if (tagName === 'time') {
    return (el.getAttribute('datetime') ?? el.textContent ?? '').trim();
  }

  return (el.textContent ?? '').trim();
}
|
||||||
|
|
||||||
|
/** Property name → elements declaring that `itemprop`, in document order. */
type MicroProps = Map<string, Element[]>;

/**
 * Collects the `itemprop` elements that belong directly to `scope`.
 *
 * Descendants are visited in document order, but the walk never enters a
 * nested `itemscope`: the nested element itself is recorded (if it has an
 * `itemprop`), its own properties are not — otherwise e.g. HowToStep.text
 * would end up in the main scope.
 *
 * @param scope Root element of the item; its own attributes are not inspected.
 * @returns Map from each whitespace-separated itemprop token to its elements.
 */
function gatherMicrodataProps(scope: Element): MicroProps {
  const collected: MicroProps = new Map();

  // Explicit stack instead of recursion; children are pushed in reverse so
  // that pop() hands them back in document (pre-order) sequence.
  const pending: Element[] = Array.from(scope.children).reverse();

  while (pending.length > 0) {
    const current = pending.pop();
    if (current === undefined) break;

    const propAttr = current.getAttribute('itemprop');
    if (propAttr !== null) {
      for (const token of propAttr.split(/\s+/)) {
        if (token === '') continue;
        const bucket = collected.get(token);
        if (bucket === undefined) {
          collected.set(token, [current]);
        } else {
          bucket.push(current);
        }
      }
    }

    if (current.hasAttribute('itemscope')) continue;

    for (const descendant of Array.from(current.children).reverse()) {
      pending.push(descendant);
    }
  }

  return collected;
}
|
||||||
|
|
||||||
|
/**
 * Returns the first non-empty value of the property `name`, or `null`.
 *
 * Empty occurrences are skipped rather than ending the lookup, so an empty
 * placeholder (e.g. `<meta itemprop="name" content="">`) does not hide a
 * later element that actually carries the value. This also keeps the result
 * consistent with `microAllTexts`, which drops empty values as well.
 *
 * @param map  Properties gathered for one item.
 * @param name Property name to look up.
 * @returns The trimmed value, or `null` when the property is absent or all
 *          of its occurrences are empty.
 */
function microText(map: MicroProps, name: string): string | null {
  for (const el of map.get(name) ?? []) {
    const value = microdataValueOf(el);
    if (value !== '') return value;
  }
  return null;
}
|
||||||
|
|
||||||
|
/**
 * Returns every non-empty value of the property `name`, in document order.
 *
 * @param map  Properties gathered for one item.
 * @param name Property name to look up.
 * @returns Trimmed values; an empty array when the property is absent.
 */
function microAllTexts(map: MicroProps, name: string): string[] {
  const values: string[] = [];
  for (const el of map.get(name) ?? []) {
    const value = microdataValueOf(el);
    if (value !== '') {
      values.push(value);
    }
  }
  return values;
}
|
||||||
|
|
||||||
|
/**
 * Extracts the preparation steps of a Recipe item from its
 * `recipeInstructions` properties.
 *
 * Supported shapes:
 * - scoped items (HowToStep, or HowToSection/ItemList wrapping several
 *   steps): every nested `itemprop="text"` becomes one step, with the item's
 *   whole text as fallback when it has no `text` property at all;
 * - plain containers: each top-level `<li>` becomes one step, otherwise the
 *   container's whole text is a single step.
 *
 * Differences to a naive `querySelectorAll` lookup:
 * - `itemprop` is matched per whitespace-separated token, so
 *   `itemprop="recipeInstructions step"` is recognised;
 * - instructions inside a foreign nested `itemscope` (e.g. a related recipe)
 *   are not attributed to this recipe, mirroring how properties are gathered;
 * - nested `<li>` elements are not emitted a second time (their text is
 *   already part of the enclosing `<li>`);
 * - a `content` attribute on a `text` element (e.g. `<meta>`) is honoured.
 *
 * @param scope Root element of the Recipe item.
 * @returns Steps in document order with 1-based, gap-free positions.
 */
function microSteps(scope: Element): Step[] {
  const hasPropToken = (el: Element, token: string): boolean =>
    (el.getAttribute('itemprop') ?? '').split(/\s+/).includes(token);

  // Pre-order walk below `root`: collects the topmost elements matching
  // `accept` (never looking inside a match) and only enters other elements
  // when `mayEnter` allows it.
  const collect = (
    root: Element,
    accept: (el: Element) => boolean,
    mayEnter: (el: Element) => boolean
  ): Element[] => {
    const found: Element[] = [];
    const visit = (parent: Element): void => {
      for (const child of Array.from(parent.children)) {
        if (accept(child)) found.push(child);
        else if (mayEnter(child)) visit(child);
      }
    };
    visit(root);
    return found;
  };

  const textOf = (el: Element): string => (el.textContent ?? '').trim();
  const always = (): boolean => true;

  const instructionNodes = collect(
    scope,
    (el) => hasPropToken(el, 'recipeInstructions'),
    // Properties inside a nested item belong to that item, not to the recipe.
    (el) => !el.hasAttribute('itemscope')
  );

  const texts: string[] = [];
  for (const node of instructionNodes) {
    if (node.hasAttribute('itemscope')) {
      // Nested items are entered on purpose: HowToSection → HowToStep → text.
      const textEls = collect(node, (el) => hasPropToken(el, 'text'), always);
      if (textEls.length > 0) {
        for (const textEl of textEls) {
          texts.push((textEl.getAttribute('content') ?? textEl.textContent ?? '').trim());
        }
      } else {
        texts.push(textOf(node));
      }
    } else {
      const items = collect(node, (el) => el.tagName.toLowerCase() === 'li', always);
      if (items.length > 0) {
        for (const item of items) texts.push(textOf(item));
      } else {
        texts.push(textOf(node));
      }
    }
  }

  return texts
    .filter((text) => text !== '')
    .map((text, index) => ({ position: index + 1, text }));
}
|
||||||
|
|
||||||
|
/**
 * Builds a Recipe from schema.org Microdata markup.
 *
 * Uses the first element whose `itemtype` contains `schema.org/Recipe`
 * (case-insensitive, http and https alike) and reads its own properties;
 * properties of nested items are not mixed in.
 *
 * An `image` property that is itself a nested item (typically an
 * ImageObject) is resolved through that item's `url` / `contentUrl`
 * property; only if neither yields a value does the plain value of the
 * `image` element apply.
 *
 * @param html Raw HTML of the page.
 * @returns The extracted recipe (with `id` and `source_domain` set to
 *          `null`), or `null` when parsing throws, no Recipe scope exists,
 *          or the scope has no non-empty `name`.
 */
export function extractRecipeFromMicrodata(html: string): Recipe | null {
  let document: Document;
  try {
    ({ document } = parseHTML(html));
  } catch {
    // Unparseable markup is treated like "no recipe on this page".
    return null;
  }

  const scope = document.querySelector('[itemtype*="schema.org/Recipe" i]');
  if (!scope) return null;

  const props = gatherMicrodataProps(scope);

  const title = microText(props, 'name');
  if (!title) return null;

  const ingredients = microAllTexts(props, 'recipeIngredient')
    .map((raw, i) => parseIngredient(raw, i + 1))
    .filter((x): x is NonNullable<typeof x> => x !== null);

  const steps = microSteps(scope);

  const prep = parseIso8601Duration(microText(props, 'prepTime') ?? undefined);
  const cook = parseIso8601Duration(microText(props, 'cookTime') ?? undefined);
  const total = parseIso8601Duration(microText(props, 'totalTime') ?? undefined);

  // The Set removes values that appear under more than one of the properties.
  const tags = new Set<string>([
    ...microAllTexts(props, 'recipeCategory'),
    ...microAllTexts(props, 'recipeCuisine'),
    ...microAllTexts(props, 'keywords')
  ]);

  // <div itemprop="image" itemscope itemtype=".../ImageObject"> has no usable
  // text of its own; the address lives in its nested url/contentUrl property.
  const imageEl = props.get('image')?.[0];
  let imagePath: string | null = null;
  if (imageEl?.hasAttribute('itemscope')) {
    const imageProps = gatherMicrodataProps(imageEl);
    imagePath = microText(imageProps, 'url') ?? microText(imageProps, 'contentUrl');
  }
  imagePath = imagePath ?? microText(props, 'image');

  return {
    id: null,
    title,
    description: microText(props, 'description'),
    source_url: microText(props, 'url'),
    source_domain: null,
    image_path: imagePath,
    servings_default: toServings(microText(props, 'recipeYield')),
    servings_unit: null,
    prep_time_min: prep,
    cook_time_min: cook,
    total_time_min: total,
    cuisine: microText(props, 'recipeCuisine'),
    category: microText(props, 'recipeCategory'),
    ingredients,
    steps,
    tags: [...tags]
  };
}
|
||||||
|
|
||||||
export function extractRecipeFromHtml(html: string): Recipe | null {
|
export function extractRecipeFromHtml(html: string): Recipe | null {
|
||||||
const node = findRecipeNode(html);
|
const node = findRecipeNode(html);
|
||||||
if (!node) return null;
|
if (!node) {
|
||||||
|
// Fallback auf Microdata — rezeptwelt.de & andere SSR-Sites nutzen das
|
||||||
|
// anstatt application/ld+json.
|
||||||
|
return extractRecipeFromMicrodata(html);
|
||||||
|
}
|
||||||
|
|
||||||
const title = toText(node.name) ?? '';
|
const title = toText(node.name) ?? '';
|
||||||
if (!title) return null;
|
if (!title) return null;
|
||||||
|
|||||||
@@ -46,6 +46,83 @@ describe('extractRecipeFromHtml', () => {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Covers the Microdata path of extractRecipeFromHtml: pages without a JSON-LD
// recipe, and the precedence rule when both markups are present.
describe('extractRecipeFromHtml — Microdata fallback', () => {
  it('extracts title, ingredients and HowToStep instructions', () => {
    // Fixture without any JSON-LD script: scalar properties via text, <img src>
    // and <meta content>; each instruction is a nested HowToStep item whose
    // wording sits in an inner itemprop="text".
    const html = `<!doctype html><html><body>
      <article itemscope itemtype="https://schema.org/Recipe">
        <h1 itemprop="name">Königsberger Klopse</h1>
        <img itemprop="image" src="/img/klopse.jpg" />
        <p itemprop="description">Klassische Königsberger Klopse.</p>
        <meta itemprop="prepTime" content="PT20M" />
        <meta itemprop="cookTime" content="PT25M" />
        <span itemprop="recipeYield">4</span>
        <span itemprop="recipeCuisine">Ostpreußisch</span>
        <ul>
          <li itemprop="recipeIngredient">500 g Hackfleisch gemischt</li>
          <li itemprop="recipeIngredient">1 Zwiebel, fein gewürfelt</li>
          <li itemprop="recipeIngredient">2 EL Kapern</li>
        </ul>
        <ol>
          <li itemprop="recipeInstructions" itemscope itemtype="https://schema.org/HowToStep">
            <span itemprop="text">Hackfleisch und Zwiebel vermengen.</span>
          </li>
          <li itemprop="recipeInstructions" itemscope itemtype="https://schema.org/HowToStep">
            <span itemprop="text">Klopse formen und in Salzwasser garen.</span>
          </li>
        </ol>
      </article>
    </body></html>`;
    const r = extractRecipeFromHtml(html);
    expect(r).not.toBeNull();
    expect(r!.title).toBe('Königsberger Klopse');
    expect(r!.ingredients.length).toBe(3);
    expect(r!.ingredients[0].raw_text).toContain('Hackfleisch');
    expect(r!.steps.length).toBe(2);
    expect(r!.steps[1].text).toContain('Klopse formen');
    // ISO 8601 durations PT20M / PT25M are expected as minutes.
    expect(r!.prep_time_min).toBe(20);
    expect(r!.cook_time_min).toBe(25);
    expect(r!.servings_default).toBe(4);
    expect(r!.cuisine).toBe('Ostpreußisch');
    // The image value comes from the src attribute, unchanged (still relative).
    expect(r!.image_path).toBe('/img/klopse.jpg');
  });

  it('handles plain-text recipeInstructions without HowToStep', () => {
    // A single unscoped recipeInstructions container wrapping an <ol>; each
    // <li> is expected to become its own step. Uses the http:// itemtype form.
    const html = `<html><body>
      <div itemscope itemtype="http://schema.org/Recipe">
        <span itemprop="name">Test</span>
        <span itemprop="recipeIngredient">1 Apfel</span>
        <div itemprop="recipeInstructions">
          <ol>
            <li>Schälen.</li>
            <li>Essen.</li>
          </ol>
        </div>
      </div>
    </body></html>`;
    const r = extractRecipeFromHtml(html);
    expect(r).not.toBeNull();
    expect(r!.steps.length).toBe(2);
    expect(r!.steps[0].text).toBe('Schälen.');
  });

  it('prefers JSON-LD when both are present', () => {
    // Both markups describe a recipe with different names; the JSON-LD name
    // must win, i.e. Microdata is only consulted as a fallback.
    const html = `<html><head>
      <script type="application/ld+json">${JSON.stringify({
        '@type': 'Recipe',
        name: 'From JSON-LD',
        recipeIngredient: ['x'],
        recipeInstructions: ['y']
      })}</script>
      </head><body>
      <div itemscope itemtype="https://schema.org/Recipe">
        <span itemprop="name">From Microdata</span>
      </div>
    </body></html>`;
    const r = extractRecipeFromHtml(html);
    expect(r?.title).toBe('From JSON-LD');
  });
});
|
||||||
|
|
||||||
describe('hasRecipeMarkup', () => {
|
describe('hasRecipeMarkup', () => {
|
||||||
it('detects JSON-LD Recipe', () => {
|
it('detects JSON-LD Recipe', () => {
|
||||||
const html = `<html><head>
|
const html = `<html><head>
|
||||||
|
|||||||
Reference in New Issue
Block a user