feat(importer): Microdata-Fallback für Seiten ohne JSON-LD
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m17s

Bisher scheiterte der Import auf Seiten wie rezeptwelt.de mit „Diese Seite
enthält kein Rezept", obwohl unser Such-Filter die Treffer durchließ
(Microdata wird seit dem vorherigen Commit erkannt). Jetzt kann der
Importer die Daten auch tatsächlich extrahieren:

- extractRecipeFromMicrodata(html): parst den ersten
  [itemtype*=schema.org/Recipe]-Scope per linkedom und sammelt dessen
  itemprop-Werte unter Beachtung verschachtelter itemscope-Grenzen
  (HowToStep-Texte landen nicht im Haupt-Scope).
- Übernimmt das content-Attribut, sofern vorhanden (z.B.
  <meta itemprop="prepTime" content="PT20M">), sonst datetime auf <time>,
  href auf <a>/<link>, src auf <img> und textContent als Fallback —
  angelehnt an die Standard-Microdata-Value-Regeln.
- Behandelt HowToStep-Items UND einfache <li>/<ol>-Listen als
  recipeInstructions.
- extractRecipeFromHtml ruft JSON-LD zuerst, fällt nur bei null auf
  Microdata zurück — damit bleibt bestehendes Verhalten stabil.

Tests: Königsberger-Klopse-Fixture mit HowToSteps, einfache ol/li-
Variante und Priorität-JSON-LD-über-Microdata-Check.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
hsiegeln
2026-04-18 08:52:00 +02:00
parent ab2acb6437
commit aad3ad689d
2 changed files with 221 additions and 1 deletions

View File

@@ -125,9 +125,152 @@ export function hasRecipeJsonLd(html: string): boolean {
return hasRecipeMarkup(html);
}
/**
 * Resolves the microdata value of a single element as a trimmed string.
 *
 * An explicit `content` attribute takes precedence on any element. Otherwise
 * the value comes from the attribute associated with the tag (`href`, `src`,
 * `data`, `value`, `datetime`), and finally from the element's text content.
 *
 * @param el Element carrying an `itemprop`.
 * @returns The trimmed value; an empty string when nothing usable is present.
 */
function microdataValueOf(el: Element): string {
  // Missing attributes read as the empty string.
  const attr = (name: string): string => (el.getAttribute(name) ?? '').trim();

  if (el.hasAttribute('content')) return attr('content');

  switch (el.tagName.toLowerCase()) {
    case 'meta':
      return attr('content');
    case 'a':
    case 'link':
    case 'area':
      return attr('href');
    case 'img':
    case 'source':
    case 'video':
    case 'audio':
    case 'embed':
    case 'iframe':
    case 'track':
      return attr('src');
    case 'object':
      return attr('data');
    case 'data':
    case 'meter':
      return attr('value');
    case 'time':
      // <time> falls back to its visible text when `datetime` is absent.
      return (el.getAttribute('datetime') ?? el.textContent ?? '').trim();
    default:
      return (el.textContent ?? '').trim();
  }
}
/** Property name -> elements declaring that property, in document order. */
type MicroProps = Map<string, Element[]>;

/**
 * Collects the `itemprop` elements that belong to `scope`.
 *
 * Descendants are visited in document order. An element that carries
 * `itemscope` is still recorded under its own `itemprop` names, but its
 * subtree is not entered, so properties of nested items (for example the
 * `text` of a HowToStep) do not end up in the outer scope.
 *
 * @param scope Root element of the item; the root itself is not recorded.
 * @returns Map from each whitespace-separated `itemprop` token to its elements.
 */
function gatherMicrodataProps(scope: Element): MicroProps {
  const collected: MicroProps = new Map();

  // Explicit stack instead of recursion; children are pushed in reverse so
  // that popping yields them in document (pre-order) sequence.
  const pending: Element[] = [];
  const enqueueChildren = (parent: Element): void => {
    const kids = Array.from(parent.children) as Element[];
    for (let i = kids.length - 1; i >= 0; i--) pending.push(kids[i]);
  };

  enqueueChildren(scope);
  while (pending.length > 0) {
    const node = pending.pop() as Element;

    if (node.hasAttribute('itemprop')) {
      const tokens = (node.getAttribute('itemprop') ?? '').split(/\s+/);
      for (const token of tokens) {
        if (!token) continue;
        const bucket = collected.get(token);
        if (bucket) bucket.push(node);
        else collected.set(token, [node]);
      }
    }

    // A nested item owns its own properties: stop at the scope boundary.
    if (node.hasAttribute('itemscope')) continue;
    enqueueChildren(node);
  }

  return collected;
}
/**
 * Returns the value of the first element recorded for `name`.
 *
 * @param map Properties gathered for one item.
 * @param name Property name to look up.
 * @returns The value, or `null` when the property is absent or its first
 *   element resolves to an empty string.
 */
function microText(map: MicroProps, name: string): string | null {
  const matches = map.get(name);
  if (matches === undefined || matches.length === 0) return null;

  const value = microdataValueOf(matches[0]);
  return value === '' ? null : value;
}
/**
 * Returns the values of all elements recorded for `name`, in recorded order.
 *
 * @param map Properties gathered for one item.
 * @param name Property name to look up.
 * @returns Every non-empty value; an empty array when the property is absent.
 */
function microAllTexts(map: MicroProps, name: string): string[] {
  const resolved = (map.get(name) ?? []).map(microdataValueOf);
  const values: string[] = [];
  for (const value of resolved) {
    if (value !== '') values.push(value);
  }
  return values;
}
/**
 * Extracts the recipe steps from every `recipeInstructions` element below
 * `scope`.
 *
 * Each instruction element is handled by shape:
 * - an item (has `itemscope`, e.g. HowToStep or HowToSection) yields one step
 *   per `text` descendant, or its whole text when it has none;
 * - a plain container yields one step per `<li>`, or its whole text when it
 *   contains no list items.
 *
 * `itemprop` is matched as a whitespace-separated token list (`~=`), so an
 * element such as `itemprop="recipeInstructions step"` is recognised too.
 * Blank texts are skipped.
 *
 * @param scope Element of the Recipe item.
 * @returns Steps in document order with 1-based, gap-free positions.
 */
function microSteps(scope: Element): Step[] {
  const out: Step[] = [];
  const add = (raw: string | null): void => {
    const text = (raw ?? '').trim();
    // Positions only advance for steps that are actually emitted.
    if (text) out.push({ position: out.length + 1, text });
  };

  const instructions = Array.from(
    scope.querySelectorAll('[itemprop~="recipeInstructions"]')
  );
  for (const el of instructions) {
    const partSelector = el.hasAttribute('itemscope') ? '[itemprop~="text"]' : 'li';
    const parts = Array.from(el.querySelectorAll(partSelector));
    if (parts.length === 0) {
      // No finer-grained structure: the element itself is the step.
      add(el.textContent);
      continue;
    }
    // All parts are used, not just the first, so a HowToSection that wraps
    // several HowToSteps keeps every step.
    for (const part of parts) add(part.textContent);
  }
  return out;
}
/**
 * Builds a Recipe from schema.org Microdata markup in `html`.
 *
 * Only the first element whose `itemtype` contains `schema.org/Recipe`
 * (case-insensitive) is considered.
 *
 * @param html Raw HTML of the page.
 * @returns The extracted recipe, or `null` when parsing throws, no Recipe
 *   item is found, or the item has no non-empty `name`.
 */
export function extractRecipeFromMicrodata(html: string): Recipe | null {
  let doc: Document;
  try {
    doc = parseHTML(html).document;
  } catch {
    return null;
  }

  const recipeScope = doc.querySelector('[itemtype*="schema.org/Recipe" i]');
  if (recipeScope === null) return null;

  const props = gatherMicrodataProps(recipeScope);
  const first = (prop: string): string | null => microText(props, prop);
  const minutes = (prop: string) => parseIso8601Duration(first(prop) ?? undefined);

  const title = first('name');
  if (!title) return null;

  // The ingredient index is its 1-based position among all raw texts, counted
  // before unparseable entries are dropped.
  const ingredients: NonNullable<ReturnType<typeof parseIngredient>>[] = [];
  microAllTexts(props, 'recipeIngredient').forEach((raw, idx) => {
    const parsed = parseIngredient(raw, idx + 1);
    if (parsed !== null) ingredients.push(parsed);
  });

  const steps = microSteps(recipeScope);

  const prepMinutes = minutes('prepTime');
  const cookMinutes = minutes('cookTime');
  const totalMinutes = minutes('totalTime');

  // Tags are the de-duplicated union of category, cuisine and keywords,
  // in that order.
  const tagSet = new Set<string>();
  for (const prop of ['recipeCategory', 'recipeCuisine', 'keywords']) {
    for (const value of microAllTexts(props, prop)) tagSet.add(value);
  }

  return {
    id: null,
    title,
    description: first('description'),
    source_url: first('url'),
    source_domain: null,
    image_path: first('image'),
    servings_default: toServings(first('recipeYield')),
    servings_unit: null,
    prep_time_min: prepMinutes,
    cook_time_min: cookMinutes,
    total_time_min: totalMinutes,
    cuisine: first('recipeCuisine'),
    category: first('recipeCategory'),
    ingredients,
    steps,
    tags: Array.from(tagSet)
  };
}
export function extractRecipeFromHtml(html: string): Recipe | null {
const node = findRecipeNode(html);
if (!node) return null;
if (!node) {
// Fallback auf Microdata — rezeptwelt.de & andere SSR-Sites nutzen das
// anstatt application/ld+json.
return extractRecipeFromMicrodata(html);
}
const title = toText(node.name) ?? '';
if (!title) return null;

View File

@@ -46,6 +46,83 @@ describe('extractRecipeFromHtml', () => {
});
});
// Covers the Microdata path of extractRecipeFromHtml: pages that mark up the
// recipe with itemscope/itemprop attributes instead of a JSON-LD script.
describe('extractRecipeFromHtml — Microdata fallback', () => {
// Full fixture: scalar properties come from <meta content>, <img src> and
// element text; each instruction is a nested HowToStep item with a `text`.
it('extracts title, ingredients and HowToStep instructions', () => {
const html = `<!doctype html><html><body>
<article itemscope itemtype="https://schema.org/Recipe">
<h1 itemprop="name">Königsberger Klopse</h1>
<img itemprop="image" src="/img/klopse.jpg" />
<p itemprop="description">Klassische Königsberger Klopse.</p>
<meta itemprop="prepTime" content="PT20M" />
<meta itemprop="cookTime" content="PT25M" />
<span itemprop="recipeYield">4</span>
<span itemprop="recipeCuisine">Ostpreußisch</span>
<ul>
<li itemprop="recipeIngredient">500 g Hackfleisch gemischt</li>
<li itemprop="recipeIngredient">1 Zwiebel, fein gewürfelt</li>
<li itemprop="recipeIngredient">2 EL Kapern</li>
</ul>
<ol>
<li itemprop="recipeInstructions" itemscope itemtype="https://schema.org/HowToStep">
<span itemprop="text">Hackfleisch und Zwiebel vermengen.</span>
</li>
<li itemprop="recipeInstructions" itemscope itemtype="https://schema.org/HowToStep">
<span itemprop="text">Klopse formen und in Salzwasser garen.</span>
</li>
</ol>
</article>
</body></html>`;
const r = extractRecipeFromHtml(html);
expect(r).not.toBeNull();
expect(r!.title).toBe('Königsberger Klopse');
expect(r!.ingredients.length).toBe(3);
expect(r!.ingredients[0].raw_text).toContain('Hackfleisch');
expect(r!.steps.length).toBe(2);
expect(r!.steps[1].text).toContain('Klopse formen');
expect(r!.prep_time_min).toBe(20);
expect(r!.cook_time_min).toBe(25);
expect(r!.servings_default).toBe(4);
expect(r!.cuisine).toBe('Ostpreußisch');
expect(r!.image_path).toBe('/img/klopse.jpg');
});
// A single recipeInstructions container (no itemscope) wrapping an <ol>:
// every <li> is expected to become its own step.
it('handles plain-text recipeInstructions without HowToStep', () => {
const html = `<html><body>
<div itemscope itemtype="http://schema.org/Recipe">
<span itemprop="name">Test</span>
<span itemprop="recipeIngredient">1 Apfel</span>
<div itemprop="recipeInstructions">
<ol>
<li>Schälen.</li>
<li>Essen.</li>
</ol>
</div>
</div>
</body></html>`;
const r = extractRecipeFromHtml(html);
expect(r).not.toBeNull();
expect(r!.steps.length).toBe(2);
expect(r!.steps[0].text).toBe('Schälen.');
});
// Both markups on one page with different titles: the JSON-LD title must win.
it('prefers JSON-LD when both are present', () => {
const html = `<html><head>
<script type="application/ld+json">${JSON.stringify({
'@type': 'Recipe',
name: 'From JSON-LD',
recipeIngredient: ['x'],
recipeInstructions: ['y']
})}</script>
</head><body>
<div itemscope itemtype="https://schema.org/Recipe">
<span itemprop="name">From Microdata</span>
</div>
</body></html>`;
const r = extractRecipeFromHtml(html);
expect(r?.title).toBe('From JSON-LD');
});
});
describe('hasRecipeMarkup', () => {
it('detects JSON-LD Recipe', () => {
const html = `<html><head>