fix(importer): Microdata-Steps bei HowToSection + mehrfach-Schritten
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m19s
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m19s
Rezeptwelt lieferte Zubereitungs-Steps immer als einen einzigen Treffer, oft mit vermischtem Icon-alt-Text. Zwei Ursachen, beide in der generischen Microdata-Logik — kein rezeptwelt-spezifischer Parser nötig. 1. HowToSection wrappt HowToSteps als itemListElement, unser Parser sah nur das erste. Jetzt: recipeInstructions-Container mit itemtype=HowToSection werden abgestiegen, jedes itemListElement wird ein Step. 2. Ein einzelner HowToStep kann intern "1. …<br>2. …<br>3. …" enthalten. Neuer textWithLineBreaks(el) konvertiert <br>/Block-Grenzen zu \n und ignoriert <img>/<script>/<style>. splitStepText(raw) erkennt nummerierte Zeilen und erzeugt einen eigenen Step pro Nummer; Fortsetzungszeilen ohne Nummer hängen an den aktuellen Step an. 3 neue Tests: HowToSection-Kette, inline-nummerierter Multi-Step, <img>-alt-Unterdrückung. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -188,25 +188,106 @@ function microAllTexts(map: MicroProps, name: string): string[] {
|
||||
return els.map(microdataValueOf).filter((v) => v !== '');
|
||||
}
|
||||
|
||||
/**
 * Extracts the text of `el` while preserving its line structure.
 *
 * - `<br>` becomes a single `\n`.
 * - Block-like elements (paragraphs, list items, headings, table rows and
 *   cells, definition-list entries, block quotes) get a newline boundary
 *   before and after their content, so adjacent blocks never fuse into one
 *   word. A boundary is only inserted when the output is non-empty and does
 *   not already end in `\n`.
 * - `<img>`, `<svg>`, `<script>`, `<style>` and `<noscript>` subtrees are
 *   skipped entirely, so alt text, inline-icon titles and other non-text
 *   content do not bleed into the recipe text.
 *
 * Whitespace inside text nodes is copied verbatim; no trimming or collapsing
 * happens here.
 *
 * @param el Root element whose subtree is serialised.
 * @returns The concatenated text with `\n` at line and block boundaries.
 */
function textWithLineBreaks(el: Element): string {
  // Values of Node.ELEMENT_NODE / Node.TEXT_NODE as defined by the DOM standard.
  const ELEMENT_NODE = 1;
  const TEXT_NODE = 3;

  // `br` is intentionally absent: it is handled separately and always emits "\n".
  const blockTags = new Set([
    'p', 'div', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'tr', 'td', 'th', 'dt', 'dd', 'blockquote',
  ]);
  // Subtrees that never contribute readable step text.
  const skippedTags = new Set(['script', 'style', 'img', 'noscript', 'svg']);

  let out = '';

  // Terminates the current line unless we are at the very start or already on a fresh line.
  const closeLine = (): void => {
    if (out && !out.endsWith('\n')) out += '\n';
  };

  const walk = (node: Node): void => {
    if (node.nodeType === TEXT_NODE) {
      out += node.nodeValue ?? '';
      return;
    }
    // Comments, processing instructions etc. carry no visible text.
    if (node.nodeType !== ELEMENT_NODE) return;

    const tag = (node as Element).tagName.toLowerCase();
    if (skippedTags.has(tag)) return;
    if (tag === 'br') {
      out += '\n';
      return;
    }

    const isBlock = blockTags.has(tag);
    if (isBlock) closeLine();
    for (const child of Array.from(node.childNodes)) walk(child);
    if (isBlock) closeLine();
  };

  walk(el);
  return out;
}
|
||||
|
||||
/**
 * Splits extracted recipe text into individual steps.
 *
 * Some server-rendered sites deliver a single HowToStep block that internally
 * bundles several steps as "1. …<br>2. …<br>3. …". The input is split on
 * newlines, each line has its whitespace collapsed and trimmed, and empty
 * lines are dropped.
 *
 * - No remaining lines: returns `[]`.
 * - Two or more lines starting with "<digits>." or "<digits>)" followed by
 *   whitespace: every numbered line opens a new step (without its number);
 *   unnumbered lines are appended to the step currently being built.
 * - Otherwise: all lines are joined with single spaces into one step.
 *
 * @param raw Text with `\n` marking line boundaries.
 * @returns The step texts in document order.
 */
function splitStepText(raw: string): string[] {
  const numbered = /^(\d+)[.)]\s+(.+)$/;

  const lines: string[] = [];
  for (const piece of raw.split(/\n+/)) {
    const cleaned = piece.replace(/\s+/g, ' ').trim();
    if (cleaned) lines.push(cleaned);
  }
  if (lines.length === 0) return [];

  let numberedCount = 0;
  for (const line of lines) {
    if (numbered.test(line)) numberedCount += 1;
  }

  // A single numbered line is not treated as an enumeration; keep everything as one step.
  if (numberedCount < 2) return [lines.join(' ')];

  const steps: string[] = [];
  let current = '';
  for (const line of lines) {
    const hit = line.match(numbered);
    if (hit) {
      // A new number closes the step collected so far.
      if (current) steps.push(current);
      current = hit[2];
    } else if (current) {
      // Continuation line: belongs to the step being built.
      current += ' ' + line;
    } else {
      current = line;
    }
  }
  if (current) steps.push(current);
  return steps;
}
|
||||
|
||||
/**
 * Turns one microdata element into zero or more step texts.
 *
 * The first descendant carrying `itemprop="text"` is used as the text source
 * when present; otherwise the element itself is used. Its text is extracted
 * with line breaks preserved and then split into steps.
 *
 * @param el Element to read the step text from.
 * @returns The step texts produced by `splitStepText`.
 */
function stepsFromElement(el: Element): string[] {
  const explicitText = el.querySelector('[itemprop="text"]');
  const source = explicitText ?? el;
  return splitStepText(textWithLineBreaks(source));
}
|
||||
|
||||
function microSteps(scope: Element): Step[] {
|
||||
const out: Step[] = [];
|
||||
let pos = 1;
|
||||
const nodes = Array.from(scope.querySelectorAll('[itemprop="recipeInstructions"]'));
|
||||
for (const el of nodes) {
|
||||
if (el.hasAttribute('itemscope')) {
|
||||
const textEl = el.querySelector('[itemprop="text"]');
|
||||
const t = (textEl?.textContent ?? el.textContent ?? '').trim();
|
||||
if (t) out.push({ position: pos++, text: t });
|
||||
const containers = Array.from(scope.querySelectorAll('[itemprop="recipeInstructions"]'));
|
||||
for (const el of containers) {
|
||||
const itemtype = (el.getAttribute('itemtype') ?? '').toLowerCase();
|
||||
if (itemtype.includes('howtosection')) {
|
||||
// HowToSection enthält HowToStep-Kinder als itemListElement.
|
||||
const steps = Array.from(
|
||||
el.querySelectorAll(
|
||||
'[itemprop="itemListElement"]'
|
||||
)
|
||||
);
|
||||
for (const step of steps) {
|
||||
for (const t of stepsFromElement(step)) out.push({ position: pos++, text: t });
|
||||
}
|
||||
} else if (itemtype.includes('howtostep')) {
|
||||
for (const t of stepsFromElement(el)) out.push({ position: pos++, text: t });
|
||||
} else if (el.hasAttribute('itemscope')) {
|
||||
// Anderer unbekannter Scope — trotzdem Text versuchen.
|
||||
for (const t of stepsFromElement(el)) out.push({ position: pos++, text: t });
|
||||
} else {
|
||||
const lis = el.querySelectorAll('li');
|
||||
const lis = Array.from(el.querySelectorAll('li'));
|
||||
if (lis.length > 0) {
|
||||
for (const li of Array.from(lis)) {
|
||||
const t = (li.textContent ?? '').trim();
|
||||
if (t) out.push({ position: pos++, text: t });
|
||||
for (const li of lis) {
|
||||
for (const t of splitStepText(textWithLineBreaks(li))) {
|
||||
out.push({ position: pos++, text: t });
|
||||
}
|
||||
}
|
||||
} else {
|
||||
const t = (el.textContent ?? '').trim();
|
||||
if (t) out.push({ position: pos++, text: t });
|
||||
for (const t of splitStepText(textWithLineBreaks(el))) {
|
||||
out.push({ position: pos++, text: t });
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -105,6 +105,60 @@ describe('extractRecipeFromHtml — Microdata fallback', () => {
|
||||
expect(r!.steps[0].text).toBe('Schälen.');
|
||||
});
|
||||
|
||||
it('splits a single HowToStep containing "1.<br>2.<br>3." into separate steps', () => {
  // One HowToStep whose text holds three numbered lines separated by <br>.
  const html = [
    '<html><body>',
    '<div itemscope itemtype="https://schema.org/Recipe">',
    '<span itemprop="name">Multi-step</span>',
    '<span itemprop="recipeIngredient">x</span>',
    '<div itemprop="recipeInstructions" itemscope itemtype="https://schema.org/HowToStep">',
    '<p itemprop="text">1. Teig kneten.<br>2. Gehen lassen.<br>3. Backen.</p>',
    '</div>',
    '</div>',
    '</body></html>',
  ].join('\n');

  const recipe = extractRecipeFromHtml(html);

  expect(recipe).not.toBeNull();
  // Each numbered line becomes its own step, with the numbering removed.
  const stepTexts = recipe!.steps;
  expect(stepTexts.length).toBe(3);
  expect(stepTexts[0].text).toBe('Teig kneten.');
  expect(stepTexts[1].text).toBe('Gehen lassen.');
  expect(stepTexts[2].text).toBe('Backen.');
});
|
||||
|
||||
it('handles HowToSection wrapping multiple HowToStep itemListElements', () => {
  // A HowToSection container with two HowToStep children as itemListElement.
  const html = `<html><body>
<div itemscope itemtype="https://schema.org/Recipe">
<span itemprop="name">Sections</span>
<span itemprop="recipeIngredient">x</span>
<div itemprop="recipeInstructions" itemscope itemtype="https://schema.org/HowToSection">
<div itemprop="itemListElement" itemscope itemtype="https://schema.org/HowToStep">
<span itemprop="text">Erst schneiden.</span>
</div>
<div itemprop="itemListElement" itemscope itemtype="https://schema.org/HowToStep">
<span itemprop="text">Dann kochen.</span>
</div>
</div>
</div>
</body></html>`;
  const r = extractRecipeFromHtml(html);
  // Guard first so a failed extraction reports an assertion failure
  // instead of a TypeError when dereferencing `r!.steps`.
  expect(r).not.toBeNull();
  // Every itemListElement must yield exactly one step, in document order.
  expect(r!.steps.length).toBe(2);
  expect(r!.steps[0].text).toBe('Erst schneiden.');
  expect(r!.steps[1].text).toBe('Dann kochen.');
});
|
||||
|
||||
it('ignores <img> alt/title content in step text', () => {
  // The step text contains an inline icon whose alt text must not leak into the result.
  const html = `<html><body>
<div itemscope itemtype="https://schema.org/Recipe">
<span itemprop="name">WithIcon</span>
<span itemprop="recipeIngredient">x</span>
<div itemprop="recipeInstructions" itemscope itemtype="https://schema.org/HowToStep">
<span itemprop="text">Teig <img alt="Icon Teig kneten" src="/x.png"> verarbeiten.</span>
</div>
</div>
</body></html>`;
  const r = extractRecipeFromHtml(html);
  // Guard first so a failed extraction reports an assertion failure
  // instead of a TypeError when dereferencing `r!.steps`.
  expect(r).not.toBeNull();
  expect(r!.steps[0].text).not.toMatch(/Icon Teig kneten/);
  expect(r!.steps[0].text).toMatch(/Teig.*verarbeiten/);
});
|
||||
|
||||
it('prefers JSON-LD when both are present', () => {
|
||||
const html = `<html><head>
|
||||
<script type="application/ld+json">${JSON.stringify({
|
||||
|
||||
Reference in New Issue
Block a user