fix(importer): Microdata-Steps bei HowToSection + mehrfach-Schritten
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m19s
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m19s
Rezeptwelt lieferte Zubereitungs-Steps immer als einen einzigen Treffer, oft mit vermischtem Icon-alt-Text. Zwei Ursachen, beide in der generischen Microdata-Logik — kein rezeptwelt-spezifischer Parser nötig. 1. HowToSection wrappt HowToSteps als itemListElement, unser Parser sah nur das erste. Jetzt: In recipeInstructions-Container mit itemtype=HowToSection wird abgestiegen, jedes itemListElement wird ein Step. 2. Ein einzelner HowToStep kann intern "1. …<br>2. …<br>3. …" enthalten. Neuer textWithLineBreaks(el) konvertiert <br>/Block-Grenzen zu \n und ignoriert <img>/<script>/<style>/<noscript>. splitStepText(raw) erkennt nummerierte Zeilen und erzeugt einen eigenen Step pro Nummer; Fortsetzungszeilen ohne Nummer hängen an den aktuellen Step an. 3 neue Tests: HowToSection-Kette, inline-nummerierter Multi-Step, <img>-alt-Unterdrückung. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -188,25 +188,106 @@ function microAllTexts(map: MicroProps, name: string): string[] {
|
|||||||
return els.map(microdataValueOf).filter((v) => v !== '');
|
return els.map(microdataValueOf).filter((v) => v !== '');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Extracts the text of `el` while keeping line structure: every <br> becomes
// "\n", and block-level elements (<p>, <div>, <li>, headings, <tr>) are
// delimited by a newline before and after their content. <script>, <style>,
// <img> and <noscript> subtrees are skipped entirely so that alt attributes
// and other non-text content do not bleed into the recipe text.
//
// The result is NOT trimmed or whitespace-normalised; it may end with "\n".
function textWithLineBreaks(el: Element): string {
  const TEXT_NODE = 3;
  const ELEMENT_NODE = 1;
  const ignoredTags = ['script', 'style', 'img', 'noscript'];
  // <br> is deliberately absent here: it is handled on its own below and
  // always emits a newline, even directly after another newline.
  const blockTags = ['p', 'div', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'tr'];

  let text = '';

  // Appends "\n" unless the buffer is still empty or already ends with one,
  // so adjacent/nested block boundaries collapse into a single break.
  const ensureLineBreak = (): void => {
    if (text !== '' && !text.endsWith('\n')) text += '\n';
  };

  const visit = (node: Node): void => {
    switch (node.nodeType) {
      case TEXT_NODE:
        text += node.nodeValue ?? '';
        return;
      case ELEMENT_NODE:
        break;
      default:
        // Comments, processing instructions etc. contribute nothing.
        return;
    }

    const tagName = (node as Element).tagName.toLowerCase();
    if (ignoredTags.includes(tagName)) return;
    if (tagName === 'br') {
      text += '\n';
      return;
    }

    const isBlock = blockTags.includes(tagName);
    if (isBlock) ensureLineBreak();
    Array.from(node.childNodes).forEach((child) => visit(child));
    if (isBlock) ensureLineBreak();
  };

  visit(el);
  return text;
}
|
||||||
|
|
||||||
|
// Splits extracted recipe text into individual steps. Rezeptwelt and other
// SSR sites often deliver one single HowToStep block that internally combines
// several steps as "1. …<br>2. …<br>3. …".
//
// `raw` is split on newlines; each line has its whitespace collapsed and is
// trimmed, and empty lines are dropped. Returns:
//   - []                     when no non-empty line remains,
//   - one entry per number   when at least two lines start with "N." or "N)"
//                            (the number prefix is removed; unnumbered lines
//                            are appended to the step before them, and
//                            unnumbered lines ahead of the first number form
//                            a step of their own),
//   - [all lines joined]     otherwise (a lone number prefix is left intact).
function splitStepText(raw: string): string[] {
  const numbered = /^(\d+)[.)]\s+(.+)$/;

  const lines: string[] = [];
  for (const piece of raw.split(/\n+/)) {
    const cleaned = piece.replace(/\s+/g, ' ').trim();
    if (cleaned !== '') lines.push(cleaned);
  }
  if (lines.length === 0) return [];

  let numberedLines = 0;
  for (const line of lines) {
    if (numbered.test(line)) numberedLines += 1;
  }
  // Fewer than two numbered lines: treat the whole text as a single step.
  if (numberedLines < 2) return [lines.join(' ')];

  const steps: string[] = [];
  let pending = '';
  const flush = (): void => {
    if (pending !== '') steps.push(pending);
  };

  for (const line of lines) {
    const hit = numbered.exec(line);
    if (hit === null) {
      // Continuation line: glue it onto the step currently being built.
      pending = pending === '' ? line : `${pending} ${line}`;
      continue;
    }
    flush();
    pending = hit[2];
  }
  flush();
  return steps;
}
|
||||||
|
|
||||||
|
// Turns one step-like element into zero or more step strings. The text is
// read from the first descendant marked itemprop="text" when there is one,
// otherwise from the element itself, and is then split into steps.
function stepsFromElement(el: Element): string[] {
  const source: Element = el.querySelector('[itemprop="text"]') ?? el;
  return splitStepText(textWithLineBreaks(source));
}
|
||||||
|
|
||||||
function microSteps(scope: Element): Step[] {
|
function microSteps(scope: Element): Step[] {
|
||||||
const out: Step[] = [];
|
const out: Step[] = [];
|
||||||
let pos = 1;
|
let pos = 1;
|
||||||
const nodes = Array.from(scope.querySelectorAll('[itemprop="recipeInstructions"]'));
|
const containers = Array.from(scope.querySelectorAll('[itemprop="recipeInstructions"]'));
|
||||||
for (const el of nodes) {
|
for (const el of containers) {
|
||||||
if (el.hasAttribute('itemscope')) {
|
const itemtype = (el.getAttribute('itemtype') ?? '').toLowerCase();
|
||||||
const textEl = el.querySelector('[itemprop="text"]');
|
if (itemtype.includes('howtosection')) {
|
||||||
const t = (textEl?.textContent ?? el.textContent ?? '').trim();
|
// HowToSection enthält HowToStep-Kinder als itemListElement.
|
||||||
if (t) out.push({ position: pos++, text: t });
|
const steps = Array.from(
|
||||||
|
el.querySelectorAll(
|
||||||
|
'[itemprop="itemListElement"]'
|
||||||
|
)
|
||||||
|
);
|
||||||
|
for (const step of steps) {
|
||||||
|
for (const t of stepsFromElement(step)) out.push({ position: pos++, text: t });
|
||||||
|
}
|
||||||
|
} else if (itemtype.includes('howtostep')) {
|
||||||
|
for (const t of stepsFromElement(el)) out.push({ position: pos++, text: t });
|
||||||
|
} else if (el.hasAttribute('itemscope')) {
|
||||||
|
// Anderer unbekannter Scope — trotzdem Text versuchen.
|
||||||
|
for (const t of stepsFromElement(el)) out.push({ position: pos++, text: t });
|
||||||
} else {
|
} else {
|
||||||
const lis = el.querySelectorAll('li');
|
const lis = Array.from(el.querySelectorAll('li'));
|
||||||
if (lis.length > 0) {
|
if (lis.length > 0) {
|
||||||
for (const li of Array.from(lis)) {
|
for (const li of lis) {
|
||||||
const t = (li.textContent ?? '').trim();
|
for (const t of splitStepText(textWithLineBreaks(li))) {
|
||||||
if (t) out.push({ position: pos++, text: t });
|
out.push({ position: pos++, text: t });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
const t = (el.textContent ?? '').trim();
|
for (const t of splitStepText(textWithLineBreaks(el))) {
|
||||||
if (t) out.push({ position: pos++, text: t });
|
out.push({ position: pos++, text: t });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -105,6 +105,60 @@ describe('extractRecipeFromHtml — Microdata fallback', () => {
|
|||||||
expect(r!.steps[0].text).toBe('Schälen.');
|
expect(r!.steps[0].text).toBe('Schälen.');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('splits a single HowToStep containing "1.<br>2.<br>3." into separate steps', () => {
|
||||||
|
const html = `<html><body>
|
||||||
|
<div itemscope itemtype="https://schema.org/Recipe">
|
||||||
|
<span itemprop="name">Multi-step</span>
|
||||||
|
<span itemprop="recipeIngredient">x</span>
|
||||||
|
<div itemprop="recipeInstructions" itemscope itemtype="https://schema.org/HowToStep">
|
||||||
|
<p itemprop="text">1. Teig kneten.<br>2. Gehen lassen.<br>3. Backen.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</body></html>`;
|
||||||
|
const r = extractRecipeFromHtml(html);
|
||||||
|
expect(r).not.toBeNull();
|
||||||
|
expect(r!.steps.length).toBe(3);
|
||||||
|
expect(r!.steps[0].text).toBe('Teig kneten.');
|
||||||
|
expect(r!.steps[1].text).toBe('Gehen lassen.');
|
||||||
|
expect(r!.steps[2].text).toBe('Backen.');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('handles HowToSection wrapping multiple HowToStep itemListElements', () => {
|
||||||
|
const html = `<html><body>
|
||||||
|
<div itemscope itemtype="https://schema.org/Recipe">
|
||||||
|
<span itemprop="name">Sections</span>
|
||||||
|
<span itemprop="recipeIngredient">x</span>
|
||||||
|
<div itemprop="recipeInstructions" itemscope itemtype="https://schema.org/HowToSection">
|
||||||
|
<div itemprop="itemListElement" itemscope itemtype="https://schema.org/HowToStep">
|
||||||
|
<span itemprop="text">Erst schneiden.</span>
|
||||||
|
</div>
|
||||||
|
<div itemprop="itemListElement" itemscope itemtype="https://schema.org/HowToStep">
|
||||||
|
<span itemprop="text">Dann kochen.</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</body></html>`;
|
||||||
|
const r = extractRecipeFromHtml(html);
|
||||||
|
expect(r!.steps.length).toBe(2);
|
||||||
|
expect(r!.steps[0].text).toBe('Erst schneiden.');
|
||||||
|
expect(r!.steps[1].text).toBe('Dann kochen.');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('ignores <img> alt/title content in step text', () => {
|
||||||
|
const html = `<html><body>
|
||||||
|
<div itemscope itemtype="https://schema.org/Recipe">
|
||||||
|
<span itemprop="name">WithIcon</span>
|
||||||
|
<span itemprop="recipeIngredient">x</span>
|
||||||
|
<div itemprop="recipeInstructions" itemscope itemtype="https://schema.org/HowToStep">
|
||||||
|
<span itemprop="text">Teig <img alt="Icon Teig kneten" src="/x.png"> verarbeiten.</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</body></html>`;
|
||||||
|
const r = extractRecipeFromHtml(html);
|
||||||
|
expect(r!.steps[0].text).not.toMatch(/Icon Teig kneten/);
|
||||||
|
expect(r!.steps[0].text).toMatch(/Teig.*verarbeiten/);
|
||||||
|
});
|
||||||
|
|
||||||
it('prefers JSON-LD when both are present', () => {
|
it('prefers JSON-LD when both are present', () => {
|
||||||
const html = `<html><head>
|
const html = `<html><head>
|
||||||
<script type="application/ld+json">${JSON.stringify({
|
<script type="application/ld+json">${JSON.stringify({
|
||||||
|
|||||||
Reference in New Issue
Block a user