fix(importer): Microdata-Steps bei HowToSection + mehrfach-Schritten
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m19s
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m19s
Rezeptwelt lieferte Zubereitungs-Steps immer als einen einzigen Treffer, oft mit vermischtem Icon-alt-Text. Zwei Ursachen, beide in der generischen Microdata-Logik — kein rezeptwelt-spezifischer Parser nötig. 1. HowToSection wrappt HowToSteps als itemListElement, unser Parser sah nur das erste. Jetzt: recipeInstructions-Container mit itemtype=HowToSection werden abgestiegen, jedes itemListElement wird ein Step. 2. Ein einzelner HowToStep kann intern "1. …<br>2. …<br>3. …" enthalten. Neuer textWithLineBreaks(el) konvertiert <br>/Block-Grenzen zu \n und ignoriert <img>/<script>/<style>. splitStepText(raw) erkennt nummerierte Zeilen und erzeugt einen eigenen Step pro Nummer; Fortsetzungszeilen ohne Nummer hängen an den aktuellen Step an. 3 neue Tests: HowToSection-Kette, inline-nummerierter Multi-Step, <img>-alt-Unterdrückung. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -188,25 +188,106 @@ function microAllTexts(map: MicroProps, name: string): string[] {
|
||||
return els.map(microdataValueOf).filter((v) => v !== '');
|
||||
}
|
||||
|
||||
// Tags whose start and end are treated as line boundaries by
// textWithLineBreaks. <br> is handled separately (it always emits a newline),
// so it is intentionally not listed here.
const LINE_BREAK_BLOCK_TAGS: ReadonlySet<string> = new Set([
  'p', 'div', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'tr',
]);

// Tags whose whole subtree is ignored by textWithLineBreaks so that
// non-text content does not bleed into the recipe text.
const LINE_BREAK_SKIP_TAGS: ReadonlySet<string> = new Set(['script', 'style', 'img', 'noscript']);

/**
 * Extracts the text of `el` while preserving line structure.
 *
 * - Every `<br>` becomes a `\n`.
 * - Block-level elements (`<p>`, `<div>`, `<li>`, headings, `<tr>`) get a
 *   newline boundary before and after their content, unless the output is
 *   still empty or already ends with a newline.
 * - `<script>`, `<style>`, `<img>` and `<noscript>` subtrees are skipped.
 * - Nodes that are neither text nor element nodes (e.g. comments) are ignored.
 *
 * @param el Root element to walk (the root itself is subject to the same rules).
 * @returns The concatenated text; not trimmed, whitespace inside text nodes is
 *          kept as-is.
 */
function textWithLineBreaks(el: Element): string {
  // Numeric nodeType values are compared directly rather than via Node.*
  // constants — presumably so this also works with DOM implementations that
  // do not expose a global `Node` (TODO confirm).
  const ELEMENT_NODE = 1;
  const TEXT_NODE = 3;

  let out = '';

  const walk = (node: Node): void => {
    if (node.nodeType === TEXT_NODE) {
      out += node.nodeValue ?? '';
      return;
    }
    if (node.nodeType !== ELEMENT_NODE) return;

    const tag = (node as Element).tagName.toLowerCase();
    if (LINE_BREAK_SKIP_TAGS.has(tag)) return;
    if (tag === 'br') {
      out += '\n';
      return;
    }

    const block = LINE_BREAK_BLOCK_TAGS.has(tag);
    // Open a new line for a block element, but never emit leading or
    // doubled newlines.
    if (block && out && !out.endsWith('\n')) out += '\n';
    for (const child of Array.from(node.childNodes)) walk(child);
    if (block && out && !out.endsWith('\n')) out += '\n';
  };

  walk(el);
  return out;
}
|
||||
|
||||
/**
 * Splits extracted recipe text into individual steps.
 *
 * Some sites deliver a single HowToStep block that internally bundles several
 * steps as "1. …<br>2. …<br>3. …". The text is split on newlines, each line is
 * whitespace-normalised, and empty lines are dropped. Then:
 *
 * - If at least two lines start with a number followed by `.` or `)` and
 *   whitespace, every such line starts a new step (the number prefix is
 *   stripped) and unnumbered lines are appended to the step before them.
 * - Otherwise all lines are joined with single spaces into one step.
 *
 * @param raw Text with `\n` as line separator.
 * @returns The step texts in order; empty array when `raw` has no content.
 */
function splitStepText(raw: string): string[] {
  const numbered = /^(\d+)[.)]\s+(.+)$/;

  const rows: string[] = [];
  for (const piece of raw.split(/\n+/)) {
    const tidy = piece.replace(/\s+/g, ' ').trim();
    if (tidy) rows.push(tidy);
  }
  if (rows.length === 0) return [];

  let hits = 0;
  for (const row of rows) {
    if (numbered.test(row)) hits++;
  }
  // Fewer than two numbered lines: treat the whole text as a single step.
  if (hits < 2) return [rows.join(' ')];

  const steps: string[] = [];
  let pending = '';
  for (const row of rows) {
    const found = numbered.exec(row);
    if (found) {
      // A new number closes whatever was collected so far.
      if (pending) steps.push(pending);
      pending = found[2];
    } else {
      pending = pending ? `${pending} ${row}` : row;
    }
  }
  if (pending) steps.push(pending);
  return steps;
}
|
||||
|
||||
/**
 * Returns the step texts contained in a microdata step element.
 *
 * Uses the first descendant carrying `itemprop="text"` when there is one,
 * otherwise the element itself; the text is extracted with line breaks
 * preserved and then split into individual steps.
 */
function stepsFromElement(el: Element): string[] {
  const source = el.querySelector('[itemprop="text"]') ?? el;
  return splitStepText(textWithLineBreaks(source));
}
|
||||
|
||||
function microSteps(scope: Element): Step[] {
|
||||
const out: Step[] = [];
|
||||
let pos = 1;
|
||||
const nodes = Array.from(scope.querySelectorAll('[itemprop="recipeInstructions"]'));
|
||||
for (const el of nodes) {
|
||||
if (el.hasAttribute('itemscope')) {
|
||||
const textEl = el.querySelector('[itemprop="text"]');
|
||||
const t = (textEl?.textContent ?? el.textContent ?? '').trim();
|
||||
if (t) out.push({ position: pos++, text: t });
|
||||
const containers = Array.from(scope.querySelectorAll('[itemprop="recipeInstructions"]'));
|
||||
for (const el of containers) {
|
||||
const itemtype = (el.getAttribute('itemtype') ?? '').toLowerCase();
|
||||
if (itemtype.includes('howtosection')) {
|
||||
// HowToSection enthält HowToStep-Kinder als itemListElement.
|
||||
const steps = Array.from(
|
||||
el.querySelectorAll(
|
||||
'[itemprop="itemListElement"]'
|
||||
)
|
||||
);
|
||||
for (const step of steps) {
|
||||
for (const t of stepsFromElement(step)) out.push({ position: pos++, text: t });
|
||||
}
|
||||
} else if (itemtype.includes('howtostep')) {
|
||||
for (const t of stepsFromElement(el)) out.push({ position: pos++, text: t });
|
||||
} else if (el.hasAttribute('itemscope')) {
|
||||
// Anderer unbekannter Scope — trotzdem Text versuchen.
|
||||
for (const t of stepsFromElement(el)) out.push({ position: pos++, text: t });
|
||||
} else {
|
||||
const lis = el.querySelectorAll('li');
|
||||
const lis = Array.from(el.querySelectorAll('li'));
|
||||
if (lis.length > 0) {
|
||||
for (const li of Array.from(lis)) {
|
||||
const t = (li.textContent ?? '').trim();
|
||||
if (t) out.push({ position: pos++, text: t });
|
||||
for (const li of lis) {
|
||||
for (const t of splitStepText(textWithLineBreaks(li))) {
|
||||
out.push({ position: pos++, text: t });
|
||||
}
|
||||
}
|
||||
} else {
|
||||
const t = (el.textContent ?? '').trim();
|
||||
if (t) out.push({ position: pos++, text: t });
|
||||
for (const t of splitStepText(textWithLineBreaks(el))) {
|
||||
out.push({ position: pos++, text: t });
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user