feat(parser): add JSON-LD schema.org/Recipe extractor

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-17 15:04:05 +02:00
parent 789af122f4
commit 2f3248c9a3
4 changed files with 2266 additions and 0 deletions

View File

@@ -0,0 +1,159 @@
import { parseHTML } from 'linkedom';
import { parseIso8601Duration } from './iso8601-duration';
import { parseIngredient } from './ingredient';
import type { Recipe, Step } from '$lib/types';
type JsonLdNode = Record<string, unknown>;
/**
 * Flattens a parsed JSON-LD payload into a flat list of object nodes.
 *
 * - Arrays are flattened recursively, element by element.
 * - An object with a truthy `@graph` entry is replaced by the flattened
 *   contents of that entry (the wrapper object itself is not returned).
 * - Any other object is returned as a single node.
 * - Primitives, `null` and `undefined` contribute nothing.
 *
 * @param node - Any value, typically the result of `JSON.parse`.
 * @returns The object nodes found, in traversal order.
 */
function unwrapGraph(node: unknown): JsonLdNode[] {
  if (Array.isArray(node)) {
    const collected: JsonLdNode[] = [];
    for (const entry of node) {
      for (const found of unwrapGraph(entry)) collected.push(found);
    }
    return collected;
  }
  if (typeof node !== 'object' || node === null) return [];

  const record = node as JsonLdNode;
  const graph = record['@graph'];
  return graph ? unwrapGraph(graph) : [record];
}
/**
 * Tells whether a JSON-LD `@type` value denotes a Recipe.
 *
 * A string matches when it is exactly `Recipe` or ends with `/Recipe`
 * (e.g. a full IRI such as `https://schema.org/Recipe`). An array matches
 * when any of its entries matches, recursively. Everything else is rejected.
 *
 * @param t - The raw `@type` value of a node.
 * @returns `true` if the value names the Recipe type.
 */
function isRecipeType(t: unknown): boolean {
  if (Array.isArray(t)) {
    for (const entry of t) {
      if (isRecipeType(entry)) return true;
    }
    return false;
  }
  return typeof t === 'string' && (t === 'Recipe' || t.endsWith('/Recipe'));
}
/**
 * Coerces a loosely-typed JSON-LD value to a trimmed, non-empty string.
 *
 * - A string is trimmed; a blank string yields `null`.
 * - An array yields the text of its first element that produces any text,
 *   so leading blank or unusable entries do not hide later ones.
 * - An object yields its `name` if that is a non-blank string, otherwise its
 *   `text` if that is a non-blank string.
 * - Anything else yields `null`.
 *
 * The function never returns an empty string: "no usable text" is always
 * reported as `null`, regardless of the input shape.
 *
 * @param v - The raw property value.
 * @returns The trimmed text, or `null` when none is available.
 */
function toText(v: unknown): string | null {
  if (typeof v === 'string') return v.trim() || null;

  if (Array.isArray(v)) {
    for (const item of v) {
      const text = toText(item);
      if (text !== null) return text;
    }
    return null;
  }

  if (v && typeof v === 'object') {
    const o = v as JsonLdNode;
    // Prefer `name`, but fall back to `text` when `name` is missing or blank.
    for (const key of ['name', 'text'] as const) {
      const field = o[key];
      if (typeof field === 'string') {
        const trimmed = field.trim();
        if (trimmed) return trimmed;
      }
    }
  }
  return null;
}
/**
 * Extracts an image URL from a JSON-LD `image` value.
 *
 * Strings are returned unchanged (no trimming). For non-empty arrays only the
 * first element is considered, following nested arrays the same way. For
 * objects the `url` property is returned when it is a string.
 *
 * @param v - The raw `image` value.
 * @returns The URL string, or `null` when none can be derived.
 */
function toImageUrl(v: unknown): string | null {
  let current: unknown = v;
  // Descend through (possibly nested) arrays, always following element 0.
  while (Array.isArray(current) && current.length > 0) {
    current = current[0];
  }

  if (typeof current === 'string') return current;

  if (current && typeof current === 'object') {
    const { url } = current as JsonLdNode;
    if (typeof url === 'string') return url;
  }
  return null;
}
/**
 * Normalises a JSON-LD value into a list of strings.
 *
 * - An array is converted element by element with `toText`; elements for
 *   which `toText` returns `null` are dropped.
 * - A string is treated as a comma-separated list: each piece is trimmed and
 *   empty pieces are dropped.
 * - Any other value yields an empty list.
 *
 * @param v - The raw property value.
 * @returns The extracted strings, in input order.
 */
function toStringArray(v: unknown): string[] {
  if (Array.isArray(v)) {
    const texts: string[] = [];
    for (const entry of v) {
      const text = toText(entry);
      if (text !== null) texts.push(text);
    }
    return texts;
  }

  if (typeof v !== 'string') return [];

  const pieces: string[] = [];
  for (const chunk of v.split(',')) {
    const piece = chunk.trim();
    if (piece) pieces.push(piece);
  }
  return pieces;
}
/**
 * Flattens a JSON-LD `recipeInstructions` value into an ordered list of steps.
 *
 * Accepted shapes, handled recursively:
 * - a string: one step (blank strings are ignored);
 * - an array: each element is processed in order;
 * - a `HowToSection` (with `@type` given as a string or as an array containing
 *   it) that has `itemListElement`: its children are processed and the
 *   section's own text is ignored;
 * - any other object with a non-blank string `text`: one step;
 * - any other object without usable `text` but with `itemListElement`
 *   (for example an `ItemList`, or a `HowToStep` that only holds
 *   directions): its children are processed.
 *
 * Step texts are trimmed and `position` is assigned sequentially from 1 in
 * the order steps are emitted.
 *
 * @param v - The raw `recipeInstructions` value.
 * @returns The extracted steps; empty when nothing usable is found.
 */
function toSteps(v: unknown): Step[] {
  const out: Step[] = [];

  // Appends a step when `raw` has non-blank content; reports whether it did.
  const pushText = (raw: string): boolean => {
    const text = raw.trim();
    if (!text) return false;
    out.push({ position: out.length + 1, text });
    return true;
  };

  // `@type` may be a single string or an array of type names.
  const hasType = (type: unknown, name: string): boolean =>
    type === name || (Array.isArray(type) && type.includes(name));

  const walk = (x: unknown): void => {
    if (Array.isArray(x)) {
      for (const item of x) walk(item);
      return;
    }
    if (typeof x === 'string') {
      pushText(x);
      return;
    }
    if (!x || typeof x !== 'object') return;

    const obj = x as JsonLdNode;
    const children = obj.itemListElement;

    // Sections are pure containers: descend and skip the section's own text.
    if (children && hasType(obj['@type'], 'HowToSection')) {
      walk(children);
      return;
    }
    if (typeof obj.text === 'string' && pushText(obj.text)) return;

    // No usable text on this node: fall back to nested list elements, if any.
    if (children) walk(children);
  };

  walk(v);
  return out;
}
/**
 * Derives a serving count from a JSON-LD `recipeYield` value.
 *
 * - A finite number is truncated towards zero.
 * - A string yields the first run of decimal digits it contains
 *   (e.g. `"4-6 servings"` gives 4).
 * - A non-empty array is resolved from its first element only.
 * - Everything else, including non-finite numbers and strings without
 *   digits, yields `null`.
 *
 * @param v - The raw `recipeYield` value.
 * @returns The serving count, or `null` when none can be derived.
 */
function toServings(v: unknown): number | null {
  if (typeof v === 'number') {
    return Number.isFinite(v) ? Math.trunc(v) : null;
  }
  if (typeof v === 'string') {
    const digits = v.match(/\d+/);
    return digits ? Number.parseInt(digits[0], 10) : null;
  }
  return Array.isArray(v) && v.length > 0 ? toServings(v[0]) : null;
}
/**
 * Locates the first Recipe node among the JSON-LD blocks of an HTML document.
 *
 * Every `<script type="application/ld+json">` element is visited in document
 * order. Its text is parsed as JSON, flattened with `unwrapGraph`, and the
 * first node whose `@type` satisfies `isRecipeType` is returned. Scripts that
 * are empty or fail while being parsed/inspected are skipped so that one
 * broken block does not prevent later ones from being used.
 *
 * @param html - The raw HTML source of the page.
 * @returns The first Recipe node found, or `null` if there is none.
 */
function findRecipeNode(html: string): JsonLdNode | null {
  const { document } = parseHTML(html);
  const jsonLdScripts = document.querySelectorAll('script[type="application/ld+json"]');

  for (const tag of jsonLdScripts) {
    const payload = tag.textContent;
    if (payload) {
      try {
        const match = unwrapGraph(JSON.parse(payload)).find((candidate) =>
          isRecipeType(candidate['@type'])
        );
        if (match) return match;
      } catch {
        // Malformed JSON-LD in this script: ignore it and try the next one.
      }
    }
  }
  return null;
}
/**
 * Builds a `Recipe` from the schema.org Recipe JSON-LD embedded in a page.
 *
 * Returns `null` when the page has no Recipe node or when that node has no
 * usable `name`. Otherwise the node's properties are mapped as follows:
 * - `recipeIngredient`: string entries are passed to `parseIngredient` together
 *   with their 1-based index in the original array; non-string entries and
 *   `null` results are dropped.
 * - `recipeInstructions`: flattened by `toSteps`.
 * - `prepTime` / `cookTime` / `totalTime`: string values are passed to
 *   `parseIso8601Duration`; non-string values are passed as `undefined`.
 * - `recipeCategory`, `recipeCuisine` and `keywords` are merged into a
 *   de-duplicated `tags` list.
 * - `image_path` receives the image URL exactly as found in the JSON-LD.
 *
 * `id`, `source_domain` and `servings_unit` are always `null` here.
 *
 * @param html - The raw HTML source of the page.
 * @returns The extracted recipe, or `null` if none could be extracted.
 */
export function extractRecipeFromHtml(html: string): Recipe | null {
  const node = findRecipeNode(html);
  if (!node) return null;

  // A recipe without a usable title is treated as "no recipe".
  const title = toText(node.name);
  if (!title) return null;

  const rawIngredients = Array.isArray(node.recipeIngredient)
    ? (node.recipeIngredient as unknown[])
    : [];
  const ingredients = rawIngredients
    .map((entry, index) => (typeof entry === 'string' ? parseIngredient(entry, index + 1) : null))
    .filter((entry): entry is NonNullable<typeof entry> => entry !== null);

  // Only string durations are forwarded to the ISO 8601 parser.
  const durationOf = (value: unknown) =>
    parseIso8601Duration(typeof value === 'string' ? value : undefined);

  const steps = toSteps(node.recipeInstructions);
  const imageUrl = toImageUrl(node.image);
  const prep = durationOf(node.prepTime);
  const cook = durationOf(node.cookTime);
  const total = durationOf(node.totalTime);

  // A Set removes duplicates while keeping first-seen order.
  const tags = new Set<string>();
  for (const source of [node.recipeCategory, node.recipeCuisine, node.keywords]) {
    for (const tag of toStringArray(source)) tags.add(tag);
  }

  return {
    id: null,
    title,
    description: toText(node.description),
    source_url: typeof node.url === 'string' ? node.url : null,
    source_domain: null,
    image_path: imageUrl,
    servings_default: toServings(node.recipeYield),
    servings_unit: null,
    prep_time_min: prep,
    cook_time_min: cook,
    total_time_min: total,
    cuisine: toText(node.recipeCuisine),
    category: toText(node.recipeCategory),
    ingredients,
    steps,
    tags: Array.from(tags)
  };
}

File diff suppressed because one or more lines are too long

1941
tests/fixtures/emmi-bolognese.html vendored Normal file

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,44 @@
import { describe, it, expect } from 'vitest';
import { readFileSync } from 'node:fs';
import { dirname, join } from 'node:path';
import { fileURLToPath } from 'node:url';
import { extractRecipeFromHtml } from '../../src/lib/server/parsers/json-ld-recipe';
// Directory containing this spec file, resolved from the module URL.
const here = dirname(fileURLToPath(import.meta.url));

/**
 * Reads a fixture file as UTF-8 text.
 *
 * @param name - File name inside the `../fixtures` directory, relative to this spec.
 * @returns The file contents.
 */
function load(name: string): string {
  const fixturePath = join(here, '../fixtures', name);
  return readFileSync(fixturePath, { encoding: 'utf8' });
}
describe('extractRecipeFromHtml', () => {
  // Fixture-based cases assert only coarse properties (title substring,
  // minimum counts) rather than exact parsed output.
  it('extracts a recipe from Chefkoch HTML', () => {
    const recipe = extractRecipeFromHtml(load('chefkoch-schupfnudeln.html'));

    expect(recipe).not.toBeNull();
    expect(recipe!.title.toLowerCase()).toContain('schupfnudel');
    expect(recipe!.ingredients.length).toBeGreaterThan(2);
    expect(recipe!.steps.length).toBeGreaterThan(0);
  });

  it('extracts a recipe from Emmi kocht einfach HTML', () => {
    const recipe = extractRecipeFromHtml(load('emmi-bolognese.html'));

    expect(recipe).not.toBeNull();
    expect(recipe!.title.toLowerCase()).toContain('bolognese');
    expect(recipe!.ingredients.length).toBeGreaterThan(0);
    expect(recipe!.steps.length).toBeGreaterThan(0);
  });

  // Page without any JSON-LD script at all.
  it('returns null when no Recipe JSON-LD present', () => {
    const result = extractRecipeFromHtml('<html><body><p>no recipe</p></body></html>');

    expect(result).toBeNull();
  });

  // Page with valid JSON-LD that describes something other than a Recipe.
  it('returns null when JSON-LD has only non-Recipe types', () => {
    const html = `
<html><head>
<script type="application/ld+json">{"@context":"https://schema.org","@type":"Organization","name":"Foo"}</script>
</head></html>`;
    const result = extractRecipeFromHtml(html);

    expect(result).toBeNull();
  });
});