feat(parser): add JSON-LD schema.org/Recipe extractor
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
159
src/lib/server/parsers/json-ld-recipe.ts
Normal file
159
src/lib/server/parsers/json-ld-recipe.ts
Normal file
@@ -0,0 +1,159 @@
|
||||
import { parseHTML } from 'linkedom';
|
||||
import { parseIso8601Duration } from './iso8601-duration';
|
||||
import { parseIngredient } from './ingredient';
|
||||
import type { Recipe, Step } from '$lib/types';
|
||||
|
||||
/** A JSON-LD object node: string keys mapped to values that have not been narrowed yet. */
type JsonLdNode = Record<string, unknown>;
|
||||
|
||||
/**
 * Flattens a parsed JSON-LD value into a list of object nodes.
 *
 * Arrays are flattened recursively, an object carrying a truthy `@graph`
 * is replaced by the nodes found inside that graph, and any other object
 * is returned as a single-element list. Non-object values yield nothing.
 *
 * @param node - Arbitrary value, typically the result of `JSON.parse`.
 * @returns The object nodes reachable through arrays and `@graph` wrappers.
 */
function unwrapGraph(node: unknown): JsonLdNode[] {
  if (Array.isArray(node)) {
    const collected: JsonLdNode[] = [];
    for (const entry of node) {
      collected.push(...unwrapGraph(entry));
    }
    return collected;
  }

  if (typeof node !== 'object' || node === null) {
    return [];
  }

  const record = node as JsonLdNode;
  const graph = record['@graph'];
  return graph ? unwrapGraph(graph) : [record];
}
|
||||
|
||||
/**
 * Tells whether a JSON-LD `@type` value designates a schema.org Recipe.
 *
 * Accepted spellings are the bare term `Recipe`, an IRI ending in
 * `/Recipe` (e.g. `https://schema.org/Recipe`) and a compact IRI ending in
 * `:Recipe` (e.g. `schema:Recipe`). An array matches when any of its
 * entries matches.
 *
 * @param t - The raw `@type` value of a node.
 * @returns `true` when the value names the Recipe type.
 */
function isRecipeType(t: unknown): boolean {
  if (Array.isArray(t)) return t.some(isRecipeType);
  if (typeof t !== 'string') return false;
  // The separator is required so that names like "MyRecipe" do not match.
  return t === 'Recipe' || t.endsWith('/Recipe') || t.endsWith(':Recipe');
}
|
||||
|
||||
/**
 * Reduces a JSON-LD property value to a single trimmed, non-empty string.
 *
 * - A string is trimmed; a blank string counts as missing.
 * - An array yields the first element that produces text.
 * - An object yields its `name`, or else its `text`, skipping blank values.
 *
 * @param v - Raw property value.
 * @returns The extracted text, or `null` when no non-empty text is found.
 */
function toText(v: unknown): string | null {
  if (typeof v === 'string') return v.trim() || null;

  if (Array.isArray(v)) {
    for (const item of v) {
      const text = toText(item);
      if (text !== null) return text;
    }
    return null;
  }

  if (v && typeof v === 'object') {
    const o = v as JsonLdNode;
    // A blank `name` must not shadow a usable `text`, and an empty string is
    // never returned: callers treat `null` as the only "missing" marker.
    for (const candidate of [o.name, o.text]) {
      if (typeof candidate === 'string') {
        const trimmed = candidate.trim();
        if (trimmed) return trimmed;
      }
    }
  }

  return null;
}
|
||||
|
||||
/**
 * Extracts an image URL from a JSON-LD `image` value.
 *
 * - A string is trimmed; a blank string counts as missing.
 * - An array yields the first element that produces a URL.
 * - An object yields its `url`, or else its `contentUrl`, when that is a
 *   non-blank string.
 *
 * @param v - Raw `image` property value.
 * @returns The URL string, or `null` when none is found.
 */
function toImageUrl(v: unknown): string | null {
  if (typeof v === 'string') return v.trim() || null;

  if (Array.isArray(v)) {
    for (const item of v) {
      const url = toImageUrl(item);
      if (url !== null) return url;
    }
    return null;
  }

  if (v && typeof v === 'object') {
    const o = v as JsonLdNode;
    // `url` keeps priority; `contentUrl` covers image objects that only
    // carry the media location.
    for (const candidate of [o.url, o.contentUrl]) {
      if (typeof candidate === 'string') {
        const trimmed = candidate.trim();
        if (trimmed) return trimmed;
      }
    }
  }

  return null;
}
|
||||
|
||||
/**
 * Normalises a JSON-LD property value into a list of strings.
 *
 * A string is treated as a comma-separated list whose pieces are trimmed
 * and dropped when empty. An array is mapped element-wise through `toText`,
 * discarding elements for which it returns `null`. Anything else yields an
 * empty list.
 *
 * @param v - Raw property value.
 * @returns The collected strings, in input order.
 */
function toStringArray(v: unknown): string[] {
  if (typeof v === 'string') {
    const pieces: string[] = [];
    for (const piece of v.split(',')) {
      const cleaned = piece.trim();
      if (cleaned) pieces.push(cleaned);
    }
    return pieces;
  }

  if (!Array.isArray(v)) return [];

  const texts: string[] = [];
  for (const entry of v) {
    const text = toText(entry);
    if (text !== null) texts.push(text);
  }
  return texts;
}
|
||||
|
||||
/**
 * Flattens a `recipeInstructions` value into an ordered list of steps.
 *
 * The value is walked depth-first:
 * - arrays are visited element by element;
 * - a non-blank string becomes a step;
 * - a `HowToSection` (with `@type` given as a string or inside an array)
 *   that has `itemListElement` contributes its nested items only;
 * - any other object with a non-blank string `text` becomes a step;
 * - an object without usable `text` but with `itemListElement` has its
 *   nested items visited instead.
 *
 * Positions are 1-based and assigned in visiting order.
 *
 * @param v - Raw `recipeInstructions` value.
 * @returns The collected steps; empty when nothing usable is found.
 */
function toSteps(v: unknown): Step[] {
  const out: Step[] = [];

  const pushText = (text: string): void => {
    const trimmed = text.trim();
    if (trimmed) out.push({ position: out.length + 1, text: trimmed });
  };

  // `@type` may be a single string or an array of strings.
  const hasType = (t: unknown, name: string): boolean =>
    t === name || (Array.isArray(t) && t.includes(name));

  const walk = (x: unknown): void => {
    if (Array.isArray(x)) {
      for (const item of x) walk(item);
      return;
    }
    if (typeof x === 'string') {
      pushText(x);
      return;
    }
    if (!x || typeof x !== 'object') return;

    const obj = x as JsonLdNode;
    const nested = obj.itemListElement;

    // A section is only a grouping: its own text is ignored in favour of
    // the steps it contains.
    if (nested && hasType(obj['@type'], 'HowToSection')) {
      walk(nested);
      return;
    }
    if (typeof obj.text === 'string' && obj.text.trim()) {
      pushText(obj.text);
      return;
    }
    // Containers without their own text still carry nested instructions
    // that would otherwise be lost.
    if (nested) walk(nested);
  };

  walk(v);
  return out;
}
|
||||
|
||||
/**
 * Derives an integer serving count from a `recipeYield` value.
 *
 * - A finite number is truncated towards zero.
 * - A string yields the first run of digits it contains.
 * - An array yields the first element that produces a count.
 * - An object is resolved through its `value` property, which covers
 *   quantity objects such as `{ "@type": "QuantitativeValue", "value": 4 }`.
 *
 * @param v - Raw `recipeYield` value.
 * @returns The serving count, or `null` when none can be derived.
 */
function toServings(v: unknown): number | null {
  if (typeof v === 'number') return Number.isFinite(v) ? Math.trunc(v) : null;

  if (typeof v === 'string') {
    const digits = /\d+/.exec(v);
    return digits ? parseInt(digits[0], 10) : null;
  }

  if (Array.isArray(v)) {
    // Keep looking past entries without a number, e.g. ["servings", "4"].
    for (const item of v) {
      const count = toServings(item);
      if (count !== null) return count;
    }
    return null;
  }

  if (v && typeof v === 'object') return toServings((v as JsonLdNode).value);

  return null;
}
|
||||
|
||||
/**
 * Locates the first Recipe node in the JSON-LD scripts of an HTML document.
 *
 * Every `<script type="application/ld+json">` element is visited in
 * document order. Its text is parsed as JSON, flattened with `unwrapGraph`,
 * and the first node whose `@type` passes `isRecipeType` is returned.
 * Scripts that are empty or fail to parse are skipped.
 *
 * @param html - Raw HTML markup.
 * @returns The first Recipe node found, or `null` when there is none.
 */
function findRecipeNode(html: string): JsonLdNode | null {
  const dom = parseHTML(html);
  const ldScripts = dom.document.querySelectorAll('script[type="application/ld+json"]');

  for (const ldScript of ldScripts) {
    const payload = ldScript.textContent;
    if (payload) {
      try {
        const match = unwrapGraph(JSON.parse(payload)).find((candidate) =>
          isRecipeType(candidate['@type'])
        );
        if (match) return match;
      } catch {
        // Malformed JSON-LD in this script: ignore it and try the next one.
      }
    }
  }

  return null;
}
|
||||
|
||||
/**
 * Builds a `Recipe` from the schema.org Recipe JSON-LD embedded in a page.
 *
 * The first Recipe node found by `findRecipeNode` is mapped field by field:
 * string entries of `recipeIngredient` go through `parseIngredient` with
 * their 1-based index, `recipeInstructions` through `toSteps`, the three
 * time fields through `parseIso8601Duration`, and category, cuisine and
 * keywords are merged into a de-duplicated tag list.
 *
 * @param html - Raw HTML markup of the recipe page.
 * @returns The extracted recipe, or `null` when the page has no Recipe node
 *   or the node has no usable name.
 */
export function extractRecipeFromHtml(html: string): Recipe | null {
  const recipeNode = findRecipeNode(html);
  if (recipeNode === null) return null;

  const title = toText(recipeNode.name) ?? '';
  if (title === '') return null;

  const rawIngredients = recipeNode.recipeIngredient;
  const ingredients = Array.isArray(rawIngredients)
    ? (rawIngredients as unknown[])
        .map((entry, index) =>
          typeof entry === 'string' ? parseIngredient(entry, index + 1) : null
        )
        .filter((parsed): parsed is NonNullable<typeof parsed> => parsed !== null)
    : [];

  const steps = toSteps(recipeNode.recipeInstructions);
  const imageUrl = toImageUrl(recipeNode.image);

  // Only string values are handed to the duration parser; anything else is
  // passed on as `undefined`.
  const durationOf = (value: unknown) =>
    parseIso8601Duration(typeof value === 'string' ? value : undefined);
  const prep = durationOf(recipeNode.prepTime);
  const cook = durationOf(recipeNode.cookTime);
  const total = durationOf(recipeNode.totalTime);

  // The Set removes duplicates while keeping first-seen order.
  const tagSet = new Set<string>();
  const tagSources = [recipeNode.recipeCategory, recipeNode.recipeCuisine, recipeNode.keywords];
  for (const source of tagSources) {
    for (const tag of toStringArray(source)) tagSet.add(tag);
  }

  return {
    id: null,
    title,
    description: toText(recipeNode.description),
    source_url: typeof recipeNode.url === 'string' ? recipeNode.url : null,
    source_domain: null,
    image_path: imageUrl,
    servings_default: toServings(recipeNode.recipeYield),
    servings_unit: null,
    prep_time_min: prep,
    cook_time_min: cook,
    total_time_min: total,
    cuisine: toText(recipeNode.recipeCuisine),
    category: toText(recipeNode.recipeCategory),
    ingredients,
    steps,
    tags: Array.from(tagSet)
  };
}
|
||||
122
tests/fixtures/chefkoch-schupfnudeln.html
vendored
Normal file
122
tests/fixtures/chefkoch-schupfnudeln.html
vendored
Normal file
File diff suppressed because one or more lines are too long
1941
tests/fixtures/emmi-bolognese.html
vendored
Normal file
1941
tests/fixtures/emmi-bolognese.html
vendored
Normal file
File diff suppressed because one or more lines are too long
44
tests/unit/json-ld-recipe.test.ts
Normal file
44
tests/unit/json-ld-recipe.test.ts
Normal file
@@ -0,0 +1,44 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { readFileSync } from 'node:fs';
|
||||
import { dirname, join } from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import { extractRecipeFromHtml } from '../../src/lib/server/parsers/json-ld-recipe';
|
||||
|
||||
// Directory of this test module, derived from its module URL.
const here = dirname(fileURLToPath(import.meta.url));
|
||||
|
||||
/**
 * Reads a fixture file as UTF-8 text.
 *
 * @param name - File name inside the `../fixtures` directory, resolved
 *   relative to this test module.
 * @returns The file contents.
 */
function load(name: string): string {
  const fixturePath = join(here, '../fixtures', name);
  return readFileSync(fixturePath, 'utf8');
}
|
||||
|
||||
describe('extractRecipeFromHtml', () => {
  /**
   * Runs the extractor on a fixture and returns the recipe. The current
   * test fails when nothing was extracted, so callers can use the result
   * without non-null assertions.
   */
  function extractFixture(name: string) {
    const recipe = extractRecipeFromHtml(load(name));
    expect(recipe).not.toBeNull();
    // Unreachable after the expectation above; present only to narrow the type.
    if (recipe === null) throw new Error(`no recipe extracted from ${name}`);
    return recipe;
  }

  it('extracts a recipe from Chefkoch HTML', () => {
    const recipe = extractFixture('chefkoch-schupfnudeln.html');
    expect(recipe.title.toLowerCase()).toContain('schupfnudel');
    expect(recipe.ingredients.length).toBeGreaterThan(2);
    expect(recipe.steps.length).toBeGreaterThan(0);
  });

  it('extracts a recipe from Emmi kocht einfach HTML', () => {
    const recipe = extractFixture('emmi-bolognese.html');
    expect(recipe.title.toLowerCase()).toContain('bolognese');
    expect(recipe.ingredients.length).toBeGreaterThan(0);
    expect(recipe.steps.length).toBeGreaterThan(0);
  });

  it('returns null when no Recipe JSON-LD present', () => {
    const result = extractRecipeFromHtml('<html><body><p>no recipe</p></body></html>');
    expect(result).toBeNull();
  });

  it('returns null when JSON-LD has only non-Recipe types', () => {
    // A valid JSON-LD script that describes an Organization, not a Recipe.
    const markup = `
<html><head>
<script type="application/ld+json">{"@context":"https://schema.org","@type":"Organization","name":"Foo"}</script>
</head></html>`;
    expect(extractRecipeFromHtml(markup)).toBeNull();
  });
});
|
||||
Reference in New Issue
Block a user