feat(parser): add JSON-LD schema.org/Recipe extractor
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
159
src/lib/server/parsers/json-ld-recipe.ts
Normal file
159
src/lib/server/parsers/json-ld-recipe.ts
Normal file
@@ -0,0 +1,159 @@
|
|||||||
|
import { parseHTML } from 'linkedom';
|
||||||
|
import { parseIso8601Duration } from './iso8601-duration';
|
||||||
|
import { parseIngredient } from './ingredient';
|
||||||
|
import type { Recipe, Step } from '$lib/types';
|
||||||
|
|
||||||
|
/** A parsed JSON-LD object; property values stay `unknown` until a helper narrows them. */
type JsonLdNode = Record<string, unknown>;
|
||||||
|
|
||||||
|
/**
 * Flattens a parsed JSON-LD value into a flat list of candidate nodes.
 *
 * Arrays are expanded element by element, an object carrying a truthy
 * `@graph` is replaced by the flattened contents of that graph, and every
 * other object is kept as-is. Primitives and `null` contribute nothing.
 *
 * @param node - Value obtained from parsing a JSON-LD script block.
 * @returns The nodes found, in depth-first document order.
 */
function unwrapGraph(node: unknown): JsonLdNode[] {
  const found: JsonLdNode[] = [];

  const visit = (value: unknown): void => {
    if (Array.isArray(value)) {
      for (const entry of value) visit(entry);
      return;
    }
    if (typeof value !== 'object' || value === null) return;

    const candidate = value as JsonLdNode;
    const graph = candidate['@graph'];
    if (graph) {
      // The wrapper itself is not a node of interest; only its graph members are.
      visit(graph);
    } else {
      found.push(candidate);
    }
  };

  visit(node);
  return found;
}
|
||||||
|
|
||||||
|
/**
 * Reports whether a JSON-LD `@type` value designates a Recipe.
 *
 * Accepts the bare term (`Recipe`), an absolute IRI ending in `/Recipe`
 * (e.g. `https://schema.org/Recipe`) and a compact IRI ending in `:Recipe`
 * (e.g. `schema:Recipe`). Surrounding whitespace is ignored. When `@type` is
 * an array, one matching entry is enough.
 *
 * @param t - Raw `@type` value; may be of any shape.
 * @returns `true` if the value, or any array entry, names the Recipe type.
 */
function isRecipeType(t: unknown): boolean {
  if (Array.isArray(t)) return t.some((entry) => isRecipeType(entry));
  if (typeof t !== 'string') return false;

  const typeName = t.trim();
  return typeName === 'Recipe' || typeName.endsWith('/Recipe') || typeName.endsWith(':Recipe');
}
|
||||||
|
|
||||||
|
/**
 * Reduces a loosely-typed JSON-LD value to a single trimmed, non-empty string.
 *
 * - A string is trimmed; an empty result becomes `null`.
 * - An array yields the text of its first element that produces any.
 * - An object yields its `name`, or failing that its `text`, whichever is
 *   the first non-blank string.
 *
 * @param v - Raw property value.
 * @returns The extracted text, or `null` when nothing usable is present.
 *   An empty string is never returned.
 */
function toText(v: unknown): string | null {
  if (typeof v === 'string') return v.trim() || null;

  if (Array.isArray(v)) {
    // Skip unusable leading entries instead of giving up on the first one.
    for (const entry of v) {
      const text = toText(entry);
      if (text !== null) return text;
    }
    return null;
  }

  if (v && typeof v === 'object') {
    const o = v as JsonLdNode;
    for (const key of ['name', 'text'] as const) {
      const field = o[key];
      if (typeof field === 'string') {
        const trimmed = field.trim();
        // A blank `name` must not mask a usable `text`.
        if (trimmed) return trimmed;
      }
    }
  }

  return null;
}
|
||||||
|
|
||||||
|
/**
 * Extracts an image URL from a JSON-LD `image` value.
 *
 * - A string is trimmed; a blank string becomes `null`.
 * - An array yields the URL of its first element that produces one.
 * - An object yields its `url`, or failing that its `contentUrl`, whichever
 *   is the first non-blank string.
 *
 * @param v - Raw `image` property value.
 * @returns The URL string, or `null` when none can be found.
 */
function toImageUrl(v: unknown): string | null {
  if (typeof v === 'string') return v.trim() || null;

  if (Array.isArray(v)) {
    // Keep looking past entries that carry no usable URL.
    for (const entry of v) {
      const url = toImageUrl(entry);
      if (url !== null) return url;
    }
    return null;
  }

  if (v && typeof v === 'object') {
    const o = v as JsonLdNode;
    for (const key of ['url', 'contentUrl'] as const) {
      const candidate = o[key];
      if (typeof candidate === 'string') {
        const trimmed = candidate.trim();
        if (trimmed) return trimmed;
      }
    }
  }

  return null;
}
|
||||||
|
|
||||||
|
/**
 * Normalizes a JSON-LD value into a list of non-empty strings.
 *
 * - A string is split on commas; each part is trimmed and blanks are dropped.
 * - An array has each element converted with `toText`; elements that yield
 *   no text are dropped.
 * - Any other value is converted with `toText` and, if that yields text,
 *   returned as a one-element list. This keeps a single object value in step
 *   with what `toText` reports for the same property.
 *
 * @param v - Raw property value.
 * @returns The extracted strings; empty when nothing usable is present.
 */
function toStringArray(v: unknown): string[] {
  if (typeof v === 'string') {
    return v
      .split(',')
      .map((part) => part.trim())
      .filter(Boolean);
  }

  if (Array.isArray(v)) {
    return v
      .map((entry) => toText(entry))
      .filter((text): text is string => Boolean(text));
  }

  const single = toText(v);
  return single ? [single] : [];
}
|
||||||
|
|
||||||
|
/**
 * Flattens a `recipeInstructions` value into an ordered list of steps.
 *
 * Handled shapes:
 * - a string becomes one step (trimmed; blank strings are skipped);
 * - an array is processed element by element, in order;
 * - an object typed `HowToSection` with an `itemListElement` contributes
 *   its children, not its own text;
 * - any other object with a non-blank string `text` becomes one step;
 * - an object without usable `text` but with an `itemListElement`
 *   (for example an `ItemList` wrapper) contributes its children.
 *
 * `@type` may be a single string or an array of strings.
 *
 * @param v - Raw `recipeInstructions` value.
 * @returns Steps whose `position` counts up from 1 in emission order.
 */
function toSteps(v: unknown): Step[] {
  const out: Step[] = [];

  const pushStep = (raw: string): void => {
    const text = raw.trim();
    if (text) out.push({ position: out.length + 1, text });
  };

  const hasType = (obj: JsonLdNode, typeName: string): boolean => {
    const declared = obj['@type'];
    return Array.isArray(declared) ? declared.includes(typeName) : declared === typeName;
  };

  const walk = (x: unknown): void => {
    if (Array.isArray(x)) {
      for (const item of x) walk(item);
      return;
    }
    if (typeof x === 'string') {
      pushStep(x);
      return;
    }
    if (!x || typeof x !== 'object') return;

    const obj = x as JsonLdNode;
    const children = obj.itemListElement;
    const ownText = typeof obj.text === 'string' && obj.text.trim() ? obj.text : null;

    // Sections always defer to their children; other containers only when
    // they have no text of their own to offer.
    if (children && (ownText === null || hasType(obj, 'HowToSection'))) {
      walk(children);
      return;
    }
    if (ownText !== null) pushStep(ownText);
  };

  walk(v);
  return out;
}
|
||||||
|
|
||||||
|
/**
 * Derives a whole-number serving count from a `recipeYield` value.
 *
 * - A finite number is truncated toward zero.
 * - A string yields its first run of decimal digits (`"Serves 4-6"` gives 4).
 * - An array yields the count of its first element that produces one.
 * - An object (for example a QuantitativeValue) is resolved through its
 *   `value` property.
 *
 * @param v - Raw `recipeYield` value.
 * @returns The serving count, or `null` when none can be derived.
 */
function toServings(v: unknown): number | null {
  if (typeof v === 'number') return Number.isFinite(v) ? Math.trunc(v) : null;

  if (typeof v === 'string') {
    const digits = /\d+/.exec(v);
    return digits ? parseInt(digits[0], 10) : null;
  }

  if (Array.isArray(v)) {
    // Move past entries such as "a dozen" that contain no digits.
    for (const entry of v) {
      const count = toServings(entry);
      if (count !== null) return count;
    }
    return null;
  }

  if (v && typeof v === 'object') return toServings((v as JsonLdNode).value);

  return null;
}
|
||||||
|
|
||||||
|
/**
 * Locates the first Recipe node among the JSON-LD script blocks of a page.
 *
 * Each `<script type="application/ld+json">` element is visited in the order
 * returned by `querySelectorAll`. Blocks with no text are skipped, and a block
 * that throws while being parsed or inspected is ignored so later blocks still
 * get a chance.
 *
 * @param html - HTML document source.
 * @returns The first node whose `@type` passes `isRecipeType`, or `null`.
 */
function findRecipeNode(html: string): JsonLdNode | null {
  const { document } = parseHTML(html);
  const ldScripts = document.querySelectorAll('script[type="application/ld+json"]');

  for (const ldScript of ldScripts) {
    const payload = ldScript.textContent;
    if (!payload) continue;

    try {
      const recipe = unwrapGraph(JSON.parse(payload)).find((candidate) =>
        isRecipeType(candidate['@type'])
      );
      if (recipe) return recipe;
    } catch {
      // malformed JSON-LD, keep scanning
    }
  }

  return null;
}
|
||||||
|
|
||||||
|
/**
 * Builds a `Recipe` from the first Recipe node embedded in a page as JSON-LD.
 *
 * @param html - HTML document source to inspect.
 * @returns The extracted recipe, or `null` when the page has no Recipe node
 *   or the node has no usable `name`.
 */
export function extractRecipeFromHtml(html: string): Recipe | null {
  const node = findRecipeNode(html);
  if (node === null) return null;

  // A recipe without a title is treated as unusable.
  const title = toText(node.name);
  if (!title) return null;

  // Only string entries are parsed; the second argument is the entry's
  // 1-based index in the source list, so skipped entries leave gaps.
  const rawIngredients: unknown[] = Array.isArray(node.recipeIngredient)
    ? node.recipeIngredient
    : [];
  const ingredients = rawIngredients
    .map((entry, index) => (typeof entry === 'string' ? parseIngredient(entry, index + 1) : null))
    .filter((parsed): parsed is NonNullable<typeof parsed> => parsed !== null);

  const steps = toSteps(node.recipeInstructions);
  const imageUrl = toImageUrl(node.image);

  // Duration properties are forwarded only when they are strings; any other
  // value is replaced by `undefined` before parsing.
  const durationOf = (key: 'prepTime' | 'cookTime' | 'totalTime') => {
    const value = node[key];
    return parseIso8601Duration(typeof value === 'string' ? value : undefined);
  };
  const prep = durationOf('prepTime');
  const cook = durationOf('cookTime');
  const total = durationOf('totalTime');

  // A Set drops duplicates while keeping first-seen order:
  // categories, then cuisines, then keywords.
  const tags = new Set<string>();
  for (const source of [node.recipeCategory, node.recipeCuisine, node.keywords]) {
    for (const tag of toStringArray(source)) tags.add(tag);
  }

  return {
    id: null,
    title,
    description: toText(node.description),
    source_url: typeof node.url === 'string' ? node.url : null,
    source_domain: null,
    image_path: imageUrl,
    servings_default: toServings(node.recipeYield),
    servings_unit: null,
    prep_time_min: prep,
    cook_time_min: cook,
    total_time_min: total,
    cuisine: toText(node.recipeCuisine),
    category: toText(node.recipeCategory),
    ingredients,
    steps,
    tags: Array.from(tags)
  };
}
|
||||||
122
tests/fixtures/chefkoch-schupfnudeln.html
vendored
Normal file
122
tests/fixtures/chefkoch-schupfnudeln.html
vendored
Normal file
File diff suppressed because one or more lines are too long
1941
tests/fixtures/emmi-bolognese.html
vendored
Normal file
1941
tests/fixtures/emmi-bolognese.html
vendored
Normal file
File diff suppressed because one or more lines are too long
44
tests/unit/json-ld-recipe.test.ts
Normal file
44
tests/unit/json-ld-recipe.test.ts
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
import { describe, it, expect } from 'vitest';
|
||||||
|
import { readFileSync } from 'node:fs';
|
||||||
|
import { dirname, join } from 'node:path';
|
||||||
|
import { fileURLToPath } from 'node:url';
|
||||||
|
import { extractRecipeFromHtml } from '../../src/lib/server/parsers/json-ld-recipe';
|
||||||
|
|
||||||
|
// Directory containing this test module, derived from its module URL.
const here = dirname(fileURLToPath(import.meta.url));
|
||||||
|
|
||||||
|
/**
 * Reads a fixture file as UTF-8 text.
 *
 * @param name - File name inside the `../fixtures` directory, resolved relative to `here`.
 * @returns The file's contents.
 */
function load(name: string): string {
  const fixturePath = join(here, '../fixtures', name);
  return readFileSync(fixturePath, 'utf8');
}
|
||||||
|
|
||||||
|
// Runs the extractor against two vendored fixture pages and two inputs
// that must not yield a recipe.
describe('extractRecipeFromHtml', () => {
  it('extracts a recipe from Chefkoch HTML', () => {
    const recipe = extractRecipeFromHtml(load('chefkoch-schupfnudeln.html'));

    expect(recipe).not.toBeNull();
    expect(recipe!.title.toLowerCase()).toContain('schupfnudel');
    expect(recipe!.ingredients.length).toBeGreaterThan(2);
    expect(recipe!.steps.length).toBeGreaterThan(0);
  });

  it('extracts a recipe from Emmi kocht einfach HTML', () => {
    const recipe = extractRecipeFromHtml(load('emmi-bolognese.html'));

    expect(recipe).not.toBeNull();
    expect(recipe!.title.toLowerCase()).toContain('bolognese');
    expect(recipe!.ingredients.length).toBeGreaterThan(0);
    expect(recipe!.steps.length).toBeGreaterThan(0);
  });

  it('returns null when no Recipe JSON-LD present', () => {
    const page = '<html><body><p>no recipe</p></body></html>';

    expect(extractRecipeFromHtml(page)).toBeNull();
  });

  it('returns null when JSON-LD has only non-Recipe types', () => {
    // The only JSON-LD block describes an Organization, not a Recipe.
    const page = `
      <html><head>
      <script type="application/ld+json">{"@context":"https://schema.org","@type":"Organization","name":"Foo"}</script>
      </head></html>`;

    expect(extractRecipeFromHtml(page)).toBeNull();
  });
});
|
||||||
Reference in New Issue
Block a user