feat(search): Microdata-Fallback erkennt rezeptwelt & Co.
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m15s
All checks were successful
Build & Publish Docker Image / build-and-push (push) Successful in 1m15s
Aus dem Log (q="Königsberger klopse"): 11 rezeptwelt-Treffer kamen durch alle URL-Filter, wurden aber von hasRecipeJsonLd als non-recipe gedroppt. Ursache: rezeptwelt.de nutzt Microdata (itemtype=schema.org/Recipe) statt application/ld+json. - hasRecipeJsonLd → hasRecipeMarkup: prüft jetzt zusätzlich per Regex auf itemtype=(https?://)schema.org/Recipe. Alter Export bleibt als Deprecated-Weiterleitung erhalten. - Das Log zeigt jetzt auch die ersten 3 gedroppten URLs als dropped samples, damit neue Problem-Domains einfach zu diagnostizieren sind. - Migration 010 räumt alle thumbnail_cache-Einträge mit has_recipe=0 aus — die waren mit dem alten Check falsch-negativ und müssen neu klassifiziert werden. Tests: 4 neue Cases für hasRecipeMarkup (JSON-LD, http/https Microdata, Negativ-Fall). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,6 @@
|
|||||||
|
-- The recipe detector now checks Microdata (itemtype=schema.org/Recipe) in
-- addition to JSON-LD. The thumbnail cache may therefore contain
-- has_recipe=0 rows that were false negatives under the old JSON-LD-only
-- check (e.g. rezeptwelt.de, which uses microdata instead of JSON-LD).
-- Flush them once so those pages get re-classified on the next search.
-- Pure cache flush — no user data is touched.
DELETE FROM thumbnail_cache WHERE has_recipe = 0;
|
||||||
@@ -106,7 +106,13 @@ function findRecipeNode(html: string): JsonLdNode | null {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function hasRecipeJsonLd(html: string): boolean {
|
// Microdata-Alternative zum JSON-LD: viele SSR-Sites (inkl. rezeptwelt.de)
|
||||||
|
// nutzen <div itemtype="https://schema.org/Recipe"> statt application/ld+json.
|
||||||
|
// Ein einfacher Regex reicht — wir brauchen nur das Flag, nicht die Daten.
|
||||||
|
const MICRODATA_RECIPE = /itemtype\s*=\s*["']https?:\/\/schema\.org\/Recipe["']/i;
|
||||||
|
|
||||||
|
export function hasRecipeMarkup(html: string): boolean {
|
||||||
|
if (MICRODATA_RECIPE.test(html)) return true;
|
||||||
try {
|
try {
|
||||||
return findRecipeNode(html) !== null;
|
return findRecipeNode(html) !== null;
|
||||||
} catch {
|
} catch {
|
||||||
@@ -114,6 +120,11 @@ export function hasRecipeJsonLd(html: string): boolean {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// @deprecated use hasRecipeMarkup
|
||||||
|
export function hasRecipeJsonLd(html: string): boolean {
|
||||||
|
return hasRecipeMarkup(html);
|
||||||
|
}
|
||||||
|
|
||||||
export function extractRecipeFromHtml(html: string): Recipe | null {
|
export function extractRecipeFromHtml(html: string): Recipe | null {
|
||||||
const node = findRecipeNode(html);
|
const node = findRecipeNode(html);
|
||||||
if (!node) return null;
|
if (!node) return null;
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import type Database from 'better-sqlite3';
|
|||||||
import { parseHTML } from 'linkedom';
|
import { parseHTML } from 'linkedom';
|
||||||
import { listDomains, normalizeDomain } from '../domains/repository';
|
import { listDomains, normalizeDomain } from '../domains/repository';
|
||||||
import { fetchText } from '../http';
|
import { fetchText } from '../http';
|
||||||
import { hasRecipeJsonLd } from '../parsers/json-ld-recipe';
|
import { hasRecipeMarkup } from '../parsers/json-ld-recipe';
|
||||||
|
|
||||||
export type WebHit = {
|
export type WebHit = {
|
||||||
url: string;
|
url: string;
|
||||||
@@ -235,7 +235,7 @@ async function enrichPageMeta(
|
|||||||
});
|
});
|
||||||
meta = {
|
meta = {
|
||||||
image: extractPageImage(html, url),
|
image: extractPageImage(html, url),
|
||||||
hasRecipe: hasRecipeJsonLd(html) ? 1 : 0
|
hasRecipe: hasRecipeMarkup(html) ? 1 : 0
|
||||||
};
|
};
|
||||||
} catch {
|
} catch {
|
||||||
// Fetch failed — leave hasRecipe null (unknown) so we don't permanently
|
// Fetch failed — leave hasRecipe null (unknown) so we don't permanently
|
||||||
@@ -363,9 +363,19 @@ export async function searchWeb(
|
|||||||
);
|
);
|
||||||
if (opts.enrichThumbnails !== false) {
|
if (opts.enrichThumbnails !== false) {
|
||||||
const enriched = await enrichAndFilterHits(db, hits);
|
const enriched = await enrichAndFilterHits(db, hits);
|
||||||
|
const droppedUrls = hits
|
||||||
|
.filter((h) => !enriched.find((e) => e.url === h.url))
|
||||||
|
.map((h) => h.url);
|
||||||
console.log(
|
console.log(
|
||||||
`[searxng] q=${JSON.stringify(trimmed)} pageno=${pageno} enrich=${hits.length} dropped_non_recipe=${hits.length - enriched.length} final=${enriched.length}`
|
`[searxng] q=${JSON.stringify(trimmed)} pageno=${pageno} enrich=${hits.length} dropped_non_recipe=${droppedUrls.length} final=${enriched.length}`
|
||||||
);
|
);
|
||||||
|
// Nur die ersten 3 URLs mitloggen, damit das Log nicht explodiert. Genug
|
||||||
|
// um eine Seite manuell zu analysieren („warum wurde die abgelehnt?").
|
||||||
|
if (droppedUrls.length > 0) {
|
||||||
|
console.log(
|
||||||
|
`[searxng] dropped samples: ${droppedUrls.slice(0, 3).join(' | ')}`
|
||||||
|
);
|
||||||
|
}
|
||||||
return enriched;
|
return enriched;
|
||||||
}
|
}
|
||||||
return hits;
|
return hits;
|
||||||
|
|||||||
@@ -2,7 +2,10 @@ import { describe, it, expect } from 'vitest';
|
|||||||
import { readFileSync } from 'node:fs';
|
import { readFileSync } from 'node:fs';
|
||||||
import { dirname, join } from 'node:path';
|
import { dirname, join } from 'node:path';
|
||||||
import { fileURLToPath } from 'node:url';
|
import { fileURLToPath } from 'node:url';
|
||||||
import { extractRecipeFromHtml } from '../../src/lib/server/parsers/json-ld-recipe';
|
import {
|
||||||
|
extractRecipeFromHtml,
|
||||||
|
hasRecipeMarkup
|
||||||
|
} from '../../src/lib/server/parsers/json-ld-recipe';
|
||||||
|
|
||||||
const here = dirname(fileURLToPath(import.meta.url));
|
const here = dirname(fileURLToPath(import.meta.url));
|
||||||
|
|
||||||
@@ -42,3 +45,31 @@ describe('extractRecipeFromHtml', () => {
|
|||||||
expect(extractRecipeFromHtml(html)).toBeNull();
|
expect(extractRecipeFromHtml(html)).toBeNull();
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe('hasRecipeMarkup', () => {
|
||||||
|
it('detects JSON-LD Recipe', () => {
|
||||||
|
const html = `<html><head>
|
||||||
|
<script type="application/ld+json">{"@type":"Recipe","name":"x"}</script>
|
||||||
|
</head></html>`;
|
||||||
|
expect(hasRecipeMarkup(html)).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('detects schema.org/Recipe microdata', () => {
|
||||||
|
const html = `<html><body>
|
||||||
|
<div itemscope itemtype="https://schema.org/Recipe">
|
||||||
|
<span itemprop="name">Königsberger Klopse</span>
|
||||||
|
</div>
|
||||||
|
</body></html>`;
|
||||||
|
expect(hasRecipeMarkup(html)).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('also detects http:// variant of schema.org/Recipe', () => {
|
||||||
|
const html = `<div itemscope itemtype="http://schema.org/Recipe"></div>`;
|
||||||
|
expect(hasRecipeMarkup(html)).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
it('returns false for pages without any recipe markup', () => {
|
||||||
|
const html = '<html><body><p>nothing here</p></body></html>';
|
||||||
|
expect(hasRecipeMarkup(html)).toBe(false);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|||||||
Reference in New Issue
Block a user