import { stripHtmlToPlainText, tokenizeForSimilarity } from '@/lib/text/plain-text'

/** Minimum plain-text length (in characters) a block must reach to be kept as a candidate. */
const MIN_BLOCK_CONTENT_LENGTH = 10

/** A candidate passage extracted from an HTML string. */
export interface ExtractedBlock {
  /** Value of the element's `data-id` attribute; empty string for passages extracted without one. */
  blockId: string
  /** Plain text of the element body, as returned by `stripHtmlToPlainText`. */
  content: string
}

/**
 * Extracts every `<p>`, `<h1>`–`<h6>`, `<blockquote>` and `<li>` element that
 * carries a `data-id="…"` attribute (double-quoted) and whose plain-text
 * content is at least {@link MIN_BLOCK_CONTENT_LENGTH} characters long.
 *
 * Matching is regex-based, not a real HTML parse: the element body ends at the
 * first closing tag from the same tag list, whichever tag it is.
 *
 * @param html - HTML source to scan.
 * @returns Extracted blocks in document order.
 */
export function extractBlocksFromHtml(html: string): ExtractedBlock[] {
  const blocks: ExtractedBlock[] = []
  // The lookahead after the tag name forces the name to end there, so that
  // `<pre …>`, `<picture …>` or `<link …>` are not mistaken for `<p>` / `<li>`.
  const regex =
    /<(?:p|h[1-6]|blockquote|li)(?=[\s\/>])[^>]*data-id="([^"]+)"[^>]*>([\s\S]*?)<\/(?:p|h[1-6]|blockquote|li)>/gi
  let match: RegExpExecArray | null
  while ((match = regex.exec(html)) !== null) {
    const blockId = match[1]
    const content = stripHtmlToPlainText(match[2])
    if (content.length >= MIN_BLOCK_CONTENT_LENGTH) {
      blocks.push({ blockId, content })
    }
  }
  return blocks
}

/**
 * Jaccard similarity of two strings, computed over the token sets produced by
 * `tokenizeForSimilarity`: `|A ∩ B| / |A ∪ B|`.
 *
 * @param a - First text.
 * @param b - Second text.
 * @returns A value in `[0, 1]`; `0` when either text yields no tokens.
 */
export function jaccardSimilarity(a: string, b: string): number {
  const tokensA = tokenizeForSimilarity(a)
  const tokensB = tokenizeForSimilarity(b)
  if (tokensA.size === 0 || tokensB.size === 0) return 0

  let shared = 0
  tokensA.forEach(token => {
    if (tokensB.has(token)) shared++
  })
  // |A ∪ B| = |A| + |B| - |A ∩ B|; never zero here because both sets are non-empty.
  return shared / (tokensA.size + tokensB.size - shared)
}

/**
 * Extracts every `<p>`, `<h1>`–`<h6>`, `<blockquote>`, `<li>`, `<td>`, `<th>`
 * and `<div>` element, regardless of attributes, whose plain-text content is at
 * least {@link MIN_BLOCK_CONTENT_LENGTH} characters long. Returned blocks have
 * an empty `blockId`.
 *
 * Matching is regex-based, not a real HTML parse: the element body ends at the
 * first closing tag from the same tag list, whichever tag it is.
 */
function extractPlainBlocksFromHtml(html: string): ExtractedBlock[] {
  const blocks: ExtractedBlock[] = []
  // The lookahead after the tag name forces the name to end there. Without it
  // `<pre>`, `<thead>` or `<link>` would open a match as `<p>` / `<th>` / `<li>`
  // and swallow everything up to the next unrelated closing tag.
  const regex =
    /<(?:p|h[1-6]|blockquote|li|td|th|div)(?=[\s\/>])[^>]*>([\s\S]*?)<\/(?:p|h[1-6]|blockquote|li|td|th|div)>/gi
  let match: RegExpExecArray | null
  while ((match = regex.exec(html)) !== null) {
    const content = stripHtmlToPlainText(match[1])
    if (content.length >= MIN_BLOCK_CONTENT_LENGTH) {
      blocks.push({ blockId: '', content })
    }
  }
  return blocks
}

/**
 * Returns the block whose content is most similar to `hint` according to
 * {@link jaccardSimilarity}.
 *
 * @param blocks - Candidate blocks, in priority order.
 * @param hint - Text to match against; when blank, the first block is returned.
 * @returns The best-scoring block (the earliest one on ties), or `null` when
 *   `blocks` is empty.
 */
function pickBestFromBlocks(blocks: ExtractedBlock[], hint: string): ExtractedBlock | null {
  if (blocks.length === 0) return null

  let best = blocks[0]
  if (!hint.trim()) return best

  let bestScore = jaccardSimilarity(hint, best.content)
  for (let i = 1; i < blocks.length; i++) {
    const candidate = blocks[i]
    const score = jaccardSimilarity(hint, candidate.content)
    // Strict comparison keeps the earliest block when scores are equal.
    if (score > bestScore) {
      best = candidate
      bestScore = score
    }
  }
  return best
}

/**
 * Picks, among the `data-id`-tagged blocks of `html`, the one most similar to `hint`.
 *
 * @param html - HTML source to scan.
 * @param hint - Text used to select the block; when blank, the first block is returned.
 * @returns The selected block, or `null` when no block qualifies.
 */
export function pickBestBlockForHint(html: string, hint: string): ExtractedBlock | null {
  return pickBestFromBlocks(extractBlocksFromHtml(html), hint)
}

/**
 * Fallback for notes whose HTML has no `data-id` attributes yet (static
 * citation, no live block): picks the plain passage most similar to `hint`.
 * The returned block always has an empty `blockId`.
 *
 * @param html - HTML source to scan.
 * @param hint - Text used to select the passage; when blank, the first passage is returned.
 * @returns The selected passage, or `null` when no passage qualifies.
 */
export function pickBestPlainPassageForHint(html: string, hint: string): ExtractedBlock | null {
  return pickBestFromBlocks(extractPlainBlocksFromHtml(html), hint)
}