import { stripHtmlToPlainText, tokenizeForSimilarity } from '@/lib/text/plain-text'
/** A text block produced by the HTML extractors in this file. */
export interface ExtractedBlock {
/** Value of the source element's `data-id` attribute; the plain-passage extractor sets it to `''`. */
blockId: string
/** The element's inner HTML after being passed through `stripHtmlToPlainText`. */
content: string
}
/**
 * Extracts the text blocks of an HTML string that carry a `data-id` attribute.
 *
 * Only `<p>`, `<h1>`–`<h6>`, `<blockquote>` and `<li>` elements are considered.
 * The inner HTML of each match is converted with `stripHtmlToPlainText`, and
 * blocks whose converted text has a length below 10 are dropped.
 *
 * @param html - HTML markup to scan.
 * @returns The qualifying blocks in the order they appear in `html`
 *          (empty array when nothing qualifies).
 */
export function extractBlocksFromHtml(html: string): ExtractedBlock[] {
  const minContentLength = 10
  const blocks: ExtractedBlock[] = []
  // `(?=[\s\/>])` pins the end of the tag name so that `<pre>`, `<link>`,
  // `<param>`, ... are not mistaken for `<p>` / `<li>` (which would attach the
  // wrong id to the following block's text). `\s` before `data-id` ensures we
  // match the attribute itself and not the tail of e.g. `data-parent-data-id`.
  // The regex is created per call because the `g` flag makes it stateful.
  const regex = /<(?:p|h[1-6]|blockquote|li)(?=[\s\/>])[^>]*\sdata-id="([^"]+)"[^>]*>([\s\S]*?)<\/(?:p|h[1-6]|blockquote|li)>/gi
  let match: RegExpExecArray | null
  while ((match = regex.exec(html)) !== null) {
    const blockId = match[1]
    const content = stripHtmlToPlainText(match[2])
    // Skip fragments too short to be a meaningful passage.
    if (content.length >= minContentLength) {
      blocks.push({ blockId, content })
    }
  }
  return blocks
}
/**
 * Computes the Jaccard similarity of two strings: the number of tokens they
 * share divided by the number of distinct tokens in either, using the token
 * sets produced by `tokenizeForSimilarity`.
 *
 * @param a - First text.
 * @param b - Second text.
 * @returns `|A ∩ B| / |A ∪ B|`, or `0` when either text yields no tokens.
 */
export function jaccardSimilarity(a: string, b: string): number {
  const leftTokens = tokenizeForSimilarity(a)
  const rightTokens = tokenizeForSimilarity(b)

  // With an empty side there is nothing to compare.
  if (leftTokens.size === 0 || rightTokens.size === 0) {
    return 0
  }

  let shared = 0
  leftTokens.forEach((token) => {
    if (rightTokens.has(token)) {
      shared += 1
    }
  })

  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const unionSize = leftTokens.size + rightTokens.size - shared
  return shared / unionSize
}
/**
 * Extracts text passages from HTML without requiring a `data-id` attribute.
 *
 * Considers `<p>`, `<h1>`–`<h6>`, `<blockquote>`, `<li>`, `<td>`, `<th>` and
 * `<div>` elements. The inner HTML of each match is converted with
 * `stripHtmlToPlainText`; passages whose converted text has a length below 10
 * are dropped. Every returned block has an empty `blockId`.
 *
 * @param html - HTML markup to scan.
 * @returns The qualifying passages in the order they appear in `html`.
 */
function extractPlainBlocksFromHtml(html: string): ExtractedBlock[] {
  const minContentLength = 10
  const blocks: ExtractedBlock[] = []
  // `(?=[\s\/>])` pins the end of the tag name so that `<thead>`, `<pre>`,
  // `<link>`, `<picture>`, ... are not mistaken for `<th>` / `<p>` / `<li>`,
  // which would merge unrelated markup into the next passage.
  // The regex is created per call because the `g` flag makes it stateful.
  const regex = /<(?:p|h[1-6]|blockquote|li|td|th|div)(?=[\s\/>])[^>]*>([\s\S]*?)<\/(?:p|h[1-6]|blockquote|li|td|th|div)>/gi
  let match: RegExpExecArray | null
  while ((match = regex.exec(html)) !== null) {
    const content = stripHtmlToPlainText(match[1])
    // Skip fragments too short to be a meaningful passage.
    if (content.length >= minContentLength) {
      blocks.push({ blockId: '', content })
    }
  }
  return blocks
}
/**
 * Selects the block whose content is most similar to `hint`, as scored by
 * `jaccardSimilarity`.
 *
 * @param blocks - Candidate blocks.
 * @param hint - Text to compare each block against.
 * @returns `null` when `blocks` is empty; the first block when `hint` is empty
 *          or whitespace-only; otherwise the highest-scoring block, with the
 *          earliest block winning ties.
 */
function pickBestFromBlocks(blocks: ExtractedBlock[], hint: string): ExtractedBlock | null {
  if (blocks.length === 0) {
    return null
  }
  // Without a usable hint there is nothing to score against.
  if (hint.trim() === '') {
    return blocks[0]
  }

  let winnerIndex = 0
  let winnerScore = jaccardSimilarity(hint, blocks[0].content)
  for (let i = 1; i < blocks.length; i += 1) {
    const candidateScore = jaccardSimilarity(hint, blocks[i].content)
    // Strict comparison: an equal score never displaces an earlier block.
    if (candidateScore > winnerScore) {
      winnerIndex = i
      winnerScore = candidateScore
    }
  }
  return blocks[winnerIndex]
}
/**
 * Finds the `data-id`-bearing block in `html` that best matches `hint`.
 *
 * @param html - HTML markup to search.
 * @param hint - Text the returned block should resemble.
 * @returns The best-matching block, or `null` when `html` yields no blocks.
 */
export function pickBestBlockForHint(html: string, hint: string): ExtractedBlock | null {
  const candidates = extractBlocksFromHtml(html)
  return pickBestFromBlocks(candidates, hint)
}
/**
 * Fallback for notes that have no `data-id` yet (static citation, no live block):
 * finds the plain passage in `html` that best matches `hint`.
 *
 * @param html - HTML markup to search.
 * @param hint - Text the returned passage should resemble.
 * @returns The best-matching passage (its `blockId` is `''`), or `null` when
 *          `html` yields no passages.
 */
export function pickBestPlainPassageForHint(html: string, hint: string): ExtractedBlock | null {
  const passages = extractPlainBlocksFromHtml(html)
  return pickBestFromBlocks(passages, hint)
}