68 lines
2.2 KiB
TypeScript
68 lines
2.2 KiB
TypeScript
import { stripHtmlToPlainText, tokenizeForSimilarity } from '@/lib/text/plain-text'
|
|
|
|
/** A block-level passage taken from a note's HTML and reduced to plain text. */
export interface ExtractedBlock {
  /**
   * Value of the source element's `data-id` attribute. The plain-passage
   * extractor in this module fills it with the empty string instead.
   */
  blockId: string
  /** Text of the element as returned by `stripHtmlToPlainText`. */
  content: string
}
|
|
|
|
/**
 * Extracts the identified text blocks from an HTML string.
 *
 * A block is a `<p>`, `<h1>`-`<h6>`, `<blockquote>` or `<li>` element whose
 * opening tag carries a non-empty, double-quoted `data-id` attribute. Tag
 * names are matched case-insensitively. The block's markup runs from the
 * opening tag to the first closing tag of any of those element types (the
 * closing tag is not required to match the opening one), and is converted to
 * text with `stripHtmlToPlainText`.
 *
 * @param html - Markup to scan.
 * @returns The blocks in the order they occur in `html`. Blocks whose plain
 *   text is shorter than 10 characters are left out.
 */
export function extractBlocksFromHtml(html: string): ExtractedBlock[] {
  // Plain text shorter than this is not reported as a block.
  const minContentLength = 10
  // `(?=\s)` pins the end of the tag name. Without it `<p` also matches
  // `<pre …>`, `<param …>`, `<picture …>` and `<li` matches `<link …>`, so an
  // unrelated element would be captured together with everything up to the
  // next closing block tag. An opening tag that has attributes always has
  // whitespace right after its name, so no valid block is lost.
  // The literal is created per call, so the global regex starts at lastIndex 0.
  const blockPattern = /<(?:p|h[1-6]|blockquote|li)(?=\s)[^>]*data-id="([^"]+)"[^>]*>([\s\S]*?)<\/(?:p|h[1-6]|blockquote|li)>/gi

  const blocks: ExtractedBlock[] = []
  let match: RegExpExecArray | null
  while ((match = blockPattern.exec(html)) !== null) {
    const blockId = match[1]
    const content = stripHtmlToPlainText(match[2])
    if (content.length >= minContentLength) {
      blocks.push({ blockId, content })
    }
  }
  return blocks
}
|
|
|
|
/**
 * Jaccard similarity of two strings, computed on the token sets produced by
 * `tokenizeForSimilarity`: the number of shared tokens divided by the number
 * of distinct tokens across both sets.
 *
 * @param a - First text.
 * @param b - Second text.
 * @returns The shared-to-distinct token ratio, or `0` when either token set
 *   is empty.
 */
export function jaccardSimilarity(a: string, b: string): number {
  const leftTokens = tokenizeForSimilarity(a)
  const rightTokens = tokenizeForSimilarity(b)

  const eitherSideEmpty = leftTokens.size === 0 || rightTokens.size === 0
  if (eitherSideEmpty) {
    return 0
  }

  let shared = 0
  leftTokens.forEach((token) => {
    if (rightTokens.has(token)) {
      shared += 1
    }
  })

  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const unionSize = leftTokens.size + rightTokens.size - shared
  return shared / unionSize
}
|
|
|
|
/**
 * Extracts text passages from HTML that carries no `data-id` attributes.
 *
 * Every `<p>`, `<h1>`-`<h6>`, `<blockquote>`, `<li>`, `<td>`, `<th>` or
 * `<div>` element (matched case-insensitively) is a candidate. Its markup
 * runs from the opening tag to the first closing tag of any of those element
 * types (the closing tag is not required to match the opening one), and is
 * converted to text with `stripHtmlToPlainText`.
 *
 * @param html - Markup to scan.
 * @returns The passages in the order they occur in `html`, each with an empty
 *   `blockId`. Passages whose plain text is shorter than 10 characters are
 *   left out.
 */
function extractPlainBlocksFromHtml(html: string): ExtractedBlock[] {
  // Plain text shorter than this is not reported as a passage.
  const minContentLength = 10
  // `(?=[\s/>])` pins the end of the tag name. Without it `<p` also matches
  // `<pre>`, `<path>`, `<picture>`; `<li` matches `<link>`; and `<th` matches
  // `<thead>`, so unrelated elements would be treated as passage openers.
  // The literal is created per call, so the global regex starts at lastIndex 0.
  const passagePattern = /<(?:p|h[1-6]|blockquote|li|td|th|div)(?=[\s/>])[^>]*>([\s\S]*?)<\/(?:p|h[1-6]|blockquote|li|td|th|div)>/gi

  const blocks: ExtractedBlock[] = []
  let match: RegExpExecArray | null
  while ((match = passagePattern.exec(html)) !== null) {
    const content = stripHtmlToPlainText(match[1])
    if (content.length >= minContentLength) {
      blocks.push({ blockId: '', content })
    }
  }
  return blocks
}
|
|
|
|
/**
 * Chooses the block whose content is most similar to `hint`, scored with
 * `jaccardSimilarity`.
 *
 * @param blocks - Candidate blocks, in priority order.
 * @param hint - Text to compare each block against.
 * @returns `null` when there are no candidates; the first block when `hint`
 *   is empty or whitespace-only; otherwise the highest-scoring block, with
 *   the earliest block winning ties.
 */
function pickBestFromBlocks(blocks: ExtractedBlock[], hint: string): ExtractedBlock | null {
  if (blocks.length === 0) {
    return null
  }

  const hintIsBlank = hint.trim().length === 0
  if (hintIsBlank) {
    return blocks[0]
  }

  const [first, ...others] = blocks
  let winner = first
  let winnerScore = jaccardSimilarity(hint, first.content)

  others.forEach((candidate) => {
    const candidateScore = jaccardSimilarity(hint, candidate.content)
    // Strict comparison: a later block must beat the current winner, not tie it.
    if (candidateScore > winnerScore) {
      winner = candidate
      winnerScore = candidateScore
    }
  })

  return winner
}
|
|
|
|
/**
 * Finds the `data-id`-tagged block in `html` that best matches `hint`.
 *
 * @param html - Markup to scan with `extractBlocksFromHtml`.
 * @param hint - Text the chosen block should resemble.
 * @returns The block selected by `pickBestFromBlocks`, or `null` when no
 *   block was extracted.
 */
export function pickBestBlockForHint(html: string, hint: string): ExtractedBlock | null {
  const identifiedBlocks = extractBlocksFromHtml(html)
  return pickBestFromBlocks(identifiedBlocks, hint)
}
|
|
|
|
/**
 * Fallback for notes that have no `data-id` yet (static citation, no live
 * block): finds the passage in `html` that best matches `hint`.
 *
 * @param html - Markup to scan with `extractPlainBlocksFromHtml`.
 * @param hint - Text the chosen passage should resemble.
 * @returns The passage selected by `pickBestFromBlocks` (its `blockId` is the
 *   empty string), or `null` when no passage was extracted.
 */
export function pickBestPlainPassageForHint(html: string, hint: string): ExtractedBlock | null {
  const plainPassages = extractPlainBlocksFromHtml(html)
  return pickBestFromBlocks(plainPassages, hint)
}
|