/** Plain-text helpers for embeddings and similarity (clipped HTML, Persian, Arabic, etc.). */

/** Size of one embedding API chunk, in UTF-16 code units (~2500 tokens, safe for Persian). */
export const EMBEDDING_CHUNK_CHARS = 6000

/** Number of code units shared by two consecutive chunks. */
export const EMBEDDING_CHUNK_OVERLAP = 300

/** @deprecated Use multi-chunk splitting instead — kept for backward compatibility. */
export const MAX_EMBEDDING_CHARS = EMBEDDING_CHUNK_CHARS

/**
 * Trailing `<p><small>…</small></p>` footer of a clipped note.
 * The inner part is not allowed to run across a `</small>`, so only the last
 * paragraph can match: an earlier `<p><small>` block never drags the content
 * that follows it into the match.
 */
const CLIP_FOOTER_PATTERN =
  /\s*<p\b[^>]*>\s*<small\b[^>]*>(?:(?!<\/small>)[\s\S])*<\/small>\s*<\/p>\s*$/i

/** `<script>` / `<style>` elements are removed together with their content. */
const SCRIPT_BLOCK_PATTERN = /<script\b[^>]*>[\s\S]*?<\/script\s*>/gi
const STYLE_BLOCK_PATTERN = /<style\b[^>]*>[\s\S]*?<\/style\s*>/gi

/** Named character references decoded by {@link stripHtmlToPlainText}. */
const NAMED_ENTITIES = new Map<string, string>([
  ['nbsp', ' '],
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
])

/** One supported named reference, or a hexadecimal / decimal numeric reference. */
const HTML_ENTITY_PATTERN = /&(?:(nbsp|amp|lt|gt|quot|apos)|#x([0-9a-f]+)|#(\d+));/gi

/** A word: starts with a letter or digit, may continue with letters, digits and combining marks. */
const WORD_PATTERN = /[\p{L}\p{N}][\p{L}\p{N}\p{M}]*/gu

/** Highest valid Unicode code point. */
const MAX_CODE_POINT = 0x10ffff

/**
 * Replacement callback for {@link HTML_ENTITY_PATTERN}.
 * Returns the decoded character, or the reference unchanged when a numeric
 * reference is outside the Unicode range.
 */
function decodeHtmlEntity(match: string, name?: string, hex?: string, dec?: string): string {
  if (name !== undefined) return NAMED_ENTITIES.get(name.toLowerCase()) ?? match
  const codePoint = hex !== undefined ? parseInt(hex, 16) : Number(dec)
  // fromCodePoint throws on out-of-range values; keep the raw text instead.
  return codePoint <= MAX_CODE_POINT ? String.fromCodePoint(codePoint) : match
}

/** True when cutting `text` at `index` would separate a high surrogate from its low surrogate. */
function splitsSurrogatePair(text: string, index: number): boolean {
  if (index <= 0 || index >= text.length) return false
  const before = text.charCodeAt(index - 1)
  const after = text.charCodeAt(index)
  return before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff
}

/**
 * Converts an HTML fragment to plain text.
 *
 * Steps, in order: drop `<script>` / `<style>` elements with their content,
 * replace every remaining tag by a space, decode character references, then
 * collapse whitespace runs to a single space and trim.
 *
 * Character references are decoded in ONE pass, so `&amp;lt;` yields the text
 * `&lt;` rather than `<`. Numeric references use full code points, so
 * characters above U+FFFF (emoji, etc.) are preserved.
 *
 * @param html HTML source; an empty value yields `''`.
 * @returns The whitespace-normalized plain text.
 */
export function stripHtmlToPlainText(html: string): string {
  if (!html) return ''
  return html
    .replace(SCRIPT_BLOCK_PATTERN, ' ')
    .replace(STYLE_BLOCK_PATTERN, ' ')
    .replace(/<[^>]+>/g, ' ')
    .replace(HTML_ENTITY_PATTERN, decodeHtmlEntity)
    .replace(/\s+/g, ' ')
    .trim()
}

/**
 * Removes the trailing "Extrait de…" footer from clipped notes (LTR noise for embeddings).
 *
 * @param html Note HTML; an empty value yields `''`.
 * @returns The trimmed HTML without its final `<p><small>…</small></p>` paragraph, if any.
 */
export function stripClipFooterFromHtml(html: string): string {
  if (!html) return ''
  return html.replace(CLIP_FOOTER_PATTERN, '').trim()
}

/** Heuristic: true when `text` contains `<` + a letter, followed somewhere later by `>`. */
export function looksLikeHtml(text: string): boolean {
  return /<[a-z][\s\S]*>/i.test(text)
}

/**
 * Title + whole body as plain text — no truncation (long articles go through
 * multi-chunk splitting).
 *
 * The clip footer is removed first; the body is then converted from HTML when
 * it looks like HTML, otherwise it is only trimmed.
 *
 * @param title Optional note title; blank titles are omitted.
 * @param content Note body (HTML or plain text).
 * @returns The non-empty parts joined by a blank line.
 */
export function prepareNoteTextForEmbedding(
  title: string | null | undefined,
  content: string,
): string {
  const withoutFooter = stripClipFooterFromHtml(content || '')
  const body = looksLikeHtml(withoutFooter)
    ? stripHtmlToPlainText(withoutFooter)
    : withoutFooter.trim()
  const parts = [title?.trim(), body].filter((part): part is string => Boolean(part))
  return parts.join('\n\n')
}

/**
 * Splits a long article into overlapping chunks so that the whole text gets embedded.
 *
 * Chunks are at most {@link EMBEDDING_CHUNK_CHARS} code units long and each one
 * starts {@link EMBEDDING_CHUNK_OVERLAP} code units before the end of the
 * previous one. A boundary that would fall inside a surrogate pair is moved
 * back by one code unit so no chunk starts or ends with a lone surrogate.
 *
 * @param text Plain text; it is trimmed first.
 * @returns `[]` for blank input, a single chunk when the text fits, otherwise the chunks in order.
 */
export function splitPlainTextForEmbeddingChunks(text: string): string[] {
  const normalized = text.trim()
  if (!normalized) return []
  if (normalized.length <= EMBEDDING_CHUNK_CHARS) return [normalized]

  const chunks: string[] = []
  let start = 0
  while (start < normalized.length) {
    let end = Math.min(start + EMBEDDING_CHUNK_CHARS, normalized.length)
    if (splitsSurrogatePair(normalized, end)) end -= 1
    chunks.push(normalized.slice(start, end))
    if (end >= normalized.length) break

    let next = end - EMBEDDING_CHUNK_OVERLAP
    if (splitsSurrogatePair(normalized, next)) next -= 1
    // `start + 1` guarantees forward progress whatever the constants are.
    start = Math.max(start + 1, next)
  }
  return chunks
}

/**
 * Mean + L2 normalization of several vectors (standard for long documents).
 *
 * A single input vector is returned as-is (same array, not re-normalized).
 * Only the first vector's length is iterated; every vector is expected to have
 * that same length.
 *
 * @param vectors Embedding vectors to pool.
 * @returns `[]` for no input, otherwise the pooled vector (left un-normalized when its norm is 0).
 */
export function meanPoolEmbeddingVectors(vectors: number[][]): number[] {
  if (vectors.length === 0) return []
  if (vectors.length === 1) return vectors[0]

  const dim = vectors[0].length
  const sums = new Array<number>(dim).fill(0)
  for (const vec of vectors) {
    for (let i = 0; i < dim; i++) sums[i] += vec[i]
  }

  const mean = sums.map((s) => s / vectors.length)
  const norm = Math.sqrt(mean.reduce((acc, x) => acc + x * x, 0))
  if (norm === 0) return mean
  return mean.map((x) => x / norm)
}

/** Content ready for text embedding (body only, backward compatible). */
export function prepareTextForEmbedding(content: string): string {
  return prepareNoteTextForEmbedding(null, content)
}

/**
 * Short UI preview — does NOT affect semantic similarity.
 *
 * @param title Optional note title.
 * @param content Note body (HTML or plain text).
 * @param maxLen Maximum number of code units kept before the ellipsis.
 * @returns The full plain text when it fits, otherwise its trimmed prefix followed by `…`.
 */
export function excerptPlainNoteContent(
  title: string | null | undefined,
  content: string,
  maxLen = 280,
): string {
  const plain = prepareNoteTextForEmbedding(title, content)
  if (!plain) return ''
  if (plain.length <= maxLen) return plain
  // Never cut between the two halves of a surrogate pair.
  const cut = splitsSurrogatePair(plain, maxLen) ? maxLen - 1 : maxLen
  return `${plain.slice(0, cut).trim()}…`
}

/**
 * Tokens for Jaccard similarity — every Unicode script (Persian, Arabic, Latin…).
 *
 * The text is lower-cased and NFKC-normalized; a token is a run of letters and
 * digits, with combining marks kept inside the token so vocalized words are
 * not broken apart.
 *
 * @param text Input text.
 * @param minLength Minimum token length, counted in code points.
 * @returns The set of distinct tokens.
 */
export function tokenizeForSimilarity(text: string, minLength = 2): Set<string> {
  const normalized = text.toLowerCase().normalize('NFKC')
  const words = normalized.match(WORD_PATTERN) ?? []
  return new Set(words.filter((w) => Array.from(w).length >= minLength))
}