/** Plain-text helpers for embeddings and similarity (clipped HTML, Persian, Arabic, etc.). */
/**
 * Maximum length of one chunk sent to the embedding API, measured with
 * `String.prototype.length` (about 2500 tokens; chosen to stay safe for Persian text).
 */
export const EMBEDDING_CHUNK_CHARS = 6000

/** Number of characters repeated at the start of a chunk from the end of the previous one. */
export const EMBEDDING_CHUNK_OVERLAP = 300

/** @deprecated Use multi-chunk splitting instead — kept for backward compatibility. */
export const MAX_EMBEDDING_CHARS = EMBEDDING_CHUNK_CHARS
/**
 * Matches a trailing clip footer: an `<hr>` followed by a `<p>` that wraps a single
 * `<small>…</small>`, anchored at the end of the string (trailing whitespace allowed).
 *
 * The `<small>` body is matched with a tempered pattern that cannot run past its own
 * `</small>`. A plain lazy `[\s\S]*?` would let a match that starts at an earlier
 * `<hr><p><small>` inside the note stretch up to the real footer and swallow
 * everything in between.
 */
const CLIP_FOOTER_PATTERN =
  /<hr\s*\/?>\s*<p[^>]*>\s*<small>(?:(?!<\/small>)[\s\S])*<\/small>\s*<\/p>\s*$/i
/** Named entities decoded by `stripHtmlToPlainText`; any other name is left untouched. */
const NAMED_HTML_ENTITIES: ReadonlyMap<string, string> = new Map([
  ['nbsp', ' '],
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
])

/** One entity reference: hexadecimal (`&#x41;`), decimal (`&#65;`) or named (`&amp;`). */
const HTML_ENTITY_PATTERN = /&(?:#x([0-9a-f]+)|#(\d+)|([a-z]+));/gi

/** Decodes one entity match; returns it unchanged when the name is unknown or the code point is out of range. */
function decodeHtmlEntity(
  entity: string,
  hex: string | undefined,
  decimal: string | undefined,
  name: string | undefined,
): string {
  if (name !== undefined) {
    return NAMED_HTML_ENTITIES.get(name.toLowerCase()) ?? entity
  }
  const codePoint = hex !== undefined ? parseInt(hex, 16) : Number(decimal)
  // String.fromCodePoint throws a RangeError above U+10FFFF, so keep such references as-is.
  if (!Number.isInteger(codePoint) || codePoint > 0x10ffff) return entity
  return String.fromCodePoint(codePoint)
}

/**
 * Converts an HTML fragment to plain text.
 *
 * Steps, in order: drop `<script>` and `<style>` elements with their content,
 * replace every remaining tag with a space, decode entity references, then
 * collapse whitespace runs to a single space and trim.
 *
 * Entities are decoded in a single pass, so `&amp;lt;` yields the text `&lt;`
 * rather than `<`. Numeric references are decoded by code point, so characters
 * outside the Basic Multilingual Plane are preserved.
 *
 * @param html HTML source; an empty value yields `''`.
 * @returns The text content with normalized whitespace.
 */
export function stripHtmlToPlainText(html: string): string {
  if (!html) return ''
  return html
    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[\s\S]*?<\/style>/gi, ' ')
    .replace(/<[^>]+>/g, ' ')
    // Tags are removed first so that escaped markup such as `&lt;b&gt;` survives as text.
    .replace(HTML_ENTITY_PATTERN, decodeHtmlEntity)
    .replace(/\s+/g, ' ')
    .trim()
}
/**
 * Removes the trailing "Extrait de…" footer from clipped notes
 * (left-to-right noise for embeddings).
 *
 * @param html Note content; an empty value yields `''`.
 * @returns The content with the footer matched by `CLIP_FOOTER_PATTERN` removed, trimmed.
 */
export function stripClipFooterFromHtml(html: string): string {
  if (!html) return ''
  return html.replace(CLIP_FOOTER_PATTERN, '').trim()
}
/**
 * Heuristic HTML detection: true when the text contains a `<` immediately
 * followed by an ASCII letter, with a `>` somewhere after it.
 */
export function looksLikeHtml(text: string): boolean {
  return /<[a-z][\s\S]*>/i.test(text)
}
/**
 * Builds the text to embed for a note: title plus the whole body as plain text.
 * Nothing is truncated here (long articles go through multi-chunk splitting).
 *
 * The clip footer is removed first; the body is converted from HTML only when
 * it looks like HTML, otherwise it is just trimmed.
 *
 * @param title Optional note title; blank or missing titles are skipped.
 * @param content Note body, HTML or plain text.
 * @returns The non-empty parts joined by a blank line, or `''` when both are empty.
 */
export function prepareNoteTextForEmbedding(
  title: string | null | undefined,
  content: string,
): string {
  const withoutFooter = stripClipFooterFromHtml(content || '')
  const body = looksLikeHtml(withoutFooter)
    ? stripHtmlToPlainText(withoutFooter)
    : withoutFooter.trim()
  const parts = [title?.trim(), body].filter((part): part is string => Boolean(part))
  return parts.join('\n\n')
}
/** True when `index` falls between the two halves of a UTF-16 surrogate pair in `text`. */
function isInsideSurrogatePair(text: string, index: number): boolean {
  if (index <= 0 || index >= text.length) return false
  const before = text.charCodeAt(index - 1)
  const after = text.charCodeAt(index)
  return before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff
}

/**
 * Splits a long text into overlapping chunks so the whole document can be embedded.
 *
 * The text is trimmed first. Empty text yields `[]`, and text no longer than
 * `EMBEDDING_CHUNK_CHARS` yields a single chunk. Otherwise each chunk holds up to
 * `EMBEDDING_CHUNK_CHARS` characters and starts `EMBEDDING_CHUNK_OVERLAP`
 * characters before the end of the previous one.
 *
 * Chunk boundaries never cut a surrogate pair in half: a boundary that would do
 * so is moved by one code unit, so no chunk starts or ends with a lone surrogate.
 *
 * @param text Plain text to split.
 * @returns The chunks in document order.
 */
export function splitPlainTextForEmbeddingChunks(text: string): string[] {
  const normalized = text.trim()
  if (!normalized) return []
  if (normalized.length <= EMBEDDING_CHUNK_CHARS) return [normalized]

  const chunks: string[] = []
  let start = 0
  while (start < normalized.length) {
    let end = Math.min(start + EMBEDDING_CHUNK_CHARS, normalized.length)
    if (isInsideSurrogatePair(normalized, end)) {
      // Prefer a shorter chunk; only grow when the chunk would otherwise be a lone half.
      end = end - start > 1 ? end - 1 : end + 1
    }
    chunks.push(normalized.slice(start, end))
    if (end >= normalized.length) break

    // `start + 1` guarantees progress even if the overlap is not smaller than the chunk size.
    let next = Math.max(start + 1, end - EMBEDDING_CHUNK_OVERLAP)
    if (isInsideSurrogatePair(normalized, next)) {
      // Step back to include the whole pair, unless that would stall the loop.
      next = next - 1 > start ? next - 1 : next + 1
    }
    start = next
  }
  return chunks
}
/**
 * Pools several embedding vectors into one: component-wise mean followed by
 * L2 normalization.
 *
 * - No vectors: returns `[]`.
 * - A single vector: returns a copy of it unchanged (it is not re-normalized).
 * - A zero mean vector: returned as-is, since it cannot be normalized.
 *
 * @param vectors Vectors of identical length.
 * @returns The pooled vector; always a new array.
 * @throws RangeError when the vectors do not all have the same length, which
 *   would otherwise silently produce `NaN` components or drop trailing values.
 */
export function meanPoolEmbeddingVectors(vectors: number[][]): number[] {
  if (vectors.length === 0) return []
  const first = vectors[0]
  // Copy so callers can mutate the result without touching their input.
  if (vectors.length === 1) return [...first]

  const dim = first.length
  const sums = new Array<number>(dim).fill(0)
  for (const vec of vectors) {
    if (vec.length !== dim) {
      throw new RangeError(
        `Cannot pool embedding vectors of different lengths (${dim} and ${vec.length})`,
      )
    }
    for (let i = 0; i < dim; i++) sums[i] += vec[i]
  }

  const mean = sums.map((sum) => sum / vectors.length)
  const norm = Math.sqrt(mean.reduce((acc, x) => acc + x * x, 0))
  if (norm === 0) return mean
  return mean.map((x) => x / norm)
}
/**
 * Content ready for text embedding — body only, kept for backward compatibility.
 * Equivalent to `prepareNoteTextForEmbedding(null, content)`.
 */
export function prepareTextForEmbedding(content: string): string {
  return prepareNoteTextForEmbedding(null, content)
}
/**
 * Short plain-text preview for the UI — does NOT affect semantic similarity.
 *
 * The note is converted with `prepareNoteTextForEmbedding`. Text longer than
 * `maxLen` is cut, trimmed and suffixed with `…`. The cut never splits a
 * surrogate pair, so the preview cannot end with half of an emoji or other
 * astral character.
 *
 * @param title Optional note title.
 * @param content Note body, HTML or plain text.
 * @param maxLen Maximum number of characters kept before the ellipsis; negative
 *   or NaN values are treated as 0.
 * @returns The full text when it fits, otherwise the shortened text ending in `…`;
 *   `''` when the note has no text.
 */
export function excerptPlainNoteContent(
  title: string | null | undefined,
  content: string,
  maxLen = 280,
): string {
  const plain = prepareNoteTextForEmbedding(title, content)
  if (!plain) return ''

  // A negative limit would make `slice` count from the end and return almost everything.
  const limit = Math.max(0, Math.floor(maxLen)) || 0
  if (plain.length <= limit) return plain

  let cut = limit
  if (cut > 0) {
    const before = plain.charCodeAt(cut - 1)
    const after = plain.charCodeAt(cut)
    const splitsPair =
      before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff
    if (splitsPair) cut -= 1
  }
  return `${plain.slice(0, cut).trim()}…`
}
/** A word: one letter or digit followed by any run of letters, digits and combining marks. */
const SIMILARITY_WORD_PATTERN = /[\p{L}\p{N}][\p{L}\p{N}\p{M}]*/gu

/**
 * Tokens for Jaccard similarity, for any Unicode script (Persian, Arabic, Latin…).
 *
 * The text is NFKC-normalized and then lowercased. Lowercasing comes second
 * because normalization can itself produce uppercase letters. Words are runs of
 * letters and digits; combining marks stay attached to their word, so vocalized
 * Arabic or scripts with vowel signs are not broken into fragments.
 *
 * @param text Text to tokenize.
 * @param minLength Minimum token length (in UTF-16 code units) to keep.
 * @returns The set of distinct tokens.
 */
export function tokenizeForSimilarity(text: string, minLength = 2): Set<string> {
  const normalized = text.normalize('NFKC').toLowerCase()
  const words: string[] = normalized.match(SIMILARITY_WORD_PATTERN) ?? []
  return new Set(words.filter((word) => word.length >= minLength))
}