feat(insights): fix DBSCAN, Persian embeddings crash, D3 physics layouts, and D3 node not found runtime error
Some checks failed
CI / Lint, Test & Build (push) Failing after 1m7s
CI / Deploy production (on server) (push) Has been skipped

This commit is contained in:
Antigravity
2026-05-24 18:57:33 +00:00
parent e2672cd2c2
commit e881004c77
63 changed files with 5729 additions and 563 deletions

View File

@@ -0,0 +1,109 @@
/** Plain text for embeddings and similarity (clipped HTML, Persian, Arabic, etc.). */
/** Size of one embedding API chunk, in characters (~2500 tokens, safe for Persian). */
export const EMBEDDING_CHUNK_CHARS = 6000
/** Number of characters by which the start of a chunk is pulled back from the end of the previous chunk. */
export const EMBEDDING_CHUNK_OVERLAP = 300
/** @deprecated Use multi-chunk splitting instead — kept for compatibility. */
export const MAX_EMBEDDING_CHARS = EMBEDDING_CHUNK_CHARS
/**
 * Matches a clip footer at the very end of an HTML string: an `<hr>` (optionally
 * self-closed) followed by a `<p>` whose content is a single `<small>…</small>`,
 * with optional whitespace in between. Case-insensitive.
 */
const CLIP_FOOTER_PATTERN =
/<hr\s*\/?>\s*<p[^>]*>\s*<small>[\s\S]*?<\/small>\s*<\/p>\s*$/i
/** Named character references decoded by {@link stripHtmlToPlainText}. */
const PLAIN_TEXT_NAMED_ENTITIES = new Map<string, string>([
  ['nbsp', ' '],
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  // Zero-width non-joiner: frequently written as an entity in Persian HTML.
  ['zwnj', '\u200C'],
])

/**
 * Decodes one character reference.
 *
 * @param entity - The full matched reference, e.g. `&amp;` or `&#x41;`.
 * @param body - The part between `&` and `;`, e.g. `amp` or `#x41`.
 * @returns The decoded text, or `entity` unchanged when the name is not known.
 */
function decodePlainTextEntity(entity: string, body: string): string {
  if (body.startsWith('#')) {
    const isHex = body[1] === 'x' || body[1] === 'X'
    const codePoint = isHex ? parseInt(body.slice(2), 16) : Number(body.slice(1))
    const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff
    // Out-of-range or surrogate references cannot be represented as a valid
    // character; emit U+FFFD instead of throwing or producing a lone surrogate.
    if (codePoint > 0x10ffff || isSurrogate) return '\uFFFD'
    // fromCodePoint (unlike fromCharCode) handles code points above U+FFFF.
    return String.fromCodePoint(codePoint)
  }
  const named = PLAIN_TEXT_NAMED_ENTITIES.get(body)
  if (named !== undefined) return named
  // `&nbsp;` is accepted in any letter case; other names are case-sensitive.
  return body.toLowerCase() === 'nbsp' ? ' ' : entity
}

/**
 * Converts an HTML fragment to plain text.
 *
 * Steps, in order: drop `<script>` and `<style>` elements with their content,
 * replace every remaining tag with a space, decode character references
 * (numeric ones plus the names in `PLAIN_TEXT_NAMED_ENTITIES`), collapse
 * whitespace runs to a single space and trim.
 *
 * Character references are decoded in a single pass so that escaped text such
 * as `&amp;lt;` yields the literal `&lt;` rather than being decoded twice.
 *
 * @param html - HTML source; an empty value yields `''`.
 * @returns The plain-text content.
 */
export function stripHtmlToPlainText(html: string): string {
  if (!html) return ''
  return html
    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[\s\S]*?<\/style>/gi, ' ')
    .replace(/<[^>]+>/g, ' ')
    .replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, decodePlainTextEntity)
    .replace(/\s+/g, ' ')
    .trim()
}
/**
 * Removes the trailing "Extrait de…" ("Extracted from…") footer of clipped
 * notes, which is left-to-right noise for embeddings.
 *
 * @param html - Note HTML; an empty value yields `''`.
 * @returns The HTML without a footer matched by `CLIP_FOOTER_PATTERN`, trimmed.
 */
export function stripClipFooterFromHtml(html: string): string {
  if (html) {
    const footerless = html.replace(CLIP_FOOTER_PATTERN, '')
    return footerless.trim()
  }
  return ''
}
/**
 * Heuristic HTML detection.
 *
 * @param text - Text to inspect.
 * @returns `true` when the text contains `<` immediately followed by an ASCII
 *   letter and a `>` somewhere after it.
 */
export function looksLikeHtml(text: string): boolean {
  const tagLike = /<[a-z][\s\S]*>/i
  return tagLike.test(text)
}
/**
 * Title plus the whole body as plain text — no truncation (long articles are
 * handled through multi-chunk splitting).
 *
 * The clip footer is removed first; the body is converted with
 * `stripHtmlToPlainText` when it looks like HTML and is only trimmed otherwise.
 *
 * @param title - Optional note title; blank titles are omitted.
 * @param content - Note body, HTML or plain text.
 * @returns The non-empty parts joined by a blank line.
 */
export function prepareNoteTextForEmbedding(
  title: string | null | undefined,
  content: string,
): string {
  const cleaned = stripClipFooterFromHtml(content || '')

  let body: string
  if (looksLikeHtml(cleaned)) {
    body = stripHtmlToPlainText(cleaned)
  } else {
    body = cleaned.trim()
  }

  const segments: string[] = []
  const heading = title?.trim()
  if (heading) segments.push(heading)
  if (body) segments.push(body)
  return segments.join('\n\n')
}
/**
 * Splits a long article into overlapping chunks so the whole text can be embedded.
 *
 * Text of at most `EMBEDDING_CHUNK_CHARS` UTF-16 code units is returned as a
 * single chunk. Otherwise each chunk holds up to `EMBEDDING_CHUNK_CHARS` code
 * units and the next chunk starts `EMBEDDING_CHUNK_OVERLAP` code units before
 * the end of the previous one. Chunk boundaries never fall between the two
 * halves of a surrogate pair, so no chunk starts or ends with a lone surrogate.
 *
 * @param text - Plain text to split; surrounding whitespace is trimmed.
 * @returns The chunks in document order, or `[]` for blank input.
 */
export function splitPlainTextForEmbeddingChunks(text: string): string[] {
  const normalized = text.trim()
  if (!normalized) return []
  if (normalized.length <= EMBEDDING_CHUNK_CHARS) return [normalized]

  // True when `index` sits between a high and a low surrogate.
  const splitsSurrogatePair = (index: number): boolean => {
    if (index <= 0 || index >= normalized.length) return false
    const before = normalized.charCodeAt(index - 1)
    const after = normalized.charCodeAt(index)
    return before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff
  }

  const chunks: string[] = []
  let start = 0
  while (start < normalized.length) {
    let end = Math.min(start + EMBEDDING_CHUNK_CHARS, normalized.length)
    // Shorten the chunk by one unit rather than cutting a character in half.
    if (end - 1 > start && splitsSurrogatePair(end)) end -= 1
    chunks.push(normalized.slice(start, end))
    if (end >= normalized.length) break
    // `start + 1` guarantees progress even if the overlap is not smaller than the chunk.
    let next = Math.max(start + 1, end - EMBEDDING_CHUNK_OVERLAP)
    // Moving forward keeps the loop advancing; the skipped low surrogate is
    // already part of the previous chunk's overlap region.
    if (splitsSurrogatePair(next)) next += 1
    start = next
  }
  return chunks
}
/**
 * Mean-pools several vectors and L2-normalizes the result (standard for long
 * documents).
 *
 * The dimension is taken from the first vector. A single input vector is
 * returned as-is (same array, not normalized); a zero-norm mean is returned
 * without normalization.
 *
 * @param vectors - Vectors to combine.
 * @returns `[]` for no input, otherwise the pooled vector.
 */
export function meanPoolEmbeddingVectors(vectors: number[][]): number[] {
  const count = vectors.length
  if (count === 0) return []
  if (count === 1) return vectors[0]

  const width = vectors[0].length
  const averaged = Array.from({ length: width }, (_, col) => {
    let total = 0
    for (const row of vectors) total += row[col]
    return total / count
  })

  const magnitude = Math.sqrt(
    averaged.reduce((acc, value) => acc + value * value, 0),
  )
  return magnitude === 0 ? averaged : averaged.map((value) => value / magnitude)
}
/**
 * Content ready for text embedding — body only, kept for backward compatibility.
 *
 * Delegates to `prepareNoteTextForEmbedding` with a `null` title.
 *
 * @param content - Note body.
 * @returns Whatever `prepareNoteTextForEmbedding(null, content)` returns.
 */
export function prepareTextForEmbedding(content: string): string {
return prepareNoteTextForEmbedding(null, content)
}
/**
 * Short preview for the UI — does NOT affect semantic similarity.
 *
 * Builds the plain text with `prepareNoteTextForEmbedding` and returns it whole
 * when it fits in `maxLen` UTF-16 code units. Otherwise the text is cut at
 * `maxLen` and trimmed; no ellipsis is appended. The cut never falls between
 * the two halves of a surrogate pair, so the preview cannot end in a broken
 * character.
 *
 * @param title - Optional note title.
 * @param content - Note body, HTML or plain text.
 * @param maxLen - Maximum preview length in UTF-16 code units (default 280).
 * @returns The preview, or `''` when the note has no text.
 */
export function excerptPlainNoteContent(
  title: string | null | undefined,
  content: string,
  maxLen = 280,
): string {
  const plain = prepareNoteTextForEmbedding(title, content)
  if (!plain) return ''
  if (plain.length <= maxLen) return plain

  let cut = maxLen
  if (cut > 0) {
    const before = plain.charCodeAt(cut - 1)
    const after = plain.charCodeAt(cut)
    const endsOnHighSurrogate = before >= 0xd800 && before <= 0xdbff
    const nextIsLowSurrogate = after >= 0xdc00 && after <= 0xdfff
    // Back off by one unit instead of keeping half of a character.
    if (endsOnHighSurrogate && nextIsLowSurrogate) cut -= 1
  }
  return plain.slice(0, cut).trim()
}
/**
 * Tokens for Jaccard similarity — works across Unicode scripts (Persian,
 * Arabic, Latin…).
 *
 * The text is NFKC-normalized and then lower-cased (in that order, so letters
 * produced by compatibility decomposition are lower-cased too). Tokens are
 * maximal runs of Unicode letters and numbers.
 *
 * @param text - Text to tokenize.
 * @param minLength - Minimum token length in UTF-16 code units (default 2).
 *   Values below 2 are honored, e.g. `1` keeps single-character tokens.
 * @returns The set of distinct tokens.
 */
export function tokenizeForSimilarity(text: string, minLength = 2): Set<string> {
  const normalized = text.normalize('NFKC').toLowerCase()
  // Match runs of any length; the length threshold is applied by the filter
  // below so that `minLength` is the single source of truth.
  const words = normalized.match(/[\p{L}\p{N}]+/gu) ?? []
  return new Set(words.filter((word) => word.length >= minLength))
}