feat(insights): fix DBSCAN, Persian embeddings crash, D3 physics layouts, and D3 node not found runtime error

2026-05-24 18:57:33 +00:00
parent e2672cd2c2
commit e881004c77
63 changed files with 5729 additions and 563 deletions
--- a/memento-note/lib/text/plain-text.ts
+++ b/memento-note/lib/text/plain-text.ts
@@ -0,0 +1,109 @@
+/** Plain-text helpers for embeddings and similarity (clipped HTML, Persian, Arabic, etc.). */
+
+/** Size, in characters, of one chunk sent to the embedding API (~2500 tokens, safe for Persian). */
+export const EMBEDDING_CHUNK_CHARS = 6000
+/** Number of characters that consecutive chunks share. */
+export const EMBEDDING_CHUNK_OVERLAP = 300
+
+/** @deprecated Use multi-chunk splitting instead — kept for backward compatibility. */
+export const MAX_EMBEDDING_CHARS = EMBEDDING_CHUNK_CHARS
+
+/**
+ * Matches the trailing clip footer: an `<hr>` followed by a single
+ * `<p><small>…</small></p>` at the very end of the HTML.
+ *
+ * The footer text is matched with a tempered token so it can never run past a
+ * `</small>`. Without that, an earlier `<hr><p><small>` inside the note would
+ * anchor the match and everything from there to the end would be removed.
+ */
+const CLIP_FOOTER_PATTERN =
+  /<hr\s*\/?>\s*<p[^>]*>\s*<small>(?:(?!<\/small>)[\s\S])*<\/small>\s*<\/p>\s*$/i
+
+/** Named entities decoded by `stripHtmlToPlainText`; names are matched exactly. */
+const NAMED_HTML_ENTITIES: ReadonlyMap<string, string> = new Map([
+  ['amp', '&'],
+  ['lt', '<'],
+  ['gt', '>'],
+  ['quot', '"'],
+  ['apos', "'"],
+])
+
+/** One hex, decimal or named character reference, e.g. `&#x62A;`, `&#1578;`, `&amp;`. */
+const HTML_ENTITY_PATTERN = /&(?:#x([0-9a-f]+)|#(\d+)|([a-z]+));/gi
+
+/** Converts a code point to text, substituting U+FFFD for zero, surrogates and out-of-range values. */
+function codePointToText(codePoint: number): string {
+  const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff
+  if (codePoint <= 0 || codePoint > 0x10ffff || isSurrogate) return '\uFFFD'
+  return String.fromCodePoint(codePoint)
+}
+
+/**
+ * Reduces an HTML fragment to whitespace-normalized plain text.
+ *
+ * `<script>` and `<style>` elements are dropped with their content, remaining
+ * tags are replaced by spaces, character references are decoded, and runs of
+ * whitespace are collapsed to a single space.
+ *
+ * @param html - HTML source; a falsy value yields an empty string.
+ * @returns The trimmed plain text.
+ */
+export function stripHtmlToPlainText(html: string): string {
+  if (!html) return ''
+  return html
+    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
+    .replace(/<style[\s\S]*?<\/style>/gi, ' ')
+    .replace(/<[^>]+>/g, ' ')
+    // Decode every reference in ONE pass: decoding `&amp;` in an earlier pass
+    // would turn `&amp;lt;` into `&lt;` and then wrongly into `<`.
+    .replace(HTML_ENTITY_PATTERN, (match: string, hex?: string, dec?: string, name?: string) => {
+      if (hex !== undefined) return codePointToText(parseInt(hex, 16))
+      if (dec !== undefined) return codePointToText(Number(dec))
+      if (name === undefined) return match
+      if (name.toLowerCase() === 'nbsp') return ' '
+      // Unknown names are left untouched rather than guessed at.
+      return NAMED_HTML_ENTITIES.get(name) ?? match
+    })
+    .replace(/\s+/g, ' ')
+    .trim()
+}
+
+/**
+ * Removes the trailing "Extrait de…" footer from clipped notes, which is
+ * left-to-right noise for embeddings.
+ *
+ * @param html - Note HTML; a falsy value yields an empty string.
+ * @returns The HTML without the footer matched by `CLIP_FOOTER_PATTERN`, trimmed.
+ */
+export function stripClipFooterFromHtml(html: string): string {
+  const cleaned = html ? html.replace(CLIP_FOOTER_PATTERN, '').trim() : ''
+  return cleaned
+}
+
+/**
+ * Heuristic markup check: true when `text` contains a `<` directly followed by
+ * an ASCII letter and, somewhere after it, a `>`.
+ */
+export function looksLikeHtml(text: string): boolean {
+  const tagLike = /<[a-z][\s\S]*>/i
+  return tagLike.exec(text) !== null
+}
+
+/**
+ * Builds the plain text used for a note's embedding: title plus the whole body,
+ * with no truncation (long articles are handled by multi-chunk splitting).
+ *
+ * @param title - Optional note title; blank or missing titles are skipped.
+ * @param content - Note body, either HTML or already plain text.
+ * @returns The non-empty parts joined by a blank line.
+ */
+export function prepareNoteTextForEmbedding(
+  title: string | null | undefined,
+  content: string,
+): string {
+  const stripped = stripClipFooterFromHtml(content || '')
+
+  let body: string
+  if (looksLikeHtml(stripped)) {
+    body = stripHtmlToPlainText(stripped)
+  } else {
+    body = stripped.trim()
+  }
+
+  const pieces: string[] = []
+  const heading = title?.trim()
+  if (heading) pieces.push(heading)
+  if (body) pieces.push(body)
+  return pieces.join('\n\n')
+}
+
+/**
+ * Splits long text into overlapping chunks so the whole document can be embedded.
+ *
+ * Each chunk is at most `EMBEDDING_CHUNK_CHARS` UTF-16 code units long, and the
+ * next chunk starts `EMBEDDING_CHUNK_OVERLAP` code units before the end of the
+ * previous one. A boundary that would fall between the two halves of a
+ * surrogate pair is moved by one code unit, so no chunk starts or ends with
+ * half of an astral character (emoji, rare CJK).
+ *
+ * @param text - Plain text; it is trimmed before splitting.
+ * @returns No chunks for blank input, one chunk when the text fits, otherwise
+ *   the chunks in document order.
+ */
+export function splitPlainTextForEmbeddingChunks(text: string): string[] {
+  const normalized = text.trim()
+  if (!normalized) return []
+  if (normalized.length <= EMBEDDING_CHUNK_CHARS) return [normalized]
+
+  // True when `index` sits between a high surrogate and its low surrogate.
+  const splitsSurrogatePair = (index: number): boolean => {
+    if (index <= 0 || index >= normalized.length) return false
+    const before = normalized.charCodeAt(index - 1)
+    const after = normalized.charCodeAt(index)
+    return before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff
+  }
+
+  const chunks: string[] = []
+  let start = 0
+  while (start < normalized.length) {
+    let end = Math.min(start + EMBEDDING_CHUNK_CHARS, normalized.length)
+    // End one unit earlier rather than leave a lone high surrogate (never empties the chunk).
+    if (splitsSurrogatePair(end) && end - 1 > start) end -= 1
+    chunks.push(normalized.slice(start, end))
+    if (end >= normalized.length) break
+    // Always advance by at least one unit so the loop terminates.
+    let next = Math.max(start + 1, end - EMBEDDING_CHUNK_OVERLAP)
+    // Start one unit later rather than begin with a lone low surrogate.
+    if (splitsSurrogatePair(next)) next += 1
+    start = next
+  }
+  return chunks
+}
+
+/**
+ * Mean-pools several embedding vectors and L2-normalizes the result.
+ *
+ * @param vectors - Vectors of identical length.
+ * @returns `[]` for no vectors; the sole vector itself (same array, untouched)
+ *   when only one is given; otherwise the unit-length mean, or the raw mean
+ *   when its norm is 0.
+ * @throws RangeError if the vectors do not all have the same length.
+ */
+export function meanPoolEmbeddingVectors(vectors: number[][]): number[] {
+  if (vectors.length === 0) return []
+  if (vectors.length === 1) return vectors[0]
+
+  const dim = vectors[0].length
+  const sums = new Array<number>(dim).fill(0)
+  for (const vec of vectors) {
+    // A shorter vector would add `undefined` and turn the result into NaN;
+    // a longer one would be silently truncated.
+    if (vec.length !== dim) {
+      throw new RangeError(`Embedding dimension mismatch: expected ${dim}, got ${vec.length}`)
+    }
+    for (let i = 0; i < dim; i++) sums[i] += vec[i]
+  }
+
+  const mean = sums.map((s) => s / vectors.length)
+  const norm = Math.sqrt(mean.reduce((acc, x) => acc + x * x, 0))
+  // An all-zero mean cannot be scaled to unit length.
+  if (norm === 0) return mean
+  return mean.map((x) => x / norm)
+}
+
+/**
+ * Body-only variant kept for backward compatibility: delegates to
+ * `prepareNoteTextForEmbedding` with a `null` title.
+ */
+export function prepareTextForEmbedding(content: string): string {
+  return prepareNoteTextForEmbedding(null, content)
+}
+
+/**
+ * Short preview for the UI — it does NOT affect semantic similarity.
+ *
+ * @param title - Optional note title, placed before the body.
+ * @param content - Note body, either HTML or plain text.
+ * @param maxLen - Maximum number of UTF-16 code units kept before the ellipsis.
+ * @returns The whole plain text when it fits, otherwise its trimmed beginning
+ *   followed by `…`; an empty string when there is no text.
+ */
+export function excerptPlainNoteContent(
+  title: string | null | undefined,
+  content: string,
+  maxLen = 280,
+): string {
+  const plain = prepareNoteTextForEmbedding(title, content)
+  if (!plain) return ''
+  if (plain.length <= maxLen) return plain
+
+  // A negative length would make `slice` count from the end of the string.
+  let cut = Math.max(0, maxLen)
+  // Do not end on the first half of a surrogate pair (emoji, astral characters).
+  const lastUnit = cut > 0 ? plain.charCodeAt(cut - 1) : NaN
+  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut -= 1
+  return `${plain.slice(0, cut).trim()}…`
+}
+
+/**
+ * Word tokens for Jaccard similarity, for any Unicode script (Persian, Arabic,
+ * Latin, Indic…).
+ *
+ * The text is NFKC-normalized and then lowercased (in that order, because
+ * compatibility mappings can produce uppercase letters). A token starts with a
+ * letter or digit and continues over letters, digits and combining marks, so
+ * words written with vowel signs or diacritics stay in one piece.
+ *
+ * @param text - Text to tokenize.
+ * @param minLength - Minimum token length in code points; shorter tokens are dropped.
+ * @returns The set of distinct tokens.
+ */
+export function tokenizeForSimilarity(text: string, minLength = 2): Set<string> {
+  const normalized = text.normalize('NFKC').toLowerCase()
+  // No length quantifier here: `minLength` alone decides what is too short.
+  const words = normalized.match(/[\p{L}\p{N}][\p{L}\p{N}\p{M}]*/gu) ?? []
+  return new Set(words.filter((w) => Array.from(w).length >= minLength))
+}