// Momento/memento-note/lib/text/plain-text.ts

/** Texte brut pour embeddings et similarité (HTML clippé, persan, arabe, etc.). */

/**
 * Size, in characters, of one chunk sent to the embedding API
 * (~2500 tokens according to the original note; considered safe for Persian text).
 */
export const EMBEDDING_CHUNK_CHARS = 6000
/** Number of characters by which consecutive chunks overlap. */
export const EMBEDDING_CHUNK_OVERLAP = 300

/** @deprecated Use multi-chunk splitting instead — kept for backward compatibility. */
export const MAX_EMBEDDING_CHARS = EMBEDDING_CHUNK_CHARS

/**
 * Matches a clip footer at the very end of an HTML string: an `<hr>` followed
 * by a `<p>` that wraps a `<small>…</small>` element, with only optional
 * whitespace after the closing `</p>`. Case-insensitive.
 */
const CLIP_FOOTER_PATTERN =
  /<hr\s*\/?>\s*<p[^>]*>\s*<small>[\s\S]*?<\/small>\s*<\/p>\s*$/i

/** Named character references decoded by `stripHtmlToPlainText` (keys are lowercase). */
const NAMED_HTML_ENTITIES: ReadonlyMap<string, string> = new Map<string, string>([
  ['nbsp', ' '],
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
])

/** Converts a numeric character reference to text; values outside Unicode become U+FFFD. */
function decodeNumericHtmlEntity(codePoint: number): string {
  // String.fromCodePoint throws on values above 0x10FFFF, so guard explicitly.
  return codePoint >= 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : '\uFFFD'
}

/**
 * Reduces an HTML fragment to plain text.
 *
 * Drops `<script>` and `<style>` elements together with their content, removes
 * every remaining tag, decodes character references, then collapses all
 * whitespace runs to a single space and trims the result.
 *
 * Character references are decoded in a single pass so that already-escaped
 * text is not decoded twice (`&amp;lt;` yields `&lt;`, not `<`). Numeric
 * references support the full Unicode range (e.g. `&#x1F600;`); unknown named
 * references are left untouched.
 *
 * @param html HTML source; a falsy value yields an empty string.
 * @returns The whitespace-normalised plain text.
 */
export function stripHtmlToPlainText(html: string): string {
  if (!html) return ''
  return html
    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style[\s\S]*?<\/style>/gi, ' ')
    .replace(/<[^>]+>/g, ' ')
    .replace(
      /&(?:#x([0-9a-f]+)|#(\d+)|([a-z]+));/gi,
      (match: string, hex?: string, dec?: string, name?: string): string => {
        if (hex !== undefined) return decodeNumericHtmlEntity(parseInt(hex, 16))
        if (dec !== undefined) return decodeNumericHtmlEntity(Number(dec))
        return NAMED_HTML_ENTITIES.get((name ?? '').toLowerCase()) ?? match
      },
    )
    .replace(/\s+/g, ' ')
    .trim()
}

/**
 * Strips the trailing "Extrait de…" footer from clipped notes, which is
 * left-to-right noise for embeddings, and trims the result.
 *
 * @param html Note HTML; a falsy value yields an empty string.
 * @returns The HTML without the footer matched by `CLIP_FOOTER_PATTERN`.
 */
export function stripClipFooterFromHtml(html: string): string {
  if (html) {
    const withoutFooter = html.replace(CLIP_FOOTER_PATTERN, '')
    return withoutFooter.trim()
  }
  return ''
}

/**
 * Heuristic check for HTML markup: true when the text contains a `<`
 * immediately followed by an ASCII letter and, somewhere later, a `>`.
 */
export function looksLikeHtml(text: string): boolean {
  const tagLike = /<[a-z][\s\S]*>/i
  return tagLike.exec(text) !== null
}

/**
 * Builds the plain text of a note (title plus full body) for embedding.
 * Nothing is truncated here; long articles are handled by multi-chunk splitting.
 *
 * The clip footer is removed first; the body is then converted from HTML when
 * it looks like HTML, or simply trimmed otherwise. Empty parts are skipped and
 * the remaining ones are joined with a blank line.
 *
 * @param title Optional note title.
 * @param content Note body (HTML or plain text).
 * @returns Title and body separated by a blank line.
 */
export function prepareNoteTextForEmbedding(
  title: string | null | undefined,
  content: string,
): string {
  const cleaned = stripClipFooterFromHtml(content || '')

  let body: string
  if (looksLikeHtml(cleaned)) {
    body = stripHtmlToPlainText(cleaned)
  } else {
    body = cleaned.trim()
  }

  const segments: string[] = []
  const heading = title?.trim()
  if (heading) segments.push(heading)
  if (body) segments.push(body)
  return segments.join('\n\n')
}

/**
 * Splits a long text into overlapping chunks so the whole document can be embedded.
 *
 * The input is trimmed first. Text no longer than `EMBEDDING_CHUNK_CHARS` is
 * returned as a single chunk; otherwise each chunk holds at most
 * `EMBEDDING_CHUNK_CHARS` UTF-16 code units and begins
 * `EMBEDDING_CHUNK_OVERLAP` code units before the end of the previous one.
 * Chunk boundaries never fall between the two halves of a surrogate pair, so
 * no chunk starts or ends with half of an astral character (e.g. an emoji).
 *
 * @param text Plain text to split.
 * @returns The chunks in document order; empty when the text is blank.
 */
export function splitPlainTextForEmbeddingChunks(text: string): string[] {
  const normalized = text.trim()
  if (!normalized) return []
  if (normalized.length <= EMBEDDING_CHUNK_CHARS) return [normalized]

  // True when `index` sits between a high surrogate and its low surrogate.
  const splitsSurrogatePair = (index: number): boolean => {
    if (index <= 0 || index >= normalized.length) return false
    const before = normalized.charCodeAt(index - 1)
    const after = normalized.charCodeAt(index)
    return before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff
  }

  const chunks: string[] = []
  let start = 0
  while (start < normalized.length) {
    let end = Math.min(start + EMBEDDING_CHUNK_CHARS, normalized.length)
    // Pull the cut back one unit rather than ending the chunk on a lone high surrogate.
    if (end - start > 1 && splitsSurrogatePair(end)) end -= 1
    chunks.push(normalized.slice(start, end))
    if (end >= normalized.length) break
    // `start + 1` guarantees forward progress even if the overlap were >= the chunk size.
    let next = Math.max(start + 1, end - EMBEDDING_CHUNK_OVERLAP)
    // Move forward (never backward) so the loop still advances.
    if (splitsSurrogatePair(next)) next += 1
    start = next
  }
  return chunks
}

/**
 * Mean-pools several embedding vectors and L2-normalises the result
 * (the usual way to derive one vector for a long, multi-chunk document).
 *
 * The same treatment applies to any number of vectors, including one, so the
 * result is always a new array and is unit-length whenever the mean is non-zero.
 *
 * @param vectors Embedding vectors; all must have the same length.
 * @returns The normalised mean vector, the raw mean when its norm is 0, or
 *   `[]` when `vectors` is empty.
 * @throws RangeError If the vectors do not all share the first vector's length.
 */
export function meanPoolEmbeddingVectors(vectors: number[][]): number[] {
  if (vectors.length === 0) return []

  const dim = vectors[0].length
  const sums = new Array<number>(dim).fill(0)
  for (const vec of vectors) {
    // A ragged input would otherwise yield NaN components or silently drop values.
    if (vec.length !== dim) {
      throw new RangeError(
        `Embedding dimension mismatch: expected ${dim}, got ${vec.length}`,
      )
    }
    for (let i = 0; i < dim; i++) sums[i] += vec[i]
  }

  const mean = sums.map((sum) => sum / vectors.length)
  const norm = Math.sqrt(mean.reduce((acc, x) => acc + x * x, 0))
  // A zero vector cannot be normalised; return it unchanged.
  if (norm === 0) return mean
  return mean.map((x) => x / norm)
}

/**
 * Prepares note content for text embedding using the body only (no title).
 * Kept for backward compatibility; delegates to `prepareNoteTextForEmbedding`.
 */
export function prepareTextForEmbedding(content: string): string {
  const bodyOnly = prepareNoteTextForEmbedding(null, content)
  return bodyOnly
}

/**
 * Short plain-text preview for the UI; it does NOT affect semantic similarity.
 *
 * The text is built by `prepareNoteTextForEmbedding` (title plus body). When it
 * exceeds `maxLen` UTF-16 code units it is cut, trimmed and suffixed with `…`.
 * The cut never lands inside a surrogate pair, so the preview cannot end on
 * half of an astral character. A negative or non-finite `maxLen` is treated as 0.
 *
 * @param title Optional note title.
 * @param content Note body (HTML or plain text).
 * @param maxLen Maximum excerpt length before the ellipsis (default 280).
 * @returns The full text when it fits, otherwise the truncated text plus `…`;
 *   an empty string when there is no text.
 */
export function excerptPlainNoteContent(
  title: string | null | undefined,
  content: string,
  maxLen = 280,
): string {
  const plain = prepareNoteTextForEmbedding(title, content)
  if (!plain) return ''
  if (plain.length <= maxLen) return plain

  // A negative end index would make slice() count from the end of the string.
  let cut = Number.isFinite(maxLen) ? Math.max(0, Math.trunc(maxLen)) : 0
  if (cut > 0) {
    const last = plain.charCodeAt(cut - 1)
    const next = plain.charCodeAt(cut)
    const insidePair =
      last >= 0xd800 && last <= 0xdbff && next >= 0xdc00 && next <= 0xdfff
    // Drop the dangling high surrogate instead of emitting half a character.
    if (insidePair) cut -= 1
  }
  return `${plain.slice(0, cut).trim()}…`
}

/**
 * Tokenises text for Jaccard similarity across Unicode scripts
 * (Persian, Arabic, Latin, Devanagari, …).
 *
 * The text is NFKC-normalised and then lower-cased, in that order because
 * compatibility decomposition can itself produce uppercase letters (`℡` → `TEL`).
 * A token starts with a letter or digit and continues through letters, digits
 * and combining marks, so words written with vowel signs or diacritics stay
 * whole instead of being split into single letters.
 *
 * @param text Text to tokenise.
 * @param minLength Minimum token length in code points (default 2); smaller
 *   values such as 1 are honoured.
 * @returns The set of distinct tokens.
 */
export function tokenizeForSimilarity(text: string, minLength = 2): Set<string> {
  // Lower-casing can denormalise a few characters, hence the second normalize().
  const normalized = text.normalize('NFKC').toLowerCase().normalize('NFKC')
  const words = normalized.match(/[\p{L}\p{N}][\p{L}\p{N}\p{M}]*/gu) ?? []
  // Count code points, not UTF-16 units, so astral letters count once.
  return new Set(words.filter((word) => Array.from(word).length >= minLength))
}