Momento/memento-note/lib/text/note-chunking.ts

/**
 * Chunking sémantique pour embeddings par fragments.
 *
 * Inspiré d'AppFlowy flowy-ai/src/embeddings/document_indexer.rs.
 * Découpe le plain text d'une note en fragments cohérents (~1000 chars),
 * avec overlap pour préserver le contexte aux frontières.
 *
 * Chaque fragment reçoit un fragmentId stable (sha256) pour le dedup :
 * si le contenu d'un fragment ne change pas entre deux sauvegardes,
 * il n'est pas re-embeddé.
 */

import { createHash } from 'crypto'

// Target size, in characters, used both when splitting an oversized paragraph
// and when grouping paragraphs into fragments.
const CHUNK_TARGET_CHARS = 1000
// Number of trailing characters of a finished fragment that are repeated at
// the start of the next fragment.
const CHUNK_OVERLAP_CHARS = 200
// Minimum length, in characters, for text to be worth embedding; shorter
// notes and fragments are skipped.
const MIN_FRAGMENT_CHARS = 10
// Paragraphs longer than this many characters are split into smaller pieces
// before grouping.
const MAX_PARAGRAPH_BEFORE_SPLIT = 1500

/** One embeddable fragment of a note, as produced by chunkNoteContent. */
export interface NoteChunk {
  /** Stable identifier: first 32 hex characters of a SHA-256 over the note ID and the fragment content. */
  fragmentId: string
  /** Text of the fragment. */
  content: string
  /** Position of the fragment in the grouped sequence it was taken from. */
  chunkIndex: number
  /** Length of `content` (JavaScript string length, i.e. UTF-16 code units). */
  charCount: number
}

/**
 * Splits the plain text of a note into semantic fragments for embedding.
 *
 * Pipeline:
 *  1. split the text into paragraphs on blank lines;
 *  2. break paragraphs longer than MAX_PARAGRAPH_BEFORE_SPLIT into smaller pieces;
 *  3. group the pieces into fragments of roughly CHUNK_TARGET_CHARS, with
 *     CHUNK_OVERLAP_CHARS of overlap between consecutive fragments;
 *  4. hash each fragment and drop duplicates.
 *
 * Short paragraphs (headings, list items, one-word lines) are kept and merged
 * into their neighbours by the grouping step. The minimum-length filter is
 * applied to the whole note and to the final fragments only, so a note made
 * of many short paragraphs still produces fragments.
 *
 * @param noteId     ID of the note (included in the hash so fragments are isolated per note)
 * @param plainText  Raw text (title + body), expected to be already cleaned via prepareNoteTextForEmbedding
 * @returns fragments ordered by chunkIndex; chunkIndex is the position of the
 *          fragment among the grouped fragments, so it can have gaps when a
 *          too-short or duplicate fragment is skipped
 */
export function chunkNoteContent(noteId: string, plainText: string): NoteChunk[] {
  const normalized = plainText.trim()
  if (normalized.length < MIN_FRAGMENT_CHARS) return []

  // Only empty paragraphs are discarded here: filtering short ones at this
  // stage would silently lose their text instead of letting them be grouped.
  const atomicParagraphs = normalized
    .split(/\n\s*\n/)
    .map((p) => p.trim())
    .filter((p) => p.length > 0)
    .flatMap((p) =>
      p.length > MAX_PARAGRAPH_BEFORE_SPLIT
        ? splitLongParagraph(p, CHUNK_TARGET_CHARS)
        : [p],
    )

  if (atomicParagraphs.length === 0) return []

  const groups = groupParagraphsByMaxContentLen(
    atomicParagraphs,
    CHUNK_TARGET_CHARS,
    CHUNK_OVERLAP_CHARS,
  )

  const chunks: NoteChunk[] = []
  // Fragment IDs already emitted for this note; identical content hashes to
  // the same ID and is emitted only once.
  const seen = new Set<string>()

  for (const [index, content] of groups.entries()) {
    if (content.length < MIN_FRAGMENT_CHARS) continue

    const fragmentId = hashFragment(noteId, content)
    if (seen.has(fragmentId)) continue
    seen.add(fragmentId)

    chunks.push({
      fragmentId,
      content,
      chunkIndex: index,
      charCount: content.length,
    })
  }

  return chunks
}

/**
 * Derives the identifier of a fragment.
 *
 * The note ID and the fragment text are joined with `::`, hashed with
 * SHA-256, and the first 32 hexadecimal characters of the digest are kept.
 *
 * @param noteId   ID of the note the fragment belongs to
 * @param content  Text of the fragment
 * @returns a 32-character lowercase hex string
 */
function hashFragment(noteId: string, content: string): string {
  const hasher = createHash('sha256')
  hasher.update(noteId + '::' + content)
  const hexDigest = hasher.digest('hex')
  return hexDigest.substring(0, 32)
}

/**
 * Breaks an oversized paragraph into pieces of roughly `maxLen` characters,
 * cutting at sentence boundaries.
 *
 * Sentences (text ending in one of `. ! ? ؟ ！ 。` followed by whitespace) are
 * accumulated, joined by a single space, until adding the next one would
 * exceed `maxLen`. A piece that still ends up longer than 1.5 × `maxLen`
 * (for example one very long sentence) is handed to hardSplitByWords; pieces
 * between `maxLen` and that threshold are kept as they are.
 *
 * @param para    Paragraph text to split
 * @param maxLen  Target piece length, in characters
 * @returns the pieces in their original order
 */
function splitLongParagraph(para: string, maxLen: number): string[] {
  const pieces: string[] = []
  let buffer = ''

  para.split(/(?<=[.!?؟！。])\s+/).forEach((sentence) => {
    // First sentence of a piece: nothing to compare against yet.
    if (!buffer) {
      buffer = sentence
      return
    }

    const candidate = `${buffer} ${sentence}`
    if (candidate.length > maxLen) {
      pieces.push(buffer.trim())
      buffer = sentence
    } else {
      buffer = candidate
    }
  })

  const remainder = buffer.trim()
  if (remainder) pieces.push(remainder)

  // Sentence splitting cannot shrink a single huge sentence; fall back to a
  // word-based split for pieces well above the target.
  const oversizeThreshold = maxLen * 1.5
  const output: string[] = []
  for (const piece of pieces) {
    if (piece.length > oversizeThreshold) {
      output.push(...hardSplitByWords(piece, maxLen))
    } else {
      output.push(piece)
    }
  }
  return output
}

/**
 * Splits text into chunks of at most `maxLen` characters on whitespace.
 *
 * Words are accumulated, joined by a single space, until adding the next one
 * would exceed `maxLen`. A single word longer than `maxLen` (a long URL, or
 * text in a script that does not use spaces) is itself cut into slices of at
 * most `maxLen` characters, so that no chunk is unbounded. Slices never cut a
 * UTF-16 surrogate pair in half; when `maxLen` is 1 a slice may therefore be
 * two code units long.
 *
 * @param text    Text to split
 * @param maxLen  Maximum chunk length, in characters
 * @returns the chunks in their original order; empty for blank input
 */
function hardSplitByWords(text: string, maxLen: number): string[] {
  const limit = Math.floor(maxLen)
  const chunks: string[] = []
  let current = ''

  for (const word of text.split(/\s+/)) {
    // Leading/trailing whitespace produces empty entries; they carry no text.
    if (!word) continue

    if (limit >= 1 && word.length > limit) {
      // The word cannot fit in any chunk: flush what we have and slice it.
      if (current) chunks.push(current)

      let start = 0
      while (word.length - start > limit) {
        let end = start + limit
        const lastUnit = word.charCodeAt(end - 1)
        // A high surrogate at the cut point means the pair would be split:
        // move the cut before it, or after its partner if the slice would
        // otherwise be empty.
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
          end = end - 1 > start ? end - 1 : end + 1
        }
        chunks.push(word.slice(start, end))
        start = end
      }
      // The remainder may still be joined with the following words.
      current = word.slice(start)
      continue
    }

    if (current && current.length + 1 + word.length > maxLen) {
      chunks.push(current)
      current = word
    } else {
      current = current ? `${current} ${word}` : word
    }
  }
  if (current) chunks.push(current)

  return chunks
}

/**
 * Packs paragraphs into fragments of roughly `maxLen` characters, repeating
 * the end of each finished fragment at the start of the next one.
 *
 * Paragraphs are appended to the current fragment, separated by a blank line,
 * until adding the next paragraph would exceed `maxLen`. The fragment is then
 * closed and the paragraph starts a new fragment, prefixed with the last
 * `overlap` characters of the closed one. A fragment can still be longer than
 * `maxLen` when a single paragraph (plus the overlap) exceeds the limit.
 *
 * @param paragraphs  Paragraphs in document order
 * @param maxLen      Soft size limit of a fragment, in characters
 * @param overlap     Number of trailing characters carried over into the next
 *                    fragment. A value above `maxLen` falls back to half of
 *                    `maxLen`; zero or a negative value disables the overlap.
 * @returns the trimmed fragments in order; empty when `paragraphs` is empty
 */
function groupParagraphsByMaxContentLen(
  paragraphs: string[],
  maxLen: number,
  overlap: number,
): string[] {
  if (paragraphs.length === 0) return []

  const requestedOverlap = overlap > maxLen ? Math.floor(maxLen / 2) : overlap
  const overlapLen = Math.max(0, requestedOverlap)

  const result: string[] = []
  let current = ''

  for (const para of paragraphs) {
    if (current && current.length + para.length > maxLen) {
      result.push(current.trim())
      // `slice(-0)` is `slice(0)` and would copy the whole fragment, so a
      // zero overlap has to be handled explicitly.
      const tail = overlapLen > 0 ? current.slice(-overlapLen) : ''
      // NOTE(review): the tail is prepended without any separator. This is
      // kept as-is because fragment hashes depend on the exact text.
      current = `${tail}${para}`
    } else {
      current = current ? `${current}\n\n${para}` : para
    }
  }

  const last = current.trim()
  if (last) result.push(last)

  return result
}