/**
 * Semantic chunking for per-fragment embeddings.
 *
 * Inspired by AppFlowy flowy-ai/src/embeddings/document_indexer.rs.
 * Splits a note's plain text into coherent fragments (~1000 chars),
 * with overlap to preserve context at the boundaries.
 *
 * Each fragment gets a stable fragmentId (sha256) for dedup:
 * if a fragment's content does not change between two saves,
 * it is not re-embedded.
 */
import { createHash } from 'crypto'

/** Target size of a fragment, in UTF-16 code units. */
const CHUNK_TARGET_CHARS = 1000
/** Number of trailing characters of a fragment repeated at the start of the next one. */
const CHUNK_OVERLAP_CHARS = 200
/** Paragraphs and fragments shorter than this are discarded. */
const MIN_FRAGMENT_CHARS = 10
/** Paragraphs longer than this are split into sentences before grouping. */
const MAX_PARAGRAPH_BEFORE_SPLIT = 1500

/** Matches the whitespace that follows sentence-ending punctuation. */
const SENTENCE_BOUNDARY = /(?<=[.!?؟!。])\s+/

export interface NoteChunk {
  /** First 32 hex characters of sha256(`${noteId}::${content}`). */
  fragmentId: string
  /** Text of the fragment. */
  content: string
  /** Position of the fragment's group; may have gaps when duplicate fragments are skipped. */
  chunkIndex: number
  /** Length of `content` in UTF-16 code units. */
  charCount: number
}

/**
 * Splits a note's plain text into semantic fragments.
 *
 * Paragraphs (separated by blank lines) are the basic unit. Overly long
 * paragraphs are first broken into sentences, then everything is grouped into
 * fragments of roughly CHUNK_TARGET_CHARS with CHUNK_OVERLAP_CHARS of overlap.
 * Fragments with identical content inside the same note are emitted only once.
 *
 * @param noteId    ID of the note (included in the hash for isolation)
 * @param plainText Plain text (title + body), already cleaned via prepareNoteTextForEmbedding
 * @returns fragments sorted by chunkIndex; empty when the text is too short
 */
export function chunkNoteContent(noteId: string, plainText: string): NoteChunk[] {
  const normalized = plainText.trim()
  if (normalized.length < MIN_FRAGMENT_CHARS) return []

  const paragraphs = normalized
    .split(/\n\s*\n/)
    .map((p) => p.trim())
    .filter((p) => p.length >= MIN_FRAGMENT_CHARS)
  if (paragraphs.length === 0) return []

  const atomicParagraphs = paragraphs.flatMap((para) =>
    para.length > MAX_PARAGRAPH_BEFORE_SPLIT
      ? splitLongParagraph(para, CHUNK_TARGET_CHARS)
      : [para],
  )

  const groups = groupParagraphsByMaxContentLen(
    atomicParagraphs,
    CHUNK_TARGET_CHARS,
    CHUNK_OVERLAP_CHARS,
  )

  const chunks: NoteChunk[] = []
  const seen = new Set<string>()
  for (const [index, content] of groups.entries()) {
    if (content.length < MIN_FRAGMENT_CHARS) continue
    const fragmentId = hashFragment(noteId, content)
    // Identical content within one note hashes to the same id: keep the first only.
    if (seen.has(fragmentId)) continue
    seen.add(fragmentId)
    chunks.push({
      fragmentId,
      content,
      chunkIndex: index,
      charCount: content.length,
    })
  }
  return chunks
}

/**
 * Computes the fragment id: the first 32 hex characters of
 * sha256(`${noteId}::${content}`).
 */
function hashFragment(noteId: string, content: string): string {
  return createHash('sha256')
    .update(`${noteId}::${content}`)
    .digest('hex')
    .slice(0, 32)
}

/** True when the UTF-16 code unit is the first half of a surrogate pair. */
function isHighSurrogate(codeUnit: number): boolean {
  return codeUnit >= 0xd800 && codeUnit <= 0xdbff
}

/** True when the UTF-16 code unit is the second half of a surrogate pair. */
function isLowSurrogate(codeUnit: number): boolean {
  return codeUnit >= 0xdc00 && codeUnit <= 0xdfff
}

/**
 * Splits a long paragraph on sentence boundaries and packs the sentences into
 * pieces of at most `maxLen` characters. A piece made of a single sentence
 * longer than 1.5 × `maxLen` is further split on word boundaries.
 */
function splitLongParagraph(para: string, maxLen: number): string[] {
  const sentences = para.split(SENTENCE_BOUNDARY)
  const chunks: string[] = []
  let current = ''
  for (const sentence of sentences) {
    if ((current + ' ' + sentence).length > maxLen && current) {
      chunks.push(current.trim())
      current = sentence
    } else {
      current = current ? `${current} ${sentence}` : sentence
    }
  }
  if (current.trim()) chunks.push(current.trim())

  return chunks.flatMap((chunk) =>
    chunk.length > maxLen * 1.5 ? hardSplitByWords(chunk, maxLen) : [chunk],
  )
}

/**
 * Cuts a whitespace-free token into consecutive pieces of at most `maxLen`
 * UTF-16 code units, never separating the two halves of a surrogate pair
 * (unless `maxLen` is 1, where that cannot be avoided).
 */
function sliceOversizedToken(token: string, maxLen: number): string[] {
  if (maxLen < 1 || token.length <= maxLen) return [token]
  const pieces: string[] = []
  let start = 0
  while (start < token.length) {
    let end = Math.min(start + maxLen, token.length)
    if (end < token.length && end - start > 1 && isHighSurrogate(token.charCodeAt(end - 1))) {
      end -= 1
    }
    pieces.push(token.slice(start, end))
    start = end
  }
  return pieces
}

/**
 * Packs the whitespace-separated words of `text` into pieces of at most
 * `maxLen` characters. A single word longer than `maxLen` (text without
 * spaces, long URL, ...) is sliced so that no piece exceeds the limit.
 */
function hardSplitByWords(text: string, maxLen: number): string[] {
  const words = text.split(/\s+/)
  const chunks: string[] = []
  let current = ''
  for (const word of words) {
    if (word.length > maxLen) {
      if (current.trim()) chunks.push(current.trim())
      const pieces = sliceOversizedToken(word, maxLen)
      // The last piece stays open so that following words can still join it.
      current = pieces.pop() ?? ''
      chunks.push(...pieces)
      continue
    }
    if ((current + ' ' + word).length > maxLen && current) {
      chunks.push(current.trim())
      current = word
    } else {
      current = current ? `${current} ${word}` : word
    }
  }
  if (current.trim()) chunks.push(current.trim())
  return chunks
}

/**
 * Groups paragraphs into chunks. A chunk is closed as soon as adding the next
 * paragraph would exceed `maxLen`; the next chunk then starts with the last
 * `overlap` characters of the closed one, directly followed by the paragraph.
 * Paragraphs inside a chunk are joined with a blank line.
 *
 * @param paragraphs paragraphs in document order
 * @param maxLen     size threshold that closes the current chunk
 * @param overlap    number of trailing characters carried over; values above
 *                   `maxLen` are replaced by half of `maxLen`
 */
function groupParagraphsByMaxContentLen(
  paragraphs: string[],
  maxLen: number,
  overlap: number,
): string[] {
  if (paragraphs.length === 0) return []
  const requestedOverlap = overlap > maxLen ? Math.floor(maxLen / 2) : overlap
  const effectiveOverlap = Math.max(0, requestedOverlap)

  const result: string[] = []
  let current = ''
  for (const para of paragraphs) {
    if (current.length + para.length > maxLen && current) {
      result.push(current.trim())
      // slice(-0) would return the whole string, so a zero overlap needs its own case.
      let tail = effectiveOverlap > 0 ? current.slice(-effectiveOverlap) : ''
      // Do not start the next chunk with the orphaned second half of a surrogate pair.
      if (tail.length > 0 && isLowSurrogate(tail.charCodeAt(0))) tail = tail.slice(1)
      current = `${tail}${para}`
    } else {
      current = current ? `${current}\n\n${para}` : para
    }
  }
  if (current.trim()) result.push(current.trim())
  return result
}