Files
Momento/memento-note/lib/text/note-chunking.ts
Antigravity a623454347
Some checks failed
CI / Lint, Unit Tests & Build (push) Failing after 1m32s
CI / Deploy production (on server) (push) Has been skipped
perf: memo GridCard, fuse save fns, fix slash tab active color
2026-06-14 14:06:05 +00:00

151 lines
3.9 KiB
TypeScript

/**
* Chunking sémantique pour embeddings par fragments.
*
* Inspiré d'AppFlowy flowy-ai/src/embeddings/document_indexer.rs.
* Découpe le plain text d'une note en fragments cohérents (~1000 chars),
* avec overlap pour préserver le contexte aux frontières.
*
* Chaque fragment reçoit un fragmentId stable (sha256) pour le dedup :
* si le contenu d'un fragment ne change pas entre deux sauvegardes,
* il n'est pas re-embeddé.
*/
import { createHash } from 'crypto'
/** Maximum length, in characters, passed to the paragraph splitter and to the grouping step. */
const CHUNK_TARGET_CHARS = 1_000

/** Number of trailing characters of a fragment carried over into the next one. */
const CHUNK_OVERLAP_CHARS = 200

/** Minimum length, in characters, below which text is discarded instead of being embedded. */
const MIN_FRAGMENT_CHARS = 10

/** Paragraphs longer than this many characters are split before grouping. */
const MAX_PARAGRAPH_BEFORE_SPLIT = 1_500
/**
 * One embeddable fragment of a note, as produced by chunkNoteContent.
 */
export interface NoteChunk {
/** First 32 hex characters of a SHA-256 over `${noteId}::${content}`; used to detect unchanged fragments. */
fragmentId: string
/** Text of the fragment. */
content: string
/** Position of the fragment in the grouped sequence; gaps are possible when a group is skipped. */
chunkIndex: number
/** `content.length`, i.e. the size of the fragment in UTF-16 code units. */
charCount: number
}
/**
 * Splits a note's plain text into semantic fragments for embedding.
 *
 * Steps: split on blank lines into paragraphs, break paragraphs longer than
 * MAX_PARAGRAPH_BEFORE_SPLIT into smaller pieces, then pack the pieces into
 * overlapping groups of roughly CHUNK_TARGET_CHARS characters.
 *
 * Short paragraphs (a short title, a heading, a one-word list item) are kept
 * and merged with their neighbours during grouping. Only whole fragments
 * shorter than MIN_FRAGMENT_CHARS are discarded, so a note made of many short
 * paragraphs is still indexed.
 *
 * @param noteId ID of the note; mixed into every fragment hash so the same text
 *   in two different notes gets different fragment ids
 * @param plainText Plain text (title + body), expected to be already cleaned
 *   via prepareNoteTextForEmbedding
 * @returns Fragments in ascending chunkIndex order. Fragments that are too
 *   short, or whose id duplicates an earlier fragment, are omitted, so
 *   chunkIndex values may contain gaps.
 */
export function chunkNoteContent(noteId: string, plainText: string): NoteChunk[] {
  const normalized = plainText.trim()
  if (normalized.length < MIN_FRAGMENT_CHARS) return []

  // Drop only empty paragraphs here: filtering short ones at this stage would
  // silently lose titles and short lines that grouping can merge instead.
  const paragraphs = normalized
    .split(/\n\s*\n/)
    .map((p) => p.trim())
    .filter((p) => p.length > 0)
  if (paragraphs.length === 0) return []

  // Oversized paragraphs are pre-split so that grouping works on small units.
  const atomicParagraphs = paragraphs.flatMap((para) =>
    para.length > MAX_PARAGRAPH_BEFORE_SPLIT
      ? splitLongParagraph(para, CHUNK_TARGET_CHARS)
      : [para],
  )

  const groups = groupParagraphsByMaxContentLen(
    atomicParagraphs,
    CHUNK_TARGET_CHARS,
    CHUNK_OVERLAP_CHARS,
  )

  const chunks: NoteChunk[] = []
  const seen = new Set<string>()
  for (const [chunkIndex, content] of groups.entries()) {
    if (content.length < MIN_FRAGMENT_CHARS) continue

    // Identical content within the same note hashes to the same id; keep the first.
    const fragmentId = hashFragment(noteId, content)
    if (seen.has(fragmentId)) continue
    seen.add(fragmentId)

    chunks.push({
      fragmentId,
      content,
      chunkIndex,
      charCount: content.length,
    })
  }
  return chunks
}
/**
 * Computes the identifier of a fragment.
 *
 * @param noteId ID of the note the fragment belongs to
 * @param content Text of the fragment
 * @returns The first 32 hexadecimal characters of the SHA-256 digest of
 *   `${noteId}::${content}`
 */
function hashFragment(noteId: string, content: string): string {
  const hasher = createHash('sha256')
  hasher.update(`${noteId}::${content}`)
  const hexDigest = hasher.digest('hex')
  return hexDigest.slice(0, 32)
}
/**
 * Breaks one long paragraph into smaller pieces along sentence boundaries.
 *
 * Sentences are detected as whitespace that follows a sentence terminator.
 * Consecutive sentences are joined with a single space for as long as the
 * result stays within `maxLen`. Any piece still longer than 1.5 x `maxLen`
 * (for example one very long sentence) is handed to hardSplitByWords.
 *
 * @param para Paragraph text to split
 * @param maxLen Target maximum length of a piece, in characters
 * @returns The pieces, in their original order
 */
function splitLongParagraph(para: string, maxLen: number): string[] {
  const sentenceBoundary = /(?<=[.!?؟!。])\s+/
  const pieces: string[] = []
  let buffer = ''

  para.split(sentenceBoundary).forEach((sentence) => {
    if (buffer === '') {
      // Nothing accumulated yet: the sentence starts a new piece, whatever its size.
      buffer = sentence
      return
    }
    const candidate = buffer + ' ' + sentence
    if (candidate.length > maxLen) {
      pieces.push(buffer.trim())
      buffer = sentence
    } else {
      buffer = candidate
    }
  })

  const remainder = buffer.trim()
  if (remainder) pieces.push(remainder)

  const output: string[] = []
  for (const piece of pieces) {
    if (piece.length > maxLen * 1.5) {
      output.push(...hardSplitByWords(piece, maxLen))
    } else {
      output.push(piece)
    }
  }
  return output
}
/**
 * Cuts a single whitespace-free token into pieces of at most `limit` UTF-16
 * code units. Iterates by code point so a surrogate pair is never split; a
 * lone code point wider than `limit` is kept whole.
 */
function splitOversizedWord(word: string, limit: number): string[] {
  const pieces: string[] = []
  let piece = ''
  for (const codePoint of word) {
    if (piece && piece.length + codePoint.length > limit) {
      pieces.push(piece)
      piece = codePoint
    } else {
      piece += codePoint
    }
  }
  if (piece) pieces.push(piece)
  return pieces
}

/**
 * Splits text into chunks of at most `maxLen` characters on word boundaries.
 *
 * Words are separated by any run of whitespace and re-joined with a single
 * space. A word that is itself longer than `maxLen` (long URL, encoded blob,
 * text in a script written without spaces) is cut into pieces so that no
 * chunk grows without bound.
 *
 * @param text Text to split
 * @param maxLen Maximum chunk length, in characters; when it is below 1,
 *   oversized words are left intact
 * @returns The chunks, in their original order; empty for blank input
 */
function hardSplitByWords(text: string, maxLen: number): string[] {
  // A limit below 1 cannot hold any character, so word cutting is disabled.
  const wordLimit = maxLen >= 1 ? Math.floor(maxLen) : 0
  const chunks: string[] = []
  let current = ''

  for (const word of text.split(/\s+/)) {
    // Leading or trailing whitespace produces empty entries.
    if (!word) continue

    const pieces =
      wordLimit > 0 && word.length > wordLimit ? splitOversizedWord(word, wordLimit) : [word]

    for (const piece of pieces) {
      if (current && current.length + 1 + piece.length > maxLen) {
        chunks.push(current)
        current = piece
      } else {
        current = current ? `${current} ${piece}` : piece
      }
    }
  }

  if (current) chunks.push(current)
  return chunks
}
/**
 * Packs paragraphs into chunks of roughly `maxLen` characters, repeating the
 * end of each chunk at the start of the next one.
 *
 * Paragraphs are appended to the current chunk, separated by a blank line,
 * until adding the next one would exceed `maxLen`. The chunk is then emitted
 * and the next chunk starts with the last `overlap` characters of the emitted
 * one, a blank line, and the new paragraph. A chunk can exceed `maxLen` when
 * a single paragraph (plus the overlap tail) is already longer than it.
 *
 * @param paragraphs Paragraphs in document order
 * @param maxLen Target maximum chunk length, in characters
 * @param overlap Number of trailing characters to repeat in the next chunk;
 *   values above `maxLen` are replaced by half of `maxLen`, and values that
 *   are zero, negative or NaN disable the overlap
 * @returns The trimmed chunks, in document order
 */
function groupParagraphsByMaxContentLen(
  paragraphs: readonly string[],
  maxLen: number,
  overlap: number,
): string[] {
  if (paragraphs.length === 0) return []

  const requestedOverlap = overlap > maxLen ? Math.floor(maxLen / 2) : Math.floor(overlap)
  // Normalises 0, negatives and NaN to "no overlap". Without this,
  // `slice(-0)` would return the whole previous chunk.
  const tailLength = requestedOverlap > 0 ? requestedOverlap : 0

  const result: string[] = []
  let current = ''

  for (const para of paragraphs) {
    if (current && current.length + para.length > maxLen) {
      const finished = current.trim()
      if (finished) result.push(finished)

      let tail = tailLength > 0 ? current.slice(-tailLength) : ''
      // The cut may land in the middle of a surrogate pair; drop the orphaned low half.
      const firstUnit = tail.charCodeAt(0)
      if (firstUnit >= 0xdc00 && firstUnit <= 0xdfff) tail = tail.slice(1)
      tail = tail.trim()

      // Same separator as between paragraphs, so the tail's last word is not
      // fused with the first word of the next paragraph.
      current = tail ? `${tail}\n\n${para}` : para
    } else {
      current = current ? `${current}\n\n${para}` : para
    }
  }

  const last = current.trim()
  if (last) result.push(last)
  return result
}