151 lines
3.9 KiB
TypeScript
151 lines
3.9 KiB
TypeScript
/**
 * Semantic chunking for per-fragment embeddings.
 *
 * Inspired by AppFlowy's flowy-ai/src/embeddings/document_indexer.rs.
 * Splits the plain text of a note into coherent fragments (~1000 chars),
 * with overlap to preserve context at the boundaries.
 *
 * Each fragment gets a stable fragmentId (sha256) used for deduplication:
 * if the content of a fragment does not change between two saves,
 * it is not re-embedded.
 */
|
|
|
|
import { createHash } from 'crypto'
|
|
|
|
/** Target size, in characters, of one fragment. */
const CHUNK_TARGET_CHARS = 1_000

/** Number of trailing characters of a fragment that are repeated at the start of the next one. */
const CHUNK_OVERLAP_CHARS = 200

/** Minimum number of characters a piece of text must have to be kept as a fragment. */
const MIN_FRAGMENT_CHARS = 10

/** Paragraphs longer than this many characters are split further before being grouped. */
const MAX_PARAGRAPH_BEFORE_SPLIT = 1_500
|
|
|
|
/**
 * One fragment of a note, as produced by chunkNoteContent.
 */
export interface NoteChunk {
  /** Identifier derived from the note ID and the fragment text (truncated SHA-256, hex). */
  fragmentId: string

  /** Text of the fragment. */
  content: string

  /** Position of the fragment within the grouped output for the note. */
  chunkIndex: number

  /** Length of `content`, in UTF-16 code units. */
  charCount: number
}
|
|
|
|
/**
 * Splits the plain text of a note into semantic fragments for embedding.
 *
 * The text is cut into paragraphs on blank lines. Paragraphs longer than
 * MAX_PARAGRAPH_BEFORE_SPLIT are broken up by splitLongParagraph, and the
 * resulting pieces are packed by groupParagraphsByMaxContentLen into groups
 * of about CHUNK_TARGET_CHARS characters with CHUNK_OVERLAP_CHARS of overlap.
 * Each group becomes one fragment; groups shorter than MIN_FRAGMENT_CHARS and
 * groups whose fragmentId was already produced for this note are skipped.
 *
 * @param noteId ID of the note (mixed into the hash so fragments are isolated per note)
 * @param plainText Plain text (title + body), expected to be already cleaned via prepareNoteTextForEmbedding
 * @returns fragments in ascending chunkIndex order; empty when the trimmed text is shorter than MIN_FRAGMENT_CHARS
 */
export function chunkNoteContent(noteId: string, plainText: string): NoteChunk[] {
  const normalized = plainText.trim()
  if (normalized.length < MIN_FRAGMENT_CHARS) return []

  // Keep every non-empty paragraph, however short: headings and short list
  // items are merged with their neighbours by the grouping step. Discarding
  // them here would silently remove their text from every fragment. The
  // minimum-length rule is applied to the finished fragments below instead.
  const paragraphs = normalized
    .split(/\n\s*\n/)
    .map((p) => p.trim())
    .filter((p) => p.length > 0)

  if (paragraphs.length === 0) return []

  // Break up paragraphs that are too long to be treated as a single unit.
  const atomicParagraphs = paragraphs.flatMap((para) =>
    para.length > MAX_PARAGRAPH_BEFORE_SPLIT
      ? splitLongParagraph(para, CHUNK_TARGET_CHARS)
      : [para],
  )

  const groups = groupParagraphsByMaxContentLen(
    atomicParagraphs,
    CHUNK_TARGET_CHARS,
    CHUNK_OVERLAP_CHARS,
  )

  const chunks: NoteChunk[] = []
  const seen = new Set<string>()

  for (const [index, content] of groups.entries()) {
    if (content.length < MIN_FRAGMENT_CHARS) continue

    // Identical groups hash to the same ID; emit each ID only once per note.
    const fragmentId = hashFragment(noteId, content)
    if (seen.has(fragmentId)) continue
    seen.add(fragmentId)

    chunks.push({
      fragmentId,
      content,
      // Index of the group, so skipped groups leave gaps in the numbering.
      chunkIndex: index,
      charCount: content.length,
    })
  }

  return chunks
}
|
|
|
|
/**
 * Derives the identifier of a fragment from its note and its text.
 *
 * @param noteId ID of the note the fragment belongs to
 * @param content Text of the fragment
 * @returns the first 32 hex characters of the SHA-256 digest of `noteId`, `::` and `content` concatenated
 */
function hashFragment(noteId: string, content: string): string {
  const hasher = createHash('sha256')
  hasher.update(`${noteId}::${content}`)

  const hexDigest = hasher.digest('hex')
  return hexDigest.slice(0, 32)
}
|
|
|
|
/**
 * Breaks an over-long paragraph into smaller pieces along sentence boundaries.
 *
 * Sentences (text ending in one of the punctuation marks of the pattern below
 * and followed by whitespace) are accumulated, joined by a single space, until
 * adding the next one would exceed `maxLen`. Any resulting piece that is still
 * longer than 1.5 × `maxLen` is handed to hardSplitByWords.
 *
 * @param para Paragraph to split
 * @param maxLen Target maximum length of a piece, in characters
 * @returns the trimmed, non-empty pieces in their original order
 */
function splitLongParagraph(para: string, maxLen: number): string[] {
  const pieces: string[] = []
  let buffer = ''

  for (const sentence of para.split(/(?<=[.!?؟!。])\s+/)) {
    const overflows = buffer !== '' && `${buffer} ${sentence}`.length > maxLen
    if (overflows) {
      // The buffer is full: emit it and start a new piece with this sentence.
      pieces.push(buffer.trim())
      buffer = sentence
      continue
    }
    buffer = buffer === '' ? sentence : `${buffer} ${sentence}`
  }

  const remainder = buffer.trim()
  if (remainder) pieces.push(remainder)

  // A single sentence can still be far too long; fall back to word splitting.
  const hardLimit = maxLen * 1.5
  const result: string[] = []
  for (const piece of pieces) {
    if (piece.length > hardLimit) {
      result.push(...hardSplitByWords(piece, maxLen))
    } else {
      result.push(piece)
    }
  }
  return result
}
|
|
|
|
/**
 * Splits text on whitespace and greedily re-packs the words, joined by single
 * spaces, into chunks of at most `maxLen` characters.
 *
 * A single word longer than `maxLen` (unspaced text, a long URL, ...) is cut
 * into consecutive pieces of at most `maxLen` characters, so that it cannot
 * produce an arbitrarily large chunk. A surrogate pair is never cut in half,
 * which can make a piece one character longer when `maxLen` is 1.
 * When `maxLen` is not a finite number >= 1, words are left whole.
 *
 * @param text Text to split
 * @param maxLen Maximum length of a chunk, in characters
 * @returns the trimmed, non-empty chunks in their original order
 */
function hardSplitByWords(text: string, maxLen: number): string[] {
  // 0 disables slicing; it also guards the loop below against a zero step.
  const pieceLen = Number.isFinite(maxLen) && maxLen >= 1 ? Math.floor(maxLen) : 0

  // First pass: turn the words into units that each fit within the limit.
  const units: string[] = []
  for (const word of text.split(/\s+/)) {
    if (pieceLen === 0 || word.length <= pieceLen) {
      units.push(word)
      continue
    }

    let start = 0
    while (start < word.length) {
      let end = Math.min(start + pieceLen, word.length)
      if (end < word.length) {
        const lastUnit = word.charCodeAt(end - 1)
        const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff
        if (endsOnHighSurrogate) {
          // Move the cut off the middle of the pair: back if possible, else forward.
          end += end - start > 1 ? -1 : 1
        }
      }
      units.push(word.slice(start, end))
      start = end
    }
  }

  // Second pass: greedy packing of the units.
  const chunks: string[] = []
  let current = ''

  for (const unit of units) {
    if (current && `${current} ${unit}`.length > maxLen) {
      chunks.push(current.trim())
      current = unit
    } else {
      current = current ? `${current} ${unit}` : unit
    }
  }
  if (current.trim()) chunks.push(current.trim())

  return chunks
}
|
|
|
|
/**
 * Packs consecutive paragraphs into groups of roughly `maxLen` characters.
 *
 * Paragraphs are appended to the current group, separated by a blank line,
 * until adding the next one would exceed `maxLen`. The group is then emitted
 * and the next group starts with the last `overlap` characters of the emitted
 * one followed by the paragraph, so a group may be longer than `maxLen`.
 *
 * @param paragraphs Paragraphs to group, in order
 * @param maxLen Target maximum length of a group, in characters
 * @param overlap Number of trailing characters carried over into the next
 *   group. Replaced by half of `maxLen` when larger than `maxLen`; values that
 *   are zero, negative or not a number mean no overlap.
 * @returns the trimmed, non-empty groups in order
 */
function groupParagraphsByMaxContentLen(
  paragraphs: string[],
  maxLen: number,
  overlap: number,
): string[] {
  if (paragraphs.length === 0) return []

  // Resolve the overlap once, into a local, as a non-negative integer.
  const requested = overlap > maxLen ? Math.floor(maxLen / 2) : overlap
  const tailLen = Math.max(0, Math.floor(requested)) || 0

  const result: string[] = []
  let current = ''

  for (const para of paragraphs) {
    if (current.length + para.length > maxLen && current) {
      result.push(current.trim())
      // slice(-0) is slice(0) and would copy the whole previous group, so a
      // zero overlap has to be handled explicitly.
      const tail = tailLen > 0 ? current.slice(-tailLen) : ''
      // NOTE(review): the tail runs straight into the paragraph with no
      // separator. Left as is because changing it would alter the content,
      // and therefore the hash, of fragments that already exist.
      current = `${tail}${para}`
    } else {
      current = current ? `${current}\n\n${para}` : para
    }
  }
  if (current.trim()) result.push(current.trim())

  return result
}
|