// Momento/memento-note/tests/unit/chunking.test.ts

import { chunkNoteContent } from '../../lib/text/note-chunking'

/**
 * Runs one synchronous test case and reports its outcome on the console.
 *
 * A passing case logs a check mark followed by its name. A failing case logs
 * a cross with the failure reason and sets `process.exitCode` to 1; the error
 * is not rethrown, so later cases still execute.
 *
 * @param name - Label printed next to the pass/fail marker.
 * @param fn - Test body; any value it throws marks the case as failed.
 */
function test(name: string, fn: () => void): void {
  try {
    fn()
    console.log(`  ✓ ${name}`)
  } catch (err: unknown) {
    // Any value can be thrown; only Error instances are guaranteed a `.message`.
    const reason = err instanceof Error ? err.message : String(err)
    console.error(`  ✗ ${name}: ${reason}`)
    process.exitCode = 1
  }
}

/**
 * Minimal assertion helper: throws when `condition` is falsy.
 *
 * Declared as an assertion function so the compiler narrows `condition`
 * after a successful call.
 *
 * @param condition - Value expected to be truthy.
 * @param msg - Message carried by the `Error` thrown on failure.
 * @throws Error with `msg` when `condition` is falsy.
 */
function assert(condition: unknown, msg: string): asserts condition {
  if (!condition) throw new Error(msg)
}

// Opening banner for the US-CHUNK-1 suite.
console.log('\n=== US-CHUNK-1 : Chunking sémantique ===\n')

// An empty note is expected to produce no fragment at all.
test('note vide → aucun fragment', () => {
  const fragments = chunkNoteContent('note1', '')
  const count = fragments.length
  assert(count === 0, `attendu 0, reçu ${count}`)
})

// A five-character note is expected to be dropped entirely.
test('note très courte (< 10 chars) → aucun fragment', () => {
  const fragments = chunkNoteContent('note1', 'Hello')
  const count = fragments.length
  assert(count === 0, `attendu 0, reçu ${count}`)
})

// A short note must come back as exactly one fragment with consistent metadata.
test('note courte (< 1000 chars) → 1 seul fragment', () => {
  const shortNote = 'Ceci est une note courte. Elle parle de productivité et de gestion du temps.'
  const fragments = chunkNoteContent('note1', shortNote)
  assert(fragments.length === 1, `attendu 1, reçu ${fragments.length}`)

  // All remaining checks target the single fragment.
  const only = fragments[0]
  assert(only.chunkIndex === 0, 'chunkIndex doit être 0')
  assert(only.content.includes('productivité'), 'le contenu doit être préservé')
  assert(only.charCount === only.content.length, 'charCount doit correspondre')
})

// Ten long paragraphs must be split into several sequentially indexed fragments.
test('note longue avec plusieurs paragraphes → plusieurs fragments', () => {
  // Each paragraph is the same short sentence repeated 60 times.
  const longNote = Array.from({ length: 10 }, (_, n) => `Paragraphe ${n}. `.repeat(60).trim()).join(
    '\n\n',
  )
  const fragments = chunkNoteContent('note2', longNote)
  const total = fragments.length
  assert(total > 1, `attendu >1, reçu ${total}`)
  assert(total <= 15, `attendu <=15 fragments, reçu ${total}`)

  // chunkIndex must match the fragment's position in the returned array.
  for (const [position, fragment] of fragments.entries()) {
    assert(fragment.chunkIndex === position, `chunkIndex ${position} incorrect`)
  }
})

// Chunking the same note twice must yield the same fragmentId.
test('fragmentId est stable (déterministe)', () => {
  const sample = 'Même contenu donne même hash.'
  // Run both chunkings first, then compare the first fragment of each run.
  const runs = [1, 2].map(() => chunkNoteContent('noteA', sample))
  const [firstId, secondId] = runs.map((run) => run[0].fragmentId)
  assert(firstId === secondId, 'les hash doivent être identiques')
})

// Identical content under two different note ids must not share a fragmentId.
test('fragmentId diffère entre notes différentes', () => {
  const shared = 'Même contenu mais note différente.'
  const runs = ['noteA', 'noteB'].map((noteId) => chunkNoteContent(noteId, shared))
  const [idForA, idForB] = runs.map((run) => run[0].fragmentId)
  assert(idForA !== idForB, 'les hash doivent différer par noteId')
})

// A single oversized paragraph must be split into several fragments, each
// holding at most 2000 characters of content.
test('paragraphe géant (> 1500 chars) → sous-découpé aux phrases', () => {
  const giantPara =
    'Ceci est une phrase très longue. '.repeat(100) + 'Dernière phrase du paragraphe géant.'
  const chunks = chunkNoteContent('note3', giantPara)
  assert(chunks.length > 1, `attendu >1 fragment, reçu ${chunks.length}`)
  for (const chunk of chunks) {
    // Report the same quantity that is being checked (content length), so the
    // failure message cannot disagree with the condition.
    const measured = chunk.content.length
    assert(measured <= 2000, `fragment trop long: ${measured} chars`)
  }
})

// Persian (right-to-left) text must be chunked with its content left intact.
test('persan (RTL) → chunking correct', () => {
  const farsiNote =
    'یادداشت درباره بهره‌وری.\n\nاین یک پاراگراف فارسی است. این متن برای تست قالب‌بندی راست‌چین نوشته شده است. یادداشت‌های فارسی باید به درستی پردازش شوند.\n\nپاراگراف سوم. محتوای بیشتری برای اطمینان از صحت پردازش.'
  const fragments = chunkNoteContent('note-fa', farsiNote)
  const total = fragments.length
  assert(total >= 1, `attendu >=1, reçu ${total}`)

  // The first fragment must still contain a word from the opening sentence.
  const head = fragments[0]
  assert(head.content.includes('بهره‌وری'), 'contenu persan préservé')
})

// Plain text must pass through the chunker with its content preserved.
// HTML stripping is done upstream by prepareNoteTextForEmbedding, not by the chunker.
test('contenu plain text → pas de transformation', () => {
  const twoParagraphs = 'Premier paragraphe.\n\nDeuxième paragraphe.'
  const fragments = chunkNoteContent('note4', twoParagraphs)
  assert(fragments.length >= 1, 'au moins 1 fragment')

  const head = fragments[0]
  assert(head.content.includes('Premier'), 'contenu préservé')
})

// The same paragraph repeated three times must not yield duplicate fragmentIds.
test('paragraphe répété → dedup par fragmentId', () => {
  const repeatedPara = 'Paragraphe identique répété volontairement.'
  const text = `${repeatedPara}\n\n${repeatedPara}\n\n${repeatedPara}`
  const chunks = chunkNoteContent('note5', text)
  // Guard against a vacuous pass: with zero fragments the uniqueness check
  // below would compare 0 === 0 and succeed without verifying anything.
  assert(chunks.length >= 1, `attendu >=1, reçu ${chunks.length}`)
  const uniqueIds = new Set(chunks.map((c) => c.fragmentId))
  assert(uniqueIds.size === chunks.length, 'les doublons doivent être supprimés')
})

// Editing one paragraph must change some fragmentIds while leaving at least
// one fragment of the note with its previous id.
test('modification d\'un paragraphe → fragmentId change pour ce fragment uniquement', () => {
  // Three paragraphs, each a short sentence repeated 80 times.
  const [paraA, paraB, paraC] = ['A', 'B', 'C'].map((letter) =>
    `Section ${letter}. `.repeat(80).trim(),
  )

  // Only the first paragraph differs between the two versions.
  const untouchedTail = `\n\n${paraB}\n\n${paraC}`
  const original = paraA + untouchedTail
  const modified = `${paraA} MODIFIE.` + untouchedTail

  const before = chunkNoteContent('note6', original)
  const after = chunkNoteContent('note6', modified)

  assert(before.length >= 2, `original devrait avoir >=2 fragments, reçu ${before.length}`)

  const knownIds = new Set(before.map((fragment) => fragment.fragmentId))
  const afterIds = after.map((fragment) => fragment.fragmentId)

  // Count the ids of the modified note that already existed in the original.
  const keptCount = afterIds.filter((id) => knownIds.has(id)).length
  assert(keptCount >= 1, `au moins 1 fragment inchangé attendu, reçu ${keptCount} sur ${afterIds.length}`)
  assert(keptCount < afterIds.length, `au moins 1 fragment modifié attendu`)
})

// Consecutive fragments must share some text: part of the end of fragment 0
// has to reappear in fragment 1.
test('overlap entre fragments consécutifs', () => {
  const paragraphs: string[] = []
  for (let i = 0; i < 8; i++) {
    paragraphs.push(`Section ${i}. `.repeat(80).trim())
  }
  const text = paragraphs.join('\n\n')
  const chunks = chunkNoteContent('note7', text)

  // Fail loudly instead of silently skipping: with fewer than two fragments
  // the overlap check would never run and the case would pass vacuously.
  assert(chunks.length >= 2, `attendu >=2 fragments, reçu ${chunks.length}`)

  const tail = chunks[0].content.slice(-200)
  // A fragment that starts with the first 50 characters of `tail` necessarily
  // contains its first 30, so the `includes` check alone covers both cases.
  assert(
    chunks[1].content.includes(tail.slice(0, 30)),
    'l\'overlap devrait être présent entre fragments consécutifs',
  )
})

// Closing banner, printed after all cases above have been executed.
console.log('\n=== Tests terminés ===')