// Momento/memento-note/tests/unit/chunking.test.ts

import { test, expect, describe } from 'vitest'
import { chunkNoteContent } from '../../lib/text/note-chunking'

describe('US-CHUNK-1 : Chunking sémantique', () => {
  // Edge case: an empty note body is expected to produce zero fragments.
  test('note vide → aucun fragment', () => {
    const fragmentCount = chunkNoteContent('note1', '').length
    expect(fragmentCount).toBe(0)
  })

  // Edge case: 'Hello' (5 characters) is below the 10-character floor named in the
  // title, so no fragment is expected.
  test('note très courte (< 10 chars) → aucun fragment', () => {
    const tinyNote = 'Hello'
    const fragmentCount = chunkNoteContent('note1', tinyNote).length
    expect(fragmentCount).toBe(0)
  })

  // A note well under the 1000-character size named in the title should come back as a
  // single fragment: index 0, containing the text, with charCount equal to its content length.
  test('note courte (< 1000 chars) → 1 seul fragment', () => {
    const shortNote = 'Ceci est une note courte. Elle parle de productivité et de gestion du temps.'
    const fragments = chunkNoteContent('note1', shortNote)
    expect(fragments.length).toBe(1)

    const only = fragments[0]
    expect(only.chunkIndex).toBe(0)
    expect(only.content).toContain('productivité')
    expect(only.charCount).toBe(only.content.length)
  })

  // Ten paragraphs separated by blank lines should be split into several fragments
  // (capped at 15 here) whose chunkIndex matches their position in the returned array.
  test('note longue avec plusieurs paragraphes → plusieurs fragments', () => {
    const longNote = Array.from({ length: 10 }, (_, n) =>
      `Paragraphe ${n}. `.repeat(60).trim(),
    ).join('\n\n')
    const fragments = chunkNoteContent('note2', longNote)

    expect(fragments.length).toBeGreaterThan(1)
    expect(fragments.length).toBeLessThanOrEqual(15)
    fragments.forEach((fragment, position) => {
      expect(fragment.chunkIndex).toBe(position)
    })
  })

  // Chunking the same note id and the same text twice must give the same fragmentId
  // for the first fragment of each run.
  test('fragmentId est stable (déterministe)', () => {
    const body = 'Même contenu donne même hash.'
    const firstRun = chunkNoteContent('noteA', body)
    const secondRun = chunkNoteContent('noteA', body)

    const firstId = firstRun[0].fragmentId
    const secondId = secondRun[0].fragmentId
    expect(firstId).toBe(secondId)
  })

  // Identical text chunked under two different note ids must not share a fragmentId.
  test('fragmentId diffère entre notes différentes', () => {
    const body = 'Même contenu mais note différente.'
    const fromNoteA = chunkNoteContent('noteA', body)
    const fromNoteB = chunkNoteContent('noteB', body)

    const idInNoteA = fromNoteA[0].fragmentId
    const idInNoteB = fromNoteB[0].fragmentId
    expect(idInNoteA).not.toBe(idInNoteB)
  })

  // A single paragraph of 101 sentences (no blank line anywhere) must still be split
  // into several fragments, each at most 2000 characters long.
  // NOTE(review): the title mentions 1500 chars while the asserted bound is 2000 —
  // confirm which limit is intended.
  test('paragraphe géant (> 1500 chars) → sous-découpé aux phrases', () => {
    const repeatedSentences = 'Ceci est une phrase très longue. '.repeat(100)
    const giantParagraph = `${repeatedSentences}Dernière phrase du paragraphe géant.`
    const fragments = chunkNoteContent('note3', giantParagraph)

    expect(fragments.length).toBeGreaterThan(1)
    fragments.forEach((fragment) => {
      expect(fragment.content.length).toBeLessThanOrEqual(2000)
    })
  })

  // Right-to-left Persian text made of three paragraphs must yield at least one fragment,
  // and the first fragment must still contain a word taken from the first paragraph.
  test('persan (RTL) → chunking correct', () => {
    const rtlNote =
      'یادداشت درباره بهره‌وری.\n\nاین یک پاراگراف فارسی است. این متن برای تست قالب‌بندی راست‌چین نوشته شده است. یادداشت‌های فارسی باید به درستی پردازش شوند.\n\nپاراگراف سوم. محتوای بیشتری برای اطمینان از صحت پردازش.'
    const fragments = chunkNoteContent('note-fa', rtlNote)
    expect(fragments.length).toBeGreaterThanOrEqual(1)

    const leading = fragments[0]
    expect(leading.content).toContain('بهره‌وری')
  })

  // Two plain-text paragraphs must produce at least one fragment whose content still
  // holds the opening word of the note.
  test('contenu plain text → pas de transformation', () => {
    const rawNote = 'Premier paragraphe.\n\nDeuxième paragraphe.'
    const fragments = chunkNoteContent('note4', rawNote)
    expect(fragments.length).toBeGreaterThanOrEqual(1)

    const leading = fragments[0]
    expect(leading.content).toContain('Premier')
  })

  // The same paragraph repeated three times in one note must never yield two fragments
  // that share a fragmentId.
  test('paragraphe répété → dedup par fragmentId', () => {
    const repeatedPara = 'Paragraphe identique répété volontairement.'
    const text = [repeatedPara, repeatedPara, repeatedPara].join('\n\n')
    const chunks = chunkNoteContent('note5', text)

    // Without this assertion an empty result would satisfy the uniqueness check
    // below (0 unique ids === 0 chunks) and the test would pass without verifying anything.
    expect(chunks.length).toBeGreaterThanOrEqual(1)

    const uniqueIds = new Set(chunks.map((c) => c.fragmentId))
    expect(uniqueIds.size).toBe(chunks.length)
  })

  // Editing only the first of three long paragraphs should leave at least one fragmentId
  // untouched while changing at least one other: some ids from the original chunking must
  // reappear in the modified one, but not all of them.
  test('modification d\'un paragraphe → fragmentId change pour ce fragment uniquement', () => {
    // Builds one long paragraph by repeating a short labelled sentence 80 times.
    const section = (label: string): string => `Section ${label}. `.repeat(80).trim()
    const sectionA = section('A')
    const sectionB = section('B')
    const sectionC = section('C')

    const before = [sectionA, sectionB, sectionC].join('\n\n')
    const after = [`${sectionA} MODIFIE.`, sectionB, sectionC].join('\n\n')

    const fragmentsBefore = chunkNoteContent('note6', before)
    const fragmentsAfter = chunkNoteContent('note6', after)

    expect(fragmentsBefore.length).toBeGreaterThanOrEqual(2)

    // Count the fragments of the modified note whose id already existed before the edit.
    const knownIds = new Set(fragmentsBefore.map((fragment) => fragment.fragmentId))
    const survivingCount = fragmentsAfter.filter((fragment) => knownIds.has(fragment.fragmentId)).length

    expect(survivingCount).toBeGreaterThanOrEqual(1)
    expect(survivingCount).toBeLessThan(fragmentsAfter.length)
  })

  // Consecutive fragments are expected to share text: a piece of the last 200 characters
  // of the first fragment must reappear somewhere in the second fragment.
  test('overlap entre fragments consécutifs', () => {
    const text = Array.from({ length: 8 }, (_, i) => `Section ${i}. `.repeat(80).trim()).join('\n\n')
    const chunks = chunkNoteContent('note7', text)

    // Fail loudly instead of skipping: with fewer than two fragments the overlap
    // would never be checked and the test would pass without verifying anything.
    expect(chunks.length).toBeGreaterThanOrEqual(2)

    const tail = chunks[0].content.slice(-200)
    // A 30-character probe from the start of the tail is sufficient; a match on any
    // longer prefix of the tail would necessarily contain this probe as well.
    expect(chunks[1].content).toContain(tail.slice(0, 30))
  })
})