/** One page of extracted document text handed to the chunker. */
interface ChunkInput {
  text: string
  pageNumber: number
}

/** A single chunk produced by {@link DocumentChunkingService.chunk}. */
export interface DocumentChunkData {
  /** Chunk text with surrounding whitespace trimmed. */
  content: string
  /** Zero-based position of the chunk across all pages passed to one `chunk` call. */
  chunkIndex: number
  /** `pageNumber` of the page that was being processed when the chunk was emitted. */
  pageNumber: number
  /**
   * Start offset of the (untrimmed) chunk buffer within the page's assembled
   * text stream: the page's sections re-joined by the chunker, preceded by any
   * overlap carried over from the previous page. Because sections are trimmed
   * and re-joined, this is not an exact index into the raw page text.
   */
  startChar: number
  /** `startChar` plus the length of the untrimmed chunk buffer. */
  endChar: number
  /** Never set by `DocumentChunkingService.chunk`; available for callers to fill in. */
  metadata?: string
}

/**
 * Splits page text into heading-delimited sections and packs those sections
 * into overlapping chunks.
 */
export class DocumentChunkingService {
  /**
   * Matches a trimmed line that opens a new section: a Markdown heading
   * (`#` .. `######` followed by whitespace) or a line made only of capital
   * letters and whitespace, at least six characters long.
   * Hoisted so the pattern is not rebuilt for every line.
   */
  private static readonly HEADING_PATTERN = /^(#{1,6}\s|[A-Z][A-Z\s]{5,}$)/

  /** Soft upper bound: a buffer is flushed before a section would push it past this length. */
  private readonly CHUNK_SIZE = 800
  /** Number of trailing characters of a flushed buffer repeated at the start of the next one. */
  private readonly OVERLAP = 200

  /**
   * Packs the sections of each page into chunks.
   *
   * Sections are appended to a buffer until adding the next one would exceed
   * `CHUNK_SIZE`; the buffer is then emitted and a new buffer is started from
   * the last `OVERLAP` characters of the emitted one. The overlap is also
   * carried across page boundaries. Pages whose text is empty after trimming
   * are skipped.
   *
   * NOTE(review): a single section longer than `CHUNK_SIZE` is kept whole, and
   * the overlap prefix is not counted against the limit when a buffer is
   * started, so emitted chunks can be longer than `CHUNK_SIZE`.
   *
   * @param pages - Pages in document order. The array is not modified.
   * @returns Chunks in emission order, with `chunkIndex` running from 0.
   */
  chunk(pages: readonly ChunkInput[]): DocumentChunkData[] {
    const chunks: DocumentChunkData[] = []
    let carriedTail = ''

    for (const page of pages) {
      const text = page.text.trim()
      if (!text) continue

      // Each page starts from the overlap of the previous page's last chunk.
      let buffer = carriedTail
      let bufferStart = 0
      // False while the buffer holds nothing but carried-over overlap.
      let hasNewContent = false

      for (const section of this.splitSections(text)) {
        const overflows =
          buffer.length > 0 && buffer.length + section.length > this.CHUNK_SIZE

        if (!overflows) {
          buffer += (buffer ? '\n\n' : '') + section
          hasNewContent = true
          continue
        }

        const tail = buffer.slice(-this.OVERLAP)
        // Only emit when the buffer contains something beyond the overlap;
        // otherwise the chunk would merely repeat the end of the previous
        // chunk under the wrong page number.
        if (hasNewContent) {
          chunks.push(
            this.buildChunk(buffer, bufferStart, chunks.length, page.pageNumber),
          )
          // The next buffer begins where the overlap began inside the flushed one.
          bufferStart += buffer.length - tail.length
        }
        buffer = tail + '\n' + section
        hasNewContent = true
      }

      if (hasNewContent) {
        chunks.push(
          this.buildChunk(buffer, bufferStart, chunks.length, page.pageNumber),
        )
        carriedTail = buffer.slice(-this.OVERLAP)
      }
    }

    return chunks
  }

  /** Builds the chunk record for a buffer that starts at `startChar`. */
  private buildChunk(
    buffer: string,
    startChar: number,
    chunkIndex: number,
    pageNumber: number,
  ): DocumentChunkData {
    return {
      content: buffer.trim(),
      chunkIndex,
      pageNumber,
      startChar,
      // Offsets describe the untrimmed buffer, so this may exceed content.length.
      endChar: startChar + buffer.length,
    }
  }

  /**
   * Splits text into sections, starting a new section at every heading line
   * (see `HEADING_PATTERN`). The heading line stays at the top of the section
   * it opens.
   *
   * @param text - Text to split on `\n` line boundaries.
   * @returns Trimmed, non-empty sections in their original order.
   */
  private splitSections(text: string): string[] {
    const sections: string[] = []
    let current = ''

    for (const line of text.split('\n')) {
      const isHeading = DocumentChunkingService.HEADING_PATTERN.test(line.trim())
      if (isHeading && current.trim()) {
        sections.push(current.trim())
        current = line
      } else {
        current += (current ? '\n' : '') + line
      }
    }

    if (current.trim()) sections.push(current.trim())
    return sections
  }
}

/** Shared default instance. */
export const documentChunkingService = new DocumentChunkingService()