All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 7s
- Add brainstorm feature with collaborative canvas, AI idea generation, live cursors, playback, and export - Add PDF upload/extraction/ingestion pipeline with pgvector document search (RAG) - Add document Q&A overlay with streaming chat and PDF preview - Add note attachments UI with status polling, grid layout, and auto-scroll - Add task extraction AI tool and agent executor improvements - Fix NoteEmbedding missing updatedAt column, re-index 66 notes with 1536-dim embeddings - Fix brainstorm 'Create Note' button: add success toast and redirect to created note - Fix memory echo notification infinite polling - Fix chat route to always include document_search tool - Add brainstorm i18n keys across all 14 locales - Add socket server for real-time brainstorm collaboration - Add hierarchical notebook selector and organize notebook dialog improvements - Add sidebar brainstorm section with session management - Update prisma schema with brainstorm tables, attachments, and document chunks
84 lines
2.2 KiB
TypeScript
84 lines
2.2 KiB
TypeScript
/**
 * One page of extracted document text handed to the chunker.
 */
interface ChunkInput {
  /** Raw text of the page; the chunker trims it and skips pages that are empty after trimming. */
  text: string
  /** Page number that is copied onto every chunk produced from this page. */
  pageNumber: number
}
|
|
|
|
/**
 * A single chunk of document text produced by the chunking service.
 */
export interface DocumentChunkData {
  /** Trimmed text of the chunk. */
  content: string
  /** Zero-based position of the chunk within the whole document (not reset per page). */
  chunkIndex: number
  /** Page number of the page being processed when the chunk was emitted. */
  pageNumber: number
  /** Start offset reported for the chunk; the chunker restarts offsets at 0 for every page. */
  startChar: number
  /** End offset reported for the chunk: start offset plus the length of the buffered (untrimmed) text. */
  endChar: number
  /** Optional metadata string; its format is not defined here and the chunker does not populate it. */
  metadata?: string
}
|
|
|
|
/**
 * Splits extracted document pages into overlapping text chunks.
 *
 * Each page is first divided into sections at heading-like lines. Sections are
 * then packed into a buffer; when the next section would push the buffer past
 * `CHUNK_SIZE`, the buffer is emitted as a chunk and the next buffer starts
 * with the last `OVERLAP` characters of the emitted one. The same overlap is
 * carried from the last chunk of a page into the first chunk of the next page.
 *
 * `CHUNK_SIZE` is a soft target: a chunk may additionally contain the carried
 * overlap and a separator, so its content never exceeds
 * `CHUNK_SIZE + OVERLAP + 2` characters.
 */
export class DocumentChunkingService {
  /** Soft limit on buffered text before a chunk is emitted. */
  private readonly CHUNK_SIZE = 800
  /** Number of trailing characters repeated at the start of the following chunk. */
  private readonly OVERLAP = 200

  /**
   * Heading detector: 1-6 `#` followed by whitespace, or a line of at least six
   * characters made only of A-Z and whitespace that starts with a letter.
   * Hoisted so it is not rebuilt for every line.
   */
  private static readonly HEADING_PATTERN = /^(#{1,6}\s|[A-Z][A-Z\s]{5,}$)/
  private static readonly WHITESPACE = /\s/

  /**
   * Chunk the given pages.
   *
   * @param pages Pages in document order. Pages whose text is empty after
   *   trimming are skipped.
   * @returns Chunks in emission order with consecutive `chunkIndex` values
   *   starting at 0. `startChar`/`endChar` are offsets into the page's
   *   assembled chunk stream (carried overlap + sections re-joined with
   *   separators), restarting at 0 for every page; within a page, each chunk
   *   starts exactly where the overlap it repeats began in the previous chunk.
   *   They are not offsets into the raw page text.
   */
  chunk(pages: ChunkInput[]): DocumentChunkData[] {
    const chunks: DocumentChunkData[] = []
    let previousTail = ''

    for (const page of pages) {
      const text = page.text.trim()
      if (!text) continue

      let buffer = previousTail
      let bufferStart = 0
      // False while the buffer holds nothing but overlap carried from the
      // previous page; such a buffer must never be emitted on its own.
      let hasPageContent = false

      for (const section of this.splitSections(text)) {
        for (const piece of this.splitOversized(section)) {
          if (hasPageContent && buffer.length + piece.length > this.CHUNK_SIZE) {
            chunks.push(this.buildChunk(buffer, bufferStart, page.pageNumber, chunks.length))
            const tail = buffer.slice(-this.OVERLAP)
            // Advance by the part of the old buffer that is NOT repeated, so
            // the new buffer's start coincides with where the tail began.
            // Must be computed before `buffer` is replaced.
            bufferStart += buffer.length - tail.length
            buffer = tail + '\n' + piece
          } else {
            buffer += (buffer ? '\n\n' : '') + piece
          }
          hasPageContent = true
        }
      }

      if (hasPageContent) {
        chunks.push(this.buildChunk(buffer, bufferStart, page.pageNumber, chunks.length))
        previousTail = buffer.slice(-this.OVERLAP)
      }
    }

    return chunks
  }

  /** Build the chunk record for the current buffer. */
  private buildChunk(
    buffer: string,
    bufferStart: number,
    pageNumber: number,
    chunkIndex: number,
  ): DocumentChunkData {
    return {
      content: buffer.trim(),
      chunkIndex,
      pageNumber,
      startChar: bufferStart,
      endChar: bufferStart + buffer.length,
    }
  }

  /**
   * Break a section longer than `CHUNK_SIZE` into pieces of at most
   * `CHUNK_SIZE` characters; shorter sections are returned unchanged.
   * Pieces are trimmed, so whitespace at a cut point is dropped.
   */
  private splitOversized(section: string): string[] {
    const pieces: string[] = []
    let rest = section

    while (rest.length > this.CHUNK_SIZE) {
      const cut = this.findBreak(rest)
      pieces.push(rest.slice(0, cut).trim())
      rest = rest.slice(cut).trim()
    }
    if (rest) pieces.push(rest)

    return pieces
  }

  /**
   * Pick the cut position for a text longer than `CHUNK_SIZE`: the last
   * whitespace character in the second half of the window if there is one,
   * otherwise a hard cut at `CHUNK_SIZE`. Always greater than `CHUNK_SIZE / 2`,
   * which guarantees progress.
   */
  private findBreak(text: string): number {
    const minimum = Math.floor(this.CHUNK_SIZE / 2)
    for (let i = this.CHUNK_SIZE; i > minimum; i--) {
      if (DocumentChunkingService.WHITESPACE.test(text.charAt(i))) return i
    }
    return this.CHUNK_SIZE
  }

  /**
   * Split page text into sections, starting a new section at every
   * heading-like line (the heading stays with the text that follows it).
   * Returned sections are trimmed and never empty.
   */
  private splitSections(text: string): string[] {
    const sections: string[] = []
    let current = ''

    for (const line of text.split('\n')) {
      const isHeading = DocumentChunkingService.HEADING_PATTERN.test(line.trim())
      if (isHeading && current.trim()) {
        sections.push(current.trim())
        current = line
      } else {
        current += (current ? '\n' : '') + line
      }
    }
    if (current.trim()) sections.push(current.trim())

    return sections
  }
}
|
|
|
|
/**
 * Module-level instance of the chunking service, exported so importers can use
 * it without constructing their own.
 */
export const documentChunkingService = new DocumentChunkingService()
|