181 lines
5.3 KiB
TypeScript
181 lines
5.3 KiB
TypeScript
/**
|
|
* Embedding Service
|
|
* Generates vector embeddings for semantic search and similarity analysis.
|
|
* Stores embeddings as native pgvector in PostgreSQL.
|
|
*/
|
|
|
|
import { withAiProviderFallback } from '../fallback'
|
|
import { getSystemConfig } from '@/lib/config'
|
|
import { prisma } from '@/lib/prisma'
|
|
import {
|
|
meanPoolEmbeddingVectors,
|
|
prepareNoteTextForEmbedding,
|
|
prepareTextForEmbedding,
|
|
splitPlainTextForEmbeddingChunks,
|
|
} from '@/lib/text/plain-text'
|
|
|
|
/**
 * Outcome of an embedding request: the vector itself plus metadata
 * describing how it was produced.
 */
export interface EmbeddingResult {
  /** The embedding vector. */
  embedding: number[]
  /** Name of the embedding model reported for this vector. */
  model: string
  /** Number of components in `embedding`. */
  dimension: number
  /** Number of plain-text characters that were indexed. */
  indexedChars?: number
  /** Number of API chunks used to build the vector. */
  chunkCount?: number
}
|
|
|
|
/**
 * Produces embedding vectors for notes and search queries, and offers helpers
 * to serialise, compare and validate those vectors.
 */
export class EmbeddingService {
  /**
   * Normalises raw content into the plain text that gets embedded.
   * Thin delegate to the imported `prepareTextForEmbedding` helper.
   */
  prepareTextForEmbedding(content: string): string {
    return prepareTextForEmbedding(content)
  }

  /**
   * Requests one embedding vector for already-prepared plain text.
   * Loads the system config on every call and routes the request through
   * `withAiProviderFallback` for the `'embedding'` capability.
   */
  private async embedPlainText(plain: string): Promise<number[]> {
    const config = await getSystemConfig()
    return withAiProviderFallback('embedding', config, (provider) =>
      provider.getEmbeddings(plain)
    )
  }

  /**
   * Assembles the result object shared by all public generators.
   * NOTE(review): the model name is a fixed label; the provider selected by
   * `withAiProviderFallback` may actually use a different model — confirm.
   */
  private buildResult(
    embedding: number[],
    indexedChars: number,
    chunkCount: number,
  ): EmbeddingResult {
    return {
      embedding,
      model: 'text-embedding-3-small',
      dimension: embedding.length,
      indexedChars,
      chunkCount,
    }
  }

  /**
   * Embeds a complete note (title + body).
   *
   * The prepared text is split by `splitPlainTextForEmbeddingChunks`, every
   * chunk is embedded concurrently, and the chunk vectors are combined with
   * `meanPoolEmbeddingVectors`. Chunk sizes are decided by that helper.
   *
   * @param title - Note title; may be null or undefined.
   * @param content - Note body.
   * @returns The pooled vector with `indexedChars` and `chunkCount` filled in.
   * @throws Error if the prepared text is empty or whitespace only.
   */
  async generateNoteEmbedding(
    title: string | null | undefined,
    content: string,
  ): Promise<EmbeddingResult> {
    const plain = prepareNoteTextForEmbedding(title, content)
    if (!plain.trim()) {
      throw new Error('Cannot generate embedding for empty note')
    }

    const chunks = splitPlainTextForEmbeddingChunks(plain)
    const vectors = await Promise.all(chunks.map((chunk) => this.embedPlainText(chunk)))
    const embedding = meanPoolEmbeddingVectors(vectors)

    return this.buildResult(embedding, plain.length, chunks.length)
  }

  /**
   * Embeds a short text such as a search query, using a single request.
   *
   * @throws Error if `text` is empty or whitespace only.
   */
  async generateEmbedding(text: string): Promise<EmbeddingResult> {
    if (!text || text.trim().length === 0) {
      throw new Error('Cannot generate embedding for empty text')
    }

    const plain = prepareTextForEmbedding(text)
    const embedding = await this.embedPlainText(plain)

    return this.buildResult(embedding, plain.length, 1)
  }

  /**
   * Embeds several texts concurrently, one request per text.
   *
   * Empty and whitespace-only entries are dropped before embedding, so the
   * returned array can be shorter than `texts` and its indexes do not
   * necessarily line up with the input.
   *
   * @returns One result per non-empty input, in input order.
   * @throws Re-throws the first embedding failure after logging it.
   */
  async generateBatchEmbeddings(texts: string[]): Promise<EmbeddingResult[]> {
    if (!texts || texts.length === 0) return []

    const prepared = texts
      .filter((text) => text && text.trim().length > 0)
      .map((text) => prepareTextForEmbedding(text))
    if (prepared.length === 0) return []

    try {
      // `return await` keeps rejections inside this try block.
      return await Promise.all(
        prepared.map(async (plain) =>
          this.buildResult(await this.embedPlainText(plain), plain.length, 1),
        ),
      )
    } catch (error) {
      console.error('Error generating batch embeddings:', error)
      throw error
    }
  }

  /** Serialises a vector as a bracketed, comma-separated literal, e.g. `[1,2,3]`. */
  toVectorString(embedding: number[]): string {
    return `[${embedding.join(',')}]`
  }

  /**
   * Parses a bracketed vector literal back into numbers.
   *
   * - Arrays are returned unchanged.
   * - Nullish, empty or non-string input yields `[]`.
   * - An empty literal (`[]`) yields `[]`; without this guard the single
   *   empty token would be converted to `0`.
   * - Surrounding whitespace is ignored.
   *
   * Tokens that are not numeric become `NaN`, as `Number` defines.
   */
  fromVectorString(vec: string | number[] | null | undefined): number[] {
    if (Array.isArray(vec)) return vec
    if (!vec || typeof vec !== 'string') return []

    const inner = vec.trim().replace(/^\[/, '').replace(/\]$/, '').trim()
    if (inner === '') return []

    return inner.split(',').map(Number)
  }

  /**
   * Cosine similarity of two vectors.
   *
   * When the lengths differ, only the leading components common to both are
   * compared. Returns 0 if either vector is empty or has zero magnitude over
   * the compared range.
   */
  calculateCosineSimilarity(a: number[], b: number[]): number {
    if (!a.length || !b.length) return 0

    const minLen = Math.min(a.length, b.length)
    let dot = 0
    let sumSquaresA = 0
    let sumSquaresB = 0
    for (let i = 0; i < minLen; i++) {
      dot += a[i] * b[i]
      sumSquaresA += a[i] * a[i]
      sumSquaresB += b[i] * b[i]
    }

    const magnitudeA = Math.sqrt(sumSquaresA)
    const magnitudeB = Math.sqrt(sumSquaresB)
    if (magnitudeA === 0 || magnitudeB === 0) return 0

    return dot / (magnitudeA * magnitudeB)
  }

  /**
   * Reads `atttypmod` of the `embedding` column of relation `NoteEmbedding`
   * from the Postgres catalog.
   * NOTE(review): presumably this value is the declared pgvector dimension;
   * confirm what it holds when the column has no explicit dimension.
   *
   * @returns The value from the first matching row, or null when no row is
   *          found, the value is null, or the query fails.
   */
  async getDbDimension(): Promise<number | null> {
    try {
      // Constant SQL with no interpolated input.
      const result: Array<{ dim: number | null }> = await prisma.$queryRawUnsafe(
        `SELECT a.atttypmod AS dim FROM pg_attribute a JOIN pg_class c ON a.attrelid = c.oid WHERE c.relname = 'NoteEmbedding' AND a.attname = 'embedding'`
      )
      return result[0]?.dim ?? null
    } catch {
      // Best effort: any failure is reported as "unknown".
      return null
    }
  }

  /**
   * Discovers the vector length currently produced by embedding a fixed
   * probe string. Returns null if the probe request fails.
   */
  async getModelDimension(): Promise<number | null> {
    try {
      const { dimension } = await this.generateEmbedding('dimension test')
      return dimension
    } catch {
      return null
    }
  }

  /**
   * Fetches the database and model dimensions in parallel and reports
   * whether they agree. `match` is true only when both are known and equal.
   */
  async validateDimension(): Promise<{ dbDimension: number | null; modelDimension: number | null; match: boolean }> {
    const [dbDimension, modelDimension] = await Promise.all([
      this.getDbDimension(),
      this.getModelDimension(),
    ])
    return {
      dbDimension,
      modelDimension,
      match: dbDimension !== null && modelDimension !== null && dbDimension === modelDimension,
    }
  }

  /**
   * Decides whether a note's embedding should be rebuilt.
   *
   * Returns true when `force` or `isClip` is set, when there is no previous
   * analysis date, or when the last analysis is more than 7 days old.
   * The two content parameters are accepted but not inspected.
   *
   * @param _noteContent - Unused.
   * @param _lastEmbeddingContent - Unused.
   * @param lastAnalysis - Timestamp of the previous analysis, if any.
   * @param options - `force` and `isClip` each trigger regeneration.
   */
  shouldRegenerateEmbedding(
    _noteContent: string,
    _lastEmbeddingContent: string | null,
    lastAnalysis: Date | null,
    options?: { force?: boolean; isClip?: boolean },
  ): boolean {
    if (options?.force) return true
    if (options?.isClip) return true
    if (!lastAnalysis) return true

    const daysSinceAnalysis = (Date.now() - lastAnalysis.getTime()) / (1000 * 60 * 60 * 24)
    return daysSinceAnalysis > 7
  }
}
|
|
|
|
/** Shared EmbeddingService instance, created when this module is loaded. */
export const embeddingService = new EmbeddingService()
|