/**
 * Embedding Service
 * Generates vector embeddings for semantic search and similarity analysis
 * Uses the text-embedding-3-small model via OpenAI (or an Ollama alternative)
 */

import { getAIProvider } from '../factory'

export interface EmbeddingResult {
  embedding: number[]
  model: string
  dimension: number
}

/**
 * Service for generating and managing text embeddings
 */
export class EmbeddingService {
  private readonly EMBEDDING_MODEL = 'text-embedding-3-small'
  private readonly EMBEDDING_DIMENSION = 1536 // OpenAI's embedding dimension

  /**
   * Generate an embedding for a single text
   */
  async generateEmbedding(text: string): Promise<EmbeddingResult> {
    if (!text || text.trim().length === 0) {
      throw new Error('Cannot generate embedding for empty text')
    }

    try {
      const provider = getAIProvider()

      // Use the existing getEmbeddings method from AIProvider
      const embedding = await provider.getEmbeddings(text)

      // Validate the embedding dimension (local/Ollama models may return a different size)
      if (embedding.length !== this.EMBEDDING_DIMENSION) {
        console.warn(
          `Unexpected embedding dimension: got ${embedding.length}, expected ${this.EMBEDDING_DIMENSION}`
        )
      }

      return {
        embedding,
        model: this.EMBEDDING_MODEL,
        dimension: embedding.length
      }
    } catch (error) {
      console.error('Error generating embedding:', error)
      throw new Error(`Failed to generate embedding: ${error}`)
    }
  }

  /**
   * Generate embeddings for multiple texts
   * Requests run concurrently, so this is faster than awaiting generateEmbedding sequentially
   */
  async generateBatchEmbeddings(texts: string[]): Promise<EmbeddingResult[]> {
    if (!texts || texts.length === 0) {
      return []
    }

    // Filter out empty texts
    const validTexts = texts.filter(t => t && t.trim().length > 0)

    if (validTexts.length === 0) {
      return []
    }

    try {
      const provider = getAIProvider()

      // Issue one getEmbeddings call per text, in parallel
      const embeddings = await Promise.all(
        validTexts.map(text => provider.getEmbeddings(text))
      )

      return embeddings.map(embedding => ({
        embedding,
        model: this.EMBEDDING_MODEL,
        dimension: embedding.length
      }))
    } catch (error) {
      console.error('Error generating batch embeddings:', error)
      throw error
    }
  }

  /**
   * Calculate the cosine similarity between two embeddings
   * Returns a value between -1 and 1, where 1 means identical direction
   */
  calculateCosineSimilarity(embedding1: number[], embedding2: number[]): number {
    if (embedding1.length !== embedding2.length) {
      throw new Error('Embeddings must have the same dimension')
    }

    let dotProduct = 0
    let magnitude1 = 0
    let magnitude2 = 0

    for (let i = 0; i < embedding1.length; i++) {
      dotProduct += embedding1[i] * embedding2[i]
      magnitude1 += embedding1[i] * embedding1[i]
      magnitude2 += embedding2[i] * embedding2[i]
    }

    magnitude1 = Math.sqrt(magnitude1)
    magnitude2 = Math.sqrt(magnitude2)

    if (magnitude1 === 0 || magnitude2 === 0) {
      return 0
    }

    return dotProduct / (magnitude1 * magnitude2)
  }

  /**
   * Calculate similarities between one query embedding and multiple target embeddings
   * Returns an array of similarity scores in the same order as the targets
   */
  calculateSimilarities(
    queryEmbedding: number[],
    targetEmbeddings: number[][]
  ): number[] {
    return targetEmbeddings.map(embedding =>
      this.calculateCosineSimilarity(queryEmbedding, embedding)
    )
  }

  /**
   * Find the embeddings most similar to a query
   * Returns the top-k results with their similarities
   */
  findMostSimilar(
    queryEmbedding: number[],
    targetEmbeddings: Array<{ id: string; embedding: number[] }>,
    topK: number = 10
  ): Array<{ id: string; similarity: number }> {
    const similarities = targetEmbeddings.map(({ id, embedding }) => ({
      id,
      similarity: this.calculateCosineSimilarity(queryEmbedding, embedding)
    }))

    // Sort by similarity descending and return the top-k
    return similarities
      .sort((a, b) => b.similarity - a.similarity)
      .slice(0, topK)
  }
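
  /*
   * Worked example for the similarity helpers above, using toy 2-dimensional
   * vectors rather than real 1536-dimensional embeddings (illustrative only):
   * for v1 = [1, 0] and v2 = [0.6, 0.8],
   *   dot(v1, v2) = 0.6, |v1| = 1, |v2| = 1,
   * so calculateCosineSimilarity(v1, v2) returns 0.6, and findMostSimilar
   * ranks a candidate with that score above one scoring, say, 0.3.
   */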

  /**
   * Get the average embedding from multiple embeddings
   * Useful for clustering or centroid calculation
   */
  averageEmbeddings(embeddings: number[][]): number[] {
    if (embeddings.length === 0) {
      throw new Error('Cannot average empty embeddings array')
    }

    const dimension = embeddings[0].length
    const average = new Array(dimension).fill(0)

    for (const embedding of embeddings) {
      if (embedding.length !== dimension) {
        throw new Error('All embeddings must have the same dimension')
      }
      for (let i = 0; i < dimension; i++) {
        average[i] += embedding[i]
      }
    }

    // Divide by the number of embeddings
    return average.map(val => val / embeddings.length)
  }

  /**
   * Serialize an embedding to a JSON-safe format (for storage)
   */
  serialize(embedding: number[]): string {
    return JSON.stringify(embedding)
  }

  /**
   * Deserialize an embedding from a JSON string
   */
  deserialize(jsonString: string): number[] {
    try {
      const parsed = JSON.parse(jsonString)
      if (!Array.isArray(parsed)) {
        throw new Error('Invalid embedding format')
      }
      return parsed
    } catch (error) {
      console.error('Error deserializing embedding:', error)
      throw new Error('Failed to deserialize embedding')
    }
  }

  /**
   * Check whether a note needs embedding regeneration
   * (e.g., if its content has changed significantly)
   */
  shouldRegenerateEmbedding(
    noteContent: string,
    lastEmbeddingContent: string | null,
    lastAnalysis: Date | null
  ): boolean {
    // If there is no previous embedding, generate one
    if (!lastEmbeddingContent || !lastAnalysis) {
      return true
    }

    // If content has changed more than 20% (simple length-based heuristic)
    const contentChanged =
      Math.abs(noteContent.length - lastEmbeddingContent.length) / lastEmbeddingContent.length > 0.2

    // If the last analysis is more than 7 days old
    const daysSinceAnalysis = (Date.now() - lastAnalysis.getTime()) / (1000 * 60 * 60 * 24)
    const isStale = daysSinceAnalysis > 7

    return contentChanged || isStale
  }
}

// Singleton instance
export const embeddingService = new EmbeddingService()
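
/*
 * Usage sketch (illustrative only, not part of this module's API): assumes the
 * provider returned by getAIProvider() is configured, and that `notes` is a
 * hypothetical array whose items carry an `id` and a serialized `embeddingJson`.
 *
 *   const query = await embeddingService.generateEmbedding('project planning ideas')
 *   const candidates = notes.map(note => ({
 *     id: note.id,
 *     embedding: embeddingService.deserialize(note.embeddingJson)
 *   }))
 *   const topMatches = embeddingService.findMostSimilar(query.embedding, candidates, 5)
 *   // => e.g. [{ id: 'note-42', similarity: 0.87 }, ...]
 */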