Files
Keep/keep-notes/lib/ai/services/semantic-search.service.ts
Sepehr Ramezani 3ef5915062 feat(db): extraction des embeddings + mode WAL + config DB provider-agnostic
- Ajout de la table de relation 1-1 NoteEmbedding pour alléger Model Note
- Refactor complet des actions IA sémantique et Memory Echo pour utiliser la jointure
- Migration propre des 85 embeddings locaux existants
- Ajout PRAGMA journal_mode=WAL pour la concurrence au sein de lib/prisma
- Ajout npm run db:switch pour configuration auto SQLite / PostgreSQL
- Fix du compilateur Turbopack et Next-PWA
2026-04-17 22:05:19 +02:00

335 lines
9.3 KiB
TypeScript

/**
* Semantic Search Service
* Hybrid search combining keyword matching and semantic similarity
* Uses Reciprocal Rank Fusion (RRF) for result ranking
*/
import { embeddingService } from './embedding.service'
import { prisma } from '@/lib/prisma'
import { auth } from '@/auth'
/**
 * One note returned by the semantic search service.
 */
export interface SearchResult {
/** Identifier of the matching note. */
noteId: string
/** Note title; may be null. */
title: string | null
/** Note content as returned by the note lookup. */
content: string
/** Relevance score assigned by the search service; results are sorted with higher scores first. */
score: number
/** Coarse label assigned by the search service to describe how closely the note matched. */
matchType: 'exact' | 'related'
/** Language stored on the note, when available. */
language?: string | null
}
/**
 * Optional tuning parameters accepted by `SemanticSearchService.search`.
 */
export interface SearchOptions {
/** Maximum number of results to return (the service defaults to 20). */
limit?: number
/** Minimum similarity score (0-1) a note must reach in the vector search (the service defaults to 0.6). */
threshold?: number
/**
 * NOTE(review): the search implementation in this file does not act on this
 * flag; confirm the intended behaviour before relying on it.
 */
includeExactMatches?: boolean
/** When provided, restricts the search to notes belonging to this notebook (IA5). */
notebookId?: string
}
/**
 * Hybrid note search: a literal keyword search and an embedding-based
 * similarity search are run side by side and merged with Reciprocal Rank
 * Fusion (RRF). Also exposes helpers to (re)index note embeddings.
 */
export class SemanticSearchService {
  /** RRF smoothing constant `k` in `1 / (k + rank)`. */
  private readonly RRF_K = 60
  /** Result cap used when the caller does not pass `limit`. */
  private readonly DEFAULT_LIMIT = 20
  /** Cosine-similarity floor used when the caller does not pass `threshold`. */
  private readonly DEFAULT_THRESHOLD = 0.6

  /**
   * Hybrid search: keyword + semantic with RRF fusion.
   *
   * `options.includeExactMatches` is accepted for interface compatibility but
   * is not consulted by this implementation.
   *
   * @param query Free-text query; queries shorter than two non-blank characters return `[]`.
   * @param options Optional limit, similarity threshold and notebook filter.
   * @returns Results sorted by descending score, at most `limit` entries.
   */
  async search(
    query: string,
    options: SearchOptions = {}
  ): Promise<SearchResult[]> {
    const {
      limit = this.DEFAULT_LIMIT,
      threshold = this.DEFAULT_THRESHOLD,
      notebookId
    } = options

    if (!query || query.trim().length < 2) {
      return []
    }

    const session = await auth()
    // NOTE(review): when the session carries no user id, no owner filter is
    // applied below and the search spans every user's notes. Confirm this is
    // intended.
    const userId = session?.user?.id ?? null

    try {
      // The two searches are independent, so run them concurrently.
      const [keywordResults, semanticResults] = await Promise.all([
        this.keywordSearch(query, userId, notebookId),
        this.semanticVectorSearch(query, userId, threshold, notebookId)
      ])

      const fusedResults = await this.reciprocalRankFusion(
        keywordResults,
        semanticResults
      )

      return fusedResults
        .sort((a, b) => b.score - a.score)
        .slice(0, limit)
    } catch (error) {
      console.error('Error in hybrid search:', error)

      // Fallback to keyword-only search, keeping the same notebook scope as
      // the primary path.
      const keywordResults = await this.keywordSearch(query, userId, notebookId)
      const noteIds = keywordResults.slice(0, limit).map(r => r.noteId)
      const notes = await prisma.note.findMany({
        where: { id: { in: noteIds } },
        select: {
          id: true,
          title: true,
          content: true,
          language: true
        }
      })

      // An `in` lookup does not guarantee ordering, so re-apply the keyword
      // ranking by walking the ranked ids.
      const notesById = new Map(notes.map(note => [note.id, note] as const))
      const fallbackResults: SearchResult[] = []
      for (const noteId of noteIds) {
        const note = notesById.get(noteId)
        if (!note) {
          continue
        }
        fallbackResults.push({
          noteId: note.id,
          title: note.title,
          content: note.content,
          score: 1.0, // Default score for keyword-only results
          matchType: 'exact', // every fallback result is a literal keyword hit
          language: note.language
        })
      }
      return fallbackResults
    }
  }

  /**
   * Keyword search using a Prisma `contains` filter on title and content.
   *
   * Notes are ordered by a simple heuristic (title hits beat content hits,
   * more occurrences are better) and then assigned 1-based positions, which is
   * the rank shape Reciprocal Rank Fusion expects.
   *
   * @returns Matching note ids with `rank` 1 for the best match.
   */
  private async keywordSearch(
    query: string,
    userId: string | null,
    notebookId?: string
  ): Promise<Array<{ noteId: string; rank: number }>> {
    const notes = await prisma.note.findMany({
      where: {
        ...(userId ? { userId } : {}),
        ...(notebookId !== undefined ? { notebookId } : {}),
        OR: [
          { title: { contains: query } },
          { content: { contains: query } }
        ]
      },
      select: {
        id: true,
        title: true,
        content: true
      }
    })

    const queryLower = query.toLowerCase()
    // Escape regex metacharacters so queries such as "c++" or "(" are matched
    // literally instead of throwing a SyntaxError. Built once, reused per note.
    const occurrencePattern = new RegExp(
      queryLower.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'),
      'gi'
    )

    const scored = notes.map(note => {
      const title = note.title || ''
      const content = note.content || ''

      const titleMatches = (title.match(occurrencePattern) || []).length
      const contentMatches = (content.match(occurrencePattern) || []).length

      const titlePosition = title.toLowerCase().indexOf(queryLower)
      const contentPosition = content.toLowerCase().indexOf(queryLower)

      // Heuristic ordering value (lower is better). It can go negative, so it
      // is only used for sorting and never handed to RRF directly.
      let heuristic = 100
      if (titleMatches > 0) {
        heuristic = titlePosition === 0 ? 1 : 10
        heuristic -= titleMatches * 2
      } else if (contentMatches > 0) {
        heuristic = contentPosition < 100 ? 20 : 30
        heuristic -= contentMatches
      }

      return { noteId: note.id, heuristic }
    })

    return scored
      .sort((a, b) => a.heuristic - b.heuristic)
      .map((entry, index) => ({
        noteId: entry.noteId,
        rank: index + 1 // 1-based rank
      }))
  }

  /**
   * Semantic vector search using embeddings.
   *
   * Generates an embedding for the query, compares it against every stored
   * note embedding in scope and keeps the notes whose cosine similarity is at
   * least `threshold`. Any failure is logged and yields an empty list so the
   * caller can still use keyword results.
   *
   * @returns Note ids with 1-based ranks, most similar first.
   */
  private async semanticVectorSearch(
    query: string,
    userId: string | null,
    threshold: number,
    notebookId?: string
  ): Promise<Array<{ noteId: string; rank: number }>> {
    try {
      const { embedding: queryEmbedding } = await embeddingService.generateEmbedding(query)

      const notes = await prisma.note.findMany({
        where: {
          ...(userId ? { userId } : {}),
          ...(notebookId !== undefined ? { notebookId } : {}),
          noteEmbedding: { isNot: null }
        },
        select: {
          id: true,
          noteEmbedding: true
        }
      })

      if (notes.length === 0) {
        return []
      }

      const similarities: Array<{ noteId: string; similarity: number }> = []
      for (const note of notes) {
        // Skip rows with a missing or corrupt embedding instead of letting a
        // single bad record disable semantic search for the whole query.
        const noteEmbedding = this.parseEmbedding(note.noteEmbedding?.embedding)
        if (noteEmbedding === null) {
          continue
        }
        similarities.push({
          noteId: note.id,
          similarity: embeddingService.calculateCosineSimilarity(
            queryEmbedding,
            noteEmbedding
          )
        })
      }

      return similarities
        .filter(s => s.similarity >= threshold)
        .sort((a, b) => b.similarity - a.similarity)
        .map((s, index) => ({
          noteId: s.noteId,
          rank: index + 1 // 1-based rank
        }))
    } catch (error) {
      console.error('Error in semantic vector search:', error)
      return []
    }
  }

  /**
   * Decode a stored embedding (a JSON-encoded array of numbers).
   *
   * @returns The vector, or `null` when the value is absent, not valid JSON,
   * empty, or not an array of numbers.
   */
  private parseEmbedding(raw: unknown): number[] | null {
    if (typeof raw !== 'string' || raw.length === 0) {
      return null
    }
    try {
      const parsed: unknown = JSON.parse(raw)
      if (
        Array.isArray(parsed) &&
        parsed.length > 0 &&
        parsed.every(value => typeof value === 'number')
      ) {
        return parsed as number[]
      }
      return null
    } catch {
      return null
    }
  }

  /**
   * Reciprocal Rank Fusion algorithm.
   * Combines the two ranked lists into a single scored list.
   * Each list contributes `1 / (k + rank)` per note, with k = 60; a note
   * present in both lists receives the sum of both contributions.
   *
   * A result is labelled `'exact'` when the note is a literal keyword hit and
   * `'related'` when it was found through semantic similarity only.
   *
   * @returns Unsorted results carrying the fused score and note details.
   */
  private async reciprocalRankFusion(
    keywordResults: Array<{ noteId: string; rank: number }>,
    semanticResults: Array<{ noteId: string; rank: number }>
  ): Promise<SearchResult[]> {
    const scores = new Map<string, number>()
    for (const rankedList of [keywordResults, semanticResults]) {
      for (const result of rankedList) {
        const rrfScore = 1 / (this.RRF_K + result.rank)
        scores.set(result.noteId, (scores.get(result.noteId) ?? 0) + rrfScore)
      }
    }

    // Nothing matched in either list: skip the database round trip.
    if (scores.size === 0) {
      return []
    }

    const keywordHits = new Set(keywordResults.map(result => result.noteId))

    const notes = await prisma.note.findMany({
      where: { id: { in: Array.from(scores.keys()) } },
      select: {
        id: true,
        title: true,
        content: true,
        language: true
      }
    })

    return notes.map(note => {
      const matchType: SearchResult['matchType'] = keywordHits.has(note.id)
        ? 'exact'
        : 'related'
      return {
        noteId: note.id,
        title: note.title,
        content: note.content,
        score: scores.get(note.id) ?? 0,
        matchType,
        language: note.language
      }
    })
  }

  /**
   * Generate or update the embedding for a note.
   * Called when a note is created or significantly updated.
   *
   * The embedding service decides whether regeneration is needed; when it is
   * not, the method returns without writing anything.
   *
   * @throws Error when the note does not exist; any error from the embedding
   * service or the database is logged and rethrown.
   */
  async indexNote(noteId: string): Promise<void> {
    try {
      const note = await prisma.note.findUnique({
        where: { id: noteId },
        select: { content: true, noteEmbedding: true, lastAiAnalysis: true }
      })
      if (!note) {
        throw new Error('Note not found')
      }

      const shouldRegenerate = embeddingService.shouldRegenerateEmbedding(
        note.content,
        note.noteEmbedding?.embedding as any,
        note.lastAiAnalysis
      )
      if (!shouldRegenerate) {
        return
      }

      const { embedding } = await embeddingService.generateEmbedding(note.content)
      const serialized = embeddingService.serialize(embedding) as any

      // Store the vector in the 1-1 NoteEmbedding row, then stamp the note.
      await prisma.noteEmbedding.upsert({
        where: { noteId: noteId },
        create: { noteId: noteId, embedding: serialized },
        update: { embedding: serialized }
      })
      await prisma.note.update({
        where: { id: noteId },
        data: {
          lastAiAnalysis: new Date()
        }
      })
    } catch (error) {
      console.error(`Error indexing note ${noteId}:`, error)
      throw error
    }
  }

  /**
   * Batch index multiple notes (for initial migration or bulk updates).
   *
   * Notes are processed in groups of 10. A failure for one note is logged by
   * `indexNote` and does not stop the remaining notes from being indexed.
   */
  async indexBatchNotes(noteIds: string[]): Promise<void> {
    const BATCH_SIZE = 10 // Process in batches to avoid overwhelming
    for (let i = 0; i < noteIds.length; i += BATCH_SIZE) {
      const batch = noteIds.slice(i, i + BATCH_SIZE)
      await Promise.allSettled(
        batch.map(noteId => this.indexNote(noteId))
      )
    }
  }
}
/** Module-level instance of the search service, created once when this module is loaded. */
export const semanticSearchService = new SemanticSearchService()