feat: migrate semantic search to pgvector + full-text search

Replace JSON-string embeddings with native pgvector(1536) storage and add PostgreSQL full-text search (tsvector/GIN) with Reciprocal Rank Fusion for hybrid keyword + semantic ranking.

Changes:
- NoteEmbedding.embedding: String → vector(1536) via pgvector
- NoteEmbedding: added updatedAt for reindex tracking
- Note: added tsv (tsvector) with auto-update trigger for FTS
- semantic-search.service: hybrid FTS + vector search with RRF fusion
- embedding.service: toVectorString() for pgvector SQL literals
- Removed JS-side cosine similarity loops (now DB-side via <=>)
- Added HNSW index on NoteEmbedding.embedding (cosine distance)
- Added GIN index on Note.tsv for FTS queries

Schema migration in: prisma/migrations/20260512120000_pgvector_and_fts_search/

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-12 07:03:56 +00:00
parent 92c3a6f307
commit 03e6a62b80
43 changed files with 4024 additions and 786 deletions
--- a/memento-note/lib/ai/services/semantic-search.service.ts
+++ b/memento-note/lib/ai/services/semantic-search.service.ts
@@ -1,7 +1,12 @@
 /**
 * Semantic Search Service
- * Hybrid search combining keyword matching and semantic similarity
- * Uses Reciprocal Rank Fusion (RRF) for result ranking
+ *
+ * Unified hybrid search combining:
+ *   1. PostgreSQL full-text search (tsvector / tsquery) via GIN index
+ *   2. pgvector cosine-distance nearest-neighbor search via HNSW index
+ *   3. Reciprocal Rank Fusion (RRF) for final ranking
+ *
+ * All vector operations happen in the database — no JS cosine-similarity loops.
 */

 import { embeddingService } from './embedding.service'
@@ -19,19 +24,22 @@ export interface SearchResult {

 export interface SearchOptions {
  limit?: number
-  threshold?: number // Minimum similarity score (0-1)
+  threshold?: number // Minimum cosine similarity a note must reach to count as a semantic match
  includeExactMatches?: boolean
-  notebookId?: string // NEW: Filter by notebook for contextual search (IA5)
-  defaultTitle?: string // Optional default title for untitled notes (i18n)
+  notebookId?: string // Restrict the search to this notebook; leave undefined to search every notebook
+  defaultTitle?: string // Title substituted for notes whose own title is empty
 }

 export class SemanticSearchService {
-  private readonly RRF_K = 60 // RRF constant (default recommended value)
+  private readonly RRF_K = 60 // k in the RRF term 1 / (k + rank)
  private readonly DEFAULT_LIMIT = 20
-  private readonly DEFAULT_THRESHOLD = 0.6
+  private readonly DEFAULT_THRESHOLD = 0.3 // minimum cosine similarity when the caller gives no threshold
+  private readonly VECTOR_CANDIDATES = 50 // row cap (LIMIT) of the pgvector query
+  private readonly FTS_CANDIDATES = 50 // row cap (LIMIT) of the full-text query

  /**
-   * Hybrid search: keyword + semantic with RRF fusion
+   * Hybrid search: FTS + pgvector with RRF fusion.
+   * Resolves the current user through auth(); the variant that takes an explicit userId (used by agent tools) appears to be the sibling method below.
   */
  async search(
    query: string,
@@ -40,292 +48,15 @@ export class SemanticSearchService {
    const {
      limit = this.DEFAULT_LIMIT,
      threshold = this.DEFAULT_THRESHOLD,
-      includeExactMatches = true,
-      notebookId, // NEW: Contextual search within notebook (IA5)
-      defaultTitle = 'Untitled' // Default title for i18n
+      notebookId,
+      defaultTitle = 'Untitled'
    } = options

-    if (!query || query.trim().length < 2) {
-      return []
-    }
+    if (!query || query.trim().length < 2) return []

    const session = await auth()
    const userId = session?.user?.id || null
-
-    try {
-      // 1. Keyword search (SQLite FTS)
-      const keywordResults = await this.keywordSearch(query, userId, notebookId)
-
-      // 2. Semantic search (vector similarity)
-      const semanticResults = await this.semanticVectorSearch(query, userId, threshold, notebookId)
-
-      // 3. Reciprocal Rank Fusion
-      const fusedResults = await this.reciprocalRankFusion(
-        keywordResults,
-        semanticResults
-      )
-
-    // 4. Sort by final score and limit
-    return fusedResults
-      .sort((a, b) => b.score - a.score)
-      .slice(0, limit)
-      .map(result => ({
-        ...result,
-        title: result.title || defaultTitle,
-        matchType: result.score > 0.8 ? 'exact' : 'related'
-      }))
-    } catch (error) {
-      console.error('Error in hybrid search:', error)
-      // Fallback to keyword-only search
-      const keywordResults = await this.keywordSearch(query, userId)
-
-      // Fetch note details for keyword results
-      const noteIds = keywordResults.slice(0, limit).map(r => r.noteId)
-      const notes = await prisma.note.findMany({
-        where: { id: { in: noteIds }, trashedAt: null },
-        select: {
-          id: true,
-          title: true,
-          content: true,
-          language: true
-        }
-      })
-
-      return notes.map(note => ({
-        noteId: note.id,
-        title: note.title || defaultTitle,
-        content: note.content,
-        score: 1.0, // Default score for keyword-only results
-        matchType: 'related' as const,
-        language: note.language
-      }))
-    }
-  }
-
-  /**
-   * Keyword search using SQLite LIKE/FTS
-   */
-  private async keywordSearch(
-    query: string,
-    userId: string | null,
-    notebookId?: string // NEW: Filter by notebook (IA5)
-  ): Promise<Array<{ noteId: string; rank: number }>> {
-    // Extract keywords (words with > 3 characters) to avoid entire sentence matching failing
-    const stopWords = new Set(['comment', 'pourquoi', 'lequel', 'laquelle', 'avec', 'pour', 'dans', 'sur', 'est-ce']);
-    const keywords = query.toLowerCase()
-      .split(/[^a-z0-9àáâäçéèêëíìîïñóòôöúùûü]/i)
-      .filter(w => w.length > 3 && !stopWords.has(w));
-      
-    // If no good keywords found, fallback to the original query but it'll likely fail
-    const searchTerms = keywords.length > 0 ? keywords : [query];
-
-    // Build Prisma OR clauses for each keyword
-    const searchConditions = searchTerms.flatMap(term => [
-      { title: { contains: term, mode: 'insensitive' as const } },
-      { content: { contains: term, mode: 'insensitive' as const } }
-    ]);
-
-    const notes = await prisma.note.findMany({
-      where: {
-        ...(userId ? { userId } : {}),
-        ...(notebookId !== undefined ? { notebookId } : {}), // NEW: Notebook filter
-        trashedAt: null,
-        OR: searchConditions
-      },
-      select: {
-        id: true,
-        title: true,
-        content: true
-      }
-    })
-
-    // Simple relevance scoring based on match position and frequency
-    const results = notes.map(note => {
-      const title = note.title || ''
-      const content = note.content || ''
-      const queryLower = query.toLowerCase()
-
-      // Count occurrences — escape regex special chars to avoid crashes
-      const escaped = queryLower.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
-      const titleMatches = (title.match(new RegExp(escaped, 'gi')) || []).length
-      const contentMatches = (content.match(new RegExp(escaped, 'gi')) || []).length
-
-      // Boost title matches significantly
-      const titlePosition = title.toLowerCase().indexOf(queryLower)
-      const contentPosition = content.toLowerCase().indexOf(queryLower)
-
-      // Calculate rank (lower is better)
-      let rank = 100
-
-      if (titleMatches > 0) {
-        rank = titlePosition === 0 ? 1 : 10
-        rank -= titleMatches * 2
-      } else if (contentMatches > 0) {
-        rank = contentPosition < 100 ? 20 : 30
-        rank -= contentMatches
-      }
-
-      return {
-        noteId: note.id,
-        rank
-      }
-    })
-
-    return results.sort((a, b) => a.rank - b.rank)
-  }
-
-  /**
-   * Semantic vector search using embeddings
-   */
-  private async semanticVectorSearch(
-    query: string,
-    userId: string | null,
-    threshold: number,
-    notebookId?: string // NEW: Filter by notebook (IA5)
-  ): Promise<Array<{ noteId: string; rank: number }>> {
-    try {
-      // Generate query embedding
-      const { embedding: queryEmbedding } = await embeddingService.generateEmbedding(query)
-
-      // Fetch all user's notes with embeddings
-      const notes = await prisma.note.findMany({
-        where: {
-          ...(userId ? { userId } : {}),
-          ...(notebookId !== undefined ? { notebookId } : {}),
-          trashedAt: null,
-          noteEmbedding: { isNot: null }
-        },
-        select: {
-          id: true,
-          noteEmbedding: true
-        }
-      })
-
-      if (notes.length === 0) {
-        return []
-      }
-
-      // Calculate similarities for all notes
-      const similarities = notes.map(note => {
-        const noteEmbedding = note.noteEmbedding?.embedding ? JSON.parse(note.noteEmbedding.embedding) as number[] : []
-        const similarity = embeddingService.calculateCosineSimilarity(
-          queryEmbedding,
-          noteEmbedding
-        )
-
-        return {
-          noteId: note.id,
-          similarity
-        }
-      })
-
-      // Filter by threshold and convert to rank
-      return similarities
-        .filter(s => s.similarity >= threshold)
-        .sort((a, b) => b.similarity - a.similarity)
-        .map((s, index) => ({
-          noteId: s.noteId,
-          rank: index + 1 // 1-based rank
-        }))
-    } catch (error) {
-      console.error('Error in semantic vector search:', error)
-      return []
-    }
-  }
-
-  /**
-   * Reciprocal Rank Fusion algorithm
-   * Combines multiple ranked lists into a single ranking
-   * Formula: RRF(score) = 1 / (k + rank)
-   * k = 60 (default, prevents high rank from dominating)
-   */
-  private async reciprocalRankFusion(
-    keywordResults: Array<{ noteId: string; rank: number }>,
-    semanticResults: Array<{ noteId: string; rank: number }>
-  ): Promise<SearchResult[]> {
-    const scores = new Map<string, number>()
-
-    // Add keyword scores
-    for (const result of keywordResults) {
-      const rrfScore = 1 / (this.RRF_K + result.rank)
-      scores.set(result.noteId, (scores.get(result.noteId) || 0) + rrfScore)
-    }
-
-    // Add semantic scores
-    for (const result of semanticResults) {
-      const rrfScore = 1 / (this.RRF_K + result.rank)
-      scores.set(result.noteId, (scores.get(result.noteId) || 0) + rrfScore)
-    }
-
-    // Fetch note details
-    const noteIds = Array.from(scores.keys())
-    const notes = await prisma.note.findMany({
-      where: { id: { in: noteIds }, trashedAt: null },
-      select: {
-        id: true,
-        title: true,
-        content: true,
-        language: true
-      }
-    })
-
-    // Combine scores with note details
-    return notes.map(note => ({
-      noteId: note.id,
-      title: note.title,
-      content: note.content,
-      score: scores.get(note.id) || 0,
-      matchType: 'related' as const,
-      language: note.language
-    }))
-  }
-
-  /**
-   * Generate or update embedding for a note
-   * Called when note is created or significantly updated
-   */
-  async indexNote(noteId: string): Promise<void> {
-    try {
-      const note = await prisma.note.findUnique({
-        where: { id: noteId },
-        select: { content: true, noteEmbedding: true, lastAiAnalysis: true }
-      })
-
-      if (!note) {
-        throw new Error('Note not found')
-      }
-
-      // Check if embedding needs regeneration
-      const shouldRegenerate = embeddingService.shouldRegenerateEmbedding(
-        note.content,
-        note.noteEmbedding?.embedding as any,
-        note.lastAiAnalysis
-      )
-
-      if (!shouldRegenerate) {
-        return
-      }
-
-      // Generate new embedding
-      const { embedding } = await embeddingService.generateEmbedding(note.content)
-
-      // Save to database
-      await prisma.noteEmbedding.upsert({
-        where: { noteId: noteId },
-        create: { noteId: noteId, embedding: embeddingService.serialize(embedding) as any },
-        update: { embedding: embeddingService.serialize(embedding) as any }
-      })
-      await prisma.note.update({
-        where: { id: noteId },
-        data: {
-          lastAiAnalysis: new Date()
-        }
-      })
-
-    } catch (error) {
-      console.error(`Error indexing note ${noteId}:`, error)
-      throw error
-    }
+    return this._doSearch(query, userId, { limit, threshold, notebookId, defaultTitle })
  }

  /**
@@ -340,50 +71,251 @@ export class SemanticSearchService {
    const {
      limit = this.DEFAULT_LIMIT,
      threshold = this.DEFAULT_THRESHOLD,
-      includeExactMatches = true,
      notebookId,
      defaultTitle = 'Untitled'
    } = options

-    if (!query || query.trim().length < 2) {
-      return []
-    }
+    if (!query || query.trim().length < 2) return []
+    return this._doSearch(query, userId, { limit, threshold, notebookId, defaultTitle })
+  }

+  private async _doSearch( // Shared pipeline: FTS + vector search in parallel, RRF fusion, FTS-only fallback on error
+    query: string,
+    userId: string | null,
+    opts: { limit: number; threshold: number; notebookId?: string; defaultTitle: string }
+  ): Promise<SearchResult[]> {
    try {
-      const keywordResults = await this.keywordSearch(query, userId, notebookId)
-      const semanticResults = await this.semanticVectorSearch(query, userId, threshold, notebookId)
-      const fusedResults = await this.reciprocalRankFusion(keywordResults, semanticResults)
+      const [keywordResults, semanticResults] = await Promise.all([
+        this.ftsSearch(query, userId, opts.notebookId),
+        this.vectorSearch(query, userId, opts.threshold, opts.notebookId)
+      ])
+      // reciprocalRankFusion is async (it loads note rows): without await, .sort() runs on a Promise and throws.
+      const fusedResults = await this.reciprocalRankFusion(keywordResults, semanticResults)

      return fusedResults
        .sort((a, b) => b.score - a.score)
-        .slice(0, limit)
+        .slice(0, opts.limit)
        .map(result => ({
          ...result,
-          title: result.title || defaultTitle,
-          matchType: result.score > 0.8 ? 'exact' : 'related'
+          title: result.title || opts.defaultTitle,
+          matchType: result.score > 0.8 ? 'exact' as const : 'related' as const // NOTE(review): RRF scores top out at 2 / (RRF_K + 1), so 'exact' looks unreachable — confirm intent
        }))
    } catch (error) {
-      console.error('Error in searchAsUser:', error)
+      console.error('Error in hybrid search:', error)
+      return this._ftsFallback(query, userId, opts)
+    }
+  }
+
+  /**
+   * PostgreSQL full-text search using tsvector + GIN index.
+   * Caller-supplied values travel as bound parameters ($1, $2, ...) instead of being spliced into the SQL text.
+   * @returns matching note ids, best ts_rank first, each with its 1-based position as `rank`.
+   */
+  private async ftsSearch(
+    query: string,
+    userId: string | null,
+    notebookId?: string
+  ): Promise<Array<{ noteId: string; rank: number }>> {
+    const params: string[] = [query] // $1 is always the search text
+    const filters: string[] = []
+    if (userId) {
+      params.push(userId)
+      filters.push(`AND "userId" = $${params.length}`)
+    }
+    if (notebookId) {
+      params.push(notebookId)
+      filters.push(`AND "notebookId" = $${params.length}`)
+    } else if (notebookId !== undefined) {
+      filters.push('AND "notebookId" IS NULL') // empty string selects notes outside any notebook
+    }
+
+    const sql = `
+      SELECT id AS "noteId", ts_rank("tsv", plainto_tsquery('simple', $1)) AS rank
+      FROM "Note"
+      WHERE "tsv" @@ plainto_tsquery('simple', $1)
+        AND "trashedAt" IS NULL
+        AND "isArchived" = false
+        ${filters.join(' ')}
+      ORDER BY rank DESC
+      LIMIT ${this.FTS_CANDIDATES}
+    `
+    const rows: Array<{ noteId: string; rank: number }> = await prisma.$queryRawUnsafe(sql, ...params)
+    return rows.map((r, i) => ({ noteId: r.noteId, rank: i + 1 })) // RRF only needs the ordinal position
+  }
+
+  /**
+   * pgvector cosine-distance search using the HNSW index; ids and the query vector are bound as SQL parameters.
+   * Returns nearest neighbors above the similarity threshold, or [] when the query embedding cannot be generated.
+   */
+  private async vectorSearch(
+    query: string,
+    userId: string | null,
+    threshold: number,
+    notebookId?: string
+  ): Promise<Array<{ noteId: string; rank: number }>> {
+    let queryEmbedding: number[]
+    try {
+      const result = await embeddingService.generateEmbedding(query)
+      queryEmbedding = result.embedding
+    } catch (error) {
+      console.error('Failed to generate query embedding:', error)
+      return []
+    }
+
+    const minSimilarity = Number.isFinite(threshold) ? threshold : this.DEFAULT_THRESHOLD // inlined below, so only a finite number is accepted
+    const params: string[] = [embeddingService.toVectorString(queryEmbedding)] // $1 is the query vector
+    const filters: string[] = []
+    if (userId) {
+      params.push(userId)
+      filters.push(`AND n."userId" = $${params.length}`)
+    }
+    if (notebookId) {
+      params.push(notebookId)
+      filters.push(`AND n."notebookId" = $${params.length}`)
+    } else if (notebookId !== undefined) {
+      filters.push('AND n."notebookId" IS NULL') // empty string selects notes outside any notebook
+    }
+    const sql = `
+      SELECT n.id AS "noteId", 1 - (e."embedding" <=> $1::vector) AS similarity
+      FROM "Note" n
+      INNER JOIN "NoteEmbedding" e ON e."noteId" = n.id
+      WHERE n."trashedAt" IS NULL
+        AND n."isArchived" = false
+        ${filters.join(' ')}
+        AND 1 - (e."embedding" <=> $1::vector) >= ${minSimilarity}
+      ORDER BY e."embedding" <=> $1::vector ASC
+      LIMIT ${this.VECTOR_CANDIDATES}
+    `
+    const rows: Array<{ noteId: string; similarity: number }> = await prisma.$queryRawUnsafe(sql, ...params)
+    return rows.map((r, i) => ({ noteId: r.noteId, rank: i + 1 })) // rows arrive nearest-first
+  }
+
+  /**
+   * Reciprocal Rank Fusion algorithm.
+   * Each entry of either list adds 1 / (RRF_K + rank) to its note's score; the totals are attached to the
+   * matching non-trashed note rows. The returned list is not sorted.
+   */
+  private async reciprocalRankFusion(
+    keywordResults: Array<{ noteId: string; rank: number }>,
+    semanticResults: Array<{ noteId: string; rank: number }>
+  ): Promise<SearchResult[]> {
+    const fused = new Map<string, number>()
+
+    // Both lists carry the same weight, so one accumulation step serves keyword and semantic hits alike.
+    for (const ranked of [keywordResults, semanticResults]) {
+      for (const { noteId, rank } of ranked) {
+        const soFar = fused.get(noteId) || 0
+        fused.set(noteId, soFar + 1 / (this.RRF_K + rank))
+      }
+    }
+
+    if (fused.size === 0) return []
+
+    // Load the display fields; notes that are trashed are filtered out by this query.
+    const noteIds = Array.from(fused.keys())
+    const rows = await prisma.note.findMany({
+      where: { id: { in: noteIds }, trashedAt: null },
+      select: { id: true, title: true, content: true, language: true }
+    })
+
+    // Every row gets its fused score; matchType starts as 'related'.
+    return rows.map(row => {
+      const score = fused.get(row.id) || 0
+      return {
+        noteId: row.id,
+        title: row.title,
+        content: row.content,
+        score,
+        matchType: 'related' as const,
+        language: row.language
+      }
+    })
+  }
+
+  /**
+   * Fallback to FTS-only when the hybrid pipeline throws; keeps FTS rank order, logs and returns [] on failure.
+   */
+  private async _ftsFallback(
+    query: string,
+    userId: string | null,
+    opts: { limit: number; threshold: number; notebookId?: string; defaultTitle: string }
+  ): Promise<SearchResult[]> {
+    try {
+      const keywordResults = await this.ftsSearch(query, userId, opts.notebookId)
+      const noteIds = keywordResults.slice(0, opts.limit).map(r => r.noteId)
+      const notes = await prisma.note.findMany({
+        where: { id: { in: noteIds }, trashedAt: null },
+        select: { id: true, title: true, content: true, language: true }
+      })
+      // findMany does not promise the order of the `in` list, so restore the FTS ranking explicitly.
+      const position = new Map(noteIds.map((id, index) => [id, index] as const))
+      return notes
+        .sort((a, b) => (position.get(a.id) ?? 0) - (position.get(b.id) ?? 0))
+        .map(note => ({
+          noteId: note.id, title: note.title || opts.defaultTitle, content: note.content,
+          score: 1.0, matchType: 'related' as const, language: note.language
+        }))
+    } catch (error) {
+      console.error('Error in FTS fallback search:', error)
      return []
    }
  }

  /**
-   * Batch index multiple notes (for initial migration or bulk updates)
+   * Generate or refresh the stored embedding for one note.
+   * Returns early when shouldRegenerateEmbedding reports nothing to do; otherwise upserts the vector through
+   * raw SQL and stamps lastAiAnalysis. Any error is logged and rethrown.
+   */
+  async indexNote(noteId: string): Promise<void> {
+    try {
+      const record = await prisma.note.findUnique({
+        where: { id: noteId },
+        select: { content: true, lastAiAnalysis: true }
+      })
+
+      if (!record) throw new Error('Note not found')
+
+      // null is passed in place of a stored embedding; only content and lastAiAnalysis are supplied.
+      const stale = embeddingService.shouldRegenerateEmbedding(record.content, null, record.lastAiAnalysis)
+      if (!stale) return
+
+      const generated = await embeddingService.generateEmbedding(record.content)
+      const vectorLiteral = embeddingService.toVectorString(generated.embedding)
+
+      // Upsert keyed on "noteId": insert a fresh row or overwrite the existing vector.
+      await prisma.$executeRawUnsafe(
+        `INSERT INTO "NoteEmbedding" ("id", "noteId", "embedding", "createdAt", "updatedAt")
+         VALUES (gen_random_uuid(), $1, $2::vector, now(), now())
+         ON CONFLICT ("noteId")
+         DO UPDATE SET "embedding" = $2::vector, "updatedAt" = now()`,
+        noteId,
+        vectorLiteral
+      )
+
+      // Stamp the note with the time of this indexing run.
+      const analysedAt = new Date()
+      await prisma.note.update({
+        where: { id: noteId },
+        data: { lastAiAnalysis: analysedAt }
+      })
+    } catch (error) {
+      console.error(`Error indexing note ${noteId}:`, error)
+      throw error
+    }
+  }
+
+  /**
+   * Batch index multiple notes, BATCH_SIZE at a time; allSettled means a failing note never rejects the batch.
   */
  async indexBatchNotes(noteIds: string[]): Promise<void> {
-    const BATCH_SIZE = 10 // Process in batches to avoid overwhelming
+    const BATCH_SIZE = 20 // number of notes indexed concurrently per round

    for (let i = 0; i < noteIds.length; i += BATCH_SIZE) {
      const batch = noteIds.slice(i, i + BATCH_SIZE)
-
-      await Promise.allSettled(
-        batch.map(noteId => this.indexNote(noteId))
-      )
-
+      await Promise.allSettled(batch.map(noteId => this.indexNote(noteId))) // the next round starts only after every note here has settled
    }
  }
 }

-// Singleton instance
 export const semanticSearchService = new SemanticSearchService()