// Keep/keep-notes/lib/ai/services/semantic-search.service.ts

/**
 * Semantic Search Service
 * Hybrid search combining keyword matching and semantic similarity
 * Uses Reciprocal Rank Fusion (RRF) for result ranking
 */

import { embeddingService } from './embedding.service'
import { prisma } from '@/lib/prisma'
import { auth } from '@/auth'

/**
 * A single note returned by a search, together with its relevance information.
 */
export interface SearchResult {
  /** Identifier of the matching note. */
  noteId: string
  /** Title of the note; may be null. */
  title: string | null
  /** Content of the note. */
  content: string
  /** Relevance score; search results are ordered so that higher scores come first. */
  score: number
  /** Match classification assigned by the search service. */
  matchType: 'exact' | 'related'
  /** Language recorded on the note, if any. */
  language?: string | null
}

/**
 * Optional tuning parameters for a search. Every field may be omitted.
 */
export interface SearchOptions {
  /** Maximum number of results to return (the service defaults to 20). */
  limit?: number
  /** Minimum similarity score (0-1) a note needs to count as a semantic hit (the service defaults to 0.6). */
  threshold?: number
  /** NOTE(review): accepted for API compatibility; nothing in this file changes the results based on it. */
  includeExactMatches?: boolean
  /** Restrict the search to notes of this notebook (contextual search, IA5). */
  notebookId?: string
}

/**
 * Hybrid note search (keyword + embedding similarity) plus the indexing
 * helpers that keep note embeddings up to date.
 *
 * Both retrieval strategies produce a list of note ids with 1-based positional
 * ranks; the lists are merged with Reciprocal Rank Fusion (RRF).
 */
export class SemanticSearchService {
  private readonly RRF_K = 60 // RRF constant (default recommended value)
  private readonly DEFAULT_LIMIT = 20
  private readonly DEFAULT_THRESHOLD = 0.6

  /**
   * Hybrid search: keyword + semantic with RRF fusion.
   *
   * @param query - Free-text query. Surrounding whitespace is ignored; queries
   *   shorter than two characters yield an empty result.
   * @param options - See `SearchOptions`. `includeExactMatches` is currently
   *   not acted upon.
   * @returns Up to `limit` results ordered by descending fused score. Notes
   *   that literally contain the query are tagged `'exact'`, notes found only
   *   through embedding similarity are tagged `'related'`.
   * @throws Rejects only if the keyword-only fallback fails as well.
   */
  async search(
    query: string,
    options: SearchOptions = {}
  ): Promise<SearchResult[]> {
    const {
      limit = this.DEFAULT_LIMIT,
      threshold = this.DEFAULT_THRESHOLD,
      notebookId // Contextual search within a notebook (IA5)
    } = options

    const normalizedQuery = typeof query === 'string' ? query.trim() : ''
    if (normalizedQuery.length < 2) {
      return []
    }

    const session = await auth()
    // NOTE(review): without an authenticated user no userId filter is applied,
    // i.e. notes of every user are searched. Confirm this is intended
    // (e.g. single-user deployments).
    const userId = session?.user?.id || null

    try {
      // The two retrieval strategies are independent, so run them concurrently.
      // semanticVectorSearch never rejects (it degrades to an empty list).
      const [keywordResults, semanticResults] = await Promise.all([
        this.keywordSearch(normalizedQuery, userId, notebookId),
        this.semanticVectorSearch(normalizedQuery, userId, threshold, notebookId)
      ])

      return await this.reciprocalRankFusion(keywordResults, semanticResults, limit)
    } catch (error) {
      console.error('Error in hybrid search:', error)
      // Fallback to keyword-only search. The notebook filter is kept, and the
      // results go through the same fusion step so that ordering and score
      // scale match the hybrid path.
      const keywordResults = await this.keywordSearch(normalizedQuery, userId, notebookId)
      return this.reciprocalRankFusion(keywordResults, [], limit)
    }
  }

  /**
   * Keyword search: substring match on title/content via Prisma `contains`,
   * ordered by a simple relevance heuristic.
   *
   * @returns Matching note ids with 1-based positional ranks (1 = best), the
   *   same shape `semanticVectorSearch` produces, so both lists can be fused.
   */
  private async keywordSearch(
    query: string,
    userId: string | null,
    notebookId?: string // Filter by notebook (IA5)
  ): Promise<Array<{ noteId: string; rank: number }>> {
    const notes = await prisma.note.findMany({
      where: {
        ...(userId ? { userId } : {}),
        ...(notebookId !== undefined ? { notebookId } : {}),
        OR: [
          { title: { contains: query } },
          { content: { contains: query } }
        ]
      },
      select: {
        id: true,
        title: true,
        content: true
      }
    })

    const needle = query.toLowerCase()

    return notes
      .map(note => ({
        noteId: note.id,
        relevance: this.keywordRelevance(needle, note.title || '', note.content || '')
      }))
      .sort((a, b) => a.relevance - b.relevance)
      // RRF needs positions, not heuristic values: the raw heuristic can be
      // zero or negative, which would make 1 / (k + rank) blow up.
      .map((entry, index) => ({
        noteId: entry.noteId,
        rank: index + 1 // 1-based rank
      }))
  }

  /**
   * Relevance heuristic for one note (lower is better).
   * Title hits beat content hits at equal counts; every occurrence improves the value.
   *
   * @param needle - Lower-cased query.
   */
  private keywordRelevance(needle: string, title: string, content: string): number {
    const titleLower = title.toLowerCase()
    const titleMatches = this.countOccurrences(titleLower, needle)
    if (titleMatches > 0) {
      const base = titleLower.startsWith(needle) ? 1 : 10
      return base - titleMatches * 2
    }

    const contentLower = content.toLowerCase()
    const contentMatches = this.countOccurrences(contentLower, needle)
    if (contentMatches > 0) {
      const base = contentLower.indexOf(needle) < 100 ? 20 : 30
      return base - contentMatches
    }

    // Matched by the database but not by the in-memory comparison.
    return 100
  }

  /**
   * Counts non-overlapping occurrences of `needle` in `haystack`.
   * Plain string search is used instead of a RegExp so that queries containing
   * regex metacharacters (e.g. "c++", "(") cannot throw.
   */
  private countOccurrences(haystack: string, needle: string): number {
    if (needle.length === 0) {
      return 0
    }

    let count = 0
    let position = haystack.indexOf(needle)
    while (position !== -1) {
      count++
      position = haystack.indexOf(needle, position + needle.length)
    }
    return count
  }

  /**
   * Semantic vector search using embeddings.
   *
   * Generates an embedding for the query, compares it against every stored
   * note embedding in scope and keeps the notes whose similarity reaches
   * `threshold`.
   *
   * @returns Note ids with 1-based positional ranks (1 = most similar). Any
   *   failure is logged and results in an empty list, never a rejection.
   */
  private async semanticVectorSearch(
    query: string,
    userId: string | null,
    threshold: number,
    notebookId?: string // Filter by notebook (IA5)
  ): Promise<Array<{ noteId: string; rank: number }>> {
    try {
      // Generate query embedding
      const { embedding: queryEmbedding } = await embeddingService.generateEmbedding(query)

      // Fetch all notes in scope that have an embedding
      const notes = await prisma.note.findMany({
        where: {
          ...(userId ? { userId } : {}),
          ...(notebookId !== undefined ? { notebookId } : {}),
          noteEmbedding: { isNot: null }
        },
        select: {
          id: true,
          noteEmbedding: true
        }
      })

      const hits: Array<{ noteId: string; similarity: number }> = []
      for (const note of notes) {
        // A single corrupt or empty embedding must not discard every other hit.
        const noteVector = this.parseEmbedding(note.noteEmbedding?.embedding)
        if (noteVector === null) {
          continue
        }

        const similarity = embeddingService.calculateCosineSimilarity(
          queryEmbedding,
          noteVector
        )
        if (similarity >= threshold) {
          hits.push({ noteId: note.id, similarity })
        }
      }

      return hits
        .sort((a, b) => b.similarity - a.similarity)
        .map((hit, index) => ({
          noteId: hit.noteId,
          rank: index + 1 // 1-based rank
        }))
    } catch (error) {
      console.error('Error in semantic vector search:', error)
      return []
    }
  }

  /**
   * Decodes a stored embedding into a numeric vector.
   *
   * @param raw - Stored value; a JSON-encoded array is expected, an
   *   already-decoded array is accepted as well.
   * @returns The vector, or null when the value is missing, malformed, empty
   *   or contains non-finite entries.
   */
  private parseEmbedding(raw: unknown): number[] | null {
    if (raw === null || raw === undefined) {
      return null
    }

    try {
      const parsed: unknown = Array.isArray(raw) ? raw : JSON.parse(String(raw))
      if (!Array.isArray(parsed) || parsed.length === 0) {
        return null
      }
      const isNumeric = parsed.every(value => typeof value === 'number' && Number.isFinite(value))
      return isNumeric ? (parsed as number[]) : null
    } catch {
      return null
    }
  }

  /**
   * Reciprocal Rank Fusion algorithm.
   * Combines multiple ranked lists into a single ranking; each list
   * contributes 1 / (k + rank) per note, with k = 60.
   *
   * @param keywordResults - Keyword hits with 1-based ranks.
   * @param semanticResults - Semantic hits with 1-based ranks.
   * @param limit - Optional cap; when given, only the best `limit` notes are
   *   loaded from the database.
   * @returns Results ordered by descending fused score. Notes present in
   *   `keywordResults` are tagged `'exact'`, all others `'related'`.
   */
  private async reciprocalRankFusion(
    keywordResults: Array<{ noteId: string; rank: number }>,
    semanticResults: Array<{ noteId: string; rank: number }>,
    limit?: number
  ): Promise<SearchResult[]> {
    const scores = new Map<string, number>()

    for (const rankedList of [keywordResults, semanticResults]) {
      for (const { noteId, rank } of rankedList) {
        const rrfScore = 1 / (this.RRF_K + rank)
        scores.set(noteId, (scores.get(noteId) ?? 0) + rrfScore)
      }
    }

    // Rank first, then load details only for the notes that will be returned.
    const ranked = Array.from(scores.entries()).sort((a, b) => b[1] - a[1])
    const top = limit === undefined ? ranked : ranked.slice(0, limit)
    if (top.length === 0) {
      return []
    }

    const notes = await prisma.note.findMany({
      where: { id: { in: top.map(([noteId]) => noteId) } },
      select: {
        id: true,
        title: true,
        content: true,
        language: true
      }
    })

    // An `in` query does not preserve order, so look notes up by id.
    const notesById = new Map<string, (typeof notes)[number]>()
    for (const note of notes) {
      notesById.set(note.id, note)
    }

    // An RRF score is at most 2 / (k + 1), so the match type is derived from
    // where the note was found rather than from a score threshold.
    const keywordHits = new Set(keywordResults.map(result => result.noteId))

    const results: SearchResult[] = []
    for (const [noteId, score] of top) {
      const note = notesById.get(noteId)
      if (!note) {
        continue // note disappeared between ranking and detail lookup
      }
      results.push({
        noteId,
        title: note.title,
        content: note.content,
        score,
        matchType: keywordHits.has(noteId) ? 'exact' : 'related',
        language: note.language
      })
    }
    return results
  }

  /**
   * Generate or update embedding for a note.
   * Called when note is created or significantly updated.
   *
   * Does nothing when `embeddingService.shouldRegenerateEmbedding` reports
   * that the stored embedding is still usable. Otherwise a new embedding is
   * generated, upserted, and the note's `lastAiAnalysis` is set to now.
   *
   * @param noteId - Identifier of the note to index.
   * @throws Error('Note not found') when the note does not exist; any
   *   embedding or database error is logged and rethrown.
   */
  async indexNote(noteId: string): Promise<void> {
    try {
      const note = await prisma.note.findUnique({
        where: { id: noteId },
        select: { content: true, noteEmbedding: true, lastAiAnalysis: true }
      })

      if (!note) {
        throw new Error('Note not found')
      }

      // Check if embedding needs regeneration
      const shouldRegenerate = embeddingService.shouldRegenerateEmbedding(
        note.content,
        note.noteEmbedding?.embedding as any,
        note.lastAiAnalysis
      )

      if (!shouldRegenerate) {
        return
      }

      // Generate new embedding
      const { embedding } = await embeddingService.generateEmbedding(note.content)
      const serializedEmbedding = embeddingService.serialize(embedding) as any

      // Save to database
      await prisma.noteEmbedding.upsert({
        where: { noteId: noteId },
        create: { noteId: noteId, embedding: serializedEmbedding },
        update: { embedding: serializedEmbedding }
      })
      await prisma.note.update({
        where: { id: noteId },
        data: {
          lastAiAnalysis: new Date()
        }
      })
    } catch (error) {
      console.error(`Error indexing note ${noteId}:`, error)
      throw error
    }
  }

  /**
   * Batch index multiple notes (for initial migration or bulk updates).
   *
   * Notes are processed in groups of 10; the notes of a group are indexed
   * concurrently. A failing note does not abort the batch: `indexNote` logs
   * the error and the remaining notes are still processed.
   *
   * @param noteIds - Identifiers of the notes to index.
   */
  async indexBatchNotes(noteIds: string[]): Promise<void> {
    const BATCH_SIZE = 10 // Process in batches to avoid overwhelming

    for (let i = 0; i < noteIds.length; i += BATCH_SIZE) {
      const batch = noteIds.slice(i, i + BATCH_SIZE)

      await Promise.allSettled(
        batch.map(noteId => this.indexNote(noteId))
      )
    }
  }
}

// Singleton instance: a single SemanticSearchService created at module load and exported for callers.
export const semanticSearchService = new SemanticSearchService()