Momento/memento-note/lib/ai/services/clustering.service.ts

/**
 * Clustering Service
 *
 * Density-based clustering algorithm (DBSCAN variant) for note embeddings.
 * Groups semantically similar notes into clusters without requiring
 * a preset number of clusters.
 *
 * Algorithm:
 * 1. For each note, find neighbors within epsilon cosine distance
 * 2. Form clusters from dense regions (min_cluster_size)
 * 3. Mark outliers as noise (cluster_id = -1)
 */

import prisma from '@/lib/prisma'
import { embeddingService } from './embedding.service'
import { getChatProvider } from '@/lib/ai/factory'
import { getSystemConfig } from '@/lib/config'

/**
 * One cluster produced by a clustering run (or read back from the cache).
 */
export interface ClusterResult {
  /** Numeric identifier of the cluster. */
  clusterId: number
  /** IDs of the notes assigned to this cluster. */
  noteIds: string[]
  /** Optional centroid vector; not populated by the code visible in this module. */
  centroid?: number[]
  /** Optional human-readable name of the cluster. */
  name?: string
}

/**
 * Per-note clustering outcome: which cluster a note landed in and how
 * strongly it belongs there.
 */
export interface ClusteredNote {
  /** ID of the note. */
  noteId: string
  /** Identifier of the cluster the note was assigned to. */
  clusterId: number
  /** Average similarity of the note to the other members of its cluster. */
  membershipScore: number
  /** True when the note's membership score is at or above its cluster's mean score. */
  isCentral: boolean
}

/**
 * Tuning knobs for ClusteringService.clusterNotes. Every field is optional;
 * the service substitutes its own defaults for omitted values.
 */
export interface ClusteringOptions {
  /**
   * Minimum number of neighbors a note needs to seed a cluster, and the
   * minimum number of members a cluster needs to be kept.
   */
  minClusterSize?: number
  /**
   * Neighborhood threshold used when searching for neighbors.
   * NOTE(review): described as a cosine distance; confirm against
   * ClusteringService.findNeighbors how the value is actually applied.
   */
  epsilon?: number // Cosine distance threshold (lower = more strict)
  /** Upper bound on the number of clusters a run may produce. */
  maxClusters?: number
}

/**
 * Density-based (DBSCAN-style) clustering of a user's note embeddings.
 *
 * Embeddings live in the "NoteEmbedding" table and are compared with
 * pgvector's cosine-distance operator (`<=>`). The service computes the
 * clusters, persists them ("NoteCluster" / "ClusterMember"), names them
 * through the configured chat provider and serves cached results.
 */
export class ClusteringService {
  private readonly DEFAULT_MIN_CLUSTER_SIZE = 3
  private readonly DEFAULT_EPSILON = 0.3 // Cosine distance, i.e. 1 - cosine similarity
  private readonly DEFAULT_MAX_CLUSTERS = 50
  private readonly MIN_NOTES_FOR_CLUSTERING = 10
  /** Label stored for notes that belong to no cluster. */
  private readonly NOISE_CLUSTER_ID = -1

  /**
   * Find every note in `allNoteIds` (other than `noteId` itself) whose
   * embedding lies within `epsilon` cosine distance of `noteId`'s embedding.
   *
   * @param noteId - Note whose neighborhood is queried.
   * @param allNoteIds - Candidate pool; notes outside it are never returned.
   * @param epsilon - Maximum cosine distance (lower = stricter).
   * @returns IDs of the neighboring notes.
   */
  private async findNeighbors(
    noteId: string,
    allNoteIds: string[],
    epsilon: number
  ): Promise<string[]> {
    // epsilon already is a cosine distance, so it is compared directly
    // against the `<=>` operator's result (no similarity conversion).
    const rows = await prisma.$queryRawUnsafe<Array<{ noteId: string }>>(
      `SELECT e2."noteId"
       FROM "NoteEmbedding" e1
       CROSS JOIN "NoteEmbedding" e2
       WHERE e1."noteId" = $1
         AND e2."noteId" != $1
         AND e2."noteId" = ANY($2::text[])
         AND (e1."embedding"::vector <=> e2."embedding"::vector) <= $3`,
      noteId,
      allNoteIds,
      epsilon
    )

    return rows.map(row => row.noteId)
  }

  /**
   * Grow a cluster outward from a core note (DBSCAN expansion step).
   *
   * Every note reachable from the seed is labelled with `clusterId` unless it
   * already belongs to another real cluster. Notes previously labelled as
   * noise are absorbed as border points; only notes that have not been
   * visited yet get their own neighborhood queried and, when dense enough,
   * contribute further candidates.
   *
   * @param noteId - Seed note, already known to be a core point.
   * @param neighbors - Neighbors of the seed.
   * @param clusterId - Label to assign to the members.
   * @param visited - Notes whose neighborhood was already queried (mutated).
   * @param clustered - noteId -> cluster label map (mutated).
   * @param allNoteIds - Candidate pool for neighbor queries.
   * @param epsilon - Maximum cosine distance between neighbors.
   * @param minClusterSize - Neighbor count required for a note to be a core point.
   * @returns IDs of all notes assigned to the cluster, seed first.
   */
  private async expandCluster(
    noteId: string,
    neighbors: string[],
    clusterId: number,
    visited: Set<string>,
    clustered: Map<string, number>,
    allNoteIds: string[],
    epsilon: number,
    minClusterSize: number
  ): Promise<string[]> {
    const clusterMembers: string[] = [noteId]
    clustered.set(noteId, clusterId)

    const queue = [...neighbors]
    const queued = new Set<string>(neighbors)

    // Index-based traversal: the queue only grows, so no O(n) shift() is needed.
    for (let head = 0; head < queue.length; head++) {
      const currentNoteId = queue[head]
      if (currentNoteId === undefined) continue

      const label = clustered.get(currentNoteId)
      // Already part of a real cluster (this one or an earlier one): leave it.
      if (label !== undefined && label !== this.NOISE_CLUSTER_ID) continue

      // Unlabelled notes and former noise both become members of this cluster.
      clustered.set(currentNoteId, clusterId)
      clusterMembers.push(currentNoteId)

      // A visited note was already examined and found not dense enough,
      // so it joins as a border point without being expanded.
      if (visited.has(currentNoteId)) continue
      visited.add(currentNoteId)

      const currentNeighbors = await this.findNeighbors(currentNoteId, allNoteIds, epsilon)
      if (currentNeighbors.length < minClusterSize) continue

      for (const neighborId of currentNeighbors) {
        if (!queued.has(neighborId)) {
          queued.add(neighborId)
          queue.push(neighborId)
        }
      }
    }

    return clusterMembers
  }

  /**
   * Compute, for every note in `noteIds`, its average cosine similarity to
   * the other notes of the same list, using a single aggregate query.
   *
   * A list with at most one note scores 1.0. Notes for which the database
   * yields no usable value (missing row, NULL or NaN) score 0.
   *
   * @param noteIds - Members of one cluster.
   * @returns Map from note ID to its average similarity.
   */
  private async getAverageSimilarities(noteIds: string[]): Promise<Map<string, number>> {
    const scores = new Map<string, number>()

    if (noteIds.length <= 1) {
      for (const id of noteIds) scores.set(id, 1.0)
      return scores
    }

    const rows = await prisma.$queryRawUnsafe<Array<{ noteId: string; score: number | null }>>(
      `SELECT e1."noteId" AS "noteId",
              AVG(1 - (e1."embedding"::vector <=> e2."embedding"::vector))::float8 AS score
       FROM "NoteEmbedding" e1
       INNER JOIN "NoteEmbedding" e2 ON e2."noteId" != e1."noteId"
       WHERE e1."noteId" = ANY($1::text[])
         AND e2."noteId" = ANY($1::text[])
       GROUP BY e1."noteId"`,
      noteIds
    )

    for (const row of rows) {
      const value = Number(row.score)
      scores.set(row.noteId, Number.isFinite(value) ? value : 0)
    }
    for (const id of noteIds) {
      if (!scores.has(id)) scores.set(id, 0)
    }

    return scores
  }

  /**
   * Perform density-based clustering on a user's note embeddings.
   *
   * Only non-trashed notes that have an embedding take part. When fewer than
   * MIN_NOTES_FOR_CLUSTERING such notes exist, no clustering is attempted and
   * all of them are reported as noise.
   *
   * @param userId - Owner of the notes.
   * @param options - Optional overrides for minClusterSize, epsilon and maxClusters.
   * @returns The clusters found, one entry per clustered note (membership
   *   score and centrality flag) and the number of notes left as noise.
   */
  async clusterNotes(
    userId: string,
    options: ClusteringOptions = {}
  ): Promise<{
    clusters: ClusterResult[]
    clusteredNotes: ClusteredNote[]
    noiseCount: number
  }> {
    const {
      minClusterSize = this.DEFAULT_MIN_CLUSTER_SIZE,
      epsilon = this.DEFAULT_EPSILON,
      maxClusters = this.DEFAULT_MAX_CLUSTERS
    } = options

    // Get all user's notes with embeddings
    const notesWithEmbeddings = await prisma.$queryRawUnsafe<Array<{ noteId: string }>>(
      `SELECT ne."noteId"
       FROM "NoteEmbedding" ne
       INNER JOIN "Note" n ON n.id = ne."noteId"
       WHERE n."userId" = $1
         AND n."trashedAt" IS NULL
         AND ne."embedding" IS NOT NULL`,
      userId
    )

    const allNoteIds = notesWithEmbeddings.map(n => n.noteId)

    if (allNoteIds.length < this.MIN_NOTES_FOR_CLUSTERING) {
      return {
        clusters: [],
        clusteredNotes: [],
        noiseCount: allNoteIds.length
      }
    }

    const visited = new Set<string>()
    const clustered = new Map<string, number>() // noteId -> clusterId
    const clusterResults: ClusterResult[] = []

    // DBSCAN main loop
    for (const noteId of allNoteIds) {
      if (clusterResults.length >= maxClusters) {
        // Cluster cap reached: nothing further can be kept, so every note
        // still without a label is noise and no more queries are needed.
        for (const remainingId of allNoteIds) {
          if (!clustered.has(remainingId)) {
            clustered.set(remainingId, this.NOISE_CLUSTER_ID)
          }
        }
        break
      }

      if (visited.has(noteId)) continue
      visited.add(noteId)

      const neighbors = await this.findNeighbors(noteId, allNoteIds, epsilon)

      if (neighbors.length < minClusterSize) {
        // Not dense enough to seed a cluster; may still be absorbed later
        // as a border point of another cluster.
        clustered.set(noteId, this.NOISE_CLUSTER_ID)
        continue
      }

      // Cluster ids double as indexes into clusterResults.
      const clusterId = clusterResults.length
      const clusterMembers = await this.expandCluster(
        noteId,
        neighbors,
        clusterId,
        visited,
        clustered,
        allNoteIds,
        epsilon,
        minClusterSize
      )

      if (clusterMembers.length >= minClusterSize) {
        clusterResults.push({
          clusterId,
          noteIds: clusterMembers
        })
      } else {
        // Too small (neighbors were already claimed by other clusters)
        for (const memberId of clusterMembers) {
          clustered.set(memberId, this.NOISE_CLUSTER_ID)
        }
      }
    }

    // Membership score = average similarity to the other members;
    // a note is central when its score reaches the cluster mean.
    const clusteredNotes: ClusteredNote[] = []
    for (const cluster of clusterResults) {
      const scores = await this.getAverageSimilarities(cluster.noteIds)
      const scored = cluster.noteIds.map(memberId => ({
        noteId: memberId,
        score: scores.get(memberId) ?? 0
      }))
      const meanScore = scored.reduce((sum, s) => sum + s.score, 0) / scored.length

      for (const { noteId, score } of scored) {
        clusteredNotes.push({
          noteId,
          clusterId: cluster.clusterId,
          membershipScore: score,
          isCentral: score >= meanScore
        })
      }
    }

    let noiseCount = 0
    for (const label of clustered.values()) {
      if (label === this.NOISE_CLUSTER_ID) noiseCount++
    }

    return {
      clusters: clusterResults,
      clusteredNotes,
      noiseCount
    }
  }

  /**
   * Get the N most central notes of a stored cluster, for naming purposes.
   *
   * Only members flagged as central are considered; they are returned by
   * descending membership score so the LIMIT keeps the strongest members.
   *
   * @param clusterId - Cluster to read.
   * @param userId - Owner of the cluster.
   * @param n - Maximum number of notes to return (default 5).
   * @returns Up to `n` notes with their id, title and content.
   */
  async getCentralNotes(clusterId: number, userId: string, n: number = 5): Promise<Array<{ noteId: string; title: string | null; content: string }>> {
    // GROUP BY replaces DISTINCT so the rows can be ordered by a column
    // that is not part of the select list.
    const result = await prisma.$queryRawUnsafe<Array<{ noteId: string; title: string | null; content: string }>>(
      `SELECT n.id AS "noteId", n.title, n.content
       FROM "ClusterMember" cm
       INNER JOIN "Note" n ON n.id = cm."noteId"
       WHERE cm."clusterId" = $1
         AND cm."userId" = $2
         AND cm."isCentral" = true
       GROUP BY n.id, n.title, n.content
       ORDER BY MAX(cm."membershipScore") DESC, n.id ASC
       LIMIT $3`,
      clusterId,
      userId,
      n
    )

    return result
  }

  /**
   * Replace the user's stored clustering with the given results.
   *
   * Runs in one transaction: existing rows are deleted, then clusters and
   * members are inserted in bulk so the transaction stays short regardless
   * of how many notes were clustered.
   *
   * @param userId - Owner of the clustering.
   * @param results - Clusters and per-note assignments to persist.
   */
  async saveClusteringResults(
    userId: string,
    results: { clusters: ClusterResult[]; clusteredNotes: ClusteredNote[] }
  ): Promise<void> {
    // One timestamp for the whole run keeps "lastCalculated" consistent
    // across all clusters written together.
    const calculatedAt = new Date()

    await prisma.$transaction(async (tx) => {
      // Clear existing clusters for this user
      await tx.$executeRawUnsafe(`DELETE FROM "ClusterMember" WHERE "userId" = $1`, userId)
      await tx.$executeRawUnsafe(`DELETE FROM "NoteCluster" WHERE "userId" = $1`, userId)

      if (results.clusters.length > 0) {
        await tx.noteCluster.createMany({
          data: results.clusters.map(cluster => ({
            userId,
            clusterId: cluster.clusterId,
            name: cluster.name,
            noteCount: cluster.noteIds.length,
            lastCalculated: calculatedAt
          }))
        })
      }

      if (results.clusteredNotes.length > 0) {
        await tx.clusterMember.createMany({
          data: results.clusteredNotes.map(clusteredNote => ({
            userId,
            noteId: clusteredNote.noteId,
            clusterId: clusteredNote.clusterId,
            membershipScore: clusteredNote.membershipScore,
            isCentral: clusteredNote.isCentral
          }))
        })
      }
    })
  }

  /**
   * Generate a name for a cluster using the LLM.
   *
   * Sends up to 5 of the cluster's most central notes (title plus the first
   * 100 characters of content) to the configured chat provider and returns
   * its answer, trimmed and capped at 50 characters.
   *
   * @param clusterId - Cluster to name.
   * @param userId - Owner of the cluster.
   * @returns The generated name, or `Cluster <id>` when the cluster has no
   *   central notes, the provider fails, or it answers with an empty string.
   */
  async generateClusterName(clusterId: number, userId: string): Promise<string> {
    const fallbackName = `Cluster ${clusterId}`
    const centralNotes = await this.getCentralNotes(clusterId, userId, 5)

    if (centralNotes.length === 0) {
      return fallbackName
    }

    const notesText = centralNotes
      .map((note, i) => `${i + 1}. "${note.title || 'Untitled'}" - ${note.content.slice(0, 100)}...`)
      .join('\n')

    const systemPrompt = 'You are a clustering assistant. Provide ONLY a concise name (2-4 words) in English. No punctuation, no explanation.'

    // State the real number of notes; a cluster may have fewer than 5 central ones.
    const userPrompt = `Analyze these ${centralNotes.length} notes that belong to the same cluster. What is the common theme?\n\n${notesText}\n\nTheme:`

    try {
      const config = await getSystemConfig()
      const provider = getChatProvider(config)
      const response = await provider.chat(
        [{ role: 'user', content: userPrompt }],
        systemPrompt
      )
      const name = response.text.trim().slice(0, 50)
      return name.length > 0 ? name : fallbackName
    } catch {
      // Naming is best-effort: any provider/config failure yields the generic name.
      return fallbackName
    }
  }

  /**
   * Check if recalculation is needed based on data change percentage.
   *
   * @param userId - Owner of the notes.
   * @returns true when no clustering is stored yet, or when more than 5% of
   *   the user's non-trashed notes were modified since the most recent
   *   calculation; false otherwise (including when the user has no notes).
   */
  async shouldRecalculate(userId: string): Promise<boolean> {
    const lastCluster = await prisma.noteCluster.findFirst({
      where: { userId },
      orderBy: { lastCalculated: 'desc' }
    })

    if (!lastCluster) return true

    // The two counts are independent, so they are issued together.
    const [modifiedCount, totalNotes] = await Promise.all([
      prisma.note.count({
        where: {
          userId,
          OR: [
            { updatedAt: { gt: lastCluster.lastCalculated } },
            { contentUpdatedAt: { gt: lastCluster.lastCalculated } }
          ]
        }
      }),
      prisma.note.count({
        where: { userId, trashedAt: null }
      })
    ])

    if (totalNotes === 0) return false

    const changePercentage = modifiedCount / totalNotes
    return changePercentage > 0.05 // More than 5% changed
  }

  /**
   * Get cached clustering results if available and fresh.
   *
   * @param userId - Owner of the clustering.
   * @returns The stored clusters ordered by cluster id, or null when nothing
   *   is stored or shouldRecalculate reports the data as stale.
   */
  async getCachedClusters(userId: string): Promise<ClusterResult[] | null> {
    const clusters = await prisma.noteCluster.findMany({
      where: { userId },
      orderBy: { clusterId: 'asc' }
    })

    if (clusters.length === 0) return null

    // Check if data is still fresh
    const needsUpdate = await this.shouldRecalculate(userId)
    if (needsUpdate) return null

    // Load all members in one query and group them by cluster in memory
    // instead of issuing one query per cluster.
    const members = await prisma.clusterMember.findMany({
      where: { userId },
      select: { noteId: true, clusterId: true }
    })

    const noteIdsByCluster = new Map<number, string[]>()
    for (const member of members) {
      const bucket = noteIdsByCluster.get(member.clusterId)
      if (bucket) {
        bucket.push(member.noteId)
      } else {
        noteIdsByCluster.set(member.clusterId, [member.noteId])
      }
    }

    return clusters.map(cluster => ({
      clusterId: cluster.clusterId,
      noteIds: noteIdsByCluster.get(cluster.clusterId) ?? [],
      name: cluster.name || undefined
    }))
  }
}

/** Shared module-level ClusteringService instance. */
export const clusteringService = new ClusteringService()