// Momento/memento-note/lib/ai/services/clustering.service.ts

/**
 * Clustering Service
 *
 * Density-based clustering algorithm (DBSCAN variant) for note embeddings.
 * Groups semantically similar notes into clusters without requiring
 * a preset number of clusters.
 *
 * Algorithm:
 * 1. For each note, find neighbors within epsilon cosine distance
 * 2. Form clusters from dense regions (min_cluster_size)
 * 3. Mark outliers as noise (cluster_id = -1)
 */

import prisma from '@/lib/prisma'
import { embeddingService } from './embedding.service'
import { getChatProvider } from '@/lib/ai/factory'
import { getSystemConfig } from '@/lib/config'
import { upsertNoteEmbedding } from '@/lib/embeddings'

/**
 * One cluster produced by a clustering run or reloaded from storage.
 */
export interface ClusterResult {
  /** Numeric identifier of the cluster. */
  clusterId: number
  /** Ids of the notes that belong to this cluster. */
  noteIds: string[]
  /** Optional centroid vector. NOTE(review): nothing in the visible service populates it — confirm usage. */
  centroid?: number[]
  /** Optional display name of the cluster. */
  name?: string
}

/**
 * Membership record linking one note to the cluster it was assigned to.
 */
export interface ClusteredNote {
  /** Id of the note. */
  noteId: string
  /** Identifier of the cluster the note belongs to. */
  clusterId: number
  /** Average cosine similarity between this note and the other members of its cluster. */
  membershipScore: number
  /** True when the note's membership score is at least the mean score of its cluster. */
  isCentral: boolean
}

/**
 * Tuning knobs accepted by the clustering entry point.
 */
export interface ClusteringOptions {
  /** Minimum number of neighbours a note needs to be treated as a core point. */
  minClusterSize?: number
  /** Cosine distance threshold (lower = more strict). */
  epsilon?: number
  /** Upper bound on the number of clusters that may be created. */
  maxClusters?: number
  /**
   * Internal use — prevents a retry loop.
   * NOTE(review): not read anywhere in the visible service; presumably consumed by a caller.
   */
  _relaxedRetry?: boolean
}

export class ClusteringService {
  // Declared default for `minClusterSize`.
  // NOTE(review): clusterNotes falls back to 2 (not this value) when only `epsilon` is supplied — confirm which is intended.
  private readonly DEFAULT_MIN_CLUSTER_SIZE = 3
  // Declared default for `epsilon`, expressed as a cosine distance (1 - similarity).
  // NOTE(review): clusterNotes falls back to 0.28 (not this value) when only `minClusterSize` is supplied — confirm.
  private readonly DEFAULT_EPSILON = 0.3 // Cosine distance ~ 1 - similarity
  // Default cap on the number of clusters created in one run.
  private readonly DEFAULT_MAX_CLUSTERS = 50
  // Below this many embedded notes, clusterNotes returns no clusters and reports every note as noise.
  private readonly MIN_NOTES_FOR_CLUSTERING = 10

  /**
   * Generates the note embeddings that semantic clustering needs but that are
   * missing or outdated.
   *
   * Only non-archived, non-trashed notes with non-blank content are considered.
   * A note is (re-)embedded when any of the following holds:
   * - `options.force` is set,
   * - it has no embedding yet,
   * - its `updatedAt` is more recent than its embedding's `createdAt`,
   * - it has a non-blank `sourceUrl` (web clips are always re-embedded).
   *
   * The operation is best-effort: a failure on one note never aborts the run.
   * Failures are counted and reported once at the end instead of being
   * silently discarded.
   *
   * @param userId - Owner of the notes.
   * @param options - `force` re-embeds every eligible note.
   * @returns `created`: embeddings written during this call; `total`: number of
   *   stored embeddings attached to the user's non-trashed notes.
   */
  async ensureEmbeddings(
    userId: string,
    options?: { force?: boolean },
  ): Promise<{ created: number; total: number }> {
    const notes = await prisma.note.findMany({
      where: {
        userId,
        isArchived: false,
        trashedAt: null,
      },
      select: {
        id: true,
        title: true,
        content: true,
        sourceUrl: true,
        updatedAt: true,
        noteEmbedding: { select: { noteId: true, createdAt: true } },
      },
    })

    const force = Boolean(options?.force)
    let created = 0
    let failed = 0
    let firstError: unknown

    for (const note of notes) {
      if (!note.content?.trim()) continue

      const existing = note.noteEmbedding
      const isClip = Boolean(note.sourceUrl?.trim())
      // Missing embeddings and embeddings older than the note are both stale.
      const isStale = existing ? note.updatedAt > existing.createdAt : true
      if (!force && !isStale && !isClip) continue

      try {
        const { embedding } = await embeddingService.generateNoteEmbedding(
          note.title,
          note.content,
        )
        if (embedding?.length) {
          await upsertNoteEmbedding(note.id, embedding)
          created++
        }
      } catch (error: unknown) {
        // Best-effort: skip this note and keep going, but remember the failure.
        failed++
        if (failed === 1) firstError = error
      }
    }

    if (failed > 0) {
      console.warn(
        `[Clustering] ensureEmbeddings: ${failed} note(s) could not be embedded (first error follows)`,
        firstError,
      )
    }

    const totalRow = await prisma.$queryRawUnsafe<Array<{ count: bigint }>>(
      `SELECT COUNT(*) FROM "NoteEmbedding" ne
       INNER JOIN "Note" n ON n.id = ne."noteId"
       WHERE n."userId" = $1 AND n."trashedAt" IS NULL AND ne."embedding" IS NOT NULL`,
      userId
    )

    return { created, total: Number(totalRow[0]?.count ?? 0) }
  }

  /**
   * Returns the cosine similarity between the stored embeddings of two notes.
   *
   * The value is computed in the database as `1 - (a <=> b)` on the two
   * `NoteEmbedding` rows. When no row pair is found (or the similarity is
   * falsy) the result is 0.
   *
   * @param noteIdA - Id of the first note.
   * @param noteIdB - Id of the second note.
   * @returns Similarity reported by the database, or 0.
   */
  private async getCosineSimilarity(
    noteIdA: string,
    noteIdB: string
  ): Promise<number> {
    const similarityQuery = `SELECT 1 - (e1."embedding"::vector <=> e2."embedding"::vector) AS similarity
       FROM "NoteEmbedding" e1, "NoteEmbedding" e2
       WHERE e1."noteId" = $1 AND e2."noteId" = $2`

    const rows = await prisma.$queryRawUnsafe<Array<{ similarity: number }>>(
      similarityQuery,
      noteIdA,
      noteIdB
    )

    const [firstRow] = rows
    return firstRow?.similarity || 0
  }

  /**
   * Finds, in the database, every candidate note whose embedding lies within
   * `epsilon` cosine distance of the given note.
   *
   * `epsilon` is a cosine *distance* threshold (lower = stricter), the same
   * convention used by `ClusteringOptions.epsilon` and by the in-memory
   * neighbour search in `clusterNotes`. It is therefore compared directly with
   * the `<=>` distance instead of being converted with `1 - epsilon`.
   *
   * @param noteId - Note whose neighbourhood is requested; never part of the result.
   * @param allNoteIds - Candidate note ids the search is restricted to.
   * @param epsilon - Maximum cosine distance for two notes to be neighbours.
   * @returns Ids of the neighbouring notes (possibly empty).
   */
  private async findNeighbors(
    noteId: string,
    allNoteIds: string[],
    epsilon: number
  ): Promise<string[]> {
    // No candidates means no neighbours; skip the round-trip.
    if (allNoteIds.length === 0) return []

    const result = await prisma.$queryRawUnsafe<Array<{ noteId: string }>>(
      `SELECT e2."noteId"
       FROM "NoteEmbedding" e1
       CROSS JOIN "NoteEmbedding" e2
       WHERE e1."noteId" = $1
         AND e2."noteId" != $1
         AND e2."noteId" = ANY($2::text[])
         AND (e1."embedding"::vector <=> e2."embedding"::vector) <= $3`,
      noteId,
      allNoteIds,
      epsilon
    )

    return result.map(r => r.noteId)
  }

  /**
   * Grows a cluster from a core (seed) note using database-backed neighbour
   * lookups, following the same DBSCAN rules as the in-memory expansion in
   * `clusterNotes`:
   *
   * - the seed's direct neighbours join the cluster immediately;
   * - each queued note is looked up once; if it has at least `minClusterSize`
   *   neighbours it is a core point and its neighbours join the cluster too;
   * - a note may be claimed when it is unassigned or was previously marked as
   *   noise (`-1`); notes already owned by another cluster are left alone.
   *
   * `visited` and `clustered` are mutated in place.
   *
   * @param noteId - Seed note (already identified as a core point by the caller).
   * @param neighbors - Direct neighbours of the seed.
   * @param clusterId - Identifier assigned to every note that joins the cluster.
   * @param visited - Notes whose neighbourhood has already been evaluated.
   * @param clustered - Current assignment of notes to cluster ids (`-1` = noise).
   * @param allNoteIds - Candidate note ids forwarded to the neighbour lookup.
   * @param epsilon - Cosine distance threshold forwarded to the neighbour lookup.
   * @param minClusterSize - Neighbour count required for a note to be a core point.
   * @returns Ids of all cluster members, seed first.
   */
  private async expandCluster(
    noteId: string,
    neighbors: string[],
    clusterId: number,
    visited: Set<string>,
    clustered: Map<string, number>,
    allNoteIds: string[],
    epsilon: number,
    minClusterSize: number
  ): Promise<string[]> {
    const clusterMembers: string[] = [noteId]
    clustered.set(noteId, clusterId)

    // Claims a note for this cluster when it is unassigned or noise.
    const claim = (candidateId: string): boolean => {
      const status = clustered.get(candidateId)
      if (status !== undefined && status !== -1) return false
      clustered.set(candidateId, clusterId)
      clusterMembers.push(candidateId)
      return true
    }

    // The seed is a core point, so all of its direct neighbours belong to the cluster.
    for (const neighborId of neighbors) {
      claim(neighborId)
    }

    // Index-based traversal avoids the O(n) cost of Array.prototype.shift.
    const queue = [...neighbors]
    for (let head = 0; head < queue.length; head++) {
      const currentNoteId = queue[head]
      if (currentNoteId === undefined || visited.has(currentNoteId)) continue
      visited.add(currentNoteId)

      const currentNeighbors = await this.findNeighbors(currentNoteId, allNoteIds, epsilon)
      // Border points join the cluster but do not propagate it further.
      if (currentNeighbors.length < minClusterSize) continue

      for (const neighborId of currentNeighbors) {
        if (claim(neighborId)) {
          queue.push(neighborId)
        }
      }
    }

    return clusterMembers
  }

  /**
   * Computes the cosine similarity of two embedding vectors in memory.
   *
   * Returns 0 — "not similar" — instead of NaN when the vectors cannot be
   * compared: empty input, different dimensions (for example embeddings
   * produced by different models), a zero-norm vector, or a non-finite result.
   * This keeps NaN out of distance comparisons and membership scores.
   *
   * @param vecA - First vector.
   * @param vecB - Second vector.
   * @returns Cosine similarity, or 0 when it is undefined.
   */
  private calculateCosineSimilarityInMemory(vecA: number[], vecB: number[]): number {
    const len = vecA.length
    if (len === 0 || len !== vecB.length) return 0

    let dotProduct = 0.0
    let normA = 0.0
    let normB = 0.0
    for (let i = 0; i < len; i++) {
      const a = vecA[i]
      const b = vecB[i]
      dotProduct += a * b
      normA += a * a
      normB += b * b
    }
    if (normA === 0 || normB === 0) return 0

    const similarity = dotProduct / (Math.sqrt(normA) * Math.sqrt(normB))
    return Number.isFinite(similarity) ? similarity : 0
  }

  /**
   * Clusters a user's note embeddings with an in-memory DBSCAN variant.
   *
   * Steps:
   * 1. Load every embedding attached to the user's non-trashed notes with a
   *    single query and parse it into a numeric vector.
   * 2. If fewer than `MIN_NOTES_FOR_CLUSTERING` vectors are available, return
   *    no clusters and report every note as noise.
   * 3. Run DBSCAN once per candidate `(eps, minSize)` profile. The first
   *    profile yielding 3 to 8 clusters with no cluster holding more than 70%
   *    of the notes is selected; otherwise the profile whose cluster count is
   *    closest to 5 is kept.
   * 4. Pair up remaining noise notes whose cosine distance is at most 0.22
   *    into two-note micro-clusters (while `maxClusters` allows).
   * 5. Compute each member's membership score (mean similarity to the other
   *    members) and flag members whose score is at least the cluster mean.
   *
   * @param userId - Owner of the notes to cluster.
   * @param options - When `epsilon` or `minClusterSize` is provided, the
   *   profile search is skipped and a single run is made with the given value(s);
   *   the missing one falls back to 0.28 / 2. `maxClusters` caps the number of
   *   clusters (default `DEFAULT_MAX_CLUSTERS`).
   * @returns The clusters, the per-note membership records, and the number of
   *   notes left as noise.
   */
  async clusterNotes(
    userId: string,
    options: ClusteringOptions = {}
  ): Promise<{
    clusters: ClusterResult[]
    clusteredNotes: ClusteredNote[]
    noiseCount: number
  }> {
    // `epsilon` / `minClusterSize` are resolved further down: whether they were
    // provided at all decides if the profile search runs.
    const { maxClusters = this.DEFAULT_MAX_CLUSTERS } = options

    // Single query: every embedding of the user's non-trashed notes, serialised as text.
    const embeddingsRow = await prisma.$queryRawUnsafe<Array<{ noteId: string; embedding: string }>>(
      `SELECT ne."noteId", ne."embedding"::text AS "embedding"
       FROM "NoteEmbedding" ne
       INNER JOIN "Note" n ON n.id = ne."noteId"
       WHERE n."userId" = $1
         AND n."trashedAt" IS NULL
         AND ne."embedding" IS NOT NULL`,
      userId
    )

    const embeddingMap = new Map<string, number[]>()
    for (const row of embeddingsRow) {
      if (!row.embedding) continue
      try {
        const parsed: unknown = JSON.parse(row.embedding)
        // Only keep values that really are numeric arrays; anything else would
        // crash or poison the similarity computations below.
        if (Array.isArray(parsed) && parsed.every(value => typeof value === 'number')) {
          embeddingMap.set(row.noteId, parsed)
        } else {
          console.error("Error parsing embedding vector:", `unexpected shape for note ${row.noteId}`)
        }
      } catch (e) {
        console.error("Error parsing embedding vector:", e)
      }
    }

    const allNoteIds = Array.from(embeddingMap.keys())

    if (allNoteIds.length < this.MIN_NOTES_FOR_CLUSTERING) {
      return {
        clusters: [],
        clusteredNotes: [],
        noiseCount: allNoteIds.length
      }
    }

    // Neighbours of a note: every other note within `currentEpsilon` cosine distance.
    const findNeighborsInMemory = (noteId: string, currentEpsilon: number): string[] => {
      const vecA = embeddingMap.get(noteId)
      if (!vecA) return []

      const neighbors: string[] = []
      for (const [otherId, vecB] of embeddingMap) {
        if (otherId === noteId) continue
        const distance = 1 - this.calculateCosineSimilarityInMemory(vecA, vecB)
        if (distance <= currentEpsilon) {
          neighbors.push(otherId)
        }
      }
      return neighbors
    }

    // DBSCAN expansion from a core note. A note can be claimed when it is
    // unassigned or currently noise (-1). Because claiming sets its status to
    // the current cluster id, a note can never be claimed twice during one
    // expansion, so no membership lookup is needed before pushing.
    const expandClusterInMemory = (
      noteId: string,
      neighbors: string[],
      currentClusterId: number,
      visited: Set<string>,
      clustered: Map<string, number>,
      currentEpsilon: number,
      currentMinSize: number
    ): string[] => {
      const clusterMembers: string[] = [noteId]

      const claim = (candidateId: string): boolean => {
        const status = clustered.get(candidateId)
        if (status !== undefined && status !== -1) return false
        clustered.set(candidateId, currentClusterId)
        clusterMembers.push(candidateId)
        return true
      }

      // The seed is a core point: its direct neighbours join immediately.
      for (const neighborId of neighbors) {
        claim(neighborId)
      }

      // Index-based FIFO traversal (same order as shift(), without its O(n) cost).
      const queue = [...neighbors]
      for (let head = 0; head < queue.length; head++) {
        const currentNoteId = queue[head]
        if (currentNoteId === undefined || visited.has(currentNoteId)) continue
        visited.add(currentNoteId)

        const currentNeighbors = findNeighborsInMemory(currentNoteId, currentEpsilon)
        // Only core points propagate the cluster to their neighbours.
        if (currentNeighbors.length < currentMinSize) continue

        for (const neighborId of currentNeighbors) {
          if (claim(neighborId)) {
            queue.push(neighborId)
          }
        }
      }
      return clusterMembers
    }

    // Candidate profiles, tried in order until one gives a well-balanced result.
    const searchConfigs = [
      { eps: 0.28, minSize: 2 }, // Baseline profile, tried first
      { eps: 0.25, minSize: 2 }, // Slightly stricter
      { eps: 0.30, minSize: 2 }, // Slightly looser
      { eps: 0.22, minSize: 2 }, // Strict
      { eps: 0.18, minSize: 2 }, // Very strict
      { eps: 0.25, minSize: 1 }, // Allows a single neighbour to form a cluster
      { eps: 0.22, minSize: 1 }, // Same, stricter distance
      { eps: 0.28, minSize: 3 }, // Requires three neighbours
      { eps: 0.25, minSize: 3 }, // Requires three neighbours, stricter distance
      { eps: 0.32, minSize: 2 }, // Looser
      { eps: 0.35, minSize: 2 }  // Loosest
    ]

    let bestClusters: ClusterResult[] = []
    let bestClustered = new Map<string, number>()
    let bestNoiseCount = allNoteIds.length
    let bestConfig = searchConfigs[0]

    // Explicit parameters bypass the profile search.
    const hasExplicitParams = options.epsilon !== undefined || options.minClusterSize !== undefined
    const configsToRun = hasExplicitParams
      ? [{ eps: options.epsilon ?? 0.28, minSize: options.minClusterSize ?? 2 }]
      : searchConfigs

    for (const config of configsToRun) {
      const visited = new Set<string>()
      const clustered = new Map<string, number>() // noteId -> clusterId (-1 = noise)
      const clusterResults: ClusterResult[] = []
      let currentClusterId = 0

      // Core DBSCAN loop
      for (const noteId of allNoteIds) {
        if (visited.has(noteId)) continue
        visited.add(noteId)

        const neighbors = findNeighborsInMemory(noteId, config.eps)
        if (neighbors.length < config.minSize) {
          clustered.set(noteId, -1)
          continue
        }

        // Core point: start a new cluster from it.
        clustered.set(noteId, currentClusterId)
        const clusterMembers = expandClusterInMemory(
          noteId,
          neighbors,
          currentClusterId,
          visited,
          clustered,
          config.eps,
          config.minSize
        )

        if (clusterMembers.length >= config.minSize && currentClusterId < maxClusters) {
          clusterResults.push({
            clusterId: currentClusterId,
            noteIds: clusterMembers
          })
          currentClusterId++
        } else {
          // Cluster too small or cluster budget exhausted: its members become noise.
          for (const memberId of clusterMembers) {
            clustered.set(memberId, -1)
          }
        }
      }

      const noiseCount = Array.from(clustered.values()).filter(id => id === -1).length

      // Quality check: 3 to 8 clusters and no cluster absorbing more than 70% of the notes.
      const numClusters = clusterResults.length
      const largestClusterSize = clusterResults.reduce((max, c) => Math.max(max, c.noteIds.length), 0)
      const hasGiantCluster = largestClusterSize > allNoteIds.length * 0.70

      if (numClusters >= 3 && numClusters <= 8 && !hasGiantCluster) {
        bestClusters = clusterResults
        bestClustered = clustered
        bestNoiseCount = noiseCount
        bestConfig = config
        break // Good enough: stop searching.
      }

      // Otherwise remember the run whose cluster count is closest to 5.
      if (bestClusters.length === 0 ||
          Math.abs(numClusters - 5) < Math.abs(bestClusters.length - 5) ||
          (bestClusters.length === 1 && numClusters > 1)) {
        bestClusters = clusterResults
        bestClustered = clustered
        bestNoiseCount = noiseCount
        bestConfig = config
      }
    }

    console.log(`[DBSCAN Clustering] Selected configuration: epsilon=${bestConfig.eps}, minSize=${bestConfig.minSize} -> Generated ${bestClusters.length} clusters (Noise: ${bestNoiseCount})`)

    // Micro-clusters for isolated, highly similar pairs: each remaining noise
    // note is paired with its closest still-unpaired noise note, provided their
    // cosine distance is at most 0.22 (similarity >= 78%).
    // NOTE(review): this threshold applies even when the caller passed a
    // stricter `epsilon` or a larger `minClusterSize` — confirm that is intended.
    const MICRO_PAIR_MAX_DISTANCE = 0.22
    const noiseNoteIds = allNoteIds.filter(id => bestClustered.get(id) === -1)
    const processedPairs = new Set<string>()

    for (const idA of noiseNoteIds) {
      // Once the cluster budget is used up no further pair can be stored.
      if (bestClusters.length >= maxClusters) break
      if (processedPairs.has(idA)) continue
      const vecA = embeddingMap.get(idA)
      if (!vecA) continue

      let bestPairId: string | null = null
      let bestPairDist = 1.0

      for (const idB of noiseNoteIds) {
        if (idA === idB || processedPairs.has(idB)) continue
        const vecB = embeddingMap.get(idB)
        if (!vecB) continue

        const distance = 1 - this.calculateCosineSimilarityInMemory(vecA, vecB)
        if (distance <= MICRO_PAIR_MAX_DISTANCE && distance < bestPairDist) {
          bestPairDist = distance
          bestPairId = idB
        }
      }

      if (bestPairId) {
        const newCid = bestClusters.length
        bestClusters.push({
          clusterId: newCid,
          noteIds: [idA, bestPairId]
        })
        bestClustered.set(idA, newCid)
        bestClustered.set(bestPairId, newCid)
        processedPairs.add(idA)
        processedPairs.add(bestPairId)
        console.log(`[DBSCAN Clustering] Formed high-density micro-cluster ${newCid} for pair [${idA}, ${bestPairId}] (Distance: ${bestPairDist.toFixed(4)})`)
      }
    }

    // Noise count after the pairing step.
    const finalNoiseCount = Array.from(bestClustered.values()).filter(id => id === -1).length

    // Membership score: mean similarity between a note and the other members of its cluster.
    const calculateMembershipScoreInMemory = (noteId: string, memberIds: string[]): number => {
      if (memberIds.length <= 1) return 1.0
      const vecA = embeddingMap.get(noteId)
      if (!vecA) return 0.0

      let totalSim = 0.0
      let count = 0
      for (const mId of memberIds) {
        if (mId === noteId) continue
        const vecB = embeddingMap.get(mId)
        if (vecB) {
          totalSim += this.calculateCosineSimilarityInMemory(vecA, vecB)
          count++
        }
      }
      return count > 0 ? totalSim / count : 1.0
    }

    // Build one membership record per clustered note (noise is skipped).
    const clusteredNotes: ClusteredNote[] = []
    for (const [noteId, cid] of bestClustered.entries()) {
      if (cid === -1) continue

      // Cluster ids are assigned sequentially, so the id doubles as the array index.
      const cluster = bestClusters[cid]
      if (!cluster) continue

      clusteredNotes.push({
        noteId,
        clusterId: cid,
        membershipScore: calculateMembershipScoreInMemory(noteId, cluster.noteIds),
        isCentral: false // decided below
      })
    }

    // A member is central when its score is at least the mean score of its cluster.
    bestClusters.forEach((_cluster, cid) => {
      const membersOfThisCluster = clusteredNotes.filter(cn => cn.clusterId === cid)
      if (membersOfThisCluster.length === 0) return

      const meanScore = membersOfThisCluster.reduce((sum, cn) => sum + cn.membershipScore, 0) / membersOfThisCluster.length
      for (const cn of membersOfThisCluster) {
        cn.isCentral = cn.membershipScore >= meanScore
      }
    })

    return {
      clusters: bestClusters,
      clusteredNotes,
      noiseCount: finalNoiseCount
    }
  }

  /**
   * Database-backed membership score of a note inside a cluster: the mean
   * cosine similarity between the note and every other listed member.
   *
   * @param noteId - Note being scored.
   * @param clusterMemberIds - Ids of all members of the cluster.
   * @returns The mean similarity, or 1.0 when the list holds at most one id or
   *   contains no id other than `noteId`.
   */
  private async calculateMembershipScore(noteId: string, clusterMemberIds: string[]): Promise<number> {
    if (clusterMemberIds.length <= 1) return 1.0

    const peerIds = clusterMemberIds.filter(memberId => memberId !== noteId)
    if (peerIds.length === 0) return 1.0

    // One similarity query per peer, issued sequentially.
    let total = 0
    for (const peerId of peerIds) {
      const similarity = await this.getCosineSimilarity(noteId, peerId)
      total += similarity
    }

    return total / peerIds.length
  }

  /**
   * Tells whether a note is central to its cluster, i.e. whether its
   * membership score is at least the mean membership score of all members.
   *
   * Every member is scored through `calculateMembershipScore`, one after the
   * other. A note that is not in the list is treated as having a score of 0.
   *
   * @param noteId - Note to evaluate.
   * @param clusterMemberIds - Ids of all members of the cluster.
   * @returns True when the note's score reaches the cluster mean.
   */
  private async isCentralNote(noteId: string, clusterMemberIds: string[]): Promise<boolean> {
    let scoreSum = 0
    let ownScore: number | undefined

    for (const memberId of clusterMemberIds) {
      const memberScore = await this.calculateMembershipScore(memberId, clusterMemberIds)
      scoreSum += memberScore
      // Keep the score of the first entry matching the note.
      if (ownScore === undefined && memberId === noteId) {
        ownScore = memberScore
      }
    }

    const clusterMean = scoreSum / clusterMemberIds.length
    return (ownScore || 0) >= clusterMean
  }

  /**
   * Returns up to `n` of the most central notes of a stored cluster, for
   * naming purposes.
   *
   * Only members flagged `isCentral` are considered. They are ordered by
   * membership score (highest first, note id as tie-breaker) so that the
   * limit keeps the most representative notes instead of an arbitrary subset.
   * Rows are grouped per note, which keeps the result free of duplicates.
   *
   * NOTE(review): relies on the `ClusterMember` column being named
   * `"membershipScore"`, like the Prisma field written by
   * `saveClusteringResults` — confirm against the schema.
   *
   * @param clusterId - Identifier of the cluster.
   * @param userId - Owner of the cluster.
   * @param n - Maximum number of notes to return (default 5).
   * @returns Id, title and content of the selected notes.
   */
  async getCentralNotes(clusterId: number, userId: string, n: number = 5): Promise<Array<{ noteId: string; title: string | null; content: string }>> {
    // LIMIT must be a non-negative integer; anything else yields no rows.
    const limit = Number.isFinite(n) && n > 0 ? Math.floor(n) : 0
    if (limit === 0) return []

    const result = await prisma.$queryRawUnsafe<Array<{ noteId: string; title: string | null; content: string }>>(
      `SELECT n.id AS "noteId", n.title, n.content
       FROM "ClusterMember" cm
       INNER JOIN "Note" n ON n.id = cm."noteId"
       WHERE cm."clusterId" = $1
         AND cm."userId" = $2
         AND cm."isCentral" = true
       GROUP BY n.id, n.title, n.content
       ORDER BY MAX(cm."membershipScore") DESC, n.id ASC
       LIMIT $3`,
      clusterId,
      userId,
      limit
    )

    return result
  }

  /**
   * Replaces the user's stored clustering with the given results.
   *
   * Inside one transaction, all existing `ClusterMember` and `NoteCluster`
   * rows of the user are deleted and the new clusters and memberships are
   * inserted. Inserts are batched with `createMany` rather than issued one
   * row at a time, which keeps the transaction short, and every cluster of a
   * run shares the same `lastCalculated` timestamp.
   *
   * @param userId - Owner of the clustering.
   * @param results - Clusters and per-note memberships to persist.
   */
  async saveClusteringResults(
    userId: string,
    results: { clusters: ClusterResult[]; clusteredNotes: ClusteredNote[] }
  ): Promise<void> {
    // Rows per INSERT statement; keeps each statement well below the
    // database's bind-parameter limit.
    const BATCH_SIZE = 1000
    const lastCalculated = new Date()

    const clusterRows = results.clusters.map(cluster => ({
      userId,
      clusterId: cluster.clusterId,
      name: cluster.name,
      noteCount: cluster.noteIds.length,
      lastCalculated
    }))

    const memberRows = results.clusteredNotes.map(clusteredNote => ({
      userId,
      noteId: clusteredNote.noteId,
      clusterId: clusteredNote.clusterId,
      membershipScore: clusteredNote.membershipScore,
      isCentral: clusteredNote.isCentral
    }))

    await prisma.$transaction(async (tx) => {
      // Clear existing clusters for this user (members first).
      await tx.$executeRawUnsafe(`DELETE FROM "ClusterMember" WHERE "userId" = $1`, userId)
      await tx.$executeRawUnsafe(`DELETE FROM "NoteCluster" WHERE "userId" = $1`, userId)

      // Insert new clusters
      for (let start = 0; start < clusterRows.length; start += BATCH_SIZE) {
        await tx.noteCluster.createMany({ data: clusterRows.slice(start, start + BATCH_SIZE) })
      }

      // Insert cluster members
      for (let start = 0; start < memberRows.length; start += BATCH_SIZE) {
        await tx.clusterMember.createMany({ data: memberRows.slice(start, start + BATCH_SIZE) })
      }
    })
  }

  /**
   * Generates a short thematic name for a stored cluster with the chat LLM.
   *
   * Up to 5 central notes are summarised (title plus the first 100 characters
   * of content) and sent to the provider. The reply is trimmed and cut to 50
   * characters. The generic name `Cluster <id>` is returned when the cluster
   * has no central note, when the provider fails, or when the reply is empty.
   *
   * @param clusterId - Identifier of the cluster to name.
   * @param userId - Owner of the cluster.
   * @returns The generated name, or the generic fallback.
   */
  async generateClusterName(clusterId: number, userId: string): Promise<string> {
    const fallbackName = `Cluster ${clusterId}`
    const centralNotes = await this.getCentralNotes(clusterId, userId, 5)

    if (centralNotes.length === 0) {
      return fallbackName
    }

    const notesText = centralNotes
      // Content comes from a raw query: guard against a null value at runtime.
      .map((note, i) => `${i + 1}. "${note.title || 'Untitled'}" - ${(note.content ?? '').slice(0, 100)}...`)
      .join('\n')

    const systemPrompt = "Vous êtes un assistant d'analyse sémantique. Analysez les notes fournies et dégagez un thème commun clair, élégant et évocateur (2 à 4 mots maximum), écrit en français (ou dans la langue principale des notes). Ne donnez QUE le titre thématique final, sans ponctuation, sans guillemets, et sans aucune explication."

    // State the real number of notes: a cluster can have fewer than 5 central notes.
    const noteCount = centralNotes.length
    const notesLabel = noteCount > 1 ? 'notes centrales' : 'note centrale'
    const userPrompt = `Voici ${noteCount} ${notesLabel} appartenant au même groupe thématique. Quel est leur thème commun ?\n\n${notesText}\n\nThème :`

    try {
      const config = await getSystemConfig()
      const provider = getChatProvider(config)
      const response = await provider.chat(
        [{ role: 'user', content: userPrompt }],
        systemPrompt
      )
      const name = response.text.trim().slice(0, 50)
      // An empty reply must not become an empty cluster name.
      return name || fallbackName
    } catch (error: unknown) {
      console.warn(`[Clustering] generateClusterName failed for cluster ${clusterId}; using fallback name`, error)
      return fallbackName
    }
  }

  /**
   * Decides whether the stored clustering of a user is out of date.
   *
   * Returns true when nothing has been stored yet, or when the notes whose
   * `updatedAt` or `contentUpdatedAt` is later than the most recent
   * `lastCalculated` represent more than 5% of the user's non-trashed notes.
   * Returns false when the user has no non-trashed note.
   *
   * @param userId - Owner of the notes and clusters.
   * @returns True when a new clustering run is warranted.
   */
  async shouldRecalculate(userId: string): Promise<boolean> {
    const CHANGE_THRESHOLD = 0.05

    const latestCluster = await prisma.noteCluster.findFirst({
      where: { userId },
      orderBy: { lastCalculated: 'desc' }
    })
    if (!latestCluster) {
      return true
    }

    const since = latestCluster.lastCalculated

    // Notes touched after the last run.
    const modifiedCount = await prisma.note.count({
      where: {
        userId,
        OR: [
          { updatedAt: { gt: since } },
          { contentUpdatedAt: { gt: since } }
        ]
      }
    })

    const totalNotes = await prisma.note.count({
      where: { userId, trashedAt: null }
    })
    if (totalNotes === 0) {
      return false
    }

    return modifiedCount / totalNotes > CHANGE_THRESHOLD
  }

  /**
   * Loads the clusters stored in the database for a user, even when they are
   * stale.
   *
   * Members of all clusters are fetched with a single query and grouped in
   * memory, instead of issuing one query per cluster.
   *
   * @param userId - Owner of the clusters.
   * @returns `null` when nothing is stored; otherwise the clusters (ordered by
   *   `clusterId`), whether `shouldRecalculate` considers them stale, and the
   *   most recent `lastCalculated` timestamp found.
   */
  async getStoredClusters(userId: string): Promise<{
    clusters: ClusterResult[]
    stale: boolean
    lastCalculated: Date | null
  } | null> {
    const clusters = await prisma.noteCluster.findMany({
      where: { userId },
      orderBy: { clusterId: 'asc' }
    })

    if (clusters.length === 0) return null

    const stale = await this.shouldRecalculate(userId)
    const lastCalculated = clusters.reduce<Date | null>((latest, c) => {
      if (!c.lastCalculated) return latest
      return !latest || c.lastCalculated > latest ? c.lastCalculated : latest
    }, null)

    // One round-trip for the members of every cluster.
    const members = await prisma.clusterMember.findMany({
      where: { userId, clusterId: { in: clusters.map(c => c.clusterId) } },
      select: { noteId: true, clusterId: true }
    })

    const noteIdsByCluster = new Map<number, string[]>()
    for (const member of members) {
      const bucket = noteIdsByCluster.get(member.clusterId)
      if (bucket) {
        bucket.push(member.noteId)
      } else {
        noteIdsByCluster.set(member.clusterId, [member.noteId])
      }
    }

    const result: ClusterResult[] = clusters.map(cluster => ({
      clusterId: cluster.clusterId,
      noteIds: noteIdsByCluster.get(cluster.clusterId) ?? [],
      name: cluster.name || undefined
    }))

    return { clusters: result, stale, lastCalculated }
  }

  /**
   * Returns the stored clusters only when they are still fresh.
   *
   * @deprecated Prefer getStoredClusters, which no longer hides stale results.
   * @param userId - Owner of the clusters.
   * @returns The stored clusters, or `null` when none exist or they are stale.
   */
  async getCachedClusters(userId: string): Promise<ClusterResult[] | null> {
    const stored = await this.getStoredClusters(userId)
    return stored && !stored.stale ? stored.clusters : null
  }
}

/** Module-level ClusteringService instance exported for use by importers of this file. */
export const clusteringService = new ClusteringService()