// Momento/memento-note/scripts/compare-dbscan.ts

import { PrismaClient } from '@prisma/client'
import * as d3 from 'd3'

// Single Prisma client shared by every query in this script; it is
// disconnected once main() has settled (see the last line of the file).
const prisma = new PrismaClient()

/**
 * A node identified by `id` and tagged with the cluster it belongs to.
 *
 * NOTE(review): nothing in the visible part of this script references this
 * type; presumably it mirrors a D3 graph node shape used elsewhere — confirm
 * before relying on it or removing it.
 */
interface D3Node {
  /** Node identifier. */
  id: string
  /** Cluster label; may be either a string or a number. */
  clusterId: string | number
}

/**
 * Asks the database for the similarity between the stored embeddings of two notes.
 *
 * The query casts both embeddings to `vector` and evaluates `1 - (a <=> b)`,
 * i.e. one minus the pgvector distance reported by the `<=>` operator.
 *
 * @param noteIdA - id of the first note
 * @param noteIdB - id of the second note
 * @returns the `similarity` of the first result row, or `0` when the query
 *          yields no row or the value is falsy
 */
async function getCosineSimilarityDB(noteIdA: string, noteIdB: string): Promise<number> {
  const similaritySql = `SELECT 1 - (e1."embedding"::vector <=> e2."embedding"::vector) AS similarity
     FROM "NoteEmbedding" e1, "NoteEmbedding" e2
     WHERE e1."noteId" = $1 AND e2."noteId" = $2`

  const rows = await prisma.$queryRawUnsafe<Array<{ similarity: number }>>(
    similaritySql,
    noteIdA,
    noteIdB
  )

  // No matching pair of embeddings: report 0 instead of failing.
  const [firstRow] = rows
  if (!firstRow) return 0

  return firstRow.similarity || 0
}

/**
 * Computes the cosine similarity of two vectors in plain JavaScript.
 *
 * Degenerate inputs yield `0` rather than `NaN`:
 *  - either vector has zero magnitude (this includes empty vectors);
 *  - the vectors have different lengths. Previously only `vecA.length`
 *    components were read, so a shorter `vecB` produced `NaN` and a longer
 *    one was silently truncated, making the result depend on argument order.
 *
 * @param vecA - first vector
 * @param vecB - second vector
 * @returns `dot(vecA, vecB) / (|vecA| * |vecB|)`, or `0` for the degenerate
 *          cases listed above
 */
function calculateCosineSimilarityInMemory(vecA: number[], vecB: number[]): number {
  // Similarity is undefined across different dimensions; treat as "not similar".
  if (vecA.length !== vecB.length) return 0

  let dotProduct = 0.0
  let normA = 0.0
  let normB = 0.0
  const len = vecA.length
  for (let i = 0; i < len; i++) {
    const a = vecA[i]
    const b = vecB[i]
    dotProduct += a * b
    normA += a * a
    normB += b * b
  }

  // Avoid dividing by zero for all-zero (or empty) vectors.
  if (normA === 0 || normB === 0) return 0
  return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB))
}

/** Cluster id recorded for notes that the DBSCAN simulation classifies as noise. */
const NOISE_CLUSTER_ID = -1

/** Outcome of one in-memory DBSCAN pass. */
interface DbscanSimulationResult {
  clusters: Array<{ clusterId: number; noteIds: string[] }>
  noiseCount: number
}

/**
 * Lists every other note whose in-memory cosine distance (1 - similarity) to
 * `noteId` is at most `maxDistance`.
 *
 * @returns the neighbor ids in map iteration order; empty when `noteId` has
 *          no embedding in `embeddings`
 */
function findNeighborsInMemory(
  noteId: string,
  embeddings: ReadonlyMap<string, number[]>,
  maxDistance: number
): string[] {
  const seedVector = embeddings.get(noteId)
  if (!seedVector) return []

  const neighbors: string[] = []
  embeddings.forEach((otherVector, otherId) => {
    if (otherId === noteId) return
    const distance = 1 - calculateCosineSimilarityInMemory(seedVector, otherVector)
    if (distance <= maxDistance) neighbors.push(otherId)
  })
  return neighbors
}

/**
 * Runs a DBSCAN-style clustering entirely in memory, using `eps` as a cosine
 * distance threshold. A note is a core point when it has at least
 * `minClusterSize` neighbors (the note itself is not counted).
 *
 * @returns the clusters in creation order and the number of notes left as noise
 */
function simulateDbscan(
  noteIds: readonly string[],
  embeddings: ReadonlyMap<string, number[]>,
  eps: number,
  minClusterSize: number
): DbscanSimulationResult {
  const visited = new Set<string>()
  const clustered = new Map<string, number>() // noteId -> clusterId
  const clusters: Array<{ clusterId: number; noteIds: string[] }> = []
  let nextClusterId = 0

  const expandCluster = (noteId: string, neighbors: string[], cid: number): string[] => {
    const members: string[] = [noteId]
    const queue = [...neighbors]
    clustered.set(noteId, cid)

    // Adds a note to this cluster if it is unassigned or was marked as noise.
    // Once claimed its recorded id is `cid`, so it can never be claimed twice
    // and `members` needs no separate duplicate check.
    const claim = (candidateId: string): boolean => {
      const assigned = clustered.get(candidateId)
      if (assigned !== undefined && assigned !== NOISE_CLUSTER_ID) return false
      clustered.set(candidateId, cid)
      members.push(candidateId)
      return true
    }

    for (const neighborId of neighbors) claim(neighborId)

    // Breadth-first expansion; an index cursor avoids O(n) Array.shift() calls.
    for (let head = 0; head < queue.length; head++) {
      const currentNoteId = queue[head]
      if (visited.has(currentNoteId)) continue
      visited.add(currentNoteId)

      const currentNeighbors = findNeighborsInMemory(currentNoteId, embeddings, eps)
      // Only core points pull their own neighbors into the cluster.
      if (currentNeighbors.length < minClusterSize) continue

      for (const neighborId of currentNeighbors) {
        if (claim(neighborId)) queue.push(neighborId)
      }
    }
    return members
  }

  for (const noteId of noteIds) {
    if (visited.has(noteId)) continue
    visited.add(noteId)

    const neighbors = findNeighborsInMemory(noteId, embeddings, eps)
    if (neighbors.length < minClusterSize) {
      // Provisional noise: a later cluster may still claim this note.
      clustered.set(noteId, NOISE_CLUSTER_ID)
      continue
    }

    const members = expandCluster(noteId, neighbors, nextClusterId)
    clusters.push({ clusterId: nextClusterId, noteIds: members })
    nextClusterId++
  }

  let noiseCount = 0
  clustered.forEach(cid => {
    if (cid === NOISE_CLUSTER_ID) noiseCount++
  })

  return { clusters, noiseCount }
}

/**
 * Diagnostic entry point. For the first user found it:
 *  1. loads all non-trashed notes that have an embedding,
 *  2. compares the DB-computed and in-memory similarity of the first two notes,
 *  3. compares the DB and in-memory neighbor sets of the first note,
 *  4. runs an in-memory DBSCAN simulation for several epsilon values,
 *  5. calls the real clustering service and prints its result.
 *
 * Everything is reported through `console`; nothing is returned.
 */
async function main(): Promise<void> {
  const user = await prisma.user.findFirst()
  if (!user) {
    console.warn('No user found; nothing to compare.')
    return
  }
  const userId = user.id

  // One query supplies both the id list and the in-memory embeddings, so the
  // two views cannot drift apart between round trips.
  const embeddingsRow = await prisma.$queryRawUnsafe<Array<{ noteId: string; embedding: string }>>(
    `SELECT ne."noteId", ne."embedding"::text AS "embedding"
     FROM "NoteEmbedding" ne
     INNER JOIN "Note" n ON n.id = ne."noteId"
     WHERE n."userId" = $1
       AND n."trashedAt" IS NULL
       AND ne."embedding" IS NOT NULL`,
    userId
  )
  const allNoteIds = embeddingsRow.map(row => row.noteId)

  const embeddingMap = new Map<string, number[]>()
  embeddingsRow.forEach(row => {
    if (row.embedding) {
      // Assumes the text form of the embedding is a JSON array of numbers.
      embeddingMap.set(row.noteId, JSON.parse(row.embedding))
    }
  })

  console.log(`Total notes with embeddings: ${allNoteIds.length}`)

  // Compare single similarities
  if (allNoteIds.length >= 2) {
    const idA = allNoteIds[0]
    const idB = allNoteIds[1]
    const vectorA = embeddingMap.get(idA)
    const vectorB = embeddingMap.get(idB)
    if (vectorA && vectorB) {
      const simDB = await getCosineSimilarityDB(idA, idB)
      const simMem = calculateCosineSimilarityInMemory(vectorA, vectorB)
      console.log(`Note A: ${idA}, Note B: ${idB}`)
      console.log(`Similarity DB: ${simDB}`)
      console.log(`Similarity Mem: ${simMem}`)
      console.log(`Difference: ${Math.abs(simDB - simMem)}`)
    } else {
      console.warn('Missing in-memory embedding for the first two notes; skipping similarity comparison.')
    }
  }

  // Compare neighbors
  const epsilon = 0.3
  // Here epsilon is read as a similarity threshold, so the distance cutoff is 1 - epsilon.
  const cosineDistance = 1 - epsilon
  const seedId = allNoteIds[0]

  if (seedId === undefined) {
    // Without a seed note the neighbor query would receive `undefined` as a parameter.
    console.warn('No notes with embeddings; skipping neighbor comparison.')
  } else {
    // Neighbors DB
    const neighborsDB = await prisma.$queryRawUnsafe<Array<{ noteId: string }>>(
      `SELECT e2."noteId"
     FROM "NoteEmbedding" e1
     CROSS JOIN "NoteEmbedding" e2
     WHERE e1."noteId" = $1
       AND e2."noteId" != $1
       AND e2."noteId" = ANY($2::text[])
       AND (e1."embedding"::vector <=> e2."embedding"::vector) <= $3`,
      seedId,
      allNoteIds,
      cosineDistance
    )
    const neighborsDBIds = neighborsDB.map(r => r.noteId)

    // Neighbors Mem
    const neighborsMemIds = findNeighborsInMemory(seedId, embeddingMap, cosineDistance)
    const neighborsMemSet = new Set(neighborsMemIds)

    console.log(`Seed Note: ${seedId}`)
    console.log(`Neighbors DB count: ${neighborsDBIds.length}`)
    console.log(`Neighbors Mem count: ${neighborsMemIds.length}`)
    console.log(`Common neighbors: ${neighborsDBIds.filter(id => neighborsMemSet.has(id)).length}`)
  }

  // In-memory DBSCAN simulation: each epsilon below is used directly as the
  // cosine distance threshold (distance <= eps, i.e. similarity >= 1 - eps).
  console.log("\n=== DBSCAN Simulation ===")
  const testEpsilons = [0.1, 0.15, 0.18, 0.2, 0.22, 0.25, 0.28, 0.3]
  const minClusterSize = 2

  for (const eps of testEpsilons) {
    const { clusters, noiseCount } = simulateDbscan(allNoteIds, embeddingMap, eps, minClusterSize)
    console.log(`Using epsilon (distance threshold) = ${eps}:`)
    console.log(`  -> Clusters generated: ${clusters.length}`)
    clusters.forEach(c => {
      console.log(`     Cluster ${c.clusterId}: ${c.noteIds.length} notes`)
    })
    console.log(`  -> Noise count: ${noiseCount}`)
  }

  console.log("\n=== Calling Real Service in-memory ===")
  // Imported lazily, only once the simulation output has been printed.
  const { clusteringService } = await import('../lib/ai/services/clustering.service')
  const serviceResult = await clusteringService.clusterNotes(userId)
  console.log(`Service generated ${serviceResult.clusters.length} clusters!`)
  serviceResult.clusters.forEach(c => {
    console.log(`  -> Cluster ${c.clusterId} (${c.name || 'unnamed'}): ${c.noteIds.length} notes (Central notes: ${serviceResult.clusteredNotes.filter(cn => cn.clusterId === c.clusterId && cn.isCentral).length})`)
  })
  console.log(`  -> Noise count: ${serviceResult.noiseCount}`)
}

// Run the script. On failure, log the error and mark the process as failed so
// shells and CI see a non-zero exit status; the Prisma connection is closed
// in either case.
main()
  .catch((error: unknown) => {
    console.error(error)
    process.exitCode = 1
  })
  .finally(() => prisma.$disconnect())