import { PrismaClient } from '@prisma/client'
import * as d3 from 'd3'

const prisma = new PrismaClient()

interface D3Node {
  id: string
  clusterId: string | number
}

/** Note id -> embedding vector parsed from the database's text representation. */
type EmbeddingMap = Map<string, number[]>

/** One cluster produced by the in-memory simulation. */
interface SimulatedCluster {
  clusterId: number
  noteIds: string[]
}

/** Label stored for notes that the simulation classifies as noise. */
const NOISE_LABEL = -1

/**
 * Asks the database for the similarity of two notes' embeddings.
 *
 * The value is computed in SQL as `1 - (e1.embedding <=> e2.embedding)`
 * (presumably pgvector's cosine-distance operator — confirm against the schema).
 *
 * @param noteIdA id of the first note
 * @param noteIdB id of the second note
 * @returns the similarity reported by the database, or 0 when no row comes back
 */
async function getCosineSimilarityDB(noteIdA: string, noteIdB: string): Promise<number> {
  const result = await prisma.$queryRawUnsafe<Array<{ similarity: number | null }>>(
    'SELECT 1 - (e1."embedding"::vector <=> e2."embedding"::vector) AS similarity ' +
      'FROM "NoteEmbedding" e1, "NoteEmbedding" e2 ' +
      'WHERE e1."noteId" = $1 AND e2."noteId" = $2',
    noteIdA,
    noteIdB
  )
  // `||` (not `??`) is deliberate: besides null/undefined it also folds a NaN
  // result to 0, which matches what the in-memory helper returns for a zero norm.
  return result[0]?.similarity || 0
}

/**
 * Computes the cosine similarity of two vectors in plain JavaScript.
 *
 * Iterates over `vecA.length` components, so both vectors are expected to have
 * the same dimension; extra components of `vecB` are ignored and missing ones
 * yield NaN.
 *
 * @param vecA first vector
 * @param vecB second vector
 * @returns dot(a, b) / (|a| * |b|), or 0 when either vector has zero norm
 */
function calculateCosineSimilarityInMemory(vecA: number[], vecB: number[]): number {
  let dotProduct = 0.0
  let normA = 0.0
  let normB = 0.0
  const len = vecA.length
  for (let i = 0; i < len; i++) {
    const a = vecA[i]
    const b = vecB[i]
    dotProduct += a * b
    normA += a * a
    normB += b * b
  }
  if (normA === 0 || normB === 0) return 0
  return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB))
}

/**
 * Lists every other note whose in-memory cosine distance (1 - similarity) to
 * `noteId` is at most `maxDistance`.
 *
 * @param embeddings all known embeddings, keyed by note id
 * @param noteId the note to search around; it is never part of the result
 * @param maxDistance inclusive cosine-distance threshold
 * @returns neighbor ids in the map's iteration order; empty when `noteId` has no embedding
 */
function findNeighborsInMemory(embeddings: EmbeddingMap, noteId: string, maxDistance: number): string[] {
  const origin = embeddings.get(noteId)
  if (!origin) return []

  const neighbors: string[] = []
  embeddings.forEach((candidate, otherId) => {
    if (otherId === noteId) return
    const distance = 1 - calculateCosineSimilarityInMemory(origin, candidate)
    if (distance <= maxDistance) neighbors.push(otherId)
  })
  return neighbors
}

/**
 * Runs a DBSCAN-style clustering pass entirely in memory.
 *
 * Here `eps` is used directly as a cosine-distance threshold (distance <= eps,
 * i.e. similarity >= 1 - eps). Note that the neighbor comparison in `main`
 * uses the other reading (threshold = 1 - epsilon).
 *
 * @param noteIds ids to cluster, visited in this order
 * @param embeddings embeddings keyed by note id
 * @param eps inclusive cosine-distance threshold for two notes to be neighbors
 * @param minClusterSize minimum neighbor count for a note to seed or expand a cluster
 * @returns the clusters found and the number of notes left labelled as noise
 */
function simulateDbscan(
  noteIds: string[],
  embeddings: EmbeddingMap,
  eps: number,
  minClusterSize: number
): { clusters: SimulatedCluster[]; noiseCount: number } {
  const visited = new Set<string>()
  const clustered = new Map<string, number>() // noteId -> clusterId (or NOISE_LABEL)
  const clusters: SimulatedCluster[] = []

  // A note may be claimed by a cluster if it has no label yet or was marked noise.
  const isClaimable = (id: string): boolean => {
    const label = clustered.get(id)
    return label === undefined || label === NOISE_LABEL
  }

  const expandCluster = (seedId: string, seedNeighbors: string[], cid: number): string[] => {
    // A Set keeps insertion order and makes the "already a member" check O(1).
    const members = new Set<string>([seedId])
    const queue = [...seedNeighbors]
    clustered.set(seedId, cid)

    for (const neighborId of seedNeighbors) {
      if (isClaimable(neighborId)) {
        clustered.set(neighborId, cid)
        members.add(neighborId)
      }
    }

    // Walk the queue with a head index instead of shift(); the array may grow
    // while iterating, and entries are processed in the same FIFO order.
    for (let head = 0; head < queue.length; head++) {
      const currentId = queue[head]
      if (visited.has(currentId)) continue
      visited.add(currentId)

      const currentNeighbors = findNeighborsInMemory(embeddings, currentId, eps)
      if (currentNeighbors.length < minClusterSize) continue

      for (const neighborId of currentNeighbors) {
        if (isClaimable(neighborId)) {
          clustered.set(neighborId, cid)
          members.add(neighborId)
          queue.push(neighborId)
        }
      }
    }

    return Array.from(members)
  }

  for (const noteId of noteIds) {
    if (visited.has(noteId)) continue
    visited.add(noteId)

    const neighbors = findNeighborsInMemory(embeddings, noteId, eps)
    if (neighbors.length < minClusterSize) {
      clustered.set(noteId, NOISE_LABEL)
      continue
    }

    // Cluster ids are assigned sequentially, so the next id is the current count.
    const clusterId = clusters.length
    const members = expandCluster(noteId, neighbors, clusterId)
    clusters.push({ clusterId, noteIds: members })
  }

  const noiseCount = Array.from(clustered.values()).filter(label => label === NOISE_LABEL).length
  return { clusters, noiseCount }
}

/**
 * Diagnostic entry point: for the first user in the database, compares
 * database-side and in-memory similarity / neighbor results, runs the in-memory
 * clustering simulation for several epsilon values, and finally calls the real
 * clustering service, logging everything to the console.
 */
async function main(): Promise<void> {
  const user = await prisma.user.findFirst()
  if (!user) return
  const userId = user.id

  // Fetch ids and embeddings of all the user's non-trashed notes in one query;
  // the id list and the in-memory map are both derived from these rows.
  const embeddingRows = await prisma.$queryRawUnsafe<Array<{ noteId: string; embedding: string | null }>>(
    'SELECT ne."noteId", ne."embedding"::text AS "embedding" ' +
      'FROM "NoteEmbedding" ne INNER JOIN "Note" n ON n.id = ne."noteId" ' +
      'WHERE n."userId" = $1 AND n."trashedAt" IS NULL AND ne."embedding" IS NOT NULL',
    userId
  )
  const allNoteIds = embeddingRows.map(row => row.noteId)

  const embeddingMap: EmbeddingMap = new Map()
  for (const row of embeddingRows) {
    if (row.embedding) {
      // The embedding is selected as text and parsed as a JSON array of numbers.
      embeddingMap.set(row.noteId, JSON.parse(row.embedding) as number[])
    }
  }

  console.log(`Total notes with embeddings: ${allNoteIds.length}`)

  // Compare a single pair: database similarity vs. in-memory similarity.
  if (allNoteIds.length >= 2) {
    const idA = allNoteIds[0]
    const idB = allNoteIds[1]
    const vecA = embeddingMap.get(idA)
    const vecB = embeddingMap.get(idB)
    if (vecA && vecB) {
      const simDB = await getCosineSimilarityDB(idA, idB)
      const simMem = calculateCosineSimilarityInMemory(vecA, vecB)
      console.log(`Note A: ${idA}, Note B: ${idB}`)
      console.log(`Similarity DB: ${simDB}`)
      console.log(`Similarity Mem: ${simMem}`)
      console.log(`Difference: ${Math.abs(simDB - simMem)}`)
    }
  }

  // Compare neighbor sets. Here epsilon is read as a similarity threshold, so
  // the distance threshold handed to both implementations is 1 - epsilon.
  const epsilon = 0.3
  const cosineDistance = 1 - epsilon
  const seedId = allNoteIds[0]

  if (seedId !== undefined && embeddingMap.has(seedId)) {
    // Neighbors according to the database.
    const neighborsDB = await prisma.$queryRawUnsafe<Array<{ noteId: string }>>(
      'SELECT e2."noteId" FROM "NoteEmbedding" e1 CROSS JOIN "NoteEmbedding" e2 ' +
        'WHERE e1."noteId" = $1 AND e2."noteId" != $1 AND e2."noteId" = ANY($2::text[]) ' +
        'AND (e1."embedding"::vector <=> e2."embedding"::vector) <= $3',
      seedId,
      allNoteIds,
      cosineDistance
    )
    const neighborsDBIds = neighborsDB.map(row => row.noteId)

    // Neighbors according to the in-memory computation.
    const neighborsMemIds = findNeighborsInMemory(embeddingMap, seedId, cosineDistance)
    const neighborsMemSet = new Set(neighborsMemIds)

    console.log(`Seed Note: ${seedId}`)
    console.log(`Neighbors DB count: ${neighborsDBIds.length}`)
    console.log(`Neighbors Mem count: ${neighborsMemIds.length}`)
    console.log(`Common neighbors: ${neighborsDBIds.filter(id => neighborsMemSet.has(id)).length}`)
  } else {
    // Without a seed note there is nothing to send to the database.
    console.log('No seed note with an embedding available; skipping neighbor comparison.')
  }

  // Run the in-memory clustering for a range of thresholds to see how the
  // number of clusters and noise points reacts to epsilon.
  console.log('\n=== DBSCAN Simulation ===')
  const testEpsilons = [0.1, 0.15, 0.18, 0.2, 0.22, 0.25, 0.28, 0.3]
  const minClusterSize = 2

  for (const eps of testEpsilons) {
    const { clusters, noiseCount } = simulateDbscan(allNoteIds, embeddingMap, eps, minClusterSize)

    console.log(`Using epsilon (distance threshold) = ${eps}:`)
    console.log(` -> Clusters generated: ${clusters.length}`)
    for (const cluster of clusters) {
      console.log(` Cluster ${cluster.clusterId}: ${cluster.noteIds.length} notes`)
    }
    console.log(` -> Noise count: ${noiseCount}`)
  }

  console.log('\n=== Calling Real Service in-memory ===')
  // Imported lazily so the service module is only loaded once it is needed.
  const { clusteringService } = await import('../lib/ai/services/clustering.service')
  const serviceResult = await clusteringService.clusterNotes(userId)

  console.log(`Service generated ${serviceResult.clusters.length} clusters!`)
  for (const c of serviceResult.clusters) {
    const centralCount = serviceResult.clusteredNotes.filter(
      cn => cn.clusterId === c.clusterId && cn.isCentral
    ).length
    console.log(
      ` -> Cluster ${c.clusterId} (${c.name || 'unnamed'}): ${c.noteIds.length} notes (Central notes: ${centralCount})`
    )
  }
  console.log(` -> Noise count: ${serviceResult.noiseCount}`)
}

main().catch(console.error).finally(() => prisma.$disconnect())