Files
Momento/memento-note/scripts/compare-dbscan.ts
Antigravity e881004c77
Some checks failed
CI / Lint, Test & Build (push) Failing after 1m7s
CI / Deploy production (on server) (push) Has been skipped
feat(insights): fix DBSCAN, Persian embeddings crash, D3 physics layouts, and D3 node not found runtime error
2026-05-24 18:57:33 +00:00

233 lines
7.8 KiB
TypeScript

import { PrismaClient } from '@prisma/client'
import * as d3 from 'd3'
// Single Prisma client shared by every query in this script; it is disconnected
// once main() settles (see the entry-point call at the bottom of the file).
const prisma = new PrismaClient()
/**
 * Minimal graph-node shape: an identifier plus the cluster it belongs to.
 *
 * NOTE(review): nothing in the visible part of this script references this
 * type — presumably it mirrors the node type used by the D3 graph view;
 * confirm it is still needed here.
 */
interface D3Node {
  /** Identifier of the node. */
  id: string
  /** Cluster assignment; either a numeric or a string identifier is accepted. */
  clusterId: number | string
}
/**
 * Asks the database for the cosine similarity between the stored embeddings
 * of two notes, computed as `1 - (embeddingA <=> embeddingB)`.
 *
 * @param noteIdA - `noteId` of the first `NoteEmbedding` row.
 * @param noteIdB - `noteId` of the second `NoteEmbedding` row.
 * @returns The similarity reported by the database, or `0` when the query
 *          yields no row or the reported value is falsy (`0`, `null`, `NaN`).
 */
async function getCosineSimilarityDB(noteIdA: string, noteIdB: string): Promise<number> {
  // Both ids are bound as positional parameters ($1, $2), never interpolated.
  const similaritySql = `SELECT 1 - (e1."embedding"::vector <=> e2."embedding"::vector) AS similarity
FROM "NoteEmbedding" e1, "NoteEmbedding" e2
WHERE e1."noteId" = $1 AND e2."noteId" = $2`
  const rows = await prisma.$queryRawUnsafe<Array<{ similarity: number }>>(
    similaritySql,
    noteIdA,
    noteIdB
  )
  const [firstRow] = rows
  // Missing row and falsy similarity both collapse to 0.
  if (!firstRow || !firstRow.similarity) return 0
  return firstRow.similarity
}
/**
 * Cosine similarity of two numeric vectors, computed locally.
 *
 * Iterates over the indices of `vecA`; `vecB` is read at the same indices, so
 * both vectors are expected to have the same length.
 *
 * @param vecA - First vector; its length drives the iteration.
 * @param vecB - Second vector.
 * @returns `dot(vecA, vecB) / (|vecA| * |vecB|)`, or `0` when either
 *          accumulated squared norm is exactly zero (including empty input).
 */
function calculateCosineSimilarityInMemory(vecA: number[], vecB: number[]): number {
  let dot = 0
  let squaredNormA = 0
  let squaredNormB = 0
  for (const [index, componentA] of vecA.entries()) {
    const componentB = vecB[index]
    dot += componentA * componentB
    squaredNormA += componentA * componentA
    squaredNormB += componentB * componentB
  }
  // A zero-length vector has no direction; report 0 rather than dividing by zero.
  const hasZeroNorm = squaredNormA === 0 || squaredNormB === 0
  return hasZeroNorm ? 0 : dot / (Math.sqrt(squaredNormA) * Math.sqrt(squaredNormB))
}
/**
 * Diagnostic entry point. For the first user returned by
 * `prisma.user.findFirst()` it:
 *
 * 1. loads every non-trashed note embedding of that user,
 * 2. compares database vs in-memory cosine similarity for the first two notes,
 * 3. compares database vs in-memory neighbour search around the first note,
 * 4. sweeps a local DBSCAN simulation over several epsilon values, and
 * 5. runs the real clustering service for the same user.
 *
 * All results are printed to the console; nothing is returned.
 */
async function main(): Promise<void> {
  const user = await prisma.user.findFirst()
  if (!user) {
    // Explain the empty output instead of exiting silently.
    console.log('No user found; nothing to compare.')
    return
  }
  const userId = user.id

  // One query supplies both the note ids and their vectors, so the id list and
  // the in-memory map are guaranteed to describe the same snapshot of rows.
  const embeddingsRow = await prisma.$queryRawUnsafe<Array<{ noteId: string; embedding: string }>>(
    `SELECT ne."noteId", ne."embedding"::text AS "embedding"
FROM "NoteEmbedding" ne
INNER JOIN "Note" n ON n.id = ne."noteId"
WHERE n."userId" = $1
AND n."trashedAt" IS NULL
AND ne."embedding" IS NOT NULL`,
    userId
  )
  const allNoteIds = embeddingsRow.map(row => row.noteId)

  // noteId -> embedding. The vector is selected as text and parsed as JSON.
  const embeddingMap = new Map<string, number[]>()
  for (const row of embeddingsRow) {
    if (row.embedding) {
      embeddingMap.set(row.noteId, JSON.parse(row.embedding))
    }
  }
  console.log(`Total notes with embeddings: ${allNoteIds.length}`)

  // Compare single similarities (needs at least two notes with vectors).
  const [idA, idB] = allNoteIds
  const vectorA = idA === undefined ? undefined : embeddingMap.get(idA)
  const vectorB = idB === undefined ? undefined : embeddingMap.get(idB)
  if (idA !== undefined && idB !== undefined && vectorA && vectorB) {
    const simDB = await getCosineSimilarityDB(idA, idB)
    const simMem = calculateCosineSimilarityInMemory(vectorA, vectorB)
    console.log(`Note A: ${idA}, Note B: ${idB}`)
    console.log(`Similarity DB: ${simDB}`)
    console.log(`Similarity Mem: ${simMem}`)
    console.log(`Difference: ${Math.abs(simDB - simMem)}`)
  }

  // Compare neighbors. Here epsilon is treated as a similarity threshold, so
  // the distance cut-off handed to both implementations is 1 - epsilon.
  const epsilon = 0.3
  const cosineDistance = 1 - epsilon
  const seedId = idA
  const seedVector = vectorA
  if (seedId === undefined || seedVector === undefined) {
    // Without a seed there is nothing to query; the original code would have
    // passed `undefined` into the raw query and the in-memory comparison.
    console.log('No seed note with an embedding; skipping neighbor comparison.')
  } else {
    // Neighbors DB
    const neighborsDB = await prisma.$queryRawUnsafe<Array<{ noteId: string }>>(
      `SELECT e2."noteId"
FROM "NoteEmbedding" e1
CROSS JOIN "NoteEmbedding" e2
WHERE e1."noteId" = $1
AND e2."noteId" != $1
AND e2."noteId" = ANY($2::text[])
AND (e1."embedding"::vector <=> e2."embedding"::vector) <= $3`,
      seedId,
      allNoteIds,
      cosineDistance
    )
    const neighborsDBIds = neighborsDB.map(r => r.noteId)

    // Neighbors Mem
    const neighborsMemIds: string[] = []
    for (const [otherId, otherVector] of embeddingMap) {
      if (otherId === seedId) continue
      const distance = 1 - calculateCosineSimilarityInMemory(seedVector, otherVector)
      if (distance <= cosineDistance) {
        neighborsMemIds.push(otherId)
      }
    }

    // Set lookup keeps the intersection linear in the number of neighbours.
    const neighborsMemSet = new Set(neighborsMemIds)
    const commonCount = neighborsDBIds.filter(id => neighborsMemSet.has(id)).length
    console.log(`Seed Note: ${seedId}`)
    console.log(`Neighbors DB count: ${neighborsDBIds.length}`)
    console.log(`Neighbors Mem count: ${neighborsMemIds.length}`)
    console.log(`Common neighbors: ${commonCount}`)
  }

  // Local DBSCAN sweep. Unlike the comparison above, each eps below is used
  // directly as the cosine *distance* threshold (distance <= eps).
  console.log("\n=== DBSCAN Simulation ===")
  const testEpsilons = [0.1, 0.15, 0.18, 0.2, 0.22, 0.25, 0.28, 0.3]
  // A note is a core point when it has at least this many neighbours (itself excluded).
  const minClusterSize = 2
  // Marker stored in `clustered` for notes classified as noise.
  const NOISE = -1

  for (const eps of testEpsilons) {
    const visited = new Set<string>()
    const clustered = new Map<string, number>() // noteId -> clusterId (NOISE for noise)
    const clusters: Array<{ clusterId: number; noteIds: string[] }> = []
    let clusterId = 0

    // All other notes whose cosine distance to `noteId` is within `currentEps`.
    const findNeighbors = (noteId: string, currentEps: number): string[] => {
      const origin = embeddingMap.get(noteId)
      if (!origin) return []
      const neighbors: string[] = []
      for (const [otherId, otherVector] of embeddingMap) {
        if (otherId === noteId) continue
        const distance = 1 - calculateCosineSimilarityInMemory(origin, otherVector)
        if (distance <= currentEps) {
          neighbors.push(otherId)
        }
      }
      return neighbors
    }

    // Grows cluster `cid` outward from a core note and returns its members.
    const expandCluster = (
      noteId: string,
      neighbors: string[],
      cid: number,
      currentEps: number
    ): string[] => {
      const members: string[] = [noteId]
      const queue = [...neighbors]
      clustered.set(noteId, cid)

      // Assigns a note to this cluster if it is unassigned or noise. A note is
      // assigned the moment it is claimed, so it can be appended to `members`
      // at most once and no duplicate check is needed.
      const claim = (candidateId: string): boolean => {
        const assigned = clustered.get(candidateId)
        if (assigned !== undefined && assigned !== NOISE) return false
        clustered.set(candidateId, cid)
        members.push(candidateId)
        return true
      }

      for (const neighborId of neighbors) {
        claim(neighborId)
      }

      while (queue.length > 0) {
        // Non-empty queue is guaranteed by the loop condition.
        const currentNoteId = queue.shift()!
        if (visited.has(currentNoteId)) continue
        visited.add(currentNoteId)
        const currentNeighbors = findNeighbors(currentNoteId, currentEps)
        // Only core points propagate the cluster further.
        if (currentNeighbors.length < minClusterSize) continue
        for (const neighborId of currentNeighbors) {
          if (claim(neighborId)) {
            queue.push(neighborId)
          }
        }
      }
      return members
    }

    for (const noteId of allNoteIds) {
      if (visited.has(noteId)) continue
      visited.add(noteId)
      const neighbors = findNeighbors(noteId, eps)
      if (neighbors.length < minClusterSize) {
        // Provisionally noise; a later cluster may still claim it.
        clustered.set(noteId, NOISE)
        continue
      }
      const members = expandCluster(noteId, neighbors, clusterId, eps)
      clusters.push({ clusterId, noteIds: members })
      clusterId++
    }

    const noiseCount = Array.from(clustered.values()).filter(id => id === NOISE).length
    console.log(`Using epsilon (distance threshold) = ${eps}:`)
    console.log(` -> Clusters generated: ${clusters.length}`)
    for (const c of clusters) {
      console.log(` Cluster ${c.clusterId}: ${c.noteIds.length} notes`)
    }
    console.log(` -> Noise count: ${noiseCount}`)
  }

  // Finally run the production clustering service for the same user. It is
  // imported lazily, so it is only loaded once the local checks have finished.
  console.log("\n=== Calling Real Service in-memory ===")
  const { clusteringService } = await import('../lib/ai/services/clustering.service')
  const serviceResult = await clusteringService.clusterNotes(userId)
  console.log(`Service generated ${serviceResult.clusters.length} clusters!`)
  serviceResult.clusters.forEach(c => {
    console.log(` -> Cluster ${c.clusterId} (${c.name || 'unnamed'}): ${c.noteIds.length} notes (Central notes: ${serviceResult.clusteredNotes.filter(cn => cn.clusterId === c.clusterId && cn.isCentral).length})`)
  })
  console.log(` -> Noise count: ${serviceResult.noiseCount}`)
}
// Run the script. On failure, log the error and mark the process as failed so
// that shells and CI see a non-zero exit status; the database connection is
// released in either case.
main()
  .catch((error: unknown) => {
    console.error(error)
    process.exitCode = 1
  })
  .finally(() => prisma.$disconnect())