// File listing metadata: 233 lines, 7.8 KiB, TypeScript
import { PrismaClient } from '@prisma/client'
|
|
import * as d3 from 'd3'
|
|
|
|
// Single Prisma client shared by every query in this script; it is
// disconnected once main() settles (see the invocation at the bottom of the file).
const prisma = new PrismaClient()
|
|
|
|
/**
 * Minimal node record: an identifier plus the cluster it is assigned to.
 *
 * NOTE(review): the name suggests this is the node shape fed to a D3 graph,
 * but nothing in this file references it — confirm before relying on it.
 */
interface D3Node {
  /** Identifier of the node. */
  id: string
  /** Cluster the node belongs to; may be numeric or a string label. */
  clusterId: number | string
}
|
|
|
|
/**
 * Asks the database for the cosine similarity between the stored embeddings
 * of two notes.
 *
 * The query casts both embeddings to `vector`, applies the `<=>` operator and
 * returns `1 - distance` as `similarity`.
 *
 * @param noteIdA - `noteId` of the first `NoteEmbedding` row.
 * @param noteIdB - `noteId` of the second `NoteEmbedding` row.
 * @returns The similarity reported by the database, or `0` when no row comes
 *   back or the reported value is falsy (`null`, `0` or `NaN`).
 */
async function getCosineSimilarityDB(noteIdA: string, noteIdB: string): Promise<number> {
  const sql = `SELECT 1 - (e1."embedding"::vector <=> e2."embedding"::vector) AS similarity
     FROM "NoteEmbedding" e1, "NoteEmbedding" e2
     WHERE e1."noteId" = $1 AND e2."noteId" = $2`

  // Values are bound as positional parameters ($1, $2), not interpolated into the SQL text.
  const rows = await prisma.$queryRawUnsafe<Array<{ similarity: number }>>(sql, noteIdA, noteIdB)

  const [firstRow] = rows
  const similarity = firstRow?.similarity
  // Truthiness check on purpose: a missing row, null, 0 and NaN all collapse to 0.
  if (similarity) {
    return similarity
  }
  return 0
}
|
|
|
|
/**
 * Computes the cosine similarity of two numeric vectors without touching the
 * database.
 *
 * @param vecA - First vector.
 * @param vecB - Second vector; must have exactly as many components as `vecA`.
 * @returns `dot(vecA, vecB) / (|vecA| * |vecB|)`, or `0` when either vector
 *   has zero magnitude (this includes two empty vectors).
 * @throws {RangeError} When the vectors have different lengths. Previously the
 *   loop ran to `vecA.length` only: a shorter `vecB` yielded `NaN` and a longer
 *   one was silently truncated, both of which hide bad input.
 */
function calculateCosineSimilarityInMemory(vecA: number[], vecB: number[]): number {
  if (vecA.length !== vecB.length) {
    throw new RangeError(
      `Cannot compare vectors of different dimensions (${vecA.length} vs ${vecB.length})`
    )
  }

  let dotProduct = 0
  let normA = 0
  let normB = 0

  // Single pass: accumulate the dot product and both squared magnitudes together.
  for (let i = 0; i < vecA.length; i++) {
    const a = vecA[i]
    const b = vecB[i]
    dotProduct += a * b
    normA += a * a
    normB += b * b
  }

  // A zero-magnitude vector has no direction; report "no similarity" instead of dividing by zero.
  if (normA === 0 || normB === 0) return 0

  return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB))
}
|
|
|
|
/** One cluster produced by the in-memory DBSCAN simulation. */
interface SimulatedCluster {
  clusterId: number
  noteIds: string[]
}

/** Cluster id recorded for notes the simulation classifies as noise. */
const NOISE_CLUSTER_ID = -1

/**
 * Parses an embedding that the database returned as text.
 *
 * @returns The parsed vector, or `null` when the text is valid JSON but not an
 *   array of numbers. Malformed JSON still throws, as `JSON.parse` does.
 */
function parseEmbeddingText(raw: string): number[] | null {
  const parsed: unknown = JSON.parse(raw)
  const isNumberArray = Array.isArray(parsed) && parsed.every(v => typeof v === 'number')
  return isNumberArray ? (parsed as number[]) : null
}

/**
 * Lists every other note whose cosine distance (1 - similarity) to `noteId`
 * is at most `maxDistance`, using only the in-memory embeddings.
 *
 * @returns Neighbour ids in map iteration order; empty when `noteId` has no embedding.
 */
function findNeighborsInMemory(
  embeddingMap: ReadonlyMap<string, number[]>,
  noteId: string,
  maxDistance: number
): string[] {
  const origin = embeddingMap.get(noteId)
  if (!origin) return []

  const neighbors: string[] = []
  embeddingMap.forEach((candidate, otherId) => {
    if (otherId === noteId) return
    const distance = 1 - calculateCosineSimilarityInMemory(origin, candidate)
    if (distance <= maxDistance) neighbors.push(otherId)
  })
  return neighbors
}

/**
 * Runs a DBSCAN-style clustering pass over the in-memory embeddings, treating
 * `eps` as a cosine *distance* threshold.
 *
 * A note seeds or extends a cluster only when it has at least `minClusterSize`
 * neighbours (the note itself is not counted). Notes that are noise, or not
 * yet assigned, can still be absorbed into a cluster as neighbours of such a note.
 *
 * @returns The clusters in creation order and the number of notes left marked as noise.
 */
function simulateDbscan(
  noteIds: readonly string[],
  embeddingMap: ReadonlyMap<string, number[]>,
  eps: number,
  minClusterSize: number
): { clusters: SimulatedCluster[]; noiseCount: number } {
  const visited = new Set<string>()
  const assignment = new Map<string, number>() // noteId -> clusterId
  const clusters: SimulatedCluster[] = []

  const isUnclaimed = (id: string): boolean => {
    const current = assignment.get(id)
    return current === undefined || current === NOISE_CLUSTER_ID
  }

  for (const noteId of noteIds) {
    if (visited.has(noteId)) continue
    visited.add(noteId)

    const seedNeighbors = findNeighborsInMemory(embeddingMap, noteId, eps)
    if (seedNeighbors.length < minClusterSize) {
      assignment.set(noteId, NOISE_CLUSTER_ID)
      continue
    }

    // Cluster ids are handed out sequentially, so the next id is the current cluster count.
    const clusterId = clusters.length
    const members: string[] = [noteId]
    assignment.set(noteId, clusterId)

    // A note is appended to `members` at the same moment it is assigned to this
    // cluster, and assigned notes are never "unclaimed" again, so no duplicate
    // check on `members` is needed.
    for (const neighborId of seedNeighbors) {
      if (!isUnclaimed(neighborId)) continue
      assignment.set(neighborId, clusterId)
      members.push(neighborId)
    }

    // Breadth-first expansion. The queue is walked with a cursor instead of
    // shift() so each dequeue is O(1); every seed neighbour is enqueued, even
    // those already owned by another cluster, exactly as a plain copy would be.
    const queue = [...seedNeighbors]
    for (let head = 0; head < queue.length; head++) {
      const currentNoteId = queue[head]
      if (currentNoteId === undefined || visited.has(currentNoteId)) continue
      visited.add(currentNoteId)

      const currentNeighbors = findNeighborsInMemory(embeddingMap, currentNoteId, eps)
      if (currentNeighbors.length < minClusterSize) continue

      for (const neighborId of currentNeighbors) {
        if (!isUnclaimed(neighborId)) continue
        assignment.set(neighborId, clusterId)
        members.push(neighborId)
        queue.push(neighborId)
      }
    }

    clusters.push({ clusterId, noteIds: members })
  }

  const noiseCount = Array.from(assignment.values()).filter(id => id === NOISE_CLUSTER_ID).length
  return { clusters, noiseCount }
}

/**
 * Diagnostic entry point comparing database-side and in-memory similarity
 * results for the first user returned by Prisma.
 *
 * Steps, each reported through `console.log`:
 *  1. Load the embeddings of the user's non-trashed notes into memory.
 *  2. Compare the cosine similarity of the first two notes (DB vs. in-memory).
 *  3. Compare the neighbour set of the first note (DB vs. in-memory).
 *  4. Run an in-memory DBSCAN simulation over a range of epsilon values.
 *  5. Call `clusteringService.clusterNotes` and print its clusters.
 *
 * Steps 2 and 3 are skipped when there are too few notes; previously an empty
 * result set sent an `undefined` seed id to the database.
 */
async function main(): Promise<void> {
  const user = await prisma.user.findFirst()
  if (!user) {
    console.warn('No user found; nothing to compare.')
    return
  }
  const userId = user.id

  // Fetch all of the user's notes with embeddings in ONE query. Ids and vectors
  // come from the same rows, so every id in `allNoteIds` has a vector.
  const embeddingRows = await prisma.$queryRawUnsafe<Array<{ noteId: string; embedding: string }>>(
    `SELECT ne."noteId", ne."embedding"::text AS "embedding"
     FROM "NoteEmbedding" ne
     INNER JOIN "Note" n ON n.id = ne."noteId"
     WHERE n."userId" = $1
       AND n."trashedAt" IS NULL
       AND ne."embedding" IS NOT NULL`,
    userId
  )

  const embeddingMap = new Map<string, number[]>()
  for (const row of embeddingRows) {
    if (!row.embedding) continue
    const vector = parseEmbeddingText(row.embedding)
    if (vector === null) {
      console.warn(`Skipping note ${row.noteId}: embedding is not a numeric array`)
      continue
    }
    embeddingMap.set(row.noteId, vector)
  }
  const allNoteIds = Array.from(embeddingMap.keys())

  console.log(`Total notes with embeddings: ${allNoteIds.length}`)

  // Compare single similarities
  const [idA, idB] = allNoteIds
  if (idA !== undefined && idB !== undefined) {
    const vecA = embeddingMap.get(idA)
    const vecB = embeddingMap.get(idB)
    if (vecA && vecB) {
      const simDB = await getCosineSimilarityDB(idA, idB)
      const simMem = calculateCosineSimilarityInMemory(vecA, vecB)
      console.log(`Note A: ${idA}, Note B: ${idB}`)
      console.log(`Similarity DB: ${simDB}`)
      console.log(`Similarity Mem: ${simMem}`)
      console.log(`Difference: ${Math.abs(simDB - simMem)}`)
    }
  }

  // Compare neighbors. Here epsilon is read as a *similarity* threshold, so the
  // distance bound handed to both implementations is 1 - epsilon.
  const epsilon = 0.3
  const cosineDistance = 1 - epsilon
  const seedId = idA

  if (seedId === undefined) {
    console.log('No notes with embeddings found; skipping neighbor comparison.')
  } else {
    // Neighbors DB
    const neighborsDB = await prisma.$queryRawUnsafe<Array<{ noteId: string }>>(
      `SELECT e2."noteId"
       FROM "NoteEmbedding" e1
       CROSS JOIN "NoteEmbedding" e2
       WHERE e1."noteId" = $1
         AND e2."noteId" != $1
         AND e2."noteId" = ANY($2::text[])
         AND (e1."embedding"::vector <=> e2."embedding"::vector) <= $3`,
      seedId,
      allNoteIds,
      cosineDistance
    )
    const neighborsDBIds = neighborsDB.map(r => r.noteId)

    // Neighbors Mem
    const neighborsMemIds = findNeighborsInMemory(embeddingMap, seedId, cosineDistance)
    const neighborsMemSet = new Set(neighborsMemIds)

    console.log(`Seed Note: ${seedId}`)
    console.log(`Neighbors DB count: ${neighborsDBIds.length}`)
    console.log(`Neighbors Mem count: ${neighborsMemIds.length}`)
    console.log(`Common neighbors: ${neighborsDBIds.filter(x => neighborsMemSet.has(x)).length}`)
  }

  // In-memory DBSCAN run, to see how the cluster output changes with epsilon.
  // In this section eps is used directly as the cosine distance threshold
  // (distance <= eps, i.e. similarity >= 1 - eps).
  console.log("\n=== DBSCAN Simulation ===")
  const testEpsilons = [0.1, 0.15, 0.18, 0.2, 0.22, 0.25, 0.28, 0.3]
  const minClusterSize = 2

  for (const eps of testEpsilons) {
    const { clusters, noiseCount } = simulateDbscan(allNoteIds, embeddingMap, eps, minClusterSize)
    console.log(`Using epsilon (distance threshold) = ${eps}:`)
    console.log(` -> Clusters generated: ${clusters.length}`)
    clusters.forEach(c => {
      console.log(` Cluster ${c.clusterId}: ${c.noteIds.length} notes`)
    })
    console.log(` -> Noise count: ${noiseCount}`)
  }

  console.log("\n=== Calling Real Service in-memory ===")
  // Loaded lazily so the service module is only evaluated once the checks above have run.
  const { clusteringService } = await import('../lib/ai/services/clustering.service')
  const serviceResult = await clusteringService.clusterNotes(userId)
  console.log(`Service generated ${serviceResult.clusters.length} clusters!`)
  serviceResult.clusters.forEach(c => {
    const centralCount = serviceResult.clusteredNotes.filter(cn => cn.clusterId === c.clusterId && cn.isCentral).length
    console.log(` -> Cluster ${c.clusterId} (${c.name || 'unnamed'}): ${c.noteIds.length} notes (Central notes: ${centralCount})`)
  })
  console.log(` -> Noise count: ${serviceResult.noiseCount}`)
}
|
|
|
|
// Script entry point: run main(), print any rejection to stderr, and always
// release the Prisma connection afterwards.
void main()
  .catch((error: unknown) => {
    console.error(error)
  })
  .finally(() => prisma.$disconnect())
|
|
|
|
|