Add automatic note clustering using density-based algorithm (DBSCAN variant) and bridge notes detection for connecting different thematic clusters. Features: - NoteCluster, ClusterMember, BridgeNote, BridgeSuggestion models - Clustering service with pgvector cosine similarity - Bridge notes detection (notes connecting >=2 clusters) - AI-powered suggestions for missing cluster connections - /insights page with React Flow visualization - Cron endpoint for automatic recalculation Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
437 lines
13 KiB
TypeScript
437 lines
13 KiB
TypeScript
/**
 * Clustering Service
 *
 * Density-based clustering algorithm (DBSCAN variant) for note embeddings.
 * Groups semantically similar notes into clusters without requiring
 * a preset number of clusters.
 *
 * Algorithm:
 * 1. For each note, find neighbors within epsilon cosine distance
 * 2. Form clusters from dense regions (min_cluster_size)
 * 3. Mark outliers as noise (cluster_id = -1)
 */
|
|
|
|
import prisma from '@/lib/prisma'
|
|
import { embeddingService } from './embedding.service'
|
|
import { getChatProvider } from '@/lib/ai/factory'
|
|
import { getSystemConfig } from '@/lib/config'
|
|
|
|
/**
 * One cluster produced by the clustering service.
 */
export interface ClusterResult {
  /** Zero-based identifier of the cluster within a single clustering run. */
  clusterId: number
  /** Ids of the notes assigned to this cluster. */
  noteIds: string[]
  /** Optional centroid vector of the cluster, when one has been computed. */
  centroid?: number[]
  /** Optional human-readable cluster name. */
  name?: string
}
|
|
|
|
/**
 * Cluster assignment of a single note.
 */
export interface ClusteredNote {
  /** Id of the note. */
  noteId: string
  /** Id of the cluster the note belongs to. */
  clusterId: number
  /** Average cosine similarity between this note and the other members of its cluster. */
  membershipScore: number
  /** True when the note's membership score is at or above the mean score of its cluster. */
  isCentral: boolean
}
|
|
|
|
/**
 * Tuning parameters for a clustering run. Every field is optional; the
 * clustering service supplies a default for each omitted value.
 */
export interface ClusteringOptions {
  /** Minimum neighbor count for a note to seed or extend a cluster, and minimum size of a kept cluster. */
  minClusterSize?: number
  /** Cosine distance threshold (lower = more strict). */
  epsilon?: number
  /** Upper bound on the number of clusters returned. */
  maxClusters?: number
}
|
|
|
|
/**
 * Density-based (DBSCAN-style) clustering of a user's note embeddings,
 * plus persistence, naming and cache-freshness helpers for the result.
 *
 * Distances are cosine distances computed by pgvector (`<=>`), so
 * `distance = 1 - cosine similarity`.
 */
export class ClusteringService {
  private readonly DEFAULT_MIN_CLUSTER_SIZE = 3
  private readonly DEFAULT_EPSILON = 0.3 // Maximum cosine distance (= 1 - similarity) between neighbors
  private readonly DEFAULT_MAX_CLUSTERS = 50
  private readonly MIN_NOTES_FOR_CLUSTERING = 10
  private readonly NOISE_CLUSTER_ID = -1
  private readonly RECALCULATION_THRESHOLD = 0.05 // Fraction of modified notes that invalidates the cache
  private readonly NAME_EXCERPT_LENGTH = 100
  private readonly MAX_NAME_LENGTH = 50

  /**
   * Find every note in `allNoteIds` (other than `noteId` itself) whose
   * embedding lies within `epsilon` cosine distance of `noteId`'s embedding.
   *
   * `epsilon` is used directly as the distance bound, so a lower value is
   * stricter, as documented on `ClusteringOptions.epsilon`.
   *
   * @param noteId - Note whose neighborhood is queried.
   * @param allNoteIds - Candidate notes the neighborhood is restricted to.
   * @param epsilon - Maximum cosine distance for two notes to be neighbors.
   * @returns Ids of the neighboring notes.
   */
  private async findNeighbors(
    noteId: string,
    allNoteIds: string[],
    epsilon: number
  ): Promise<string[]> {
    const rows = await prisma.$queryRawUnsafe<Array<{ noteId: string }>>(
      `SELECT e2."noteId"
       FROM "NoteEmbedding" e1
       CROSS JOIN "NoteEmbedding" e2
       WHERE e1."noteId" = $1
         AND e2."noteId" != $1
         AND e2."noteId" = ANY($2::text[])
         AND (e1."embedding"::vector <=> e2."embedding"::vector) <= $3`,
      noteId,
      allNoteIds,
      epsilon
    )

    return rows.map(row => row.noteId)
  }

  /**
   * Run DBSCAN over a precomputed neighborhood graph. Pure and synchronous:
   * it performs no I/O and depends only on its arguments.
   *
   * A note is a core point when it has at least `minClusterSize` neighbors
   * (the note itself is not counted). Every neighbor of a core point joins
   * that core point's cluster, including notes that were provisionally
   * labelled noise earlier in the scan (border points). Only core points
   * propagate the expansion further.
   *
   * @param noteIds - Notes to cluster, in scan order.
   * @param adjacency - Neighbor ids for each note; a missing entry means no neighbors.
   * @param minClusterSize - Core-point threshold and minimum size of a kept cluster.
   * @param maxClusters - Maximum number of clusters to produce; remaining notes become noise.
   * @returns Clusters with consecutive ids starting at 0; each cluster lists its seed first.
   */
  private runDbscan(
    noteIds: readonly string[],
    adjacency: ReadonlyMap<string, readonly string[]>,
    minClusterSize: number,
    maxClusters: number
  ): ClusterResult[] {
    const noise = this.NOISE_CLUSTER_ID
    const labels = new Map<string, number>() // noteId -> clusterId, or noise
    const clusters: ClusterResult[] = []

    const neighborsOf = (id: string): readonly string[] => adjacency.get(id) ?? []
    const isCore = (id: string): boolean => neighborsOf(id).length >= minClusterSize

    for (const seedId of noteIds) {
      if (labels.has(seedId)) continue

      // Non-core points are provisionally noise; a later cluster may still
      // absorb them as border points. Once the cluster budget is used up,
      // everything that is left stays noise.
      if (clusters.length >= maxClusters || !isCore(seedId)) {
        labels.set(seedId, noise)
        continue
      }

      const clusterId = clusters.length
      const members: string[] = [seedId]
      labels.set(seedId, clusterId)

      // Breadth-first expansion; an index cursor avoids O(n) Array#shift calls.
      const queue: string[] = [...neighborsOf(seedId)]
      for (let head = 0; head < queue.length; head++) {
        const candidateId = queue[head]
        if (candidateId === undefined) continue

        const label = labels.get(candidateId)
        if (label !== undefined && label !== noise) continue // already owned by a cluster

        labels.set(candidateId, clusterId)
        members.push(candidateId)

        if (isCore(candidateId)) {
          for (const nextId of neighborsOf(candidateId)) queue.push(nextId)
        }
      }

      if (members.length >= minClusterSize) {
        clusters.push({ clusterId, noteIds: members })
      } else {
        // Can happen when the seed's neighbors were all claimed by earlier clusters.
        for (const memberId of members) labels.set(memberId, noise)
      }
    }

    return clusters
  }

  /**
   * Compute the membership score of every note in a cluster with a single
   * query. The score of a note is its average cosine similarity to all other
   * members of the same cluster.
   *
   * @param clusterMemberIds - Ids of all notes in the cluster.
   * @returns Map from note id to score. A lone member scores 1.0; a member for
   *          which no similarity could be computed scores 0.
   */
  private async scoreClusterMembers(clusterMemberIds: string[]): Promise<Map<string, number>> {
    const scores = new Map<string, number>()

    if (clusterMemberIds.length <= 1) {
      for (const memberId of clusterMemberIds) scores.set(memberId, 1.0)
      return scores
    }

    const rows = await prisma.$queryRawUnsafe<Array<{ noteId: string; score: number | null }>>(
      `SELECT e1."noteId" AS "noteId",
              AVG(1 - (e1."embedding"::vector <=> e2."embedding"::vector)) AS "score"
       FROM "NoteEmbedding" e1
       INNER JOIN "NoteEmbedding" e2 ON e2."noteId" != e1."noteId"
       WHERE e1."noteId" = ANY($1::text[])
         AND e2."noteId" = ANY($1::text[])
       GROUP BY e1."noteId"`,
      clusterMemberIds
    )

    for (const row of rows) {
      const value = Number(row.score)
      // NULL / NaN (e.g. a zero-length vector) count as "no similarity".
      scores.set(row.noteId, Number.isFinite(value) ? value : 0)
    }
    for (const memberId of clusterMemberIds) {
      if (!scores.has(memberId)) scores.set(memberId, 0)
    }

    return scores
  }

  /**
   * Perform density-based clustering on a user's note embeddings.
   *
   * Only non-trashed notes that have an embedding are considered. When the
   * user has fewer than MIN_NOTES_FOR_CLUSTERING such notes, no clustering is
   * attempted and all of them are reported as noise.
   *
   * @param userId - Owner of the notes.
   * @param options - Optional overrides for minClusterSize, epsilon and maxClusters.
   * @returns The clusters, the per-note assignments (with membership score and
   *          centrality flag) and the number of notes left unclustered.
   */
  async clusterNotes(
    userId: string,
    options: ClusteringOptions = {}
  ): Promise<{
    clusters: ClusterResult[]
    clusteredNotes: ClusteredNote[]
    noiseCount: number
  }> {
    const {
      minClusterSize = this.DEFAULT_MIN_CLUSTER_SIZE,
      epsilon = this.DEFAULT_EPSILON,
      maxClusters = this.DEFAULT_MAX_CLUSTERS
    } = options

    // Get all user's notes with embeddings
    const notesWithEmbeddings = await prisma.$queryRawUnsafe<Array<{ noteId: string }>>(
      `SELECT ne."noteId"
       FROM "NoteEmbedding" ne
       INNER JOIN "Note" n ON n.id = ne."noteId"
       WHERE n."userId" = $1
         AND n."trashedAt" IS NULL
         AND ne."embedding" IS NOT NULL`,
      userId
    )

    // De-duplicate so a note can never be counted or clustered twice.
    const allNoteIds = [...new Set(notesWithEmbeddings.map(row => row.noteId))]

    if (allNoteIds.length < this.MIN_NOTES_FOR_CLUSTERING) {
      return {
        clusters: [],
        clusteredNotes: [],
        noiseCount: allNoteIds.length
      }
    }

    // One neighborhood query per note. Queries run sequentially on purpose to
    // avoid flooding the connection pool.
    const adjacency = new Map<string, string[]>()
    for (const noteId of allNoteIds) {
      const neighbors = await this.findNeighbors(noteId, allNoteIds, epsilon)
      adjacency.set(noteId, [...new Set(neighbors)])
    }

    const clusters = this.runDbscan(allNoteIds, adjacency, minClusterSize, maxClusters)

    // Score each cluster once, then derive centrality from the cluster mean.
    const clusteredNotes: ClusteredNote[] = []
    for (const cluster of clusters) {
      const scores = await this.scoreClusterMembers(cluster.noteIds)

      let total = 0
      for (const noteId of cluster.noteIds) total += scores.get(noteId) ?? 0
      const meanScore = cluster.noteIds.length > 0 ? total / cluster.noteIds.length : 0

      for (const noteId of cluster.noteIds) {
        const membershipScore = scores.get(noteId) ?? 0
        clusteredNotes.push({
          noteId,
          clusterId: cluster.clusterId,
          membershipScore,
          isCentral: membershipScore >= meanScore
        })
      }
    }

    return {
      clusters,
      clusteredNotes,
      // Every note that did not end up in a kept cluster is noise.
      noiseCount: allNoteIds.length - clusteredNotes.length
    }
  }

  /**
   * Get the N most central notes from a stored cluster for naming purposes.
   * Notes flagged as central are ranked by membership score, highest first.
   *
   * @param clusterId - Cluster to read.
   * @param userId - Owner of the cluster.
   * @param n - Maximum number of notes to return (default 5).
   * @returns Up to `n` notes with their title and content.
   */
  async getCentralNotes(clusterId: number, userId: string, n: number = 5): Promise<Array<{ noteId: string; title: string | null; content: string }>> {
    const limit = Number.isFinite(n) ? Math.max(0, Math.floor(n)) : 0
    if (limit === 0) return []

    // GROUP BY replaces the former DISTINCT so the rows can be ordered by
    // score; without an ORDER BY the LIMIT picked arbitrary rows.
    const result = await prisma.$queryRawUnsafe<Array<{ noteId: string; title: string | null; content: string }>>(
      `SELECT n.id AS "noteId", n.title, n.content
       FROM "ClusterMember" cm
       INNER JOIN "Note" n ON n.id = cm."noteId"
       WHERE cm."clusterId" = $1
         AND cm."userId" = $2
         AND cm."isCentral" = true
       GROUP BY n.id, n.title, n.content
       ORDER BY MAX(cm."membershipScore") DESC, n.id ASC
       LIMIT $3`,
      clusterId,
      userId,
      limit
    )

    return result
  }

  /**
   * Replace the user's stored clustering with the given results.
   * Runs in one transaction: existing members and clusters are deleted, then
   * the new rows are bulk-inserted.
   *
   * @param userId - Owner of the clusters.
   * @param results - Clusters and per-note assignments to persist.
   */
  async saveClusteringResults(
    userId: string,
    results: { clusters: ClusterResult[]; clusteredNotes: ClusteredNote[] }
  ): Promise<void> {
    // A single timestamp so all clusters of one run share `lastCalculated`.
    const calculatedAt = new Date()

    await prisma.$transaction(async (tx) => {
      // Clear existing clusters for this user (members first).
      await tx.clusterMember.deleteMany({ where: { userId } })
      await tx.noteCluster.deleteMany({ where: { userId } })

      // Bulk inserts keep the transaction short regardless of cluster count.
      if (results.clusters.length > 0) {
        await tx.noteCluster.createMany({
          data: results.clusters.map(cluster => ({
            userId,
            clusterId: cluster.clusterId,
            name: cluster.name,
            noteCount: cluster.noteIds.length,
            lastCalculated: calculatedAt
          }))
        })
      }

      if (results.clusteredNotes.length > 0) {
        await tx.clusterMember.createMany({
          data: results.clusteredNotes.map(clusteredNote => ({
            userId,
            noteId: clusteredNote.noteId,
            clusterId: clusteredNote.clusterId,
            membershipScore: clusteredNote.membershipScore,
            isCentral: clusteredNote.isCentral
          }))
        })
      }
    })
  }

  /**
   * Generate a name for a cluster using the LLM.
   * Analyzes up to 5 of the most central notes to extract a common theme.
   * Naming is best-effort: on any failure, or when the model returns an empty
   * answer, the fallback name `Cluster <id>` is returned.
   *
   * @param clusterId - Cluster to name.
   * @param userId - Owner of the cluster.
   * @returns A name of at most 50 characters.
   */
  async generateClusterName(clusterId: number, userId: string): Promise<string> {
    const fallbackName = `Cluster ${clusterId}`
    const centralNotes = await this.getCentralNotes(clusterId, userId, 5)

    if (centralNotes.length === 0) {
      return fallbackName
    }

    const notesText = centralNotes
      .map((note, i) => {
        // Raw query rows are not validated, so tolerate a NULL content column.
        const content = note.content ?? ''
        const excerpt = content.length > this.NAME_EXCERPT_LENGTH
          ? `${content.slice(0, this.NAME_EXCERPT_LENGTH)}...`
          : content
        return `${i + 1}. "${note.title || 'Untitled'}" - ${excerpt}`
      })
      .join('\n')

    const systemPrompt = 'You are a clustering assistant. Provide ONLY a concise name (2-4 words) in English. No punctuation, no explanation.'

    const userPrompt = `Analyze the following notes that belong to the same cluster. What is the common theme?\n\n${notesText}\n\nTheme:`

    try {
      const config = await getSystemConfig()
      const provider = getChatProvider(config)
      const response = await provider.chat(
        [{ role: 'user', content: userPrompt }],
        systemPrompt
      )
      const name = typeof response?.text === 'string'
        ? response.text.trim().slice(0, this.MAX_NAME_LENGTH).trim()
        : ''
      return name.length > 0 ? name : fallbackName
    } catch (error: unknown) {
      // Best-effort: keep the fallback, but leave a trace for diagnosis.
      console.warn(`Failed to generate a name for cluster ${clusterId}:`, error)
      return fallbackName
    }
  }

  /**
   * Check if recalculation is needed based on data change percentage.
   *
   * Returns true when no clustering is stored yet, or when more than 5% of
   * the user's notes changed since the most recent calculation. Modified notes
   * include trashed ones, since trashing a note also invalidates its cluster.
   *
   * @param userId - Owner of the notes.
   */
  async shouldRecalculate(userId: string): Promise<boolean> {
    const lastCluster = await prisma.noteCluster.findFirst({
      where: { userId },
      orderBy: { lastCalculated: 'desc' }
    })

    if (!lastCluster) return true

    const since = lastCluster.lastCalculated

    const [modifiedCount, totalNotes] = await Promise.all([
      // Count notes modified since last calculation
      prisma.note.count({
        where: {
          userId,
          OR: [
            { updatedAt: { gt: since } },
            { contentUpdatedAt: { gt: since } }
          ]
        }
      }),
      prisma.note.count({
        where: { userId, trashedAt: null }
      })
    ])

    // With no live notes left, stored clusters are stale as soon as anything
    // changed (e.g. every note was trashed).
    if (totalNotes === 0) return modifiedCount > 0

    return modifiedCount / totalNotes > this.RECALCULATION_THRESHOLD
  }

  /**
   * Get cached clustering results if available and fresh.
   *
   * @param userId - Owner of the clusters.
   * @returns Stored clusters ordered by cluster id, or null when nothing is
   *          stored or a recalculation is due.
   */
  async getCachedClusters(userId: string): Promise<ClusterResult[] | null> {
    const clusters = await prisma.noteCluster.findMany({
      where: { userId },
      orderBy: { clusterId: 'asc' }
    })

    if (clusters.length === 0) return null

    // Check if data is still fresh
    const needsUpdate = await this.shouldRecalculate(userId)
    if (needsUpdate) return null

    // Load all members in one query and group them, instead of one query per cluster.
    const members = await prisma.clusterMember.findMany({
      where: { userId },
      select: { clusterId: true, noteId: true }
    })

    const noteIdsByCluster = new Map<number, string[]>()
    for (const member of members) {
      const bucket = noteIdsByCluster.get(member.clusterId)
      if (bucket) {
        bucket.push(member.noteId)
      } else {
        noteIdsByCluster.set(member.clusterId, [member.noteId])
      }
    }

    return clusters.map(cluster => ({
      clusterId: cluster.clusterId,
      noteIds: noteIdsByCluster.get(cluster.clusterId) ?? [],
      name: cluster.name || undefined
    }))
  }
}
|
|
|
|
/** Module-level ClusteringService instance exported for use by importers of this module. */
export const clusteringService = new ClusteringService()
|