Files
Momento/memento-note/lib/ai/services/clustering.service.ts
Antigravity 077e665dfc feat(cluster): implement cluster detection and bridge notes discovery
Add automatic note clustering using density-based algorithm (DBSCAN variant)
and bridge notes detection for connecting different thematic clusters.

Features:
- NoteCluster, ClusterMember, BridgeNote, BridgeSuggestion models
- Clustering service with pgvector cosine similarity
- Bridge notes detection (notes connecting >=2 clusters)
- AI-powered suggestions for missing cluster connections
- /insights page with React Flow visualization
- Cron endpoint for automatic recalculation

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 20:26:25 +00:00

437 lines
13 KiB
TypeScript

/**
* Clustering Service
*
* Density-based clustering algorithm (DBSCAN variant) for note embeddings.
* Groups semantically similar notes into clusters without requiring
* a preset number of clusters.
*
* Algorithm:
* 1. For each note, find neighbors within epsilon cosine distance
* 2. Form clusters from dense regions (min_cluster_size)
* 3. Mark outliers as noise (cluster_id = -1)
*/
import prisma from '@/lib/prisma'
import { embeddingService } from './embedding.service'
import { getChatProvider } from '@/lib/ai/factory'
import { getSystemConfig } from '@/lib/config'
export interface ClusterResult {
clusterId: number
noteIds: string[]
centroid?: number[]
name?: string
}
export interface ClusteredNote {
noteId: string
clusterId: number
membershipScore: number
isCentral: boolean
}
export interface ClusteringOptions {
minClusterSize?: number
epsilon?: number // Cosine distance threshold (lower = more strict)
maxClusters?: number
}
export class ClusteringService {
private readonly DEFAULT_MIN_CLUSTER_SIZE = 3
private readonly DEFAULT_EPSILON = 0.3 // Cosine distance ~ 1 - similarity
private readonly DEFAULT_MAX_CLUSTERS = 50
private readonly MIN_NOTES_FOR_CLUSTERING = 10
/**
* Calculate cosine similarity between two embedding vectors.
* Uses 1 - cosine_distance where cosine_distance is computed via pgvector.
*/
private async getCosineSimilarity(
noteIdA: string,
noteIdB: string
): Promise<number> {
const result = await prisma.$queryRawUnsafe<Array<{ similarity: number }>>(
`SELECT 1 - (e1."embedding"::vector <=> e2."embedding"::vector) AS similarity
FROM "NoteEmbedding" e1, "NoteEmbedding" e2
WHERE e1."noteId" = $1 AND e2."noteId" = $2`,
noteIdA,
noteIdB
)
return result[0]?.similarity || 0
}
/**
* Find all neighbors for a note within epsilon similarity threshold.
*/
private async findNeighbors(
noteId: string,
allNoteIds: string[],
epsilon: number
): Promise<string[]> {
// Convert epsilon (similarity threshold) to cosine distance
const cosineDistance = 1 - epsilon
const result = await prisma.$queryRawUnsafe<Array<{ noteId: string }>>(
`SELECT e2."noteId"
FROM "NoteEmbedding" e1
CROSS JOIN "NoteEmbedding" e2
WHERE e1."noteId" = $1
AND e2."noteId" != $1
AND e2."noteId" = ANY($2::text[])
AND (e1."embedding"::vector <=> e2."embedding"::vector) <= $3`,
noteId,
allNoteIds,
cosineDistance
)
return result.map(r => r.noteId)
}
/**
* Expand a cluster from a seed note using DBSCAN-like algorithm.
*/
private async expandCluster(
noteId: string,
neighbors: string[],
clusterId: number,
visited: Set<string>,
clustered: Map<string, number>,
allNoteIds: string[],
epsilon: number,
minClusterSize: number
): Promise<string[]> {
const clusterMembers: string[] = [noteId]
const queue = [...neighbors]
clustered.set(noteId, clusterId)
while (queue.length > 0) {
const currentNoteId = queue.shift()!
if (!visited.has(currentNoteId)) {
visited.add(currentNoteId)
const currentNeighbors = await this.findNeighbors(currentNoteId, allNoteIds, epsilon)
if (currentNeighbors.length >= minClusterSize) {
for (const neighborId of currentNeighbors) {
if (!clustered.has(neighborId)) {
clustered.set(neighborId, clusterId)
clusterMembers.push(neighborId)
queue.push(neighborId)
}
}
}
}
}
return clusterMembers
}
/**
* Perform density-based clustering on user's note embeddings.
*/
async clusterNotes(
userId: string,
options: ClusteringOptions = {}
): Promise<{
clusters: ClusterResult[]
clusteredNotes: ClusteredNote[]
noiseCount: number
}> {
const {
minClusterSize = this.DEFAULT_MIN_CLUSTER_SIZE,
epsilon = this.DEFAULT_EPSILON,
maxClusters = this.DEFAULT_MAX_CLUSTERS
} = options
// Get all user's notes with embeddings
const notesWithEmbeddings = await prisma.$queryRawUnsafe<Array<{ noteId: string }>>(
`SELECT ne."noteId"
FROM "NoteEmbedding" ne
INNER JOIN "Note" n ON n.id = ne."noteId"
WHERE n."userId" = $1
AND n."trashedAt" IS NULL
AND ne."embedding" IS NOT NULL`,
userId
)
const allNoteIds = notesWithEmbeddings.map(n => n.noteId)
if (allNoteIds.length < this.MIN_NOTES_FOR_CLUSTERING) {
return {
clusters: [],
clusteredNotes: [],
noiseCount: allNoteIds.length
}
}
const visited = new Set<string>()
const clustered = new Map<string, number>() // noteId -> clusterId
const clusterResults: ClusterResult[] = []
let clusterId = 0
// DBSCAN algorithm
for (const noteId of allNoteIds) {
if (visited.has(noteId)) continue
visited.add(noteId)
const neighbors = await this.findNeighbors(noteId, allNoteIds, epsilon)
if (neighbors.length < minClusterSize) {
// Mark as noise (cluster_id = -1)
clustered.set(noteId, -1)
continue
}
// Expand cluster
const clusterMembers = await this.expandCluster(
noteId,
neighbors,
clusterId,
visited,
clustered,
allNoteIds,
epsilon,
minClusterSize
)
if (clusterMembers.length >= minClusterSize && clusterId < maxClusters) {
clusterResults.push({
clusterId,
noteIds: clusterMembers
})
clusterId++
} else {
// Too small, mark as noise
for (const memberId of clusterMembers) {
clustered.set(memberId, -1)
}
}
}
// Calculate membership scores and identify central notes
const clusteredNotes: ClusteredNote[] = []
for (const [noteId, cid] of clustered.entries()) {
if (cid === -1) continue // Skip noise
const cluster = clusterResults[cid]
if (!cluster) continue
// Calculate membership score as average similarity to other cluster members
const score = await this.calculateMembershipScore(noteId, cluster.noteIds)
const isCentral = await this.isCentralNote(noteId, cluster.noteIds)
clusteredNotes.push({
noteId,
clusterId: cid,
membershipScore: score,
isCentral
})
}
const noiseCount = Array.from(clustered.values()).filter(id => id === -1).length
return {
clusters: clusterResults,
clusteredNotes,
noiseCount
}
}
/**
* Calculate membership score for a note within its cluster.
* Score = average similarity to all other cluster members.
*/
private async calculateMembershipScore(noteId: string, clusterMemberIds: string[]): Promise<number> {
if (clusterMemberIds.length <= 1) return 1.0
const similarities: number[] = []
for (const memberId of clusterMemberIds) {
if (memberId === noteId) continue
const sim = await this.getCosineSimilarity(noteId, memberId)
similarities.push(sim)
}
return similarities.length > 0
? similarities.reduce((a, b) => a + b, 0) / similarities.length
: 1.0
}
/**
* Determine if a note is central to its cluster.
* A note is central if its average similarity to other members
* is above the cluster mean.
*/
private async isCentralNote(noteId: string, clusterMemberIds: string[]): Promise<boolean> {
const allScores: Array<{ memberId: string; score: number }> = []
for (const memberId of clusterMemberIds) {
const score = await this.calculateMembershipScore(memberId, clusterMemberIds)
allScores.push({ memberId, score })
}
const meanScore = allScores.reduce((sum, s) => sum + s.score, 0) / allScores.length
const noteScore = allScores.find(s => s.memberId === noteId)?.score || 0
return noteScore >= meanScore
}
/**
* Get the N most central notes from a cluster for naming purposes.
*/
async getCentralNotes(clusterId: number, userId: string, n: number = 5): Promise<Array<{ noteId: string; title: string | null; content: string }>> {
const result = await prisma.$queryRawUnsafe<Array<{ noteId: string; title: string | null; content: string }>>(
`SELECT DISTINCT n.id AS "noteId", n.title, n.content
FROM "ClusterMember" cm
INNER JOIN "Note" n ON n.id = cm."noteId"
WHERE cm."clusterId" = $1
AND cm."userId" = $2
AND cm."isCentral" = true
LIMIT $3`,
clusterId,
userId,
n
)
return result
}
/**
* Save clustering results to database.
*/
async saveClusteringResults(
userId: string,
results: { clusters: ClusterResult[]; clusteredNotes: ClusteredNote[] }
): Promise<void> {
await prisma.$transaction(async (tx) => {
// Clear existing clusters for this user
await tx.$executeRawUnsafe(`DELETE FROM "ClusterMember" WHERE "userId" = $1`, userId)
await tx.$executeRawUnsafe(`DELETE FROM "NoteCluster" WHERE "userId" = $1`, userId)
// Insert new clusters
for (const cluster of results.clusters) {
await tx.noteCluster.create({
data: {
userId,
clusterId: cluster.clusterId,
name: cluster.name,
noteCount: cluster.noteIds.length,
lastCalculated: new Date()
}
})
}
// Insert cluster members
for (const clusteredNote of results.clusteredNotes) {
await tx.clusterMember.create({
data: {
userId,
noteId: clusteredNote.noteId,
clusterId: clusteredNote.clusterId,
membershipScore: clusteredNote.membershipScore,
isCentral: clusteredNote.isCentral
}
})
}
})
}
/**
* Generate a name for a cluster using the LLM.
* Analyzes the 5 most central notes to extract a common theme.
*/
async generateClusterName(clusterId: number, userId: string): Promise<string> {
const centralNotes = await this.getCentralNotes(clusterId, userId, 5)
if (centralNotes.length === 0) {
return `Cluster ${clusterId}`
}
const notesText = centralNotes
.map((note, i) => `${i + 1}. "${note.title || 'Untitled'}" - ${note.content.slice(0, 100)}...`)
.join('\n')
const systemPrompt = 'You are a clustering assistant. Provide ONLY a concise name (2-4 words) in English. No punctuation, no explanation.'
const userPrompt = `Analyze these 5 notes that belong to the same cluster. What is the common theme?\n\n${notesText}\n\nTheme:`
try {
const config = await getSystemConfig()
const provider = getChatProvider(config)
const response = await provider.chat(
[{ role: 'user', content: userPrompt }],
systemPrompt
)
return response.text.trim().slice(0, 50)
} catch {
return `Cluster ${clusterId}`
}
}
/**
* Check if recalculation is needed based on data change percentage.
*/
async shouldRecalculate(userId: string): Promise<boolean> {
const lastCluster = await prisma.noteCluster.findFirst({
where: { userId },
orderBy: { lastCalculated: 'desc' }
})
if (!lastCluster) return true
// Count notes modified since last calculation
const modifiedCount = await prisma.note.count({
where: {
userId,
OR: [
{ updatedAt: { gt: lastCluster.lastCalculated } },
{ contentUpdatedAt: { gt: lastCluster.lastCalculated } }
]
}
})
const totalNotes = await prisma.note.count({
where: { userId, trashedAt: null }
})
if (totalNotes === 0) return false
const changePercentage = modifiedCount / totalNotes
return changePercentage > 0.05 // More than 5% changed
}
/**
* Get cached clustering results if available and fresh.
*/
async getCachedClusters(userId: string): Promise<ClusterResult[] | null> {
const clusters = await prisma.noteCluster.findMany({
where: { userId },
orderBy: { clusterId: 'asc' }
})
if (clusters.length === 0) return null
// Check if data is still fresh
const needsUpdate = await this.shouldRecalculate(userId)
if (needsUpdate) return null
// Get cluster members
const result: ClusterResult[] = []
for (const cluster of clusters) {
const members = await prisma.clusterMember.findMany({
where: { clusterId: cluster.clusterId, userId },
select: { noteId: true }
})
result.push({
clusterId: cluster.clusterId,
noteIds: members.map(m => m.noteId),
name: cluster.name || undefined
})
}
return result
}
}
export const clusteringService = new ClusteringService()