Files
Momento/memento-note/lib/ai/services/clustering.service.ts
Antigravity e881004c77
Some checks failed
CI / Lint, Test & Build (push) Failing after 1m7s
CI / Deploy production (on server) (push) Has been skipped
feat(insights): fix DBSCAN, Persian embeddings crash, D3 physics layouts, and D3 node not found runtime error
2026-05-24 18:57:33 +00:00

744 lines
25 KiB
TypeScript

/**
* Clustering Service
*
* Density-based clustering algorithm (DBSCAN variant) for note embeddings.
* Groups semantically similar notes into clusters without requiring
* a preset number of clusters.
*
* Algorithm:
* 1. For each note, find neighbors within epsilon cosine distance
* 2. Form clusters from dense regions (min_cluster_size)
* 3. Mark outliers as noise (cluster_id = -1)
*/
import prisma from '@/lib/prisma'
import { embeddingService } from './embedding.service'
import { getChatProvider } from '@/lib/ai/factory'
import { getSystemConfig } from '@/lib/config'
import { upsertNoteEmbedding } from '@/lib/embeddings'
/**
 * One cluster produced by a clustering run.
 */
export interface ClusterResult {
  /** Numeric identifier of the cluster within a run. */
  clusterId: number
  /** Ids of the notes assigned to this cluster. */
  noteIds: Array<string>
  /** Optional centroid vector of the cluster. */
  centroid?: Array<number>
  /** Optional human-readable name of the cluster. */
  name?: string
}
/**
 * Membership record linking a single note to the cluster it was assigned to.
 */
export interface ClusteredNote {
  /** Id of the note. */
  noteId: string
  /** Id of the cluster the note belongs to. */
  clusterId: number
  /** Score describing how strongly the note belongs to its cluster. */
  membershipScore: number
  /** Whether the note is flagged as central to its cluster. */
  isCentral: boolean
}
/**
 * Tuning knobs accepted by the clustering entry point. Every field is optional.
 */
export interface ClusteringOptions {
  /** Minimum size used when forming clusters. */
  minClusterSize?: number
  /** Cosine distance threshold (lower = more strict). */
  epsilon?: number
  /** Upper bound on the number of clusters produced. */
  maxClusters?: number
  /** Internal use — prevents a retry loop. */
  _relaxedRetry?: boolean
}
/**
 * Density-based (DBSCAN-style) clustering of a user's note embeddings.
 *
 * The main entry point is {@link ClusteringService.clusterNotes}, which loads
 * every embedding once and then works purely in memory. The remaining public
 * methods persist, name and reload the resulting clusters.
 */
export class ClusteringService {
  /** Cluster id assigned to notes that belong to no cluster. */
  private readonly NOISE = -1
  /** Maximum number of clusters when the caller gives no `maxClusters`. */
  private readonly DEFAULT_MAX_CLUSTERS = 50
  /** Below this number of embedded notes no clustering is attempted. */
  private readonly MIN_NOTES_FOR_CLUSTERING = 10
  /** Epsilon used when the caller pins `minClusterSize` but not `epsilon`. */
  private readonly FALLBACK_EPSILON = 0.28
  /** Minimum size used when the caller pins `epsilon` but not `minClusterSize`. */
  private readonly FALLBACK_MIN_CLUSTER_SIZE = 2
  /** Maximum cosine distance for two leftover noise notes to be paired up. */
  private readonly MICRO_PAIR_MAX_DISTANCE = 0.22
  /** A cluster holding more than this share of all notes is considered "giant". */
  private readonly GIANT_CLUSTER_RATIO = 0.7
  /** Cluster count the profile search tries to get close to. */
  private readonly TARGET_CLUSTER_COUNT = 5
  /** Inclusive cluster-count range that ends the profile search immediately. */
  private readonly MIN_OPTIMAL_CLUSTERS = 3
  private readonly MAX_OPTIMAL_CLUSTERS = 8
  /**
   * (epsilon, minimum size) profiles tried in order when the caller does not
   * pin any parameter. `eps` is a cosine distance.
   */
  private readonly SEARCH_PROFILES: ReadonlyArray<{ eps: number; minSize: number }> = [
    { eps: 0.28, minSize: 2 }, // first choice
    { eps: 0.25, minSize: 2 }, // slightly stricter
    { eps: 0.3, minSize: 2 }, // slightly looser
    { eps: 0.22, minSize: 2 }, // strict
    { eps: 0.18, minSize: 2 }, // very strict
    { eps: 0.25, minSize: 1 }, // allow tight pairs
    { eps: 0.22, minSize: 1 }, // allow tight pairs, stricter
    { eps: 0.28, minSize: 3 }, // at least 3 notes
    { eps: 0.25, minSize: 3 }, // at least 3 notes, stricter
    { eps: 0.32, minSize: 2 }, // looser
    { eps: 0.35, minSize: 2 }, // loosest
  ]

  /**
   * Generates the embeddings that are missing or outdated for a user's notes
   * (embeddings are required for semantic clustering).
   *
   * A note is (re-)embedded when `options.force` is set, when it has no
   * embedding, when it was updated after its embedding was created, or when it
   * has a non-empty `sourceUrl`. Notes with blank content are skipped. A
   * failure on one note does not stop the others.
   *
   * @param userId - Owner of the notes.
   * @param options - `force: true` re-embeds every note with content.
   * @returns `created`: embeddings written during this call; `total`: number of
   *   stored embeddings for the user's non-trashed notes.
   */
  async ensureEmbeddings(
    userId: string,
    options?: { force?: boolean },
  ): Promise<{ created: number; total: number }> {
    const notes = await prisma.note.findMany({
      where: {
        userId,
        isArchived: false,
        trashedAt: null,
      },
      select: {
        id: true,
        title: true,
        content: true,
        sourceUrl: true,
        updatedAt: true,
        noteEmbedding: { select: { noteId: true, createdAt: true } },
      },
    })

    let created = 0
    let failed = 0
    let lastError: unknown
    for (const note of notes) {
      if (!note.content?.trim()) continue

      const isClip = Boolean(note.sourceUrl?.trim())
      const isMissing = !note.noteEmbedding
      const isStale = Boolean(
        note.noteEmbedding && note.updatedAt > note.noteEmbedding.createdAt,
      )
      if (!options?.force && !isMissing && !isStale && !isClip) continue

      try {
        const { embedding } = await embeddingService.generateNoteEmbedding(
          note.title,
          note.content,
        )
        if (embedding?.length) {
          await upsertNoteEmbedding(note.id, embedding)
          created++
        }
      } catch (error: unknown) {
        // Best effort: skip this note (e.g. AI provider unavailable) and go on.
        failed++
        lastError = error
      }
    }
    if (failed > 0) {
      const reason = lastError instanceof Error ? lastError.message : String(lastError)
      console.warn(
        `[Clustering] ${failed} note embedding(s) could not be generated (last error: ${reason})`,
      )
    }

    // NOTE(review): archived notes are counted here although they are skipped
    // above — confirm this is intended.
    const totalRow = await prisma.$queryRawUnsafe<Array<{ count: bigint }>>(
      `SELECT COUNT(*) FROM "NoteEmbedding" ne
       INNER JOIN "Note" n ON n.id = ne."noteId"
       WHERE n."userId" = $1 AND n."trashedAt" IS NULL AND ne."embedding" IS NOT NULL`,
      userId,
    )
    return { created, total: Number(totalRow[0]?.count ?? 0) }
  }

  /**
   * Cosine similarity between the stored embeddings of two notes, computed in
   * the database as `1 - (a <=> b)`.
   *
   * @returns The similarity, or 0 when no row or no finite value is returned.
   */
  private async getCosineSimilarity(noteIdA: string, noteIdB: string): Promise<number> {
    const rows = await prisma.$queryRawUnsafe<Array<{ similarity: number | null }>>(
      `SELECT 1 - (e1."embedding"::vector <=> e2."embedding"::vector) AS similarity
       FROM "NoteEmbedding" e1, "NoteEmbedding" e2
       WHERE e1."noteId" = $1 AND e2."noteId" = $2`,
      noteIdA,
      noteIdB,
    )
    const similarity = rows[0]?.similarity
    return typeof similarity === 'number' && Number.isFinite(similarity) ? similarity : 0
  }

  /**
   * Database-backed neighbour lookup: ids among `allNoteIds` whose embedding is
   * within `epsilon` cosine distance of `noteId`'s embedding.
   *
   * `epsilon` is a cosine *distance*, the same convention as
   * `ClusteringOptions.epsilon` and the in-memory lookup, so it is compared to
   * the `<=>` result directly.
   */
  private async findNeighbors(
    noteId: string,
    allNoteIds: string[],
    epsilon: number,
  ): Promise<string[]> {
    if (allNoteIds.length === 0) return []
    const rows = await prisma.$queryRawUnsafe<Array<{ noteId: string }>>(
      `SELECT e2."noteId"
       FROM "NoteEmbedding" e1
       CROSS JOIN "NoteEmbedding" e2
       WHERE e1."noteId" = $1
         AND e2."noteId" != $1
         AND e2."noteId" = ANY($2::text[])
         AND (e1."embedding"::vector <=> e2."embedding"::vector) <= $3`,
      noteId,
      allNoteIds,
      epsilon,
    )
    return rows.map(row => row.noteId)
  }

  /**
   * Assigns `candidateId` to `clusterId` if it is still unassigned or marked as
   * noise, recording it in `members`.
   *
   * @returns `true` when the note was claimed, `false` when it already belongs
   *   to a cluster.
   */
  private claimForCluster(
    candidateId: string,
    clusterId: number,
    assignment: Map<string, number>,
    members: string[],
  ): boolean {
    const status = assignment.get(candidateId)
    if (status !== undefined && status !== this.NOISE) return false
    assignment.set(candidateId, clusterId)
    members.push(candidateId)
    return true
  }

  /**
   * Database-backed DBSCAN expansion from a seed note.
   *
   * The seed's direct neighbours join the cluster (unless another cluster
   * already owns them); neighbours of every core note reached are claimed in
   * turn. Notes previously marked as noise can be claimed.
   *
   * @param noteId - Seed note, assumed by the caller to be a core note.
   * @param neighbors - Neighbours of the seed.
   * @param visited - Notes whose neighbourhood was already examined (mutated).
   * @param clustered - noteId -> cluster id, -1 for noise (mutated).
   * @returns Ids of the notes added to the cluster, seed first.
   */
  private async expandCluster(
    noteId: string,
    neighbors: string[],
    clusterId: number,
    visited: Set<string>,
    clustered: Map<string, number>,
    allNoteIds: string[],
    epsilon: number,
    minClusterSize: number,
  ): Promise<string[]> {
    const clusterMembers: string[] = [noteId]
    clustered.set(noteId, clusterId)

    const frontier = [...neighbors]
    for (const neighborId of neighbors) {
      this.claimForCluster(neighborId, clusterId, clustered, clusterMembers)
    }

    // The frontier grows while it is scanned; a cursor avoids O(n) shift().
    for (let cursor = 0; cursor < frontier.length; cursor++) {
      const currentId = frontier[cursor]
      if (currentId === undefined || visited.has(currentId)) continue
      visited.add(currentId)

      const reachable = await this.findNeighbors(currentId, allNoteIds, epsilon)
      if (reachable.length < minClusterSize) continue // border note: no expansion
      for (const neighborId of reachable) {
        if (this.claimForCluster(neighborId, clusterId, clustered, clusterMembers)) {
          frontier.push(neighborId)
        }
      }
    }
    return clusterMembers
  }

  /**
   * Cosine similarity of two in-memory vectors.
   *
   * @returns The similarity, or 0 when the vectors are empty, have different
   *   dimensions, have zero norm, or the result is not finite. Returning 0
   *   keeps NaN out of distance comparisons and membership averages.
   */
  private calculateCosineSimilarityInMemory(vecA: number[], vecB: number[]): number {
    const dimension = vecA.length
    if (dimension === 0 || dimension !== vecB.length) return 0

    let dotProduct = 0
    let normA = 0
    let normB = 0
    for (let i = 0; i < dimension; i++) {
      const a = vecA[i] ?? 0
      const b = vecB[i] ?? 0
      dotProduct += a * b
      normA += a * a
      normB += b * b
    }
    if (normA === 0 || normB === 0) return 0

    const similarity = dotProduct / (Math.sqrt(normA) * Math.sqrt(normB))
    return Number.isFinite(similarity) ? similarity : 0
  }

  /**
   * Parses the text form of a stored embedding (a JSON-style array of numbers).
   *
   * @returns The vector, or `null` when the text is missing, is not valid JSON,
   *   or is not a non-empty array of finite numbers.
   */
  private parseEmbedding(raw: string | null | undefined): number[] | null {
    if (!raw) return null
    let parsed: unknown
    try {
      parsed = JSON.parse(raw)
    } catch {
      return null
    }
    if (!Array.isArray(parsed) || parsed.length === 0) return null
    const isNumeric = parsed.every(
      (value: unknown) => typeof value === 'number' && Number.isFinite(value),
    )
    return isNumeric ? (parsed as number[]) : null
  }

  /**
   * Loads every embedding of the user's non-trashed notes with a single query.
   *
   * @returns noteId -> vector, in the order rows were returned. Rows whose
   *   embedding cannot be parsed are left out (and reported once on stderr).
   */
  private async loadEmbeddings(userId: string): Promise<Map<string, number[]>> {
    const rows = await prisma.$queryRawUnsafe<Array<{ noteId: string; embedding: string | null }>>(
      `SELECT ne."noteId", ne."embedding"::text AS "embedding"
       FROM "NoteEmbedding" ne
       INNER JOIN "Note" n ON n.id = ne."noteId"
       WHERE n."userId" = $1
         AND n."trashedAt" IS NULL
         AND ne."embedding" IS NOT NULL`,
      userId,
    )

    const vectors = new Map<string, number[]>()
    let rejected = 0
    for (const row of rows) {
      const vector = this.parseEmbedding(row.embedding)
      if (vector) {
        vectors.set(row.noteId, vector)
      } else {
        rejected++
      }
    }
    if (rejected > 0) {
      console.error(`Error parsing embedding vector: ${rejected} row(s) skipped`)
    }
    return vectors
  }

  /**
   * Ids of all notes (other than `noteId`) whose vector is within
   * `maxDistance` cosine distance of `noteId`'s vector, in map order.
   */
  private neighborsWithin(
    noteId: string,
    vectors: Map<string, number[]>,
    maxDistance: number,
  ): string[] {
    const origin = vectors.get(noteId)
    if (!origin) return []

    const neighbors: string[] = []
    for (const [otherId, other] of vectors) {
      if (otherId === noteId) continue
      const distance = 1 - this.calculateCosineSimilarityInMemory(origin, other)
      if (distance <= maxDistance) neighbors.push(otherId)
    }
    return neighbors
  }

  /**
   * Runs one in-memory DBSCAN pass.
   *
   * A note is a core note when it has at least `minSize` neighbours within
   * `eps`. A cluster is kept when it gathered at least `minSize` members and
   * the cluster limit is not reached; otherwise its members become noise.
   *
   * @returns The accepted clusters (cluster id equals array index) and the
   *   noteId -> cluster id assignment (-1 for noise).
   */
  private runDbscan(
    noteIds: string[],
    vectors: Map<string, number[]>,
    eps: number,
    minSize: number,
    maxClusters: number,
  ): { clusters: ClusterResult[]; assignment: Map<string, number> } {
    const visited = new Set<string>()
    const assignment = new Map<string, number>()
    const clusters: ClusterResult[] = []

    for (const seedId of noteIds) {
      if (visited.has(seedId)) continue
      visited.add(seedId)

      const seedNeighbors = this.neighborsWithin(seedId, vectors, eps)
      if (seedNeighbors.length < minSize) {
        assignment.set(seedId, this.NOISE)
        continue
      }

      // The seed is a core note: grow a new cluster from it.
      const clusterId = clusters.length
      const members: string[] = [seedId]
      assignment.set(seedId, clusterId)

      const frontier = [...seedNeighbors]
      for (const neighborId of seedNeighbors) {
        this.claimForCluster(neighborId, clusterId, assignment, members)
      }
      // The frontier grows while it is scanned; a cursor avoids O(n) shift().
      for (let cursor = 0; cursor < frontier.length; cursor++) {
        const currentId = frontier[cursor]
        if (currentId === undefined || visited.has(currentId)) continue
        visited.add(currentId)

        const reachable = this.neighborsWithin(currentId, vectors, eps)
        if (reachable.length < minSize) continue // border note: no expansion
        for (const neighborId of reachable) {
          if (this.claimForCluster(neighborId, clusterId, assignment, members)) {
            frontier.push(neighborId)
          }
        }
      }

      if (members.length >= minSize && clusterId < maxClusters) {
        clusters.push({ clusterId, noteIds: members })
      } else {
        for (const memberId of members) assignment.set(memberId, this.NOISE)
      }
    }
    return { clusters, assignment }
  }

  /** Number of notes marked as noise in an assignment. */
  private countNoise(assignment: Map<string, number>): number {
    let noise = 0
    for (const clusterId of assignment.values()) {
      if (clusterId === this.NOISE) noise++
    }
    return noise
  }

  /**
   * Pairs up leftover noise notes that are extremely close to each other.
   *
   * Each unpaired noise note is matched with its closest unpaired noise note
   * when their cosine distance is at most `MICRO_PAIR_MAX_DISTANCE`; every pair
   * becomes a new two-note cluster until `maxClusters` is reached. This runs
   * regardless of the minimum cluster size of the selected profile.
   * `clusters` and `assignment` are mutated in place.
   */
  private mergeNoisePairs(
    noteIds: string[],
    vectors: Map<string, number[]>,
    clusters: ClusterResult[],
    assignment: Map<string, number>,
    maxClusters: number,
  ): void {
    const noiseIds = noteIds.filter(id => assignment.get(id) === this.NOISE)
    const paired = new Set<string>()

    for (const idA of noiseIds) {
      if (clusters.length >= maxClusters) break // no room for another cluster
      if (paired.has(idA)) continue
      const vecA = vectors.get(idA)
      if (!vecA) continue

      let partnerId: string | null = null
      let partnerDistance = 1.0
      for (const idB of noiseIds) {
        if (idB === idA || paired.has(idB)) continue
        const vecB = vectors.get(idB)
        if (!vecB) continue
        const distance = 1 - this.calculateCosineSimilarityInMemory(vecA, vecB)
        if (distance <= this.MICRO_PAIR_MAX_DISTANCE && distance < partnerDistance) {
          partnerDistance = distance
          partnerId = idB
        }
      }
      if (partnerId === null) continue

      const newClusterId = clusters.length
      clusters.push({ clusterId: newClusterId, noteIds: [idA, partnerId] })
      assignment.set(idA, newClusterId)
      assignment.set(partnerId, newClusterId)
      paired.add(idA)
      paired.add(partnerId)
      console.log(`[DBSCAN Clustering] Formed high-density micro-cluster ${newClusterId} for pair [${idA}, ${partnerId}] (Distance: ${partnerDistance.toFixed(4)})`)
    }
  }

  /**
   * Average in-memory cosine similarity between a note and the other members
   * of its cluster.
   *
   * @returns 1 for a single-member cluster or when no other member has a
   *   vector, 0 when the note itself has no vector.
   */
  private averageSimilarityToMembers(
    noteId: string,
    memberIds: string[],
    vectors: Map<string, number[]>,
  ): number {
    if (memberIds.length <= 1) return 1.0
    const origin = vectors.get(noteId)
    if (!origin) return 0.0

    let total = 0
    let count = 0
    for (const memberId of memberIds) {
      if (memberId === noteId) continue
      const other = vectors.get(memberId)
      if (!other) continue
      total += this.calculateCosineSimilarityInMemory(origin, other)
      count++
    }
    return count > 0 ? total / count : 1.0
  }

  /**
   * Performs density-based clustering on a user's note embeddings.
   *
   * All embeddings are fetched with one query and processed in memory. When
   * the caller pins neither `epsilon` nor `minClusterSize`, the built-in
   * profiles are tried in order: the first one yielding 3 to 8 clusters with
   * no cluster holding more than 70% of the notes wins; otherwise the run
   * whose cluster count is closest to 5 is kept. When either parameter is
   * given, a single run is made (missing one defaults to 0.28 / 2). Leftover
   * noise notes that are extremely close are then paired into two-note
   * clusters.
   *
   * @param userId - Owner of the notes.
   * @param options - Optional tuning; `maxClusters` defaults to 50.
   * @returns The clusters, the per-note membership (score = average similarity
   *   to the other members; central = score at or above the cluster mean) and
   *   the number of notes left as noise. With fewer than 10 embedded notes no
   *   clustering is attempted and every note counts as noise.
   */
  async clusterNotes(
    userId: string,
    options: ClusteringOptions = {},
  ): Promise<{
    clusters: ClusterResult[]
    clusteredNotes: ClusteredNote[]
    noiseCount: number
  }> {
    const { maxClusters = this.DEFAULT_MAX_CLUSTERS } = options

    const vectors = await this.loadEmbeddings(userId)
    const allNoteIds = Array.from(vectors.keys())
    if (allNoteIds.length < this.MIN_NOTES_FOR_CLUSTERING) {
      return { clusters: [], clusteredNotes: [], noiseCount: allNoteIds.length }
    }

    // Explicit parameters bypass the profile search.
    const hasExplicitParams =
      options.epsilon !== undefined || options.minClusterSize !== undefined
    const profiles: ReadonlyArray<{ eps: number; minSize: number }> = hasExplicitParams
      ? [
          {
            eps: options.epsilon ?? this.FALLBACK_EPSILON,
            minSize: options.minClusterSize ?? this.FALLBACK_MIN_CLUSTER_SIZE,
          },
        ]
      : this.SEARCH_PROFILES

    let best: {
      profile: { eps: number; minSize: number }
      clusters: ClusterResult[]
      assignment: Map<string, number>
    } | null = null

    for (const profile of profiles) {
      const run = this.runDbscan(allNoteIds, vectors, profile.eps, profile.minSize, maxClusters)
      const clusterCount = run.clusters.length
      const largestClusterSize = run.clusters.reduce(
        (max, cluster) => Math.max(max, cluster.noteIds.length),
        0,
      )
      const hasGiantCluster = largestClusterSize > allNoteIds.length * this.GIANT_CLUSTER_RATIO
      const candidate = { profile, clusters: run.clusters, assignment: run.assignment }

      const isOptimal =
        clusterCount >= this.MIN_OPTIMAL_CLUSTERS &&
        clusterCount <= this.MAX_OPTIMAL_CLUSTERS &&
        !hasGiantCluster
      if (isOptimal) {
        best = candidate
        break
      }

      // Otherwise keep the run whose cluster count is closest to the target.
      const isBetter =
        best === null ||
        best.clusters.length === 0 ||
        Math.abs(clusterCount - this.TARGET_CLUSTER_COUNT) <
          Math.abs(best.clusters.length - this.TARGET_CLUSTER_COUNT) ||
        (best.clusters.length === 1 && clusterCount > 1)
      if (isBetter) best = candidate
    }

    if (best === null) {
      // Only reachable if the profile list is empty.
      return { clusters: [], clusteredNotes: [], noiseCount: allNoteIds.length }
    }

    console.log(`[DBSCAN Clustering] Selected configuration: epsilon=${best.profile.eps}, minSize=${best.profile.minSize} -> Generated ${best.clusters.length} clusters (Noise: ${this.countNoise(best.assignment)})`)

    this.mergeNoisePairs(allNoteIds, vectors, best.clusters, best.assignment, maxClusters)
    // Recount the noise now that some noise notes may have been paired.
    const noiseCount = this.countNoise(best.assignment)

    const clusterById = new Map<number, ClusterResult>()
    for (const cluster of best.clusters) clusterById.set(cluster.clusterId, cluster)

    const clusteredNotes: ClusteredNote[] = []
    const membersByCluster = new Map<number, ClusteredNote[]>()
    for (const [noteId, clusterId] of best.assignment) {
      if (clusterId === this.NOISE) continue
      const cluster = clusterById.get(clusterId)
      if (!cluster) continue

      const member: ClusteredNote = {
        noteId,
        clusterId,
        membershipScore: this.averageSimilarityToMembers(noteId, cluster.noteIds, vectors),
        isCentral: false, // decided below
      }
      clusteredNotes.push(member)
      const group = membersByCluster.get(clusterId)
      if (group) {
        group.push(member)
      } else {
        membersByCluster.set(clusterId, [member])
      }
    }

    // A note is central when its score reaches its cluster's mean score.
    for (const group of membersByCluster.values()) {
      const meanScore =
        group.reduce((sum, member) => sum + member.membershipScore, 0) / group.length
      for (const member of group) {
        member.isCentral = member.membershipScore >= meanScore
      }
    }

    return { clusters: best.clusters, clusteredNotes, noiseCount }
  }

  /**
   * Database-backed membership score: average cosine similarity between a note
   * and every other member of its cluster (one query per member).
   *
   * @returns 1 for a single-member cluster.
   */
  private async calculateMembershipScore(
    noteId: string,
    clusterMemberIds: string[],
  ): Promise<number> {
    if (clusterMemberIds.length <= 1) return 1.0

    let total = 0
    let count = 0
    for (const memberId of clusterMemberIds) {
      if (memberId === noteId) continue
      total += await this.getCosineSimilarity(noteId, memberId)
      count++
    }
    return count > 0 ? total / count : 1.0
  }

  /**
   * Database-backed centrality test: a note is central when its membership
   * score is at or above the mean membership score of its cluster.
   *
   * @returns `false` for an empty member list.
   */
  private async isCentralNote(noteId: string, clusterMemberIds: string[]): Promise<boolean> {
    if (clusterMemberIds.length === 0) return false

    const scoreByMember = new Map<string, number>()
    let total = 0
    for (const memberId of clusterMemberIds) {
      const score = await this.calculateMembershipScore(memberId, clusterMemberIds)
      scoreByMember.set(memberId, score)
      total += score
    }
    const meanScore = total / clusterMemberIds.length
    return (scoreByMember.get(noteId) ?? 0) >= meanScore
  }

  /**
   * Returns up to `n` central notes of a stored cluster, highest membership
   * score first, for naming purposes.
   *
   * @param clusterId - Stored cluster id.
   * @param userId - Owner of the cluster.
   * @param n - Maximum number of notes returned (default 5).
   */
  async getCentralNotes(
    clusterId: number,
    userId: string,
    n: number = 5,
  ): Promise<Array<{ noteId: string; title: string | null; content: string }>> {
    // GROUP BY keeps one row per note (as DISTINCT did) while allowing the
    // ordering by score that makes LIMIT pick the most central notes.
    const result = await prisma.$queryRawUnsafe<Array<{ noteId: string; title: string | null; content: string }>>(
      `SELECT n.id AS "noteId", n.title, n.content
       FROM "ClusterMember" cm
       INNER JOIN "Note" n ON n.id = cm."noteId"
       WHERE cm."clusterId" = $1
         AND cm."userId" = $2
         AND cm."isCentral" = true
       GROUP BY n.id, n.title, n.content
       ORDER BY MAX(cm."membershipScore") DESC, n.id ASC
       LIMIT $3`,
      clusterId,
      userId,
      n,
    )
    return result
  }

  /**
   * Replaces the user's stored clusters and memberships with `results`, inside
   * one transaction. All clusters of a run share the same `lastCalculated`.
   */
  async saveClusteringResults(
    userId: string,
    results: { clusters: ClusterResult[]; clusteredNotes: ClusteredNote[] },
  ): Promise<void> {
    const calculatedAt = new Date()
    await prisma.$transaction(async (tx) => {
      // Clear existing clusters for this user (members first).
      await tx.$executeRawUnsafe(`DELETE FROM "ClusterMember" WHERE "userId" = $1`, userId)
      await tx.$executeRawUnsafe(`DELETE FROM "NoteCluster" WHERE "userId" = $1`, userId)

      // Batched inserts keep the transaction short even for many notes.
      if (results.clusters.length > 0) {
        await tx.noteCluster.createMany({
          data: results.clusters.map(cluster => ({
            userId,
            clusterId: cluster.clusterId,
            name: cluster.name,
            noteCount: cluster.noteIds.length,
            lastCalculated: calculatedAt,
          })),
        })
      }
      if (results.clusteredNotes.length > 0) {
        await tx.clusterMember.createMany({
          data: results.clusteredNotes.map(member => ({
            userId,
            noteId: member.noteId,
            clusterId: member.clusterId,
            membershipScore: member.membershipScore,
            isCentral: member.isCentral,
          })),
        })
      }
    })
  }

  /**
   * Generates a name for a stored cluster by asking the chat provider for the
   * common theme of up to 5 central notes.
   *
   * @returns The trimmed answer cut to 50 characters, or `Cluster <id>` when
   *   the cluster has no central note, the provider fails, or the answer is
   *   empty.
   */
  async generateClusterName(clusterId: number, userId: string): Promise<string> {
    const fallbackName = `Cluster ${clusterId}`
    const centralNotes = await this.getCentralNotes(clusterId, userId, 5)
    if (centralNotes.length === 0) {
      return fallbackName
    }

    const notesText = centralNotes
      .map((note, i) => `${i + 1}. "${note.title || 'Untitled'}" - ${(note.content ?? '').slice(0, 100)}...`)
      .join('\n')
    const systemPrompt = "Vous êtes un assistant d'analyse sémantique. Analysez les notes fournies et dégagez un thème commun clair, élégant et évocateur (2 à 4 mots maximum), écrit en français (ou dans la langue principale des notes). Ne donnez QUE le titre thématique final, sans ponctuation, sans guillemets, et sans aucune explication."
    const userPrompt = `Voici 5 notes centrales appartenant au même groupe thématique. Quel est leur thème commun ?\n\n${notesText}\n\nThème :`

    try {
      const config = await getSystemConfig()
      const provider = getChatProvider(config)
      const response = await provider.chat(
        [{ role: 'user', content: userPrompt }],
        systemPrompt,
      )
      const name = response.text.trim().slice(0, 50)
      return name || fallbackName
    } catch {
      return fallbackName
    }
  }

  /**
   * Tells whether the stored clusters should be recalculated.
   *
   * @returns `true` when nothing is stored (or the stored timestamp is
   *   missing), or when the notes modified since the last calculation exceed
   *   5% of the user's non-trashed notes; `false` when the user has no
   *   non-trashed note.
   */
  async shouldRecalculate(userId: string): Promise<boolean> {
    const lastCluster = await prisma.noteCluster.findFirst({
      where: { userId },
      orderBy: { lastCalculated: 'desc' },
    })
    if (!lastCluster || !lastCluster.lastCalculated) return true
    const since = lastCluster.lastCalculated

    // The two counts are independent, so they run in parallel.
    const [modifiedCount, totalNotes] = await Promise.all([
      prisma.note.count({
        where: {
          userId,
          OR: [
            { updatedAt: { gt: since } },
            { contentUpdatedAt: { gt: since } },
          ],
        },
      }),
      prisma.note.count({
        where: { userId, trashedAt: null },
      }),
    ])
    if (totalNotes === 0) return false

    return modifiedCount / totalNotes > 0.05 // more than 5% changed
  }

  /**
   * Loads the clusters stored in the database, even when they are stale.
   *
   * @returns `null` when the user has no stored cluster; otherwise the clusters
   *   ordered by id, whether they are stale, and the most recent calculation
   *   date found.
   */
  async getStoredClusters(userId: string): Promise<{
    clusters: ClusterResult[]
    stale: boolean
    lastCalculated: Date | null
  } | null> {
    const clusters = await prisma.noteCluster.findMany({
      where: { userId },
      orderBy: { clusterId: 'asc' },
    })
    if (clusters.length === 0) return null

    const stale = await this.shouldRecalculate(userId)

    let lastCalculated: Date | null = null
    for (const cluster of clusters) {
      const calculatedAt: Date | null = cluster.lastCalculated
      if (calculatedAt && (!lastCalculated || calculatedAt > lastCalculated)) {
        lastCalculated = calculatedAt
      }
    }

    // One query for all memberships instead of one query per cluster.
    const members = await prisma.clusterMember.findMany({
      where: { userId },
      select: { noteId: true, clusterId: true },
    })
    const noteIdsByCluster = new Map<number, string[]>()
    for (const member of members) {
      const noteIds = noteIdsByCluster.get(member.clusterId)
      if (noteIds) {
        noteIds.push(member.noteId)
      } else {
        noteIdsByCluster.set(member.clusterId, [member.noteId])
      }
    }

    const result: ClusterResult[] = []
    for (const cluster of clusters) {
      result.push({
        clusterId: cluster.clusterId,
        noteIds: noteIdsByCluster.get(cluster.clusterId) ?? [],
        name: cluster.name || undefined,
      })
    }
    return { clusters: result, stale, lastCalculated }
  }

  /** @deprecated Prefer getStoredClusters — it no longer hides stale results. */
  async getCachedClusters(userId: string): Promise<ClusterResult[] | null> {
    const stored = await this.getStoredClusters(userId)
    if (!stored || stored.stale) return null
    return stored.clusters
  }
}
/** Module-level instance of ClusteringService exported for direct use by importers. */
export const clusteringService = new ClusteringService()