744 lines
25 KiB
TypeScript
744 lines
25 KiB
TypeScript
/**
 * Clustering Service
 *
 * Density-based clustering algorithm (DBSCAN variant) for note embeddings.
 * Groups semantically similar notes into clusters without requiring
 * a preset number of clusters.
 *
 * Algorithm:
 * 1. For each note, find neighbors within epsilon cosine distance
 * 2. Form clusters from dense regions (min_cluster_size)
 * 3. Mark outliers as noise (cluster_id = -1)
 */
|
|
|
|
import prisma from '@/lib/prisma'
|
|
import { embeddingService } from './embedding.service'
|
|
import { getChatProvider } from '@/lib/ai/factory'
|
|
import { getSystemConfig } from '@/lib/config'
|
|
import { upsertNoteEmbedding } from '@/lib/embeddings'
|
|
|
|
/**
 * One cluster produced by the clustering run.
 */
export interface ClusterResult {
  /** Zero-based cluster identifier (noise is represented separately as -1). */
  clusterId: number
  /** Ids of the notes assigned to this cluster. */
  noteIds: string[]
  /** Optional centroid vector of the cluster. */
  centroid?: number[]
  /** Optional human-readable cluster name. */
  name?: string
}
|
|
|
|
/**
 * Membership record linking a single note to the cluster it was assigned to.
 */
export interface ClusteredNote {
  /** Id of the clustered note. */
  noteId: string
  /** Id of the cluster the note belongs to. */
  clusterId: number
  /** Average cosine similarity between this note and the other cluster members. */
  membershipScore: number
  /** True when the note's membership score is at or above its cluster's mean score. */
  isCentral: boolean
}
|
|
|
|
/**
 * Tuning knobs accepted by `ClusteringService.clusterNotes`.
 */
export interface ClusteringOptions {
  /** Minimum number of neighbours a note needs to be treated as a core point. */
  minClusterSize?: number
  /** Cosine distance threshold (lower = more strict). */
  epsilon?: number
  /** Upper bound on the number of clusters that may be produced. */
  maxClusters?: number
  /** Internal use: avoids a retry loop. */
  _relaxedRetry?: boolean
}
|
|
|
|
/**
 * Density-based (DBSCAN-style) clustering of a user's note embeddings.
 *
 * `clusterNotes` loads every embedding once and does all distance work in
 * memory. The SQL-backed private helpers (`getCosineSimilarity`,
 * `findNeighbors`, `expandCluster`, `calculateMembershipScore`,
 * `isCentralNote`) are kept but are not used by the in-memory path.
 */
export class ClusteringService {
  /** Default core-point neighbour count for the SQL-backed helpers. */
  private readonly DEFAULT_MIN_CLUSTER_SIZE = 3
  /** Default cosine-distance radius (distance = 1 - similarity) for the SQL-backed helpers. */
  private readonly DEFAULT_EPSILON = 0.3
  /** Default cap on the number of clusters returned by `clusterNotes`. */
  private readonly DEFAULT_MAX_CLUSTERS = 50
  /** Below this many usable embeddings, `clusterNotes` reports everything as noise. */
  private readonly MIN_NOTES_FOR_CLUSTERING = 10
  /** Cosine-distance ceiling (similarity >= 78%) for pairing two leftover noise notes. */
  private readonly MICRO_PAIR_MAX_DISTANCE = 0.22

  /**
   * Parameter profiles tried in order by `clusterNotes` when the caller gives
   * neither `epsilon` nor `minClusterSize`. The first profile is also the
   * fallback for whichever of the two options is missing.
   */
  private static readonly SEARCH_PROFILES: ReadonlyArray<{ eps: number; minSize: number }> = [
    { eps: 0.28, minSize: 2 }, // primary profile
    { eps: 0.25, minSize: 2 }, // slightly stricter
    { eps: 0.30, minSize: 2 }, // slightly looser
    { eps: 0.22, minSize: 2 }, // highly strict
    { eps: 0.18, minSize: 2 }, // extremely strict
    { eps: 0.25, minSize: 1 }, // captures tight pairs of notes
    { eps: 0.22, minSize: 1 }, // stricter capture of tight pairs
    { eps: 0.28, minSize: 3 }, // clusters need at least 3 neighbours
    { eps: 0.25, minSize: 3 }, // strict, at least 3 neighbours
    { eps: 0.32, minSize: 2 }, // looser
    { eps: 0.35, minSize: 2 }, // very loose
  ]

  /**
   * Generates missing or outdated embeddings (required for semantic clustering).
   *
   * A note is (re-)embedded when `options.force` is set, when it has no
   * embedding, when it was updated after its embedding was created, or when it
   * has a non-blank `sourceUrl`. Notes with blank content are skipped.
   * A failure on one note does not stop the others.
   *
   * @param userId  Owner of the notes.
   * @param options `force: true` re-embeds every note with content.
   * @returns `created` = embeddings written by this call; `total` = embeddings
   *          stored for the user's non-trashed notes.
   */
  async ensureEmbeddings(
    userId: string,
    options?: { force?: boolean },
  ): Promise<{ created: number; total: number }> {
    const notes = await prisma.note.findMany({
      where: {
        userId,
        isArchived: false,
        trashedAt: null,
      },
      select: {
        id: true,
        title: true,
        content: true,
        sourceUrl: true,
        updatedAt: true,
        noteEmbedding: { select: { noteId: true, createdAt: true } },
      },
    })

    let created = 0
    let failed = 0
    let firstError: unknown

    for (const note of notes) {
      if (!note.content?.trim()) continue

      const isClip = Boolean(note.sourceUrl?.trim())
      const missing = !note.noteEmbedding
      const isModified =
        note.noteEmbedding != null && note.updatedAt > note.noteEmbedding.createdAt
      if (!options?.force && !missing && !isModified && !isClip) continue

      try {
        const { embedding } = await embeddingService.generateNoteEmbedding(
          note.title,
          note.content,
        )
        if (embedding?.length) {
          await upsertNoteEmbedding(note.id, embedding)
          created++
        }
      } catch (error: unknown) {
        // Best effort: skip this note and keep going (e.g. AI provider unavailable).
        if (failed === 0) firstError = error
        failed++
      }
    }

    if (failed > 0) {
      // One summary line instead of one log entry per failed note.
      console.warn(
        `[Clustering] Embedding generation failed for ${failed} note(s); first error:`,
        firstError,
      )
    }

    const totalRow = await prisma.$queryRawUnsafe<Array<{ count: bigint }>>(
      `SELECT COUNT(*) AS "count" FROM "NoteEmbedding" ne
INNER JOIN "Note" n ON n.id = ne."noteId"
WHERE n."userId" = $1 AND n."trashedAt" IS NULL AND ne."embedding" IS NOT NULL`,
      userId,
    )

    return { created, total: Number(totalRow[0]?.count ?? 0) }
  }

  /**
   * Cosine similarity between two stored note embeddings, computed by pgvector
   * as `1 - cosine_distance`.
   *
   * @returns The similarity, or 0 when the query yields no row.
   */
  private async getCosineSimilarity(noteIdA: string, noteIdB: string): Promise<number> {
    const result = await prisma.$queryRawUnsafe<Array<{ similarity: number }>>(
      `SELECT 1 - (e1."embedding"::vector <=> e2."embedding"::vector) AS similarity
FROM "NoteEmbedding" e1, "NoteEmbedding" e2
WHERE e1."noteId" = $1 AND e2."noteId" = $2`,
      noteIdA,
      noteIdB,
    )
    return result[0]?.similarity ?? 0
  }

  /**
   * SQL-backed neighbour lookup: ids among `allNoteIds` whose embedding lies
   * within `epsilon` cosine distance of `noteId`'s embedding.
   *
   * `epsilon` is a cosine DISTANCE threshold, the same meaning as
   * `ClusteringOptions.epsilon` and the in-memory lookup, so it is passed to
   * the query as is.
   */
  private async findNeighbors(
    noteId: string,
    allNoteIds: string[],
    epsilon: number = this.DEFAULT_EPSILON,
  ): Promise<string[]> {
    const result = await prisma.$queryRawUnsafe<Array<{ noteId: string }>>(
      `SELECT e2."noteId"
FROM "NoteEmbedding" e1
CROSS JOIN "NoteEmbedding" e2
WHERE e1."noteId" = $1
AND e2."noteId" != $1
AND e2."noteId" = ANY($2::text[])
AND (e1."embedding"::vector <=> e2."embedding"::vector) <= $3`,
      noteId,
      allNoteIds,
      epsilon,
    )

    return result.map((row) => row.noteId)
  }

  /**
   * SQL-backed DBSCAN expansion from a core note.
   *
   * Mirrors the in-memory expansion: the seed's direct neighbours join the
   * cluster immediately, and notes previously marked as noise (-1) can be
   * reclaimed as members.
   *
   * @param noteId         Seed (core) note.
   * @param neighbors      Neighbours of the seed.
   * @param clusterId      Id assigned to every member.
   * @param visited        Notes whose neighbourhood was already examined (mutated).
   * @param clustered      noteId -> clusterId assignment, -1 = noise (mutated).
   * @param allNoteIds     Candidate notes for neighbour lookups.
   * @param epsilon        Cosine-distance radius.
   * @param minClusterSize Neighbour count needed to be a core point.
   * @returns Members of the expanded cluster, seed first.
   */
  private async expandCluster(
    noteId: string,
    neighbors: string[],
    clusterId: number,
    visited: Set<string>,
    clustered: Map<string, number>,
    allNoteIds: string[],
    epsilon: number = this.DEFAULT_EPSILON,
    minClusterSize: number = this.DEFAULT_MIN_CLUSTER_SIZE,
  ): Promise<string[]> {
    const clusterMembers: string[] = [noteId]
    const queue = [...neighbors]
    clustered.set(noteId, clusterId)

    // Adds a note to the cluster when it is unassigned or was marked as noise.
    const claim = (id: string): boolean => {
      const status = clustered.get(id)
      if (status !== undefined && status !== -1) return false
      clustered.set(id, clusterId)
      clusterMembers.push(id)
      return true
    }

    for (const neighborId of neighbors) claim(neighborId)

    // Index-based queue: avoids the O(n) cost of Array.prototype.shift.
    for (let head = 0; head < queue.length; head++) {
      const currentNoteId = queue[head]
      if (currentNoteId === undefined || visited.has(currentNoteId)) continue
      visited.add(currentNoteId)

      const currentNeighbors = await this.findNeighbors(currentNoteId, allNoteIds, epsilon)
      if (currentNeighbors.length < minClusterSize) continue // border point

      for (const neighborId of currentNeighbors) {
        if (claim(neighborId)) queue.push(neighborId)
      }
    }

    return clusterMembers
  }

  /**
   * Cosine similarity of two in-memory vectors.
   *
   * @returns `dot(a, b) / (|a| * |b|)`, or 0 when the vectors differ in
   *          length, either has zero norm, or the result is not finite.
   */
  private calculateCosineSimilarityInMemory(vecA: number[], vecB: number[]): number {
    // Vectors of different dimension are not comparable. Without this guard the
    // loop reads past the shorter vector (NaN) or ignores the longer one's tail.
    if (vecA.length !== vecB.length) return 0

    let dotProduct = 0.0
    let normA = 0.0
    let normB = 0.0
    for (let i = 0; i < vecA.length; i++) {
      const a = vecA[i] ?? 0
      const b = vecB[i] ?? 0
      dotProduct += a * b
      normA += a * a
      normB += b * b
    }
    if (normA === 0 || normB === 0) return 0

    const similarity = dotProduct / (Math.sqrt(normA) * Math.sqrt(normB))
    return Number.isFinite(similarity) ? similarity : 0
  }

  /** Parses a pgvector text value (`[0.1,0.2,...]`); null unless it is a non-empty array of finite numbers. */
  private parseEmbedding(raw: string): number[] | null {
    let parsed: unknown
    try {
      parsed = JSON.parse(raw)
    } catch (e: unknown) {
      console.error("Error parsing embedding vector:", e)
      return null
    }
    if (!Array.isArray(parsed) || parsed.length === 0) return null
    const allFinite = parsed.every((v) => typeof v === 'number' && Number.isFinite(v))
    return allFinite ? (parsed as number[]) : null
  }

  /** Loads every embedding of the user's non-trashed notes in one query, keyed by note id. */
  private async loadEmbeddings(userId: string): Promise<Map<string, number[]>> {
    const rows = await prisma.$queryRawUnsafe<Array<{ noteId: string; embedding: string }>>(
      `SELECT ne."noteId", ne."embedding"::text AS "embedding"
FROM "NoteEmbedding" ne
INNER JOIN "Note" n ON n.id = ne."noteId"
WHERE n."userId" = $1
AND n."trashedAt" IS NULL
AND ne."embedding" IS NOT NULL`,
      userId,
    )

    const embeddingMap = new Map<string, number[]>()
    for (const row of rows) {
      if (!row.embedding) continue
      const vector = this.parseEmbedding(row.embedding)
      if (vector) embeddingMap.set(row.noteId, vector)
    }
    return embeddingMap
  }

  /** Ids of all other notes whose cosine distance to `noteId` is <= `epsilon`. */
  private findNeighborsInMemory(
    noteId: string,
    embeddingMap: ReadonlyMap<string, number[]>,
    epsilon: number,
  ): string[] {
    const origin = embeddingMap.get(noteId)
    if (!origin) return []

    const neighbors: string[] = []
    for (const [otherId, vector] of embeddingMap) {
      if (otherId === noteId) continue
      const distance = 1 - this.calculateCosineSimilarityInMemory(origin, vector)
      if (distance <= epsilon) neighbors.push(otherId)
    }
    return neighbors
  }

  /**
   * Runs one DBSCAN pass over the in-memory embeddings.
   *
   * Cluster ids are assigned contiguously from 0, so `clusters[i].clusterId === i`.
   * A candidate cluster is dissolved back into noise when it has fewer than
   * `minSize` members or when `maxClusters` clusters already exist.
   */
  private runDbscan(
    embeddingMap: ReadonlyMap<string, number[]>,
    allNoteIds: readonly string[],
    eps: number,
    minSize: number,
    maxClusters: number,
  ): { clusters: ClusterResult[]; assignment: Map<string, number> } {
    const visited = new Set<string>()
    const assignment = new Map<string, number>() // noteId -> clusterId, -1 = noise
    const clusters: ClusterResult[] = []
    let currentClusterId = 0

    // Adds a note to the cluster being grown when it is unassigned or noise.
    const claim = (id: string, members: string[]): boolean => {
      const status = assignment.get(id)
      if (status !== undefined && status !== -1) return false
      assignment.set(id, currentClusterId)
      members.push(id)
      return true
    }

    for (const seedId of allNoteIds) {
      if (visited.has(seedId)) continue
      visited.add(seedId)

      const seedNeighbors = this.findNeighborsInMemory(seedId, embeddingMap, eps)
      if (seedNeighbors.length < minSize) {
        assignment.set(seedId, -1) // provisional noise; a later cluster may reclaim it
        continue
      }

      // The seed is a core point: start a cluster and pull in its neighbours.
      assignment.set(seedId, currentClusterId)
      const members: string[] = [seedId]
      const queue = [...seedNeighbors]
      for (const neighborId of seedNeighbors) claim(neighborId, members)

      // Breadth-first expansion through core points only.
      for (let head = 0; head < queue.length; head++) {
        const currentId = queue[head]
        if (currentId === undefined || visited.has(currentId)) continue
        visited.add(currentId)

        const currentNeighbors = this.findNeighborsInMemory(currentId, embeddingMap, eps)
        if (currentNeighbors.length < minSize) continue // border point: member, no expansion

        for (const neighborId of currentNeighbors) {
          if (claim(neighborId, members)) queue.push(neighborId)
        }
      }

      if (members.length >= minSize && currentClusterId < maxClusters) {
        clusters.push({ clusterId: currentClusterId, noteIds: members })
        currentClusterId++
      } else {
        for (const memberId of members) assignment.set(memberId, -1)
      }
    }

    return { clusters, assignment }
  }

  /**
   * Groups leftover noise notes into two-note micro-clusters.
   *
   * Each still-unpaired noise note is paired with its closest unpaired noise
   * note, provided their cosine distance is <= `MICRO_PAIR_MAX_DISTANCE`.
   * Mutates `clusters` and `assignment`. Stops once `maxClusters` is reached.
   */
  private pairUpNoiseNotes(
    embeddingMap: ReadonlyMap<string, number[]>,
    allNoteIds: readonly string[],
    clusters: ClusterResult[],
    assignment: Map<string, number>,
    maxClusters: number,
  ): void {
    const noiseNoteIds = allNoteIds.filter((id) => assignment.get(id) === -1)
    const paired = new Set<string>()

    for (const idA of noiseNoteIds) {
      if (clusters.length >= maxClusters) break // no room for another cluster
      if (paired.has(idA)) continue
      const vecA = embeddingMap.get(idA)
      if (!vecA) continue

      let bestPairId: string | null = null
      let bestPairDist = 1.0

      for (const idB of noiseNoteIds) {
        if (idA === idB || paired.has(idB)) continue
        const vecB = embeddingMap.get(idB)
        if (!vecB) continue

        const distance = 1 - this.calculateCosineSimilarityInMemory(vecA, vecB)
        if (distance <= this.MICRO_PAIR_MAX_DISTANCE && distance < bestPairDist) {
          bestPairDist = distance
          bestPairId = idB
        }
      }

      if (bestPairId === null) continue

      const newCid = clusters.length
      clusters.push({ clusterId: newCid, noteIds: [idA, bestPairId] })
      assignment.set(idA, newCid)
      assignment.set(bestPairId, newCid)
      paired.add(idA)
      paired.add(bestPairId)
      console.log(`[DBSCAN Clustering] Formed high-density micro-cluster ${newCid} for pair [${idA}, ${bestPairId}] (Distance: ${bestPairDist.toFixed(4)})`)
    }
  }

  /** Average in-memory cosine similarity of `noteId` to the other members; 1.0 when there is nothing to compare with. */
  private membershipScoreInMemory(
    noteId: string,
    memberIds: readonly string[],
    embeddingMap: ReadonlyMap<string, number[]>,
  ): number {
    if (memberIds.length <= 1) return 1.0
    const own = embeddingMap.get(noteId)
    if (!own) return 0.0

    let totalSim = 0.0
    let count = 0
    for (const memberId of memberIds) {
      if (memberId === noteId) continue
      const other = embeddingMap.get(memberId)
      if (!other) continue
      totalSim += this.calculateCosineSimilarityInMemory(own, other)
      count++
    }
    return count > 0 ? totalSim / count : 1.0
  }

  /**
   * Performs density-based clustering on the user's note embeddings.
   *
   * All embeddings are fetched with a single query and processed in memory.
   * When the caller gives neither `epsilon` nor `minClusterSize`, the search
   * profiles are tried in order. The first one producing 3 to 8 clusters with
   * no cluster holding more than 70% of the notes wins; otherwise the result
   * whose cluster count is closest to 5 is kept. Leftover noise notes that are
   * very close to each other are then paired into micro-clusters.
   *
   * @param userId  Owner of the notes.
   * @param options Optional overrides. Giving `epsilon` and/or `minClusterSize`
   *                bypasses the profile search (a missing one falls back to
   *                the first profile's value).
   * @returns Clusters, per-note membership records, and the number of notes
   *          left as noise. With fewer than `MIN_NOTES_FOR_CLUSTERING` usable
   *          embeddings, nothing is clustered and every note counts as noise.
   */
  async clusterNotes(
    userId: string,
    options: ClusteringOptions = {},
  ): Promise<{
    clusters: ClusterResult[]
    clusteredNotes: ClusteredNote[]
    noiseCount: number
  }> {
    const { maxClusters = this.DEFAULT_MAX_CLUSTERS } = options

    const embeddingMap = await this.loadEmbeddings(userId)
    const allNoteIds = Array.from(embeddingMap.keys())

    if (allNoteIds.length < this.MIN_NOTES_FOR_CLUSTERING) {
      return { clusters: [], clusteredNotes: [], noiseCount: allNoteIds.length }
    }

    const profiles = ClusteringService.SEARCH_PROFILES
    const primary = profiles[0] ?? { eps: 0.28, minSize: 2 }
    const hasExplicitParams =
      options.epsilon !== undefined || options.minClusterSize !== undefined
    const configsToRun: ReadonlyArray<{ eps: number; minSize: number }> = hasExplicitParams
      ? [{ eps: options.epsilon ?? primary.eps, minSize: options.minClusterSize ?? primary.minSize }]
      : profiles

    let bestClusters: ClusterResult[] = []
    let bestClustered = new Map<string, number>()
    let bestNoiseCount = allNoteIds.length
    let bestConfig = configsToRun[0] ?? primary

    for (const config of configsToRun) {
      const { clusters, assignment } = this.runDbscan(
        embeddingMap,
        allNoteIds,
        config.eps,
        config.minSize,
        maxClusters,
      )

      let noiseCount = 0
      for (const cid of assignment.values()) {
        if (cid === -1) noiseCount++
      }

      const numClusters = clusters.length
      const largestClusterSize = clusters.reduce((max, c) => Math.max(max, c.noteIds.length), 0)
      // A "giant" cluster absorbs more than 70% of the notes.
      const hasGiantCluster = largestClusterSize > allNoteIds.length * 0.70

      const isOptimal = numClusters >= 3 && numClusters <= 8 && !hasGiantCluster
      const isBetterFallback =
        bestClusters.length === 0 ||
        Math.abs(numClusters - 5) < Math.abs(bestClusters.length - 5) ||
        (bestClusters.length === 1 && numClusters > 1)

      if (isOptimal || isBetterFallback) {
        bestClusters = clusters
        bestClustered = assignment
        bestNoiseCount = noiseCount
        bestConfig = config
      }
      if (isOptimal) break // good enough: stop searching
    }

    console.log(`[DBSCAN Clustering] Selected configuration: epsilon=${bestConfig.eps}, minSize=${bestConfig.minSize} -> Generated ${bestClusters.length} clusters (Noise: ${bestNoiseCount})`)

    // Pair up isolated, highly similar noise notes into micro-clusters.
    this.pairUpNoiseNotes(embeddingMap, allNoteIds, bestClusters, bestClustered, maxClusters)

    // Recount noise now that some noise notes may have been paired.
    let finalNoiseCount = 0
    for (const cid of bestClustered.values()) {
      if (cid === -1) finalNoiseCount++
    }

    // Membership scores, grouped per cluster to find the central notes afterwards.
    const clusteredNotes: ClusteredNote[] = []
    const membersByCluster = new Map<number, ClusteredNote[]>()
    for (const [noteId, cid] of bestClustered) {
      if (cid === -1) continue // skip noise

      const cluster = bestClusters[cid] // valid because clusterId === array index
      if (!cluster) continue

      const entry: ClusteredNote = {
        noteId,
        clusterId: cid,
        membershipScore: this.membershipScoreInMemory(noteId, cluster.noteIds, embeddingMap),
        isCentral: false, // decided below
      }
      clusteredNotes.push(entry)

      const group = membersByCluster.get(cid)
      if (group) group.push(entry)
      else membersByCluster.set(cid, [entry])
    }

    // A note is central when its score is at or above its cluster's mean score.
    for (const group of membersByCluster.values()) {
      const meanScore = group.reduce((sum, cn) => sum + cn.membershipScore, 0) / group.length
      for (const cn of group) {
        cn.isCentral = cn.membershipScore >= meanScore
      }
    }

    return {
      clusters: bestClusters,
      clusteredNotes,
      noiseCount: finalNoiseCount,
    }
  }

  /**
   * SQL-backed membership score: average pgvector cosine similarity between
   * `noteId` and every other id in `clusterMemberIds` (one query per pair).
   *
   * @returns The average, or 1.0 when there is no other member.
   */
  private async calculateMembershipScore(noteId: string, clusterMemberIds: string[]): Promise<number> {
    if (clusterMemberIds.length <= 1) return 1.0

    let total = 0
    let count = 0
    for (const memberId of clusterMemberIds) {
      if (memberId === noteId) continue
      total += await this.getCosineSimilarity(noteId, memberId)
      count++
    }

    return count > 0 ? total / count : 1.0
  }

  /**
   * SQL-backed centrality test: a note is central when its membership score is
   * at or above the mean membership score of its cluster.
   *
   * @returns false for an empty member list.
   */
  private async isCentralNote(noteId: string, clusterMemberIds: string[]): Promise<boolean> {
    if (clusterMemberIds.length === 0) return false

    const scores = new Map<string, number>()
    let total = 0
    for (const memberId of clusterMemberIds) {
      const score = await this.calculateMembershipScore(memberId, clusterMemberIds)
      scores.set(memberId, score)
      total += score
    }

    const meanScore = total / clusterMemberIds.length
    return (scores.get(noteId) ?? 0) >= meanScore
  }

  /**
   * Returns up to `n` central notes of a stored cluster, highest membership
   * score first, for naming purposes.
   *
   * @param clusterId Stored cluster id.
   * @param userId    Owner of the cluster.
   * @param n         Maximum number of notes (default 5).
   */
  async getCentralNotes(clusterId: number, userId: string, n: number = 5): Promise<Array<{ noteId: string; title: string | null; content: string }>> {
    // LIMIT needs a non-negative integer.
    const limit = Number.isFinite(n) ? Math.max(0, Math.trunc(n)) : 5

    // GROUP BY replaces the former DISTINCT so rows can be ordered by score;
    // without an ORDER BY the LIMIT would pick arbitrary central notes.
    // NOTE(review): assumes the column is named "membershipScore" like the Prisma field — confirm.
    return prisma.$queryRawUnsafe<Array<{ noteId: string; title: string | null; content: string }>>(
      `SELECT n.id AS "noteId", n.title, n.content
FROM "ClusterMember" cm
INNER JOIN "Note" n ON n.id = cm."noteId"
WHERE cm."clusterId" = $1
AND cm."userId" = $2
AND cm."isCentral" = true
GROUP BY n.id, n.title, n.content
ORDER BY MAX(cm."membershipScore") DESC
LIMIT $3`,
      clusterId,
      userId,
      limit,
    )
  }

  /**
   * Replaces the user's stored clustering with `results`, in one transaction.
   *
   * Existing `ClusterMember` and `NoteCluster` rows of the user are deleted,
   * then the new rows are bulk-inserted with a single shared timestamp.
   */
  async saveClusteringResults(
    userId: string,
    results: { clusters: ClusterResult[]; clusteredNotes: ClusteredNote[] },
  ): Promise<void> {
    const calculatedAt = new Date()

    await prisma.$transaction(async (tx) => {
      // Clear existing clusters for this user.
      await tx.$executeRawUnsafe(`DELETE FROM "ClusterMember" WHERE "userId" = $1`, userId)
      await tx.$executeRawUnsafe(`DELETE FROM "NoteCluster" WHERE "userId" = $1`, userId)

      // Bulk inserts keep the transaction short; row-by-row creates scale with
      // the number of notes.
      if (results.clusters.length > 0) {
        await tx.noteCluster.createMany({
          data: results.clusters.map((cluster) => ({
            userId,
            clusterId: cluster.clusterId,
            name: cluster.name,
            noteCount: cluster.noteIds.length,
            lastCalculated: calculatedAt,
          })),
        })
      }

      if (results.clusteredNotes.length > 0) {
        await tx.clusterMember.createMany({
          data: results.clusteredNotes.map((member) => ({
            userId,
            noteId: member.noteId,
            clusterId: member.clusterId,
            membershipScore: member.membershipScore,
            isCentral: member.isCentral,
          })),
        })
      }
    })
  }

  /**
   * Asks the chat provider for a short theme name for a stored cluster, based
   * on up to 5 of its central notes.
   *
   * @returns The generated name (at most 50 characters), or `Cluster <id>`
   *          when the cluster has no central notes, the provider fails, or the
   *          answer is empty.
   */
  async generateClusterName(clusterId: number, userId: string): Promise<string> {
    const fallbackName = `Cluster ${clusterId}`
    const centralNotes = await this.getCentralNotes(clusterId, userId, 5)

    if (centralNotes.length === 0) {
      return fallbackName
    }

    const notesText = centralNotes
      .map((note, i) => `${i + 1}. "${note.title || 'Untitled'}" - ${(note.content ?? '').slice(0, 100)}...`)
      .join('\n')

    const systemPrompt = "Vous êtes un assistant d'analyse sémantique. Analysez les notes fournies et dégagez un thème commun clair, élégant et évocateur (2 à 4 mots maximum), écrit en français (ou dans la langue principale des notes). Ne donnez QUE le titre thématique final, sans ponctuation, sans guillemets, et sans aucune explication."

    // State the real number of notes; fewer than 5 may be available.
    const userPrompt = `Voici ${centralNotes.length} note(s) centrale(s) appartenant au même groupe thématique. Quel est leur thème commun ?\n\n${notesText}\n\nThème :`

    try {
      const config = await getSystemConfig()
      const provider = getChatProvider(config)
      const response = await provider.chat(
        [{ role: 'user', content: userPrompt }],
        systemPrompt,
      )
      const name = (response.text ?? '').trim().slice(0, 50)
      // An empty answer must not become an empty cluster name.
      return name || fallbackName
    } catch (error: unknown) {
      console.warn(`[Clustering] Could not generate a name for cluster ${clusterId}:`, error)
      return fallbackName
    }
  }

  /**
   * Tells whether the stored clustering should be recomputed.
   *
   * @returns true when nothing is stored yet, when the stored clustering has
   *          no calculation date, or when more than 5% of the user's
   *          non-trashed note count was modified since the last calculation;
   *          false when the user has no non-trashed notes.
   */
  async shouldRecalculate(userId: string): Promise<boolean> {
    const lastCluster = await prisma.noteCluster.findFirst({
      where: { userId },
      orderBy: { lastCalculated: 'desc' },
    })

    // No stored run, or no date to compare against: recompute.
    if (!lastCluster?.lastCalculated) return true
    const since = lastCluster.lastCalculated

    const [modifiedCount, totalNotes] = await Promise.all([
      // Notes modified since the last calculation.
      prisma.note.count({
        where: {
          userId,
          OR: [
            { updatedAt: { gt: since } },
            { contentUpdatedAt: { gt: since } },
          ],
        },
      }),
      prisma.note.count({
        where: { userId, trashedAt: null },
      }),
    ])

    if (totalNotes === 0) return false

    return modifiedCount / totalNotes > 0.05 // more than 5% changed
  }

  /**
   * Loads the clusters stored in the database, even when they are stale.
   *
   * @returns null when nothing is stored; otherwise the clusters (ordered by
   *          cluster id), whether they are stale, and the most recent
   *          calculation date.
   */
  async getStoredClusters(userId: string): Promise<{
    clusters: ClusterResult[]
    stale: boolean
    lastCalculated: Date | null
  } | null> {
    const clusters = await prisma.noteCluster.findMany({
      where: { userId },
      orderBy: { clusterId: 'asc' },
    })

    if (clusters.length === 0) return null

    // One query for all members instead of one query per cluster.
    const [stale, members] = await Promise.all([
      this.shouldRecalculate(userId),
      prisma.clusterMember.findMany({
        where: { userId },
        select: { clusterId: true, noteId: true },
      }),
    ])

    const noteIdsByCluster = new Map<number, string[]>()
    for (const member of members) {
      const ids = noteIdsByCluster.get(member.clusterId)
      if (ids) ids.push(member.noteId)
      else noteIdsByCluster.set(member.clusterId, [member.noteId])
    }

    let lastCalculated: Date | null = null
    const result: ClusterResult[] = []
    for (const cluster of clusters) {
      if (cluster.lastCalculated && (!lastCalculated || cluster.lastCalculated > lastCalculated)) {
        lastCalculated = cluster.lastCalculated
      }
      result.push({
        clusterId: cluster.clusterId,
        noteIds: noteIdsByCluster.get(cluster.clusterId) ?? [],
        name: cluster.name || undefined,
      })
    }

    return { clusters: result, stale, lastCalculated }
  }

  /** @deprecated Prefer getStoredClusters, which no longer hides stale results. */
  async getCachedClusters(userId: string): Promise<ClusterResult[] | null> {
    const stored = await this.getStoredClusters(userId)
    if (!stored || stored.stale) return null
    return stored.clusters
  }
}

/** Shared service instance. */
export const clusteringService = new ClusteringService()
|