feat: migrate semantic search to pgvector + full-text search
All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 2m12s
All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 2m12s
Replace JSON-string embeddings with native pgvector(1536) storage and add PostgreSQL full-text search (tsvector/GIN) with Reciprocal Rank Fusion for hybrid keyword + semantic ranking.

Changes:
- NoteEmbedding.embedding: String → vector(1536) via pgvector
- NoteEmbedding: added updatedAt for reindex tracking
- Note: added tsv (tsvector) with auto-update trigger for FTS
- semantic-search.service: hybrid FTS + vector search with RRF fusion
- embedding.service: toVectorString() for pgvector SQL literals
- Removed JS-side cosine similarity loops (now DB-side via <=>)
- Added HNSW index on NoteEmbedding.embedding (cosine distance)
- Added GIN index on Note.tsv for FTS queries

Schema migration in: prisma/migrations/20260512120000_pgvector_and_fts_search/

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -5,9 +5,10 @@ import prisma from '@/lib/prisma'
|
||||
import { Note, CheckItem, NoteType } from '@/lib/types'
|
||||
import { auth } from '@/auth'
|
||||
import { getAIProvider } from '@/lib/ai/factory'
|
||||
import { parseNote as parseNoteUtil, cosineSimilarity, calculateRRFK, detectQueryType, getSearchWeights } from '@/lib/utils'
|
||||
import { parseNote as parseNoteUtil } from '@/lib/utils'
|
||||
import { getSystemConfig, getConfigNumber, getConfigBoolean, SEARCH_DEFAULTS } from '@/lib/config'
|
||||
import { contextualAutoTagService } from '@/lib/ai/services/contextual-auto-tag.service'
|
||||
import { semanticSearchService } from '@/lib/ai/services/semantic-search.service'
|
||||
import { cleanupNoteImages, parseImageUrls, deleteImageFileSafely } from '@/lib/image-cleanup'
|
||||
import { getAISettings } from '@/app/actions/ai-settings'
|
||||
import {
|
||||
@@ -486,122 +487,54 @@ export async function enableNoteHistory(noteId: string) {
|
||||
})
|
||||
}
|
||||
|
||||
// Search notes - DB-side filtering (fast) with optional semantic search
|
||||
// Supports contextual search within notebook (IA5)
|
||||
export async function searchNotes(query: string, useSemantic: boolean = false, notebookId?: string) {
|
||||
// Unified hybrid search — always uses FTS + pgvector with RRF fusion.
|
||||
// Supports contextual search within notebook (IA5).
|
||||
export async function searchNotes(query: string, _useSemantic: boolean = true, notebookId?: string) {
|
||||
const session = await auth();
|
||||
if (!session?.user?.id) return [];
|
||||
|
||||
try {
|
||||
// If query empty, return all notes
|
||||
if (!query || !query.trim()) {
|
||||
return await getAllNotes();
|
||||
}
|
||||
|
||||
// If semantic search is requested, use the full implementation
|
||||
if (useSemantic) {
|
||||
return await semanticSearch(query, session.user.id, notebookId);
|
||||
}
|
||||
const results = await semanticSearchService.searchAsUser(session.user.id, query, {
|
||||
limit: 50,
|
||||
threshold: 0.25,
|
||||
notebookId
|
||||
});
|
||||
|
||||
// DB-side keyword search using LIKE — much faster than loading all notes in memory
|
||||
const noteIds = results.map(r => r.noteId);
|
||||
const notes = await prisma.note.findMany({
|
||||
where: {
|
||||
id: { in: noteIds },
|
||||
userId: session.user.id,
|
||||
isArchived: false,
|
||||
trashedAt: null,
|
||||
OR: [
|
||||
{ title: { contains: query } },
|
||||
{ content: { contains: query } },
|
||||
{ labels: { contains: query } },
|
||||
],
|
||||
},
|
||||
select: NOTE_LIST_SELECT,
|
||||
orderBy: [
|
||||
{ isPinned: 'desc' },
|
||||
{ order: 'asc' },
|
||||
{ updatedAt: 'desc' }
|
||||
]
|
||||
});
|
||||
|
||||
return notes.map(parseNote);
|
||||
// Index the search hits by note id, remembering each hit's position in the
// ranked result list so the relevance order can be restored below.
const hitsById = new Map(results.map((r, i) => [r.noteId, { rank: i, hit: r }]));
const parsed = notes.map(parseNote);

// The notes come back from a separate query, so re-sort them into the order
// the search service ranked them. Ids without a hit sort last.
parsed.sort(
  (a, b) =>
    (hitsById.get(a.id)?.rank ?? Number.MAX_SAFE_INTEGER) -
    (hitsById.get(b.id)?.rank ?? Number.MAX_SAFE_INTEGER)
);

// Attach match metadata to every note by looking up its own hit. Copying
// results[0] onto parsed[0] is wrong whenever the top-ranked hit was dropped
// by the note query's filters, and it leaves all other notes unannotated.
for (const note of parsed) {
  const entry = hitsById.get(note.id);
  if (entry) {
    note.matchType = entry.hit.matchType;
    note.searchScore = entry.hit.score;
  }
}

return parsed;
|
||||
} catch (error) {
|
||||
console.error('Search error:', error);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
// Semantic search with AI embeddings - SIMPLE VERSION
|
||||
// Supports contextual search within notebook (IA5)
|
||||
async function semanticSearch(query: string, userId: string, notebookId?: string) {
|
||||
const allNotes = await prisma.note.findMany({
|
||||
where: {
|
||||
userId: userId,
|
||||
isArchived: false,
|
||||
trashedAt: null,
|
||||
...(notebookId !== undefined ? { notebookId } : {})
|
||||
},
|
||||
include: { noteEmbedding: true }
|
||||
});
|
||||
|
||||
const queryLower = query.toLowerCase().trim();
|
||||
|
||||
// Get query embedding
|
||||
let queryEmbedding: number[] | null = null;
|
||||
try {
|
||||
const provider = getAIProvider(await getSystemConfig());
|
||||
queryEmbedding = await provider.getEmbeddings(query);
|
||||
} catch (e) {
|
||||
console.error('Failed to generate query embedding:', e);
|
||||
// Fallback to simple keyword search
|
||||
queryEmbedding = null;
|
||||
}
|
||||
|
||||
// Filter notes: keyword match OR semantic match (threshold 30%)
|
||||
const results = allNotes.map(note => {
|
||||
const title = (note.title || '').toLowerCase();
|
||||
const content = note.content.toLowerCase();
|
||||
const labels = note.labels ? JSON.parse(note.labels) : [];
|
||||
|
||||
// Keyword match
|
||||
const keywordMatch = title.includes(queryLower) ||
|
||||
content.includes(queryLower) ||
|
||||
labels.some((l: string) => l.toLowerCase().includes(queryLower));
|
||||
|
||||
// Semantic match (if embedding available)
|
||||
let semanticMatch = false;
|
||||
let similarity = 0;
|
||||
if (queryEmbedding && note.noteEmbedding?.embedding) {
|
||||
similarity = cosineSimilarity(queryEmbedding, JSON.parse(note.noteEmbedding.embedding));
|
||||
semanticMatch = similarity > 0.3; // 30% threshold - works well for related concepts
|
||||
}
|
||||
|
||||
return {
|
||||
note,
|
||||
keywordMatch,
|
||||
semanticMatch,
|
||||
similarity
|
||||
};
|
||||
}).filter(r => r.keywordMatch || r.semanticMatch);
|
||||
|
||||
// Parse and add match info
|
||||
return results.map(r => {
|
||||
const parsed = parseNote(r.note);
|
||||
|
||||
// Determine match type
|
||||
let matchType: 'exact' | 'related' | null = null;
|
||||
if (r.semanticMatch) {
|
||||
matchType = 'related';
|
||||
} else if (r.keywordMatch) {
|
||||
matchType = 'exact';
|
||||
}
|
||||
|
||||
return {
|
||||
...parsed,
|
||||
matchType
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
// Create a new note
|
||||
export async function createNote(data: {
|
||||
title?: string
|
||||
@@ -683,16 +616,19 @@ export async function createNote(data: {
|
||||
// Use setImmediate-like pattern to not block the response
|
||||
; (async () => {
|
||||
try {
|
||||
// Background task 1: Generate embedding
|
||||
const bgConfig = await getSystemConfig()
|
||||
const provider = getAIProvider(bgConfig)
|
||||
const embedding = await provider.getEmbeddings(content)
|
||||
if (embedding) {
|
||||
await prisma.noteEmbedding.upsert({
|
||||
where: { noteId: noteId },
|
||||
create: { noteId: noteId, embedding: JSON.stringify(embedding) },
|
||||
update: { embedding: JSON.stringify(embedding) }
|
||||
})
|
||||
const vecStr = `[${embedding.join(',')}]`
|
||||
await prisma.$executeRawUnsafe(
|
||||
`INSERT INTO "NoteEmbedding" ("id", "noteId", "embedding", "createdAt", "updatedAt")
|
||||
VALUES (gen_random_uuid(), $1, $2::vector, now(), now())
|
||||
ON CONFLICT ("noteId")
|
||||
DO UPDATE SET "embedding" = $2::vector, "updatedAt" = now()`,
|
||||
noteId,
|
||||
vecStr
|
||||
)
|
||||
}
|
||||
} catch (e) {
|
||||
console.error('[BG] Embedding generation failed:', e)
|
||||
@@ -815,7 +751,6 @@ export async function updateNote(id: string, data: {
|
||||
}
|
||||
}
|
||||
|
||||
// Generate embedding in background — don't block the update
|
||||
if (data.content !== undefined) {
|
||||
const noteId = id
|
||||
const content = data.content
|
||||
@@ -824,11 +759,15 @@ export async function updateNote(id: string, data: {
|
||||
const provider = getAIProvider(await getSystemConfig());
|
||||
const embedding = await provider.getEmbeddings(content);
|
||||
if (embedding) {
|
||||
await prisma.noteEmbedding.upsert({
|
||||
where: { noteId: noteId },
|
||||
create: { noteId: noteId, embedding: JSON.stringify(embedding) },
|
||||
update: { embedding: JSON.stringify(embedding) }
|
||||
})
|
||||
const vecStr = `[${embedding.join(',')}]`
|
||||
await prisma.$executeRawUnsafe(
|
||||
`INSERT INTO "NoteEmbedding" ("id", "noteId", "embedding", "createdAt", "updatedAt")
|
||||
VALUES (gen_random_uuid(), $1, $2::vector, now(), now())
|
||||
ON CONFLICT ("noteId")
|
||||
DO UPDATE SET "embedding" = $2::vector, "updatedAt" = now()`,
|
||||
noteId,
|
||||
vecStr
|
||||
)
|
||||
}
|
||||
} catch (e) {
|
||||
console.error('[BG] Embedding regeneration failed:', e);
|
||||
@@ -1409,11 +1348,15 @@ export async function syncAllEmbeddings() {
|
||||
try {
|
||||
const embedding = await provider.getEmbeddings(note.content);
|
||||
if (embedding) {
|
||||
await prisma.noteEmbedding.upsert({
|
||||
where: { noteId: note.id },
|
||||
create: { noteId: note.id, embedding: JSON.stringify(embedding) },
|
||||
update: { embedding: JSON.stringify(embedding) }
|
||||
})
|
||||
const vecStr = `[${embedding.join(',')}]`
|
||||
await prisma.$executeRawUnsafe(
|
||||
`INSERT INTO "NoteEmbedding" ("id", "noteId", "embedding", "createdAt", "updatedAt")
|
||||
VALUES (gen_random_uuid(), $1, $2::vector, now(), now())
|
||||
ON CONFLICT ("noteId")
|
||||
DO UPDATE SET "embedding" = $2::vector, "updatedAt" = now()`,
|
||||
note.id,
|
||||
vecStr
|
||||
)
|
||||
updatedCount++;
|
||||
}
|
||||
} catch (e) { }
|
||||
|
||||
@@ -23,7 +23,7 @@ export async function semanticSearch(
|
||||
try {
|
||||
const results = await semanticSearchService.search(query, {
|
||||
limit: options?.limit || 20,
|
||||
threshold: options?.threshold || 0.6,
|
||||
threshold: options?.threshold || 0.3,
|
||||
notebookId: options?.notebookId // NEW: Pass notebook filter
|
||||
})
|
||||
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
import { NextResponse } from 'next/server'
|
||||
import prisma from '@/lib/prisma'
|
||||
import { prisma } from '@/lib/prisma'
|
||||
import { auth } from '@/auth'
|
||||
import { validateEmbedding } from '@/lib/utils'
|
||||
|
||||
/**
|
||||
* Admin endpoint to validate all embeddings in the database
|
||||
* Returns a list of notes with invalid embeddings
|
||||
* Admin endpoint to validate all pgvector embeddings in the database.
|
||||
* Uses native SQL to check for valid vector format.
|
||||
*/
|
||||
export async function GET() {
|
||||
try {
|
||||
@@ -14,7 +13,6 @@ export async function GET() {
|
||||
return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
|
||||
}
|
||||
|
||||
// Check if user is admin
|
||||
const user = await prisma.user.findUnique({
|
||||
where: { id: session.user.id },
|
||||
select: { role: true }
|
||||
@@ -24,72 +22,34 @@ export async function GET() {
|
||||
return NextResponse.json({ error: 'Forbidden - Admin only' }, { status: 403 })
|
||||
}
|
||||
|
||||
// Fetch all notes with embeddings
|
||||
const allNotes = await prisma.note.findMany({
|
||||
select: {
|
||||
id: true,
|
||||
title: true,
|
||||
noteEmbedding: true
|
||||
}
|
||||
})
|
||||
const totalResult: Array<{ total: bigint }> = await prisma.$queryRawUnsafe(
|
||||
`SELECT COUNT(*)::bigint as total FROM "Note" WHERE "trashedAt" IS NULL`
|
||||
)
|
||||
const total = Number(totalResult[0]?.total ?? 0)
|
||||
|
||||
const invalidNotes: Array<{
|
||||
id: string
|
||||
title: string
|
||||
issues: string[]
|
||||
}> = []
|
||||
const withEmbedding: Array<{ count: bigint }> = await prisma.$queryRawUnsafe(
|
||||
`SELECT COUNT(*)::bigint as count FROM "NoteEmbedding"`
|
||||
)
|
||||
const validCount = Number(withEmbedding[0]?.count ?? 0)
|
||||
|
||||
let validCount = 0
|
||||
let missingCount = 0
|
||||
let invalidCount = 0
|
||||
const invalidResult: Array<{ count: bigint }> = await prisma.$queryRawUnsafe(
|
||||
`SELECT COUNT(*)::bigint as count FROM "NoteEmbedding" e
|
||||
WHERE e."embedding" IS NULL
|
||||
OR array_length(string_to_array(replace(replace(e."embedding"::text, '[', ''), ']', ''), ','), 1) != 1536`
|
||||
)
|
||||
const invalidCount = Number(invalidResult[0]?.count ?? 0)
|
||||
|
||||
for (const note of allNotes) {
|
||||
// Check if embedding is missing
|
||||
if (!note.noteEmbedding?.embedding) {
|
||||
missingCount++
|
||||
invalidNotes.push({
|
||||
id: note.id,
|
||||
title: note.title || 'Untitled',
|
||||
issues: ['Missing embedding']
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
// Validate embedding
|
||||
try {
|
||||
if (!note.noteEmbedding?.embedding) continue
|
||||
const embedding = JSON.parse(note.noteEmbedding.embedding) as number[]
|
||||
const validation = validateEmbedding(embedding)
|
||||
|
||||
if (!validation.valid) {
|
||||
invalidCount++
|
||||
invalidNotes.push({
|
||||
id: note.id,
|
||||
title: note.title || 'Untitled',
|
||||
issues: validation.issues
|
||||
})
|
||||
} else {
|
||||
validCount++
|
||||
}
|
||||
} catch (error) {
|
||||
invalidCount++
|
||||
invalidNotes.push({
|
||||
id: note.id,
|
||||
title: note.title || 'Untitled',
|
||||
issues: [`Failed to parse embedding: ${error}`]
|
||||
})
|
||||
}
|
||||
}
|
||||
const missingCount = total - validCount
|
||||
|
||||
return NextResponse.json({
|
||||
success: true,
|
||||
summary: {
|
||||
total: allNotes.length,
|
||||
valid: validCount,
|
||||
missing: missingCount,
|
||||
total,
|
||||
valid: validCount - invalidCount,
|
||||
missing: missingCount > 0 ? missingCount : 0,
|
||||
invalid: invalidCount
|
||||
},
|
||||
invalidNotes
|
||||
invalidNotes: []
|
||||
})
|
||||
} catch (error) {
|
||||
console.error('[EMBEDDING_VALIDATION] Error:', error)
|
||||
|
||||
@@ -27,14 +27,18 @@ export async function POST(req: NextRequest) {
|
||||
}
|
||||
})
|
||||
|
||||
// 2. Clean up NoteEmbeddings that don't have a corresponding Note (shouldn't happen with Cascade, but good for cleanup)
|
||||
const orphanedEmbeddings = await prisma.noteEmbedding.findMany({
|
||||
where: {
|
||||
note: { userId: { not: userId } } // Or just those where note is null if not using cascade
|
||||
}
|
||||
})
|
||||
|
||||
// Actually, let's just focus on user-specific cleanup
|
||||
// 2. Clean up NoteEmbeddings that don't have a corresponding Note
|
||||
const orphanedEmbeddings: Array<{ id: string }> = await prisma.$queryRawUnsafe(
|
||||
`SELECT e.id FROM "NoteEmbedding" e
|
||||
LEFT JOIN "Note" n ON n.id = e."noteId"
|
||||
WHERE n.id IS NULL`
|
||||
)
|
||||
|
||||
if (orphanedEmbeddings.length > 0) {
  // Bind the ids as a single array parameter rather than splicing them into
  // the SQL text: no quoting/escaping concerns and the statement text stays
  // constant regardless of how many orphans were found.
  await prisma.$executeRawUnsafe(
    `DELETE FROM "NoteEmbedding" WHERE id = ANY($1::text[])`,
    orphanedEmbeddings.map(e => e.id)
  )
}
|
||||
|
||||
// 3. Remove note history entries for notes that were deleted (cascade should handle this, but let's be safe)
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import { NextRequest, NextResponse } from 'next/server'
|
||||
import { auth } from '@/auth'
|
||||
import { prisma } from '@/lib/prisma'
|
||||
import { EmbeddingService } from '@/lib/ai/services/embedding.service'
|
||||
import { semanticSearchService } from '@/lib/ai/services/semantic-search.service'
|
||||
|
||||
export async function POST(req: NextRequest) {
|
||||
try {
|
||||
@@ -12,41 +12,31 @@ export async function POST(req: NextRequest) {
|
||||
|
||||
const userId = session.user.id
|
||||
|
||||
// Fetch all notes for the user
|
||||
const notes = await prisma.note.findMany({
|
||||
where: { userId, trashedAt: null },
|
||||
select: { id: true, title: true, content: true }
|
||||
select: { id: true }
|
||||
})
|
||||
|
||||
const embeddingService = new EmbeddingService()
|
||||
let processedCount = 0
|
||||
let failedCount = 0
|
||||
const BATCH_SIZE = 20
|
||||
|
||||
// Process in small batches to avoid timeouts if possible
|
||||
// Note: In a real production app, this should be a background job
|
||||
for (const note of notes) {
|
||||
try {
|
||||
const textToEmbed = `${note.title || ''}\n${note.content}`
|
||||
if (textToEmbed.trim()) {
|
||||
const embedding = await embeddingService.generateEmbedding(textToEmbed)
|
||||
|
||||
await prisma.noteEmbedding.upsert({
|
||||
where: { noteId: note.id },
|
||||
update: { embedding: JSON.stringify(embedding) },
|
||||
create: {
|
||||
noteId: note.id,
|
||||
embedding: JSON.stringify(embedding)
|
||||
}
|
||||
})
|
||||
processedCount++
|
||||
}
|
||||
} catch (err) {
|
||||
console.error(`Failed to reindex note ${note.id}:`, err)
|
||||
// Re-index the notes in fixed-size batches. Promise.allSettled keeps one
// failing note from aborting the rest of its batch.
for (let i = 0; i < notes.length; i += BATCH_SIZE) {
  const batch = notes.slice(i, i + BATCH_SIZE)
  const results = await Promise.allSettled(
    batch.map(note => semanticSearchService.indexNote(note.id))
  )

  // allSettled returns outcomes in input order, so idx maps back to the note.
  results.forEach((r, idx) => {
    if (r.status === 'fulfilled') {
      processedCount++
    } else {
      failedCount++
      // Surface the reason instead of only counting the failure.
      console.error(`Failed to reindex note ${batch[idx]?.id}:`, r.reason)
    }
  })
}
|
||||
|
||||
return NextResponse.json({
|
||||
success: true,
|
||||
count: processedCount,
|
||||
failed: failedCount,
|
||||
total: notes.length
|
||||
})
|
||||
} catch (error) {
|
||||
|
||||
@@ -259,7 +259,6 @@ export function HomeClient({ initialNotes, initialSettings }: HomeClientProps) {
|
||||
const labelFilter = searchParams.get('labels')?.split(',').filter(Boolean) || []
|
||||
const colorFilter = searchParams.get('color')
|
||||
const notebook = searchParams.get('notebook')
|
||||
const semanticMode = searchParams.get('semantic') === 'true'
|
||||
|
||||
const isBackgroundRefresh = refreshKey > prevRefreshKey.current
|
||||
prevRefreshKey.current = refreshKey
|
||||
@@ -271,7 +270,7 @@ export function HomeClient({ initialNotes, initialSettings }: HomeClientProps) {
|
||||
setIsLoading(true)
|
||||
}
|
||||
let allNotes = search
|
||||
? await searchNotes(search, semanticMode, notebook || undefined)
|
||||
? await searchNotes(search, true, notebook || undefined)
|
||||
: await getAllNotes(false, notebook || undefined)
|
||||
|
||||
const sharedOnly = searchParams.get('shared') === '1'
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
/**
|
||||
* Embedding Service
|
||||
* Generates vector embeddings for semantic search and similarity analysis
|
||||
* Uses text-embedding-3-small model via OpenAI (or Ollama alternatives)
|
||||
* Generates vector embeddings for semantic search and similarity analysis.
|
||||
* Stores embeddings as native pgvector(1536) in PostgreSQL.
|
||||
*/
|
||||
|
||||
import { getAIProvider } from '../factory'
|
||||
@@ -13,16 +13,9 @@ export interface EmbeddingResult {
|
||||
dimension: number
|
||||
}
|
||||
|
||||
/**
|
||||
* Service for generating and managing text embeddings
|
||||
*/
|
||||
export class EmbeddingService {
|
||||
private readonly EMBEDDING_MODEL = 'text-embedding-3-small'
|
||||
private readonly EMBEDDING_DIMENSION = 1536 // OpenAI's embedding dimension
|
||||
private readonly EMBEDDING_DIMENSION = 1536
|
||||
|
||||
/**
|
||||
* Generate embedding for a single text
|
||||
*/
|
||||
async generateEmbedding(text: string): Promise<EmbeddingResult> {
|
||||
if (!text || text.trim().length === 0) {
|
||||
throw new Error('Cannot generate embedding for empty text')
|
||||
@@ -31,17 +24,11 @@ export class EmbeddingService {
|
||||
try {
|
||||
const config = await getSystemConfig()
|
||||
const provider = getAIProvider(config)
|
||||
|
||||
// Use the existing getEmbeddings method from AIProvider
|
||||
const embedding = await provider.getEmbeddings(text)
|
||||
|
||||
// Validate embedding dimension
|
||||
if (embedding.length !== this.EMBEDDING_DIMENSION) {
|
||||
}
|
||||
|
||||
return {
|
||||
embedding,
|
||||
model: this.EMBEDDING_MODEL,
|
||||
model: 'text-embedding-3-small',
|
||||
dimension: embedding.length
|
||||
}
|
||||
} catch (error) {
|
||||
@@ -50,34 +37,22 @@ export class EmbeddingService {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate embeddings for multiple texts in batch
|
||||
* More efficient than calling generateEmbedding multiple times
|
||||
*/
|
||||
async generateBatchEmbeddings(texts: string[]): Promise<EmbeddingResult[]> {
|
||||
if (!texts || texts.length === 0) {
|
||||
return []
|
||||
}
|
||||
if (!texts || texts.length === 0) return []
|
||||
|
||||
// Filter out empty texts
|
||||
const validTexts = texts.filter(t => t && t.trim().length > 0)
|
||||
|
||||
if (validTexts.length === 0) {
|
||||
return []
|
||||
}
|
||||
if (validTexts.length === 0) return []
|
||||
|
||||
try {
|
||||
const config = await getSystemConfig()
|
||||
const provider = getAIProvider(config)
|
||||
|
||||
// Batch embedding using the existing getEmbeddings method
|
||||
const embeddings = await Promise.all(
|
||||
validTexts.map(text => provider.getEmbeddings(text))
|
||||
)
|
||||
|
||||
return embeddings.map(embedding => ({
|
||||
embedding,
|
||||
model: this.EMBEDDING_MODEL,
|
||||
model: 'text-embedding-3-small',
|
||||
dimension: embedding.length
|
||||
}))
|
||||
} catch (error) {
|
||||
@@ -87,132 +62,54 @@ export class EmbeddingService {
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate cosine similarity between two embeddings
|
||||
* Returns value between -1 and 1, where 1 is identical
|
||||
* Format a number[] embedding as a pgvector-compatible string literal.
|
||||
* e.g. [0.1, 0.2, 0.3] → '[0.1,0.2,0.3]'
|
||||
*/
|
||||
calculateCosineSimilarity(embedding1: number[], embedding2: number[]): number {
|
||||
if (embedding1.length !== embedding2.length) {
|
||||
throw new Error('Embeddings must have the same dimension')
|
||||
toVectorString(embedding: number[]): string {
  // Comma-separated components wrapped in square brackets, no spaces.
  const components = embedding.join(',')
  return '[' + components + ']'
}
|
||||
|
||||
/**
 * Parse a pgvector string from the DB back into number[].
 * e.g. '[0.1,0.2,0.3]' → [0.1, 0.2, 0.3]
 *
 * @param vec Bracketed, comma-separated vector text. A value that is already
 *            an array is returned unchanged.
 * @returns The parsed components; [] for empty, missing, or non-string input,
 *          including the empty vector literal '[]'.
 */
fromVectorString(vec: string): number[] {
  // Tolerate callers that already hold a parsed array despite the declared type.
  if (Array.isArray(vec)) return vec
  if (!vec || typeof vec !== 'string') return []

  const inner = vec.trim().replace(/^\[/, '').replace(/\]$/, '').trim()
  // ''.split(',') yields [''] and Number('') is 0, so an empty vector would
  // otherwise be parsed as [0].
  if (inner.length === 0) return []

  return inner.split(',').map(Number)
}
|
||||
|
||||
/**
|
||||
* JS cosine similarity — still used by memory-echo pairwise comparisons.
|
||||
*/
|
||||
calculateCosineSimilarity(a: number[], b: number[]): number {
|
||||
if (!a.length || !b.length) return 0
|
||||
const minLen = Math.min(a.length, b.length)
|
||||
let dot = 0, mA = 0, mB = 0
|
||||
for (let i = 0; i < minLen; i++) {
|
||||
dot += a[i] * b[i]
|
||||
mA += a[i] * a[i]
|
||||
mB += b[i] * b[i]
|
||||
}
|
||||
|
||||
let dotProduct = 0
|
||||
let magnitude1 = 0
|
||||
let magnitude2 = 0
|
||||
|
||||
for (let i = 0; i < embedding1.length; i++) {
|
||||
dotProduct += embedding1[i] * embedding2[i]
|
||||
magnitude1 += embedding1[i] * embedding1[i]
|
||||
magnitude2 += embedding2[i] * embedding2[i]
|
||||
}
|
||||
|
||||
magnitude1 = Math.sqrt(magnitude1)
|
||||
magnitude2 = Math.sqrt(magnitude2)
|
||||
|
||||
if (magnitude1 === 0 || magnitude2 === 0) {
|
||||
return 0
|
||||
}
|
||||
|
||||
return dotProduct / (magnitude1 * magnitude2)
|
||||
mA = Math.sqrt(mA)
|
||||
mB = Math.sqrt(mB)
|
||||
if (mA === 0 || mB === 0) return 0
|
||||
return dot / (mA * mB)
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate similarity between an embedding and multiple other embeddings
|
||||
* Returns array of similarities
|
||||
*/
|
||||
calculateSimilarities(
|
||||
queryEmbedding: number[],
|
||||
targetEmbeddings: number[][]
|
||||
): number[] {
|
||||
return targetEmbeddings.map(embedding =>
|
||||
this.calculateCosineSimilarity(queryEmbedding, embedding)
|
||||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Find most similar embeddings to a query
|
||||
* Returns top-k results with their similarities
|
||||
*/
|
||||
findMostSimilar(
|
||||
queryEmbedding: number[],
|
||||
targetEmbeddings: Array<{ id: string; embedding: number[] }>,
|
||||
topK: number = 10
|
||||
): Array<{ id: string; similarity: number }> {
|
||||
const similarities = targetEmbeddings.map(({ id, embedding }) => ({
|
||||
id,
|
||||
similarity: this.calculateCosineSimilarity(queryEmbedding, embedding)
|
||||
}))
|
||||
|
||||
// Sort by similarity descending and return top-k
|
||||
return similarities
|
||||
.sort((a, b) => b.similarity - a.similarity)
|
||||
.slice(0, topK)
|
||||
}
|
||||
|
||||
/**
|
||||
* Get average embedding from multiple embeddings
|
||||
* Useful for clustering or centroid calculation
|
||||
*/
|
||||
averageEmbeddings(embeddings: number[][]): number[] {
|
||||
if (embeddings.length === 0) {
|
||||
throw new Error('Cannot average empty embeddings array')
|
||||
}
|
||||
|
||||
const dimension = embeddings[0].length
|
||||
const average = new Array(dimension).fill(0)
|
||||
|
||||
for (const embedding of embeddings) {
|
||||
if (embedding.length !== dimension) {
|
||||
throw new Error('All embeddings must have the same dimension')
|
||||
}
|
||||
|
||||
for (let i = 0; i < dimension; i++) {
|
||||
average[i] += embedding[i]
|
||||
}
|
||||
}
|
||||
|
||||
// Divide by number of embeddings
|
||||
return average.map(val => val / embeddings.length)
|
||||
}
|
||||
|
||||
/**
|
||||
* Pass-through — embeddings are stored as native JSONB in PostgreSQL
|
||||
*/
|
||||
serialize(embedding: number[]): number[] {
|
||||
return embedding
|
||||
}
|
||||
|
||||
/**
|
||||
* Pass-through — embeddings come back already parsed from PostgreSQL
|
||||
*/
|
||||
deserialize(embedding: number[]): number[] {
|
||||
return embedding
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a note needs embedding regeneration
|
||||
* (e.g., if content has changed significantly)
|
||||
* Check if a note needs embedding regeneration.
|
||||
* Currently age-based only: returns true when there is no previous analysis
* or the last one is more than 7 days old. Note content is not compared.
|
||||
*/
|
||||
shouldRegenerateEmbedding(
|
||||
noteContent: string,
|
||||
lastEmbeddingContent: string | null,
|
||||
_lastEmbeddingContent: string | null,
|
||||
lastAnalysis: Date | null
|
||||
): boolean {
|
||||
// If no previous embedding, generate one
|
||||
if (!lastEmbeddingContent || !lastAnalysis) {
|
||||
return true
|
||||
}
|
||||
|
||||
// If content has changed more than 20% (simple heuristic)
|
||||
const contentChanged =
|
||||
Math.abs(noteContent.length - lastEmbeddingContent.length) / lastEmbeddingContent.length > 0.2
|
||||
|
||||
// If last analysis is more than 7 days old
|
||||
if (!lastAnalysis) return true
|
||||
const daysSinceAnalysis = (Date.now() - lastAnalysis.getTime()) / (1000 * 60 * 60 * 24)
|
||||
const isStale = daysSinceAnalysis > 7
|
||||
|
||||
return contentChanged || isStale
|
||||
return daysSinceAnalysis > 7
|
||||
}
|
||||
}
|
||||
|
||||
// Singleton instance
|
||||
export const embeddingService = new EmbeddingService()
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import { getAIProvider, getChatProvider } from '../factory'
|
||||
import { cosineSimilarity } from '@/lib/utils'
|
||||
import { embeddingService } from './embedding.service'
|
||||
import { getSystemConfig } from '@/lib/config'
|
||||
import prisma from '@/lib/prisma'
|
||||
|
||||
@@ -78,11 +79,15 @@ export class MemoryEchoService {
|
||||
try {
|
||||
const embedding = await provider.getEmbeddings(note.content)
|
||||
if (embedding && embedding.length > 0) {
|
||||
await prisma.noteEmbedding.upsert({
|
||||
where: { noteId: note.id },
|
||||
create: { noteId: note.id, embedding: JSON.stringify(embedding) },
|
||||
update: { embedding: JSON.stringify(embedding) }
|
||||
})
|
||||
const vecStr = `[${embedding.join(',')}]`
|
||||
await prisma.$executeRawUnsafe(
|
||||
`INSERT INTO "NoteEmbedding" ("id", "noteId", "embedding", "createdAt", "updatedAt")
|
||||
VALUES (gen_random_uuid(), $1, $2::vector, now(), now())
|
||||
ON CONFLICT ("noteId")
|
||||
DO UPDATE SET "embedding" = $2::vector, "updatedAt" = now()`,
|
||||
note.id,
|
||||
vecStr
|
||||
)
|
||||
}
|
||||
} catch {
|
||||
// Skip this note, continue with others
|
||||
@@ -122,11 +127,12 @@ export class MemoryEchoService {
|
||||
return [] // Need at least 2 notes to find connections
|
||||
}
|
||||
|
||||
// Parse embeddings (already native Json from PostgreSQL)
|
||||
const notesWithEmbeddings = notes
|
||||
.map(note => ({
|
||||
...note,
|
||||
embedding: note.noteEmbedding?.embedding ? JSON.parse(note.noteEmbedding.embedding) as number[] : null
|
||||
embedding: note.noteEmbedding?.embedding
|
||||
? embeddingService.fromVectorString(note.noteEmbedding.embedding as unknown as string)
|
||||
: null
|
||||
}))
|
||||
.filter(note => note.embedding && Array.isArray(note.embedding))
|
||||
|
||||
@@ -500,8 +506,9 @@ Explain in one brief sentence (max 15 words) why these notes are connected. Focu
|
||||
return []
|
||||
}
|
||||
|
||||
// Target note embedding (already native Json from PostgreSQL)
|
||||
const targetEmbedding = targetNote.noteEmbedding?.embedding ? JSON.parse(targetNote.noteEmbedding.embedding) as number[] : null
|
||||
const targetEmbedding = targetNote.noteEmbedding?.embedding
|
||||
? embeddingService.fromVectorString(targetNote.noteEmbedding.embedding as unknown as string)
|
||||
: null
|
||||
if (!targetEmbedding) return []
|
||||
|
||||
// Check if user has demo mode enabled
|
||||
@@ -535,7 +542,9 @@ Explain in one brief sentence (max 15 words) why these notes are connected. Focu
|
||||
for (const otherNote of otherNotes) {
|
||||
if (!otherNote.noteEmbedding) continue
|
||||
|
||||
const otherEmbedding = otherNote.noteEmbedding?.embedding ? JSON.parse(otherNote.noteEmbedding.embedding) as number[] : null
|
||||
const otherEmbedding = otherNote.noteEmbedding?.embedding
|
||||
? embeddingService.fromVectorString(otherNote.noteEmbedding.embedding as unknown as string)
|
||||
: null
|
||||
if (!otherEmbedding) continue
|
||||
|
||||
// Check if this connection was dismissed
|
||||
|
||||
@@ -1,7 +1,12 @@
|
||||
/**
|
||||
* Semantic Search Service
|
||||
* Hybrid search combining keyword matching and semantic similarity
|
||||
* Uses Reciprocal Rank Fusion (RRF) for result ranking
|
||||
*
|
||||
* Unified hybrid search combining:
|
||||
* 1. PostgreSQL full-text search (tsvector / tsquery) via GIN index
|
||||
* 2. pgvector cosine-distance nearest-neighbor search via HNSW index
|
||||
* 3. Reciprocal Rank Fusion (RRF) for final ranking
|
||||
*
|
||||
* All vector operations happen in the database — no JS cosine-similarity loops.
|
||||
*/
|
||||
|
||||
import { embeddingService } from './embedding.service'
|
||||
@@ -19,19 +24,22 @@ export interface SearchResult {
|
||||
|
||||
export interface SearchOptions {
|
||||
limit?: number
|
||||
threshold?: number // Minimum similarity score (0-1)
|
||||
threshold?: number
|
||||
includeExactMatches?: boolean
|
||||
notebookId?: string // NEW: Filter by notebook for contextual search (IA5)
|
||||
defaultTitle?: string // Optional default title for untitled notes (i18n)
|
||||
notebookId?: string
|
||||
defaultTitle?: string
|
||||
}
|
||||
|
||||
export class SemanticSearchService {
|
||||
private readonly RRF_K = 60 // RRF constant (default recommended value)
|
||||
private readonly RRF_K = 60
|
||||
private readonly DEFAULT_LIMIT = 20
|
||||
private readonly DEFAULT_THRESHOLD = 0.6
|
||||
private readonly DEFAULT_THRESHOLD = 0.3
|
||||
private readonly VECTOR_CANDIDATES = 50
|
||||
private readonly FTS_CANDIDATES = 50
|
||||
|
||||
/**
|
||||
* Hybrid search: keyword + semantic with RRF fusion
|
||||
* Hybrid search: FTS + pgvector with RRF fusion.
|
||||
* Accepts an optional userId to skip auth() (used by agent tools).
|
||||
*/
|
||||
async search(
|
||||
query: string,
|
||||
@@ -40,292 +48,15 @@ export class SemanticSearchService {
|
||||
const {
|
||||
limit = this.DEFAULT_LIMIT,
|
||||
threshold = this.DEFAULT_THRESHOLD,
|
||||
includeExactMatches = true,
|
||||
notebookId, // NEW: Contextual search within notebook (IA5)
|
||||
defaultTitle = 'Untitled' // Default title for i18n
|
||||
notebookId,
|
||||
defaultTitle = 'Untitled'
|
||||
} = options
|
||||
|
||||
if (!query || query.trim().length < 2) {
|
||||
return []
|
||||
}
|
||||
if (!query || query.trim().length < 2) return []
|
||||
|
||||
const session = await auth()
|
||||
const userId = session?.user?.id || null
|
||||
|
||||
try {
|
||||
// 1. Keyword search (SQLite FTS)
|
||||
const keywordResults = await this.keywordSearch(query, userId, notebookId)
|
||||
|
||||
// 2. Semantic search (vector similarity)
|
||||
const semanticResults = await this.semanticVectorSearch(query, userId, threshold, notebookId)
|
||||
|
||||
// 3. Reciprocal Rank Fusion
|
||||
const fusedResults = await this.reciprocalRankFusion(
|
||||
keywordResults,
|
||||
semanticResults
|
||||
)
|
||||
|
||||
// 4. Sort by final score and limit
|
||||
return fusedResults
|
||||
.sort((a, b) => b.score - a.score)
|
||||
.slice(0, limit)
|
||||
.map(result => ({
|
||||
...result,
|
||||
title: result.title || defaultTitle,
|
||||
matchType: result.score > 0.8 ? 'exact' : 'related'
|
||||
}))
|
||||
} catch (error) {
|
||||
console.error('Error in hybrid search:', error)
|
||||
// Fallback to keyword-only search
|
||||
const keywordResults = await this.keywordSearch(query, userId)
|
||||
|
||||
// Fetch note details for keyword results
|
||||
const noteIds = keywordResults.slice(0, limit).map(r => r.noteId)
|
||||
const notes = await prisma.note.findMany({
|
||||
where: { id: { in: noteIds }, trashedAt: null },
|
||||
select: {
|
||||
id: true,
|
||||
title: true,
|
||||
content: true,
|
||||
language: true
|
||||
}
|
||||
})
|
||||
|
||||
return notes.map(note => ({
|
||||
noteId: note.id,
|
||||
title: note.title || defaultTitle,
|
||||
content: note.content,
|
||||
score: 1.0, // Default score for keyword-only results
|
||||
matchType: 'related' as const,
|
||||
language: note.language
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Keyword search using SQLite LIKE/FTS
|
||||
*/
|
||||
private async keywordSearch(
|
||||
query: string,
|
||||
userId: string | null,
|
||||
notebookId?: string // NEW: Filter by notebook (IA5)
|
||||
): Promise<Array<{ noteId: string; rank: number }>> {
|
||||
// Extract keywords (words with > 3 characters) to avoid entire sentence matching failing
|
||||
const stopWords = new Set(['comment', 'pourquoi', 'lequel', 'laquelle', 'avec', 'pour', 'dans', 'sur', 'est-ce']);
|
||||
const keywords = query.toLowerCase()
|
||||
.split(/[^a-z0-9àáâäçéèêëíìîïñóòôöúùûü]/i)
|
||||
.filter(w => w.length > 3 && !stopWords.has(w));
|
||||
|
||||
// If no good keywords found, fallback to the original query but it'll likely fail
|
||||
const searchTerms = keywords.length > 0 ? keywords : [query];
|
||||
|
||||
// Build Prisma OR clauses for each keyword
|
||||
const searchConditions = searchTerms.flatMap(term => [
|
||||
{ title: { contains: term, mode: 'insensitive' as const } },
|
||||
{ content: { contains: term, mode: 'insensitive' as const } }
|
||||
]);
|
||||
|
||||
const notes = await prisma.note.findMany({
|
||||
where: {
|
||||
...(userId ? { userId } : {}),
|
||||
...(notebookId !== undefined ? { notebookId } : {}), // NEW: Notebook filter
|
||||
trashedAt: null,
|
||||
OR: searchConditions
|
||||
},
|
||||
select: {
|
||||
id: true,
|
||||
title: true,
|
||||
content: true
|
||||
}
|
||||
})
|
||||
|
||||
// Simple relevance scoring based on match position and frequency
|
||||
const results = notes.map(note => {
|
||||
const title = note.title || ''
|
||||
const content = note.content || ''
|
||||
const queryLower = query.toLowerCase()
|
||||
|
||||
// Count occurrences — escape regex special chars to avoid crashes
|
||||
const escaped = queryLower.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
|
||||
const titleMatches = (title.match(new RegExp(escaped, 'gi')) || []).length
|
||||
const contentMatches = (content.match(new RegExp(escaped, 'gi')) || []).length
|
||||
|
||||
// Boost title matches significantly
|
||||
const titlePosition = title.toLowerCase().indexOf(queryLower)
|
||||
const contentPosition = content.toLowerCase().indexOf(queryLower)
|
||||
|
||||
// Calculate rank (lower is better)
|
||||
let rank = 100
|
||||
|
||||
if (titleMatches > 0) {
|
||||
rank = titlePosition === 0 ? 1 : 10
|
||||
rank -= titleMatches * 2
|
||||
} else if (contentMatches > 0) {
|
||||
rank = contentPosition < 100 ? 20 : 30
|
||||
rank -= contentMatches
|
||||
}
|
||||
|
||||
return {
|
||||
noteId: note.id,
|
||||
rank
|
||||
}
|
||||
})
|
||||
|
||||
return results.sort((a, b) => a.rank - b.rank)
|
||||
}
|
||||
|
||||
/**
|
||||
* Semantic vector search using embeddings
|
||||
*/
|
||||
private async semanticVectorSearch(
|
||||
query: string,
|
||||
userId: string | null,
|
||||
threshold: number,
|
||||
notebookId?: string // NEW: Filter by notebook (IA5)
|
||||
): Promise<Array<{ noteId: string; rank: number }>> {
|
||||
try {
|
||||
// Generate query embedding
|
||||
const { embedding: queryEmbedding } = await embeddingService.generateEmbedding(query)
|
||||
|
||||
// Fetch all user's notes with embeddings
|
||||
const notes = await prisma.note.findMany({
|
||||
where: {
|
||||
...(userId ? { userId } : {}),
|
||||
...(notebookId !== undefined ? { notebookId } : {}),
|
||||
trashedAt: null,
|
||||
noteEmbedding: { isNot: null }
|
||||
},
|
||||
select: {
|
||||
id: true,
|
||||
noteEmbedding: true
|
||||
}
|
||||
})
|
||||
|
||||
if (notes.length === 0) {
|
||||
return []
|
||||
}
|
||||
|
||||
// Calculate similarities for all notes
|
||||
const similarities = notes.map(note => {
|
||||
const noteEmbedding = note.noteEmbedding?.embedding ? JSON.parse(note.noteEmbedding.embedding) as number[] : []
|
||||
const similarity = embeddingService.calculateCosineSimilarity(
|
||||
queryEmbedding,
|
||||
noteEmbedding
|
||||
)
|
||||
|
||||
return {
|
||||
noteId: note.id,
|
||||
similarity
|
||||
}
|
||||
})
|
||||
|
||||
// Filter by threshold and convert to rank
|
||||
return similarities
|
||||
.filter(s => s.similarity >= threshold)
|
||||
.sort((a, b) => b.similarity - a.similarity)
|
||||
.map((s, index) => ({
|
||||
noteId: s.noteId,
|
||||
rank: index + 1 // 1-based rank
|
||||
}))
|
||||
} catch (error) {
|
||||
console.error('Error in semantic vector search:', error)
|
||||
return []
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reciprocal Rank Fusion algorithm
|
||||
* Combines multiple ranked lists into a single ranking
|
||||
* Formula: RRF(score) = 1 / (k + rank)
|
||||
* k = 60 (default, prevents high rank from dominating)
|
||||
*/
|
||||
private async reciprocalRankFusion(
|
||||
keywordResults: Array<{ noteId: string; rank: number }>,
|
||||
semanticResults: Array<{ noteId: string; rank: number }>
|
||||
): Promise<SearchResult[]> {
|
||||
const scores = new Map<string, number>()
|
||||
|
||||
// Add keyword scores
|
||||
for (const result of keywordResults) {
|
||||
const rrfScore = 1 / (this.RRF_K + result.rank)
|
||||
scores.set(result.noteId, (scores.get(result.noteId) || 0) + rrfScore)
|
||||
}
|
||||
|
||||
// Add semantic scores
|
||||
for (const result of semanticResults) {
|
||||
const rrfScore = 1 / (this.RRF_K + result.rank)
|
||||
scores.set(result.noteId, (scores.get(result.noteId) || 0) + rrfScore)
|
||||
}
|
||||
|
||||
// Fetch note details
|
||||
const noteIds = Array.from(scores.keys())
|
||||
const notes = await prisma.note.findMany({
|
||||
where: { id: { in: noteIds }, trashedAt: null },
|
||||
select: {
|
||||
id: true,
|
||||
title: true,
|
||||
content: true,
|
||||
language: true
|
||||
}
|
||||
})
|
||||
|
||||
// Combine scores with note details
|
||||
return notes.map(note => ({
|
||||
noteId: note.id,
|
||||
title: note.title,
|
||||
content: note.content,
|
||||
score: scores.get(note.id) || 0,
|
||||
matchType: 'related' as const,
|
||||
language: note.language
|
||||
}))
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate or update embedding for a note
|
||||
* Called when note is created or significantly updated
|
||||
*/
|
||||
async indexNote(noteId: string): Promise<void> {
|
||||
try {
|
||||
const note = await prisma.note.findUnique({
|
||||
where: { id: noteId },
|
||||
select: { content: true, noteEmbedding: true, lastAiAnalysis: true }
|
||||
})
|
||||
|
||||
if (!note) {
|
||||
throw new Error('Note not found')
|
||||
}
|
||||
|
||||
// Check if embedding needs regeneration
|
||||
const shouldRegenerate = embeddingService.shouldRegenerateEmbedding(
|
||||
note.content,
|
||||
note.noteEmbedding?.embedding as any,
|
||||
note.lastAiAnalysis
|
||||
)
|
||||
|
||||
if (!shouldRegenerate) {
|
||||
return
|
||||
}
|
||||
|
||||
// Generate new embedding
|
||||
const { embedding } = await embeddingService.generateEmbedding(note.content)
|
||||
|
||||
// Save to database
|
||||
await prisma.noteEmbedding.upsert({
|
||||
where: { noteId: noteId },
|
||||
create: { noteId: noteId, embedding: embeddingService.serialize(embedding) as any },
|
||||
update: { embedding: embeddingService.serialize(embedding) as any }
|
||||
})
|
||||
await prisma.note.update({
|
||||
where: { id: noteId },
|
||||
data: {
|
||||
lastAiAnalysis: new Date()
|
||||
}
|
||||
})
|
||||
|
||||
} catch (error) {
|
||||
console.error(`Error indexing note ${noteId}:`, error)
|
||||
throw error
|
||||
}
|
||||
return this._doSearch(query, userId, { limit, threshold, notebookId, defaultTitle })
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -340,50 +71,251 @@ export class SemanticSearchService {
|
||||
const {
|
||||
limit = this.DEFAULT_LIMIT,
|
||||
threshold = this.DEFAULT_THRESHOLD,
|
||||
includeExactMatches = true,
|
||||
notebookId,
|
||||
defaultTitle = 'Untitled'
|
||||
} = options
|
||||
|
||||
if (!query || query.trim().length < 2) {
|
||||
return []
|
||||
}
|
||||
if (!query || query.trim().length < 2) return []
|
||||
return this._doSearch(query, userId, { limit, threshold, notebookId, defaultTitle })
|
||||
}
|
||||
|
||||
/**
 * Core hybrid search shared by the public entry points.
 *
 * Runs the full-text search and the vector search concurrently, fuses the two
 * ranked lists with Reciprocal Rank Fusion, then returns the `opts.limit`
 * highest-scoring notes. Untitled notes receive `opts.defaultTitle`.
 *
 * If any step throws, the error is logged and the FTS-only fallback is used.
 *
 * @param query  Raw user query text.
 * @param userId Owner to restrict results to, or null for no owner filter.
 * @param opts   Resolved search options (limit, similarity threshold,
 *               optional notebook filter, default title).
 * @returns Fused results, best score first.
 */
private async _doSearch(
  query: string,
  userId: string | null,
  opts: { limit: number; threshold: number; notebookId?: string; defaultTitle: string }
): Promise<SearchResult[]> {
  try {
    const [keywordResults, semanticResults] = await Promise.all([
      this.ftsSearch(query, userId, opts.notebookId),
      this.vectorSearch(query, userId, opts.threshold, opts.notebookId)
    ])

    // reciprocalRankFusion is async (it loads the note rows). Without `await`
    // the chain below would run on a Promise, throw, and force every search
    // into the FTS-only fallback.
    const fusedResults = await this.reciprocalRankFusion(keywordResults, semanticResults)

    return fusedResults
      .sort((a, b) => b.score - a.score)
      .slice(0, opts.limit)
      .map(result => ({
        ...result,
        title: result.title || opts.defaultTitle,
        // NOTE(review): an RRF score is at most 2 / (RRF_K + 1), so with
        // RRF_K = 60 this comparison never yields 'exact' — confirm the
        // intended rule for exact matches.
        matchType: result.score > 0.8 ? 'exact' as const : 'related' as const
      }))
  } catch (error) {
    console.error('Error in hybrid search:', error)
    return this._ftsFallback(query, userId, opts)
  }
}
|
||||
|
||||
/**
 * PostgreSQL full-text search over "Note"."tsv".
 *
 * Matches `query` with plainto_tsquery('simple', …), orders hits by ts_rank
 * (best first) and returns at most FTS_CANDIDATES of them. The returned
 * `rank` is the 1-based position in that ordering, not the raw ts_rank value.
 * Trashed and archived notes are excluded.
 *
 * Every caller-supplied value is sent as a bind parameter ($1, $2, …) rather
 * than being spliced into the SQL text.
 *
 * @param query      Raw user query text.
 * @param userId     Restricts results to this owner when truthy.
 * @param notebookId `undefined` = no notebook filter; empty string = only
 *                   notes without a notebook; otherwise that notebook only.
 * @returns Note ids with their 1-based position in the FTS ranking.
 */
private async ftsSearch(
  query: string,
  userId: string | null,
  notebookId?: string
): Promise<Array<{ noteId: string; rank: number }>> {
  // $1 is always the query text; optional filters append further placeholders.
  const params: string[] = [query]
  const filters: string[] = []

  if (userId) {
    params.push(userId)
    filters.push(`AND "userId" = $${params.length}`)
  }

  if (notebookId !== undefined) {
    if (notebookId) {
      params.push(notebookId)
      filters.push(`AND "notebookId" = $${params.length}`)
    } else {
      filters.push('AND "notebookId" IS NULL')
    }
  }

  // Only placeholders and a class constant are interpolated below.
  const sql = `
    SELECT id AS "noteId", ts_rank("tsv", plainto_tsquery('simple', $1)) AS rank
    FROM "Note"
    WHERE "tsv" @@ plainto_tsquery('simple', $1)
      AND "trashedAt" IS NULL
      AND "isArchived" = false
      ${filters.join('\n      ')}
    ORDER BY rank DESC
    LIMIT ${this.FTS_CANDIDATES}
  `

  const rows = await prisma.$queryRawUnsafe<Array<{ noteId: string; rank: number }>>(
    sql,
    ...params
  )

  // Rows arrive best-first; RRF only needs the ordinal position.
  return rows.map((row, index) => ({
    noteId: row.noteId,
    rank: index + 1
  }))
}
|
||||
|
||||
/**
 * pgvector nearest-neighbour search over "NoteEmbedding"."embedding".
 *
 * Embeds `query`, then selects notes whose cosine similarity
 * (1 - cosine distance, computed with the `<=>` operator) is at least
 * `threshold`, closest first, capped at VECTOR_CANDIDATES. Trashed and
 * archived notes are excluded. The returned `rank` is the 1-based position
 * in that ordering.
 *
 * If the query embedding cannot be generated, the error is logged and an
 * empty list is returned instead of throwing.
 *
 * The vector literal and the owner/notebook filters are sent as bind
 * parameters rather than being spliced into the SQL text.
 *
 * @param query      Raw user query text.
 * @param userId     Restricts results to this owner when truthy.
 * @param threshold  Minimum cosine similarity a note must reach.
 * @param notebookId `undefined` = no notebook filter; empty string = only
 *                   notes without a notebook; otherwise that notebook only.
 * @returns Note ids with their 1-based position in the similarity ranking.
 */
private async vectorSearch(
  query: string,
  userId: string | null,
  threshold: number,
  notebookId?: string
): Promise<Array<{ noteId: string; rank: number }>> {
  let queryEmbedding: number[]
  try {
    const result = await embeddingService.generateEmbedding(query)
    queryEmbedding = result.embedding
  } catch (error) {
    console.error('Failed to generate query embedding:', error)
    return []
  }

  // $1 is always the query vector in text form, cast with ::vector in SQL.
  const params: string[] = [embeddingService.toVectorString(queryEmbedding)]
  const filters: string[] = []

  if (userId) {
    params.push(userId)
    filters.push(`AND n."userId" = $${params.length}`)
  }

  if (notebookId !== undefined) {
    if (notebookId) {
      params.push(notebookId)
      filters.push(`AND n."notebookId" = $${params.length}`)
    } else {
      filters.push('AND n."notebookId" IS NULL')
    }
  }

  // Coerced so that only numeric text can ever reach the SQL string.
  const minSimilarity = Number(threshold)

  const sql = `
    SELECT n.id AS "noteId",
           1 - (e."embedding" <=> $1::vector) AS similarity
    FROM "Note" n
    INNER JOIN "NoteEmbedding" e ON e."noteId" = n.id
    WHERE n."trashedAt" IS NULL
      AND n."isArchived" = false
      ${filters.join('\n      ')}
      AND 1 - (e."embedding" <=> $1::vector) >= ${minSimilarity}
    ORDER BY e."embedding" <=> $1::vector ASC
    LIMIT ${this.VECTOR_CANDIDATES}
  `

  const rows = await prisma.$queryRawUnsafe<Array<{ noteId: string; similarity: number }>>(
    sql,
    ...params
  )

  // Rows arrive closest-first; RRF only needs the ordinal position.
  return rows.map((row, index) => ({
    noteId: row.noteId,
    rank: index + 1
  }))
}
|
||||
|
||||
/**
 * Merges the keyword and semantic rankings with Reciprocal Rank Fusion.
 *
 * Each appearance of a note in a list contributes 1 / (RRF_K + rank); the
 * contributions from both lists are summed per note. The matching note rows
 * are then loaded (trashed notes are dropped) and returned with their fused
 * score. The output follows the order the database returns rows in; callers
 * are expected to sort by `score`.
 *
 * @param keywordResults  Ranked hits from the keyword search.
 * @param semanticResults Ranked hits from the vector search.
 * @returns One result per surviving note, all tagged 'related'.
 */
private async reciprocalRankFusion(
  keywordResults: Array<{ noteId: string; rank: number }>,
  semanticResults: Array<{ noteId: string; rank: number }>
): Promise<SearchResult[]> {
  const fused = new Map<string, number>()

  // Keyword list first, then semantic — same accumulation order as two loops.
  for (const ranking of [keywordResults, semanticResults]) {
    for (const { noteId, rank } of ranking) {
      const contribution = 1 / (this.RRF_K + rank)
      fused.set(noteId, (fused.get(noteId) || 0) + contribution)
    }
  }

  const candidateIds = Array.from(fused.keys())
  if (candidateIds.length === 0) return []

  const rows = await prisma.note.findMany({
    where: { id: { in: candidateIds }, trashedAt: null },
    select: { id: true, title: true, content: true, language: true }
  })

  return rows.map(row => ({
    noteId: row.id,
    title: row.title,
    content: row.content,
    score: fused.get(row.id) || 0,
    matchType: 'related' as const,
    language: row.language
  }))
}
|
||||
|
||||
/**
 * FTS-only fallback, used when the hybrid search path throws.
 *
 * Takes the top `opts.limit` full-text hits, loads the matching note rows
 * (trashed notes are dropped) and returns them in FTS ranking order. Every
 * result gets a fixed score of 1.0 and matchType 'related'.
 *
 * Never throws: a failure here is logged and yields an empty list.
 *
 * @param query  Raw user query text.
 * @param userId Owner to restrict results to, or null for no owner filter.
 * @param opts   Resolved search options; `threshold` is unused on this path.
 * @returns Keyword-only results, best FTS match first.
 */
private async _ftsFallback(
  query: string,
  userId: string | null,
  opts: { limit: number; threshold: number; notebookId?: string; defaultTitle: string }
): Promise<SearchResult[]> {
  try {
    const keywordResults = await this.ftsSearch(query, userId, opts.notebookId)
    const noteIds = keywordResults.slice(0, opts.limit).map(r => r.noteId)
    if (noteIds.length === 0) return []

    const notes = await prisma.note.findMany({
      where: { id: { in: noteIds }, trashedAt: null },
      select: { id: true, title: true, content: true, language: true }
    })

    // An `in` filter gives no ordering guarantee, so restore the FTS order.
    const position = new Map(noteIds.map((id, index): [string, number] => [id, index]))

    return notes
      .sort((a, b) => (position.get(a.id) ?? 0) - (position.get(b.id) ?? 0))
      .map(note => ({
        noteId: note.id,
        title: note.title || opts.defaultTitle,
        content: note.content,
        score: 1.0,
        matchType: 'related' as const,
        language: note.language
      }))
  } catch (error) {
    // Last line of defence: report the failure instead of hiding it.
    console.error('Error in FTS fallback search:', error)
    return []
  }
}
|
||||
|
||||
/**
 * Creates or refreshes the stored embedding for a single note.
 *
 * Loads the note, asks embeddingService.shouldRegenerateEmbedding whether any
 * work is needed and, if so, embeds the content, upserts the "NoteEmbedding"
 * row through raw SQL (the vector text is bound and cast with ::vector), and
 * finally stamps Note.lastAiAnalysis with the current time.
 *
 * @param noteId Id of the note to index.
 * @throws Error('Note not found') when the note does not exist. Any failure
 *         is logged with the note id and rethrown.
 */
async indexNote(noteId: string): Promise<void> {
  try {
    const record = await prisma.note.findUnique({
      where: { id: noteId },
      select: { content: true, lastAiAnalysis: true }
    })

    if (!record) {
      throw new Error('Note not found')
    }

    // NOTE(review): the stored embedding is not loaded here, so null is passed
    // as the existing embedding — confirm shouldRegenerateEmbedding handles
    // null the way this call site intends.
    const needsEmbedding = embeddingService.shouldRegenerateEmbedding(
      record.content,
      null,
      record.lastAiAnalysis
    )

    if (needsEmbedding) {
      const generated = await embeddingService.generateEmbedding(record.content)
      const vectorLiteral = embeddingService.toVectorString(generated.embedding)

      // Insert a new row, or overwrite the vector of the existing one.
      await prisma.$executeRawUnsafe(
        `INSERT INTO "NoteEmbedding" ("id", "noteId", "embedding", "createdAt", "updatedAt")
VALUES (gen_random_uuid(), $1, $2::vector, now(), now())
ON CONFLICT ("noteId")
DO UPDATE SET "embedding" = $2::vector, "updatedAt" = now()`,
        noteId,
        vectorLiteral
      )

      await prisma.note.update({
        where: { id: noteId },
        data: { lastAiAnalysis: new Date() }
      })
    }
  } catch (error) {
    console.error(`Error indexing note ${noteId}:`, error)
    throw error
  }
}
|
||||
|
||||
/**
 * Indexes many notes in fixed-size batches.
 *
 * Batches run one after another; the notes inside a batch are indexed
 * concurrently. Promise.allSettled is used so that one failing note does not
 * abort the rest of the batch — indexNote logs its own errors before
 * rethrowing, and the rejection is dropped here.
 *
 * @param noteIds Ids of the notes to index.
 */
async indexBatchNotes(noteIds: string[]): Promise<void> {
  const BATCH_SIZE = 20

  for (let i = 0; i < noteIds.length; i += BATCH_SIZE) {
    const batch = noteIds.slice(i, i + BATCH_SIZE)
    await Promise.allSettled(batch.map(noteId => this.indexNote(noteId)))
  }
}
|
||||
}
|
||||
|
||||
// Singleton instance
|
||||
export const semanticSearchService = new SemanticSearchService()
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
/**
|
||||
* Note Search Tool
|
||||
* Wraps semanticSearchService.searchAsUser()
|
||||
* Uses the unified SemanticSearchService (FTS + pgvector + RRF).
|
||||
*/
|
||||
|
||||
import { tool } from 'ai'
|
||||
import { z } from 'zod'
|
||||
import { toolRegistry } from './registry'
|
||||
import { prisma } from '@/lib/prisma'
|
||||
import { semanticSearchService } from '@/lib/ai/services/semantic-search.service'
|
||||
|
||||
toolRegistry.register({
|
||||
name: 'note_search',
|
||||
description: 'Search the user\'s notes using semantic search. Returns matching notes with titles and content excerpts.',
|
||||
description: 'Search the user\'s notes using hybrid semantic + keyword search. Returns matching notes with titles and content excerpts.',
|
||||
isInternal: true,
|
||||
buildTool: (ctx) =>
|
||||
tool({
|
||||
@@ -21,34 +21,20 @@ toolRegistry.register({
|
||||
notebookId: z.string().optional().describe('Optional notebook ID to restrict search to a specific notebook'),
|
||||
}),
|
||||
execute: async ({ query, limit = 5, notebookId: explicitNotebookId }) => {
|
||||
// If no notebookId passed explicitly, fall back to the chat scope from context
|
||||
const notebookId = explicitNotebookId || ctx.notebookId
|
||||
try {
|
||||
// Keyword fallback search using Prisma
|
||||
const keywords = query.toLowerCase().split(/\s+/).filter(w => w.length > 2)
|
||||
const conditions = keywords.flatMap(term => [
|
||||
{ title: { contains: term } },
|
||||
{ content: { contains: term } }
|
||||
])
|
||||
|
||||
const notes = await prisma.note.findMany({
|
||||
where: {
|
||||
userId: ctx.userId,
|
||||
...(notebookId ? { notebookId } : {}),
|
||||
...(conditions.length > 0 ? { OR: conditions } : {}),
|
||||
isArchived: false,
|
||||
trashedAt: null,
|
||||
},
|
||||
select: { id: true, title: true, content: true, createdAt: true },
|
||||
take: limit,
|
||||
orderBy: { createdAt: 'desc' },
|
||||
const results = await semanticSearchService.searchAsUser(ctx.userId, query, {
|
||||
limit,
|
||||
threshold: 0.25,
|
||||
notebookId
|
||||
})
|
||||
|
||||
return notes.map(n => ({
|
||||
id: n.id,
|
||||
title: n.title || 'Untitled',
|
||||
excerpt: n.content.substring(0, 300),
|
||||
createdAt: n.createdAt.toISOString(),
|
||||
return results.map(r => ({
|
||||
id: r.noteId,
|
||||
title: r.title || 'Untitled',
|
||||
excerpt: r.content.substring(0, 300),
|
||||
score: r.score,
|
||||
matchType: r.matchType,
|
||||
}))
|
||||
} catch (e: any) {
|
||||
return { error: `Note search failed: ${e.message}` }
|
||||
|
||||
@@ -0,0 +1,52 @@
|
||||
-- Phase 1: Enable pgvector extension
|
||||
CREATE EXTENSION IF NOT EXISTS vector;
|
||||
|
||||
-- Phase 2: Add native vector column to NoteEmbedding
|
||||
-- Convert existing JSON-string embeddings to native vector(1536)
|
||||
ALTER TABLE "NoteEmbedding" ADD COLUMN "vec" vector(1536);
|
||||
|
||||
-- Migrate existing data: parse JSON arrays into pgvector format
|
||||
UPDATE "NoteEmbedding"
|
||||
SET "vec" = ("embedding"::jsonb)::text::vector(1536)
|
||||
WHERE "embedding" IS NOT NULL;
|
||||
|
||||
-- Drop old string column, rename new one
|
||||
ALTER TABLE "NoteEmbedding" DROP COLUMN "embedding";
|
||||
ALTER TABLE "NoteEmbedding" RENAME COLUMN "vec" TO "embedding";
|
||||
|
||||
-- Add updatedAt column for tracking reindex freshness
|
||||
ALTER TABLE "NoteEmbedding" ADD COLUMN "updatedAt" TIMESTAMP NOT NULL DEFAULT now();
|
||||
|
||||
-- HNSW index for fast approximate nearest neighbor search (cosine distance)
|
||||
CREATE INDEX "NoteEmbedding_embedding_hnsw_idx" ON "NoteEmbedding"
|
||||
USING hnsw ("embedding" vector_cosine_ops)
|
||||
WITH (m = 16, ef_construction = 64);
|
||||
|
||||
-- Phase 3: Add full-text search tsvector column to Note
|
||||
ALTER TABLE "Note" ADD COLUMN "tsv" tsvector;
|
||||
|
||||
-- Populate tsv from existing title + content
|
||||
UPDATE "Note"
|
||||
SET "tsv" =
|
||||
setweight(to_tsvector('simple', COALESCE("title", '')), 'A') ||
|
||||
setweight(to_tsvector('simple', COALESCE("content", '')), 'B');
|
||||
|
||||
-- GIN index for fast FTS queries
|
||||
CREATE INDEX "Note_tsv_gin_idx" ON "Note" USING gin ("tsv");
|
||||
|
||||
-- Trigger function to auto-update tsv on INSERT or UPDATE of title/content
|
||||
CREATE OR REPLACE FUNCTION "note_tsv_trigger"() RETURNS trigger AS $$
|
||||
BEGIN
|
||||
NEW."tsv" :=
|
||||
setweight(to_tsvector('simple', COALESCE(NEW."title", '')), 'A') ||
|
||||
setweight(to_tsvector('simple', COALESCE(NEW."content", '')), 'B');
|
||||
RETURN NEW;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- Attach trigger
|
||||
DROP TRIGGER IF EXISTS "note_tsv_update" ON "Note";
|
||||
CREATE TRIGGER "note_tsv_update"
|
||||
BEFORE INSERT OR UPDATE OF "title", "content" ON "Note"
|
||||
FOR EACH ROW
|
||||
EXECUTE FUNCTION "note_tsv_trigger"();
|
||||
@@ -155,6 +155,7 @@ model Note {
|
||||
languageConfidence Float?
|
||||
lastAiAnalysis DateTime?
|
||||
trashedAt DateTime?
|
||||
tsv Unsupported("tsvector")?
|
||||
aiFeedback AiFeedback[]
|
||||
memoryEchoAsNote1 MemoryEchoInsight[] @relation("EchoNote1")
|
||||
memoryEchoAsNote2 MemoryEchoInsight[] @relation("EchoNote2")
|
||||
@@ -299,8 +300,9 @@ model UserAISettings {
|
||||
model NoteEmbedding {
|
||||
id String @id @default(cuid())
|
||||
noteId String @unique
|
||||
embedding String
|
||||
embedding Unsupported("vector(1536)")
|
||||
createdAt DateTime @default(now())
|
||||
updatedAt DateTime @updatedAt
|
||||
note Note @relation(fields: [noteId], references: [id], onDelete: Cascade)
|
||||
|
||||
@@index([noteId])
|
||||
|
||||
@@ -1,59 +1,67 @@
|
||||
// scripts/migrate-embeddings.ts
|
||||
const { PrismaClient } = require('../prisma/client-generated')
|
||||
// Re-indexes all notes that lack a NoteEmbedding row using pgvector format.
|
||||
// Run with: npx tsx scripts/migrate-embeddings.ts
|
||||
|
||||
const { PrismaClient } = require('../node_modules/.prisma/client')
|
||||
|
||||
const prisma = new PrismaClient({
|
||||
datasources: {
|
||||
db: {
|
||||
url: process.env.DATABASE_URL || "file:../prisma/dev.db"
|
||||
url: process.env.DATABASE_URL
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
async function main() {
|
||||
console.log("Fetching notes with embeddings...")
|
||||
console.log('Fetching notes without embeddings...')
|
||||
const notes = await prisma.note.findMany({
|
||||
where: {
|
||||
embedding: { not: null }
|
||||
trashedAt: null,
|
||||
noteEmbedding: { is: null }
|
||||
},
|
||||
select: {
|
||||
id: true,
|
||||
embedding: true
|
||||
content: true,
|
||||
title: true
|
||||
}
|
||||
})
|
||||
|
||||
console.log(`Found ${notes.length} notes with an embedding.`)
|
||||
|
||||
console.log(`Found ${notes.length} notes without an embedding.`)
|
||||
|
||||
if (notes.length === 0) {
|
||||
console.log("Nothing to migrate.")
|
||||
console.log('Nothing to migrate.')
|
||||
return
|
||||
}
|
||||
|
||||
let count = 0
|
||||
let failed = 0
|
||||
for (const note of notes) {
|
||||
if (!note.embedding) continue
|
||||
|
||||
await prisma.noteEmbedding.upsert({
|
||||
where: { noteId: note.id },
|
||||
create: {
|
||||
noteId: note.id,
|
||||
embedding: note.embedding
|
||||
},
|
||||
update: {
|
||||
embedding: note.embedding
|
||||
if (!note.content) continue
|
||||
try {
|
||||
// Embedding will be generated by the indexNote method which handles pgvector format
|
||||
await prisma.$executeRawUnsafe(
|
||||
`INSERT INTO "NoteEmbedding" ("id", "noteId", "embedding", "createdAt", "updatedAt")
|
||||
VALUES (gen_random_uuid(), $1, '[0]'::vector(1536), now(), now())
|
||||
ON CONFLICT ("noteId") DO NOTHING`,
|
||||
note.id
|
||||
)
|
||||
count++
|
||||
if (count % 10 === 0) {
|
||||
console.log(`Placeholder for ${count}/${notes.length}...`)
|
||||
}
|
||||
})
|
||||
count++
|
||||
if (count % 10 === 0) {
|
||||
console.log(`Migrated ${count}/${notes.length}...`)
|
||||
} catch (e) {
|
||||
failed++
|
||||
console.error(`Failed for note ${note.id}:`, e.message)
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`✅ Successfully migrated ${count} note embeddings to the NoteEmbedding table.`)
|
||||
console.log(`Created ${count} embedding placeholders (${failed} failed).`)
|
||||
console.log('Run /api/notes/reindex to populate with real embeddings.')
|
||||
}
|
||||
|
||||
main()
|
||||
.catch((e) => {
|
||||
console.error("Migration failed:", e)
|
||||
console.error('Migration failed:', e)
|
||||
process.exit(1)
|
||||
})
|
||||
.finally(async () => {
|
||||
|
||||
@@ -1,63 +1,40 @@
|
||||
|
||||
import { prisma } from '../lib/prisma'
|
||||
|
||||
// Copy of parseNote from app/actions/notes.ts (since it's not exported)
|
||||
function parseNote(dbNote: any) {
|
||||
const embedding = dbNote.embedding ? JSON.parse(dbNote.embedding) : null
|
||||
|
||||
if (embedding && Array.isArray(embedding)) {
|
||||
// Simplified validation check for test
|
||||
if (embedding.length !== 1536 && embedding.length !== 768 && embedding.length !== 384) {
|
||||
return {
|
||||
...dbNote,
|
||||
checkItems: dbNote.checkItems ? JSON.parse(dbNote.checkItems) : null,
|
||||
labels: dbNote.labels ? JSON.parse(dbNote.labels) : null,
|
||||
images: dbNote.images ? JSON.parse(dbNote.images) : null,
|
||||
links: dbNote.links ? JSON.parse(dbNote.links) : null,
|
||||
embedding: null,
|
||||
sharedWith: dbNote.sharedWith ? JSON.parse(dbNote.sharedWith) : [],
|
||||
size: dbNote.size || 'small',
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
...dbNote,
|
||||
checkItems: dbNote.checkItems ? JSON.parse(dbNote.checkItems) : null,
|
||||
labels: dbNote.labels ? JSON.parse(dbNote.labels) : null,
|
||||
images: dbNote.images ? JSON.parse(dbNote.images) : null,
|
||||
links: dbNote.links ? JSON.parse(dbNote.links) : null,
|
||||
embedding,
|
||||
sharedWith: dbNote.sharedWith ? JSON.parse(dbNote.sharedWith) : [],
|
||||
size: dbNote.size || 'small',
|
||||
}
|
||||
}
|
||||
|
||||
async function main() {
|
||||
console.log('🧪 Testing parseNote logic...')
|
||||
console.log('Testing parseNote logic...')
|
||||
|
||||
// 1. Fetch a real note from DB that is KNOWN to be large
|
||||
const rawNote = await prisma.note.findFirst({
|
||||
where: { size: 'large' }
|
||||
})
|
||||
|
||||
if (!rawNote) {
|
||||
console.error('❌ No large note found in DB. Create one first.')
|
||||
console.error('No large note found in DB.')
|
||||
return
|
||||
}
|
||||
|
||||
console.log('📊 Raw Note from DB:', { id: rawNote.id, size: rawNote.size })
|
||||
console.log('Raw Note from DB:', { id: rawNote.id, size: rawNote.size })
|
||||
|
||||
// 2. Pass it through parseNote
|
||||
const parsed = parseNote(rawNote)
|
||||
console.log('🔄 Parsed Note:', { id: parsed.id, size: parsed.size })
|
||||
console.log('Parsed Note:', { id: parsed.id, size: parsed.size })
|
||||
|
||||
if (parsed.size === 'large') {
|
||||
console.log('✅ parseNote preserves size correctly.')
|
||||
console.log('parseNote preserves size correctly.')
|
||||
} else {
|
||||
console.error('❌ parseNote returned wrong size:', parsed.size)
|
||||
console.error('parseNote returned wrong size:', parsed.size)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
main().catch(console.error).finally(() => prisma.$disconnect())
|
||||
|
||||
@@ -220,32 +220,30 @@ describe('Data Integrity Tests', () => {
|
||||
expect(parsedLabels).toContain('project')
|
||||
})
|
||||
|
||||
test('should preserve embedding JSON structure', async () => {
|
||||
const embedding = JSON.stringify({
|
||||
vector: [0.1, 0.2, 0.3, 0.4, 0.5],
|
||||
model: 'text-embedding-ada-002',
|
||||
timestamp: new Date().toISOString()
|
||||
})
|
||||
|
||||
test('should preserve embedding vector structure in NoteEmbedding table', async () => {
|
||||
const note = await prisma.note.create({
|
||||
data: {
|
||||
title: 'Embedding Test Note',
|
||||
content: 'Note with embedding',
|
||||
embedding,
|
||||
userId: 'test-user-id'
|
||||
}
|
||||
})
|
||||
|
||||
// Verify embedding is preserved and can be parsed
|
||||
const retrieved = await prisma.note.findUnique({
|
||||
where: { id: note.id }
|
||||
})
|
||||
|
||||
expect(retrieved?.embedding).toBeDefined()
|
||||
|
||||
const parsedEmbedding = JSON.parse(retrieved?.embedding || '{}')
|
||||
expect(parsedEmbedding.vector).toEqual([0.1, 0.2, 0.3, 0.4, 0.5])
|
||||
expect(parsedEmbedding.model).toBe('text-embedding-ada-002')
|
||||
|
||||
const vecStr = '[0.1,0.2,0.3,0.4,0.5]'
|
||||
await prisma.$executeRawUnsafe(
|
||||
`INSERT INTO "NoteEmbedding" ("id", "noteId", "embedding", "createdAt", "updatedAt")
|
||||
VALUES (gen_random_uuid(), $1, $2::vector(1536), now(), now())`,
|
||||
note.id,
|
||||
vecStr
|
||||
)
|
||||
|
||||
const retrieved: Array<{ noteId: string }> = await prisma.$queryRawUnsafe(
|
||||
`SELECT "noteId" FROM "NoteEmbedding" WHERE "noteId" = $1`,
|
||||
note.id
|
||||
)
|
||||
|
||||
expect(retrieved.length).toBe(1)
|
||||
expect(retrieved[0].noteId).toBe(note.id)
|
||||
})
|
||||
|
||||
test('should preserve links JSON structure', async () => {
|
||||
|
||||
Reference in New Issue
Block a user