Files
Momento/memento-note/lib/ai/services/image-description.service.ts
Antigravity 2fd435df6f
Some checks failed
Deploy to Production / Build and Deploy (push) Failing after 53s
feat: redesign agents page (architectural-grid style), add image description, fix AI limits, remove dead code
- Redesign agents page with architectural-grid (8) design system:
  rounded-2xl cards, serif headings, motion tabs, dashed templates section
- Replace agent form popup with full-page detail view (SettingsView style)
  with dark planning card, section tooltips, and help button
- Hide advanced mode for slide/excalidraw generators
- Add 'describe images' action to contextual AI assistant
- Add copy button to action/resource preview with HTTP fallback
- Add delete history button to agent run log panel
- Increase AI word limit from 2000 to 5000 (reformulate + transform-markdown)
- Increase max steps slider from 25 to 50
- Fix image description error with clear model compatibility message
- Fix doubled execution count display in agent detail view
- Remove dead files: notes-list-view.tsx, notes-view-toggle.tsx
- Remove 'list' view mode from NotesViewMode type
- Add missing i18n keys (back, configuration, options, copy, cleared)
2026-05-09 17:18:47 +00:00

208 lines
8.1 KiB
TypeScript

import { generateText } from 'ai'
import { readFile } from 'fs/promises'
import path from 'path'
import { getChatProvider } from '../factory'
import { getSystemConfig } from '@/lib/config'
/**
 * Shape of the value produced by the image description service.
 *
 * Which fields are populated depends on the requested mode: title requests
 * fill `suggestions` and leave `descriptions` empty, description requests
 * fill `descriptions` and, when several images are described, `combinedSummary`.
 */
export interface ImageDescriptionResult {
  /** Per-image descriptions; `index` identifies the image the text belongs to. */
  descriptions: { index: number; description: string }[]
  /** Candidate titles; `confidence` is stored as a rounded percentage. */
  suggestions?: { title: string; confidence: number; reasoning?: string }[]
  /** Short overall summary written when more than one image is described. */
  combinedSummary?: string
}
/** Directory on disk that backs URLs containing `/uploads/`. */
const UPLOAD_DIR = path.join(process.cwd(), 'data', 'uploads')

/** MIME types for known image extensions; anything else is labelled `image/jpeg`. */
const MIME_BY_EXTENSION = new Map<string, string>([
  ['.png', 'image/png'],
  ['.gif', 'image/gif'],
  ['.webp', 'image/webp'],
  ['.jpg', 'image/jpeg'],
  ['.jpeg', 'image/jpeg'],
  ['.svg', 'image/svg+xml'],
  ['.avif', 'image/avif'],
  ['.bmp', 'image/bmp'],
])

/**
 * Downloads `url` and encodes the response body as a base64 data URL.
 *
 * @returns The data URL, or `null` when the request fails or the response is not OK.
 */
async function fetchAsDataUrl(url: string): Promise<string | null> {
  try {
    const res = await fetch(url)
    if (!res.ok) return null
    // `||` on purpose: an empty content-type header should also get the default.
    const contentType = res.headers.get('content-type') || 'image/jpeg'
    const bytes = Buffer.from(await res.arrayBuffer())
    return `data:${contentType};base64,${bytes.toString('base64')}`
  } catch {
    return null
  }
}

/**
 * Turns an image URL into a base64 data URL.
 *
 * URLs whose path contains `/uploads/` are read from {@link UPLOAD_DIR} first and
 * fetched over HTTP only when the file is not on disk. Every other URL is fetched
 * directly.
 *
 * @param imageUrl Relative (`/uploads/a.png`) or absolute image URL.
 * @returns The data URL, or `null` when the image cannot be loaded or the path
 *          would escape the upload directory.
 */
async function resolveImageAsBase64(imageUrl: string): Promise<string | null> {
  // Query strings and fragments are not part of the file name on disk.
  const pathPart = imageUrl.split(/[?#]/, 1)[0] ?? imageUrl
  const localMatch = pathPart.match(/\/uploads\/(.+)/)

  if (!localMatch) {
    // NOTE(review): this fetches an arbitrary caller-supplied URL from the server
    // (SSRF surface). Consider an allow-list if imageUrl can come from untrusted users.
    return fetchAsDataUrl(imageUrl)
  }

  // path.join normalises `..` segments, so the containment check below sees the
  // real target and rejects anything outside the upload directory.
  const filePath = path.join(UPLOAD_DIR, localMatch[1] ?? '')
  if (!filePath.startsWith(UPLOAD_DIR + path.sep)) {
    return null
  }

  try {
    const buffer = await readFile(filePath)
    const mime = MIME_BY_EXTENSION.get(path.extname(filePath).toLowerCase()) ?? 'image/jpeg'
    return `data:${mime};base64,${buffer.toString('base64')}`
  } catch {
    // File not on disk — fall back to HTTP. Absolute URLs are fetched as-is;
    // relative ones go through the app's own base URL (same path the browser uses).
    const isAbsolute = /^[a-z][a-z\d+.-]*:\/\//i.test(imageUrl)
    const baseUrl = process.env.NEXTAUTH_URL || process.env.NEXT_PUBLIC_APP_URL || 'http://localhost:3000'
    return fetchAsDataUrl(isAbsolute ? imageUrl : `${baseUrl}${imageUrl}`)
  }
}
/** English language names keyed by the language codes this service understands. */
const LANGUAGE_NAMES = new Map<string, string>([
  ['fr', 'French'], ['en', 'English'], ['fa', 'Persian'], ['ar', 'Arabic'],
  ['es', 'Spanish'], ['de', 'German'], ['it', 'Italian'], ['pt', 'Portuguese'],
  ['ru', 'Russian'], ['zh', 'Chinese'], ['ja', 'Japanese'], ['ko', 'Korean'],
  ['hi', 'Hindi'], ['nl', 'Dutch'], ['pl', 'Polish'],
])

/** Message shown to the user when the configured model rejects image input. */
const VISION_UNSUPPORTED_MESSAGE =
  'Your AI model does not support image analysis. Please switch to a vision-capable model (e.g., gpt-4o, claude-3.5-sonnet, gemini-2.0-flash).'

/**
 * Substrings that mark a provider error as "model cannot handle images".
 * NOTE(review): 'image' is very broad and will also match unrelated errors that
 * merely mention images (size limits, download failures) — confirm this is intended.
 */
const VISION_ERROR_MARKERS = ['image', 'vision', 'multimodal']

/** An image that was loaded successfully, with its position in the caller's input. */
interface LoadedImage {
  index: number
  dataUrl: string
}

/** Maps a language code (optionally with a region, e.g. `fr-FR`) to its English name; unknown codes give 'English'. */
function resolveLanguageName(language: string): string {
  const normalized = typeof language === 'string' ? language.trim().toLowerCase() : ''
  const primary = normalized.split(/[-_]/, 1)[0] ?? normalized
  return LANGUAGE_NAMES.get(normalized) ?? LANGUAGE_NAMES.get(primary) ?? 'English'
}

/** Tells whether a thrown value looks like a "model does not support images" provider error. */
function isVisionUnsupportedError(error: unknown): boolean {
  if (typeof error !== 'object' || error === null) return false
  const message = (error as { message?: unknown }).message
  if (typeof message !== 'string') return false
  return VISION_ERROR_MARKERS.some(marker => message.includes(marker))
}

/** Builds the title-generation prompt, worded for one image or for several. */
function buildTitlePrompt(langName: string, multiple: boolean): string {
  const lines = multiple
    ? [
        `Look carefully at these images and identify every concrete detail visible: objects, people, animals, text, logos, colors, locations, actions, weather, styles, and any notable elements across all images.`,
        `Then generate 3 specific, descriptive titles (3-7 words each) in ${langName} that capture what these images collectively show. Each title must mention concrete elements actually visible — do NOT use generic or abstract words like "beautiful scenes", "collection of images". Be precise and factual.`,
        `Good example: "Red bicycle and brick café on a sunny street"`,
        `Bad example: "Beautiful urban scenery collection"`,
      ]
    : [
        `Look carefully at this image and identify every concrete detail you can see: objects, people, animals, text, logos, colors, location/setting, actions, weather, time of day, style (photo/illustration/diagram), and any notable elements.`,
        `Then generate 3 specific, descriptive titles (3-7 words each) in ${langName}. Each title must mention concrete elements actually visible in the image — do NOT use generic or abstract words like "beautiful scene", "interesting image", "visual content". Be precise and factual.`,
        `Good example: "Red bicycle parked near a brick café wall"`,
        `Bad example: "Beautiful urban scenery"`,
      ]
  lines.push(
    `Respond ONLY with a JSON array: [{"title": "title1", "confidence": 0.95}, {"title": "title2", "confidence": 0.85}, {"title": "title3", "confidence": 0.75}]`
  )
  return lines.join('\n')
}

/**
 * Extracts title suggestions from the model's reply.
 * Malformed JSON or unusable entries are skipped rather than thrown, matching the
 * existing "no JSON array found → no suggestions" behaviour.
 */
function parseTitleSuggestions(text: string): NonNullable<ImageDescriptionResult['suggestions']> {
  const jsonMatch = text.match(/\[[\s\S]*\]/)
  if (!jsonMatch) return []

  let parsed: unknown
  try {
    parsed = JSON.parse(jsonMatch[0])
  } catch {
    return []
  }
  if (!Array.isArray(parsed)) return []

  const suggestions: NonNullable<ImageDescriptionResult['suggestions']> = []
  for (const item of parsed) {
    if (typeof item !== 'object' || item === null) continue
    const { title, confidence } = item as { title?: unknown; confidence?: unknown }
    if (typeof title !== 'string') continue

    const cleaned = title.trim().replace(/^["']|["']$/g, '')
    if (!cleaned) continue

    // The prompt asks for a 0..1 score; accept numeric strings, default to 0.5
    // when missing/invalid, and clamp so the percentage always stays in 0..100.
    const numeric = typeof confidence === 'string' ? Number(confidence) : confidence
    const ratio = typeof numeric === 'number' && Number.isFinite(numeric)
      ? Math.min(Math.max(numeric, 0), 1)
      : 0.5

    suggestions.push({ title: cleaned, confidence: Math.round(ratio * 100), reasoning: undefined })
  }
  return suggestions
}

/**
 * Describes images, or proposes titles for them, using the configured chat model.
 *
 * @param imageUrls URLs of the images to analyse; images that cannot be loaded are skipped.
 * @param mode `'title'` returns title suggestions, `'description'` returns per-image descriptions.
 * @param language Language code for the generated text (default `'fr'`); unknown codes fall back to English.
 * @returns Title mode: `suggestions` with empty `descriptions`. Description mode: one
 *          description per loaded image, whose `index` is the image's position in
 *          `imageUrls`, plus `combinedSummary` when more than one image was described.
 * @throws Error when no image can be loaded, or when the model rejects image input.
 */
export async function describeImages(
  imageUrls: string[],
  mode: 'description' | 'title',
  language: string = 'fr'
): Promise<ImageDescriptionResult> {
  const langName = resolveLanguageName(language)

  // Load images before touching the model so a request with nothing usable fails fast.
  const resolved = await Promise.all(imageUrls.map(url => resolveImageAsBase64(url)))
  const loaded = resolved.flatMap((dataUrl, index): LoadedImage[] =>
    dataUrl === null ? [] : [{ index, dataUrl }]
  )
  const [first, ...others] = loaded
  if (first === undefined) {
    throw new Error('Could not load any of the provided images. Please check the image URLs.')
  }

  const config = await getSystemConfig()
  const model = getChatProvider(config).getModel()

  // Sends one text prompt plus images to the model and translates
  // "no vision support" provider errors into a user-facing message.
  const askVision = async (prompt: string, dataUrls: readonly string[]): Promise<string> => {
    const content: Array<{ type: 'text'; text: string } | { type: 'image'; image: string }> = [
      { type: 'text', text: prompt },
      ...dataUrls.map(dataUrl => ({ type: 'image' as const, image: dataUrl })),
    ]
    try {
      const result = await generateText({
        model,
        messages: [{ role: 'user', content }],
      })
      return result.text
    } catch (error: unknown) {
      if (isVisionUnsupportedError(error)) {
        throw new Error(VISION_UNSUPPORTED_MESSAGE)
      }
      throw error
    }
  }

  if (mode === 'title') {
    // Word the prompt for what the model will actually see, not for what was requested.
    const prompt = buildTitlePrompt(langName, others.length > 0)
    const text = await askVision(prompt, loaded.map(image => image.dataUrl))
    return {
      descriptions: [],
      suggestions: parseTitleSuggestions(text),
    }
  }

  // Exactly one usable image: a single, more detailed description and no summary.
  if (others.length === 0) {
    const text = await askVision(
      `Describe this image in detail in ${langName}. Be specific about what you see: objects, people, colors, setting, mood, text visible. Keep it under 100 words.`,
      [first.dataUrl]
    )
    return {
      descriptions: [{ index: first.index, description: text.trim() }],
    }
  }

  // Several images: describe each one on its own, one request at a time.
  const descriptions: ImageDescriptionResult['descriptions'] = []
  for (const [position, image] of loaded.entries()) {
    const text = await askVision(
      `Describe this image (image ${position + 1} of ${loaded.length}) in ${langName}. Be specific: objects, people, colors, setting, text visible. Under 80 words.`,
      [image.dataUrl]
    )
    // Keep the caller's original index so results line up with `imageUrls`
    // even when some images failed to load.
    descriptions.push({ index: image.index, description: text.trim() })
  }

  // Combined summary built from the text descriptions only (no images re-sent).
  const allDescriptions = descriptions.map(d => d.description).join('\n')
  const { text: summary } = await generateText({
    model,
    prompt: `Based on these individual image descriptions, write a brief (1-2 sentence) overall summary in ${langName} of what these images collectively show:\n\n${allDescriptions}`,
  })

  return {
    descriptions,
    combinedSummary: summary.trim(),
  }
}