Some checks failed
Deploy to Production / Build and Deploy (push) Failing after 53s
- Redesign agents page with architectural-grid (8) design system: rounded-2xl cards, serif headings, motion tabs, dashed templates section - Replace agent form popup with full-page detail view (SettingsView style) with dark planning card, section tooltips, and help button - Hide advanced mode for slide/excalidraw generators - Add 'describe images' action to contextual AI assistant - Add copy button to action/resource preview with HTTP fallback - Add delete history button to agent run log panel - Increase AI word limit from 2000 to 5000 (reformulate + transform-markdown) - Increase max steps slider from 25 to 50 - Fix image description error with clear model compatibility message - Fix doubled execution count display in agent detail view - Remove dead files: notes-list-view.tsx, notes-view-toggle.tsx - Remove 'list' view mode from NotesViewMode type - Add missing i18n keys (back, configuration, options, copy, cleared)
208 lines
8.1 KiB
TypeScript
208 lines
8.1 KiB
TypeScript
import { generateText } from 'ai'
|
|
import { readFile } from 'fs/promises'
|
|
import path from 'path'
|
|
import { getChatProvider } from '../factory'
|
|
import { getSystemConfig } from '@/lib/config'
|
|
|
|
/**
 * Shape of the value returned by `describeImages`.
 *
 * Which fields are populated depends on the requested mode: title mode fills
 * `suggestions` and leaves `descriptions` empty, description mode fills
 * `descriptions` and, for several images, `combinedSummary`.
 */
export interface ImageDescriptionResult {
  /** Per-image descriptions; `index` identifies the image a text belongs to. */
  descriptions: { index: number; description: string }[]
  /**
   * Suggested titles. `confidence` is a percentage-style score (the model's
   * fractional confidence multiplied by 100 and rounded).
   */
  suggestions?: { title: string; confidence: number; reasoning?: string }[]
  /** Short overall summary written when several images were described. */
  combinedSummary?: string
}
|
|
|
|
/** Directory on disk that holds uploaded images (`<cwd>/data/uploads`). */
const UPLOAD_DIR = path.join(process.cwd(), 'data', 'uploads')

/** MIME types for known upload extensions; anything else is treated as JPEG. */
const UPLOAD_MIME_BY_EXTENSION = new Map<string, string>([
  ['.png', 'image/png'],
  ['.gif', 'image/gif'],
  ['.webp', 'image/webp'],
])

/** Matches URLs that already carry an http(s) origin. */
const HTTP_URL_PATTERN = /^https?:\/\//i

/** Downloads `url` and returns its body as a data URL, or `null` on any failure. */
async function fetchAsDataUrl(url: string): Promise<string | null> {
  try {
    const res = await fetch(url)
    if (!res.ok) return null
    const contentType = res.headers.get('content-type') || 'image/jpeg'
    const base64 = Buffer.from(await res.arrayBuffer()).toString('base64')
    return `data:${contentType};base64,${base64}`
  } catch {
    return null
  }
}

/** Maps an upload-relative path to an absolute path, or `null` if it escapes `UPLOAD_DIR`. */
function resolveUploadPath(relativePath: string): string | null {
  const candidate = path.join(UPLOAD_DIR, relativePath)
  return candidate.startsWith(UPLOAD_DIR + path.sep) ? candidate : null
}

/**
 * Loads an image and returns it as a `data:` URL.
 *
 * URLs containing `/uploads/` are first read from `UPLOAD_DIR`; if the file is
 * not on disk the image is fetched over HTTP instead. Every other URL is
 * fetched directly.
 *
 * @param imageUrl - App-relative upload path (`/uploads/…`) or absolute URL.
 * @returns The image as a base64 data URL, or `null` when it cannot be loaded
 *          or when the path points outside the uploads directory.
 */
async function resolveImageAsBase64(imageUrl: string): Promise<string | null> {
  const localMatch = imageUrl.match(/\/uploads\/(.+)/)

  if (!localMatch) {
    // NOTE(review): arbitrary remote URLs are fetched server-side here. If
    // `imageUrl` can come from untrusted users, consider a host allowlist (SSRF).
    return fetchAsDataUrl(imageUrl)
  }

  // Query strings and fragments are URL syntax, not part of the file name.
  const [, captured = ''] = localMatch
  const relativePath = captured.replace(/[?#][\s\S]*$/, '')

  // Refuse `../` traversal instead of reading (or proxying) an arbitrary path.
  const filePath = resolveUploadPath(relativePath)
  if (filePath === null) return null

  try {
    const buffer = await readFile(filePath)
    const ext = path.extname(filePath).toLowerCase()
    const mime = UPLOAD_MIME_BY_EXTENSION.get(ext) ?? 'image/jpeg'
    return `data:${mime};base64,${buffer.toString('base64')}`
  } catch {
    // File not on disk — fall back to HTTP. Absolute URLs are fetched as-is;
    // app-relative paths go through the internal API the browser uses.
    if (HTTP_URL_PATTERN.test(imageUrl)) {
      return fetchAsDataUrl(imageUrl)
    }
    const baseUrl = process.env.NEXTAUTH_URL || process.env.NEXT_PUBLIC_APP_URL || 'http://localhost:3000'
    return fetchAsDataUrl(`${baseUrl}${imageUrl}`)
  }
}
|
|
|
|
/** English language names, keyed by lowercase ISO 639-1 code, for use in prompts. */
const PROMPT_LANGUAGE_NAMES = new Map<string, string>([
  ['fr', 'French'], ['en', 'English'], ['fa', 'Persian'], ['ar', 'Arabic'],
  ['es', 'Spanish'], ['de', 'German'], ['it', 'Italian'], ['pt', 'Portuguese'],
  ['ru', 'Russian'], ['zh', 'Chinese'], ['ja', 'Japanese'], ['ko', 'Korean'],
  ['hi', 'Hindi'], ['nl', 'Dutch'], ['pl', 'Polish'],
])

/** Error shown when the configured model rejects image input. */
const VISION_UNSUPPORTED_MESSAGE =
  'Your AI model does not support image analysis. Please switch to a vision-capable model (e.g., gpt-4o, claude-3.5-sonnet, gemini-2.0-flash).'

/** Substrings of a provider error message that are taken to mean "no vision support". */
const VISION_ERROR_HINTS = ['image', 'vision', 'multimodal'] as const

/** Response-format instruction shared by both title prompts. */
const TITLE_RESPONSE_FORMAT =
  'Respond ONLY with a JSON array: [{"title": "title1", "confidence": 0.95}, {"title": "title2", "confidence": 0.85}, {"title": "title3", "confidence": 0.75}]'

/** A single part of a multimodal user message. */
type VisionPart = { type: 'text'; text: string } | { type: 'image'; image: string }

/** An image that was loaded successfully, with its position in the caller's list. */
type LoadedImage = { index: number; dataUrl: string }

/** One entry of `ImageDescriptionResult.suggestions`. */
type TitleSuggestion = NonNullable<ImageDescriptionResult['suggestions']>[number]

/** Maps a language code (`fr`, `en-US`, `pt_BR`, …) to its English name; unknown codes give "English". */
function resolveLanguageName(language: string): string {
  const code = typeof language === 'string' ? language.trim().toLowerCase() : ''
  const [primary = ''] = code.split(/[-_]/)
  return PROMPT_LANGUAGE_NAMES.get(code) ?? PROMPT_LANGUAGE_NAMES.get(primary) ?? 'English'
}

/** Builds the title-generation prompt for one image or for a set of images. */
function buildTitlePrompt(langName: string, singleImage: boolean): string {
  const lines = singleImage
    ? [
        'Look carefully at this image and identify every concrete detail you can see: objects, people, animals, text, logos, colors, location/setting, actions, weather, time of day, style (photo/illustration/diagram), and any notable elements.',
        '',
        `Then generate 3 specific, descriptive titles (3-7 words each) in ${langName}. Each title must mention concrete elements actually visible in the image — do NOT use generic or abstract words like "beautiful scene", "interesting image", "visual content". Be precise and factual.`,
        '',
        'Good example: "Red bicycle parked near a brick café wall"',
        'Bad example: "Beautiful urban scenery"',
      ]
    : [
        'Look carefully at these images and identify every concrete detail visible: objects, people, animals, text, logos, colors, locations, actions, weather, styles, and any notable elements across all images.',
        '',
        `Then generate 3 specific, descriptive titles (3-7 words each) in ${langName} that capture what these images collectively show. Each title must mention concrete elements actually visible — do NOT use generic or abstract words like "beautiful scenes", "collection of images". Be precise and factual.`,
        '',
        'Good example: "Red bicycle and brick café on a sunny street"',
        'Bad example: "Beautiful urban scenery collection"',
      ]
  return [...lines, '', TITLE_RESPONSE_FORMAT].join('\n')
}

/** Extracts a string `message` from an unknown thrown value, or `''`. */
function errorMessageOf(error: unknown): string {
  if (typeof error === 'object' && error !== null && 'message' in error) {
    const { message } = error as { message?: unknown }
    return typeof message === 'string' ? message : ''
  }
  return ''
}

/**
 * Sends one multimodal user message to the model and returns the reply text.
 * Provider errors that look like "model cannot handle images" are replaced by
 * a user-facing message; the original error is kept on `cause`.
 */
async function generateVisionText(
  model: Parameters<typeof generateText>[0]['model'],
  content: VisionPart[]
): Promise<string> {
  try {
    const result = await generateText({
      model,
      messages: [{ role: 'user', content }],
    })
    return result.text
  } catch (error: unknown) {
    const message = errorMessageOf(error)
    if (VISION_ERROR_HINTS.some(hint => message.includes(hint))) {
      const friendly: Error & { cause?: unknown } = new Error(VISION_UNSUPPORTED_MESSAGE)
      friendly.cause = error
      throw friendly
    }
    throw error
  }
}

/**
 * Converts a model-reported confidence to an integer percentage in [0, 100].
 * Missing, zero or non-numeric values default to 50; values above 1 are
 * assumed to already be percentages.
 */
function toConfidencePercent(raw: unknown): number {
  const value = typeof raw === 'number' ? raw : typeof raw === 'string' ? Number(raw) : Number.NaN
  const usable = Number.isFinite(value) && value !== 0 ? value : 0.5
  const percent = usable > 1 ? usable : usable * 100
  return Math.min(100, Math.max(0, Math.round(percent)))
}

/**
 * Parses the model's title reply into suggestions.
 * Output that contains no JSON array, is not valid JSON, or has entries
 * without a usable string title yields fewer (possibly zero) suggestions
 * rather than an exception.
 */
function parseTitleSuggestions(text: string): TitleSuggestion[] {
  const jsonMatch = text.match(/\[[\s\S]*\]/)
  if (!jsonMatch) return []

  let parsed: unknown
  try {
    parsed = JSON.parse(jsonMatch[0])
  } catch {
    // Model output is untrusted text; treat unparsable JSON like "no suggestions".
    return []
  }
  if (!Array.isArray(parsed)) return []

  const suggestions: TitleSuggestion[] = []
  for (const entry of parsed) {
    if (typeof entry !== 'object' || entry === null) continue
    const { title, confidence } = entry as { title?: unknown; confidence?: unknown }
    if (typeof title !== 'string') continue
    const cleaned = title.trim().replace(/^["']|["']$/g, '')
    if (!cleaned) continue
    suggestions.push({
      title: cleaned,
      confidence: toConfidencePercent(confidence),
      reasoning: undefined,
    })
  }
  return suggestions
}

/**
 * Describes images, or suggests titles for them, using the configured chat model.
 *
 * - `mode: 'title'` sends all loadable images in one request and returns up to
 *   three title `suggestions` (with `descriptions` empty).
 * - `mode: 'description'` with exactly one URL returns a single description.
 * - `mode: 'description'` with several URLs describes each loadable image in
 *   turn, then asks the model for a short `combinedSummary`.
 *
 * @param imageUrls - Image URLs; each is loaded via `resolveImageAsBase64`.
 *                    Images that fail to load are skipped.
 * @param mode - `'description'` or `'title'`.
 * @param language - Language code for the generated text (default `'fr'`).
 *                   Region suffixes such as `en-US` are accepted; unknown codes
 *                   fall back to English.
 * @returns Descriptions whose `index` is the image's position in `imageUrls`,
 *          and/or title suggestions.
 * @throws Error when none of the images can be loaded, or when the model
 *         appears not to support image input. Other provider errors are rethrown.
 */
export async function describeImages(
  imageUrls: string[],
  mode: 'description' | 'title',
  language: string = 'fr'
): Promise<ImageDescriptionResult> {
  const config = await getSystemConfig()
  const model = getChatProvider(config).getModel()
  const langName = resolveLanguageName(language)

  // Keep each image's original position so results can be matched to
  // `imageUrls` even when some images fail to load.
  const resolved = await Promise.all(imageUrls.map(url => resolveImageAsBase64(url)))
  const loaded: LoadedImage[] = []
  resolved.forEach((dataUrl, index) => {
    if (dataUrl !== null) loaded.push({ index, dataUrl })
  })

  const [firstImage] = loaded
  if (firstImage === undefined) {
    throw new Error('Could not load any of the provided images. Please check the image URLs.')
  }

  const imagePart = (dataUrl: string): VisionPart => ({ type: 'image', image: dataUrl })

  if (mode === 'title') {
    // Word the prompt for what the model actually receives.
    const content: VisionPart[] = [
      { type: 'text', text: buildTitlePrompt(langName, loaded.length === 1) },
      ...loaded.map(image => imagePart(image.dataUrl)),
    ]
    const text = await generateVisionText(model, content)
    return {
      descriptions: [],
      suggestions: parseTitleSuggestions(text),
    }
  }

  // Single image description
  if (imageUrls.length === 1) {
    const text = await generateVisionText(model, [
      { type: 'text', text: `Describe this image in detail in ${langName}. Be specific about what you see: objects, people, colors, setting, mood, text visible. Keep it under 100 words.` },
      imagePart(firstImage.dataUrl),
    ])
    return {
      descriptions: [{ index: firstImage.index, description: text.trim() }],
    }
  }

  // Multiple images: describe each individually. Requests stay sequential so a
  // large batch does not fan out into many concurrent provider calls.
  const descriptions: Array<{ index: number; description: string }> = []
  for (const [position, image] of loaded.entries()) {
    const text = await generateVisionText(model, [
      { type: 'text', text: `Describe this image (image ${position + 1} of ${loaded.length}) in ${langName}. Be specific: objects, people, colors, setting, text visible. Under 80 words.` },
      imagePart(image.dataUrl),
    ])
    descriptions.push({ index: image.index, description: text.trim() })
  }

  // Combined summary (text-only request)
  const allDescriptions = descriptions.map(d => d.description).join('\n')
  const { text: summary } = await generateText({
    model,
    prompt: `Based on these individual image descriptions, write a brief (1-2 sentence) overall summary in ${langName} of what these images collectively show:\n\n${allDescriptions}`,
  })

  return {
    descriptions,
    combinedSummary: summary.trim(),
  }
}
|