// Momento/memento-note/lib/ai/services/image-description.service.ts

import { generateText } from 'ai'
import { readFile } from 'fs/promises'
import path from 'path'
import { getChatProvider } from '../factory'
import { getSystemConfig } from '@/lib/config'

/**
 * Result returned by `describeImages`.
 *
 * Which fields are populated depends on the requested mode: title mode fills
 * `suggestions` and leaves `descriptions` empty, while description mode fills
 * `descriptions` and, when several images were requested, `combinedSummary`.
 */
export interface ImageDescriptionResult {
  /** One entry per described image. */
  descriptions: {
    /** Zero-based position of the image this description belongs to. */
    index: number
    /** Model-generated description text. */
    description: string
  }[]
  /** Candidate titles (title mode only). */
  suggestions?: {
    /** Suggested title text. */
    title: string
    /** Confidence score; `describeImages` reports it as a rounded percentage. */
    confidence: number
    /** Optional explanation for the suggestion. */
    reasoning?: string
  }[]
  /** Short overall summary covering all described images. */
  combinedSummary?: string
}

/** Directory on disk that backs the `/uploads/...` URLs. */
const UPLOAD_DIR = path.join(process.cwd(), 'data', 'uploads')

/** MIME types inferred from a local file's extension; anything else is treated as JPEG. */
const MIME_BY_EXTENSION: Record<string, string> = {
  '.png': 'image/png',
  '.gif': 'image/gif',
  '.webp': 'image/webp',
}

/**
 * Maps the part of a URL that follows `/uploads/` to an absolute path inside
 * UPLOAD_DIR. Returns null when the path would escape the upload directory
 * (e.g. through `..` segments).
 */
function resolveUploadPath(relativePath: string): string | null {
  const candidate = path.resolve(UPLOAD_DIR, relativePath)
  return candidate.startsWith(UPLOAD_DIR + path.sep) ? candidate : null
}

/**
 * Downloads `url` and encodes the body as a data URL, using the response's
 * content type (JPEG when the header is missing). Returns null on a non-OK
 * status or any network/parsing error.
 */
async function fetchAsDataUrl(url: string): Promise<string | null> {
  try {
    const res = await fetch(url)
    if (!res.ok) return null
    const contentType = res.headers.get('content-type') || 'image/jpeg'
    const base64 = Buffer.from(await res.arrayBuffer()).toString('base64')
    return `data:${contentType};base64,${base64}`
  } catch {
    return null
  }
}

/**
 * Loads an image and returns it as a base64 data URL.
 *
 * URLs containing `/uploads/<path>` are first looked up in UPLOAD_DIR; any
 * query string or fragment is ignored for the lookup. When the file is not on
 * disk, a relative URL is fetched through the app's own base URL and an
 * absolute URL is fetched directly. All other URLs are fetched as-is.
 *
 * NOTE(review): remote URLs are fetched server-side without host restrictions;
 * callers should only pass URLs they trust.
 *
 * @param imageUrl - Relative `/uploads/...` path or absolute image URL.
 * @returns The data URL, or null when the image cannot be loaded or the upload
 *   path points outside the upload directory.
 */
async function resolveImageAsBase64(imageUrl: string): Promise<string | null> {
  const isAbsoluteUrl = /^[a-z][a-z0-9+.-]*:/i.test(imageUrl)
  // Stop at `?` / `#` so cache-busting suffixes do not become part of the filename.
  const localMatch = imageUrl.match(/\/uploads\/([^?#]+)/)

  if (localMatch) {
    const filePath = resolveUploadPath(localMatch[1])
    if (filePath !== null) {
      try {
        const buffer = await readFile(filePath)
        const mime = MIME_BY_EXTENSION[path.extname(filePath).toLowerCase()] ?? 'image/jpeg'
        return `data:${mime};base64,${buffer.toString('base64')}`
      } catch {
        // File not on disk — fall through to the HTTP fallback below.
      }
    } else if (!isAbsoluteUrl) {
      // Relative path that escapes the upload directory: refuse to serve it.
      return null
    }

    if (!isAbsoluteUrl) {
      // Same path the browser uses, resolved against the app's own origin.
      const baseUrl = process.env.NEXTAUTH_URL || process.env.NEXT_PUBLIC_APP_URL || 'http://localhost:3000'
      return fetchAsDataUrl(`${baseUrl}${imageUrl}`)
    }
  }

  // Remote URL — fetch and convert.
  return fetchAsDataUrl(imageUrl)
}

/** Error shown when the provider failure looks like missing vision support. */
const VISION_UNSUPPORTED_MESSAGE = 'Your AI model does not support image analysis. Please switch to a vision-capable model (e.g., gpt-4o, claude-3.5-sonnet, gemini-2.0-flash).'

/** Language codes understood by the prompts, mapped to the name used in the prompt text. */
const LANGUAGE_NAMES: Record<string, string> = {
  fr: 'French', en: 'English', fa: 'Persian', ar: 'Arabic',
  es: 'Spanish', de: 'German', it: 'Italian', pt: 'Portuguese',
  ru: 'Russian', zh: 'Chinese', ja: 'Japanese', ko: 'Korean',
  hi: 'Hindi', nl: 'Dutch', pl: 'Polish',
}

/** Parts of the multi-modal user message sent to the model. */
type UserContentPart =
  | { type: 'text'; text: string }
  | { type: 'image'; image: string }

/** Resolves a language code (optionally with a region, e.g. `fr-FR`) to its English name; unknown codes yield 'English'. */
function resolveLanguageName(language: string): string {
  const normalized = language.trim().toLowerCase()
  const primary = normalized.split(/[-_]/)[0]
  for (const code of [normalized, primary]) {
    // Own-property check so keys such as "constructor" do not hit the prototype.
    if (Object.prototype.hasOwnProperty.call(LANGUAGE_NAMES, code)) {
      return LANGUAGE_NAMES[code]
    }
  }
  return 'English'
}

/** Converts a model-reported confidence into a rounded percentage clamped to 0–100 (50 when missing or invalid). */
function normalizeConfidence(value: unknown): number {
  const raw = typeof value === 'string' && value.trim() !== '' ? Number(value) : value
  const score = typeof raw === 'number' && Number.isFinite(raw) ? raw : 0.5
  // Values above 1 are assumed to already be percentages.
  const percent = score > 1 ? score : score * 100
  return Math.min(100, Math.max(0, Math.round(percent)))
}

/** Extracts title suggestions from the model's reply; malformed or missing JSON yields an empty list. */
function parseTitleSuggestions(text: string): NonNullable<ImageDescriptionResult['suggestions']> {
  const jsonMatch = text.match(/\[[\s\S]*\]/)
  if (!jsonMatch) return []

  let parsed: unknown
  try {
    parsed = JSON.parse(jsonMatch[0])
  } catch {
    // The model did not return valid JSON — treat it as "no suggestions".
    return []
  }
  if (!Array.isArray(parsed)) return []

  const suggestions: NonNullable<ImageDescriptionResult['suggestions']> = []
  for (const entry of parsed) {
    if (typeof entry !== 'object' || entry === null) continue
    const { title, confidence } = entry as { title?: unknown; confidence?: unknown }
    if (typeof title !== 'string') continue
    const cleaned = title.trim().replace(/^["']|["']$/g, '')
    if (!cleaned) continue
    suggestions.push({
      title: cleaned,
      confidence: normalizeConfidence(confidence),
      reasoning: undefined,
    })
  }
  return suggestions
}

/**
 * Asks the configured chat model to describe images or to suggest titles for them.
 *
 * Images are loaded through `resolveImageAsBase64`; images that cannot be
 * loaded are skipped. In description mode every returned `index` refers to the
 * image's position in `imageUrls`, even when earlier images failed to load.
 *
 * @param imageUrls - Images to analyse (upload paths or absolute URLs).
 * @param mode - `'title'` returns title suggestions; `'description'` returns
 *   one description per loaded image plus, when several URLs were passed, a
 *   combined summary.
 * @param language - Language code for the generated text; unknown codes fall
 *   back to English.
 * @returns Suggestions (title mode) or descriptions (description mode).
 * @throws Error when none of the images can be loaded, or when the provider
 *   error indicates the model cannot process images. Other provider errors are
 *   rethrown unchanged.
 */
export async function describeImages(
  imageUrls: string[],
  mode: 'description' | 'title',
  language: string = 'fr'
): Promise<ImageDescriptionResult> {
  const config = await getSystemConfig()
  const model = getChatProvider(config).getModel()
  const langName = resolveLanguageName(language)

  const resolved = await Promise.all(imageUrls.map(url => resolveImageAsBase64(url)))
  // Keep each image's original position so results can be matched back to `imageUrls`.
  const loaded = resolved.flatMap((dataUrl, index) => (dataUrl === null ? [] : [{ index, dataUrl }]))

  if (loaded.length === 0) {
    throw new Error('Could not load any of the provided images. Please check the image URLs.')
  }

  const imagePart = (dataUrl: string): UserContentPart => ({ type: 'image', image: dataUrl })

  // Sends one multi-modal user message and translates "no vision support" failures.
  const askModel = async (content: UserContentPart[]): Promise<string> => {
    try {
      const result = await generateText({
        model,
        messages: [{ role: 'user', content }],
      })
      return result.text
    } catch (e: unknown) {
      let message = ''
      if (e instanceof Error) {
        message = e.message
      } else if (typeof e === 'object' && e !== null) {
        const candidate = (e as { message?: unknown }).message
        if (typeof candidate === 'string') message = candidate
      }
      // Heuristic: provider errors that mention images usually mean the model is text-only.
      if (['image', 'vision', 'multimodal'].some(hint => message.includes(hint))) {
        throw new Error(VISION_UNSUPPORTED_MESSAGE)
      }
      throw e
    }
  }

  if (mode === 'title') {
    // Choose the wording from the number of images actually sent to the model.
    const prompt = loaded.length === 1
      ? `Look carefully at this image and identify every concrete detail you can see: objects, people, animals, text, logos, colors, location/setting, actions, weather, time of day, style (photo/illustration/diagram), and any notable elements.

Then generate 3 specific, descriptive titles (3-7 words each) in ${langName}. Each title must mention concrete elements actually visible in the image — do NOT use generic or abstract words like "beautiful scene", "interesting image", "visual content". Be precise and factual.

Good example: "Red bicycle parked near a brick café wall"
Bad example: "Beautiful urban scenery"

Respond ONLY with a JSON array: [{"title": "title1", "confidence": 0.95}, {"title": "title2", "confidence": 0.85}, {"title": "title3", "confidence": 0.75}]`
      : `Look carefully at these images and identify every concrete detail visible: objects, people, animals, text, logos, colors, locations, actions, weather, styles, and any notable elements across all images.

Then generate 3 specific, descriptive titles (3-7 words each) in ${langName} that capture what these images collectively show. Each title must mention concrete elements actually visible — do NOT use generic or abstract words like "beautiful scenes", "collection of images". Be precise and factual.

Good example: "Red bicycle and brick café on a sunny street"
Bad example: "Beautiful urban scenery collection"

Respond ONLY with a JSON array: [{"title": "title1", "confidence": 0.95}, {"title": "title2", "confidence": 0.85}, {"title": "title3", "confidence": 0.75}]`

    const text = await askModel([
      { type: 'text', text: prompt },
      ...loaded.map(image => imagePart(image.dataUrl)),
    ])

    return {
      descriptions: [],
      suggestions: parseTitleSuggestions(text),
    }
  }

  // Single image description
  if (imageUrls.length === 1) {
    const text = await askModel([
      { type: 'text', text: `Describe this image in detail in ${langName}. Be specific about what you see: objects, people, colors, setting, mood, text visible. Keep it under 100 words.` },
      imagePart(loaded[0].dataUrl),
    ])

    return {
      descriptions: [{ index: 0, description: text.trim() }],
    }
  }

  // Multiple images: describe each individually. Requests stay sequential, as
  // before, so a failure stops further model calls.
  const descriptions: Array<{ index: number; description: string }> = []

  for (let position = 0; position < loaded.length; position++) {
    const image = loaded[position]
    const text = await askModel([
      { type: 'text', text: `Describe this image (image ${position + 1} of ${loaded.length}) in ${langName}. Be specific: objects, people, colors, setting, text visible. Under 80 words.` },
      imagePart(image.dataUrl),
    ])

    descriptions.push({ index: image.index, description: text.trim() })
  }

  // Combined summary (text-only request built from the individual descriptions)
  const allDescriptions = descriptions.map(d => d.description).join('\n')
  const { text: summary } = await generateText({
    model,
    prompt: `Based on these individual image descriptions, write a brief (1-2 sentence) overall summary in ${langName} of what these images collectively show:\n\n${allDescriptions}`,
  })

  return {
    descriptions,
    combinedSummary: summary.trim(),
  }
}