Keep/keep-notes/lib/ai/tools/extract-images.ts

/**
 * Image Extraction Utility
 * Extracts image URLs from web pages using Cheerio.
 * Downloads and saves images locally for agent note attachment.
 */

import * as cheerio from 'cheerio'
import { promises as fs } from 'fs'
import path from 'path'
import { randomUUID } from 'crypto'
import sharp from 'sharp'

/** Directory (joined onto `process.cwd()` by `downloadImage`) where downloaded images are written. */
const UPLOADS_DIR = 'public/uploads/notes'
/** Prefix prepended to the saved filename to build the URL string returned by `downloadImage`. */
const URL_PREFIX = '/uploads/notes'
/** Upper bound on how many image URLs `extractImageUrlsFromHtml` returns for one page. */
const MAX_IMAGES_PER_PAGE = 3
/**
 * Minimum declared `width`/`height` attribute value, in pixels, for an `<img>` to be kept.
 * Filters out icons, spacers and tracking pixels; images with no declared size are not filtered.
 */
const MIN_IMAGE_SIZE = 200
/** Target width, in pixels, passed to sharp's `resize` so saved images are note-friendly. */
const MAX_IMAGE_WIDTH = 600

/**
 * Describes one image found on a page.
 *
 * NOTE(review): nothing in the visible part of this file constructs or reads this
 * type, so the field meanings below are inferred from their names — confirm
 * against the callers.
 */
export interface ExtractedImage {
  // Presumably the remote (absolute) URL of the image — TODO confirm.
  url: string
  // Presumably the local URL/path once the image has been downloaded; optional — TODO confirm.
  localPath?: string
}

/**
 * Collect candidate image URLs from a page's HTML.
 *
 * Candidates are gathered in priority order and de-duplicated after being
 * resolved to absolute URLs:
 *   1. the `og:image` meta tag,
 *   2. the `twitter:image` meta tag,
 *   3. `<img>` tags inside common article/content containers,
 *   4. any `<img>` on the page, if there is still room.
 *
 * `<img>` tags are skipped when they declare a width or height below
 * `MIN_IMAGE_SIZE`, or when their source contains a fragment that usually
 * marks non-content images (avatars, icons, logos, ...).
 *
 * @param html    Raw HTML of the page.
 * @param pageUrl URL the HTML was fetched from; used to resolve relative sources.
 * @returns At most `MAX_IMAGES_PER_PAGE` absolute image URLs.
 */
export function extractImageUrlsFromHtml(html: string, pageUrl: string): string[] {
  const $ = cheerio.load(html)
  const collected: string[] = []
  const known = new Set<string>()

  // Source fragments that mark an <img> as decoration rather than content.
  const articleNoise = ['avatar', 'icon', 'logo', 'badge', 'spinner']
  // The page-wide fallback scan is stricter and also rejects tracking pixels.
  const fallbackNoise = [...articleNoise, 'pixel', 'tracking']

  // Resolve a raw URL and record it unless it is unresolvable or already seen.
  const addCandidate = (rawUrl: string): void => {
    const absolute = resolveUrl(rawUrl, pageUrl)
    if (!absolute || known.has(absolute)) return
    collected.push(absolute)
    known.add(absolute)
  }

  // Walk every <img> matched by `selector`, stopping once the quota is filled.
  const scanImgTags = (selector: string, blockedFragments: readonly string[]): void => {
    $(selector).each((_, node) => {
      // Returning false stops cheerio's iteration early.
      if (collected.length >= MAX_IMAGES_PER_PAGE) return false

      const img = $(node)
      const source = img.attr('src') || img.attr('data-src')
      if (!source) return

      // Only an explicitly declared, too-small dimension disqualifies an image.
      const declaredWidth = parseInt(img.attr('width') || '0', 10)
      const declaredHeight = parseInt(img.attr('height') || '0', 10)
      const tooNarrow = declaredWidth > 0 && declaredWidth < MIN_IMAGE_SIZE
      const tooShort = declaredHeight > 0 && declaredHeight < MIN_IMAGE_SIZE
      if (tooNarrow || tooShort) return

      if (blockedFragments.some((fragment) => source.includes(fragment))) return

      addCandidate(source)
    })
  }

  // 1. Open Graph image
  const openGraphImage = $('meta[property="og:image"]').attr('content')
  if (openGraphImage) addCandidate(openGraphImage)

  // 2. Twitter card image
  const twitterCardImage = $('meta[name="twitter:image"]').attr('content')
  if (twitterCardImage) addCandidate(twitterCardImage)

  // 3. Images inside the usual article/content containers
  scanImgTags(
    'article img, main img, .content img, .post-content img, .entry-content img, .article-body img',
    articleNoise,
  )

  // 4. Fallback: any remaining image on the page, while there is still room
  if (collected.length < MAX_IMAGES_PER_PAGE) {
    scanImgTags('img', fallbackNoise)
  }

  return collected.slice(0, MAX_IMAGES_PER_PAGE)
}

/**
 * Download an image, resize it for note display and save it under `UPLOADS_DIR`.
 *
 * The image is re-encoded as JPEG (max width `MAX_IMAGE_WIDTH`, never enlarged).
 * If sharp cannot process the payload, the raw bytes are saved instead under an
 * extension derived from the response's Content-Type, and the returned URL
 * points at that file.
 *
 * NOTE(review): `imageUrl` comes from scraped HTML, so this performs a
 * server-side request to an attacker-influenced address (SSRF surface), and the
 * raw-bytes fallback can store untrusted SVG under a public directory. Consider
 * host allow/deny-listing and rejecting SVG if that matters for this deployment.
 *
 * @param imageUrl Absolute URL of the image to fetch.
 * @returns The site-relative URL (`URL_PREFIX/<file>`) of the saved image, or
 *          `null` when the response is not OK, is not an `image/*` type, is
 *          smaller than 1 KiB, or any step throws (including the 10 s timeout).
 */
export async function downloadImage(imageUrl: string): Promise<string | null> {
  const controller = new AbortController()
  // One 10 s budget covers both the request and the body download.
  const timeout = setTimeout(() => controller.abort(), 10000)

  try {
    const response = await fetch(imageUrl, {
      signal: controller.signal,
      headers: { 'User-Agent': 'Mozilla/5.0 (compatible; KeepBot/1.0)' },
    })

    if (!response.ok) return null

    // Media types are case-insensitive and may carry parameters (e.g. "; charset=...").
    const mediaType = ((response.headers.get('content-type') || '').split(';')[0] ?? '')
      .trim()
      .toLowerCase()
    if (!mediaType.startsWith('image/')) return null

    const buffer = Buffer.from(await response.arrayBuffer())
    if (buffer.length < 1024) return null // Skip tiny files

    // Extension for the raw fallback: drop structured suffixes ("svg+xml" -> "svg")
    // and anything that is not a plain alphanumeric, since the header is untrusted.
    const subtype = mediaType.slice('image/'.length).split('+')[0] ?? ''
    const fallbackExt = subtype.replace(/[^a-z0-9]/g, '').replace('jpeg', 'jpg') || 'jpg'

    const baseName = randomUUID()
    const uploadsDir = path.join(process.cwd(), UPLOADS_DIR)
    await fs.mkdir(uploadsDir, { recursive: true })

    // Track the name of the file that actually ends up on disk so the returned
    // URL always points at an existing file.
    let savedName = `${baseName}.jpg`
    try {
      // Resize to max width for note-friendly display
      await sharp(buffer)
        .resize(MAX_IMAGE_WIDTH, null, { withoutEnlargement: true })
        .jpeg({ quality: 80 })
        .toFile(path.join(uploadsDir, savedName))
    } catch {
      // Sharp failed (e.g. unsupported format) — remove any partial output and
      // save the raw buffer under its own extension instead.
      await fs.unlink(path.join(uploadsDir, savedName)).catch(() => undefined)
      savedName = `${baseName}.${fallbackExt}`
      await fs.writeFile(path.join(uploadsDir, savedName), buffer)
    }

    return `${URL_PREFIX}/${savedName}`
  } catch {
    // Best-effort: network errors, aborts and disk errors all yield "no image".
    return null
  } finally {
    // Always release the timer, including on fetch failure and early returns.
    clearTimeout(timeout)
  }
}

/**
 * Extract image URLs from a page's HTML and download each of them.
 *
 * Downloads run concurrently; `downloadImage` reports failure by resolving to
 * `null` rather than rejecting, so one failed image never affects the others.
 * Results keep the order in which the URLs were extracted.
 *
 * @param html    Raw HTML of the page.
 * @param pageUrl URL the HTML was fetched from; used to resolve relative sources.
 * @returns Local URLs of the images that were downloaded successfully.
 */
export async function extractAndDownloadImages(html: string, pageUrl: string): Promise<string[]> {
  const imageUrls = extractImageUrlsFromHtml(html, pageUrl)

  // Independent downloads: run them in parallel instead of stacking their timeouts.
  const results = await Promise.all(imageUrls.map((url) => downloadImage(url)))

  // Drop failed downloads (null) while preserving extraction order.
  return results.filter((localUrl): localUrl is string => Boolean(localUrl))
}

/**
 * Turn an image source found in a page into an absolute URL.
 *
 * - Protocol-relative sources (`//host/path`) are forced to `https:`.
 * - Absolute `http://` / `https://` sources are returned unchanged.
 * - Everything else (`/root-relative`, `./relative`, `../relative`, `bare`) is
 *   resolved against the full page URL, so path-relative sources are resolved
 *   from the page's directory rather than from the site root.
 *
 * @param src     Raw value of the image source attribute or meta tag.
 * @param pageUrl URL of the page the source was found on.
 * @returns The absolute URL, or `null` if it cannot be resolved (e.g. `pageUrl`
 *          is not a valid absolute URL).
 */
function resolveUrl(src: string, pageUrl: string): string | null {
  try {
    if (src.startsWith('//')) return `https:${src}`
    if (src.startsWith('http://') || src.startsWith('https://')) return src
    // The URL constructor handles root-relative and path-relative forms correctly
    // when given the full page URL as base.
    return new URL(src, pageUrl).href
  } catch {
    return null
  }
}