/**
 * Image Extraction Utility
 * Extracts image URLs from web pages using Cheerio.
 * Downloads and saves images locally for agent note attachment.
 */
import * as cheerio from 'cheerio'
import { promises as fs } from 'fs'
import path from 'path'
import { randomUUID } from 'crypto'
import sharp from 'sharp'

const UPLOADS_DIR = 'public/uploads/notes'
const URL_PREFIX = '/uploads/notes'
const MAX_IMAGES_PER_PAGE = 3
const MIN_IMAGE_SIZE = 200 // px -- skip icons, spacers, tracking pixels
const MAX_IMAGE_WIDTH = 600 // px -- resize for note-friendly display

/** Selector for <img> elements inside common article/content containers. */
const ARTICLE_IMAGE_SELECTOR =
  'article img, main img, .content img, .post-content img, .entry-content img, .article-body img'

/** Lower-case URL substrings that mark an image as decoration rather than content. */
const NON_CONTENT_PATTERNS: readonly string[] = ['avatar', 'icon', 'logo', 'badge', 'spinner']

/** Patterns for the page-wide fallback scan: the content patterns plus tracking-style names. */
const FALLBACK_SKIP_PATTERNS: readonly string[] = [...NON_CONTENT_PATTERNS, 'pixel', 'tracking']

export interface ExtractedImage {
  url: string
  localPath?: string
}

/**
 * Parse a `width`/`height` attribute as a pixel count.
 * Returns 0 ("unknown") for missing or non-pixel values such as "100%" or "auto",
 * so that relative sizes are never mistaken for a small pixel size.
 */
function parsePixelDimension(value: string | undefined): number {
  const match = /^\s*(\d+)(?:\.\d+)?(?:px)?\s*$/i.exec(value ?? '')
  return parseInt(match?.[1] ?? '0', 10)
}

/**
 * Choose the URL to use for an <img>: `src` normally, but `data-src` when `src`
 * is missing or is an inline `data:` URI and a `data-src` value is available.
 */
function pickImageSource(src: string | undefined, lazySrc: string | undefined): string | undefined {
  if (src && !src.startsWith('data:')) return src
  return lazySrc || src
}

/**
 * Extract image URLs from an HTML page.
 *
 * Candidates are collected in priority order: the Open Graph image, the Twitter
 * card image, images inside article-like containers, and finally any <img> on
 * the page. Body images are skipped when an explicit pixel width or height is
 * below MIN_IMAGE_SIZE or when the URL contains a non-content keyword
 * (matched case-insensitively). Duplicate resolved URLs are dropped.
 *
 * @param html Raw HTML of the page.
 * @param pageUrl URL of the page, used to resolve relative image paths.
 * @returns At most MAX_IMAGES_PER_PAGE resolved image URLs, highest priority first.
 */
export function extractImageUrlsFromHtml(html: string, pageUrl: string): string[] {
  const $ = cheerio.load(html)
  const images: string[] = []
  const seen = new Set<string>()

  // Resolve a raw attribute value and record it unless it is unusable or already seen.
  const addCandidate = (raw: string | undefined): void => {
    if (!raw) return
    const resolved = resolveUrl(raw, pageUrl)
    if (resolved && !seen.has(resolved)) {
      seen.add(resolved)
      images.push(resolved)
    }
  }

  // Walk the <img> elements matching `selector` until the per-page limit is reached.
  const scanImages = (selector: string, skipPatterns: readonly string[]): void => {
    for (const el of $(selector).toArray()) {
      if (images.length >= MAX_IMAGES_PER_PAGE) break

      const img = $(el)
      const src = pickImageSource(img.attr('src'), img.attr('data-src'))
      if (!src) continue

      // Skip only when a dimension is explicitly given in pixels and is too small.
      const width = parsePixelDimension(img.attr('width'))
      const height = parsePixelDimension(img.attr('height'))
      if ((width > 0 && width < MIN_IMAGE_SIZE) || (height > 0 && height < MIN_IMAGE_SIZE)) continue

      // Skip common non-content patterns
      const haystack = src.toLowerCase()
      if (skipPatterns.some((pattern) => haystack.includes(pattern))) continue

      addCandidate(src)
    }
  }

  // 1. Open Graph image
  addCandidate($('meta[property="og:image"]').attr('content'))

  // 2. Twitter card image
  addCandidate($('meta[name="twitter:image"]').attr('content'))

  // 3. Article body images (filter by size and relevance)
  scanImages(ARTICLE_IMAGE_SELECTOR, NON_CONTENT_PATTERNS)

  // 4. Fallback: any large images in the page if we still have room
  scanImages('img', FALLBACK_SKIP_PATTERNS)

  return images.slice(0, MAX_IMAGES_PER_PAGE)
}

/**
 * Download an image and save it locally.
/**
 * Download an image, shrink it for note display, and store it in the uploads directory.
 *
 * The response must be successful, declare an `image/*` content type, and be at
 * least 1 KiB. The image is re-encoded as JPEG no wider than MAX_IMAGE_WIDTH;
 * if sharp cannot process it, the original bytes are stored unchanged under an
 * extension derived from the content type.
 *
 * NOTE(review): `imageUrl` typically comes from scraped HTML and is fetched
 * as-is; nothing here blocks internal hosts, and the raw fallback can store
 * formats such as SVG unmodified. Confirm that is acceptable for the deployment.
 *
 * @param imageUrl URL of the image to fetch.
 * @returns The public URL path (under URL_PREFIX) of the file that was actually
 *   written, or null if the download, validation, or save failed.
 */
export async function downloadImage(imageUrl: string): Promise<string | null> {
  const DOWNLOAD_TIMEOUT_MS = 10000
  const controller = new AbortController()
  const timeout = setTimeout(() => controller.abort(), DOWNLOAD_TIMEOUT_MS)

  try {
    const response = await fetch(imageUrl, {
      signal: controller.signal,
      headers: { 'User-Agent': 'Mozilla/5.0 (compatible; KeepBot/1.0)' },
    })
    if (!response.ok) return null

    // Drop parameters such as "; charset=binary" before inspecting the type.
    const [mimePart = ''] = (response.headers.get('content-type') ?? '').split(';')
    const mimeType = mimePart.trim().toLowerCase()
    if (!mimeType.startsWith('image/')) return null

    // The abort timer is still armed here, so a stalled body read is also cut off.
    const buffer = Buffer.from(await response.arrayBuffer())
    if (buffer.length < 1024) return null // Skip tiny files

    const uploadsDir = path.join(process.cwd(), UPLOADS_DIR)
    await fs.mkdir(uploadsDir, { recursive: true })

    const id = randomUUID()
    let filename = `${id}.jpg`

    // Resize to max width for note-friendly display
    try {
      await sharp(buffer)
        .resize(MAX_IMAGE_WIDTH, null, { withoutEnlargement: true })
        .jpeg({ quality: 80 })
        .toFile(path.join(uploadsDir, filename))
    } catch {
      // Sharp failed (e.g. unsupported format) — save the raw buffer and keep its
      // real extension so the returned URL points at the file that exists.
      const subtype = mimeType.slice('image/'.length)
      const [baseType = ''] = subtype.split('+') // "svg+xml" -> "svg"
      const safeExt = baseType === 'jpeg' ? 'jpg' : baseType.replace(/[^a-z0-9]/g, '')
      filename = `${id}.${safeExt || 'jpg'}`
      await fs.writeFile(path.join(uploadsDir, filename), buffer)
    }

    return `${URL_PREFIX}/${filename}`
  } catch {
    // Best-effort: network errors, timeouts and filesystem errors all yield null.
    return null
  } finally {
    clearTimeout(timeout)
  }
}

/**
 * Extract and download images from a web page.
 * Returns local URLs for successfully downloaded images.
/**
 * Extract candidate images from a web page and download each of them.
 *
 * @param html Raw HTML of the page.
 * @param pageUrl URL of the page, used to resolve relative image paths.
 * @returns Local URLs of the images that were downloaded successfully, in
 *   extraction order; failed downloads are omitted.
 */
export async function extractAndDownloadImages(html: string, pageUrl: string): Promise<string[]> {
  const imageUrls = extractImageUrlsFromHtml(html, pageUrl)

  // The downloads are independent of each other, so run them concurrently.
  // downloadImage resolves to null on failure; Promise.all keeps input order.
  const results = await Promise.all(imageUrls.map((url) => downloadImage(url)))

  return results.filter((localPath): localPath is string => Boolean(localPath))
}

/**
 * Turn an image reference found in a page into an absolute URL.
 *
 * - Protocol-relative references (`//host/path`) are given the `https:` scheme.
 * - Absolute `http://` / `https://` references are returned unchanged (trimmed).
 * - Everything else (`/x`, `./x`, `../x`, `x`) is resolved against `pageUrl`
 *   with standard URL resolution, i.e. relative to the page's directory.
 *
 * @param src Raw attribute value from the page.
 * @param pageUrl URL of the page the reference was found on.
 * @returns The absolute URL, or null if `src` is blank or cannot be resolved.
 */
function resolveUrl(src: string, pageUrl: string): string | null {
  const candidate = src.trim()
  // A blank reference would otherwise resolve to the page URL itself.
  if (!candidate) return null

  try {
    if (candidate.startsWith('//')) return `https:${candidate}`
    if (candidate.startsWith('http://') || candidate.startsWith('https://')) return candidate
    return new URL(candidate, pageUrl).href
  } catch {
    // new URL throws on an unparsable reference or base.
    return null
  }
}