Files
Keep/keep-notes/lib/ai/tools/extract-images.ts

168 lines
5.6 KiB
TypeScript

/**
* Image Extraction Utility
* Extracts image URLs from web pages using Cheerio.
* Downloads and saves images locally for agent note attachment.
*/
import * as cheerio from 'cheerio'
import { promises as fs } from 'fs'
import path from 'path'
import { randomUUID } from 'crypto'
import sharp from 'sharp'
/** Directory that downloaded images are written to; joined onto `process.cwd()` at the point of use. */
const UPLOADS_DIR = 'public/uploads/notes'

/** URL path prefix under which a saved image is referenced. */
const URL_PREFIX = '/uploads/notes'

/** Upper bound on the number of image URLs collected from a single page. */
const MAX_IMAGES_PER_PAGE = 3

/** Minimum declared width/height in px; smaller images are skipped (icons, spacers, tracking pixels). */
const MIN_IMAGE_SIZE = 200

/** Width in px that downloaded images are resized to for note-friendly display. */
const MAX_IMAGE_WIDTH = 600
/**
 * Describes a single image taken from a web page.
 * NOTE(review): not referenced by the functions visible in this part of the file — confirm usage against callers.
 */
export interface ExtractedImage {
/** Address of the image (presumably the remote source URL — TODO confirm). */
url: string
/** Optional local location of the image (presumably set once it has been saved locally — TODO confirm). */
localPath?: string
}
/** URL substrings that mark an image inside article containers as decoration rather than content. */
const CONTENT_SKIP_PATTERNS: readonly string[] = ['avatar', 'icon', 'logo', 'badge', 'spinner']

/** The page-wide fallback scan additionally rejects tracking beacons. */
const FALLBACK_SKIP_PATTERNS: readonly string[] = [...CONTENT_SKIP_PATTERNS, 'pixel', 'tracking']

/**
 * Extract image URLs from an HTML page.
 *
 * Candidates are collected in priority order until MAX_IMAGES_PER_PAGE is reached:
 *   1. the Open Graph image (`og:image`),
 *   2. the Twitter card image (`twitter:image`),
 *   3. images inside common article containers,
 *   4. any other image on the page (stricter URL filtering).
 *
 * For steps 3 and 4 an image is skipped when its `width`/`height` attribute
 * declares a pixel size below MIN_IMAGE_SIZE, or when its URL contains one of
 * the skip patterns. Inline `data:` sources are ignored, so a lazy-loading
 * placeholder in `src` does not hide the real URL in `data-src`.
 *
 * @param html - Raw HTML of the page.
 * @param pageUrl - URL the HTML was loaded from; used to resolve relative image URLs.
 * @returns De-duplicated image URLs, at most MAX_IMAGES_PER_PAGE entries.
 */
export function extractImageUrlsFromHtml(html: string, pageUrl: string): string[] {
  const $ = cheerio.load(html)
  const images: string[] = []
  const seen = new Set<string>()

  // Resolve a raw attribute value and record it unless it is unusable or already collected.
  const addCandidate = (raw: string | undefined): void => {
    if (!raw) return
    const resolved = resolveUrl(raw, pageUrl)
    if (resolved && !seen.has(resolved)) {
      images.push(resolved)
      seen.add(resolved)
    }
  }

  // Read a width/height attribute as pixels. Percentages ("100%") and keywords
  // ("auto") carry no pixel size, so they yield 0 ("unknown") rather than being
  // misread as a small pixel count.
  const declaredPixels = (value: string | undefined): number => {
    if (!value || !/^\s*\d+(?:\.\d+)?(?:px)?\s*$/i.test(value)) return 0
    return parseInt(value, 10)
  }

  // Scan the elements matching `selector`, applying the size and pattern filters.
  const collectFrom = (selector: string, skipPatterns: readonly string[]): void => {
    $(selector).each((_, el) => {
      // Returning false stops cheerio's iteration once the quota is filled.
      if (images.length >= MAX_IMAGES_PER_PAGE) return false

      const img = $(el)
      // Prefer `src`, falling back to `data-src`; inline data: URIs are passed over.
      const source = [img.attr('src'), img.attr('data-src')].find(
        (value) => !!value && !value.trim().toLowerCase().startsWith('data:'),
      )
      if (!source) return

      const width = declaredPixels(img.attr('width'))
      const height = declaredPixels(img.attr('height'))
      // Skip only when a dimension is explicitly declared and too small.
      if ((width > 0 && width < MIN_IMAGE_SIZE) || (height > 0 && height < MIN_IMAGE_SIZE)) return

      if (skipPatterns.some((pattern) => source.includes(pattern))) return

      addCandidate(source)
    })
  }

  // 1. Open Graph image
  addCandidate($('meta[property="og:image"]').attr('content'))

  // 2. Twitter card image
  addCandidate($('meta[name="twitter:image"]').attr('content'))

  // 3. Article body images (filter by size and relevance)
  collectFrom(
    'article img, main img, .content img, .post-content img, .entry-content img, .article-body img',
    CONTENT_SKIP_PATTERNS,
  )

  // 4. Fallback: any large images in the page if we still have room
  if (images.length < MAX_IMAGES_PER_PAGE) {
    collectFrom('img', FALLBACK_SKIP_PATTERNS)
  }

  return images.slice(0, MAX_IMAGES_PER_PAGE)
}
/**
 * Download an image and save it locally.
 *
 * The response must be successful, carry an `image/*` content type and be at
 * least 1 KiB. The image is resized to at most MAX_IMAGE_WIDTH wide and
 * re-encoded as JPEG; if sharp cannot process it, the raw bytes are stored
 * instead under an extension derived from the content type.
 *
 * NOTE(review): `imageUrl` is fetched as given. If it can come from untrusted
 * HTML, consider restricting the hosts it may point at (server-side request forgery).
 *
 * @param imageUrl - Absolute URL of the image to fetch.
 * @returns The URL path (under URL_PREFIX) of the file that was actually written,
 *          or `null` if the download or save failed for any reason.
 */
export async function downloadImage(imageUrl: string): Promise<string | null> {
  const controller = new AbortController()
  // One 10s budget for the whole transfer (headers and body); cleared in `finally`
  // so the timer never outlives the call, including when fetch itself throws.
  const timeout = setTimeout(() => controller.abort(), 10000)
  try {
    const response = await fetch(imageUrl, {
      signal: controller.signal,
      headers: { 'User-Agent': 'Mozilla/5.0 (compatible; KeepBot/1.0)' },
    })
    if (!response.ok) return null

    const contentType = response.headers.get('content-type') || ''
    if (!contentType.startsWith('image/')) return null

    const buffer = Buffer.from(await response.arrayBuffer())
    if (buffer.length < 1024) return null // Skip tiny files

    // Derive a filesystem-safe extension from the MIME subtype alone:
    // drop parameters ("; charset=...") and structured suffixes ("+xml").
    const subtype = (contentType.split(';')[0] ?? '')
      .trim()
      .toLowerCase()
      .slice('image/'.length)
      .replace(/\+.*$/, '')
      .replace('jpeg', 'jpg')
    const ext = /^[a-z0-9]+$/.test(subtype) ? subtype : 'jpg'

    const id = randomUUID()
    const dir = path.join(process.cwd(), UPLOADS_DIR)
    await fs.mkdir(dir, { recursive: true })

    // Resize to max width for note-friendly display
    let filename = `${id}.jpg`
    try {
      await sharp(buffer)
        .resize(MAX_IMAGE_WIDTH, null, { withoutEnlargement: true })
        .jpeg({ quality: 80 })
        .toFile(path.join(dir, filename))
    } catch {
      // Sharp could not process the image — save the raw buffer under its own
      // extension so the returned URL matches the file on disk.
      filename = `${id}.${ext}`
      await fs.writeFile(path.join(dir, filename), buffer)
    }

    return `${URL_PREFIX}/${filename}`
  } catch {
    // Best-effort: network errors, timeouts and filesystem errors all mean "no image".
    return null
  } finally {
    clearTimeout(timeout)
  }
}
/**
 * Extract and download images from a web page.
 *
 * The downloads are independent, so they run concurrently; the result keeps the
 * order in which the images were found on the page.
 *
 * @param html - Raw HTML of the page.
 * @param pageUrl - URL the HTML was loaded from; used to resolve relative image URLs.
 * @returns Local URLs for the images that were downloaded successfully.
 */
export async function extractAndDownloadImages(html: string, pageUrl: string): Promise<string[]> {
  const imageUrls = extractImageUrlsFromHtml(html, pageUrl)
  // downloadImage reports failure by returning null, so failed downloads are
  // filtered out below.
  const results = await Promise.all(imageUrls.map((url) => downloadImage(url)))
  return results.filter((localUrl): localUrl is string => Boolean(localUrl))
}
/**
 * Turn an image reference found in a page into an absolute http(s) URL.
 *
 * - Protocol-relative references (`//host/x.png`) are given the `https:` scheme.
 * - Absolute `http://` / `https://` references are returned unchanged.
 * - Everything else is resolved against the page URL, so `/x.png` is relative to
 *   the site root and `./x.png`, `../x.png` and `x.png` are relative to the
 *   page's own path.
 *
 * @param src - The raw reference (e.g. an `src` attribute value).
 * @param pageUrl - URL of the page the reference was found on.
 * @returns The absolute URL, or `null` if the reference is empty, cannot be
 *          parsed, or does not resolve to an http(s) URL (e.g. `data:`, `javascript:`).
 */
function resolveUrl(src: string, pageUrl: string): string | null {
  const candidate = src.trim()
  if (!candidate) return null
  if (candidate.startsWith('//')) return `https:${candidate}`
  if (candidate.startsWith('http://') || candidate.startsWith('https://')) return candidate
  try {
    // Resolving against the full page URL (not just its origin) keeps
    // path-relative references inside the page's directory.
    const resolved = new URL(candidate, pageUrl)
    const isWeb = resolved.protocol === 'http:' || resolved.protocol === 'https:'
    return isWeb ? resolved.href : null
  } catch {
    return null
  }
}