168 lines
5.6 KiB
TypeScript
168 lines
5.6 KiB
TypeScript
/**
 * Image Extraction Utility
 *
 * Extracts image URLs from web pages using Cheerio.
 * Downloads and saves images locally for agent note attachment.
 */
|
|
|
|
import * as cheerio from 'cheerio'
|
|
import { promises as fs } from 'fs'
|
|
import path from 'path'
|
|
import { randomUUID } from 'crypto'
|
|
import sharp from 'sharp'
|
|
|
|
/** Directory, relative to the process working directory, where downloaded images are written. */
const UPLOADS_DIR = 'public/uploads/notes'

/** URL path prefix under which a saved image is referenced. */
const URL_PREFIX = '/uploads/notes'

/** Upper bound on the number of image URLs collected from a single page. */
const MAX_IMAGES_PER_PAGE = 3

/** Minimum declared width/height in px; smaller images (icons, spacers, tracking pixels) are skipped. */
const MIN_IMAGE_SIZE = 200

/** Maximum stored width in px; wider images are resized for note-friendly display. */
const MAX_IMAGE_WIDTH = 600
|
|
|
|
/**
 * An image discovered on a web page.
 */
export interface ExtractedImage {
  /** URL of the image. */
  url: string
  /** NOTE(review): presumably the local location of the image once downloaded — confirm against callers. */
  localPath?: string
}
|
|
|
|
/**
 * Extract candidate image URLs from an HTML page.
 *
 * Candidates are collected in priority order until MAX_IMAGES_PER_PAGE is reached:
 *   1. the Open Graph image (`og:image`)
 *   2. the Twitter card image (`twitter:image`)
 *   3. images inside common article/content containers
 *   4. any other `<img>` on the page (fallback)
 *
 * @param html - Raw HTML of the page.
 * @param pageUrl - URL the HTML came from; used to resolve relative image references.
 * @returns Up to MAX_IMAGES_PER_PAGE de-duplicated image URLs.
 */
export function extractImageUrlsFromHtml(html: string, pageUrl: string): string[] {
  const $ = cheerio.load(html)
  const images: string[] = []
  const seen = new Set<string>()

  // URL fragments that usually indicate decoration rather than content.
  const articleSkipPatterns = ['avatar', 'icon', 'logo', 'badge', 'spinner']
  // The page-wide fallback is noisier, so it additionally rejects tracking assets.
  const fallbackSkipPatterns = [...articleSkipPatterns, 'pixel', 'tracking']

  // Resolve a raw reference and record it unless it is unresolvable or already seen.
  const addCandidate = (rawUrl: string | undefined): void => {
    if (!rawUrl) return
    const resolved = resolveUrl(rawUrl, pageUrl)
    if (resolved && !seen.has(resolved)) {
      images.push(resolved)
      seen.add(resolved)
    }
  }

  // Read a width/height attribute as pixels. Anything that is not a plain pixel
  // value (e.g. "100%", "auto") yields 0, meaning "size unknown" -- a bare
  // parseInt would read "100%" as 100px and wrongly discard the image.
  const pixelSize = (value: string | undefined): number => {
    const match = /^\s*(\d+)(?:\.\d+)?(?:px)?\s*$/i.exec(value ?? '')
    return match ? parseInt(match[1] ?? '0', 10) : 0
  }

  // Collect qualifying <img> elements matched by `selector`, in document order.
  const collectImages = (selector: string, skipPatterns: readonly string[]): void => {
    $(selector).each((_, el) => {
      if (images.length >= MAX_IMAGES_PER_PAGE) return false // stop iterating

      const img = $(el)
      const srcAttr = img.attr('src')
      const lazySrc = img.attr('data-src')
      // Lazy-loading pages put an inline data: placeholder in `src` and the real
      // URL in `data-src`; prefer the real URL in that case.
      const src = srcAttr && !(srcAttr.startsWith('data:') && lazySrc) ? srcAttr : lazySrc
      if (!src) return

      // Skip if explicitly sized too small
      const width = pixelSize(img.attr('width'))
      const height = pixelSize(img.attr('height'))
      if ((width > 0 && width < MIN_IMAGE_SIZE) || (height > 0 && height < MIN_IMAGE_SIZE)) return

      // Skip common non-content patterns
      if (skipPatterns.some((pattern) => src.includes(pattern))) return

      addCandidate(src)
    })
  }

  // 1. Open Graph image
  addCandidate($('meta[property="og:image"]').attr('content'))

  // 2. Twitter card image
  addCandidate($('meta[name="twitter:image"]').attr('content'))

  // 3. Article body images (filter by size and relevance)
  collectImages(
    'article img, main img, .content img, .post-content img, .entry-content img, .article-body img',
    articleSkipPatterns,
  )

  // 4. Fallback: any large images in the page if we still have room
  if (images.length < MAX_IMAGES_PER_PAGE) {
    collectImages('img', fallbackSkipPatterns)
  }

  return images.slice(0, MAX_IMAGES_PER_PAGE)
}
|
|
|
|
/**
 * Derive a filesystem-safe file extension from a Content-Type header value.
 * Parameters (`; charset=...`) and structured-syntax suffixes (`+xml`) are dropped,
 * so `image/svg+xml; charset=utf-8` yields `svg`. Falls back to `jpg`.
 */
function extensionFromContentType(contentType: string): string {
  const mimeType = contentType.split(';')[0] ?? ''
  const subtype = (mimeType.split('/')[1] ?? '').trim().toLowerCase()
  const cleaned = (subtype.split('+')[0] ?? '').replace(/[^a-z0-9]/g, '')
  if (cleaned === '' || cleaned === 'jpeg') return 'jpg'
  return cleaned
}

/**
 * Download an image and save it locally.
 *
 * The image is resized to at most MAX_IMAGE_WIDTH and re-encoded as JPEG. If sharp
 * cannot process it, the raw bytes are saved under an extension derived from the
 * response Content-Type instead.
 *
 * @param imageUrl - Absolute URL of the image to fetch.
 * @returns The URL (under URL_PREFIX) of the file that was actually written, or `null`
 *   when the request fails, times out, is not an image, or is smaller than 1 KiB.
 *
 * NOTE(review): `imageUrl` comes from scraped HTML, so this fetches attacker-chosen
 * hosts (SSRF surface), and the raw fallback can place SVG in a public directory.
 * Confirm both are acceptable for the deployment.
 */
export async function downloadImage(imageUrl: string): Promise<string | null> {
  const controller = new AbortController()
  // The 10s budget covers headers AND body; it is released in `finally` on every path.
  const timeout = setTimeout(() => controller.abort(), 10000)

  try {
    const response = await fetch(imageUrl, {
      signal: controller.signal,
      headers: { 'User-Agent': 'Mozilla/5.0 (compatible; KeepBot/1.0)' },
    })
    if (!response.ok) return null

    const contentType = (response.headers.get('content-type') ?? '').trim().toLowerCase()
    if (!contentType.startsWith('image/')) return null

    const buffer = Buffer.from(await response.arrayBuffer())
    if (buffer.length < 1024) return null // Skip tiny files

    const uploadsDir = path.join(process.cwd(), UPLOADS_DIR)
    await fs.mkdir(uploadsDir, { recursive: true })

    const id = randomUUID()
    const jpegName = `${id}.jpg`

    // Resize to max width for note-friendly display
    try {
      await sharp(buffer)
        .resize(MAX_IMAGE_WIDTH, null, { withoutEnlargement: true })
        .jpeg({ quality: 80 })
        .toFile(path.join(uploadsDir, jpegName))
      return `${URL_PREFIX}/${jpegName}`
    } catch {
      // Sharp failed (e.g. unsupported format) -- save the raw buffer and return
      // the name that was really written, so the URL never points at a missing .jpg.
      const rawName = `${id}.${extensionFromContentType(contentType)}`
      await fs.writeFile(path.join(uploadsDir, rawName), buffer)
      return `${URL_PREFIX}/${rawName}`
    }
  } catch {
    // Best-effort: network errors, aborts and filesystem errors all mean "no image".
    return null
  } finally {
    clearTimeout(timeout)
  }
}
|
|
|
|
/**
 * Extract and download images from a web page.
 *
 * The candidate images are downloaded concurrently; the result keeps the order in
 * which the candidates were extracted.
 *
 * @param html - Raw HTML of the page.
 * @param pageUrl - URL the HTML came from; used to resolve relative image references.
 * @returns Local URLs for the images that were downloaded successfully.
 */
export async function extractAndDownloadImages(html: string, pageUrl: string): Promise<string[]> {
  const imageUrls = extractImageUrlsFromHtml(html, pageUrl)

  // The downloads are independent of each other, so run them in parallel.
  // Promise.all preserves input order.
  const results = await Promise.all(imageUrls.map((url) => downloadImage(url)))

  // Drop failed downloads (reported as null).
  return results.filter((localUrl): localUrl is string => Boolean(localUrl))
}
|
|
|
|
/**
 * Resolve an image reference found in a page to an absolute http(s) URL.
 *
 * - Protocol-relative references (`//host/path`) are upgraded to `https:`.
 * - Absolute references are validated and returned in normalized form.
 * - Everything else (`/x`, `./x`, `../x`, `x`) is resolved against `pageUrl`
 *   using standard URL resolution, so `./x` is relative to the page's
 *   directory rather than the site root.
 *
 * @param src - The raw `src`/`content` attribute value.
 * @param pageUrl - URL of the page the reference was found on.
 * @returns The absolute URL, or `null` if it cannot be resolved or is not
 *   http(s) (e.g. `data:`, `javascript:`, `file:`).
 */
function resolveUrl(src: string, pageUrl: string): string | null {
  const reference = src.trim()
  if (!reference) return null

  const normalized = reference.startsWith('//') ? `https:${reference}` : reference

  const tryParse = (input: string, base?: string): URL | null => {
    try {
      return new URL(input, base)
    } catch {
      return null
    }
  }

  // Parse as absolute first so a valid absolute reference still resolves when
  // pageUrl itself is malformed; otherwise resolve relative to the page.
  const absolute = tryParse(normalized) ?? tryParse(normalized, pageUrl)
  if (!absolute) return null

  // Only http(s) resources can be downloaded as remote images.
  if (absolute.protocol !== 'http:' && absolute.protocol !== 'https:') return null

  return absolute.href
}
|