refactor(ux): consolidate BMAD skills, update design system, and clean up Prisma generated client
This commit is contained in:
167
keep-notes/lib/ai/tools/extract-images.ts
Normal file
167
keep-notes/lib/ai/tools/extract-images.ts
Normal file
@@ -0,0 +1,167 @@
|
||||
/**
|
||||
* Image Extraction Utility
|
||||
* Extracts image URLs from web pages using Cheerio.
|
||||
* Downloads and saves images locally for agent note attachment.
|
||||
*/
|
||||
|
||||
import * as cheerio from 'cheerio'
|
||||
import { promises as fs } from 'fs'
|
||||
import path from 'path'
|
||||
import { randomUUID } from 'crypto'
|
||||
import sharp from 'sharp'
|
||||
|
||||
// Filesystem directory (relative to process.cwd()) where downloaded images are written.
const UPLOADS_DIR = 'public/uploads/notes'
// Public URL prefix corresponding to UPLOADS_DIR; returned paths are served from here.
const URL_PREFIX = '/uploads/notes'
// Hard cap on how many images are extracted/downloaded for a single page.
const MAX_IMAGES_PER_PAGE = 3
const MIN_IMAGE_SIZE = 200 // px -- skip icons, spacers, tracking pixels
const MAX_IMAGE_WIDTH = 600 // px -- resize for note-friendly display
|
||||
|
||||
/**
 * An image discovered on a web page.
 *
 * NOTE(review): `localPath` is declared but never populated by the functions
 * in this file (they return plain string URLs) — confirm callers elsewhere
 * actually construct this shape.
 */
export interface ExtractedImage {
  // Absolute URL of the image as found on the page.
  url: string
  // Local path of the downloaded copy, if it was saved.
  localPath?: string
}
|
||||
|
||||
/**
|
||||
* Extract image URLs from an HTML page.
|
||||
* Prioritizes og:image, then article images with size filtering.
|
||||
*/
|
||||
export function extractImageUrlsFromHtml(html: string, pageUrl: string): string[] {
|
||||
const $ = cheerio.load(html)
|
||||
const images: string[] = []
|
||||
const seen = new Set<string>()
|
||||
|
||||
// 1. Open Graph image
|
||||
const ogImage = $('meta[property="og:image"]').attr('content')
|
||||
if (ogImage) {
|
||||
const resolved = resolveUrl(ogImage, pageUrl)
|
||||
if (resolved && !seen.has(resolved)) {
|
||||
images.push(resolved)
|
||||
seen.add(resolved)
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Twitter card image
|
||||
const twitterImage = $('meta[name="twitter:image"]').attr('content')
|
||||
if (twitterImage) {
|
||||
const resolved = resolveUrl(twitterImage, pageUrl)
|
||||
if (resolved && !seen.has(resolved)) {
|
||||
images.push(resolved)
|
||||
seen.add(resolved)
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Article body images (filter by size and relevance)
|
||||
$('article img, main img, .content img, .post-content img, .entry-content img, .article-body img').each((_, el) => {
|
||||
if (images.length >= MAX_IMAGES_PER_PAGE) return false
|
||||
const src = $(el).attr('src') || $(el).attr('data-src')
|
||||
if (!src) return
|
||||
const width = parseInt($(el).attr('width') || '0', 10)
|
||||
const height = parseInt($(el).attr('height') || '0', 10)
|
||||
// Skip if explicitly sized too small
|
||||
if ((width > 0 && width < MIN_IMAGE_SIZE) || (height > 0 && height < MIN_IMAGE_SIZE)) return
|
||||
// Skip common non-content patterns
|
||||
if (src.includes('avatar') || src.includes('icon') || src.includes('logo') || src.includes('badge') || src.includes('spinner')) return
|
||||
const resolved = resolveUrl(src, pageUrl)
|
||||
if (resolved && !seen.has(resolved)) {
|
||||
images.push(resolved)
|
||||
seen.add(resolved)
|
||||
}
|
||||
})
|
||||
|
||||
// 4. Fallback: any large images in the page if we still have room
|
||||
if (images.length < MAX_IMAGES_PER_PAGE) {
|
||||
$('img').each((_, el) => {
|
||||
if (images.length >= MAX_IMAGES_PER_PAGE) return false
|
||||
const src = $(el).attr('src') || $(el).attr('data-src')
|
||||
if (!src) return
|
||||
const width = parseInt($(el).attr('width') || '0', 10)
|
||||
const height = parseInt($(el).attr('height') || '0', 10)
|
||||
if ((width > 0 && width < MIN_IMAGE_SIZE) || (height > 0 && height < MIN_IMAGE_SIZE)) return
|
||||
if (src.includes('avatar') || src.includes('icon') || src.includes('logo') || src.includes('badge') || src.includes('spinner') || src.includes('pixel') || src.includes('tracking')) return
|
||||
const resolved = resolveUrl(src, pageUrl)
|
||||
if (resolved && !seen.has(resolved)) {
|
||||
images.push(resolved)
|
||||
seen.add(resolved)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
return images.slice(0, MAX_IMAGES_PER_PAGE)
|
||||
}
|
||||
|
||||
/**
|
||||
* Download an image and save it locally.
|
||||
*/
|
||||
export async function downloadImage(imageUrl: string): Promise<string | null> {
|
||||
try {
|
||||
const controller = new AbortController()
|
||||
const timeout = setTimeout(() => controller.abort(), 10000)
|
||||
|
||||
const response = await fetch(imageUrl, {
|
||||
signal: controller.signal,
|
||||
headers: { 'User-Agent': 'Mozilla/5.0 (compatible; KeepBot/1.0)' },
|
||||
})
|
||||
clearTimeout(timeout)
|
||||
|
||||
if (!response.ok) return null
|
||||
|
||||
const contentType = response.headers.get('content-type') || ''
|
||||
if (!contentType.startsWith('image/')) return null
|
||||
|
||||
const buffer = Buffer.from(await response.arrayBuffer())
|
||||
if (buffer.length < 1024) return null // Skip tiny files
|
||||
|
||||
const ext = contentType.split('/')[1]?.replace('jpeg', 'jpg') || 'jpg'
|
||||
const filename = `${randomUUID()}.${ext}`
|
||||
|
||||
await fs.mkdir(path.join(process.cwd(), UPLOADS_DIR), { recursive: true })
|
||||
|
||||
// Resize to max width for note-friendly display
|
||||
try {
|
||||
await sharp(buffer)
|
||||
.resize(MAX_IMAGE_WIDTH, null, { withoutEnlargement: true })
|
||||
.jpeg({ quality: 80 })
|
||||
.toFile(path.join(process.cwd(), UPLOADS_DIR, filename.replace(/\.\w+$/, '.jpg')))
|
||||
} catch {
|
||||
// Sharp failed (e.g. SVG, WebP unsupported) — save raw buffer
|
||||
await fs.writeFile(path.join(process.cwd(), UPLOADS_DIR, filename), buffer)
|
||||
}
|
||||
|
||||
// Always reference as .jpg since sharp converts to jpeg
|
||||
return `${URL_PREFIX}/${filename.replace(/\.\w+$/, '.jpg')}`
|
||||
} catch {
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract and download images from a web page.
|
||||
* Returns local URLs for successfully downloaded images.
|
||||
*/
|
||||
export async function extractAndDownloadImages(html: string, pageUrl: string): Promise<string[]> {
|
||||
const imageUrls = extractImageUrlsFromHtml(html, pageUrl)
|
||||
const localUrls: string[] = []
|
||||
|
||||
for (const url of imageUrls) {
|
||||
const localPath = await downloadImage(url)
|
||||
if (localPath) {
|
||||
localUrls.push(localPath)
|
||||
}
|
||||
}
|
||||
|
||||
return localUrls
|
||||
}
|
||||
|
||||
function resolveUrl(src: string, pageUrl: string): string | null {
|
||||
try {
|
||||
if (src.startsWith('//')) return `https:${src}`
|
||||
if (src.startsWith('http://') || src.startsWith('https://')) return src
|
||||
if (src.startsWith('/') || src.startsWith('./')) {
|
||||
const base = new URL(pageUrl)
|
||||
return new URL(src, base.origin).href
|
||||
}
|
||||
return new URL(src, pageUrl).href
|
||||
} catch {
|
||||
return null
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user