/**
 * Scrape Service
 * Advanced content extraction using Readability and jsdom
 */

import { JSDOM } from 'jsdom'
import { Readability } from '@mozilla/readability'

/** Article data extracted from a fetched page by {@link ScrapeService.scrapeUrl}. */
export interface ScrapedContent {
  title: string
  content: string // HTML fragment produced by Readability
  textContent: string // Clean text
  excerpt: string
  byline: string
  siteName: string
  url: string // URL that was actually fetched (protocol included)
}

/** Fetches web pages and extracts their main article with Readability. */
export class ScrapeService {
  /**
   * Downloads `url`, parses the HTML with jsdom and runs Readability on it.
   *
   * @param url - Page address. Surrounding whitespace is ignored and
   *   `https://` is prepended when no `http://` or `https://` prefix is
   *   present (the prefix check is case-insensitive).
   * @returns The extracted article, or `null` when the URL is empty, the
   *   request fails, the response status is not OK, or Readability cannot
   *   extract an article. Errors are logged and never rethrown.
   */
  async scrapeUrl(url: string): Promise<ScrapedContent | null> {
    const trimmedUrl = url.trim()
    if (trimmedUrl === '') {
      // Nothing to fetch; report it the same way as any other scrape failure.
      console.error(`[ScrapeService] Error scraping ${url}:`, new Error('URL is empty'))
      return null
    }

    // Add protocol if missing. The match is case-insensitive so that an input
    // such as "HTTP://example.com" is not turned into "https://HTTP://...".
    const targetUrl = /^https?:\/\//i.test(trimmedUrl) ? trimmedUrl : 'https://' + trimmedUrl

    try {
      console.log(`[ScrapeService] Fetching ${targetUrl}...`)

      const response = await fetch(targetUrl, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        },
        // NOTE(review): presumably the Next.js fetch caching extension — confirm
        // this module only runs where that option is understood.
        next: { revalidate: 3600 }
      })

      if (!response.ok) {
        throw new Error(`HTTP error! status: ${response.status}`)
      }

      const html = await response.text()
      const dom = new JSDOM(html, { url: targetUrl })

      try {
        const article = new Readability(dom.window.document).parse()
        if (!article) {
          return null
        }

        // Readability may leave individual fields unset; fall back to '' so the
        // result always satisfies the all-string ScrapedContent contract.
        return {
          title: article.title ?? '',
          content: article.content ?? '', // HTML fragment from readability
          textContent: article.textContent ?? '', // Clean text
          excerpt: article.excerpt ?? '',
          byline: article.byline ?? '',
          siteName: article.siteName ?? '',
          url: targetUrl
        }
      } finally {
        // Release the jsdom window once extraction is done.
        dom.window.close()
      }
    } catch (error) {
      console.error(`[ScrapeService] Error scraping ${url}:`, error)
      return null
    }
  }
}

/** Shared instance for callers that do not need their own ScrapeService. */
export const scrapeService = new ScrapeService()