69 lines
1.8 KiB
TypeScript
69 lines
1.8 KiB
TypeScript
/**
 * Scrape Service
 * Advanced content extraction using Readability and jsdom
 */
|
|
|
|
import { JSDOM } from 'jsdom'
|
|
import { Readability } from '@mozilla/readability'
|
|
|
|
/**
 * Result of extracting the main article from a fetched web page.
 */
export interface ScrapedContent {
  /** Article title reported by Readability. */
  title: string
  /** Article body as an HTML fragment produced by Readability (not Markdown). */
  content: string
  /** Article body as clean text, without markup. */
  textContent: string
  /** Excerpt reported by Readability. */
  excerpt: string
  /** Byline (author metadata) reported by Readability. */
  byline: string
  /** Site name reported by Readability. */
  siteName: string
  /** URL that was actually requested, including any protocol that was added. */
  url: string
}
|
|
|
|
/**
 * Fetches web pages and extracts their main article using Mozilla
 * Readability on top of a jsdom document.
 */
export class ScrapeService {
  /**
   * Download `url` and extract its readable article.
   *
   * @param url - Page address. Surrounding whitespace is ignored and
   *   `https://` is prepended when no http(s) scheme is present.
   * @returns The extracted article, or `null` when the input is blank, the
   *   request fails, the response status is not OK, or Readability cannot
   *   find an article. Failures are logged and never rethrown.
   */
  async scrapeUrl(url: string): Promise<ScrapedContent | null> {
    try {
      // Pasted links often carry stray whitespace, which would defeat the scheme check.
      const trimmed = url.trim()
      if (trimmed === '') {
        throw new Error('URL is empty')
      }

      // Case-insensitive so "HTTP://host" is not rewritten to "https://HTTP://host".
      const targetUrl = /^https?:\/\//i.test(trimmed) ? trimmed : 'https://' + trimmed

      console.log(`[ScrapeService] Fetching ${targetUrl}...`)

      const response = await fetch(targetUrl, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        },
        // NOTE(review): looks like the Next.js fetch caching extension (revalidate hourly) — confirm the runtime honours it.
        next: { revalidate: 3600 }
      })

      if (!response.ok) {
        throw new Error(`HTTP error! status: ${response.status}`)
      }

      const html = await response.text()
      const dom = new JSDOM(html, { url: targetUrl })

      try {
        const article = new Readability(dom.window.document).parse()
        if (!article) {
          return null
        }

        // Readability may leave metadata unset; fall back to '' so the
        // declared string fields never carry null/undefined to callers.
        return {
          title: article.title ?? '',
          content: article.content ?? '', // HTML fragment from readability
          textContent: article.textContent ?? '', // Clean text
          excerpt: article.excerpt ?? '',
          byline: article.byline ?? '',
          siteName: article.siteName ?? '',
          url: targetUrl
        }
      } finally {
        // Release the jsdom window once the strings have been extracted.
        dom.window.close()
      }
    } catch (error) {
      console.error(`[ScrapeService] Error scraping ${url}:`, error)
      return null
    }
  }
}
|
|
|
|
// Shared module-level ScrapeService instance, created when this module is loaded.
export const scrapeService = new ScrapeService()
|