/*
Files
Keep/keep-notes/lib/ai/services/scrape.service.ts

69 lines
1.8 KiB
TypeScript
*/

/**
* Scrape Service
* Advanced content extraction using Readability and jsdom
*/
import { JSDOM } from 'jsdom'
import { Readability } from '@mozilla/readability'
/**
 * Result of a successful scrape: the article fields extracted by
 * Readability plus the resolved URL that was actually fetched.
 */
export interface ScrapedContent {
title: string // Article title as detected by Readability
content: string // Markdown or clean text
textContent: string // Plain-text version of the article body
excerpt: string // Short summary/description of the article
byline: string // Author attribution, when the page provides one
siteName: string // Publishing site's name, when detectable
url: string // Final URL used for the fetch (scheme-normalized)
}
export class ScrapeService {
/**
 * Fetch a URL and extract its main article content with Readability.
 *
 * @param url - Target URL; `https://` is prepended when no scheme is given.
 * @returns The extracted content, or `null` when the fetch fails or
 *          Readability cannot identify an article in the page.
 */
async scrapeUrl(url: string): Promise<ScrapedContent | null> {
try {
// Default to HTTPS when the caller omitted the scheme.
const targetUrl =
url.startsWith('http://') || url.startsWith('https://')
? url
: 'https://' + url

console.log(`[ScrapeService] Fetching ${targetUrl}...`)
const response = await fetch(targetUrl, {
headers: {
// Browser-like UA: some sites block obvious bot requests.
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
},
// Next.js fetch extension: cache the response for one hour.
next: { revalidate: 3600 }
})
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`)
}

const html = await response.text()
// Pass the URL so relative links in the parsed document resolve.
const dom = new JSDOM(html, { url: targetUrl })
const article = new Readability(dom.window.document).parse()
if (!article) {
// Readability found no recognizable article in the page.
return null
}

// Readability's parse() result fields are typed `string | null`;
// coalesce to '' so the returned object honors the non-nullable
// ScrapedContent contract instead of leaking nulls to callers.
return {
title: article.title ?? '',
content: article.content ?? '', // HTML fragment from readability
textContent: article.textContent ?? '', // Clean text
excerpt: article.excerpt ?? '',
byline: article.byline ?? '',
siteName: article.siteName ?? '',
url: targetUrl
}
} catch (error) {
// Best-effort service: log with context and signal failure via null.
console.error(`[ScrapeService] Error scraping ${url}:`, error)
return null
}
}
}
// Shared singleton instance — the service is stateless, so one is enough.
export const scrapeService = new ScrapeService()