/*
Files
Keep/keep-notes/lib/ai/services/scrape.service.ts

69 lines
1.8 KiB
TypeScript
*/

/**
* Scrape Service
* Advanced content extraction using Readability and jsdom
*/
import { JSDOM } from 'jsdom'
import { Readability } from '@mozilla/readability'
/**
 * Result of a successful scrape: the article fields extracted by
 * Readability plus the resolved URL that was actually fetched.
 */
export interface ScrapedContent {
title: string // Article title as detected by Readability
content: string // Markdown or clean text
textContent: string // Plain-text version of the article body
excerpt: string // Short summary/description of the article
byline: string // Author attribution, when the page provides one
siteName: string // Publishing site's name, when detectable
url: string // Final URL used for the fetch (scheme-normalized)
}
export class ScrapeService {
/**
 * Fetch a URL and extract its main article content with Readability.
 *
 * @param url - Target URL; `https://` is prepended when no scheme is given.
 * @returns The extracted content, or `null` when the fetch fails or
 *          Readability cannot identify an article in the page.
 */
async scrapeUrl(url: string): Promise<ScrapedContent | null> {
try {
// Default to HTTPS when the caller omitted the scheme.
const targetUrl =
url.startsWith('http://') || url.startsWith('https://')
? url
: 'https://' + url

console.log(`[ScrapeService] Fetching ${targetUrl}...`)
const response = await fetch(targetUrl, {
headers: {
// Browser-like UA: some sites block obvious bot requests.
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
},
// Next.js fetch extension: cache the response for one hour.
next: { revalidate: 3600 }
})
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`)
}

const html = await response.text()
// Pass the URL so relative links in the parsed document resolve.
const dom = new JSDOM(html, { url: targetUrl })
const article = new Readability(dom.window.document).parse()
if (!article) {
// Readability found no recognizable article in the page.
return null
}

// Readability's parse() result fields are typed `string | null`;
// coalesce to '' so the returned object honors the non-nullable
// ScrapedContent contract instead of leaking nulls to callers.
return {
title: article.title ?? '',
content: article.content ?? '', // HTML fragment from readability
textContent: article.textContent ?? '', // Clean text
excerpt: article.excerpt ?? '',
byline: article.byline ?? '',
siteName: article.siteName ?? '',
url: targetUrl
}
} catch (error) {
// Best-effort service: log with context and signal failure via null.
console.error(`[ScrapeService] Error scraping ${url}:`, error)
return null
}
}
}
// Shared singleton instance — the service is stateless, so one is enough.
export const scrapeService = new ScrapeService()