refactor(ux): consolidate BMAD skills, update design system, and clean up Prisma generated client
This commit is contained in:
68
keep-notes/lib/ai/services/scrape.service.ts
Normal file
68
keep-notes/lib/ai/services/scrape.service.ts
Normal file
@@ -0,0 +1,68 @@
|
||||
/**
|
||||
* Scrape Service
|
||||
* Advanced content extraction using Readability and jsdom
|
||||
*/
|
||||
|
||||
import { JSDOM } from 'jsdom'
|
||||
import { Readability } from '@mozilla/readability'
|
||||
|
||||
/**
 * Result of extracting the main article from a web page.
 *
 * All fields except `url` are copied from the article object produced by
 * Mozilla Readability.
 */
export interface ScrapedContent {
  /** Article title as reported by Readability. */
  title: string
  /** Article body as an HTML fragment produced by Readability (not Markdown). */
  content: string
  /** Article body as plain text with markup removed. */
  textContent: string
  /** Short excerpt/description as reported by Readability. */
  excerpt: string
  /** Author/byline metadata as reported by Readability. */
  byline: string
  /** Site name metadata as reported by Readability. */
  siteName: string
  /** URL that was actually fetched (protocol added when the input lacked one). */
  url: string
}
|
||||
|
||||
/**
 * Turns user input into a URL that is safe to hand to `fetch`.
 *
 * @returns The trimmed URL (with `https://` prepended when no protocol was
 *   given), or `null` for blank input or an explicit non-HTTP scheme.
 */
function toFetchableUrl(raw: string): string | null {
  const trimmed = raw.trim()
  if (trimmed === '') {
    return null
  }
  // URL schemes are case-insensitive, so `HTTP://host` is already complete.
  if (/^https?:\/\//i.test(trimmed)) {
    return trimmed
  }
  // An explicit foreign scheme (ftp://, file://, ...) must not be rewritten
  // into `https://ftp://...`; refuse it instead.
  if (/^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed)) {
    return null
  }
  return 'https://' + trimmed
}

/**
 * Fetches a web page and extracts its main article using Mozilla Readability
 * on top of a jsdom document.
 */
export class ScrapeService {
  /**
   * Downloads `url` and returns the article Readability extracts from it.
   *
   * @param url - Page address. Surrounding whitespace is ignored and
   *   `https://` is prepended when the value has no `http://`/`https://`
   *   prefix (checked case-insensitively).
   * @returns The extracted article, or `null` when the input is blank or uses
   *   a non-HTTP scheme, the request fails or answers with a non-OK status,
   *   or Readability cannot find an article. Failures are logged and reported
   *   as `null`; this method does not reject.
   */
  async scrapeUrl(url: string): Promise<ScrapedContent | null> {
    let dom: JSDOM | undefined

    try {
      const targetUrl = toFetchableUrl(url)
      if (targetUrl === null) {
        console.error(`[ScrapeService] Error scraping ${url}: empty or unsupported URL`)
        return null
      }

      console.log(`[ScrapeService] Fetching ${targetUrl}...`)

      // NOTE(review): targetUrl is caller-supplied. If callers pass untrusted
      // input, this request can reach internal hosts (SSRF) — consider an
      // allow/deny list at the call site or here.
      const response = await fetch(targetUrl, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        },
        // Presumably the Next.js fetch cache hint (seconds) — confirm runtime.
        next: { revalidate: 3600 }
      })

      if (!response.ok) {
        throw new Error(`HTTP error! status: ${response.status}`)
      }

      const html = await response.text()
      // Passing the page URL lets jsdom resolve relative links in the document.
      dom = new JSDOM(html, { url: targetUrl })

      const article = new Readability(dom.window.document).parse()
      if (!article) {
        return null
      }

      // Readability may leave individual fields null/undefined; normalise them
      // so the result honours the all-string ScrapedContent contract.
      return {
        title: article.title ?? '',
        content: article.content ?? '', // HTML fragment from readability
        textContent: article.textContent ?? '', // Clean text
        excerpt: article.excerpt ?? '',
        byline: article.byline ?? '',
        siteName: article.siteName ?? '',
        url: targetUrl
      }
    } catch (error: unknown) {
      console.error(`[ScrapeService] Error scraping ${url}:`, error)
      return null
    } finally {
      // Release the jsdom window so its resources can be reclaimed.
      dom?.window.close()
    }
  }
}
|
||||
|
||||
/** Module-level ScrapeService instance, created once at import time and exported for reuse. */
export const scrapeService = new ScrapeService()
|
||||
Reference in New Issue
Block a user