Files
Momento/memento-note/lib/ai/services/scrape.service.ts
Sepehr Ramezani cff36d9619 fix: MCP server Docker deployment, healthchecks, and minor fixes
MCP server:
- Fix Prisma imports from stale client-generated path to @prisma/client
- Switch schema from SQLite to PostgreSQL for Docker compatibility
- Add prisma generate step to Dockerfile with proper binaryTargets
- Include index-sse.js in Docker build (was excluded by .dockerignore)
- Install openssl and libc6-compat in Alpine image for Prisma runtime

Docker:
- Fix memento-note healthcheck (wget unavailable in bullseye-slim)

Minor fixes:
- scrape.service SSRF protection, middleware route coverage
- canvas-board and note-input type fixes
- next.config turbopack and devIndicators adjustments

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-04-21 22:22:02 +02:00

69 lines
1.9 KiB
TypeScript

/**
* Scrape Service
* Advanced content extraction using Readability and jsdom
*/
import { JSDOM } from 'jsdom'
import { Readability } from '@mozilla/readability'
/**
 * Result returned by ScrapeService.scrapeUrl for a page that Readability
 * could parse. The Readability-derived string fields are set to '' when
 * Readability provides no value for them.
 */
export interface ScrapedContent {
title: string // Article title reported by Readability
content: string // HTML fragment produced by Readability (not Markdown)
textContent: string // Clean text version of the article
excerpt: string // Excerpt reported by Readability
byline: string // Byline reported by Readability
siteName: string // Site name reported by Readability
url: string // URL that was requested, including the 'https://' prefix added when the input had no protocol
}
/** Upper bound on HTTP redirects followed per scrape; every hop is re-validated. */
const MAX_SCRAPE_REDIRECTS = 5

/**
 * Tells whether a dotted-decimal IPv4 address lies in a range that must not be
 * fetched server-side (loopback, private, link-local, CGNAT, multicast/reserved).
 * Malformed input is treated as blocked rather than guessed at.
 */
function isBlockedIpv4(address: string): boolean {
  const octets = address.split('.').map(Number)
  if (octets.length !== 4 || octets.some((o) => !Number.isInteger(o) || o < 0 || o > 255)) {
    return true
  }
  const [a = 0, b = 0] = octets
  return (
    a === 0 || // "this network"
    a === 10 || // RFC 1918 private
    a === 127 || // loopback
    (a === 100 && b >= 64 && b <= 127) || // RFC 6598 carrier-grade NAT
    (a === 169 && b === 254) || // link-local, incl. cloud metadata endpoints
    (a === 172 && b >= 16 && b <= 31) || // RFC 1918 private
    (a === 192 && b === 168) || // RFC 1918 private
    a >= 224 // multicast, reserved, broadcast
  )
}

/**
 * Tells whether an IPv6 address (without brackets, in the compressed form the
 * URL parser emits) is loopback, unspecified, unique-local, link-local,
 * multicast, or an IPv4-mapped/compatible address whose IPv4 part is blocked.
 */
function isBlockedIpv6(address: string): boolean {
  const addr = address.toLowerCase()
  if (addr === '::' || addr === '::1') {
    return true
  }
  // The URL parser serialises embedded IPv4 as two hex groups, e.g.
  // [::ffff:127.0.0.1] becomes ::ffff:7f00:1 — decode and reuse the IPv4 rules.
  const embedded = /^::(?:ffff:)?([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(addr)
  if (embedded) {
    const high = parseInt(embedded[1] ?? '0', 16)
    const low = parseInt(embedded[2] ?? '0', 16)
    return isBlockedIpv4([high >> 8, high & 0xff, low >> 8, low & 0xff].join('.'))
  }
  // fc00::/7 unique-local, fe80::/10 link-local, ff00::/8 multicast
  return /^(?:f[cd][0-9a-f]{2}|fe[89ab][0-9a-f]|ff[0-9a-f]{2}):/.test(addr)
}

/**
 * Tells whether a hostname taken from a parsed URL points at something that is
 * not a public web host: an internal IP literal, a dotless name (localhost,
 * Docker/compose service names), or a conventional internal-only suffix.
 *
 * NOTE: this inspects the literal host only. A public DNS name that resolves
 * to a private address is not detected here.
 */
function isBlockedHostname(hostname: string): boolean {
  const host = hostname.toLowerCase().replace(/\.$/, '')
  if (host === '') {
    return true
  }
  if (host.startsWith('[') && host.endsWith(']')) {
    return isBlockedIpv6(host.slice(1, -1))
  }
  // The URL parser has already canonicalised decimal/hex/octal IPv4 spellings
  // (e.g. http://2130706433/) to dotted-decimal, so one pattern is enough.
  if (/^\d+(?:\.\d+){3}$/.test(host)) {
    return isBlockedIpv4(host)
  }
  return (
    !host.includes('.') ||
    host.endsWith('.localhost') ||
    host.endsWith('.local') ||
    host.endsWith('.internal')
  )
}

/**
 * Parses `candidate` (optionally relative to `base`) and returns it only when
 * it is an http(s) URL whose host passes the internal-address checks.
 */
function toFetchableUrl(candidate: string, base?: string): URL | null {
  let parsed: URL
  try {
    parsed = new URL(candidate, base)
  } catch {
    return null
  }
  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
    return null
  }
  return isBlockedHostname(parsed.hostname) ? null : parsed
}

/**
 * Fetches a web page and extracts its main article with Readability.
 */
export class ScrapeService {
  /**
   * Downloads `url` and returns the article Readability extracts from it.
   *
   * The input is trimmed, and `https://` is prepended when it carries no
   * scheme. Only http(s) URLs pointing at non-internal hosts are fetched;
   * redirects are followed manually (at most MAX_SCRAPE_REDIRECTS hops) so
   * that every hop is validated the same way.
   *
   * @param url - Page address supplied by the caller, with or without scheme.
   * @returns The extracted content, or `null` when the URL is rejected, the
   *   request fails, the response is not OK, or Readability finds no article.
   *   Errors are logged and never thrown to the caller.
   */
  async scrapeUrl(url: string): Promise<ScrapedContent | null> {
    try {
      const trimmed = url.trim()
      // Scheme detection is case-insensitive and covers any scheme, so
      // "HTTP://x" is kept as-is and "ftp://x" is rejected below instead of
      // being turned into "https://ftp://x".
      const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed)
      const targetUrl = hasScheme ? trimmed : 'https://' + trimmed

      let current = toFetchableUrl(targetUrl)
      if (!current) {
        console.warn(`[ScrapeService] Refusing to fetch ${url}: unsupported or non-public URL`)
        return null
      }

      console.log(`[ScrapeService] Fetching ${targetUrl}...`)

      let response: Response | undefined
      for (let hop = 0; hop <= MAX_SCRAPE_REDIRECTS; hop++) {
        const res: Response = await fetch(current.href, {
          headers: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          },
          // Redirects are handled below so a public page cannot bounce the
          // request to an internal address.
          redirect: 'manual',
          next: { revalidate: 3600 }
        })

        const location = res.status >= 300 && res.status < 400 ? res.headers.get('location') : null
        if (location === null) {
          response = res
          break
        }

        // Release the redirect response before issuing the next request.
        await res.body?.cancel()
        const nextUrl = toFetchableUrl(location, current.href)
        if (!nextUrl) {
          throw new Error(`Blocked redirect to ${location}`)
        }
        current = nextUrl
      }

      if (!response) {
        throw new Error(`Too many redirects (more than ${MAX_SCRAPE_REDIRECTS})`)
      }
      if (!response.ok) {
        throw new Error(`HTTP error! status: ${response.status}`)
      }

      const html = await response.text()
      const dom = new JSDOM(html, { url: targetUrl })
      try {
        const article = new Readability(dom.window.document).parse()
        if (!article) {
          return null
        }

        return {
          title: article.title ?? '',
          content: article.content ?? '', // HTML fragment from readability
          textContent: article.textContent ?? '', // Clean text
          excerpt: article.excerpt ?? '',
          byline: article.byline ?? '',
          siteName: article.siteName ?? '',
          url: targetUrl
        }
      } finally {
        // Always release the jsdom window so repeated scrapes do not pile up.
        dom.window.close()
      }
    } catch (error) {
      console.error(`[ScrapeService] Error scraping ${url}:`, error)
      return null
    }
  }
}
/** Shared module-level ScrapeService instance. */
export const scrapeService: ScrapeService = new ScrapeService()