Files
Keep/keep-notes/lib/ai/services/rss.service.ts

93 lines
2.5 KiB
TypeScript

/**
* RSS/Atom Feed Service
* Parses RSS and Atom feeds and returns structured article entries.
* Used by the scraper pipeline to get individual article URLs from feeds.
*/
import Parser from 'rss-parser'
/**
 * One entry (article) extracted from an RSS/Atom feed.
 */
export interface FeedArticle {
  /** Entry title; `parseFeed` substitutes 'Sans titre' when the feed gives none. */
  title: string

  /** URL of the article. `parseFeed` drops entries that have no link. */
  link: string

  /** Publication date as a string, taken from the item's `pubDate` or, failing that, `isoDate`. */
  pubDate?: string

  /** Short text excerpt; `parseFeed` truncates it to 500 characters. */
  contentSnippet?: string

  /** Entry body, taken from `content:encoded` when present, otherwise from `content`. */
  content?: string

  /** Author, taken from the item's `creator` or, failing that, `dc.creator`. */
  creator?: string
}
/**
 * Feed-level metadata plus the entries retained from the feed.
 */
export interface ParsedFeed {
  /** Feed title; `parseFeed` falls back to the feed URL when the feed has none. */
  title: string

  /** Feed description, when the feed provides one. */
  description?: string

  /** Link declared by the feed itself, when present. */
  link?: string

  /** Entries retained from the feed (at most `MAX_ARTICLES_PER_FEED`). */
  articles: FeedArticle[]
}
/** Upper bound on the number of entries taken from a single feed. */
const MAX_ARTICLES_PER_FEED = 8

/**
 * HTTP headers sent with every feed request.
 * The User-Agent is a desktop Chrome string (presumably so that servers which
 * reject non-browser clients still answer); the Accept header lists the XML
 * feed media types first and HTML with a lower q-value.
 */
const feedRequestHeaders = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Accept': 'application/rss+xml, application/xml, text/xml, application/atom+xml, text/html;q=0.9',
}

/**
 * Shared rss-parser instance used for all feed requests in this module.
 * `timeout` is 15000 (milliseconds, per rss-parser's `timeout` option).
 */
const parser = new Parser({
  timeout: 15000,
  headers: feedRequestHeaders,
})
/**
 * Parses RSS/Atom feeds through the module's shared `parser` instance and
 * exposes their entries as plain `FeedArticle` records.
 */
export class RssService {
  /**
   * Lower-case substrings that mark a URL as feed-like.
   * Longer variants such as '/feed/', '/rss/' and '/feed/json' need no entry
   * of their own: any URL containing them also contains '/feed' or '/rss'.
   */
  private static readonly FEED_URL_MARKERS: readonly string[] = [
    '/feed', '/rss', '/atom',
    '.xml', '.rss', '.atom',
  ]

  /**
   * Detect if a URL looks like an RSS/Atom feed.
   *
   * This is a cheap, case-insensitive substring check over the whole URL
   * string, so it can also match unrelated URLs (e.g. a '/feedback' path or a
   * host name starting with 'feed'). Treat a `true` result as a hint and
   * confirm it with `parseFeed`.
   *
   * @param url URL to inspect.
   * @returns `true` when the URL contains one of the feed markers.
   */
  isFeedUrl(url: string): boolean {
    const lower = url.toLowerCase()
    return RssService.FEED_URL_MARKERS.some(marker => lower.includes(marker))
  }

  /**
   * Try to parse a URL as an RSS/Atom feed.
   *
   * Keeps at most `MAX_ARTICLES_PER_FEED` entries, in feed order. Entries
   * without a link are skipped before counting, so they do not use up slots
   * that later, usable entries could fill.
   *
   * @param feedUrl URL of the feed to fetch and parse.
   * @returns The parsed feed, or `null` when fetching or parsing throws.
   */
  async parseFeed(feedUrl: string): Promise<ParsedFeed | null> {
    try {
      const result = await parser.parseURL(feedUrl)

      const articles: FeedArticle[] = []
      for (const item of result.items || []) {
        if (articles.length >= MAX_ARTICLES_PER_FEED) break

        // Only keep entries with a link
        const link = item.link
        if (!link) continue

        articles.push({
          // `||` on purpose: an empty title also gets the placeholder.
          title: item.title || 'Sans titre',
          link,
          pubDate: item.pubDate || item.isoDate,
          contentSnippet: (item.contentSnippet || '').substring(0, 500),
          content: item['content:encoded'] || item.content || '',
          creator: item.creator || item.dc?.creator,
        })
      }

      return {
        title: result.title || feedUrl,
        description: result.description,
        link: result.link,
        articles,
      }
    } catch {
      // Not a valid feed or fetch failed: deliberately reported as "no feed".
      return null
    }
  }

  /**
   * Fetch an RSS feed and return only the article URLs for scraping.
   * Useful when you want to scrape articles individually.
   *
   * @param feedUrl URL of the feed to fetch and parse.
   * @returns The feed title and the article links; an empty title and an
   *          empty list when the feed could not be parsed.
   */
  async getArticleUrls(feedUrl: string): Promise<{ feedTitle: string; urls: string[] }> {
    const feed = await this.parseFeed(feedUrl)
    if (!feed) return { feedTitle: '', urls: [] }

    return {
      feedTitle: feed.title,
      urls: feed.articles.map(article => article.link),
    }
  }
}

/** Shared module-level instance of the service. */
export const rssService = new RssService()