/**
 * RSS/Atom Feed Service
 *
 * Parses RSS and Atom feeds and returns structured article entries.
 * Used by the scraper pipeline to get individual article URLs from feeds.
 */

import Parser from 'rss-parser'

/** A single feed entry, normalized from an RSS item or Atom entry. */
export interface FeedArticle {
  /** Entry title; falls back to 'Sans titre' when the feed gives none. */
  title: string
  /** Entry URL. Always non-empty: entries without a link are dropped. */
  link: string
  /** Publication date as reported by the feed (`pubDate`, else `isoDate`). */
  pubDate?: string
  /** Plain-text snippet, truncated to 500 characters. */
  contentSnippet?: string
  /** Entry body (`content:encoded` when present, else `content`). */
  content?: string
  /** Author (`creator`, else `dc.creator`). */
  creator?: string
}

/** Feed-level metadata plus the retained articles. */
export interface ParsedFeed {
  /** Feed title; falls back to the feed URL when the feed gives none. */
  title: string
  description?: string
  link?: string
  articles: FeedArticle[]
}

// Shared parser instance. The headers mimic a desktop browser —
// NOTE(review): presumably because some sites reject non-browser clients; confirm.
const parser = new Parser({
  // Passed straight to rss-parser's `timeout` option (milliseconds per its docs).
  timeout: 15000,
  headers: {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'application/rss+xml, application/xml, text/xml, application/atom+xml, text/html;q=0.9',
  },
})

/** Maximum number of articles kept from a single feed. */
const MAX_ARTICLES_PER_FEED = 8

/** Maximum length of `FeedArticle.contentSnippet`, in characters. */
const MAX_SNIPPET_LENGTH = 500

/**
 * Substrings that mark a URL as feed-like.
 *
 * Matching is done with `includes`, so '/feed' already covers '/feed/' and
 * '/feed/json', and '/rss' already covers '/rss/'.
 */
const FEED_URL_PATTERNS: readonly string[] = [
  '/feed',
  '/rss',
  '/atom',
  '.xml',
  '.rss',
  '.atom',
]

export class RssService {
  /**
   * Detect if a URL looks like an RSS/Atom feed.
   *
   * This is a case-insensitive substring heuristic on the URL text only; no
   * request is made. It can yield false positives (for example a path such as
   * '/feedback' contains '/feed'), so callers should treat `true` as "worth
   * trying `parseFeed`", not as proof.
   *
   * @param url - The URL to inspect.
   * @returns `true` when the URL contains any known feed marker.
   */
  isFeedUrl(url: string): boolean {
    const lower = url.toLowerCase()
    return FEED_URL_PATTERNS.some(pattern => lower.includes(pattern))
  }

  /**
   * Try to parse a URL as an RSS/Atom feed.
   *
   * Entries without a link are skipped, and at most MAX_ARTICLES_PER_FEED of
   * the remaining entries are returned, in feed order.
   *
   * @param feedUrl - The URL to fetch and parse.
   * @returns The parsed feed, or `null` if the fetch failed or the response
   *   is not a valid feed. This method never throws.
   */
  async parseFeed(feedUrl: string): Promise<ParsedFeed | null> {
    try {
      const result = await parser.parseURL(feedUrl)

      const articles: FeedArticle[] = []
      for (const item of result.items ?? []) {
        if (articles.length >= MAX_ARTICLES_PER_FEED) break

        // Skip link-less entries BEFORE counting against the cap, so they do
        // not consume slots that later, usable entries could fill.
        if (!item.link) continue

        articles.push({
          title: item.title || 'Sans titre',
          link: item.link,
          pubDate: item.pubDate || item.isoDate,
          contentSnippet: (item.contentSnippet || '').substring(0, MAX_SNIPPET_LENGTH),
          content: item['content:encoded'] || item.content || '',
          creator: item.creator || item.dc?.creator,
        })
      }

      return {
        title: result.title || feedUrl,
        description: result.description,
        link: result.link,
        articles,
      }
    } catch {
      // Deliberate best-effort: any fetch or parse failure means "not a feed".
      return null
    }
  }

  /**
   * Fetch an RSS feed and return only the article URLs for scraping.
   * Useful when you want to scrape articles individually.
   *
   * @param feedUrl - The feed URL to fetch.
   * @returns The feed title and its article URLs; an empty title and an empty
   *   list when the feed could not be parsed.
   */
  async getArticleUrls(feedUrl: string): Promise<{ feedTitle: string; urls: string[] }> {
    const feed = await this.parseFeed(feedUrl)
    if (!feed) return { feedTitle: '', urls: [] }

    return {
      feedTitle: feed.title,
      urls: feed.articles.map(article => article.link),
    }
  }
}

/** Shared service instance for callers that do not need their own. */
export const rssService = new RssService()