93 lines
2.5 KiB
TypeScript
93 lines
2.5 KiB
TypeScript
/**
 * RSS/Atom Feed Service
 *
 * Parses RSS and Atom feeds and returns structured article entries.
 * Used by the scraper pipeline to get individual article URLs from feeds.
 */
|
|
|
|
import Parser from 'rss-parser'
|
|
|
|
/**
 * A single feed entry, normalised from whatever the RSS/Atom parser returned.
 */
export interface FeedArticle {
  /** Entry title. */
  title: string
  /** URL of the article the entry points to. */
  link: string
  /** Publication date string as reported by the feed, when present. */
  pubDate?: string
  /** Short plain-text excerpt of the entry, when available. */
  contentSnippet?: string
  /** Full entry content as delivered by the feed, when available. */
  content?: string
  /** Author/creator of the entry, when the feed provides one. */
  creator?: string
}
|
|
|
|
/**
 * Feed-level metadata together with the normalised entries of the feed.
 */
export interface ParsedFeed {
  /** Title of the feed. */
  title: string
  /** Feed description, when the feed declares one. */
  description?: string
  /** Link declared by the feed itself, when present. */
  link?: string
  /** Entries extracted from the feed. */
  articles: FeedArticle[]
}
|
|
|
|
/**
 * Shared `rss-parser` instance used for every feed request in this module.
 *
 * - `timeout` is set to 15000 (milliseconds, per rss-parser's `timeout` option).
 * - The User-Agent identifies as desktop Chrome 120 on Windows
 *   (presumably so that sites which reject unknown clients still answer —
 *   confirm with the scraper pipeline owners).
 * - The Accept header lists RSS/Atom/XML media types first and falls back to
 *   `text/html` with a lower quality value (q=0.9).
 */
const parser = new Parser({
  timeout: 15000,
  headers: {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'application/rss+xml, application/xml, text/xml, application/atom+xml, text/html;q=0.9',
  },
})
|
|
|
|
/** Upper bound on the number of entries kept from a single feed. */
const MAX_ARTICLES_PER_FEED = 8
|
|
|
|
/**
 * Detects, fetches and normalises RSS/Atom feeds.
 *
 * The class keeps no per-instance state; fetching goes through the
 * module-level `parser`.
 */
export class RssService {
  /**
   * Lower-case URL fragments that hint at a feed endpoint.
   *
   * Matching is by substring, so '/feed' already covers '/feed/' and
   * '/feed/json', and '/rss' already covers '/rss/'.
   */
  private static readonly feedUrlHints: readonly string[] = [
    '/feed', '/rss', '/atom',
    '.xml', '.rss', '.atom',
  ]

  /**
   * Detect if a URL looks like an RSS/Atom feed.
   *
   * This is a case-insensitive substring heuristic, not a validation: any URL
   * containing one of the hints matches (so e.g. '/feedback' is reported as a
   * feed too). Use `parseFeed` to find out whether a URL really is a feed.
   *
   * @param url URL to inspect.
   * @returns `true` when the URL contains at least one feed hint.
   */
  isFeedUrl(url: string): boolean {
    const lower = url.toLowerCase()
    return RssService.feedUrlHints.some(hint => lower.includes(hint))
  }

  /**
   * Try to parse a URL as an RSS/Atom feed.
   *
   * Entries without a usable link are skipped, and at most
   * `MAX_ARTICLES_PER_FEED` of the remaining entries are returned, in feed
   * order. Skipping happens before the cap is applied so that link-less
   * entries do not use up the quota.
   *
   * @param feedUrl URL of the feed to fetch and parse.
   * @returns The parsed feed, or `null` if fetching or parsing failed.
   */
  async parseFeed(feedUrl: string): Promise<ParsedFeed | null> {
    try {
      const result = await parser.parseURL(feedUrl)

      const articles: FeedArticle[] = []
      for (const item of result.items ?? []) {
        if (articles.length >= MAX_ARTICLES_PER_FEED) break

        // Trim so that whitespace-only links are treated as missing.
        const link = item.link?.trim()
        if (!link) continue

        articles.push({
          title: item.title || 'Sans titre',
          link,
          pubDate: item.pubDate || item.isoDate,
          // Snippets are capped at 500 characters.
          contentSnippet: (item.contentSnippet || '').substring(0, 500),
          content: item['content:encoded'] || item.content || '',
          creator: item.creator || item.dc?.creator,
        })
      }

      return {
        // Fall back to the URL when the feed has no title of its own.
        title: result.title || feedUrl,
        description: result.description,
        link: result.link,
        articles,
      }
    } catch {
      // Deliberate best-effort: not a valid feed or the fetch failed.
      return null
    }
  }

  /**
   * Fetch an RSS feed and return only the article URLs for scraping.
   * Useful when you want to scrape articles individually.
   *
   * @param feedUrl URL of the feed to fetch.
   * @returns The feed title and its article links; an empty title and an
   *   empty list when the feed could not be parsed.
   */
  async getArticleUrls(feedUrl: string): Promise<{ feedTitle: string; urls: string[] }> {
    const feed = await this.parseFeed(feedUrl)
    if (!feed) return { feedTitle: '', urls: [] }

    return {
      feedTitle: feed.title,
      urls: feed.articles.map(a => a.link),
    }
  }
}
|
|
|
|
/** Shared `RssService` instance exported for use by other modules. */
export const rssService = new RssService()
|