// Keep/keep-notes/lib/ai/services/rss.service.ts

/**
 * RSS/Atom Feed Service
 * Parses RSS and Atom feeds and returns structured article entries.
 * Used by the scraper pipeline to get individual article URLs from feeds.
 */

import Parser from 'rss-parser'

/**
 * One feed entry in the normalized shape produced by `RssService.parseFeed`.
 */
export interface FeedArticle {
  /** Entry title; `parseFeed` substitutes a placeholder string when the feed item has none. */
  title: string
  /** URL of the article. `parseFeed` discards entries whose link is empty. */
  link: string
  /** Publication date string: the item's `pubDate`, or its `isoDate` when `pubDate` is missing. */
  pubDate?: string
  /** Short text excerpt; `parseFeed` truncates it to at most 500 characters. */
  contentSnippet?: string
  /** Item body: `content:encoded` when present, otherwise `content`, otherwise an empty string. */
  content?: string
  /** Author of the entry: the item's `creator`, or `dc.creator` as a fallback. */
  creator?: string
}

/**
 * Feed-level metadata plus the normalized entries, as returned by `RssService.parseFeed`.
 */
export interface ParsedFeed {
  /** Feed title; `parseFeed` falls back to the feed URL when the feed declares no title. */
  title: string
  /** Feed description, passed through from the parser result when available. */
  description?: string
  /** Link declared by the feed itself, passed through from the parser result when available. */
  link?: string
  /** Normalized entries; `parseFeed` returns at most `MAX_ARTICLES_PER_FEED` of them. */
  articles: FeedArticle[]
}

// Module-level rss-parser instance used by RssService.parseFeed.
// - timeout: 15000 — presumably milliseconds, per rss-parser's `timeout` option (TODO confirm).
// - User-Agent: a desktop Chrome-style string; presumably chosen because some
//   servers reject non-browser clients (NOTE(review): confirm the intent).
// - Accept: lists RSS/Atom/XML MIME types, plus text/html with q=0.9.
const parser = new Parser({
  timeout: 15000,
  headers: {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'application/rss+xml, application/xml, text/xml, application/atom+xml, text/html;q=0.9',
  },
})

// Maximum number of articles RssService.parseFeed returns for a single feed.
const MAX_ARTICLES_PER_FEED = 8

/**
 * Thin service around the module-level rss-parser instance.
 * Detects feed-looking URLs, parses feeds into `ParsedFeed`, and extracts
 * article URLs for scraping.
 */
export class RssService {
  /**
   * Heuristically decide whether a URL looks like an RSS/Atom feed.
   *
   * This is a case-insensitive substring check against the whole URL, so it
   * can produce false positives (e.g. a path containing `/feedback` matches
   * `/feed`). Use `parseFeed` to actually confirm that a URL is a feed.
   *
   * @param url URL to inspect.
   * @returns `true` when the URL contains one of the known feed markers.
   */
  isFeedUrl(url: string): boolean {
    // '/feed' and '/rss' already cover the longer '/feed/', '/feed/json' and
    // '/rss/' variants, so those do not need to be listed separately.
    const feedPatterns = ['/feed', '/rss', '/atom', '.xml', '.rss', '.atom']
    const lower = url.toLowerCase()
    return feedPatterns.some(p => lower.includes(p))
  }

  /**
   * Try to fetch and parse a URL as an RSS/Atom feed.
   *
   * Entries without a link are discarded, and at most `MAX_ARTICLES_PER_FEED`
   * of the remaining entries are returned, in feed order.
   *
   * @param feedUrl URL of the feed to fetch.
   * @returns The parsed feed, or `null` when fetching or parsing fails.
   */
  async parseFeed(feedUrl: string): Promise<ParsedFeed | null> {
    try {
      const result = await parser.parseURL(feedUrl)
      return {
        title: result.title || feedUrl,
        description: result.description,
        link: result.link,
        articles: (result.items || [])
          .map(item => ({
            title: item.title || 'Sans titre',
            link: item.link || '',
            pubDate: item.pubDate || item.isoDate,
            contentSnippet: (item.contentSnippet || '').substring(0, 500),
            content: item['content:encoded'] || item.content || '',
            creator: item.creator || item.dc?.creator,
          }))
          // Drop link-less entries BEFORE applying the cap, so unusable
          // entries do not consume the per-feed quota.
          .filter(a => a.link)
          .slice(0, MAX_ARTICLES_PER_FEED),
      }
    } catch {
      // Deliberate best-effort: any fetch or parse failure means
      // "not a usable feed" and is reported to the caller as null.
      return null
    }
  }

  /**
   * Fetch a feed and return only its article URLs, for callers that scrape
   * each article individually.
   *
   * @param feedUrl URL of the feed to fetch.
   * @returns The feed title and article URLs; an empty title and an empty
   *          list when the feed could not be parsed.
   */
  async getArticleUrls(feedUrl: string): Promise<{ feedTitle: string; urls: string[] }> {
    const feed = await this.parseFeed(feedUrl)
    if (!feed) return { feedTitle: '', urls: [] }
    return {
      feedTitle: feed.title,
      urls: feed.articles.map(a => a.link),
    }
  }
}

/** Module-level `RssService` instance exported for importers of this module. */
export const rssService = new RssService()