381 lines
12 KiB
Python
381 lines
12 KiB
Python
"""
|
|
RSS scraper module with robust error handling.
|
|
|
|
This module provides functionality to scrape RSS feeds from sports sources,
|
|
with built-in error handling and logging.
|
|
"""
|
|
|
|
import logging
|
|
import feedparser
|
|
from datetime import datetime, timezone
|
|
from typing import List, Dict, Optional
|
|
from dataclasses import dataclass
|
|
from urllib.parse import urlparse
|
|
|
|
from sqlalchemy.orm import Session
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class RSSArticleData:
    """Structured data for an RSS article."""

    # Stable identifier taken from the feed entry id, falling back to its link URL.
    article_id: str
    # Article headline from the feed entry.
    title: str
    # Article body: full content when available, else summary/description.
    content: str
    # Publication time; producer (_parse_published_date) returns UTC-aware datetimes.
    published_at: datetime
    # URL of the RSS feed this article was scraped from (not the article's own link).
    source_url: str
    # Associated match id when one is supplied to the scraper; None otherwise.
    match_id: Optional[int]
    # Human-readable source name; the scraper fills in the feed title.
    source: str = "rss"
|
|
|
|
|
|
class RSSScraper:
    """
    RSS scraper with robust error handling.

    Features:
    - Scrapes RSS feeds from configured sports sources
    - Error handling without stopping the process
    - Continues with other sources on errors
    - Structured logging
    - Timeout configuration
    - Filters relevant football articles
    """

    # Default RSS sources for sports news
    DEFAULT_RSS_SOURCES = [
        "http://www.espn.com/espn/rss/news",
        "http://feeds.bbci.co.uk/sport/football/rss.xml",
        "https://www.goal.com/rss",
        "https://www.skysports.com/rss/12040",
    ]

    # Keywords to filter relevant football articles
    FOOTBALL_KEYWORDS = [
        "football", "soccer", "match", "goal", "premier league",
        "la liga", "serie a", "bundesliga", "ligue 1", "champions league",
        "euro", "world cup", "cup", "league", "team", "club", "player",
        "coach", "manager", "score", "result", "transfer"
    ]

    def __init__(
        self,
        rss_sources: Optional[List[str]] = None,
        timeout: int = 30,
        max_articles_per_source: int = 100,
        keywords: Optional[List[str]] = None
    ):
        """
        Initialize RSS scraper.

        Args:
            rss_sources: List of RSS feed URLs to scrape
            timeout: Socket timeout in seconds applied while fetching a feed
                (default: 30)
            max_articles_per_source: Maximum articles to collect per source
            keywords: List of keywords to filter relevant articles
        """
        self.rss_sources = rss_sources or self.DEFAULT_RSS_SOURCES
        self.timeout = timeout
        self.max_articles_per_source = max_articles_per_source
        self.keywords = keywords or self.FOOTBALL_KEYWORDS

        logger.info(f"📰 RSS Scraper initialized with {len(self.rss_sources)} sources")
        for i, source in enumerate(self.rss_sources, 1):
            domain = urlparse(source).netloc
            logger.info(f"   {i}. {domain}")

    def _is_article_relevant(self, title: str, content: str) -> bool:
        """
        Check if an article is relevant to football based on keywords.

        Args:
            title: Article title
            content: Article content

        Returns:
            True if any configured keyword appears in title or content
            (case-insensitive), False otherwise
        """
        text_to_check = f"{title.lower()} {content.lower()}"
        return any(keyword.lower() in text_to_check for keyword in self.keywords)

    def _parse_published_date(self, published: str) -> datetime:
        """
        Parse the published date from an RSS/Atom entry.

        Tries RFC 2822 first (RSS 2.0 `pubDate`), then ISO 8601 (Atom
        `published`/`updated`). Naive datetimes are assumed to be UTC.

        NOTE: the previous implementation called `feedparser.parse()` on the
        date string, but that API parses feed *documents*, so it never
        produced a `published_parsed` for a bare date; it also interpreted
        naive datetimes in local time via `astimezone()`.

        Args:
            published: Published date string from the feed entry

        Returns:
            Timezone-aware datetime in UTC (current time on parse failure)
        """
        from email.utils import parsedate_to_datetime

        # RFC 2822, e.g. "Tue, 07 Sep 2021 10:30:00 +0000"
        try:
            parsed = parsedate_to_datetime(published)
            if parsed.tzinfo is None:
                parsed = parsed.replace(tzinfo=timezone.utc)
            return parsed.astimezone(timezone.utc)
        except Exception:
            pass

        # ISO 8601, e.g. "2021-09-07T10:30:00+00:00" (common in Atom feeds)
        try:
            parsed = datetime.fromisoformat(published)
            if parsed.tzinfo is None:
                parsed = parsed.replace(tzinfo=timezone.utc)
            return parsed.astimezone(timezone.utc)
        except Exception as e:
            logger.warning(f"⚠️ Failed to parse date '{published}': {e}")
            return datetime.now(timezone.utc)

    def _parse_feed(
        self,
        source_url: str
    ) -> List[RSSArticleData]:
        """
        Parse RSS feed and extract relevant articles.

        Per-entry failures are logged and skipped; a feed-level failure is
        logged and yields an empty list, so callers can keep going.

        Args:
            source_url: URL of the RSS feed

        Returns:
            List of RSSArticleData objects
        """
        import socket

        articles = []

        try:
            logger.info(f"🔍 Parsing RSS feed: {source_url}")

            # feedparser.parse() accepts no timeout argument, so apply the
            # configured timeout through the global socket default for the
            # duration of this fetch only, restoring the previous value after.
            previous_timeout = socket.getdefaulttimeout()
            socket.setdefaulttimeout(self.timeout)
            try:
                feed = feedparser.parse(source_url)
            finally:
                socket.setdefaulttimeout(previous_timeout)

            # bozo signals malformed XML; feedparser often still recovers,
            # so warn and continue rather than abort.
            if feed.get('bozo', False):
                logger.warning(f"⚠️ RSS feed has malformed XML: {source_url}")

            feed_title = feed.feed.get('title', 'Unknown')
            logger.info(f"📰 Feed: {feed_title}")
            logger.info(f"   Total entries: {len(feed.entries)}")

            for entry in feed.entries[:self.max_articles_per_source]:
                try:
                    # Prefer the entry's own id; fall back to its link.
                    article_id = entry.get('id') or entry.get('link', '')
                    if not article_id:
                        logger.warning("⚠️ Article missing ID, skipping")
                        continue

                    title = entry.get('title', '')
                    if not title:
                        logger.warning("⚠️ Article missing title, skipping")
                        continue

                    # Prefer full content, then summary, then description.
                    content = ''
                    if 'content' in entry:
                        content = entry.content[0].value if entry.content else ''
                    elif 'summary' in entry:
                        content = entry.summary
                    elif 'description' in entry:
                        content = entry.description

                    published_str = entry.get('published') or entry.get('updated')
                    if not published_str:
                        logger.warning("⚠️ Article missing published date, using current time")
                        published_at = datetime.now(timezone.utc)
                    else:
                        published_at = self._parse_published_date(published_str)

                    # Drop articles that match none of the keywords.
                    if not self._is_article_relevant(title, content):
                        logger.debug(f"🚫 Article not relevant: {title}")
                        continue

                    articles.append(RSSArticleData(
                        article_id=article_id,
                        title=title,
                        content=content,
                        published_at=published_at,
                        source_url=source_url,
                        match_id=None,  # Will be matched later if needed
                        source=feed_title
                    ))

                except Exception as e:
                    # One bad entry must not abort the rest of the feed.
                    logger.error(f"❌ Error processing article: {e}")
                    continue

            logger.info(f"✅ Collected {len(articles)} relevant articles from {source_url}")

        except Exception as e:
            logger.error(f"❌ Failed to parse RSS feed {source_url}: {e}")

        return articles

    def scrape_all_sources(
        self,
        match_id: Optional[int] = None
    ) -> List[RSSArticleData]:
        """
        Scrape all configured RSS sources.

        A failing source is logged and skipped so the remaining sources are
        still scraped.

        Args:
            match_id: Optional match ID to associate with articles

        Returns:
            List of RSSArticleData objects from all sources
        """
        all_articles = []

        for source_url in self.rss_sources:
            try:
                articles = self._parse_feed(source_url)

                # Explicit None check: a match_id of 0 is a valid id and must
                # not be dropped by truthiness.
                if match_id is not None:
                    for article in articles:
                        article.match_id = match_id

                all_articles.extend(articles)

            except Exception as e:
                logger.error(
                    f"❌ Failed to scrape source {source_url}: {e}. "
                    f"Continuing with other sources..."
                )
                continue

        logger.info(
            f"✅ Total articles collected from all sources: {len(all_articles)}"
        )

        return all_articles

    def scrape_single_source(
        self,
        source_url: str,
        match_id: Optional[int] = None
    ) -> List[RSSArticleData]:
        """
        Scrape a single RSS source.

        Args:
            source_url: URL of the RSS feed to scrape
            match_id: Optional match ID to associate with articles

        Returns:
            List of RSSArticleData objects
        """
        articles = self._parse_feed(source_url)

        # Explicit None check so match_id == 0 is still applied.
        if match_id is not None:
            for article in articles:
                article.match_id = match_id

        return articles

    def save_articles_to_db(self, articles: List[RSSArticleData], db: Session) -> None:
        """
        Save RSS articles to database, skipping duplicates.

        Args:
            articles: List of RSSArticleData objects
            db: SQLAlchemy database session

        Raises:
            Exception: re-raised after rollback when the commit fails
        """
        from app.models.rss_article import RSSArticle

        # Fetch all already-stored ids in one query instead of one
        # existence query per article (N+1).
        incoming_ids = [a.article_id for a in articles]
        existing_ids = set()
        if incoming_ids:
            rows = db.query(RSSArticle.article_id).filter(
                RSSArticle.article_id.in_(incoming_ids)
            ).all()
            existing_ids = {row[0] for row in rows}

        saved_count = 0
        for article_data in articles:
            # Skip articles already in the DB or duplicated within this batch.
            if article_data.article_id in existing_ids:
                logger.debug(f"Article {article_data.article_id} already exists, skipping")
                continue
            existing_ids.add(article_data.article_id)

            db.add(RSSArticle(
                article_id=article_data.article_id,
                title=article_data.title,
                content=article_data.content,
                published_at=article_data.published_at,
                source_url=article_data.source_url,
                match_id=article_data.match_id,
                source=article_data.source
            ))
            saved_count += 1

        try:
            db.commit()
            logger.info(f"✅ Saved {saved_count} new RSS articles to database")
        except Exception as e:
            db.rollback()
            logger.error(f"❌ Failed to save RSS articles to database: {e}")
            raise

    def scrape_and_save(
        self,
        db: Session,
        match_id: Optional[int] = None
    ) -> List[RSSArticleData]:
        """
        Scrape all RSS sources and save to database.

        Args:
            db: SQLAlchemy database session
            match_id: Optional match ID to associate with articles

        Returns:
            List of RSSArticleData objects

        Raises:
            Exception: re-raised when the database save fails
        """
        try:
            articles = self.scrape_all_sources(match_id)

            if articles:
                self.save_articles_to_db(articles, db)

            return articles

        except Exception as e:
            logger.error(f"❌ Failed to scrape and save RSS articles: {e}")
            raise
|
|
|
|
|
|
def create_rss_scraper(
    rss_sources: Optional[List[str]] = None,
    keywords: Optional[List[str]] = None,
    timeout: int = 30,
    max_articles_per_source: int = 100
) -> RSSScraper:
    """
    Factory function to create an RSS scraper instance.

    Args:
        rss_sources: Optional list of RSS feed URLs
        keywords: Optional list of keywords to filter articles
        timeout: Request timeout in seconds (previously hard-coded to 30)
        max_articles_per_source: Maximum articles to collect per source
            (previously hard-coded to 100)

    Returns:
        Configured RSSScraper instance
    """
    # New keyword parameters keep the old hard-coded values as defaults,
    # so existing callers are unaffected.
    return RSSScraper(
        rss_sources=rss_sources,
        timeout=timeout,
        max_articles_per_source=max_articles_per_source,
        keywords=keywords
    )
|