"""
RSS scraper module with robust error handling.

This module provides functionality to scrape RSS feeds from sports sources,
with built-in error handling and logging.
"""

import logging
import socket
from dataclasses import dataclass
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from typing import List, Dict, Optional
from urllib.parse import urlparse

import feedparser
from sqlalchemy.orm import Session

logger = logging.getLogger(__name__)


@dataclass
class RSSArticleData:
    """Structured data for an RSS article."""

    article_id: str
    title: str
    content: str
    published_at: datetime
    source_url: str
    match_id: Optional[int]
    source: str = "rss"


class RSSScraper:
    """
    RSS scraper with robust error handling.

    Features:
    - Scrapes RSS feeds from configured sports sources
    - Error handling without stopping the process
    - Continues with other sources on errors
    - Structured logging
    - Timeout configuration
    - Filters relevant football articles
    """

    # Default RSS sources for sports news
    DEFAULT_RSS_SOURCES = [
        "http://www.espn.com/espn/rss/news",
        "http://feeds.bbci.co.uk/sport/football/rss.xml",
        "https://www.goal.com/rss",
        "https://www.skysports.com/rss/12040",
    ]

    # Keywords to filter relevant football articles
    FOOTBALL_KEYWORDS = [
        "football", "soccer", "match", "goal", "premier league",
        "la liga", "serie a", "bundesliga", "ligue 1", "champions league",
        "euro", "world cup", "cup", "league", "team", "club", "player",
        "coach", "manager", "score", "result", "transfer"
    ]

    def __init__(
        self,
        rss_sources: Optional[List[str]] = None,
        timeout: int = 30,
        max_articles_per_source: int = 100,
        keywords: Optional[List[str]] = None
    ):
        """
        Initialize RSS scraper.

        Args:
            rss_sources: List of RSS feed URLs to scrape. ``None`` or an empty
                list selects ``DEFAULT_RSS_SOURCES``.
            timeout: Request timeout in seconds (default: 30)
            max_articles_per_source: Maximum articles to collect per source
            keywords: List of keywords to filter relevant articles. ``None``
                or an empty list selects ``FOOTBALL_KEYWORDS``.
        """
        # Copy the lists so mutating an instance never alters the class-level
        # defaults (or the caller's list).
        self.rss_sources = list(rss_sources or self.DEFAULT_RSS_SOURCES)
        self.timeout = timeout
        self.max_articles_per_source = max_articles_per_source
        self.keywords = list(keywords or self.FOOTBALL_KEYWORDS)

        logger.info(f"📰 RSS Scraper initialized with {len(self.rss_sources)} sources")
        for i, source in enumerate(self.rss_sources, 1):
            domain = urlparse(source).netloc
            logger.info(f" {i}. {domain}")

    def _is_article_relevant(self, title: str, content: str) -> bool:
        """
        Check if an article is relevant to football based on keywords.

        Matching is a case-insensitive substring test against the title and
        content combined.

        Args:
            title: Article title
            content: Article content

        Returns:
            True if any configured keyword occurs in the text, False otherwise
        """
        text_to_check = f"{title.lower()} {content.lower()}"
        return any(keyword.lower() in text_to_check for keyword in self.keywords)

    def _parse_published_date(self, published: str) -> datetime:
        """
        Parse the published date string from an RSS/Atom feed.

        RFC 2822 dates (typical for RSS) are tried first, then ISO 8601
        timestamps (typical for Atom). A value without timezone information
        is interpreted as UTC.

        Args:
            published: Published date string from the feed

        Returns:
            Timezone-aware datetime in UTC. If the string cannot be parsed,
            a warning is logged and the current UTC time is returned.
        """
        try:
            try:
                parsed = parsedate_to_datetime(published)
            except (TypeError, ValueError):
                # Not RFC 2822: try ISO 8601. A trailing "Z" is rewritten
                # because older fromisoformat() implementations reject it.
                text = published.strip()
                if text.endswith(("Z", "z")):
                    text = text[:-1] + "+00:00"
                parsed = datetime.fromisoformat(text)

            if parsed.tzinfo is None:
                # astimezone() would treat a naive value as *local* time.
                parsed = parsed.replace(tzinfo=timezone.utc)
            return parsed.astimezone(timezone.utc)
        except Exception as e:
            logger.warning(f"⚠️ Failed to parse date '{published}': {e}")
            return datetime.now(timezone.utc)

    def _fetch_feed(self, source_url: str):
        """Fetch and parse ``source_url`` with ``self.timeout`` applied."""
        # feedparser.parse() takes no timeout argument, so the process-wide
        # socket default is set for the duration of the call and restored
        # afterwards. NOTE(review): this is global state and therefore not
        # safe to rely on when several threads scrape concurrently.
        previous_timeout = socket.getdefaulttimeout()
        socket.setdefaulttimeout(self.timeout)
        try:
            return feedparser.parse(source_url)
        finally:
            socket.setdefaulttimeout(previous_timeout)

    @staticmethod
    def _extract_content(entry) -> str:
        """Return the entry body: full content, else summary, else description."""
        content_parts = entry.get('content')
        if content_parts:
            value = content_parts[0].get('value')
            if value:
                return value
        return entry.get('summary') or entry.get('description') or ''

    def _extract_published_at(self, entry) -> datetime:
        """Return the entry's publication time in UTC (current time if absent)."""
        date_key = 'published' if entry.get('published') else 'updated'
        published_str = entry.get(date_key)
        if not published_str:
            logger.warning(f"⚠️ Article missing published date, using current time")
            return datetime.now(timezone.utc)

        # Prefer the time tuple feedparser already parsed (documented as UTC);
        # fall back to parsing the raw string ourselves.
        parsed_tuple = entry.get(f"{date_key}_parsed")
        if parsed_tuple:
            try:
                return datetime(*parsed_tuple[:6], tzinfo=timezone.utc)
            except (TypeError, ValueError):
                pass
        return self._parse_published_date(published_str)

    def _build_article(
        self,
        entry,
        source_url: str,
        feed_title: str
    ) -> Optional[RSSArticleData]:
        """Convert one feed entry to RSSArticleData, or None if it is skipped."""
        article_id = entry.get('id') or entry.get('link', '')
        if not article_id:
            logger.warning(f"⚠️ Article missing ID, skipping")
            return None

        title = entry.get('title', '')
        if not title:
            logger.warning(f"⚠️ Article missing title, skipping")
            return None

        content = self._extract_content(entry)
        published_at = self._extract_published_at(entry)

        if not self._is_article_relevant(title, content):
            logger.debug(f"🚫 Article not relevant: {title}")
            return None

        return RSSArticleData(
            article_id=article_id,
            title=title,
            content=content,
            published_at=published_at,
            source_url=source_url,
            match_id=None,  # Will be matched later if needed
            source=feed_title
        )

    def _parse_feed(
        self,
        source_url: str
    ) -> List[RSSArticleData]:
        """
        Parse RSS feed and extract articles.

        Never raises: a failure on the feed is logged and yields whatever was
        collected so far; a failure on a single entry is logged and that
        entry is skipped.

        Args:
            source_url: URL of the RSS feed

        Returns:
            List of RSSArticleData objects (relevant articles only)
        """
        articles: List[RSSArticleData] = []

        try:
            logger.info(f"🔍 Parsing RSS feed: {source_url}")

            feed = self._fetch_feed(source_url)

            if feed.get('bozo', False):
                logger.warning(f"⚠️ RSS feed has malformed XML: {source_url}")
                # Continue anyway as feedparser can handle some malformed feeds

            feed_title = feed.feed.get('title', 'Unknown')
            logger.info(f"📰 Feed: {feed_title}")
            logger.info(f" Total entries: {len(feed.entries)}")

            for entry in feed.entries[:self.max_articles_per_source]:
                try:
                    article_data = self._build_article(entry, source_url, feed_title)
                except Exception as e:
                    logger.error(f"❌ Error processing article: {e}")
                    continue
                if article_data is not None:
                    articles.append(article_data)

            logger.info(f"✅ Collected {len(articles)} relevant articles from {source_url}")

        except Exception as e:
            logger.error(f"❌ Failed to parse RSS feed {source_url}: {e}")

        return articles

    def scrape_all_sources(
        self,
        match_id: Optional[int] = None
    ) -> List[RSSArticleData]:
        """
        Scrape all configured RSS sources.

        A failing source is logged and skipped; the remaining sources are
        still scraped.

        Args:
            match_id: Optional match ID to associate with articles

        Returns:
            List of RSSArticleData objects from all sources
        """
        all_articles: List[RSSArticleData] = []

        for source_url in self.rss_sources:
            try:
                articles = self._parse_feed(source_url)

                # "is not None" so that a match ID of 0 is still applied.
                if match_id is not None:
                    for article in articles:
                        article.match_id = match_id

                all_articles.extend(articles)

            except Exception as e:
                logger.error(
                    f"❌ Failed to scrape source {source_url}: {e}. "
                    f"Continuing with other sources..."
                )
                continue

        logger.info(
            f"✅ Total articles collected from all sources: {len(all_articles)}"
        )

        return all_articles

    def scrape_single_source(
        self,
        source_url: str,
        match_id: Optional[int] = None
    ) -> List[RSSArticleData]:
        """
        Scrape a single RSS source.

        Args:
            source_url: URL of the RSS feed to scrape
            match_id: Optional match ID to associate with articles

        Returns:
            List of RSSArticleData objects
        """
        articles = self._parse_feed(source_url)

        # "is not None" so that a match ID of 0 is still applied.
        if match_id is not None:
            for article in articles:
                article.match_id = match_id

        return articles

    def save_articles_to_db(self, articles: List[RSSArticleData], db: Session) -> None:
        """
        Save RSS articles to database.

        Articles whose ``article_id`` is already stored, or that repeat an ID
        seen earlier in the same batch, are skipped. All new rows are
        committed together.

        Args:
            articles: List of RSSArticleData objects
            db: SQLAlchemy database session

        Raises:
            Exception: Whatever the commit raised, re-raised after rollback.
        """
        # Imported here rather than at module level, as in the original code.
        from app.models.rss_article import RSSArticle

        saved_count = 0
        # IDs added in this batch: the DB lookup below only sees them if the
        # session autoflushes, so track them explicitly.
        batch_ids = set()

        for article_data in articles:
            if article_data.article_id in batch_ids:
                logger.debug(f"Article {article_data.article_id} already exists, skipping")
                continue

            # Check if article already exists
            existing = db.query(RSSArticle).filter(
                RSSArticle.article_id == article_data.article_id
            ).first()

            if existing:
                logger.debug(f"Article {article_data.article_id} already exists, skipping")
                continue

            article = RSSArticle(
                article_id=article_data.article_id,
                title=article_data.title,
                content=article_data.content,
                published_at=article_data.published_at,
                source_url=article_data.source_url,
                match_id=article_data.match_id,
                source=article_data.source
            )

            db.add(article)
            batch_ids.add(article_data.article_id)
            saved_count += 1

        try:
            db.commit()
            logger.info(f"✅ Saved {saved_count} new RSS articles to database")
        except Exception as e:
            db.rollback()
            logger.error(f"❌ Failed to save RSS articles to database: {e}")
            raise

    def scrape_and_save(
        self,
        db: Session,
        match_id: Optional[int] = None
    ) -> List[RSSArticleData]:
        """
        Scrape all RSS sources and save to database.

        Args:
            db: SQLAlchemy database session
            match_id: Optional match ID to associate with articles

        Returns:
            List of RSSArticleData objects

        Raises:
            Exception: Any error from scraping or saving, logged and re-raised.
        """
        try:
            articles = self.scrape_all_sources(match_id)

            # Nothing to persist when no article was collected.
            if articles:
                self.save_articles_to_db(articles, db)

            return articles

        except Exception as e:
            logger.error(f"❌ Failed to scrape and save RSS articles: {e}")
            raise


def create_rss_scraper(
    rss_sources: Optional[List[str]] = None,
    keywords: Optional[List[str]] = None,
    timeout: int = 30,
    max_articles_per_source: int = 100
) -> RSSScraper:
    """
    Factory function to create an RSS scraper instance.

    Args:
        rss_sources: Optional list of RSS feed URLs
        keywords: Optional list of keywords to filter articles
        timeout: Request timeout in seconds (default: 30)
        max_articles_per_source: Maximum articles to collect per source
            (default: 100)

    Returns:
        Configured RSSScraper instance
    """
    return RSSScraper(
        rss_sources=rss_sources,
        timeout=timeout,
        max_articles_per_source=max_articles_per_source,
        keywords=keywords
    )