chartbastan/backend/app/scrapers/rss_scraper.py
2026-02-01 09:31:38 +01:00

381 lines
12 KiB
Python

"""
RSS scraper module with robust error handling.
This module provides functionality to scrape RSS feeds from sports sources,
with built-in error handling and logging.
"""
import logging
import feedparser
from datetime import datetime, timezone
from typing import List, Dict, Optional
from dataclasses import dataclass
from urllib.parse import urlparse
from sqlalchemy.orm import Session
logger = logging.getLogger(__name__)
@dataclass
class RSSArticleData:
"""Structured data for an RSS article."""
article_id: str
title: str
content: str
published_at: datetime
source_url: str
match_id: Optional[int]
source: str = "rss"
class RSSScraper:
"""
RSS scraper with robust error handling.
Features:
- Scrapes RSS feeds from configured sports sources
- Error handling without stopping the process
- Continues with other sources on errors
- Structured logging
- Timeout configuration
- Filters relevant football articles
"""
# Default RSS sources for sports news
DEFAULT_RSS_SOURCES = [
"http://www.espn.com/espn/rss/news",
"http://feeds.bbci.co.uk/sport/football/rss.xml",
"https://www.goal.com/rss",
"https://www.skysports.com/rss/12040",
]
# Keywords to filter relevant football articles
FOOTBALL_KEYWORDS = [
"football", "soccer", "match", "goal", "premier league",
"la liga", "serie a", "bundesliga", "ligue 1", "champions league",
"euro", "world cup", "cup", "league", "team", "club", "player",
"coach", "manager", "score", "result", "transfer"
]
def __init__(
self,
rss_sources: Optional[List[str]] = None,
timeout: int = 30,
max_articles_per_source: int = 100,
keywords: Optional[List[str]] = None
):
"""
Initialize RSS scraper.
Args:
rss_sources: List of RSS feed URLs to scrape
timeout: Request timeout in seconds (default: 30)
max_articles_per_source: Maximum articles to collect per source
keywords: List of keywords to filter relevant articles
"""
self.rss_sources = rss_sources or self.DEFAULT_RSS_SOURCES
self.timeout = timeout
self.max_articles_per_source = max_articles_per_source
self.keywords = keywords or self.FOOTBALL_KEYWORDS
logger.info(f"📰 RSS Scraper initialized with {len(self.rss_sources)} sources")
for i, source in enumerate(self.rss_sources, 1):
domain = urlparse(source).netloc
logger.info(f" {i}. {domain}")
def _is_article_relevant(self, title: str, content: str) -> bool:
"""
Check if an article is relevant to football based on keywords.
Args:
title: Article title
content: Article content
Returns:
True if article is relevant, False otherwise
"""
text_to_check = f"{title.lower()} {content.lower()}"
# Check if any keyword is present
for keyword in self.keywords:
if keyword.lower() in text_to_check:
return True
return False
def _parse_published_date(self, published: str) -> datetime:
"""
Parse the published date from RSS feed.
Args:
published: Published date string from RSS feed
Returns:
Datetime object in UTC timezone
"""
try:
# feedparser automatically parses dates
parsed = feedparser.parse(published)
if hasattr(parsed, 'published_parsed') and parsed.published_parsed:
return datetime(*parsed.published_parsed[:6], tzinfo=timezone.utc)
# Fallback: try to parse as string
from email.utils import parsedate_to_datetime
return parsedate_to_datetime(published).astimezone(timezone.utc)
except Exception as e:
logger.warning(f"⚠️ Failed to parse date '{published}': {e}")
return datetime.now(timezone.utc)
def _parse_feed(
self,
source_url: str
) -> List[RSSArticleData]:
"""
Parse RSS feed and extract articles.
Args:
source_url: URL of the RSS feed
Returns:
List of RSSArticleData objects
"""
articles = []
try:
logger.info(f"🔍 Parsing RSS feed: {source_url}")
# Parse RSS feed with timeout
feed = feedparser.parse(source_url)
# Check for feed errors
if feed.get('bozo', False):
logger.warning(f"⚠️ RSS feed has malformed XML: {source_url}")
# Continue anyway as feedparser can handle some malformed feeds
# Extract feed info
feed_title = feed.feed.get('title', 'Unknown')
logger.info(f"📰 Feed: {feed_title}")
logger.info(f" Total entries: {len(feed.entries)}")
# Process entries
for entry in feed.entries[:self.max_articles_per_source]:
try:
# Extract article ID
article_id = entry.get('id') or entry.get('link', '')
if not article_id:
logger.warning(f"⚠️ Article missing ID, skipping")
continue
# Extract title
title = entry.get('title', '')
if not title:
logger.warning(f"⚠️ Article missing title, skipping")
continue
# Extract content
content = ''
if 'content' in entry:
content = entry.content[0].value if entry.content else ''
elif 'summary' in entry:
content = entry.summary
elif 'description' in entry:
content = entry.description
# Parse published date
published_str = entry.get('published') or entry.get('updated')
if not published_str:
logger.warning(f"⚠️ Article missing published date, using current time")
published_at = datetime.now(timezone.utc)
else:
published_at = self._parse_published_date(published_str)
# Filter relevant articles
if not self._is_article_relevant(title, content):
logger.debug(f"🚫 Article not relevant: {title}")
continue
# Create article data
article_data = RSSArticleData(
article_id=article_id,
title=title,
content=content,
published_at=published_at,
source_url=source_url,
match_id=None, # Will be matched later if needed
source=feed_title
)
articles.append(article_data)
except Exception as e:
logger.error(f"❌ Error processing article: {e}")
continue
logger.info(f"✅ Collected {len(articles)} relevant articles from {source_url}")
except Exception as e:
logger.error(f"❌ Failed to parse RSS feed {source_url}: {e}")
return articles
def scrape_all_sources(
self,
match_id: Optional[int] = None
) -> List[RSSArticleData]:
"""
Scrape all configured RSS sources.
Args:
match_id: Optional match ID to associate with articles
Returns:
List of RSSArticleData objects from all sources
"""
all_articles = []
for source_url in self.rss_sources:
try:
# Parse feed
articles = self._parse_feed(source_url)
# Set match_id if provided
if match_id:
for article in articles:
article.match_id = match_id
all_articles.extend(articles)
except Exception as e:
logger.error(
f"❌ Failed to scrape source {source_url}: {e}. "
f"Continuing with other sources..."
)
continue
logger.info(
f"✅ Total articles collected from all sources: {len(all_articles)}"
)
return all_articles
def scrape_single_source(
self,
source_url: str,
match_id: Optional[int] = None
) -> List[RSSArticleData]:
"""
Scrape a single RSS source.
Args:
source_url: URL of the RSS feed to scrape
match_id: Optional match ID to associate with articles
Returns:
List of RSSArticleData objects
"""
articles = self._parse_feed(source_url)
# Set match_id if provided
if match_id:
for article in articles:
article.match_id = match_id
return articles
def save_articles_to_db(self, articles: List[RSSArticleData], db: Session) -> None:
"""
Save RSS articles to database.
Args:
articles: List of RSSArticleData objects
db: SQLAlchemy database session
"""
from app.models.rss_article import RSSArticle
saved_count = 0
for article_data in articles:
# Check if article already exists
existing = db.query(RSSArticle).filter(
RSSArticle.article_id == article_data.article_id
).first()
if existing:
logger.debug(f"Article {article_data.article_id} already exists, skipping")
continue
# Create new article
article = RSSArticle(
article_id=article_data.article_id,
title=article_data.title,
content=article_data.content,
published_at=article_data.published_at,
source_url=article_data.source_url,
match_id=article_data.match_id,
source=article_data.source
)
db.add(article)
saved_count += 1
# Commit changes
try:
db.commit()
logger.info(f"✅ Saved {saved_count} new RSS articles to database")
except Exception as e:
db.rollback()
logger.error(f"❌ Failed to save RSS articles to database: {e}")
raise
def scrape_and_save(
self,
db: Session,
match_id: Optional[int] = None
) -> List[RSSArticleData]:
"""
Scrape all RSS sources and save to database.
Args:
db: SQLAlchemy database session
match_id: Optional match ID to associate with articles
Returns:
List of RSSArticleData objects
"""
try:
# Scrape articles
articles = self.scrape_all_sources(match_id)
# Save to database
if articles:
self.save_articles_to_db(articles, db)
return articles
except Exception as e:
logger.error(f"❌ Failed to scrape and save RSS articles: {e}")
raise
def create_rss_scraper(
rss_sources: Optional[List[str]] = None,
keywords: Optional[List[str]] = None
) -> RSSScraper:
"""
Factory function to create an RSS scraper instance.
Args:
rss_sources: Optional list of RSS feed URLs
keywords: Optional list of keywords to filter articles
Returns:
Configured RSSScraper instance
"""
scraper = RSSScraper(
rss_sources=rss_sources,
timeout=30,
max_articles_per_source=100,
keywords=keywords
)
return scraper