381 lines
12 KiB
Python
381 lines
12 KiB
Python
"""
|
|
RSS scraper module with robust error handling.
|
|
|
|
This module provides functionality to scrape RSS feeds from sports sources,
|
|
with built-in error handling and logging.
|
|
"""
|
|
|
|
import logging
|
|
import feedparser
|
|
from datetime import datetime, timezone
|
|
from typing import List, Dict, Optional
|
|
from dataclasses import dataclass
|
|
from urllib.parse import urlparse
|
|
|
|
from sqlalchemy.orm import Session
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class RSSArticleData:
    """Structured data for an RSS article."""

    # Stable identifier taken from the feed entry id, falling back to its link URL.
    article_id: str
    # Article headline from the feed entry.
    title: str
    # Article body: full content when available, else summary/description.
    content: str
    # Publication time; producer (_parse_published_date) returns UTC-aware datetimes.
    published_at: datetime
    # URL of the RSS feed this article was scraped from (not the article's own link).
    source_url: str
    # Associated match id when one is supplied to the scraper; None otherwise.
    match_id: Optional[int]
    # Human-readable source name; the scraper fills in the feed title.
    source: str = "rss"
|
|
|
|
|
|
class RSSScraper:
    """
    RSS scraper with robust error handling.

    Features:
    - Scrapes RSS feeds from configured sports sources
    - Error handling without stopping the process
    - Continues with other sources on errors
    - Structured logging
    - Timeout configuration
    - Filters relevant football articles
    """

    # Default RSS sources for sports news
    DEFAULT_RSS_SOURCES = [
        "http://www.espn.com/espn/rss/news",
        "http://feeds.bbci.co.uk/sport/football/rss.xml",
        "https://www.goal.com/rss",
        "https://www.skysports.com/rss/12040",
    ]

    # Keywords to filter relevant football articles
    FOOTBALL_KEYWORDS = [
        "football", "soccer", "match", "goal", "premier league",
        "la liga", "serie a", "bundesliga", "ligue 1", "champions league",
        "euro", "world cup", "cup", "league", "team", "club", "player",
        "coach", "manager", "score", "result", "transfer"
    ]

    def __init__(
        self,
        rss_sources: Optional[List[str]] = None,
        timeout: int = 30,
        max_articles_per_source: int = 100,
        keywords: Optional[List[str]] = None
    ):
        """
        Initialize RSS scraper.

        Args:
            rss_sources: List of RSS feed URLs to scrape
            timeout: Socket timeout in seconds applied while fetching a feed
                (default: 30)
            max_articles_per_source: Maximum articles to collect per source
            keywords: List of keywords to filter relevant articles
        """
        self.rss_sources = rss_sources or self.DEFAULT_RSS_SOURCES
        self.timeout = timeout
        self.max_articles_per_source = max_articles_per_source
        self.keywords = keywords or self.FOOTBALL_KEYWORDS

        logger.info(f"📰 RSS Scraper initialized with {len(self.rss_sources)} sources")
        for i, source in enumerate(self.rss_sources, 1):
            domain = urlparse(source).netloc
            logger.info(f"   {i}. {domain}")

    def _is_article_relevant(self, title: str, content: str) -> bool:
        """
        Check if an article is relevant to football based on keywords.

        Args:
            title: Article title
            content: Article content

        Returns:
            True if any configured keyword appears in title or content
            (case-insensitive), False otherwise
        """
        text_to_check = f"{title.lower()} {content.lower()}"
        return any(keyword.lower() in text_to_check for keyword in self.keywords)

    def _parse_published_date(self, published: str) -> datetime:
        """
        Parse the published date from an RSS/Atom entry.

        Tries RFC 2822 first (RSS 2.0 `pubDate`), then ISO 8601 (Atom
        `published`/`updated`). Naive datetimes are assumed to be UTC.

        NOTE: the previous implementation called `feedparser.parse()` on the
        date string, but that API parses feed *documents*, so it never
        produced a `published_parsed` for a bare date; it also interpreted
        naive datetimes in local time via `astimezone()`.

        Args:
            published: Published date string from the feed entry

        Returns:
            Timezone-aware datetime in UTC (current time on parse failure)
        """
        from email.utils import parsedate_to_datetime

        # RFC 2822, e.g. "Tue, 07 Sep 2021 10:30:00 +0000"
        try:
            parsed = parsedate_to_datetime(published)
            if parsed.tzinfo is None:
                parsed = parsed.replace(tzinfo=timezone.utc)
            return parsed.astimezone(timezone.utc)
        except Exception:
            pass

        # ISO 8601, e.g. "2021-09-07T10:30:00+00:00" (common in Atom feeds)
        try:
            parsed = datetime.fromisoformat(published)
            if parsed.tzinfo is None:
                parsed = parsed.replace(tzinfo=timezone.utc)
            return parsed.astimezone(timezone.utc)
        except Exception as e:
            logger.warning(f"⚠️ Failed to parse date '{published}': {e}")
            return datetime.now(timezone.utc)

    def _parse_feed(
        self,
        source_url: str
    ) -> List[RSSArticleData]:
        """
        Parse RSS feed and extract relevant articles.

        Per-entry failures are logged and skipped; a feed-level failure is
        logged and yields an empty list, so callers can keep going.

        Args:
            source_url: URL of the RSS feed

        Returns:
            List of RSSArticleData objects
        """
        import socket

        articles = []

        try:
            logger.info(f"🔍 Parsing RSS feed: {source_url}")

            # feedparser.parse() accepts no timeout argument, so apply the
            # configured timeout through the global socket default for the
            # duration of this fetch only, restoring the previous value after.
            previous_timeout = socket.getdefaulttimeout()
            socket.setdefaulttimeout(self.timeout)
            try:
                feed = feedparser.parse(source_url)
            finally:
                socket.setdefaulttimeout(previous_timeout)

            # bozo signals malformed XML; feedparser often still recovers,
            # so warn and continue rather than abort.
            if feed.get('bozo', False):
                logger.warning(f"⚠️ RSS feed has malformed XML: {source_url}")

            feed_title = feed.feed.get('title', 'Unknown')
            logger.info(f"📰 Feed: {feed_title}")
            logger.info(f"   Total entries: {len(feed.entries)}")

            for entry in feed.entries[:self.max_articles_per_source]:
                try:
                    # Prefer the entry's own id; fall back to its link.
                    article_id = entry.get('id') or entry.get('link', '')
                    if not article_id:
                        logger.warning("⚠️ Article missing ID, skipping")
                        continue

                    title = entry.get('title', '')
                    if not title:
                        logger.warning("⚠️ Article missing title, skipping")
                        continue

                    # Prefer full content, then summary, then description.
                    content = ''
                    if 'content' in entry:
                        content = entry.content[0].value if entry.content else ''
                    elif 'summary' in entry:
                        content = entry.summary
                    elif 'description' in entry:
                        content = entry.description

                    published_str = entry.get('published') or entry.get('updated')
                    if not published_str:
                        logger.warning("⚠️ Article missing published date, using current time")
                        published_at = datetime.now(timezone.utc)
                    else:
                        published_at = self._parse_published_date(published_str)

                    # Drop articles that match none of the keywords.
                    if not self._is_article_relevant(title, content):
                        logger.debug(f"🚫 Article not relevant: {title}")
                        continue

                    articles.append(RSSArticleData(
                        article_id=article_id,
                        title=title,
                        content=content,
                        published_at=published_at,
                        source_url=source_url,
                        match_id=None,  # Will be matched later if needed
                        source=feed_title
                    ))

                except Exception as e:
                    # One bad entry must not abort the rest of the feed.
                    logger.error(f"❌ Error processing article: {e}")
                    continue

            logger.info(f"✅ Collected {len(articles)} relevant articles from {source_url}")

        except Exception as e:
            logger.error(f"❌ Failed to parse RSS feed {source_url}: {e}")

        return articles

    def scrape_all_sources(
        self,
        match_id: Optional[int] = None
    ) -> List[RSSArticleData]:
        """
        Scrape all configured RSS sources.

        A failing source is logged and skipped so the remaining sources are
        still scraped.

        Args:
            match_id: Optional match ID to associate with articles

        Returns:
            List of RSSArticleData objects from all sources
        """
        all_articles = []

        for source_url in self.rss_sources:
            try:
                articles = self._parse_feed(source_url)

                # Explicit None check: a match_id of 0 is a valid id and must
                # not be dropped by truthiness.
                if match_id is not None:
                    for article in articles:
                        article.match_id = match_id

                all_articles.extend(articles)

            except Exception as e:
                logger.error(
                    f"❌ Failed to scrape source {source_url}: {e}. "
                    f"Continuing with other sources..."
                )
                continue

        logger.info(
            f"✅ Total articles collected from all sources: {len(all_articles)}"
        )

        return all_articles

    def scrape_single_source(
        self,
        source_url: str,
        match_id: Optional[int] = None
    ) -> List[RSSArticleData]:
        """
        Scrape a single RSS source.

        Args:
            source_url: URL of the RSS feed to scrape
            match_id: Optional match ID to associate with articles

        Returns:
            List of RSSArticleData objects
        """
        articles = self._parse_feed(source_url)

        # Explicit None check so match_id == 0 is still applied.
        if match_id is not None:
            for article in articles:
                article.match_id = match_id

        return articles

    def save_articles_to_db(self, articles: List[RSSArticleData], db: Session) -> None:
        """
        Save RSS articles to database, skipping duplicates.

        Args:
            articles: List of RSSArticleData objects
            db: SQLAlchemy database session

        Raises:
            Exception: re-raised after rollback when the commit fails
        """
        from app.models.rss_article import RSSArticle

        # Fetch all already-stored ids in one query instead of one
        # existence query per article (N+1).
        incoming_ids = [a.article_id for a in articles]
        existing_ids = set()
        if incoming_ids:
            rows = db.query(RSSArticle.article_id).filter(
                RSSArticle.article_id.in_(incoming_ids)
            ).all()
            existing_ids = {row[0] for row in rows}

        saved_count = 0
        for article_data in articles:
            # Skip articles already in the DB or duplicated within this batch.
            if article_data.article_id in existing_ids:
                logger.debug(f"Article {article_data.article_id} already exists, skipping")
                continue
            existing_ids.add(article_data.article_id)

            db.add(RSSArticle(
                article_id=article_data.article_id,
                title=article_data.title,
                content=article_data.content,
                published_at=article_data.published_at,
                source_url=article_data.source_url,
                match_id=article_data.match_id,
                source=article_data.source
            ))
            saved_count += 1

        try:
            db.commit()
            logger.info(f"✅ Saved {saved_count} new RSS articles to database")
        except Exception as e:
            db.rollback()
            logger.error(f"❌ Failed to save RSS articles to database: {e}")
            raise

    def scrape_and_save(
        self,
        db: Session,
        match_id: Optional[int] = None
    ) -> List[RSSArticleData]:
        """
        Scrape all RSS sources and save to database.

        Args:
            db: SQLAlchemy database session
            match_id: Optional match ID to associate with articles

        Returns:
            List of RSSArticleData objects

        Raises:
            Exception: re-raised when the database save fails
        """
        try:
            articles = self.scrape_all_sources(match_id)

            if articles:
                self.save_articles_to_db(articles, db)

            return articles

        except Exception as e:
            logger.error(f"❌ Failed to scrape and save RSS articles: {e}")
            raise
|
|
|
|
|
|
def create_rss_scraper(
    rss_sources: Optional[List[str]] = None,
    keywords: Optional[List[str]] = None,
    timeout: int = 30,
    max_articles_per_source: int = 100
) -> RSSScraper:
    """
    Factory function to create an RSS scraper instance.

    Args:
        rss_sources: Optional list of RSS feed URLs
        keywords: Optional list of keywords to filter articles
        timeout: Request timeout in seconds (previously hard-coded to 30)
        max_articles_per_source: Maximum articles to collect per source
            (previously hard-coded to 100)

    Returns:
        Configured RSSScraper instance
    """
    # New keyword parameters keep the old hard-coded values as defaults,
    # so existing callers are unaffected.
    return RSSScraper(
        rss_sources=rss_sources,
        timeout=timeout,
        max_articles_per_source=max_articles_per_source,
        keywords=keywords
    )
|