""" Tests for RSS scraper module. This test suite validates the RSS scraper functionality including parsing, filtering, error handling, and database operations. """ import pytest from datetime import datetime, timezone from unittest.mock import Mock, patch, MagicMock from app.scrapers.rss_scraper import ( RSSScraper, RSSArticleData, create_rss_scraper ) class TestRSSScraperInit: """Tests for RSS scraper initialization.""" def test_init_default_sources(self): """Test initialization with default RSS sources.""" scraper = RSSScraper() assert len(scraper.rss_sources) == 4 assert scraper.timeout == 30 assert scraper.max_articles_per_source == 100 assert len(scraper.keywords) > 0 def test_init_custom_sources(self): """Test initialization with custom RSS sources.""" custom_sources = ["http://example.com/rss"] scraper = RSSScraper(rss_sources=custom_sources) assert scraper.rss_sources == custom_sources assert len(scraper.rss_sources) == 1 def test_init_custom_keywords(self): """Test initialization with custom keywords.""" custom_keywords = ["football", "soccer"] scraper = RSSScraper(keywords=custom_keywords) assert scraper.keywords == custom_keywords def test_init_custom_timeout(self): """Test initialization with custom timeout.""" scraper = RSSScraper(timeout=60) assert scraper.timeout == 60 class TestRSSScraperIsArticleRelevant: """Tests for article relevance filtering.""" def test_relevant_article_with_keyword(self): """Test that article with keyword is relevant.""" scraper = RSSScraper() title = "Arsenal wins Premier League match" content = "Great performance by the team" assert scraper._is_article_relevant(title, content) is True def test_relevant_article_multiple_keywords(self): """Test article with multiple keywords.""" scraper = RSSScraper() title = "Champions League: Real Madrid vs Barcelona" content = "Soccer match preview" assert scraper._is_article_relevant(title, content) is True def test_irrelevant_article(self): """Test that irrelevant article is filtered out.""" scraper = RSSScraper() title = "Technology news: New iPhone released" content = "Apple announced new products" assert scraper._is_article_relevant(title, content) is False def test_case_insensitive_matching(self): """Test that keyword matching is case insensitive.""" scraper = RSSScraper() title = "FOOTBALL MATCH: TEAM A VS TEAM B" content = "SOCCER game details" assert scraper._is_article_relevant(title, content) is True class TestRSSScraperParsePublishedDate: """Tests for date parsing.""" def test_parse_valid_date(self): """Test parsing a valid date string.""" scraper = RSSScraper() date_str = "Sat, 15 Jan 2026 10:30:00 +0000" parsed = scraper._parse_published_date(date_str) assert isinstance(parsed, datetime) assert parsed.tzinfo is not None def test_parse_invalid_date(self): """Test parsing an invalid date string falls back to current time.""" scraper = RSSScraper() date_str = "invalid-date" parsed = scraper._parse_published_date(date_str) assert isinstance(parsed, datetime) assert parsed.tzinfo is not None class TestRSSScraperParseFeed: """Tests for RSS feed parsing.""" @patch('feedparser.parse') def test_parse_valid_feed(self, mock_parse): """Test parsing a valid RSS feed.""" # Mock feedparser response mock_feed = Mock() mock_feed.feed = {'title': 'ESPN'} mock_feed.bozo = False mock_feed.entries = [ Mock( id='article-1', title='Football match preview', summary='Team A vs Team B', published='Sat, 15 Jan 2026 10:30:00 +0000', link='http://example.com/article-1' ) ] mock_parse.return_value = mock_feed scraper = RSSScraper() articles = scraper._parse_feed('http://example.com/rss') assert len(articles) >= 0 mock_parse.assert_called_once() @patch('feedparser.parse') def test_parse_feed_with_bozo_error(self, mock_parse): """Test parsing a feed with XML errors.""" # Mock feedparser response with bozo error mock_feed = Mock() mock_feed.feed = {'title': 'ESPN'} mock_feed.bozo = True mock_feed.entries = [] mock_parse.return_value = mock_feed scraper = RSSScraper() articles = scraper._parse_feed('http://example.com/rss') # Should not crash, but log warning assert isinstance(articles, list) @patch('feedparser.parse') def test_parse_feed_filters_irrelevant_articles(self, mock_parse): """Test that irrelevant articles are filtered out.""" # Mock feedparser response mock_feed = Mock() mock_feed.feed = {'title': 'ESPN'} mock_feed.bozo = False mock_feed.entries = [ Mock( id='article-1', title='Football news', summary='Match result', published='Sat, 15 Jan 2026 10:30:00 +0000', link='http://example.com/article-1' ), Mock( id='article-2', title='Technology news', summary='New iPhone', published='Sat, 15 Jan 2026 11:30:00 +0000', link='http://example.com/article-2' ) ] mock_parse.return_value = mock_feed scraper = RSSScraper() articles = scraper._parse_feed('http://example.com/rss') # Only football article should be included football_articles = [a for a in articles if 'football' in a.title.lower()] assert len(football_articles) >= 0 class TestRSSScraperScrapeAllSources: """Tests for scraping all sources.""" @patch('app.scrapers.rss_scraper.RSSScraper._parse_feed') def test_scrape_all_sources(self, mock_parse_feed): """Test scraping all configured sources.""" # Mock feed parsing mock_parse_feed.return_value = [ RSSArticleData( article_id='article-1', title='Football news', content='Match result', published_at=datetime.now(timezone.utc), source_url='http://example.com/rss', match_id=None, source='ESPN' ) ] scraper = RSSScraper() articles = scraper.scrape_all_sources() assert len(articles) >= 0 assert mock_parse_feed.call_count == len(scraper.rss_sources) @patch('app.scrapers.rss_scraper.RSSScraper._parse_feed') def test_scrape_all_sources_with_match_id(self, mock_parse_feed): """Test scraping with match ID.""" mock_parse_feed.return_value = [ RSSArticleData( article_id='article-1', title='Football news', content='Match result', published_at=datetime.now(timezone.utc), source_url='http://example.com/rss', match_id=None, source='ESPN' ) ] scraper = RSSScraper() match_id = 123 articles = scraper.scrape_all_sources(match_id=match_id) # All articles should have match_id set for article in articles: assert article.match_id == match_id @patch('app.scrapers.rss_scraper.RSSScraper._parse_feed') def test_scrape_all_sources_continues_on_error(self, mock_parse_feed): """Test that scraper continues on source errors.""" # Make second source raise an error mock_parse_feed.side_effect = [ [], # First source succeeds Exception("Network error"), # Second source fails [] # Third source succeeds ] scraper = RSSScraper() articles = scraper.scrape_all_sources() # Should have collected from successful sources assert isinstance(articles, list) class TestRSSScraperSaveToDatabase: """Tests for saving articles to database.""" @patch('app.scrapers.rss_scraper.Session') def test_save_articles_to_db(self, mock_session_class): """Test saving articles to database.""" mock_db = Mock() mock_session_class.return_value = mock_db # Mock query to return None (article doesn't exist) mock_db.query.return_value.filter.return_value.first.return_value = None scraper = RSSScraper() articles = [ RSSArticleData( article_id='article-1', title='Football news', content='Match result', published_at=datetime.now(timezone.utc), source_url='http://example.com/rss', match_id=None, source='ESPN' ) ] scraper.save_articles_to_db(articles, mock_db) # Verify commit was called mock_db.commit.assert_called_once() @patch('app.scrapers.rss_scraper.Session') def test_save_articles_skips_duplicates(self, mock_session_class): """Test that duplicate articles are skipped.""" mock_db = Mock() mock_session_class.return_value = mock_db # Mock query to return existing article mock_existing = Mock() mock_db.query.return_value.filter.return_value.first.return_value = mock_existing scraper = RSSScraper() articles = [ RSSArticleData( article_id='article-1', title='Football news', content='Match result', published_at=datetime.now(timezone.utc), source_url='http://example.com/rss', match_id=None, source='ESPN' ) ] scraper.save_articles_to_db(articles, mock_db) # Should not add duplicate, but still commit assert mock_db.add.call_count == 0 mock_db.commit.assert_called_once() @patch('app.scrapers.rss_scraper.Session') def test_save_articles_handles_db_error(self, mock_session_class): """Test that database errors are handled properly.""" mock_db = Mock() mock_session_class.return_value = mock_db # Mock query and commit to raise error mock_db.query.return_value.filter.return_value.first.return_value = None mock_db.commit.side_effect = Exception("Database error") scraper = RSSScraper() articles = [ RSSArticleData( article_id='article-1', title='Football news', content='Match result', published_at=datetime.now(timezone.utc), source_url='http://example.com/rss', match_id=None, source='ESPN' ) ] # Should raise exception with pytest.raises(Exception): scraper.save_articles_to_db(articles, mock_db) # Verify rollback was called mock_db.rollback.assert_called_once() class TestCreateRSSScraper: """Tests for factory function.""" def test_create_default_scraper(self): """Test creating a scraper with default config.""" scraper = create_rss_scraper() assert isinstance(scraper, RSSScraper) assert len(scraper.rss_sources) > 0 def test_create_custom_scraper(self): """Test creating a scraper with custom config.""" custom_sources = ["http://custom.com/rss"] custom_keywords = ["football"] scraper = create_rss_scraper( rss_sources=custom_sources, keywords=custom_keywords ) assert scraper.rss_sources == custom_sources assert scraper.keywords == custom_keywords