# chartbastan/backend/tests/test_rss_scraper.py
# (source metadata: 2026-02-01 09:31:38 +01:00, 370 lines, 12 KiB, Python)

"""
Tests for RSS scraper module.
This test suite validates the RSS scraper functionality including
parsing, filtering, error handling, and database operations.
"""
import pytest
from datetime import datetime, timezone
from unittest.mock import Mock, patch, MagicMock
from app.scrapers.rss_scraper import (
RSSScraper,
RSSArticleData,
create_rss_scraper
)
class TestRSSScraperInit:
    """Tests covering the different RSSScraper construction options."""

    def test_init_default_sources(self):
        """A scraper built with no arguments carries the default configuration."""
        sut = RSSScraper()
        assert len(sut.rss_sources) == 4
        assert sut.timeout == 30
        assert sut.max_articles_per_source == 100
        assert len(sut.keywords) > 0

    def test_init_custom_sources(self):
        """Caller-supplied RSS sources replace the defaults entirely."""
        sources = ["http://example.com/rss"]
        sut = RSSScraper(rss_sources=sources)
        assert sut.rss_sources == sources
        assert len(sut.rss_sources) == 1

    def test_init_custom_keywords(self):
        """Caller-supplied keywords replace the default keyword list."""
        keywords = ["football", "soccer"]
        sut = RSSScraper(keywords=keywords)
        assert sut.keywords == keywords

    def test_init_custom_timeout(self):
        """A custom timeout value is stored verbatim."""
        sut = RSSScraper(timeout=60)
        assert sut.timeout == 60
class TestRSSScraperIsArticleRelevant:
    """Tests for the keyword-based relevance filter `_is_article_relevant`."""

    def test_relevant_article_with_keyword(self):
        """An article whose text contains a configured keyword is kept."""
        sut = RSSScraper()
        result = sut._is_article_relevant(
            "Arsenal wins Premier League match",
            "Great performance by the team",
        )
        assert result is True

    def test_relevant_article_multiple_keywords(self):
        """An article matching several keywords is (still) relevant."""
        sut = RSSScraper()
        result = sut._is_article_relevant(
            "Champions League: Real Madrid vs Barcelona",
            "Soccer match preview",
        )
        assert result is True

    def test_irrelevant_article(self):
        """An article with no matching keyword is rejected."""
        sut = RSSScraper()
        result = sut._is_article_relevant(
            "Technology news: New iPhone released",
            "Apple announced new products",
        )
        assert result is False

    def test_case_insensitive_matching(self):
        """Keyword matching ignores letter case."""
        sut = RSSScraper()
        result = sut._is_article_relevant(
            "FOOTBALL MATCH: TEAM A VS TEAM B",
            "SOCCER game details",
        )
        assert result is True
class TestRSSScraperParsePublishedDate:
    """Tests for `_parse_published_date` string-to-datetime conversion."""

    def test_parse_valid_date(self):
        """An RFC-822 style date string parses into a timezone-aware datetime."""
        sut = RSSScraper()
        result = sut._parse_published_date("Sat, 15 Jan 2026 10:30:00 +0000")
        assert isinstance(result, datetime)
        assert result.tzinfo is not None

    def test_parse_invalid_date(self):
        """Garbage input still yields an aware datetime (current-time fallback)."""
        sut = RSSScraper()
        result = sut._parse_published_date("invalid-date")
        assert isinstance(result, datetime)
        assert result.tzinfo is not None
class TestRSSScraperParseFeed:
    """Tests for RSS feed parsing via `_parse_feed` (feedparser mocked out)."""

    @staticmethod
    def _mock_feed(entries, bozo=False):
        """Build a feedparser-style result object holding the given entries.

        `bozo=True` mimics feedparser signalling a malformed XML document.
        """
        feed = Mock()
        feed.feed = {'title': 'ESPN'}
        feed.bozo = bozo
        feed.entries = entries
        return feed

    @patch('feedparser.parse')
    def test_parse_valid_feed(self, mock_parse):
        """A well-formed feed with one relevant entry yields exactly one article."""
        mock_parse.return_value = self._mock_feed([
            Mock(
                id='article-1',
                title='Football match preview',
                summary='Team A vs Team B',
                published='Sat, 15 Jan 2026 10:30:00 +0000',
                link='http://example.com/article-1'
            )
        ])
        scraper = RSSScraper()
        articles = scraper._parse_feed('http://example.com/rss')
        # The entry title contains the keyword "football" (see the relevance
        # tests above), so it must survive filtering. The original assertion
        # `len(articles) >= 0` could never fail.
        assert len(articles) == 1
        mock_parse.assert_called_once()

    @patch('feedparser.parse')
    def test_parse_feed_with_bozo_error(self, mock_parse):
        """A feed flagged with an XML (bozo) error must not crash the parser."""
        mock_parse.return_value = self._mock_feed([], bozo=True)
        scraper = RSSScraper()
        articles = scraper._parse_feed('http://example.com/rss')
        # Expected behavior: log a warning and return an (empty) list.
        assert isinstance(articles, list)

    @patch('feedparser.parse')
    def test_parse_feed_filters_irrelevant_articles(self, mock_parse):
        """Entries without any keyword are dropped; relevant ones are kept."""
        mock_parse.return_value = self._mock_feed([
            Mock(
                id='article-1',
                title='Football news',
                summary='Match result',
                published='Sat, 15 Jan 2026 10:30:00 +0000',
                link='http://example.com/article-1'
            ),
            Mock(
                id='article-2',
                title='Technology news',
                summary='New iPhone',
                published='Sat, 15 Jan 2026 11:30:00 +0000',
                link='http://example.com/article-2'
            )
        ])
        scraper = RSSScraper()
        articles = scraper._parse_feed('http://example.com/rss')
        # Only the football entry is relevant; the technology one must be
        # filtered out. The original `>= 0` assertion was vacuous.
        titles = [a.title.lower() for a in articles]
        assert any('football' in t for t in titles)
        assert not any('technology' in t for t in titles)
class TestRSSScraperScrapeAllSources:
    """Tests for `scrape_all_sources` orchestration across configured feeds."""

    @staticmethod
    def _article():
        """Build a sample RSSArticleData payload for mocking `_parse_feed`."""
        return RSSArticleData(
            article_id='article-1',
            title='Football news',
            content='Match result',
            published_at=datetime.now(timezone.utc),
            source_url='http://example.com/rss',
            match_id=None,
            source='ESPN'
        )

    @patch('app.scrapers.rss_scraper.RSSScraper._parse_feed')
    def test_scrape_all_sources(self, mock_parse_feed):
        """Every configured source is parsed and its articles collected."""
        mock_parse_feed.return_value = [self._article()]
        scraper = RSSScraper()
        articles = scraper.scrape_all_sources()
        # Every source returned an article, so the aggregate must be non-empty.
        # The original assertion `len(articles) >= 0` could never fail.
        assert len(articles) >= 1
        assert mock_parse_feed.call_count == len(scraper.rss_sources)

    @patch('app.scrapers.rss_scraper.RSSScraper._parse_feed')
    def test_scrape_all_sources_with_match_id(self, mock_parse_feed):
        """A match_id passed to the scraper is stamped onto every article."""
        mock_parse_feed.return_value = [self._article()]
        scraper = RSSScraper()
        match_id = 123
        articles = scraper.scrape_all_sources(match_id=match_id)
        # Guard against a vacuously-passing loop over an empty result.
        assert articles
        for article in articles:
            assert article.match_id == match_id

    @patch('app.scrapers.rss_scraper.RSSScraper._parse_feed')
    def test_scrape_all_sources_continues_on_error(self, mock_parse_feed):
        """One failing source must not abort scraping of the remaining sources."""
        # Pin the source list to exactly three entries so it matches the
        # side_effect sequence below. The default configuration has four
        # sources, which would exhaust a 3-item side_effect and raise
        # StopIteration on the fourth call.
        sources = [
            'http://one.example.com/rss',
            'http://two.example.com/rss',
            'http://three.example.com/rss',
        ]
        mock_parse_feed.side_effect = [
            [],                          # first source succeeds
            Exception("Network error"),  # second source fails
            []                           # third source succeeds
        ]
        scraper = RSSScraper(rss_sources=sources)
        articles = scraper.scrape_all_sources()
        # The failing source is skipped; results from the others are returned.
        assert isinstance(articles, list)
class TestRSSScraperSaveToDatabase:
    """Tests for persisting scraped articles via `save_articles_to_db`."""

    @staticmethod
    def _sample_articles():
        """Return a single-article list matching the scraper's data shape."""
        return [
            RSSArticleData(
                article_id='article-1',
                title='Football news',
                content='Match result',
                published_at=datetime.now(timezone.utc),
                source_url='http://example.com/rss',
                match_id=None,
                source='ESPN'
            )
        ]

    @patch('app.scrapers.rss_scraper.Session')
    def test_save_articles_to_db(self, mock_session_class):
        """A new article is written and the session is committed."""
        db = Mock()
        mock_session_class.return_value = db
        # No existing row -> the article counts as new.
        db.query.return_value.filter.return_value.first.return_value = None
        scraper = RSSScraper()
        scraper.save_articles_to_db(self._sample_articles(), db)
        db.commit.assert_called_once()

    @patch('app.scrapers.rss_scraper.Session')
    def test_save_articles_skips_duplicates(self, mock_session_class):
        """An article already present in the DB is not inserted again."""
        db = Mock()
        mock_session_class.return_value = db
        # Existing row -> the article is treated as a duplicate.
        db.query.return_value.filter.return_value.first.return_value = Mock()
        scraper = RSSScraper()
        scraper.save_articles_to_db(self._sample_articles(), db)
        # Duplicate is skipped, but the transaction still commits.
        assert db.add.call_count == 0
        db.commit.assert_called_once()

    @patch('app.scrapers.rss_scraper.Session')
    def test_save_articles_handles_db_error(self, mock_session_class):
        """A commit failure propagates to the caller and triggers a rollback."""
        db = Mock()
        mock_session_class.return_value = db
        db.query.return_value.filter.return_value.first.return_value = None
        db.commit.side_effect = Exception("Database error")
        scraper = RSSScraper()
        with pytest.raises(Exception):
            scraper.save_articles_to_db(self._sample_articles(), db)
        db.rollback.assert_called_once()
class TestCreateRSSScraper:
    """Tests for the `create_rss_scraper` factory function."""

    def test_create_default_scraper(self):
        """Calling the factory without arguments yields a configured scraper."""
        instance = create_rss_scraper()
        assert isinstance(instance, RSSScraper)
        assert len(instance.rss_sources) > 0

    def test_create_custom_scraper(self):
        """Custom sources and keywords are forwarded to the scraper."""
        sources = ["http://custom.com/rss"]
        keywords = ["football"]
        instance = create_rss_scraper(rss_sources=sources, keywords=keywords)
        assert instance.rss_sources == sources
        assert instance.keywords == keywords