"""
Tests for RSS scraper module.

This test suite validates the RSS scraper functionality including
parsing, filtering, error handling, and database operations.
"""

from datetime import datetime, timezone
from unittest.mock import MagicMock, Mock, patch

import pytest

from app.scrapers.rss_scraper import (
    RSSScraper,
    RSSArticleData,
    create_rss_scraper,
)


class TestRSSScraperInit:
    """Constructor behaviour of RSSScraper."""

    def test_init_default_sources(self):
        """A scraper built with no arguments uses the default configuration."""
        default_scraper = RSSScraper()

        assert len(default_scraper.rss_sources) == 4
        assert default_scraper.timeout == 30
        assert default_scraper.max_articles_per_source == 100
        assert len(default_scraper.keywords) > 0

    def test_init_custom_sources(self):
        """Explicitly supplied feed URLs replace the defaults."""
        sources = ["http://example.com/rss"]
        custom_scraper = RSSScraper(rss_sources=sources)

        assert custom_scraper.rss_sources == sources
        assert len(custom_scraper.rss_sources) == 1

    def test_init_custom_keywords(self):
        """Explicitly supplied keywords replace the defaults."""
        keywords = ["football", "soccer"]
        custom_scraper = RSSScraper(keywords=keywords)

        assert custom_scraper.keywords == keywords

    def test_init_custom_timeout(self):
        """An explicit timeout overrides the 30-second default."""
        patient_scraper = RSSScraper(timeout=60)

        assert patient_scraper.timeout == 60

class TestRSSScraperIsArticleRelevant:
    """Keyword-based relevance filtering via _is_article_relevant."""

    def test_relevant_article_with_keyword(self):
        """An article whose text mentions a tracked keyword passes the filter."""
        filter_scraper = RSSScraper()
        result = filter_scraper._is_article_relevant(
            "Arsenal wins Premier League match",
            "Great performance by the team",
        )
        assert result is True

    def test_relevant_article_multiple_keywords(self):
        """An article matching several keywords still passes the filter."""
        filter_scraper = RSSScraper()
        result = filter_scraper._is_article_relevant(
            "Champions League: Real Madrid vs Barcelona",
            "Soccer match preview",
        )
        assert result is True

    def test_irrelevant_article(self):
        """An article with no tracked keywords is rejected."""
        filter_scraper = RSSScraper()
        result = filter_scraper._is_article_relevant(
            "Technology news: New iPhone released",
            "Apple announced new products",
        )
        assert result is False

    def test_case_insensitive_matching(self):
        """Keyword matching ignores letter case."""
        filter_scraper = RSSScraper()
        result = filter_scraper._is_article_relevant(
            "FOOTBALL MATCH: TEAM A VS TEAM B",
            "SOCCER game details",
        )
        assert result is True

class TestRSSScraperParsePublishedDate:
    """Published-date parsing via _parse_published_date."""

    def test_parse_valid_date(self):
        """A well-formed RSS date string parses to a timezone-aware datetime."""
        date_scraper = RSSScraper()
        result = date_scraper._parse_published_date(
            "Sat, 15 Jan 2026 10:30:00 +0000"
        )

        assert isinstance(result, datetime)
        assert result.tzinfo is not None

    def test_parse_invalid_date(self):
        """An unparseable string falls back to a timezone-aware datetime."""
        date_scraper = RSSScraper()
        result = date_scraper._parse_published_date("invalid-date")

        assert isinstance(result, datetime)
        assert result.tzinfo is not None

class TestRSSScraperParseFeed:
    """Tests for RSS feed parsing."""

    @patch('feedparser.parse')
    def test_parse_valid_feed(self, mock_parse):
        """Test parsing a valid RSS feed."""
        # Mock feedparser response with a single sport-related entry.
        mock_feed = Mock()
        mock_feed.feed = {'title': 'ESPN'}
        mock_feed.bozo = False
        mock_feed.entries = [
            Mock(
                id='article-1',
                title='Football match preview',
                summary='Team A vs Team B',
                published='Sat, 15 Jan 2026 10:30:00 +0000',
                link='http://example.com/article-1'
            )
        ]
        mock_parse.return_value = mock_feed

        scraper = RSSScraper()
        articles = scraper._parse_feed('http://example.com/rss')

        # The original `len(articles) >= 0` was vacuous (always true).
        # Assert the actual contract: a list of RSSArticleData comes back.
        assert isinstance(articles, list)
        assert all(isinstance(a, RSSArticleData) for a in articles)
        mock_parse.assert_called_once()

    @patch('feedparser.parse')
    def test_parse_feed_with_bozo_error(self, mock_parse):
        """Test parsing a feed with XML errors."""
        # Mock feedparser response with the bozo flag set (malformed XML)
        # and no entries at all.
        mock_feed = Mock()
        mock_feed.feed = {'title': 'ESPN'}
        mock_feed.bozo = True
        mock_feed.entries = []
        mock_parse.return_value = mock_feed

        scraper = RSSScraper()
        articles = scraper._parse_feed('http://example.com/rss')

        # Should not crash; with zero entries, a (necessarily empty) list
        # must be returned.
        assert isinstance(articles, list)
        assert articles == []

    @patch('feedparser.parse')
    def test_parse_feed_filters_irrelevant_articles(self, mock_parse):
        """Test that irrelevant articles are filtered out."""
        # One relevant (football) entry and one irrelevant (technology) one.
        mock_feed = Mock()
        mock_feed.feed = {'title': 'ESPN'}
        mock_feed.bozo = False
        mock_feed.entries = [
            Mock(
                id='article-1',
                title='Football news',
                summary='Match result',
                published='Sat, 15 Jan 2026 10:30:00 +0000',
                link='http://example.com/article-1'
            ),
            Mock(
                id='article-2',
                title='Technology news',
                summary='New iPhone',
                published='Sat, 15 Jan 2026 11:30:00 +0000',
                link='http://example.com/article-2'
            )
        ]
        mock_parse.return_value = mock_feed

        scraper = RSSScraper()
        articles = scraper._parse_feed('http://example.com/rss')

        # The original `len(football_articles) >= 0` was vacuous. The real
        # requirement (per TestRSSScraperIsArticleRelevant) is that the
        # technology entry never survives the keyword filter.
        titles = [a.title for a in articles]
        assert 'Technology news' not in titles

class TestRSSScraperScrapeAllSources:
    """Tests for scraping all sources."""

    @staticmethod
    def _sample_feed_result():
        """One-article _parse_feed result shared by the tests below."""
        return [
            RSSArticleData(
                article_id='article-1',
                title='Football news',
                content='Match result',
                published_at=datetime.now(timezone.utc),
                source_url='http://example.com/rss',
                match_id=None,
                source='ESPN'
            )
        ]

    @patch('app.scrapers.rss_scraper.RSSScraper._parse_feed')
    def test_scrape_all_sources(self, mock_parse_feed):
        """Test scraping all configured sources."""
        mock_parse_feed.return_value = self._sample_feed_result()

        scraper = RSSScraper()
        articles = scraper.scrape_all_sources()

        # The original `len(articles) >= 0` was vacuous. With every source
        # returning one article, at least one article must come back.
        assert len(articles) >= 1
        assert mock_parse_feed.call_count == len(scraper.rss_sources)

    @patch('app.scrapers.rss_scraper.RSSScraper._parse_feed')
    def test_scrape_all_sources_with_match_id(self, mock_parse_feed):
        """Test scraping with match ID."""
        mock_parse_feed.return_value = self._sample_feed_result()

        scraper = RSSScraper()
        match_id = 123
        articles = scraper.scrape_all_sources(match_id=match_id)

        # All returned articles should carry the supplied match_id.
        for article in articles:
            assert article.match_id == match_id

    @patch('app.scrapers.rss_scraper.RSSScraper._parse_feed')
    def test_scrape_all_sources_continues_on_error(self, mock_parse_feed):
        """Test that scraper continues on source errors."""
        scraper = RSSScraper()

        # BUG FIX: the original side_effect list had only 3 entries while the
        # default scraper has 4 sources, so the 4th _parse_feed call raised
        # StopIteration. Build exactly one effect per configured source:
        # the second source fails, every other source succeeds with no items.
        effects = [[] for _ in scraper.rss_sources]
        effects[1] = Exception("Network error")
        mock_parse_feed.side_effect = effects

        articles = scraper.scrape_all_sources()

        # Should have collected from the successful sources and attempted
        # every source despite the failure.
        assert isinstance(articles, list)
        assert mock_parse_feed.call_count == len(scraper.rss_sources)

class TestRSSScraperSaveToDatabase:
    """Tests for saving articles to database.

    Note: the original tests patched ``app.scrapers.rss_scraper.Session``,
    but the patched class was never exercised — ``save_articles_to_db`` is
    always handed the mock session explicitly. The dead patch decorators
    have been removed.
    """

    @staticmethod
    def _sample_articles():
        """Single-article fixture shared by every persistence test."""
        return [
            RSSArticleData(
                article_id='article-1',
                title='Football news',
                content='Match result',
                published_at=datetime.now(timezone.utc),
                source_url='http://example.com/rss',
                match_id=None,
                source='ESPN'
            )
        ]

    def test_save_articles_to_db(self):
        """Test saving articles to database."""
        mock_db = Mock()
        # Existence query returns None: the article is new.
        mock_db.query.return_value.filter.return_value.first.return_value = None

        scraper = RSSScraper()
        scraper.save_articles_to_db(self._sample_articles(), mock_db)

        # Verify commit was called
        mock_db.commit.assert_called_once()

    def test_save_articles_skips_duplicates(self):
        """Test that duplicate articles are skipped."""
        mock_db = Mock()
        # Existence query finds a row: the article is a duplicate.
        mock_existing = Mock()
        mock_db.query.return_value.filter.return_value.first.return_value = mock_existing

        scraper = RSSScraper()
        scraper.save_articles_to_db(self._sample_articles(), mock_db)

        # Should not add duplicate, but still commit
        assert mock_db.add.call_count == 0
        mock_db.commit.assert_called_once()

    def test_save_articles_handles_db_error(self):
        """Test that database errors are handled properly."""
        mock_db = Mock()
        # Article is new, but the commit itself blows up.
        mock_db.query.return_value.filter.return_value.first.return_value = None
        mock_db.commit.side_effect = Exception("Database error")

        scraper = RSSScraper()

        # Should propagate the commit failure (match narrows the broad
        # Exception check to the error we actually injected).
        with pytest.raises(Exception, match="Database error"):
            scraper.save_articles_to_db(self._sample_articles(), mock_db)

        # Verify rollback was called
        mock_db.rollback.assert_called_once()

class TestCreateRSSScraper:
    """Behaviour of the create_rss_scraper factory function."""

    def test_create_default_scraper(self):
        """With no arguments the factory yields a scraper with default sources."""
        built = create_rss_scraper()

        assert isinstance(built, RSSScraper)
        assert len(built.rss_sources) > 0

    def test_create_custom_scraper(self):
        """The factory forwards custom sources and keywords to the scraper."""
        sources = ["http://custom.com/rss"]
        keywords = ["football"]

        built = create_rss_scraper(rss_sources=sources, keywords=keywords)

        assert built.rss_sources == sources
        assert built.keywords == keywords