# chartbastan/backend/tests/test_rss_scraper.py
# (source metadata: 2026-02-01 09:31:38 +01:00, 370 lines, 12 KiB, Python)

"""
Tests for RSS scraper module.
This test suite validates the RSS scraper functionality including
parsing, filtering, error handling, and database operations.
"""
import pytest
from datetime import datetime, timezone
from unittest.mock import Mock, patch, MagicMock
from app.scrapers.rss_scraper import (
RSSScraper,
RSSArticleData,
create_rss_scraper
)
class TestRSSScraperInit:
    """Tests covering the different RSSScraper construction options."""

    def test_init_default_sources(self):
        """A scraper built with no arguments carries the default configuration."""
        sut = RSSScraper()
        assert len(sut.rss_sources) == 4
        assert sut.timeout == 30
        assert sut.max_articles_per_source == 100
        assert len(sut.keywords) > 0

    def test_init_custom_sources(self):
        """Caller-supplied RSS sources replace the defaults entirely."""
        sources = ["http://example.com/rss"]
        sut = RSSScraper(rss_sources=sources)
        assert sut.rss_sources == sources
        assert len(sut.rss_sources) == 1

    def test_init_custom_keywords(self):
        """Caller-supplied keywords replace the default keyword list."""
        keywords = ["football", "soccer"]
        sut = RSSScraper(keywords=keywords)
        assert sut.keywords == keywords

    def test_init_custom_timeout(self):
        """A custom timeout value is stored verbatim."""
        sut = RSSScraper(timeout=60)
        assert sut.timeout == 60
class TestRSSScraperIsArticleRelevant:
    """Tests for the keyword-based relevance filter `_is_article_relevant`."""

    def test_relevant_article_with_keyword(self):
        """An article whose text contains a configured keyword is kept."""
        sut = RSSScraper()
        result = sut._is_article_relevant(
            "Arsenal wins Premier League match",
            "Great performance by the team",
        )
        assert result is True

    def test_relevant_article_multiple_keywords(self):
        """An article matching several keywords is (still) relevant."""
        sut = RSSScraper()
        result = sut._is_article_relevant(
            "Champions League: Real Madrid vs Barcelona",
            "Soccer match preview",
        )
        assert result is True

    def test_irrelevant_article(self):
        """An article with no matching keyword is rejected."""
        sut = RSSScraper()
        result = sut._is_article_relevant(
            "Technology news: New iPhone released",
            "Apple announced new products",
        )
        assert result is False

    def test_case_insensitive_matching(self):
        """Keyword matching ignores letter case."""
        sut = RSSScraper()
        result = sut._is_article_relevant(
            "FOOTBALL MATCH: TEAM A VS TEAM B",
            "SOCCER game details",
        )
        assert result is True
class TestRSSScraperParsePublishedDate:
    """Tests for `_parse_published_date` string-to-datetime conversion."""

    def test_parse_valid_date(self):
        """An RFC-822 style date string parses into a timezone-aware datetime."""
        sut = RSSScraper()
        result = sut._parse_published_date("Sat, 15 Jan 2026 10:30:00 +0000")
        assert isinstance(result, datetime)
        assert result.tzinfo is not None

    def test_parse_invalid_date(self):
        """Garbage input still yields an aware datetime (current-time fallback)."""
        sut = RSSScraper()
        result = sut._parse_published_date("invalid-date")
        assert isinstance(result, datetime)
        assert result.tzinfo is not None
class TestRSSScraperParseFeed:
    """Tests for RSS feed parsing via `_parse_feed` (feedparser mocked out)."""

    @staticmethod
    def _mock_feed(entries, bozo=False):
        """Build a feedparser-style result object holding the given entries.

        `bozo=True` mimics feedparser signalling a malformed XML document.
        """
        feed = Mock()
        feed.feed = {'title': 'ESPN'}
        feed.bozo = bozo
        feed.entries = entries
        return feed

    @patch('feedparser.parse')
    def test_parse_valid_feed(self, mock_parse):
        """A well-formed feed with one relevant entry yields exactly one article."""
        mock_parse.return_value = self._mock_feed([
            Mock(
                id='article-1',
                title='Football match preview',
                summary='Team A vs Team B',
                published='Sat, 15 Jan 2026 10:30:00 +0000',
                link='http://example.com/article-1'
            )
        ])
        scraper = RSSScraper()
        articles = scraper._parse_feed('http://example.com/rss')
        # The entry title contains the keyword "football" (see the relevance
        # tests above), so it must survive filtering. The original assertion
        # `len(articles) >= 0` could never fail.
        assert len(articles) == 1
        mock_parse.assert_called_once()

    @patch('feedparser.parse')
    def test_parse_feed_with_bozo_error(self, mock_parse):
        """A feed flagged with an XML (bozo) error must not crash the parser."""
        mock_parse.return_value = self._mock_feed([], bozo=True)
        scraper = RSSScraper()
        articles = scraper._parse_feed('http://example.com/rss')
        # Expected behavior: log a warning and return an (empty) list.
        assert isinstance(articles, list)

    @patch('feedparser.parse')
    def test_parse_feed_filters_irrelevant_articles(self, mock_parse):
        """Entries without any keyword are dropped; relevant ones are kept."""
        mock_parse.return_value = self._mock_feed([
            Mock(
                id='article-1',
                title='Football news',
                summary='Match result',
                published='Sat, 15 Jan 2026 10:30:00 +0000',
                link='http://example.com/article-1'
            ),
            Mock(
                id='article-2',
                title='Technology news',
                summary='New iPhone',
                published='Sat, 15 Jan 2026 11:30:00 +0000',
                link='http://example.com/article-2'
            )
        ])
        scraper = RSSScraper()
        articles = scraper._parse_feed('http://example.com/rss')
        # Only the football entry is relevant; the technology one must be
        # filtered out. The original `>= 0` assertion was vacuous.
        titles = [a.title.lower() for a in articles]
        assert any('football' in t for t in titles)
        assert not any('technology' in t for t in titles)
class TestRSSScraperScrapeAllSources:
    """Tests for `scrape_all_sources` orchestration across configured feeds."""

    @staticmethod
    def _article():
        """Build a sample RSSArticleData payload for mocking `_parse_feed`."""
        return RSSArticleData(
            article_id='article-1',
            title='Football news',
            content='Match result',
            published_at=datetime.now(timezone.utc),
            source_url='http://example.com/rss',
            match_id=None,
            source='ESPN'
        )

    @patch('app.scrapers.rss_scraper.RSSScraper._parse_feed')
    def test_scrape_all_sources(self, mock_parse_feed):
        """Every configured source is parsed and its articles collected."""
        mock_parse_feed.return_value = [self._article()]
        scraper = RSSScraper()
        articles = scraper.scrape_all_sources()
        # Every source returned an article, so the aggregate must be non-empty.
        # The original assertion `len(articles) >= 0` could never fail.
        assert len(articles) >= 1
        assert mock_parse_feed.call_count == len(scraper.rss_sources)

    @patch('app.scrapers.rss_scraper.RSSScraper._parse_feed')
    def test_scrape_all_sources_with_match_id(self, mock_parse_feed):
        """A match_id passed to the scraper is stamped onto every article."""
        mock_parse_feed.return_value = [self._article()]
        scraper = RSSScraper()
        match_id = 123
        articles = scraper.scrape_all_sources(match_id=match_id)
        # Guard against a vacuously-passing loop over an empty result.
        assert articles
        for article in articles:
            assert article.match_id == match_id

    @patch('app.scrapers.rss_scraper.RSSScraper._parse_feed')
    def test_scrape_all_sources_continues_on_error(self, mock_parse_feed):
        """One failing source must not abort scraping of the remaining sources."""
        # Pin the source list to exactly three entries so it matches the
        # side_effect sequence below. The default configuration has four
        # sources, which would exhaust a 3-item side_effect and raise
        # StopIteration on the fourth call.
        sources = [
            'http://one.example.com/rss',
            'http://two.example.com/rss',
            'http://three.example.com/rss',
        ]
        mock_parse_feed.side_effect = [
            [],                          # first source succeeds
            Exception("Network error"),  # second source fails
            []                           # third source succeeds
        ]
        scraper = RSSScraper(rss_sources=sources)
        articles = scraper.scrape_all_sources()
        # The failing source is skipped; results from the others are returned.
        assert isinstance(articles, list)
class TestRSSScraperSaveToDatabase:
    """Tests for persisting scraped articles via `save_articles_to_db`."""

    @staticmethod
    def _sample_articles():
        """Return a single-article list matching the scraper's data shape."""
        return [
            RSSArticleData(
                article_id='article-1',
                title='Football news',
                content='Match result',
                published_at=datetime.now(timezone.utc),
                source_url='http://example.com/rss',
                match_id=None,
                source='ESPN'
            )
        ]

    @patch('app.scrapers.rss_scraper.Session')
    def test_save_articles_to_db(self, mock_session_class):
        """A new article is written and the session is committed."""
        db = Mock()
        mock_session_class.return_value = db
        # No existing row -> the article counts as new.
        db.query.return_value.filter.return_value.first.return_value = None
        scraper = RSSScraper()
        scraper.save_articles_to_db(self._sample_articles(), db)
        db.commit.assert_called_once()

    @patch('app.scrapers.rss_scraper.Session')
    def test_save_articles_skips_duplicates(self, mock_session_class):
        """An article already present in the DB is not inserted again."""
        db = Mock()
        mock_session_class.return_value = db
        # Existing row -> the article is treated as a duplicate.
        db.query.return_value.filter.return_value.first.return_value = Mock()
        scraper = RSSScraper()
        scraper.save_articles_to_db(self._sample_articles(), db)
        # Duplicate is skipped, but the transaction still commits.
        assert db.add.call_count == 0
        db.commit.assert_called_once()

    @patch('app.scrapers.rss_scraper.Session')
    def test_save_articles_handles_db_error(self, mock_session_class):
        """A commit failure propagates to the caller and triggers a rollback."""
        db = Mock()
        mock_session_class.return_value = db
        db.query.return_value.filter.return_value.first.return_value = None
        db.commit.side_effect = Exception("Database error")
        scraper = RSSScraper()
        with pytest.raises(Exception):
            scraper.save_articles_to_db(self._sample_articles(), db)
        db.rollback.assert_called_once()
class TestCreateRSSScraper:
    """Tests for the `create_rss_scraper` factory function."""

    def test_create_default_scraper(self):
        """Calling the factory without arguments yields a configured scraper."""
        instance = create_rss_scraper()
        assert isinstance(instance, RSSScraper)
        assert len(instance.rss_sources) > 0

    def test_create_custom_scraper(self):
        """Custom sources and keywords are forwarded to the scraper."""
        sources = ["http://custom.com/rss"]
        keywords = ["football"]
        instance = create_rss_scraper(rss_sources=sources, keywords=keywords)
        assert instance.rss_sources == sources
        assert instance.keywords == keywords