Initial commit

This commit is contained in:
2026-02-01 09:31:38 +01:00
commit e02db93960
4396 changed files with 1511612 additions and 0 deletions

View File

@@ -0,0 +1,16 @@
"""
Workers module.
This module provides background workers for processing
asynchronous tasks from RabbitMQ queues.
"""
from app.workers.scraping_worker import ScrapingWorker
from app.workers.sentiment_worker import SentimentWorker
from app.workers.energy_worker import EnergyWorker
__all__ = [
'ScrapingWorker',
'SentimentWorker',
'EnergyWorker'
]

View File

@@ -0,0 +1,191 @@
"""
Energy calculation worker module.
This module provides a worker that consumes energy calculation tasks
from RabbitMQ and executes energy calculation operations.
"""
import logging
from typing import Dict, List, Optional
from sqlalchemy.orm import Session
from app.services.energy_service import (
calculate_and_store_energy_score,
get_energy_score_by_match_and_team
)
from app.schemas.energy_score import EnergyScoreCalculationRequest
from app.ml.energy_calculator import (
calculate_energy_score,
get_source_weights
)
logger = logging.getLogger(__name__)
class EnergyWorker:
    """
    Worker for processing energy calculation tasks.

    Features:
    - Consumes tasks from energy_calculation_tasks queue
    - Executes energy score calculations
    - Publishes results to results queue
    - Handles errors with retries
    - Structured logging
    """

    def __init__(self):
        """Initialize energy calculation worker.

        No worker-local state is required; the energy calculator is
        initialized in app.ml.energy_calculator.
        """

    def execute_energy_calculation_task(
        self,
        task: Dict,
        db: Session
    ) -> Dict:
        """
        Execute an energy calculation task.

        Args:
            task: Energy calculation task data. Reads 'match_id',
                'team_id' and the optional sentiment/timestamp lists
                (missing lists default to empty).
            db: Database session.

        Returns:
            Dictionary with energy calculation results. On failure
            'status' is 'error' and the exception text is in 'error';
            the worker never raises for a bad task.
        """
        match_id = task.get('match_id')
        team_id = task.get('team_id')
        twitter_sentiments = task.get('twitter_sentiments', [])
        reddit_sentiments = task.get('reddit_sentiments', [])
        rss_sentiments = task.get('rss_sentiments', [])
        tweets_with_timestamps = task.get('tweets_with_timestamps', [])
        logger.info(
            f"🔧 Executing energy calculation task: "
            f"match_id={match_id}, team_id={team_id}"
        )
        try:
            # Idempotency guard: if a score is already stored for this
            # (match, team), return it instead of recomputing.
            existing_score = get_energy_score_by_match_and_team(
                db, match_id, team_id
            )
            if existing_score:
                logger.info(
                    f" Energy score already exists for "
                    f"match {match_id}, team {team_id}"
                )
                return {
                    'energy_score': existing_score.score,
                    'confidence': existing_score.confidence,
                    'sources_used': existing_score.sources_used,
                    'status': 'success',
                    'metadata': {
                        'match_id': match_id,
                        'team_id': team_id,
                        'updated_existing': True
                    }
                }
            # Build the calculation request from the task payload.
            request = EnergyScoreCalculationRequest(
                match_id=match_id,
                team_id=team_id,
                twitter_sentiments=twitter_sentiments,
                reddit_sentiments=reddit_sentiments,
                rss_sentiments=rss_sentiments,
                tweets_with_timestamps=tweets_with_timestamps
            )
            # Calculate and persist the energy score in one service call.
            energy_score = calculate_and_store_energy_score(db, request)
            logger.info(
                f"✅ Energy calculation completed: "
                f"score={energy_score.score:.2f}, "
                f"confidence={energy_score.confidence:.2f}"
            )
            return {
                'energy_score': energy_score.score,
                'confidence': energy_score.confidence,
                'sources_used': energy_score.sources_used,
                'status': 'success',
                'metadata': {
                    'match_id': match_id,
                    'team_id': team_id,
                    'twitter_score': energy_score.twitter_score,
                    'reddit_score': energy_score.reddit_score,
                    'rss_score': energy_score.rss_score,
                    'temporal_factor': energy_score.temporal_factor
                }
            }
        except Exception as e:
            # Task boundary: report the failure in the result payload so
            # the consuming loop keeps running.
            logger.error(f"❌ Energy calculation task failed: {e}")
            return {
                'energy_score': 0.0,
                'confidence': 0.0,
                'sources_used': [],
                'status': 'error',
                'error': str(e)
            }

    def calculate_mock_energy(
        self,
        twitter_sentiments: List[Dict],
        reddit_sentiments: List[Dict],
        rss_sentiments: Optional[List[Dict]] = None,
        tweets_with_timestamps: Optional[List[Dict]] = None
    ) -> Dict:
        """
        Calculate energy score without storing to database (for testing).

        Args:
            twitter_sentiments: List of Twitter sentiment scores
            reddit_sentiments: List of Reddit sentiment scores
            rss_sentiments: Optional list of RSS sentiment scores
            tweets_with_timestamps: Optional list of tweets with timestamps

        Returns:
            Dictionary with energy calculation results; 'status' is
            'error' (with the exception text in 'error') on failure.
        """
        try:
            # match_id/team_id of 0 mark this as a throwaway calculation
            # that is never persisted.
            result = calculate_energy_score(
                match_id=0,
                team_id=0,
                twitter_sentiments=twitter_sentiments,
                reddit_sentiments=reddit_sentiments,
                rss_sentiments=rss_sentiments or [],
                tweets_with_timestamps=tweets_with_timestamps or []
            )
            return {
                'energy_score': result['score'],
                'confidence': result['confidence'],
                'sources_used': result['sources_used'],
                'status': 'success'
            }
        except Exception as e:
            logger.error(f"❌ Mock energy calculation failed: {e}")
            return {
                'energy_score': 0.0,
                'confidence': 0.0,
                'sources_used': [],
                'status': 'error',
                'error': str(e)
            }
def create_energy_worker() -> EnergyWorker:
    """
    Factory function to create an energy calculation worker.

    Returns:
        Configured EnergyWorker instance
    """
    worker = EnergyWorker()
    return worker

View File

@@ -0,0 +1,243 @@
"""
Scraping worker module.
This module provides a worker that consumes scraping tasks
from RabbitMQ and executes scraping operations.
"""
import logging
from typing import Dict, List, Optional

from sqlalchemy.orm import Session

from app.scrapers.reddit_scraper import RedditScraper, create_reddit_scraper
from app.scrapers.twitter_scraper import TwitterScraper, create_twitter_scraper
logger = logging.getLogger(__name__)
class ScrapingWorker:
    """
    Worker for processing scraping tasks.

    Features:
    - Consumes tasks from scraping_tasks queue
    - Executes Twitter and Reddit scraping
    - Publishes results to results queue
    - Handles errors with retries
    - Structured logging
    """

    def __init__(
        self,
        twitter_bearer_token: str,
        reddit_client_id: str,
        reddit_client_secret: str
    ):
        """
        Initialize scraping worker.

        Args:
            twitter_bearer_token: Twitter API bearer token
            reddit_client_id: Reddit API client ID
            reddit_client_secret: Reddit API client secret
        """
        self.twitter_bearer_token = twitter_bearer_token
        self.reddit_client_id = reddit_client_id
        self.reddit_client_secret = reddit_client_secret
        # Scrapers are created lazily on first use, so the attributes
        # start as None (hence Optional).
        self.twitter_scraper: Optional[TwitterScraper] = None
        self.reddit_scraper: Optional[RedditScraper] = None

    def _get_twitter_scraper(self) -> TwitterScraper:
        """Get or create Twitter scraper instance."""
        if self.twitter_scraper is None:
            self.twitter_scraper = create_twitter_scraper(
                bearer_token=self.twitter_bearer_token,
                vip_match_ids=[]
            )
        return self.twitter_scraper

    def _get_reddit_scraper(self) -> RedditScraper:
        """Get or create Reddit scraper instance."""
        if self.reddit_scraper is None:
            self.reddit_scraper = create_reddit_scraper(
                client_id=self.reddit_client_id,
                client_secret=self.reddit_client_secret
            )
        return self.reddit_scraper

    def execute_scraping_task(
        self,
        task: Dict,
        db: Session
    ) -> Dict:
        """
        Execute a scraping task.

        Args:
            task: Scraping task data. Reads 'source' ('twitter' or
                'reddit'), 'match_id', optional 'keywords' and
                'priority' (logged only).
            db: Database session.

        Returns:
            Dictionary with scraping results; 'status' is 'error' for
            unknown sources or failures (the worker never raises).
        """
        source = task.get('source')
        match_id = task.get('match_id')
        keywords = task.get('keywords', [])
        priority = task.get('priority', 'normal')
        logger.info(
            f"🔧 Executing scraping task: match_id={match_id}, "
            f"source={source}, priority={priority}"
        )
        try:
            # Dispatch on the task's source; anything else is rejected.
            if source == 'twitter':
                return self._execute_twitter_scraping(match_id, keywords, db)
            elif source == 'reddit':
                return self._execute_reddit_scraping(match_id, keywords, db)
            else:
                logger.error(f"❌ Unknown scraping source: {source}")
                return {
                    'collected_count': 0,
                    'status': 'error',
                    'error': f'Unknown source: {source}'
                }
        except Exception as e:
            # Task boundary: report failures in the payload so the
            # consuming loop keeps running.
            logger.error(f"❌ Scraping task failed: {e}")
            return {
                'collected_count': 0,
                'status': 'error',
                'error': str(e)
            }

    def _execute_twitter_scraping(
        self,
        match_id: int,
        keywords: List[str],
        db: Session
    ) -> Dict:
        """
        Execute Twitter scraping.

        Args:
            match_id: Match identifier
            keywords: Search keywords
            db: Database session

        Returns:
            Dictionary with scraping results
        """
        try:
            scraper = self._get_twitter_scraper()
            # Scrape and persist tweets in one call (capped at 100).
            tweets = scraper.scrape_and_save(
                match_id=match_id,
                keywords=keywords,
                db=db,
                max_results=100
            )
            logger.info(
                f"✅ Twitter scraping completed: {len(tweets)} tweets collected"
            )
            return {
                'collected_count': len(tweets),
                'status': 'success',
                'metadata': {
                    'source': 'twitter',
                    'match_id': match_id,
                    'keywords': keywords
                }
            }
        except Exception as e:
            logger.error(f"❌ Twitter scraping failed: {e}")
            return {
                'collected_count': 0,
                'status': 'error',
                'error': str(e)
            }

    def _execute_reddit_scraping(
        self,
        match_id: int,
        keywords: List[str],
        db: Session
    ) -> Dict:
        """
        Execute Reddit scraping.

        Args:
            match_id: Match identifier
            keywords: Search keywords
            db: Database session

        Returns:
            Dictionary with scraping results
        """
        try:
            scraper = self._get_reddit_scraper()
            # Scrape and persist posts and their comments.
            result = scraper.scrape_and_save(
                match_id=match_id,
                db=db,
                keywords=keywords,
                scrape_comments=True
            )
            posts = result.get('posts', [])
            comments = result.get('comments', [])
            logger.info(
                f"✅ Reddit scraping completed: "
                f"{len(posts)} posts, {len(comments)} comments collected"
            )
            return {
                'collected_count': len(posts) + len(comments),
                'status': 'success',
                'metadata': {
                    'source': 'reddit',
                    'match_id': match_id,
                    'keywords': keywords,
                    'posts_count': len(posts),
                    'comments_count': len(comments)
                }
            }
        except Exception as e:
            logger.error(f"❌ Reddit scraping failed: {e}")
            return {
                'collected_count': 0,
                'status': 'error',
                'error': str(e)
            }
def create_scraping_worker(
    twitter_bearer_token: str,
    reddit_client_id: str,
    reddit_client_secret: str
) -> ScrapingWorker:
    """
    Factory function to create a scraping worker.

    Args:
        twitter_bearer_token: Twitter API bearer token
        reddit_client_id: Reddit API client ID
        reddit_client_secret: Reddit API client secret

    Returns:
        Configured ScrapingWorker instance
    """
    worker = ScrapingWorker(
        twitter_bearer_token=twitter_bearer_token,
        reddit_client_id=reddit_client_id,
        reddit_client_secret=reddit_client_secret
    )
    return worker

View File

@@ -0,0 +1,302 @@
"""
Sentiment analysis worker module.
This module provides a worker that consumes sentiment analysis tasks
from RabbitMQ and executes sentiment analysis operations.
"""
import logging
from typing import Dict, List
from sqlalchemy.orm import Session
from app.services.sentiment_service import (
process_tweet_batch,
process_reddit_post_batch,
get_sentiment_by_entity
)
from app.models.tweet import Tweet
from app.models.reddit_post import RedditPost
logger = logging.getLogger(__name__)
class SentimentWorker:
    """
    Worker for processing sentiment analysis tasks.

    Features:
    - Consumes tasks from sentiment_analysis_tasks queue
    - Executes VADER sentiment analysis
    - Processes batches of tweets and Reddit posts
    - Publishes results to results queue
    - Handles errors with retries
    - Structured logging
    """

    def __init__(self):
        """Initialize sentiment analysis worker.

        No worker-local state is required; the VADER analyzer is
        initialized in sentiment_service.
        """

    def execute_sentiment_analysis_task(
        self,
        task: Dict,
        db: Session
    ) -> Dict:
        """
        Execute a sentiment analysis task.

        Args:
            task: Sentiment analysis task data. Reads 'source'
                ('twitter' or 'reddit'), 'match_id', and 'entity_ids'
                (tweet or post IDs to analyze).
            db: Database session.

        Returns:
            Dictionary with sentiment analysis results; 'status' is
            'error' for unknown sources or failures (never raises).
        """
        source = task.get('source')
        match_id = task.get('match_id')
        entity_ids = task.get('entity_ids', [])
        logger.info(
            f"🔧 Executing sentiment analysis task: "
            f"match_id={match_id}, source={source}, "
            f"entities={len(entity_ids)}"
        )
        try:
            if source == 'twitter':
                return self._execute_twitter_sentiment_analysis(
                    match_id, entity_ids, db
                )
            elif source == 'reddit':
                return self._execute_reddit_sentiment_analysis(
                    match_id, entity_ids, db
                )
            else:
                logger.error(f"❌ Unknown sentiment source: {source}")
                return {
                    'analyzed_count': 0,
                    'status': 'error',
                    'error': f'Unknown source: {source}'
                }
        except Exception as e:
            # Task boundary: report failures in the payload so the
            # consuming loop keeps running.
            logger.error(f"❌ Sentiment analysis task failed: {e}")
            return {
                'analyzed_count': 0,
                'status': 'error',
                'error': str(e)
            }

    def _analyze_entity_batch(
        self,
        db: Session,
        match_id: int,
        entity_ids: List[str],
        entities: List,
        get_entity_id,
        entity_type: str,
        process_batch,
        source_name: str,
        plural_noun: str,
        unit_noun: str
    ) -> Dict:
        """
        Shared analysis pipeline for a batch of scraped entities.

        Both the Twitter and Reddit paths follow the same steps: bail
        out on an empty batch, skip already-analyzed entities, run the
        batch analyzer, then aggregate match-level metrics.

        Args:
            db: Database session.
            match_id: Match identifier (for metric aggregation).
            entity_ids: Requested entity IDs (logged on empty result).
            entities: ORM rows fetched for those IDs.
            get_entity_id: Callable extracting an entity's ID.
            entity_type: Sentiment entity type ('tweet'/'reddit_post').
            process_batch: Batch analyzer, called as (db, entities).
            source_name: Display name for log messages ('Twitter'/'Reddit').
            plural_noun: Noun for "none found"/"already analyzed" logs.
            unit_noun: Noun for the completion log.

        Returns:
            Dictionary with 'analyzed_count', 'status' and 'metrics'.
        """
        if not entities:
            logger.warning(f"⚠️ No {plural_noun} found for entities: {entity_ids}")
            return {
                'analyzed_count': 0,
                'status': 'success',
                'metrics': {
                    'total_count': 0,
                    'positive_count': 0,
                    'negative_count': 0,
                    'neutral_count': 0,
                    'average_compound': 0.0
                }
            }
        # Skip entities that already have a stored sentiment score.
        # NOTE(review): this issues one query per entity (N+1); a bulk
        # lookup would be faster for large batches — TODO confirm
        # typical batch sizes before optimizing.
        unanalyzed = [
            entity for entity in entities
            if not get_sentiment_by_entity(db, get_entity_id(entity), entity_type)
        ]
        if not unanalyzed:
            logger.info(
                f" All {len(entities)} {plural_noun} already analyzed"
            )
            # Nothing new to analyze; aggregate from stored sentiments.
            metrics = self._calculate_metrics_from_existing(db, match_id)
            return {
                'analyzed_count': 0,
                'status': 'success',
                'metrics': metrics
            }
        sentiment_scores = process_batch(db, unanalyzed)
        metrics = self._calculate_sentiment_metrics(db, match_id)
        logger.info(
            f"✅ {source_name} sentiment analysis completed: "
            f"{len(sentiment_scores)} {unit_noun} analyzed"
        )
        return {
            'analyzed_count': len(sentiment_scores),
            'status': 'success',
            'metrics': metrics
        }

    def _execute_twitter_sentiment_analysis(
        self,
        match_id: int,
        entity_ids: List[str],
        db: Session
    ) -> Dict:
        """
        Execute sentiment analysis for Twitter data.

        Args:
            match_id: Match identifier
            entity_ids: List of tweet IDs
            db: Database session

        Returns:
            Dictionary with sentiment analysis results
        """
        try:
            tweets = db.query(Tweet).filter(
                Tweet.tweet_id.in_(entity_ids)
            ).all()
            return self._analyze_entity_batch(
                db=db,
                match_id=match_id,
                entity_ids=entity_ids,
                entities=tweets,
                get_entity_id=lambda tweet: tweet.tweet_id,
                entity_type='tweet',
                process_batch=process_tweet_batch,
                source_name='Twitter',
                plural_noun='tweets',
                unit_noun='tweets'
            )
        except Exception as e:
            logger.error(f"❌ Twitter sentiment analysis failed: {e}")
            return {
                'analyzed_count': 0,
                'status': 'error',
                'error': str(e)
            }

    def _execute_reddit_sentiment_analysis(
        self,
        match_id: int,
        entity_ids: List[str],
        db: Session
    ) -> Dict:
        """
        Execute sentiment analysis for Reddit data.

        Args:
            match_id: Match identifier
            entity_ids: List of Reddit post IDs
            db: Database session

        Returns:
            Dictionary with sentiment analysis results
        """
        try:
            posts = db.query(RedditPost).filter(
                RedditPost.post_id.in_(entity_ids)
            ).all()
            return self._analyze_entity_batch(
                db=db,
                match_id=match_id,
                entity_ids=entity_ids,
                entities=posts,
                get_entity_id=lambda post: post.post_id,
                entity_type='reddit_post',
                process_batch=process_reddit_post_batch,
                source_name='Reddit',
                plural_noun='Reddit posts',
                unit_noun='posts'
            )
        except Exception as e:
            logger.error(f"❌ Reddit sentiment analysis failed: {e}")
            return {
                'analyzed_count': 0,
                'status': 'error',
                'error': str(e)
            }

    def _calculate_sentiment_metrics(
        self,
        db: Session,
        match_id: int
    ) -> Dict:
        """
        Calculate aggregated sentiment metrics for a match.

        Args:
            db: Database session
            match_id: Match identifier

        Returns:
            Dictionary with aggregated metrics
        """
        # Local import avoids a potential circular import with the
        # sentiment service module.
        from app.services.sentiment_service import calculate_match_sentiment_metrics
        return calculate_match_sentiment_metrics(db, match_id)

    def _calculate_metrics_from_existing(
        self,
        db: Session,
        match_id: int
    ) -> Dict:
        """
        Calculate metrics from existing sentiment scores.

        Currently identical to _calculate_sentiment_metrics; kept as a
        separate entry point for clarity at the call sites.

        Args:
            db: Database session
            match_id: Match identifier

        Returns:
            Dictionary with aggregated metrics
        """
        return self._calculate_sentiment_metrics(db, match_id)
def create_sentiment_worker() -> SentimentWorker:
    """
    Factory function to create a sentiment analysis worker.

    Returns:
        Configured SentimentWorker instance
    """
    worker = SentimentWorker()
    return worker