Initial commit

This commit is contained in:
2026-02-01 09:31:38 +01:00
commit e02db93960
4396 changed files with 1511612 additions and 0 deletions

View File

@@ -0,0 +1,16 @@
"""
Workers module.
This module provides background workers for processing
asynchronous tasks from RabbitMQ queues.
"""
from app.workers.scraping_worker import ScrapingWorker
from app.workers.sentiment_worker import SentimentWorker
from app.workers.energy_worker import EnergyWorker
__all__ = [
'ScrapingWorker',
'SentimentWorker',
'EnergyWorker'
]

View File

@@ -0,0 +1,191 @@
"""
Energy calculation worker module.
This module provides a worker that consumes energy calculation tasks
from RabbitMQ and executes energy calculation operations.
"""
import logging
from typing import Dict, List, Optional
from sqlalchemy.orm import Session
from app.services.energy_service import (
calculate_and_store_energy_score,
get_energy_score_by_match_and_team
)
from app.schemas.energy_score import EnergyScoreCalculationRequest
from app.ml.energy_calculator import (
calculate_energy_score,
get_source_weights
)
logger = logging.getLogger(__name__)
class EnergyWorker:
    """
    Worker for processing energy calculation tasks.

    Features:
    - Consumes tasks from energy_calculation_tasks queue
    - Executes energy score calculations
    - Publishes results to results queue
    - Handles errors with retries
    - Structured logging
    """

    def __init__(self):
        """Initialize energy calculation worker.

        No worker-local state is required; the energy calculator is
        initialized in app.ml.energy_calculator.
        """

    def execute_energy_calculation_task(
        self,
        task: Dict,
        db: Session
    ) -> Dict:
        """
        Execute an energy calculation task.

        Args:
            task: Energy calculation task data. Reads 'match_id',
                'team_id' and the optional sentiment/timestamp lists
                (missing lists default to empty).
            db: Database session.

        Returns:
            Dictionary with energy calculation results. On failure
            'status' is 'error' and the exception text is in 'error';
            the worker never raises for a bad task.
        """
        match_id = task.get('match_id')
        team_id = task.get('team_id')
        twitter_sentiments = task.get('twitter_sentiments', [])
        reddit_sentiments = task.get('reddit_sentiments', [])
        rss_sentiments = task.get('rss_sentiments', [])
        tweets_with_timestamps = task.get('tweets_with_timestamps', [])
        logger.info(
            f"🔧 Executing energy calculation task: "
            f"match_id={match_id}, team_id={team_id}"
        )
        try:
            # Idempotency guard: if a score is already stored for this
            # (match, team), return it instead of recomputing.
            existing_score = get_energy_score_by_match_and_team(
                db, match_id, team_id
            )
            if existing_score:
                logger.info(
                    f" Energy score already exists for "
                    f"match {match_id}, team {team_id}"
                )
                return {
                    'energy_score': existing_score.score,
                    'confidence': existing_score.confidence,
                    'sources_used': existing_score.sources_used,
                    'status': 'success',
                    'metadata': {
                        'match_id': match_id,
                        'team_id': team_id,
                        'updated_existing': True
                    }
                }
            # Build the calculation request from the task payload.
            request = EnergyScoreCalculationRequest(
                match_id=match_id,
                team_id=team_id,
                twitter_sentiments=twitter_sentiments,
                reddit_sentiments=reddit_sentiments,
                rss_sentiments=rss_sentiments,
                tweets_with_timestamps=tweets_with_timestamps
            )
            # Calculate and persist the energy score in one service call.
            energy_score = calculate_and_store_energy_score(db, request)
            logger.info(
                f"✅ Energy calculation completed: "
                f"score={energy_score.score:.2f}, "
                f"confidence={energy_score.confidence:.2f}"
            )
            return {
                'energy_score': energy_score.score,
                'confidence': energy_score.confidence,
                'sources_used': energy_score.sources_used,
                'status': 'success',
                'metadata': {
                    'match_id': match_id,
                    'team_id': team_id,
                    'twitter_score': energy_score.twitter_score,
                    'reddit_score': energy_score.reddit_score,
                    'rss_score': energy_score.rss_score,
                    'temporal_factor': energy_score.temporal_factor
                }
            }
        except Exception as e:
            # Task boundary: report the failure in the result payload so
            # the consuming loop keeps running.
            logger.error(f"❌ Energy calculation task failed: {e}")
            return {
                'energy_score': 0.0,
                'confidence': 0.0,
                'sources_used': [],
                'status': 'error',
                'error': str(e)
            }

    def calculate_mock_energy(
        self,
        twitter_sentiments: List[Dict],
        reddit_sentiments: List[Dict],
        rss_sentiments: Optional[List[Dict]] = None,
        tweets_with_timestamps: Optional[List[Dict]] = None
    ) -> Dict:
        """
        Calculate energy score without storing to database (for testing).

        Args:
            twitter_sentiments: List of Twitter sentiment scores
            reddit_sentiments: List of Reddit sentiment scores
            rss_sentiments: Optional list of RSS sentiment scores
            tweets_with_timestamps: Optional list of tweets with timestamps

        Returns:
            Dictionary with energy calculation results; 'status' is
            'error' (with the exception text in 'error') on failure.
        """
        try:
            # match_id/team_id of 0 mark this as a throwaway calculation
            # that is never persisted.
            result = calculate_energy_score(
                match_id=0,
                team_id=0,
                twitter_sentiments=twitter_sentiments,
                reddit_sentiments=reddit_sentiments,
                rss_sentiments=rss_sentiments or [],
                tweets_with_timestamps=tweets_with_timestamps or []
            )
            return {
                'energy_score': result['score'],
                'confidence': result['confidence'],
                'sources_used': result['sources_used'],
                'status': 'success'
            }
        except Exception as e:
            logger.error(f"❌ Mock energy calculation failed: {e}")
            return {
                'energy_score': 0.0,
                'confidence': 0.0,
                'sources_used': [],
                'status': 'error',
                'error': str(e)
            }
def create_energy_worker() -> EnergyWorker:
    """
    Factory function to create an energy calculation worker.

    Returns:
        Configured EnergyWorker instance
    """
    worker = EnergyWorker()
    return worker

View File

@@ -0,0 +1,243 @@
"""
Scraping worker module.
This module provides a worker that consumes scraping tasks
from RabbitMQ and executes scraping operations.
"""
import logging
from typing import Dict, List, Optional

from sqlalchemy.orm import Session

from app.scrapers.reddit_scraper import RedditScraper, create_reddit_scraper
from app.scrapers.twitter_scraper import TwitterScraper, create_twitter_scraper
logger = logging.getLogger(__name__)
class ScrapingWorker:
    """
    Worker for processing scraping tasks.

    Features:
    - Consumes tasks from scraping_tasks queue
    - Executes Twitter and Reddit scraping
    - Publishes results to results queue
    - Handles errors with retries
    - Structured logging
    """

    def __init__(
        self,
        twitter_bearer_token: str,
        reddit_client_id: str,
        reddit_client_secret: str
    ):
        """
        Initialize scraping worker.

        Args:
            twitter_bearer_token: Twitter API bearer token
            reddit_client_id: Reddit API client ID
            reddit_client_secret: Reddit API client secret
        """
        self.twitter_bearer_token = twitter_bearer_token
        self.reddit_client_id = reddit_client_id
        self.reddit_client_secret = reddit_client_secret
        # Scrapers are created lazily on first use, so the attributes
        # start as None (hence Optional).
        self.twitter_scraper: Optional[TwitterScraper] = None
        self.reddit_scraper: Optional[RedditScraper] = None

    def _get_twitter_scraper(self) -> TwitterScraper:
        """Get or create Twitter scraper instance."""
        if self.twitter_scraper is None:
            self.twitter_scraper = create_twitter_scraper(
                bearer_token=self.twitter_bearer_token,
                vip_match_ids=[]
            )
        return self.twitter_scraper

    def _get_reddit_scraper(self) -> RedditScraper:
        """Get or create Reddit scraper instance."""
        if self.reddit_scraper is None:
            self.reddit_scraper = create_reddit_scraper(
                client_id=self.reddit_client_id,
                client_secret=self.reddit_client_secret
            )
        return self.reddit_scraper

    def execute_scraping_task(
        self,
        task: Dict,
        db: Session
    ) -> Dict:
        """
        Execute a scraping task.

        Args:
            task: Scraping task data. Reads 'source' ('twitter' or
                'reddit'), 'match_id', optional 'keywords' and
                'priority' (logged only).
            db: Database session.

        Returns:
            Dictionary with scraping results; 'status' is 'error' for
            unknown sources or failures (the worker never raises).
        """
        source = task.get('source')
        match_id = task.get('match_id')
        keywords = task.get('keywords', [])
        priority = task.get('priority', 'normal')
        logger.info(
            f"🔧 Executing scraping task: match_id={match_id}, "
            f"source={source}, priority={priority}"
        )
        try:
            # Dispatch on the task's source; anything else is rejected.
            if source == 'twitter':
                return self._execute_twitter_scraping(match_id, keywords, db)
            elif source == 'reddit':
                return self._execute_reddit_scraping(match_id, keywords, db)
            else:
                logger.error(f"❌ Unknown scraping source: {source}")
                return {
                    'collected_count': 0,
                    'status': 'error',
                    'error': f'Unknown source: {source}'
                }
        except Exception as e:
            # Task boundary: report failures in the payload so the
            # consuming loop keeps running.
            logger.error(f"❌ Scraping task failed: {e}")
            return {
                'collected_count': 0,
                'status': 'error',
                'error': str(e)
            }

    def _execute_twitter_scraping(
        self,
        match_id: int,
        keywords: List[str],
        db: Session
    ) -> Dict:
        """
        Execute Twitter scraping.

        Args:
            match_id: Match identifier
            keywords: Search keywords
            db: Database session

        Returns:
            Dictionary with scraping results
        """
        try:
            scraper = self._get_twitter_scraper()
            # Scrape and persist tweets in one call (capped at 100).
            tweets = scraper.scrape_and_save(
                match_id=match_id,
                keywords=keywords,
                db=db,
                max_results=100
            )
            logger.info(
                f"✅ Twitter scraping completed: {len(tweets)} tweets collected"
            )
            return {
                'collected_count': len(tweets),
                'status': 'success',
                'metadata': {
                    'source': 'twitter',
                    'match_id': match_id,
                    'keywords': keywords
                }
            }
        except Exception as e:
            logger.error(f"❌ Twitter scraping failed: {e}")
            return {
                'collected_count': 0,
                'status': 'error',
                'error': str(e)
            }

    def _execute_reddit_scraping(
        self,
        match_id: int,
        keywords: List[str],
        db: Session
    ) -> Dict:
        """
        Execute Reddit scraping.

        Args:
            match_id: Match identifier
            keywords: Search keywords
            db: Database session

        Returns:
            Dictionary with scraping results
        """
        try:
            scraper = self._get_reddit_scraper()
            # Scrape and persist posts and their comments.
            result = scraper.scrape_and_save(
                match_id=match_id,
                db=db,
                keywords=keywords,
                scrape_comments=True
            )
            posts = result.get('posts', [])
            comments = result.get('comments', [])
            logger.info(
                f"✅ Reddit scraping completed: "
                f"{len(posts)} posts, {len(comments)} comments collected"
            )
            return {
                'collected_count': len(posts) + len(comments),
                'status': 'success',
                'metadata': {
                    'source': 'reddit',
                    'match_id': match_id,
                    'keywords': keywords,
                    'posts_count': len(posts),
                    'comments_count': len(comments)
                }
            }
        except Exception as e:
            logger.error(f"❌ Reddit scraping failed: {e}")
            return {
                'collected_count': 0,
                'status': 'error',
                'error': str(e)
            }
def create_scraping_worker(
    twitter_bearer_token: str,
    reddit_client_id: str,
    reddit_client_secret: str
) -> ScrapingWorker:
    """
    Factory function to create a scraping worker.

    Args:
        twitter_bearer_token: Twitter API bearer token
        reddit_client_id: Reddit API client ID
        reddit_client_secret: Reddit API client secret

    Returns:
        Configured ScrapingWorker instance
    """
    worker = ScrapingWorker(
        twitter_bearer_token=twitter_bearer_token,
        reddit_client_id=reddit_client_id,
        reddit_client_secret=reddit_client_secret
    )
    return worker

View File

@@ -0,0 +1,302 @@
"""
Sentiment analysis worker module.
This module provides a worker that consumes sentiment analysis tasks
from RabbitMQ and executes sentiment analysis operations.
"""
import logging
from typing import Dict, List
from sqlalchemy.orm import Session
from app.services.sentiment_service import (
process_tweet_batch,
process_reddit_post_batch,
get_sentiment_by_entity
)
from app.models.tweet import Tweet
from app.models.reddit_post import RedditPost
logger = logging.getLogger(__name__)
class SentimentWorker:
    """
    Worker for processing sentiment analysis tasks.

    Features:
    - Consumes tasks from sentiment_analysis_tasks queue
    - Executes VADER sentiment analysis
    - Processes batches of tweets and Reddit posts
    - Publishes results to results queue
    - Handles errors with retries
    - Structured logging
    """

    def __init__(self):
        """Initialize sentiment analysis worker.

        No worker-local state is required; the VADER analyzer is
        initialized in sentiment_service.
        """

    def execute_sentiment_analysis_task(
        self,
        task: Dict,
        db: Session
    ) -> Dict:
        """
        Execute a sentiment analysis task.

        Args:
            task: Sentiment analysis task data. Reads 'source'
                ('twitter' or 'reddit'), 'match_id', and 'entity_ids'
                (tweet or post IDs to analyze).
            db: Database session.

        Returns:
            Dictionary with sentiment analysis results; 'status' is
            'error' for unknown sources or failures (never raises).
        """
        source = task.get('source')
        match_id = task.get('match_id')
        entity_ids = task.get('entity_ids', [])
        logger.info(
            f"🔧 Executing sentiment analysis task: "
            f"match_id={match_id}, source={source}, "
            f"entities={len(entity_ids)}"
        )
        try:
            if source == 'twitter':
                return self._execute_twitter_sentiment_analysis(
                    match_id, entity_ids, db
                )
            elif source == 'reddit':
                return self._execute_reddit_sentiment_analysis(
                    match_id, entity_ids, db
                )
            else:
                logger.error(f"❌ Unknown sentiment source: {source}")
                return {
                    'analyzed_count': 0,
                    'status': 'error',
                    'error': f'Unknown source: {source}'
                }
        except Exception as e:
            # Task boundary: report failures in the payload so the
            # consuming loop keeps running.
            logger.error(f"❌ Sentiment analysis task failed: {e}")
            return {
                'analyzed_count': 0,
                'status': 'error',
                'error': str(e)
            }

    def _analyze_entity_batch(
        self,
        db: Session,
        match_id: int,
        entity_ids: List[str],
        entities: List,
        get_entity_id,
        entity_type: str,
        process_batch,
        source_name: str,
        plural_noun: str,
        unit_noun: str
    ) -> Dict:
        """
        Shared analysis pipeline for a batch of scraped entities.

        Both the Twitter and Reddit paths follow the same steps: bail
        out on an empty batch, skip already-analyzed entities, run the
        batch analyzer, then aggregate match-level metrics.

        Args:
            db: Database session.
            match_id: Match identifier (for metric aggregation).
            entity_ids: Requested entity IDs (logged on empty result).
            entities: ORM rows fetched for those IDs.
            get_entity_id: Callable extracting an entity's ID.
            entity_type: Sentiment entity type ('tweet'/'reddit_post').
            process_batch: Batch analyzer, called as (db, entities).
            source_name: Display name for log messages ('Twitter'/'Reddit').
            plural_noun: Noun for "none found"/"already analyzed" logs.
            unit_noun: Noun for the completion log.

        Returns:
            Dictionary with 'analyzed_count', 'status' and 'metrics'.
        """
        if not entities:
            logger.warning(f"⚠️ No {plural_noun} found for entities: {entity_ids}")
            return {
                'analyzed_count': 0,
                'status': 'success',
                'metrics': {
                    'total_count': 0,
                    'positive_count': 0,
                    'negative_count': 0,
                    'neutral_count': 0,
                    'average_compound': 0.0
                }
            }
        # Skip entities that already have a stored sentiment score.
        # NOTE(review): this issues one query per entity (N+1); a bulk
        # lookup would be faster for large batches — TODO confirm
        # typical batch sizes before optimizing.
        unanalyzed = [
            entity for entity in entities
            if not get_sentiment_by_entity(db, get_entity_id(entity), entity_type)
        ]
        if not unanalyzed:
            logger.info(
                f" All {len(entities)} {plural_noun} already analyzed"
            )
            # Nothing new to analyze; aggregate from stored sentiments.
            metrics = self._calculate_metrics_from_existing(db, match_id)
            return {
                'analyzed_count': 0,
                'status': 'success',
                'metrics': metrics
            }
        sentiment_scores = process_batch(db, unanalyzed)
        metrics = self._calculate_sentiment_metrics(db, match_id)
        logger.info(
            f"✅ {source_name} sentiment analysis completed: "
            f"{len(sentiment_scores)} {unit_noun} analyzed"
        )
        return {
            'analyzed_count': len(sentiment_scores),
            'status': 'success',
            'metrics': metrics
        }

    def _execute_twitter_sentiment_analysis(
        self,
        match_id: int,
        entity_ids: List[str],
        db: Session
    ) -> Dict:
        """
        Execute sentiment analysis for Twitter data.

        Args:
            match_id: Match identifier
            entity_ids: List of tweet IDs
            db: Database session

        Returns:
            Dictionary with sentiment analysis results
        """
        try:
            tweets = db.query(Tweet).filter(
                Tweet.tweet_id.in_(entity_ids)
            ).all()
            return self._analyze_entity_batch(
                db=db,
                match_id=match_id,
                entity_ids=entity_ids,
                entities=tweets,
                get_entity_id=lambda tweet: tweet.tweet_id,
                entity_type='tweet',
                process_batch=process_tweet_batch,
                source_name='Twitter',
                plural_noun='tweets',
                unit_noun='tweets'
            )
        except Exception as e:
            logger.error(f"❌ Twitter sentiment analysis failed: {e}")
            return {
                'analyzed_count': 0,
                'status': 'error',
                'error': str(e)
            }

    def _execute_reddit_sentiment_analysis(
        self,
        match_id: int,
        entity_ids: List[str],
        db: Session
    ) -> Dict:
        """
        Execute sentiment analysis for Reddit data.

        Args:
            match_id: Match identifier
            entity_ids: List of Reddit post IDs
            db: Database session

        Returns:
            Dictionary with sentiment analysis results
        """
        try:
            posts = db.query(RedditPost).filter(
                RedditPost.post_id.in_(entity_ids)
            ).all()
            return self._analyze_entity_batch(
                db=db,
                match_id=match_id,
                entity_ids=entity_ids,
                entities=posts,
                get_entity_id=lambda post: post.post_id,
                entity_type='reddit_post',
                process_batch=process_reddit_post_batch,
                source_name='Reddit',
                plural_noun='Reddit posts',
                unit_noun='posts'
            )
        except Exception as e:
            logger.error(f"❌ Reddit sentiment analysis failed: {e}")
            return {
                'analyzed_count': 0,
                'status': 'error',
                'error': str(e)
            }

    def _calculate_sentiment_metrics(
        self,
        db: Session,
        match_id: int
    ) -> Dict:
        """
        Calculate aggregated sentiment metrics for a match.

        Args:
            db: Database session
            match_id: Match identifier

        Returns:
            Dictionary with aggregated metrics
        """
        # Local import avoids a potential circular import with the
        # sentiment service module.
        from app.services.sentiment_service import calculate_match_sentiment_metrics
        return calculate_match_sentiment_metrics(db, match_id)

    def _calculate_metrics_from_existing(
        self,
        db: Session,
        match_id: int
    ) -> Dict:
        """
        Calculate metrics from existing sentiment scores.

        Currently identical to _calculate_sentiment_metrics; kept as a
        separate entry point for clarity at the call sites.

        Args:
            db: Database session
            match_id: Match identifier

        Returns:
            Dictionary with aggregated metrics
        """
        return self._calculate_sentiment_metrics(db, match_id)
def create_sentiment_worker() -> SentimentWorker:
    """
    Factory function to create a sentiment analysis worker.

    Returns:
        Configured SentimentWorker instance
    """
    worker = SentimentWorker()
    return worker