Initial commit
This commit is contained in:
2
backend/app/ml/__init__.py
Normal file
2
backend/app/ml/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
# ML Module
|
||||
# This module contains machine learning components for sentiment analysis and energy calculations
|
||||
619
backend/app/ml/backtesting.py
Normal file
619
backend/app/ml/backtesting.py
Normal file
@@ -0,0 +1,619 @@
|
||||
"""
|
||||
Backtesting Module.
|
||||
|
||||
This module provides functions to run backtesting on historical match data,
|
||||
comparing predictions with actual results to calculate accuracy metrics.
|
||||
"""
|
||||
|
||||
import json
|
||||
import csv
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Any, Optional
|
||||
from io import StringIO
|
||||
|
||||
from app.ml.prediction_calculator import calculate_prediction
|
||||
|
||||
|
||||
# Validation thresholds
|
||||
ACCURACY_VALIDATED_THRESHOLD = 60.0 # >= 60%: System validated
|
||||
ACCURACY_ALERT_THRESHOLD = 55.0 # < 55%: Revision required
|
||||
|
||||
|
||||
def validate_accuracy(accuracy: float) -> str:
    """
    Classify a prediction-accuracy percentage against the validation thresholds.

    Args:
        accuracy: Accuracy percentage (0.0 - 100.0)

    Returns:
        'REVISION_REQUIRED' when accuracy < 55%,
        'VALIDATED' when accuracy >= 60%,
        'BELOW_TARGET' for anything in between.

    Examples:
        >>> validate_accuracy(70.0)
        'VALIDATED'
        >>> validate_accuracy(50.0)
        'REVISION_REQUIRED'
        >>> validate_accuracy(58.0)
        'BELOW_TARGET'
    """
    # Guard clauses: worst case first, then the validated band.
    if accuracy < ACCURACY_ALERT_THRESHOLD:
        return 'REVISION_REQUIRED'
    if accuracy >= ACCURACY_VALIDATED_THRESHOLD:
        return 'VALIDATED'
    return 'BELOW_TARGET'
|
||||
|
||||
|
||||
def compare_prediction(predicted_winner: str, actual_winner: str) -> bool:
    """
    Check whether the predicted winner matches the actual match result.

    The comparison is case-insensitive.

    Args:
        predicted_winner: 'home', 'away', or 'draw'
        actual_winner: 'home', 'away', or 'draw'

    Returns:
        True when prediction and reality agree, False otherwise.

    Examples:
        >>> compare_prediction('home', 'home')
        True
        >>> compare_prediction('home', 'away')
        False
    """
    predicted = predicted_winner.lower()
    actual = actual_winner.lower()
    return predicted == actual
|
||||
|
||||
|
||||
def run_backtesting_single_match(
    match_id: int,
    home_team: str,
    away_team: str,
    home_energy: float,
    away_energy: float,
    actual_winner: str
) -> Dict[str, Any]:
    """
    Backtest a single historical match.

    Generates a prediction from the two energy scores and records whether
    the predicted winner matched the known outcome.

    Args:
        match_id: Unique match identifier
        home_team: Name of the home team
        away_team: Name of the away team
        home_energy: Energy score of the home team
        away_energy: Energy score of the away team
        actual_winner: Actual result ('home', 'away', or 'draw')

    Returns:
        Dictionary with the match details, the prediction dict, and a
        'correct' flag comparing prediction against reality.

    Examples:
        >>> result = run_backtesting_single_match(1, 'PSG', 'OM', 65.0, 45.0, 'home')
        >>> result['correct']
        True
    """
    prediction = calculate_prediction(home_energy, away_energy)

    # Assemble the per-match record; correctness is derived inline from
    # the prediction just computed.
    return {
        'match_id': match_id,
        'home_team': home_team,
        'away_team': away_team,
        'home_energy': home_energy,
        'away_energy': away_energy,
        'prediction': prediction,
        'actual_winner': actual_winner,
        'correct': compare_prediction(prediction['predicted_winner'], actual_winner)
    }
|
||||
|
||||
|
||||
def run_backtesting_batch(matches: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Run backtesting over a batch of historical matches.

    Predicts every match, compares against the known outcome, and produces
    overall and per-league accuracy metrics plus a validation status.

    Args:
        matches: List of match dictionaries with keys:
            - match_id (int)
            - home_team (str)
            - away_team (str)
            - home_energy (float)
            - away_energy (float)
            - actual_winner (str)
            - league (str, optional)
            - date (datetime, optional)

    Returns:
        Dictionary containing:
            - total_matches: Number of matches processed
            - correct_predictions / incorrect_predictions: Counts
            - accuracy: Accuracy percentage (rounded to 2 decimals)
            - status: VALIDATED, REVISION_REQUIRED, or BELOW_TARGET
            - results: List of individual match results
            - metrics_by_league: Accuracy breakdown by league
            - timestamp: ISO timestamp of this run
            - validation_thresholds: The thresholds used for the status

    Raises:
        ValueError: If any match is missing a required field.

    Examples:
        >>> matches = [
        ...     {'match_id': 1, 'home_team': 'PSG', 'away_team': 'OM',
        ...      'home_energy': 65.0, 'away_energy': 45.0, 'actual_winner': 'home'},
        ... ]
        >>> result = run_backtesting_batch(matches)
        >>> result['accuracy']
        100.0
    """
    # Invariant: hoisted out of the loop.
    required_fields = ('match_id', 'home_team', 'away_team',
                       'home_energy', 'away_energy', 'actual_winner')

    results: List[Dict[str, Any]] = []
    correct_count = 0
    league_metrics: Dict[str, Dict[str, Any]] = {}

    for match in matches:
        # Fail fast on malformed input rows.
        if any(field not in match for field in required_fields):
            raise ValueError(f"Match missing required fields: {match}")

        league = match.get('league', 'unknown')
        match_date = match.get('date')

        outcome = run_backtesting_single_match(
            match_id=match['match_id'],
            home_team=match['home_team'],
            away_team=match['away_team'],
            home_energy=match['home_energy'],
            away_energy=match['away_energy'],
            actual_winner=match['actual_winner']
        )

        # Enrich the per-match record with optional metadata.
        outcome['league'] = league
        outcome['date'] = match_date.isoformat() if match_date else None

        if outcome['correct']:
            correct_count += 1

        # Per-league tallies; accuracy is filled in after the loop.
        bucket = league_metrics.setdefault(
            league, {'total': 0, 'correct': 0, 'accuracy': 0.0}
        )
        bucket['total'] += 1
        if outcome['correct']:
            bucket['correct'] += 1

        results.append(outcome)

    total_matches = len(matches)
    incorrect_count = total_matches - correct_count
    accuracy = (correct_count / total_matches * 100.0) if total_matches > 0 else 0.0

    for metrics in league_metrics.values():
        if metrics['total'] > 0:
            metrics['accuracy'] = metrics['correct'] / metrics['total'] * 100.0

    return {
        'total_matches': total_matches,
        'correct_predictions': correct_count,
        'incorrect_predictions': incorrect_count,
        'accuracy': round(accuracy, 2),
        'status': validate_accuracy(accuracy),
        'results': results,
        'metrics_by_league': league_metrics,
        'timestamp': datetime.utcnow().isoformat(),
        'validation_thresholds': {
            'validated': ACCURACY_VALIDATED_THRESHOLD,
            'alert': ACCURACY_ALERT_THRESHOLD
        }
    }
|
||||
|
||||
|
||||
def export_to_json(backtesting_result: Dict[str, Any]) -> str:
    """
    Serialize a backtesting report to a pretty-printed JSON string.

    Non-JSON-native values (e.g. datetimes) are stringified via `default=str`.

    Args:
        backtesting_result: Result from run_backtesting_batch

    Returns:
        JSON formatted string

    Examples:
        >>> result = run_backtesting_batch(matches)
        >>> json_output = export_to_json(result)
        >>> isinstance(json_output, str)
        True
    """
    serialized = json.dumps(backtesting_result, indent=2, default=str)
    return serialized
|
||||
|
||||
|
||||
def export_to_csv(backtesting_result: Dict[str, Any]) -> str:
    """
    Serialize a backtesting report's per-match results to CSV.

    One row per match; the nested prediction dict is flattened into the
    'predicted_winner' and 'confidence' columns.

    Args:
        backtesting_result: Result from run_backtesting_batch

    Returns:
        CSV formatted string (header row always present)

    Examples:
        >>> result = run_backtesting_batch(matches)
        >>> csv_output = export_to_csv(result)
        >>> isinstance(csv_output, str)
        True
    """
    buffer = StringIO()
    columns = [
        'match_id', 'league', 'date', 'home_team', 'away_team',
        'home_energy', 'away_energy', 'predicted_winner',
        'confidence', 'actual_winner', 'correct'
    ]

    writer = csv.DictWriter(buffer, fieldnames=columns)
    writer.writeheader()

    for entry in backtesting_result.get('results', []):
        prediction = entry['prediction']
        writer.writerow({
            'match_id': entry['match_id'],
            'league': entry.get('league', ''),
            'date': entry.get('date', ''),
            'home_team': entry['home_team'],
            'away_team': entry['away_team'],
            'home_energy': entry['home_energy'],
            'away_energy': entry['away_energy'],
            'predicted_winner': prediction['predicted_winner'],
            'confidence': prediction['confidence'],
            'actual_winner': entry['actual_winner'],
            'correct': entry['correct']
        })

    return buffer.getvalue()
|
||||
|
||||
|
||||
def export_to_html(backtesting_result: Dict[str, Any]) -> str:
    """
    Export backtesting results to HTML format for publication.

    Builds a self-contained HTML document (inline CSS only, no external
    assets or scripts) with a summary card grid, a colored status badge,
    a per-league metrics table, and a per-match results table.

    Args:
        backtesting_result: Result from run_backtesting_batch

    Returns:
        HTML formatted string with styling and charts

    Examples:
        >>> result = run_backtesting_batch(matches)
        >>> html_output = export_to_html(result)
        >>> '<html>' in html_output
        True
    """
    # Badge color per validation status; unknown statuses fall back to
    # grey ('#666') via .get() below.
    status_colors = {
        'VALIDATED': '#10B981',  # Green
        'BELOW_TARGET': '#F59E0B',  # Orange
        'REVISION_REQUIRED': '#EF4444'  # Red
    }

    # NOTE: these key accesses assume a complete run_backtesting_batch
    # result; a partial dict raises KeyError here.
    status = backtesting_result['status']
    accuracy = backtesting_result['accuracy']
    total_matches = backtesting_result['total_matches']
    correct_predictions = backtesting_result['correct_predictions']
    incorrect_predictions = backtesting_result['incorrect_predictions']

    # Build HTML. CSS braces are doubled ({{ }}) because this is an f-string.
    html = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Backtesting Report - ChartBastan</title>
<style>
* {{ margin: 0; padding: 0; box-sizing: border-box; }}
body {{
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
padding: 20px;
min-height: 100vh;
}}
.container {{
max-width: 1200px;
margin: 0 auto;
background: white;
border-radius: 20px;
box-shadow: 0 20px 60px rgba(0,0,0,0.3);
padding: 40px;
}}
.header {{
text-align: center;
margin-bottom: 40px;
}}
.header h1 {{
font-size: 2.5em;
color: #667eea;
margin-bottom: 10px;
}}
.header p {{
color: #666;
font-size: 1.1em;
}}
.summary {{
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 20px;
margin-bottom: 40px;
}}
.card {{
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 25px;
border-radius: 15px;
text-align: center;
}}
.card h3 {{
font-size: 0.9em;
opacity: 0.9;
margin-bottom: 10px;
text-transform: uppercase;
letter-spacing: 1px;
}}
.card .value {{
font-size: 2.5em;
font-weight: bold;
margin-bottom: 5px;
}}
.card .sub {{
font-size: 0.9em;
opacity: 0.9;
}}
.status-badge {{
display: inline-block;
padding: 10px 25px;
border-radius: 25px;
color: white;
font-weight: bold;
font-size: 1.2em;
margin: 20px 0;
}}
.section {{
margin-bottom: 40px;
}}
.section h2 {{
font-size: 1.8em;
color: #333;
margin-bottom: 20px;
padding-bottom: 10px;
border-bottom: 3px solid #667eea;
}}
table {{
width: 100%;
border-collapse: collapse;
margin-top: 20px;
}}
th, td {{
padding: 15px;
text-align: left;
border-bottom: 1px solid #ddd;
}}
th {{
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
font-weight: 600;
text-transform: uppercase;
font-size: 0.85em;
letter-spacing: 0.5px;
}}
tr:hover {{
background: #f5f5f5;
}}
.correct {{
color: #10B981;
font-weight: bold;
}}
.incorrect {{
color: #EF4444;
font-weight: bold;
}}
.footer {{
text-align: center;
margin-top: 40px;
padding-top: 20px;
border-top: 2px solid #ddd;
color: #666;
}}
</style>
</head>
<body>
<div class="container">
<div class="header">
<h1>📊 Backtesting Report</h1>
<p>ChartBastan Prediction System Performance Analysis</p>
<p style="margin-top: 10px; font-size: 0.9em;">
Generated: {backtesting_result.get('timestamp', 'N/A')}
</p>
</div>

<div class="summary">
<div class="card">
<h3>Total Matches</h3>
<div class="value">{total_matches}</div>
<div class="sub">matches analyzed</div>
</div>
<div class="card">
<h3>Accuracy</h3>
<div class="value">{accuracy}%</div>
<div class="sub">prediction accuracy</div>
</div>
<div class="card">
<h3>Correct</h3>
<div class="value">{correct_predictions}</div>
<div class="sub">predictions</div>
</div>
<div class="card">
<h3>Incorrect</h3>
<div class="value">{incorrect_predictions}</div>
<div class="sub">predictions</div>
</div>
</div>

<div style="text-align: center;">
<div class="status-badge" style="background-color: {status_colors.get(status, '#666')};">
Status: {status}
</div>
</div>

<div class="section">
<h2>📈 Metrics by League</h2>
<table>
<thead>
<tr>
<th>League</th>
<th>Matches</th>
<th>Correct</th>
<th>Accuracy</th>
</tr>
</thead>
<tbody>
"""

    # Add league metrics (one table row per league).
    # NOTE(review): league names and team names are interpolated without
    # HTML escaping — presumably trusted internal data; verify upstream.
    for league, metrics in backtesting_result.get('metrics_by_league', {}).items():
        html += f"""
<tr>
<td>{league}</td>
<td>{metrics['total']}</td>
<td>{metrics['correct']}</td>
<td>{metrics['accuracy']:.2f}%</td>
</tr>
"""

    html += """
</tbody>
</table>
</div>

<div class="section">
<h2>📋 Detailed Results</h2>
<table>
<thead>
<tr>
<th>Match ID</th>
<th>League</th>
<th>Home vs Away</th>
<th>Prediction</th>
<th>Confidence</th>
<th>Actual</th>
<th>Result</th>
</tr>
</thead>
<tbody>
"""

    # Add detailed results (one table row per backtested match).
    for result in backtesting_result.get('results', []):
        # CSS class drives the green/red styling of the Result cell.
        result_class = 'correct' if result['correct'] else 'incorrect'
        html += f"""
<tr>
<td>{result['match_id']}</td>
<td>{result.get('league', 'N/A')}</td>
<td>{result['home_team']} vs {result['away_team']}</td>
<td>{result['prediction']['predicted_winner']}</td>
<td>{result['prediction']['confidence']:.1f}%</td>
<td>{result['actual_winner']}</td>
<td class="{result_class}">{'✓ Correct' if result['correct'] else '✗ Incorrect'}</td>
</tr>
"""

    html += """
</tbody>
</table>
</div>

<div class="footer">
<p>🎯 ChartBastan - Football Match Prediction System</p>
<p>© 2026 All rights reserved</p>
</div>
</div>
</body>
</html>
"""

    return html
|
||||
|
||||
|
||||
def filter_matches_by_league(matches: List[Dict[str, Any]], leagues: List[str]) -> List[Dict[str, Any]]:
    """
    Keep only the matches that belong to one of the given leagues.

    An empty league list means "no filter": the original list object is
    returned unchanged.

    Args:
        matches: List of match dictionaries
        leagues: List of league names to include

    Returns:
        Filtered list of matches

    Examples:
        >>> matches = [{'league': 'Ligue 1', 'home_team': 'PSG', ...}]
        >>> filtered = filter_matches_by_league(matches, ['Ligue 1'])
        >>> len(filtered)
        1
    """
    if not leagues:
        return matches

    selected = [match for match in matches if match.get('league') in leagues]
    return selected
|
||||
|
||||
|
||||
def filter_matches_by_period(
    matches: List[Dict[str, Any]],
    start_date: Optional[datetime] = None,
    end_date: Optional[datetime] = None
) -> List[Dict[str, Any]]:
    """
    Keep only the matches whose date falls inside the given period.

    Matches without a 'date' value are dropped whenever either bound is
    set. With no bounds, the original list object is returned unchanged.

    Args:
        matches: List of match dictionaries
        start_date: Start date (inclusive), or None for no lower bound
        end_date: End date (inclusive), or None for no upper bound

    Returns:
        Filtered list of matches

    Examples:
        >>> from datetime import datetime
        >>> matches = [{'date': datetime(2026, 1, 1), ...}]
        >>> filtered = filter_matches_by_period(matches, datetime(2025, 1, 1))
    """
    if not start_date and not end_date:
        return matches

    def _keep(match: Dict[str, Any]) -> bool:
        # A dated match must satisfy every active bound.
        match_date = match.get('date')
        if start_date and not (match_date and match_date >= start_date):
            return False
        if end_date and not (match_date and match_date <= end_date):
            return False
        return True

    return [match for match in matches if _keep(match)]
|
||||
356
backend/app/ml/energy_calculator.py
Normal file
356
backend/app/ml/energy_calculator.py
Normal file
@@ -0,0 +1,356 @@
|
||||
"""
|
||||
Energy Calculator Module.
|
||||
|
||||
This module calculates collective energy scores based on sentiment analysis
|
||||
from multiple sources (Twitter, Reddit, RSS) using a weighted formula.
|
||||
|
||||
Formula: Score = (Positive - Negative) × Volume × Virality
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
from logging import getLogger
from typing import Dict, List, Optional
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
# Source weights as specified in requirements
|
||||
SOURCE_WEIGHTS = {
|
||||
'twitter': 0.60,
|
||||
'reddit': 0.25,
|
||||
'rss': 0.15
|
||||
}
|
||||
|
||||
# Temporal weighting parameters
|
||||
TEMPORAL_DECAY_HOURS = 48 # Full decay over 48 hours
|
||||
MIN_TEMPORAL_WEIGHT = 0.5 # Minimum weight for old tweets
|
||||
|
||||
|
||||
def calculate_energy_score(
    match_id: int,
    team_id: int,
    twitter_sentiments: Optional[List[Dict[str, float]]] = None,
    reddit_sentiments: Optional[List[Dict[str, float]]] = None,
    rss_sentiments: Optional[List[Dict[str, float]]] = None,
    tweets_with_timestamps: Optional[List[Dict]] = None
) -> Dict[str, object]:
    """
    Calculate energy score for a team based on multi-source sentiment data.

    Combines per-source energy (Score = (Positive - Negative) x Volume x
    Virality), applies source weights (rescaled when some sources are
    missing), optionally applies temporal weighting from tweet timestamps,
    and normalizes the result to the 0-100 range.

    Args:
        match_id: ID of the match (used only in the no-data log message)
        team_id: ID of the team (used only in the no-data log message)
        twitter_sentiments: List of Twitter sentiment score dicts
        reddit_sentiments: List of Reddit sentiment score dicts
        rss_sentiments: List of RSS sentiment score dicts
        tweets_with_timestamps: List of tweets with timestamps for temporal
            weighting

    Returns:
        Dictionary containing:
            - score: Final energy score (0-100)
            - confidence: Confidence level (0-1)
            - sources_used: List of sources used in calculation
    """
    # Fix: defaults were annotated as non-Optional `List[...] = None`
    # (implicit Optional) and the return type used the builtin `any`
    # instead of a real type. Runtime behavior is unchanged.
    twitter_sentiments = twitter_sentiments or []
    reddit_sentiments = reddit_sentiments or []
    rss_sentiments = rss_sentiments or []
    tweets_with_timestamps = tweets_with_timestamps or []

    # Per-source raw energy via the shared formula.
    twitter_energy_score = _calculate_source_energy(twitter_sentiments)
    reddit_energy_score = _calculate_source_energy(reddit_sentiments)
    rss_energy_score = _calculate_source_energy(rss_sentiments)

    # A source counts as available only when it supplied any sentiment data.
    available_sources = []
    if twitter_sentiments:
        available_sources.append('twitter')
    if reddit_sentiments:
        available_sources.append('reddit')
    if rss_sentiments:
        available_sources.append('rss')

    # No data at all: log and return a zero-confidence, zero-score result.
    if not available_sources:
        logger.warning(f"No sentiment data available for match_id={match_id}, team_id={team_id}")
        return {
            'score': 0.0,
            'confidence': 0.0,
            'sources_used': []
        }

    # Weighted combination (weights rescaled in degraded mode).
    weighted_score = apply_source_weights(
        twitter_score=twitter_energy_score,
        reddit_score=reddit_energy_score,
        rss_score=rss_energy_score,
        available_sources=available_sources
    )

    # Temporal weighting only when timestamped tweets are supplied.
    time_weighted_score = weighted_score
    if tweets_with_timestamps and available_sources:
        time_weighted_score = apply_temporal_weighting(
            base_score=weighted_score,
            tweets_with_timestamps=tweets_with_timestamps
        )

    # Clamp into the 0-100 range.
    final_score = normalize_score(time_weighted_score)

    # Confidence equals the combined original weight of the sources present.
    total_weight = sum(SOURCE_WEIGHTS[s] for s in available_sources)
    confidence = calculate_confidence(
        available_sources=available_sources,
        total_weight=total_weight
    )

    return {
        'score': final_score,
        'confidence': confidence,
        'sources_used': available_sources
    }
|
||||
|
||||
|
||||
def _calculate_source_energy(sentiments: List[Dict[str, float]]) -> float:
|
||||
"""
|
||||
Calculate energy score for a single source using the formula:
|
||||
Score = (Positive - Negative) × Volume × Virality
|
||||
|
||||
Args:
|
||||
sentiments: List of sentiment scores with 'positive' and 'negative' keys
|
||||
|
||||
Returns:
|
||||
Energy score for the source (can be negative or positive)
|
||||
"""
|
||||
if not sentiments:
|
||||
return 0.0
|
||||
|
||||
# Calculate aggregated metrics
|
||||
total_count = len(sentiments)
|
||||
positive_ratio = sum(s.get('positive', 0) for s in sentiments) / total_count
|
||||
negative_ratio = sum(s.get('negative', 0) for s in sentiments) / total_count
|
||||
|
||||
# Volume: total number of sentiments
|
||||
volume = total_count
|
||||
|
||||
# Virality: average absolute compound score (intensity of sentiment)
|
||||
virality = sum(abs(s.get('compound', 0)) for s in sentiments) / total_count
|
||||
|
||||
# Apply the energy formula
|
||||
energy = (positive_ratio - negative_ratio) * volume * virality
|
||||
|
||||
return energy
|
||||
|
||||
|
||||
def apply_source_weights(
    twitter_score: float,
    reddit_score: float,
    rss_score: float,
    available_sources: List[str]
) -> float:
    """
    Combine per-source energy scores into one weighted score.

    Weights come from SOURCE_WEIGHTS, rescaled so that the weights of the
    available sources sum to 1.0 (degraded mode).

    Args:
        twitter_score: Energy score from Twitter
        reddit_score: Energy score from Reddit
        rss_score: Energy score from RSS
        available_sources: List of available sources

    Returns:
        Weighted energy score (0.0 when no source is available)
    """
    if not available_sources:
        return 0.0

    weights = adjust_weights_for_degraded_mode(
        original_weights=SOURCE_WEIGHTS,
        available_sources=available_sources
    )

    # Fixed twitter -> reddit -> rss order keeps the summation
    # deterministic regardless of the order of available_sources.
    per_source = {
        'twitter': twitter_score,
        'reddit': reddit_score,
        'rss': rss_score,
    }
    combined = 0.0
    for source in ('twitter', 'reddit', 'rss'):
        if source in available_sources:
            combined += per_source[source] * weights[source]

    return combined
|
||||
|
||||
|
||||
def adjust_weights_for_degraded_mode(
    original_weights: Dict[str, float],
    available_sources: List[str]
) -> Dict[str, float]:
    """
    Rescale source weights when some sources are unavailable.

    Each available source keeps its relative share, and the returned
    weights sum to 1.0.

    Args:
        original_weights: Original source weights
        available_sources: List of available sources

    Returns:
        Adjusted weights that sum to 1.0 (empty dict when no source is
        available)
    """
    if not available_sources:
        return {}

    available_total = sum(original_weights[source] for source in available_sources)

    # Renormalize so the surviving weights sum to 1.0.
    adjusted_weights = {
        source: original_weights[source] / available_total
        for source in available_sources
    }

    logger.info(f"Adjusted weights for degraded mode: {adjusted_weights}")
    return adjusted_weights
|
||||
|
||||
|
||||
def apply_temporal_weighting(
    base_score: float,
    tweets_with_timestamps: List[Dict]
) -> float:
    """
    Apply temporal weighting to an energy score based on tweet recency.

    Recent tweets weigh up to 1.0; the weight decays linearly down to the
    floor of MIN_TEMPORAL_WEIGHT (0.5) over TEMPORAL_DECAY_HOURS (48h).
    The base score is scaled by 1 + |weighted mean compound|.

    Args:
        base_score: Base energy score
        tweets_with_timestamps: Tweets with a 'created_at' timestamp
            (ISO-8601 string, 'Z' suffix accepted, or datetime) and an
            optional 'compound' sentiment value

    Returns:
        Temporally weighted energy score (base_score unchanged when no
        usable timestamps are present)
    """
    if not tweets_with_timestamps:
        return base_score

    # Bug fix: the original used naive datetime.utcnow(), but
    # fromisoformat('...+00:00') yields an AWARE datetime, so the
    # subtraction below raised TypeError. Use an aware UTC "now" and
    # coerce naive timestamps to UTC (they are assumed to be UTC).
    now = datetime.now(timezone.utc)
    weighted_sum = 0.0
    total_weight = 0.0

    for tweet in tweets_with_timestamps:
        created_at = tweet.get('created_at')
        if not created_at:
            # Skip tweets without a timestamp.
            continue

        if isinstance(created_at, str):
            created_at = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
        if created_at.tzinfo is None:
            created_at = created_at.replace(tzinfo=timezone.utc)

        hours_ago = (now - created_at).total_seconds() / 3600

        # Linear decay from 1.0 down to the floor over the decay window.
        time_weight = max(MIN_TEMPORAL_WEIGHT, 1.0 - (hours_ago / TEMPORAL_DECAY_HOURS))

        weighted_sum += tweet.get('compound', 0) * time_weight
        total_weight += time_weight

    if total_weight > 0:
        # Weighted mean compound acts as a recency-sensitive boost factor.
        temporal_factor = weighted_sum / total_weight
        return base_score * (1 + abs(temporal_factor))

    # No tweet carried a usable timestamp: leave the score untouched.
    return base_score
|
||||
|
||||
|
||||
def normalize_score(score: float) -> float:
    """
    Clamp a raw energy score into the 0-100 range.

    Args:
        score: Raw energy score

    Returns:
        Score clamped between 0.0 and 100.0
    """
    clamped = max(0.0, min(100.0, score))
    return clamped
|
||||
|
||||
|
||||
def calculate_confidence(
    available_sources: List[str],
    total_weight: float
) -> float:
    """
    Derive a confidence level from the sources that contributed.

    Confidence is simply the combined original weight of the available
    sources: all three sources (0.60 + 0.25 + 0.15) give 1.0, Twitter
    alone gives 0.6, RSS alone gives 0.15.

    Args:
        available_sources: List of available sources
        total_weight: Total weight of available sources

    Returns:
        Confidence level between 0 and 1 (0.0 when no source is available)
    """
    return total_weight if available_sources else 0.0
|
||||
|
||||
|
||||
def calculate_energy_score_by_source(
    source: str,
    sentiments: List[Dict[str, float]]
) -> float:
    """
    Compute the energy score for one named source.

    Args:
        source: Source name ('twitter', 'reddit', or 'rss')
        sentiments: List of sentiment score dicts

    Returns:
        Energy score for the source; 0.0 (with a warning) for an unknown
        source name
    """
    if source in SOURCE_WEIGHTS:
        return _calculate_source_energy(sentiments)

    logger.warning(f"Unknown source: {source}")
    return 0.0
|
||||
|
||||
|
||||
def get_source_weights() -> Dict[str, float]:
    """
    Return the configured source weights.

    Returns:
        A fresh copy of the weight mapping, so callers cannot mutate the
        module-level configuration.
    """
    return dict(SOURCE_WEIGHTS)
|
||||
|
||||
|
||||
def get_temporal_weighting_parameters() -> Dict[str, float]:
    """
    Return the current temporal-weighting configuration.

    Returns:
        Dictionary with 'decay_hours' (full decay window) and
        'min_weight' (floor applied to old tweets).
    """
    return {
        'decay_hours': TEMPORAL_DECAY_HOURS,
        'min_weight': MIN_TEMPORAL_WEIGHT,
    }
|
||||
146
backend/app/ml/prediction_calculator.py
Normal file
146
backend/app/ml/prediction_calculator.py
Normal file
@@ -0,0 +1,146 @@
|
||||
"""
|
||||
Prediction Calculator Module.
|
||||
|
||||
This module provides functions to calculate match predictions based on
|
||||
energy scores from sentiment analysis.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any
|
||||
|
||||
|
||||
def calculate_confidence_meter(home_energy: float, away_energy: float) -> float:
    """
    Compute the Confidence Meter (0-100%) from the energy gap.

    The wider the gap between the two teams' energy scores, the more
    confident the prediction: min(100, |home - away| * 2).

    Args:
        home_energy: Energy score of the home team (any float)
        away_energy: Energy score of the away team (any float)

    Returns:
        Confidence score between 0.0 and 100.0

    Examples:
        >>> calculate_confidence_meter(50.0, 50.0)
        0.0
        >>> calculate_confidence_meter(60.0, 50.0)
        20.0
        >>> calculate_confidence_meter(100.0, 50.0)
        100.0
    """
    spread = abs(home_energy - away_energy)
    return min(100.0, spread * 2.0)
|
||||
|
||||
|
||||
def determine_winner(home_energy: float, away_energy: float) -> str:
    """Pick the predicted winner from the two energy scores.

    Args:
        home_energy: Energy score of the home team.
        away_energy: Energy score of the away team.

    Returns:
        'home' if the home team has the higher energy,
        'away' if the away team has the higher energy,
        'draw' if the energies are equal.

    Examples:
        >>> determine_winner(60.0, 40.0)
        'home'
        >>> determine_winner(40.0, 60.0)
        'away'
        >>> determine_winner(50.0, 50.0)
        'draw'
    """
    if home_energy == away_energy:
        return 'draw'
    return 'home' if home_energy > away_energy else 'away'
|
||||
|
||||
|
||||
def calculate_prediction(home_energy: float, away_energy: float) -> Dict[str, Any]:
    """Build a complete match prediction from the two energy scores.

    Combines the confidence meter and winner determination into a single
    result dictionary.

    Args:
        home_energy: Energy score of the home team.
        away_energy: Energy score of the away team.

    Returns:
        Dictionary containing:
        - confidence: Confidence score (0.0 - 100.0)
        - predicted_winner: 'home', 'away', or 'draw'
        - home_energy: Original home energy score
        - away_energy: Original away energy score

    Examples:
        >>> calculate_prediction(65.0, 45.0)
        {'confidence': 40.0, 'predicted_winner': 'home',
         'home_energy': 65.0, 'away_energy': 45.0}
    """
    return {
        'confidence': calculate_confidence_meter(home_energy, away_energy),
        'predicted_winner': determine_winner(home_energy, away_energy),
        'home_energy': home_energy,
        'away_energy': away_energy,
    }
|
||||
|
||||
|
||||
def validate_prediction_result(result: Dict[str, Any]) -> bool:
    """Check that a prediction dict is complete and its values are sane.

    Required keys: 'confidence', 'predicted_winner', 'home_energy',
    'away_energy'. Confidence must be a number within [0, 100], the
    winner one of 'home'/'away'/'draw', and both energies non-negative
    numbers.

    Args:
        result: Candidate prediction dictionary.

    Returns:
        True when every check passes, False otherwise.

    Examples:
        >>> validate_prediction_result({'confidence': 75.0, 'predicted_winner': 'home',
        ...                             'home_energy': 65.0, 'away_energy': 45.0})
        True
        >>> validate_prediction_result({'confidence': -10.0, 'predicted_winner': 'home',
        ...                             'home_energy': 65.0, 'away_energy': 45.0})
        False
    """
    # All required fields must be present.
    for field in ('confidence', 'predicted_winner', 'home_energy', 'away_energy'):
        if field not in result:
            return False

    # Confidence: numeric and within the 0-100 range.
    confidence = result['confidence']
    if not isinstance(confidence, (int, float)):
        return False
    if confidence < 0.0 or confidence > 100.0:
        return False

    # Winner label must be one of the three known outcomes.
    if result['predicted_winner'] not in ('home', 'away', 'draw'):
        return False

    # Energies: numeric and non-negative.
    for key in ('home_energy', 'away_energy'):
        energy = result[key]
        if not isinstance(energy, (int, float)):
            return False
        if energy < 0.0:
            return False

    return True
|
||||
178
backend/app/ml/sentiment_analyzer.py
Normal file
178
backend/app/ml/sentiment_analyzer.py
Normal file
@@ -0,0 +1,178 @@
|
||||
"""
|
||||
Sentiment Analyzer Module
|
||||
Uses VADER (Valence Aware Dictionary and sEntiment Reasoner) for sentiment analysis.
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Optional
|
||||
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
||||
|
||||
# Initialize the VADER analyzer globally for better performance
|
||||
_analyzer = SentimentIntensityAnalyzer()
|
||||
|
||||
|
||||
def classify_sentiment(compound: float) -> str:
    """Map a VADER compound score onto a coarse sentiment label.

    Uses the conventional VADER cutoffs: >= 0.05 is positive,
    <= -0.05 is negative, anything between is neutral.

    Args:
        compound: Compound sentiment score (-1 to 1).

    Returns:
        Sentiment classification: 'positive', 'negative', or 'neutral'.
    """
    if compound <= -0.05:
        return 'negative'
    return 'positive' if compound >= 0.05 else 'neutral'
|
||||
|
||||
|
||||
def analyze_sentiment(text: str) -> Dict[str, float]:
    """Run VADER on a single text and classify the result.

    Args:
        text: Non-empty string to analyze.

    Returns:
        Dictionary with sentiment scores:
        - compound: Overall compound score (-1 to 1)
        - positive: Positive proportion (0 to 1)
        - negative: Negative proportion (0 to 1)
        - neutral: Neutral proportion (0 to 1)
        - sentiment: Classification ('positive', 'negative', or 'neutral')

    Raises:
        ValueError: If text is empty or not a string.
    """
    if not isinstance(text, str) or not text:
        raise ValueError("Text must be a non-empty string")

    scores = _analyzer.polarity_scores(text)
    result = {
        'compound': scores['compound'],
        'positive': scores['pos'],
        'negative': scores['neg'],
        'neutral': scores['neu'],
    }
    result['sentiment'] = classify_sentiment(result['compound'])
    return result
|
||||
|
||||
|
||||
def analyze_sentiment_batch(texts: List[str]) -> List[Dict[str, float]]:
    """Analyze several texts, substituting a neutral result on failure.

    Invalid entries (empty or non-string) are reported on stdout and
    replaced with an all-neutral score so the output list keeps the same
    length and order as the input.

    Args:
        texts: List of texts to analyze.

    Returns:
        List of sentiment score dictionaries, one per input text.
    """
    analyzed: List[Dict[str, float]] = []
    for item in texts:
        try:
            analyzed.append(analyze_sentiment(item))
        except ValueError as e:
            # Keep going: one bad text must not abort the whole batch.
            print(f"Error analyzing text: {e}")
            analyzed.append({
                'compound': 0.0,
                'positive': 0.0,
                'negative': 0.0,
                'neutral': 1.0,
                'sentiment': 'neutral',
            })
    return analyzed
|
||||
|
||||
|
||||
def calculate_aggregated_metrics(sentiments: List[Dict[str, float]]) -> Dict[str, float]:
    """Aggregate counts, ratios and the mean compound score in one pass.

    Args:
        sentiments: List of sentiment score dictionaries as produced by
            analyze_sentiment().

    Returns:
        Dictionary with aggregated metrics:
        - total_count: Total number of sentiments
        - positive_count / negative_count / neutral_count: Per-label counts
        - positive_ratio / negative_ratio / neutral_ratio: Label ratios (0 to 1)
        - average_compound: Average compound score
        All counts and ratios are zero for an empty input list.
    """
    total = len(sentiments)
    if total == 0:
        return {
            'total_count': 0,
            'positive_count': 0,
            'negative_count': 0,
            'neutral_count': 0,
            'positive_ratio': 0.0,
            'negative_ratio': 0.0,
            'neutral_ratio': 0.0,
            'average_compound': 0.0,
        }

    # Tally labels and the compound sum in a single sweep.
    counts = {'positive': 0, 'negative': 0, 'neutral': 0}
    compound_sum = 0.0
    for entry in sentiments:
        label = entry['sentiment']
        if label in counts:
            counts[label] += 1
        compound_sum += entry['compound']

    return {
        'total_count': total,
        'positive_count': counts['positive'],
        'negative_count': counts['negative'],
        'neutral_count': counts['neutral'],
        'positive_ratio': counts['positive'] / total,
        'negative_ratio': counts['negative'] / total,
        'neutral_ratio': counts['neutral'] / total,
        'average_compound': compound_sum / total,
    }
|
||||
|
||||
|
||||
def test_analyzer_performance(num_tweets: int = 1000) -> float:
    """Benchmark the sentiment analyzer on synthetic tweets.

    Draws num_tweets random samples from a fixed pool of phrases,
    analyzes them in batch, and prints throughput figures to stdout.

    Args:
        num_tweets: Number of tweets to test with (default: 1000).

    Returns:
        Wall-clock time taken to analyze the tweets, in seconds.
    """
    import random
    import time

    # Fixed phrase pool covering positive, negative and neutral tones.
    corpus = (
        "I love this game! Best match ever!",
        "Terrible performance. Worst team ever.",
        "It's okay, nothing special.",
        "Amazing goal! What a comeback!",
        "Disappointed with the result.",
        "Great teamwork out there!",
        "Could have been better.",
        "Absolutely fantastic!",
        "Not good enough today.",
        "Well played both teams.",
    )

    workload = [random.choice(corpus) for _ in range(num_tweets)]

    started = time.time()
    results = analyze_sentiment_batch(workload)
    time_taken = time.time() - started

    print(f"Analyzed {len(results)} tweets in {time_taken:.4f} seconds")
    print(f"Performance: {num_tweets / time_taken:.2f} tweets/second")

    return time_taken
|
||||
Reference in New Issue
Block a user