Initial commit

This commit is contained in:
2026-02-01 09:31:38 +01:00
commit e02db93960
4396 changed files with 1511612 additions and 0 deletions

View File

@@ -0,0 +1,2 @@
# ML Module
# This module contains machine learning components for sentiment analysis and energy calculations

View File

@@ -0,0 +1,619 @@
"""
Backtesting Module.
This module provides functions to run backtesting on historical match data,
comparing predictions with actual results to calculate accuracy metrics.
"""
import json
import csv
from datetime import datetime
from typing import Dict, List, Any, Optional
from io import StringIO
from app.ml.prediction_calculator import calculate_prediction
# Validation thresholds
ACCURACY_VALIDATED_THRESHOLD = 60.0 # >= 60%: System validated
ACCURACY_ALERT_THRESHOLD = 55.0 # < 55%: Revision required
def validate_accuracy(accuracy: float) -> str:
    """
    Classify a prediction-accuracy percentage against the system thresholds.

    Args:
        accuracy: Accuracy percentage (0.0 - 100.0)

    Returns:
        'VALIDATED' if accuracy >= 60%,
        'REVISION_REQUIRED' if accuracy < 55%,
        'BELOW_TARGET' if 55% <= accuracy < 60%

    Examples:
        >>> validate_accuracy(70.0)
        'VALIDATED'
        >>> validate_accuracy(50.0)
        'REVISION_REQUIRED'
        >>> validate_accuracy(58.0)
        'BELOW_TARGET'
    """
    # Guard-clause style: the two hard thresholds first, middle band last.
    if accuracy < ACCURACY_ALERT_THRESHOLD:
        return 'REVISION_REQUIRED'
    if accuracy >= ACCURACY_VALIDATED_THRESHOLD:
        return 'VALIDATED'
    return 'BELOW_TARGET'
def compare_prediction(predicted_winner: str, actual_winner: str) -> bool:
    """
    Check whether a predicted outcome matches the actual outcome.

    The comparison is case-insensitive (both sides are lower-cased).

    Args:
        predicted_winner: 'home', 'away', or 'draw'
        actual_winner: 'home', 'away', or 'draw'

    Returns:
        True if prediction was correct, False otherwise

    Examples:
        >>> compare_prediction('home', 'home')
        True
        >>> compare_prediction('home', 'away')
        False
    """
    predicted = predicted_winner.lower()
    actual = actual_winner.lower()
    return predicted == actual
def run_backtesting_single_match(
    match_id: int,
    home_team: str,
    away_team: str,
    home_energy: float,
    away_energy: float,
    actual_winner: str
) -> Dict[str, Any]:
    """
    Backtest a single historical match.

    Computes the prediction from the two energy scores and records whether
    it agrees with the real outcome.

    Args:
        match_id: Unique match identifier
        home_team: Name of the home team
        away_team: Name of the away team
        home_energy: Energy score of the home team
        away_energy: Energy score of the away team
        actual_winner: Actual result ('home', 'away', or 'draw')

    Returns:
        Dictionary with the match details, the prediction dict, and a
        'correct' boolean comparing prediction vs. reality.

    Examples:
        >>> result = run_backtesting_single_match(1, 'PSG', 'OM', 65.0, 45.0, 'home')
        >>> result['correct']
        True
    """
    prediction = calculate_prediction(home_energy, away_energy)
    return {
        'match_id': match_id,
        'home_team': home_team,
        'away_team': away_team,
        'home_energy': home_energy,
        'away_energy': away_energy,
        'prediction': prediction,
        'actual_winner': actual_winner,
        # Correctness is a straight case-insensitive outcome comparison.
        'correct': compare_prediction(prediction['predicted_winner'], actual_winner),
    }
def run_backtesting_batch(matches: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Run backtesting on a batch of historical matches.

    Processes multiple matches, calculates predictions, compares with actual
    results, and generates accuracy metrics and a detailed report.

    Args:
        matches: List of match dictionaries with keys:
            - match_id (int)
            - home_team (str)
            - away_team (str)
            - home_energy (float)
            - away_energy (float)
            - actual_winner (str)
            - league (str, optional)
            - date (datetime or pre-formatted date string, optional)

    Returns:
        Dictionary containing:
            - total_matches: Number of matches processed
            - correct_predictions: Number of correct predictions
            - incorrect_predictions: Number of incorrect predictions
            - accuracy: Accuracy percentage (rounded to 2 decimals)
            - status: Validation status (VALIDATED, REVISION_REQUIRED, BELOW_TARGET)
            - results: List of individual match results
            - metrics_by_league: Accuracy breakdown by league
            - timestamp: When the backtesting was run (UTC ISO string)
            - validation_thresholds: Thresholds used to compute 'status'

    Raises:
        ValueError: If a match is missing any required field.
    """
    # Loop-invariant: build the required-field tuple once, not per match.
    required_fields = ('match_id', 'home_team', 'away_team',
                       'home_energy', 'away_energy', 'actual_winner')
    results = []
    correct_predictions = 0
    incorrect_predictions = 0
    league_metrics: Dict[str, Dict[str, Any]] = {}
    for match in matches:
        if not all(field in match for field in required_fields):
            raise ValueError(f"Match missing required fields: {match}")
        league = match.get('league', 'unknown')
        match_date = match.get('date')
        result = run_backtesting_single_match(
            match_id=match['match_id'],
            home_team=match['home_team'],
            away_team=match['away_team'],
            home_energy=match['home_energy'],
            away_energy=match['away_energy'],
            actual_winner=match['actual_winner']
        )
        result['league'] = league
        # Robustness fix: the original called .isoformat() unconditionally,
        # which raised AttributeError for string dates. Accept datetime-like
        # objects and tolerate pre-formatted strings.
        if not match_date:
            result['date'] = None
        elif hasattr(match_date, 'isoformat'):
            result['date'] = match_date.isoformat()
        else:
            result['date'] = str(match_date)
        if result['correct']:
            correct_predictions += 1
        else:
            incorrect_predictions += 1
        bucket = league_metrics.setdefault(
            league, {'total': 0, 'correct': 0, 'accuracy': 0.0}
        )
        bucket['total'] += 1
        if result['correct']:
            bucket['correct'] += 1
        results.append(result)
    total_matches = len(matches)
    accuracy = (correct_predictions / total_matches * 100.0) if total_matches > 0 else 0.0
    for metrics in league_metrics.values():
        if metrics['total'] > 0:
            metrics['accuracy'] = metrics['correct'] / metrics['total'] * 100.0
    return {
        'total_matches': total_matches,
        'correct_predictions': correct_predictions,
        'incorrect_predictions': incorrect_predictions,
        'accuracy': round(accuracy, 2),
        'status': validate_accuracy(accuracy),
        'results': results,
        'metrics_by_league': league_metrics,
        # NOTE(review): datetime.utcnow() is deprecated (3.12+) but kept so
        # the naive-UTC timestamp format stays identical for consumers.
        'timestamp': datetime.utcnow().isoformat(),
        'validation_thresholds': {
            'validated': ACCURACY_VALIDATED_THRESHOLD,
            'alert': ACCURACY_ALERT_THRESHOLD
        }
    }
def export_to_json(backtesting_result: Dict[str, Any]) -> str:
"""
Export backtesting results to JSON format.
Args:
backtesting_result: Result from run_backtesting_batch
Returns:
JSON formatted string
Examples:
>>> result = run_backtesting_batch(matches)
>>> json_output = export_to_json(result)
>>> isinstance(json_output, str)
True
"""
return json.dumps(backtesting_result, indent=2, default=str)
def export_to_csv(backtesting_result: Dict[str, Any]) -> str:
    """
    Export backtesting results to CSV format, one row per match result.

    Args:
        backtesting_result: Result from run_backtesting_batch

    Returns:
        CSV formatted string (header row plus one row per result)
    """
    columns = [
        'match_id', 'league', 'date', 'home_team', 'away_team',
        'home_energy', 'away_energy', 'predicted_winner',
        'confidence', 'actual_winner', 'correct'
    ]
    buffer = StringIO()
    writer = csv.DictWriter(buffer, fieldnames=columns)
    writer.writeheader()
    for entry in backtesting_result.get('results', []):
        prediction = entry['prediction']
        writer.writerow({
            'match_id': entry['match_id'],
            'league': entry.get('league', ''),
            'date': entry.get('date', ''),
            'home_team': entry['home_team'],
            'away_team': entry['away_team'],
            'home_energy': entry['home_energy'],
            'away_energy': entry['away_energy'],
            'predicted_winner': prediction['predicted_winner'],
            'confidence': prediction['confidence'],
            'actual_winner': entry['actual_winner'],
            'correct': entry['correct'],
        })
    return buffer.getvalue()
def export_to_html(backtesting_result: Dict[str, Any]) -> str:
    """
    Export backtesting results to HTML format for publication.

    All dynamic values originating from match data (team names, league
    names, winners, timestamp) are HTML-escaped before interpolation, so
    feed data containing markup cannot break or inject into the report.

    Args:
        backtesting_result: Result from run_backtesting_batch

    Returns:
        HTML formatted string with styling

    Examples:
        >>> result = run_backtesting_batch(matches)
        >>> html_output = export_to_html(result)
        >>> '<html' in html_output
        True
    """
    # Local import keeps the module's import block untouched.
    from html import escape

    status_colors = {
        'VALIDATED': '#10B981',          # Green
        'BELOW_TARGET': '#F59E0B',       # Orange
        'REVISION_REQUIRED': '#EF4444'   # Red
    }
    status = backtesting_result['status']
    accuracy = backtesting_result['accuracy']
    total_matches = backtesting_result['total_matches']
    correct_predictions = backtesting_result['correct_predictions']
    incorrect_predictions = backtesting_result['incorrect_predictions']
    safe_timestamp = escape(str(backtesting_result.get('timestamp', 'N/A')))
    # Build HTML (CSS braces are doubled to survive the f-string).
    html = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Backtesting Report - ChartBastan</title>
        <style>
            * {{ margin: 0; padding: 0; box-sizing: border-box; }}
            body {{
                font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
                background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                padding: 20px;
                min-height: 100vh;
            }}
            .container {{
                max-width: 1200px;
                margin: 0 auto;
                background: white;
                border-radius: 20px;
                box-shadow: 0 20px 60px rgba(0,0,0,0.3);
                padding: 40px;
            }}
            .header {{
                text-align: center;
                margin-bottom: 40px;
            }}
            .header h1 {{
                font-size: 2.5em;
                color: #667eea;
                margin-bottom: 10px;
            }}
            .header p {{
                color: #666;
                font-size: 1.1em;
            }}
            .summary {{
                display: grid;
                grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
                gap: 20px;
                margin-bottom: 40px;
            }}
            .card {{
                background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                color: white;
                padding: 25px;
                border-radius: 15px;
                text-align: center;
            }}
            .card h3 {{
                font-size: 0.9em;
                opacity: 0.9;
                margin-bottom: 10px;
                text-transform: uppercase;
                letter-spacing: 1px;
            }}
            .card .value {{
                font-size: 2.5em;
                font-weight: bold;
                margin-bottom: 5px;
            }}
            .card .sub {{
                font-size: 0.9em;
                opacity: 0.9;
            }}
            .status-badge {{
                display: inline-block;
                padding: 10px 25px;
                border-radius: 25px;
                color: white;
                font-weight: bold;
                font-size: 1.2em;
                margin: 20px 0;
            }}
            .section {{
                margin-bottom: 40px;
            }}
            .section h2 {{
                font-size: 1.8em;
                color: #333;
                margin-bottom: 20px;
                padding-bottom: 10px;
                border-bottom: 3px solid #667eea;
            }}
            table {{
                width: 100%;
                border-collapse: collapse;
                margin-top: 20px;
            }}
            th, td {{
                padding: 15px;
                text-align: left;
                border-bottom: 1px solid #ddd;
            }}
            th {{
                background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                color: white;
                font-weight: 600;
                text-transform: uppercase;
                font-size: 0.85em;
                letter-spacing: 0.5px;
            }}
            tr:hover {{
                background: #f5f5f5;
            }}
            .correct {{
                color: #10B981;
                font-weight: bold;
            }}
            .incorrect {{
                color: #EF4444;
                font-weight: bold;
            }}
            .footer {{
                text-align: center;
                margin-top: 40px;
                padding-top: 20px;
                border-top: 2px solid #ddd;
                color: #666;
            }}
        </style>
    </head>
    <body>
        <div class="container">
            <div class="header">
                <h1>📊 Backtesting Report</h1>
                <p>ChartBastan Prediction System Performance Analysis</p>
                <p style="margin-top: 10px; font-size: 0.9em;">
                    Generated: {safe_timestamp}
                </p>
            </div>
            <div class="summary">
                <div class="card">
                    <h3>Total Matches</h3>
                    <div class="value">{total_matches}</div>
                    <div class="sub">matches analyzed</div>
                </div>
                <div class="card">
                    <h3>Accuracy</h3>
                    <div class="value">{accuracy}%</div>
                    <div class="sub">prediction accuracy</div>
                </div>
                <div class="card">
                    <h3>Correct</h3>
                    <div class="value">{correct_predictions}</div>
                    <div class="sub">predictions</div>
                </div>
                <div class="card">
                    <h3>Incorrect</h3>
                    <div class="value">{incorrect_predictions}</div>
                    <div class="sub">predictions</div>
                </div>
            </div>
            <div style="text-align: center;">
                <div class="status-badge" style="background-color: {status_colors.get(status, '#666')};">
                    Status: {escape(str(status))}
                </div>
            </div>
            <div class="section">
                <h2>📈 Metrics by League</h2>
                <table>
                    <thead>
                        <tr>
                            <th>League</th>
                            <th>Matches</th>
                            <th>Correct</th>
                            <th>Accuracy</th>
                        </tr>
                    </thead>
                    <tbody>
    """
    # League breakdown rows; league names come from feed data, so escape.
    for league, metrics in backtesting_result.get('metrics_by_league', {}).items():
        html += f"""
                        <tr>
                            <td>{escape(str(league))}</td>
                            <td>{metrics['total']}</td>
                            <td>{metrics['correct']}</td>
                            <td>{metrics['accuracy']:.2f}%</td>
                        </tr>
        """
    html += """
                    </tbody>
                </table>
            </div>
            <div class="section">
                <h2>📋 Detailed Results</h2>
                <table>
                    <thead>
                        <tr>
                            <th>Match ID</th>
                            <th>League</th>
                            <th>Home vs Away</th>
                            <th>Prediction</th>
                            <th>Confidence</th>
                            <th>Actual</th>
                            <th>Result</th>
                        </tr>
                    </thead>
                    <tbody>
    """
    # Per-match rows; team/league/winner strings are untrusted, so escape.
    for result in backtesting_result.get('results', []):
        result_class = 'correct' if result['correct'] else 'incorrect'
        html += f"""
                        <tr>
                            <td>{result['match_id']}</td>
                            <td>{escape(str(result.get('league', 'N/A')))}</td>
                            <td>{escape(str(result['home_team']))} vs {escape(str(result['away_team']))}</td>
                            <td>{escape(str(result['prediction']['predicted_winner']))}</td>
                            <td>{result['prediction']['confidence']:.1f}%</td>
                            <td>{escape(str(result['actual_winner']))}</td>
                            <td class="{result_class}">{'✓ Correct' if result['correct'] else '✗ Incorrect'}</td>
                        </tr>
        """
    html += """
                    </tbody>
                </table>
            </div>
            <div class="footer">
                <p>🎯 ChartBastan - Football Match Prediction System</p>
                <p>© 2026 All rights reserved</p>
            </div>
        </div>
    </body>
    </html>
    """
    return html
def filter_matches_by_league(matches: List[Dict[str, Any]], leagues: List[str]) -> List[Dict[str, Any]]:
    """
    Keep only the matches whose league appears in *leagues*.

    An empty (or falsy) league list means "no filtering": the input list
    object itself is returned unchanged.

    Args:
        matches: List of match dictionaries
        leagues: List of league names to include

    Returns:
        Filtered list of matches
    """
    if not leagues:
        return matches
    # Set lookup: O(1) membership per match instead of O(len(leagues)).
    wanted = set(leagues)
    return [match for match in matches if match.get('league') in wanted]
def filter_matches_by_period(
    matches: List[Dict[str, Any]],
    start_date: Optional[datetime] = None,
    end_date: Optional[datetime] = None
) -> List[Dict[str, Any]]:
    """
    Keep only the matches whose 'date' falls inside the given window.

    With no bounds at all the input list is returned unchanged. When any
    bound is given, matches without a (truthy) 'date' are excluded.

    Args:
        matches: List of match dictionaries
        start_date: Start date (inclusive), or None for no lower bound
        end_date: End date (inclusive), or None for no upper bound

    Returns:
        Filtered list of matches
    """
    if start_date is None and end_date is None:
        return matches
    kept = []
    for match in matches:
        when = match.get('date')
        if not when:
            continue  # dated window requested -> undated matches drop out
        if start_date and when < start_date:
            continue
        if end_date and when > end_date:
            continue
        kept.append(match)
    return kept

View File

@@ -0,0 +1,356 @@
"""
Energy Calculator Module.
This module calculates collective energy scores based on sentiment analysis
from multiple sources (Twitter, Reddit, RSS) using a weighted formula.
Formula: Score = (Positive - Negative) × Volume × Virality
"""
from datetime import datetime, timezone
from logging import getLogger
from typing import Any, Dict, List, Optional
logger = getLogger(__name__)
# Source weights as specified in requirements
SOURCE_WEIGHTS = {
'twitter': 0.60,
'reddit': 0.25,
'rss': 0.15
}
# Temporal weighting parameters
TEMPORAL_DECAY_HOURS = 48 # Full decay over 48 hours
MIN_TEMPORAL_WEIGHT = 0.5 # Minimum weight for old tweets
def calculate_energy_score(
    match_id: int,
    team_id: int,
    twitter_sentiments: Optional[List[Dict[str, float]]] = None,
    reddit_sentiments: Optional[List[Dict[str, float]]] = None,
    rss_sentiments: Optional[List[Dict[str, float]]] = None,
    tweets_with_timestamps: Optional[List[Dict]] = None
) -> Dict[str, Any]:
    """
    Calculate energy score for a team based on multi-source sentiment data.

    Type-annotation fix: the original declared ``Dict[str, any]`` (the
    builtin function ``any``, not a type) and typed the None-defaulted
    parameters as plain lists; they are now ``Optional`` and the return
    type uses ``typing.Any``.

    Args:
        match_id: ID of the match
        team_id: ID of the team
        twitter_sentiments: List of Twitter sentiment scores, or None
        reddit_sentiments: List of Reddit sentiment scores, or None
        rss_sentiments: List of RSS sentiment scores, or None
        tweets_with_timestamps: Tweets with timestamps for temporal weighting

    Returns:
        Dictionary containing:
            - score: Final energy score (0-100)
            - confidence: Confidence level (0-1)
            - sources_used: List of sources used in calculation
    """
    # Treat missing inputs as empty collections (degraded mode below).
    twitter_sentiments = twitter_sentiments or []
    reddit_sentiments = reddit_sentiments or []
    rss_sentiments = rss_sentiments or []
    tweets_with_timestamps = tweets_with_timestamps or []
    # Per-source raw energy via (Positive - Negative) x Volume x Virality.
    twitter_energy_score = _calculate_source_energy(twitter_sentiments)
    reddit_energy_score = _calculate_source_energy(reddit_sentiments)
    rss_energy_score = _calculate_source_energy(rss_sentiments)
    # A source counts as available when it contributed any sentiment.
    available_sources = []
    if twitter_sentiments:
        available_sources.append('twitter')
    if reddit_sentiments:
        available_sources.append('reddit')
    if rss_sentiments:
        available_sources.append('rss')
    # No data at all: zero score, zero confidence.
    if not available_sources:
        logger.warning(f"No sentiment data available for match_id={match_id}, team_id={team_id}")
        return {
            'score': 0.0,
            'confidence': 0.0,
            'sources_used': []
        }
    # Combine sources with (degraded-mode adjusted) weights.
    weighted_score = apply_source_weights(
        twitter_score=twitter_energy_score,
        reddit_score=reddit_energy_score,
        rss_score=rss_energy_score,
        available_sources=available_sources
    )
    # Optional recency boost from timestamped tweets.
    time_weighted_score = weighted_score
    if tweets_with_timestamps and available_sources:
        time_weighted_score = apply_temporal_weighting(
            base_score=weighted_score,
            tweets_with_timestamps=tweets_with_timestamps
        )
    # Clamp into the published 0-100 range.
    final_score = normalize_score(time_weighted_score)
    # Confidence mirrors the combined weight of the sources that reported.
    total_weight = sum(SOURCE_WEIGHTS[s] for s in available_sources)
    confidence = calculate_confidence(
        available_sources=available_sources,
        total_weight=total_weight
    )
    return {
        'score': final_score,
        'confidence': confidence,
        'sources_used': available_sources
    }
def _calculate_source_energy(sentiments: List[Dict[str, float]]) -> float:
    """
    Energy score for one source: (Positive - Negative) x Volume x Virality.

    Volume is the raw message count; virality is the mean absolute compound
    score (sentiment intensity).

    Args:
        sentiments: Sentiment dicts with 'positive'/'negative'/'compound' keys

    Returns:
        Raw (unnormalized) energy score; may be negative
    """
    if not sentiments:
        return 0.0
    count = len(sentiments)
    pos_total = 0.0
    neg_total = 0.0
    intensity_total = 0.0
    for entry in sentiments:
        pos_total += entry.get('positive', 0)
        neg_total += entry.get('negative', 0)
        intensity_total += abs(entry.get('compound', 0))
    positive_mean = pos_total / count
    negative_mean = neg_total / count
    virality = intensity_total / count
    return (positive_mean - negative_mean) * count * virality
def apply_source_weights(
    twitter_score: float,
    reddit_score: float,
    rss_score: float,
    available_sources: List[str]
) -> float:
    """
    Combine per-source energy scores into one weighted score.

    Weights are renormalized over the available sources (degraded mode)
    before being applied.

    Args:
        twitter_score: Energy score from Twitter
        reddit_score: Energy score from Reddit
        rss_score: Energy score from RSS
        available_sources: List of available sources

    Returns:
        Weighted energy score (0.0 when no sources are available)
    """
    if not available_sources:
        return 0.0
    weights = adjust_weights_for_degraded_mode(
        original_weights=SOURCE_WEIGHTS,
        available_sources=available_sources
    )
    per_source = {'twitter': twitter_score, 'reddit': reddit_score, 'rss': rss_score}
    # Fixed twitter->reddit->rss order keeps float summation identical.
    return sum(
        per_source[name] * weights[name]
        for name in ('twitter', 'reddit', 'rss')
        if name in available_sources
    )
def adjust_weights_for_degraded_mode(
    original_weights: Dict[str, float],
    available_sources: List[str]
) -> Dict[str, float]:
    """
    Renormalize source weights when some sources are unavailable.

    Each available source keeps its relative share; the result sums to 1.0.

    Args:
        original_weights: Original source weights
        available_sources: List of available sources

    Returns:
        Adjusted weights that sum to 1.0 (empty dict if no sources)
    """
    if not available_sources:
        return {}
    combined = sum(original_weights[source] for source in available_sources)
    adjusted_weights = {
        source: original_weights[source] / combined
        for source in available_sources
    }
    logger.info(f"Adjusted weights for degraded mode: {adjusted_weights}")
    return adjusted_weights
def apply_temporal_weighting(
    base_score: float,
    tweets_with_timestamps: List[Dict]
) -> float:
    """
    Apply temporal weighting to an energy score based on tweet recency.

    Each tweet's weight decays linearly from 1.0 (fresh) down to
    MIN_TEMPORAL_WEIGHT over TEMPORAL_DECAY_HOURS hours.

    Bug fix: the original subtracted timezone-aware timestamps (parsed
    from ISO strings ending in 'Z') from the naive ``datetime.utcnow()``,
    which raises ``TypeError: can't subtract offset-naive and offset-aware
    datetimes``. All timestamps are now normalized to aware UTC first.

    Args:
        base_score: Base energy score
        tweets_with_timestamps: List of tweets with 'created_at' timestamps
            (datetime objects or ISO-8601 strings) and 'compound' scores

    Returns:
        Temporally weighted energy score (base_score if no usable tweets)
    """
    if not tweets_with_timestamps:
        return base_score
    now = datetime.now(timezone.utc)
    weighted_sum = 0.0
    total_weight = 0.0
    for tweet in tweets_with_timestamps:
        created_at = tweet.get('created_at')
        if not created_at:
            continue  # skip tweets without a timestamp
        if isinstance(created_at, str):
            # fromisoformat does not accept the 'Z' suffix before 3.11.
            created_at = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
        if created_at.tzinfo is None:
            # Naive timestamps are assumed to be UTC — TODO confirm upstream.
            created_at = created_at.replace(tzinfo=timezone.utc)
        hours_ago = (now - created_at).total_seconds() / 3600
        # Linear decay from 1.0 to MIN_TEMPORAL_WEIGHT over the decay window.
        time_weight = max(MIN_TEMPORAL_WEIGHT, 1.0 - (hours_ago / TEMPORAL_DECAY_HOURS))
        weighted_sum += tweet.get('compound', 0) * time_weight
        total_weight += time_weight
    if total_weight > 0:
        # Recency-weighted mean sentiment amplifies the base score.
        temporal_factor = weighted_sum / total_weight
        return base_score * (1 + abs(temporal_factor))
    return base_score
def normalize_score(score: float) -> float:
    """
    Clamp a raw energy score into the published 0-100 range.

    Args:
        score: Raw energy score

    Returns:
        Score clamped to [0.0, 100.0]
    """
    clamped = max(0.0, min(100.0, score))
    return clamped
def calculate_confidence(
    available_sources: List[str],
    total_weight: float
) -> float:
    """
    Confidence level for an energy score, from the contributing sources.

    Confidence equals the combined weight of the available sources:
    all three sources (0.60 + 0.25 + 0.15) -> 1.0; Twitter alone -> 0.6;
    RSS alone -> 0.15. No sources -> 0.0.

    Args:
        available_sources: List of available sources
        total_weight: Total weight of available sources

    Returns:
        Confidence level between 0 and 1
    """
    return total_weight if available_sources else 0.0
def calculate_energy_score_by_source(
    source: str,
    sentiments: List[Dict[str, float]]
) -> float:
    """
    Calculate the raw energy score for a single named source.

    Unknown source names are logged and yield 0.0.

    Args:
        source: Source name ('twitter', 'reddit', or 'rss')
        sentiments: List of sentiment scores

    Returns:
        Energy score for the source (0.0 for unknown sources)
    """
    if source in SOURCE_WEIGHTS:
        return _calculate_source_energy(sentiments)
    logger.warning(f"Unknown source: {source}")
    return 0.0
def get_source_weights() -> Dict[str, float]:
    """
    Return a copy of the configured source weights.

    A copy is returned so callers cannot mutate the module-level table.

    Returns:
        Dictionary of source weights
    """
    return dict(SOURCE_WEIGHTS)
def get_temporal_weighting_parameters() -> Dict[str, float]:
    """
    Return the configured temporal-weighting parameters.

    Returns:
        Dictionary with 'decay_hours' (full decay window) and
        'min_weight' (floor weight for old tweets)
    """
    parameters = {
        'decay_hours': TEMPORAL_DECAY_HOURS,
        'min_weight': MIN_TEMPORAL_WEIGHT,
    }
    return parameters

View File

@@ -0,0 +1,146 @@
"""
Prediction Calculator Module.
This module provides functions to calculate match predictions based on
energy scores from sentiment analysis.
"""
from typing import Dict, Any
def calculate_confidence_meter(home_energy: float, away_energy: float) -> float:
    """
    Calculate the Confidence Meter (0-100%) from the energy gap.

    Formula: min(100, |home_energy - away_energy| * 2) — the wider the
    gap between the two teams, the more confident the prediction.

    Args:
        home_energy: Energy score of the home team (float, any value)
        away_energy: Energy score of the away team (float, any value)

    Returns:
        Confidence score between 0.0 and 100.0

    Examples:
        >>> calculate_confidence_meter(50.0, 50.0)
        0.0
        >>> calculate_confidence_meter(60.0, 50.0)
        20.0
        >>> calculate_confidence_meter(100.0, 50.0)
        100.0
    """
    gap = abs(home_energy - away_energy)
    return min(100.0, gap * 2.0)
def determine_winner(home_energy: float, away_energy: float) -> str:
    """
    Determine the predicted winner from the two energy scores.

    Args:
        home_energy: Energy score of the home team
        away_energy: Energy score of the away team

    Returns:
        'home' if home team has higher energy,
        'away' if away team has higher energy,
        'draw' if energies are equal

    Examples:
        >>> determine_winner(60.0, 40.0)
        'home'
        >>> determine_winner(40.0, 60.0)
        'away'
        >>> determine_winner(50.0, 50.0)
        'draw'
    """
    # Guard clauses; equal (or incomparable) energies fall through to draw.
    if home_energy > away_energy:
        return 'home'
    if home_energy < away_energy:
        return 'away'
    return 'draw'
def calculate_prediction(home_energy: float, away_energy: float) -> Dict[str, Any]:
    """
    Build a complete match prediction from the two energy scores.

    Combines the confidence meter and winner determination into a single
    result dictionary.

    Args:
        home_energy: Energy score of the home team
        away_energy: Energy score of the away team

    Returns:
        Dictionary containing:
            - confidence: Confidence score (0.0 - 100.0)
            - predicted_winner: 'home', 'away', or 'draw'
            - home_energy: Original home energy score
            - away_energy: Original away energy score

    Examples:
        >>> calculate_prediction(65.0, 45.0)
        {'confidence': 40.0, 'predicted_winner': 'home',
         'home_energy': 65.0, 'away_energy': 45.0}
    """
    return {
        'confidence': calculate_confidence_meter(home_energy, away_energy),
        'predicted_winner': determine_winner(home_energy, away_energy),
        'home_energy': home_energy,
        'away_energy': away_energy
    }
def validate_prediction_result(result: Dict[str, Any]) -> bool:
    """
    Validate that a prediction result has all required fields and sane values.

    Checks: all four keys present; confidence is a number in [0, 100];
    predicted_winner is one of home/away/draw; both energies are
    non-negative numbers.

    Args:
        result: Dictionary to validate

    Returns:
        True if valid, False otherwise

    Examples:
        >>> validate_prediction_result({'confidence': 75.0, 'predicted_winner': 'home',
        ...                             'home_energy': 65.0, 'away_energy': 45.0})
        True
        >>> validate_prediction_result({'confidence': -10.0, 'predicted_winner': 'home',
        ...                             'home_energy': 65.0, 'away_energy': 45.0})
        False
    """
    # EAFP: a missing key means the result is invalid.
    try:
        confidence = result['confidence']
        winner = result['predicted_winner']
        home_energy = result['home_energy']
        away_energy = result['away_energy']
    except KeyError:
        return False
    numeric = (confidence, home_energy, away_energy)
    if not all(isinstance(value, (int, float)) for value in numeric):
        return False
    if not 0.0 <= confidence <= 100.0:
        return False
    if winner not in ('home', 'away', 'draw'):
        return False
    return home_energy >= 0.0 and away_energy >= 0.0

View File

@@ -0,0 +1,178 @@
"""
Sentiment Analyzer Module
Uses VADER (Valence Aware Dictionary and sEntiment Reasoner) for sentiment analysis.
"""
from typing import Dict, List, Optional
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Initialize the VADER analyzer globally for better performance
_analyzer = SentimentIntensityAnalyzer()
def classify_sentiment(compound: float) -> str:
    """
    Map a VADER compound score onto a discrete sentiment label.

    Uses the standard VADER cut-offs: >= 0.05 positive, <= -0.05 negative,
    anything in between neutral.

    Args:
        compound: Compound sentiment score (-1 to 1)

    Returns:
        'positive', 'negative', or 'neutral'
    """
    if compound <= -0.05:
        return 'negative'
    return 'positive' if compound >= 0.05 else 'neutral'
def analyze_sentiment(text: str) -> Dict[str, float]:
    """
    Analyze sentiment of a single text using the shared VADER analyzer.

    Args:
        text: Text to analyze

    Returns:
        Dictionary with sentiment scores:
            - compound: Overall compound score (-1 to 1)
            - positive: Positive proportion (0 to 1)
            - negative: Negative proportion (0 to 1)
            - neutral: Neutral proportion (0 to 1)
            - sentiment: Classification ('positive', 'negative', or 'neutral')

    Raises:
        ValueError: If *text* is empty or not a string.
    """
    if not isinstance(text, str) or not text:
        raise ValueError("Text must be a non-empty string")
    scores = _analyzer.polarity_scores(text)
    return {
        'compound': scores['compound'],
        'positive': scores['pos'],
        'negative': scores['neg'],
        'neutral': scores['neu'],
        'sentiment': classify_sentiment(scores['compound']),
    }
def analyze_sentiment_batch(texts: List[str]) -> List[Dict[str, float]]:
    """
    Analyze sentiment of multiple texts, skipping over invalid entries.

    Invalid texts (empty / non-string) do not abort the batch: they are
    logged and replaced by a neutral placeholder so the output list stays
    aligned with the input list.

    Fix: errors are reported through the ``logging`` module instead of
    ``print`` — library code should not write diagnostics to stdout.

    Args:
        texts: List of texts to analyze

    Returns:
        List of sentiment score dictionaries, same length/order as *texts*
    """
    # Local import: this module defines no logger and the global import
    # block is left untouched.
    import logging
    log = logging.getLogger(__name__)
    neutral_placeholder = {
        'compound': 0.0,
        'positive': 0.0,
        'negative': 0.0,
        'neutral': 1.0,
        'sentiment': 'neutral'
    }
    results = []
    for text in texts:
        try:
            results.append(analyze_sentiment(text))
        except ValueError as e:
            # Keep positional alignment with the input by emitting a
            # fresh neutral record for the bad entry.
            log.warning("Error analyzing text: %s", e)
            results.append(dict(neutral_placeholder))
    return results
def calculate_aggregated_metrics(sentiments: List[Dict[str, float]]) -> Dict[str, float]:
    """
    Aggregate a list of sentiment analyses into summary metrics.

    Args:
        sentiments: List of sentiment score dictionaries

    Returns:
        Dictionary with aggregated metrics:
            - total_count: Total number of sentiments
            - positive_count / negative_count / neutral_count: Label counts
            - positive_ratio / negative_ratio / neutral_ratio: Label ratios (0 to 1)
            - average_compound: Mean compound score

        All values are zero for an empty input.
    """
    if not sentiments:
        # Empty input: every metric is zero by definition.
        return {
            'total_count': 0,
            'positive_count': 0,
            'negative_count': 0,
            'neutral_count': 0,
            'positive_ratio': 0.0,
            'negative_ratio': 0.0,
            'neutral_ratio': 0.0,
            'average_compound': 0.0
        }
    total = len(sentiments)
    labels = [entry['sentiment'] for entry in sentiments]
    positives = labels.count('positive')
    negatives = labels.count('negative')
    neutrals = labels.count('neutral')
    mean_compound = sum(entry['compound'] for entry in sentiments) / total
    return {
        'total_count': total,
        'positive_count': positives,
        'negative_count': negatives,
        'neutral_count': neutrals,
        'positive_ratio': positives / total,
        'negative_ratio': negatives / total,
        'neutral_ratio': neutrals / total,
        'average_compound': mean_compound
    }
def test_analyzer_performance(num_tweets: int = 1000) -> float:
    """
    Benchmark the sentiment analyzer over a batch of synthetic tweets.

    Fixes over the original: uses ``time.perf_counter()`` — the monotonic,
    high-resolution clock intended for measuring intervals (``time.time()``
    can jump with system clock changes) — and guards the tweets/second
    division against a zero elapsed time on very fast runs.

    Args:
        num_tweets: Number of tweets to test with (default: 1000)

    Returns:
        Time taken to analyze the tweets in seconds
    """
    import random
    import time
    # Synthetic corpus: a small pool of fixed phrases sampled with repetition.
    sample_tweets = [
        "I love this game! Best match ever!",
        "Terrible performance. Worst team ever.",
        "It's okay, nothing special.",
        "Amazing goal! What a comeback!",
        "Disappointed with the result.",
        "Great teamwork out there!",
        "Could have been better.",
        "Absolutely fantastic!",
        "Not good enough today.",
        "Well played both teams."
    ]
    tweets = [random.choice(sample_tweets) for _ in range(num_tweets)]
    start = time.perf_counter()
    results = analyze_sentiment_batch(tweets)
    time_taken = time.perf_counter() - start
    print(f"Analyzed {len(results)} tweets in {time_taken:.4f} seconds")
    if time_taken > 0:
        print(f"Performance: {num_tweets / time_taken:.2f} tweets/second")
    else:
        print("Performance: elapsed time below clock resolution")
    return time_taken