chartbastan/backend/app/ml/sentiment_analyzer.py
2026-02-01 09:31:38 +01:00

179 lines
5.3 KiB
Python

"""
Sentiment Analyzer Module
Uses VADER (Valence Aware Dictionary and sEntiment Reasoner) for sentiment analysis.
"""
import logging
from collections import Counter
from typing import Dict, List, Optional, Union

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Single module-level VADER analyzer: created once at import time and reused
# by every analyze_sentiment() call instead of being constructed per call.
_analyzer = SentimentIntensityAnalyzer()
def classify_sentiment(compound: float, threshold: float = 0.05) -> str:
    """
    Classify sentiment based on compound score.

    Args:
        compound: Compound sentiment score (-1 to 1)
        threshold: Non-negative half-width of the neutral band. Scores at or
            above ``threshold`` are positive, scores at or below
            ``-threshold`` are negative, everything in between is neutral.
            Defaults to 0.05.

    Returns:
        Sentiment classification: 'positive', 'negative', or 'neutral'

    Raises:
        ValueError: If ``threshold`` is negative.
    """
    # A negative threshold would make the positive and negative bands overlap.
    if threshold < 0:
        raise ValueError("threshold must be non-negative")

    if compound >= threshold:
        return 'positive'
    if compound <= -threshold:
        return 'negative'
    return 'neutral'
def analyze_sentiment(text: str) -> Dict[str, Union[float, str]]:
    """
    Analyze sentiment of a text using VADER.

    Args:
        text: Text to analyze

    Returns:
        Dictionary with sentiment scores:
        - compound: Overall compound score (-1 to 1)
        - positive: Positive proportion (0 to 1)
        - negative: Negative proportion (0 to 1)
        - neutral: Neutral proportion (0 to 1)
        - sentiment: Classification ('positive', 'negative', or 'neutral')

    Raises:
        ValueError: If ``text`` is not a string or is an empty string.
    """
    # Check the type before the truthiness: evaluating ``not text`` on an
    # arbitrary object can itself raise (objects with an ambiguous truth
    # value), which would escape as something other than ValueError.
    if not isinstance(text, str) or not text:
        raise ValueError("Text must be a non-empty string")

    scores = _analyzer.polarity_scores(text)
    return {
        'compound': scores['compound'],
        'positive': scores['pos'],
        'negative': scores['neg'],
        'neutral': scores['neu'],
        'sentiment': classify_sentiment(scores['compound']),
    }
def analyze_sentiment_batch(texts: List[str]) -> List[Dict[str, Union[float, str]]]:
    """
    Analyze sentiment of multiple texts, one after another.

    Inputs rejected by ``analyze_sentiment`` (it raises ``ValueError``) do not
    abort the batch: a warning is logged and a neutral placeholder is stored
    in that position, so the output always lines up with the input.

    Args:
        texts: List of texts to analyze

    Returns:
        List of sentiment score dictionaries, in the same order as ``texts``
    """
    logger = logging.getLogger(__name__)
    results = []
    for text in texts:
        try:
            result = analyze_sentiment(text)
        except ValueError as e:
            # Log error but continue processing other texts
            logger.warning("Error analyzing text: %s", e)
            # Fresh dict per failure so callers mutating one entry do not
            # affect the others.
            result = {
                'compound': 0.0,
                'positive': 0.0,
                'negative': 0.0,
                'neutral': 1.0,
                'sentiment': 'neutral',
            }
        results.append(result)
    return results
def calculate_aggregated_metrics(
    sentiments: List[Dict[str, Union[float, str]]]
) -> Dict[str, float]:
    """
    Calculate aggregated metrics from a list of sentiment analyses.

    Each entry must provide a 'sentiment' label and a 'compound' score.
    Labels other than 'positive', 'negative' and 'neutral' still count
    towards ``total_count`` but towards none of the per-label counts.

    Args:
        sentiments: List of sentiment score dictionaries

    Returns:
        Dictionary with aggregated metrics:
        - total_count: Total number of sentiments
        - positive_count: Count of positive sentiments
        - negative_count: Count of negative sentiments
        - neutral_count: Count of neutral sentiments
        - positive_ratio: Ratio of positive sentiments (0 to 1)
        - negative_ratio: Ratio of negative sentiments (0 to 1)
        - neutral_ratio: Ratio of neutral sentiments (0 to 1)
        - average_compound: Average compound score

    Raises:
        KeyError: If an entry lacks the 'sentiment' or 'compound' key.
    """
    # Empty input: return all-zero metrics instead of dividing by zero.
    if not sentiments:
        return {
            'total_count': 0,
            'positive_count': 0,
            'negative_count': 0,
            'neutral_count': 0,
            'positive_ratio': 0.0,
            'negative_ratio': 0.0,
            'neutral_ratio': 0.0,
            'average_compound': 0.0,
        }

    total_count = len(sentiments)

    # One pass over the labels; Counter yields 0 for labels never seen.
    label_counts = Counter(s['sentiment'] for s in sentiments)
    positive_count = label_counts['positive']
    negative_count = label_counts['negative']
    neutral_count = label_counts['neutral']

    average_compound = sum(s['compound'] for s in sentiments) / total_count

    return {
        'total_count': total_count,
        'positive_count': positive_count,
        'negative_count': negative_count,
        'neutral_count': neutral_count,
        'positive_ratio': positive_count / total_count,
        'negative_ratio': negative_count / total_count,
        'neutral_ratio': neutral_count / total_count,
        'average_compound': average_compound,
    }
def test_analyzer_performance(num_tweets: int = 1000) -> float:
    """
    Test the performance of the sentiment analyzer.

    Builds ``num_tweets`` texts by sampling (with repetition) from a small
    fixed set of example tweets, runs them through
    ``analyze_sentiment_batch`` and prints the elapsed time and throughput.

    Args:
        num_tweets: Number of tweets to test with (default: 1000)

    Returns:
        Time taken to analyze the tweets in seconds
    """
    # Imported locally: only this benchmark helper needs them.
    import random
    import time

    # Generate sample tweets
    sample_tweets = [
        "I love this game! Best match ever!",
        "Terrible performance. Worst team ever.",
        "It's okay, nothing special.",
        "Amazing goal! What a comeback!",
        "Disappointed with the result.",
        "Great teamwork out there!",
        "Could have been better.",
        "Absolutely fantastic!",
        "Not good enough today.",
        "Well played both teams.",
    ]
    tweets = [random.choice(sample_tweets) for _ in range(num_tweets)]

    # Measure time with perf_counter: it is the clock intended for measuring
    # short intervals and is unaffected by system clock adjustments.
    start_time = time.perf_counter()
    results = analyze_sentiment_batch(tweets)
    time_taken = time.perf_counter() - start_time

    print(f"Analyzed {len(results)} tweets in {time_taken:.4f} seconds")
    # Elapsed time can be 0 (empty run or coarse clock); avoid dividing by it.
    if time_taken > 0:
        print(f"Performance: {num_tweets / time_taken:.2f} tweets/second")
    else:
        print("Performance: elapsed time too small to measure")
    return time_taken