# chartbastan/backend/app/ml/sentiment_analyzer.py

"""
Sentiment Analyzer Module
Uses VADER (Valence Aware Dictionary and sEntiment Reasoner) for sentiment analysis.
"""

import logging
from collections import Counter
from typing import Any, Dict, List, Optional

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Shared VADER analyzer: built once when this module is imported and reused by
# analyze_sentiment(), rather than constructing a new analyzer on every call.
_analyzer = SentimentIntensityAnalyzer()


def classify_sentiment(compound: float, threshold: float = 0.05) -> str:
    """
    Classify sentiment based on compound score.

    Scores at or above ``threshold`` are positive, scores at or below
    ``-threshold`` are negative, and anything strictly in between is neutral.

    Args:
        compound: Compound sentiment score (-1 to 1)
        threshold: Non-negative cutoff separating the neutral band from the
            positive/negative classes (default: 0.05). With a threshold of 0
            there is no neutral band and a score of exactly 0 is 'positive'.

    Returns:
        Sentiment classification: 'positive', 'negative', or 'neutral'

    Raises:
        ValueError: If threshold is negative
    """
    # A negative cutoff would make the positive and negative bands overlap.
    if threshold < 0:
        raise ValueError("threshold must be non-negative")

    if compound >= threshold:
        return 'positive'
    if compound <= -threshold:
        return 'negative'
    return 'neutral'


def analyze_sentiment(text: str) -> Dict[str, Any]:
    """
    Analyze sentiment of a text using VADER.

    Args:
        text: Text to analyze

    Returns:
        Dictionary with sentiment scores:
            - compound: Overall compound score (-1 to 1)
            - positive: Positive proportion (0 to 1)
            - negative: Negative proportion (0 to 1)
            - neutral: Neutral proportion (0 to 1)
            - sentiment: Classification ('positive', 'negative', or 'neutral')

        The four scores are numbers; 'sentiment' is a string label.

    Raises:
        ValueError: If text is not a string or is empty
    """
    # Check the type before truthiness so arbitrary objects are never
    # truth-tested; only a real, non-empty str reaches the analyzer.
    if not isinstance(text, str) or not text:
        raise ValueError("Text must be a non-empty string")

    scores = _analyzer.polarity_scores(text)

    # Re-key VADER's short names ('pos', 'neg', 'neu') to the descriptive
    # names used by the rest of this module and attach the class label.
    return {
        'compound': scores['compound'],
        'positive': scores['pos'],
        'negative': scores['neg'],
        'neutral': scores['neu'],
        'sentiment': classify_sentiment(scores['compound']),
    }


def analyze_sentiment_batch(texts: List[str]) -> List[Dict[str, Any]]:
    """
    Analyze sentiment of multiple texts, one result per input text.

    Texts that analyze_sentiment() rejects with ValueError do not abort the
    batch: a warning is logged and a neutral placeholder result is stored in
    their position, so the output always lines up with the input by index.

    Args:
        texts: List of texts to analyze

    Returns:
        List of sentiment score dictionaries, in the same order as texts
    """
    logger = logging.getLogger(__name__)
    results = []

    for index, text in enumerate(texts):
        try:
            result = analyze_sentiment(text)
        except ValueError as e:
            # Report the failure but keep processing the remaining texts.
            logger.warning("Error analyzing text at index %d: %s", index, e)
            # A fresh dict per failure so callers can mutate results safely.
            result = {
                'compound': 0.0,
                'positive': 0.0,
                'negative': 0.0,
                'neutral': 1.0,
                'sentiment': 'neutral',
            }
        results.append(result)

    return results


def calculate_aggregated_metrics(sentiments: List[Dict[str, Any]]) -> Dict[str, float]:
    """
    Calculate aggregated metrics from a list of sentiment analyses.

    Each entry must provide a numeric 'compound' score and a 'sentiment'
    label. Labels other than 'positive', 'negative' and 'neutral' still count
    towards total_count but towards none of the three per-class counts.

    Args:
        sentiments: List of sentiment score dictionaries

    Returns:
        Dictionary with aggregated metrics:
            - total_count: Total number of sentiments
            - positive_count: Count of positive sentiments
            - negative_count: Count of negative sentiments
            - neutral_count: Count of neutral sentiments
            - positive_ratio: Ratio of positive sentiments (0 to 1)
            - negative_ratio: Ratio of negative sentiments (0 to 1)
            - neutral_ratio: Ratio of neutral sentiments (0 to 1)
            - average_compound: Average compound score

        For an empty input every metric is zero.
    """
    # Guard first: the ratios and the average below divide by the total.
    if not sentiments:
        return {
            'total_count': 0,
            'positive_count': 0,
            'negative_count': 0,
            'neutral_count': 0,
            'positive_ratio': 0.0,
            'negative_ratio': 0.0,
            'neutral_ratio': 0.0,
            'average_compound': 0.0,
        }

    total_count = len(sentiments)

    # One pass over the labels; a Counter yields 0 for labels never seen.
    label_counts = Counter(s['sentiment'] for s in sentiments)
    positive_count = label_counts['positive']
    negative_count = label_counts['negative']
    neutral_count = label_counts['neutral']

    average_compound = sum(s['compound'] for s in sentiments) / total_count

    return {
        'total_count': total_count,
        'positive_count': positive_count,
        'negative_count': negative_count,
        'neutral_count': neutral_count,
        'positive_ratio': positive_count / total_count,
        'negative_ratio': negative_count / total_count,
        'neutral_ratio': neutral_count / total_count,
        'average_compound': average_compound,
    }


def test_analyzer_performance(num_tweets: int = 1000) -> float:
    """
    Measure how long the batch sentiment analyzer takes on sample tweets.

    Builds num_tweets tweets by sampling (with replacement) from a small fixed
    set of example sentences, runs them through analyze_sentiment_batch(), and
    prints the elapsed time and throughput.

    Args:
        num_tweets: Number of tweets to test with (default: 1000)

    Returns:
        Time taken to analyze the tweets in seconds
    """
    import time
    import random

    # Generate sample tweets
    sample_tweets = [
        "I love this game! Best match ever!",
        "Terrible performance. Worst team ever.",
        "It's okay, nothing special.",
        "Amazing goal! What a comeback!",
        "Disappointed with the result.",
        "Great teamwork out there!",
        "Could have been better.",
        "Absolutely fantastic!",
        "Not good enough today.",
        "Well played both teams.",
    ]

    tweets = [random.choice(sample_tweets) for _ in range(num_tweets)]

    # perf_counter is a monotonic, high-resolution clock meant for timing
    # short intervals; the wall clock can jump and may be too coarse here.
    start_time = time.perf_counter()
    results = analyze_sentiment_batch(tweets)
    time_taken = time.perf_counter() - start_time

    # An empty or very fast run can measure as zero elapsed time; avoid
    # dividing by zero when reporting throughput.
    throughput = num_tweets / time_taken if time_taken > 0 else float('inf')

    print(f"Analyzed {len(results)} tweets in {time_taken:.4f} seconds")
    print(f"Performance: {throughput:.2f} tweets/second")

    return time_taken