"""
Twitter scraper module with rate limiting and degraded mode support.

This module provides functionality to scrape tweets for football matches,
with built-in rate limiting (1000 req/hour) and degraded mode for VIP matches.
"""

import logging
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Dict, List, Optional

import tweepy
from sqlalchemy.orm import Session

logger = logging.getLogger(__name__)


@dataclass
class RateLimitInfo:
    """Information about rate limit usage."""

    remaining: int                  # calls still allowed in the current window
    limit: int                      # total calls allowed per window
    reset_time: Optional[datetime]  # UTC time the window resets, if known

    @property
    def usage_percentage(self) -> float:
        """Calculate usage as a fraction in [0.0, 1.0].

        Returns 0.0 when ``limit`` is non-positive (avoids ZeroDivisionError).
        """
        # FIX: previously returned the int 0 for the degenerate case,
        # contradicting the declared float return type.
        if self.limit <= 0:
            return 0.0
        return (self.limit - self.remaining) / self.limit


@dataclass
class TweetData:
    """Structured data for a tweet."""

    tweet_id: str        # Twitter's tweet ID, stored as a string
    text: str            # raw tweet text
    created_at: datetime # tweet creation time as returned by the API
    retweet_count: int
    like_count: int
    match_id: int        # internal match identifier this tweet was collected for
    source: str = "twitter"


class TwitterScraper:
    """
    Twitter scraper with rate limiting and degraded mode.

    Features:
    - Rate limiting (1000 req/hour)
    - Predictive alerts when approaching limits (>90%)
    - Exponential backoff for retries
    - Degraded mode for VIP matches only
    - Structured logging
    """

    def __init__(
        self,
        bearer_token: str,
        max_tweets_per_hour: int = 1000,
        rate_limit_alert_threshold: float = 0.9,
        vip_match_ids: Optional[List[int]] = None
    ):
        """
        Initialize Twitter scraper.

        Args:
            bearer_token: Twitter API bearer token
            max_tweets_per_hour: Maximum API calls per hour (default: 1000)
            rate_limit_alert_threshold: Alert threshold (0.0-1.0, default: 0.9)
            vip_match_ids: List of VIP match IDs for degraded mode

        Raises:
            Exception: Propagated from the API if authentication fails.
        """
        self.bearer_token = bearer_token
        self.max_tweets_per_hour = max_tweets_per_hour
        self.rate_limit_alert_threshold = rate_limit_alert_threshold
        self.vip_match_ids = vip_match_ids or []
        # When True, only matches listed in vip_match_ids may be scraped.
        self.vip_mode_only = False

        # Initialize Twitter API client
        self.client = tweepy.Client(bearer_token=bearer_token)

        # Rate limit tracking: local call counter plus optional info
        # reported by the API (when populated, the latter takes precedence).
        self.api_calls_made = 0
        self.rate_limit_info: Optional[RateLimitInfo] = None

        # Verify authentication eagerly so misconfiguration fails fast.
        self._verify_authentication()

    def _verify_authentication(self) -> None:
        """Verify Twitter API authentication by fetching the current user."""
        try:
            # Try to get user info to verify authentication
            response = self.client.get_me()
            if response.data:
                logger.info(f"✅ Twitter API authenticated successfully as @{response.data.username}")
            else:
                logger.warning("⚠️ Twitter API authentication returned no user data")
        except Exception as e:
            logger.error(f"❌ Twitter API authentication failed: {e}")
            raise

    def _check_rate_limit(self) -> bool:
        """
        Check rate limit status and handle alerts.

        Returns:
            True if API calls can be made, False otherwise
        """
        # Prefer the API-reported rate limit info; fall back to the
        # locally tracked call counter.
        if self.rate_limit_info:
            usage = self.rate_limit_info.usage_percentage
        else:
            usage = self.api_calls_made / self.max_tweets_per_hour

        # Predictive alert at threshold (default 90%) before the hard stop.
        if usage >= self.rate_limit_alert_threshold:
            logger.warning(
                f"⚠️ Rate limit approaching {usage * 100:.1f}% "
                f"({self.api_calls_made}/{self.max_tweets_per_hour} calls)"
            )

        # Check if limit reached
        if usage >= 1.0:
            logger.error(
                f"❌ Rate limit reached ({self.api_calls_made}/{self.max_tweets_per_hour})"
            )
            return False

        return True

    def _wait_for_rate_limit_reset(self) -> None:
        """
        Block until the rate limit window resets, then clear counters.

        Uses the API-reported reset time when available (with a 1-minute
        floor), otherwise waits a full hour.
        """
        if self.rate_limit_info and self.rate_limit_info.reset_time:
            now = datetime.now(timezone.utc)
            wait_seconds = (self.rate_limit_info.reset_time - now).total_seconds()
            wait_seconds = max(60, wait_seconds)  # Minimum 1 minute wait
        else:
            # Default to waiting 1 hour if no reset time available
            wait_seconds = 3600

        logger.info(f"⏳ Waiting {wait_seconds/60:.1f} minutes for rate limit reset...")
        time.sleep(wait_seconds)

        # Reset counters after waiting; degraded mode is also lifted since
        # the new window allows full scraping again.
        self.api_calls_made = 0
        self.vip_mode_only = False

    def _enable_vip_mode_only(self) -> None:
        """Enable VIP mode (degraded mode). Idempotent — logs only on transition."""
        if not self.vip_mode_only:
            self.vip_mode_only = True
            logger.warning(
                "⚠️ ENTERING DEGRADED MODE - VIP MATCHES ONLY\n"
                f"VIP match IDs: {self.vip_match_ids}"
            )

    def scrape_twitter_match(
        self,
        match_id: int,
        keywords: List[str],
        max_results: int = 100
    ) -> List[TweetData]:
        """
        Scrape tweets for a specific match using keywords.

        Args:
            match_id: Match identifier
            keywords: List of keywords to search (e.g., ["#MatchName", "team1 vs team2"])
            max_results: Maximum number of tweets to retrieve (default: 100).
                NOTE(review): the Twitter v2 recent-search endpoint accepts
                10-100; out-of-range values surface as a tweepy API error.

        Returns:
            List of TweetData objects

        Raises:
            ValueError: If match is not VIP and VIP mode is active
            tweepy.TweepyException: For API errors
        """
        # Check VIP mode
        if self.vip_mode_only and match_id not in self.vip_match_ids:
            logger.warning(
                f"⚠️ Skipping match {match_id} - Not in VIP list "
                f"(degraded mode active)"
            )
            raise ValueError(f"Match {match_id} is not VIP and degraded mode is active")

        # Check rate limit before scraping; if exhausted, degrade to VIP
        # mode and sleep until the window resets (which re-enables full mode).
        if not self._check_rate_limit():
            self._enable_vip_mode_only()
            self._wait_for_rate_limit_reset()

        # Build search query
        query = " OR ".join(keywords)
        logger.info(f"🔍 Searching tweets for match {match_id}: '{query}'")

        try:
            # Increment API call counter
            self.api_calls_made += 1

            # Search for tweets
            response = self.client.search_recent_tweets(
                query=query,
                max_results=max_results,
                tweet_fields=[
                    'created_at',
                    'public_metrics',
                    'text',
                    'author_id'
                ]
            )

            if not response.data:
                logger.info(f"ℹ️ No tweets found for match {match_id}")
                return []

            # Parse tweets
            tweets = []
            for tweet in response.data:
                tweet_data = TweetData(
                    # FIX: tweepy returns the ID as an int; cast to str so it
                    # matches the declared field type and compares correctly
                    # against the string tweet_id column during DB dedup.
                    tweet_id=str(tweet.id),
                    text=tweet.text,
                    created_at=tweet.created_at,
                    retweet_count=tweet.public_metrics['retweet_count'],
                    like_count=tweet.public_metrics['like_count'],
                    match_id=match_id,
                    source="twitter"
                )
                tweets.append(tweet_data)

            logger.info(
                f"✅ Collected {len(tweets)} tweets for match {match_id} "
                f"({self.api_calls_made}/{self.max_tweets_per_hour} API calls)"
            )
            return tweets

        except tweepy.TooManyRequests:
            # API disagreed with our local accounting: degrade and wait,
            # then report no tweets for this attempt.
            logger.error("❌ Rate limit exceeded during scraping")
            self._enable_vip_mode_only()
            self._wait_for_rate_limit_reset()
            return []
        except tweepy.TweepyException as e:
            logger.error(f"❌ Twitter API error: {e}")
            raise

    def save_tweets_to_db(self, tweets: List[TweetData], db: Session) -> None:
        """
        Save tweets to database, skipping tweets already stored.

        Args:
            tweets: List of TweetData objects
            db: SQLAlchemy database session

        Raises:
            Exception: Propagated after rollback if the commit fails.
        """
        from app.models.tweet import Tweet

        # PERF: fetch all already-stored IDs in one batched IN query
        # instead of one existence query per tweet.
        candidate_ids = [t.tweet_id for t in tweets]
        existing_ids = set()
        if candidate_ids:
            existing_ids = {
                row[0]
                for row in db.query(Tweet.tweet_id)
                .filter(Tweet.tweet_id.in_(candidate_ids))
                .all()
            }

        saved_count = 0
        for tweet_data in tweets:
            if tweet_data.tweet_id in existing_ids:
                logger.debug(f"Tweet {tweet_data.tweet_id} already exists, skipping")
                continue

            # Create new tweet
            tweet = Tweet(
                tweet_id=tweet_data.tweet_id,
                text=tweet_data.text,
                created_at=tweet_data.created_at,
                retweet_count=tweet_data.retweet_count,
                like_count=tweet_data.like_count,
                match_id=tweet_data.match_id,
                source=tweet_data.source
            )
            db.add(tweet)
            saved_count += 1

        # Commit changes; roll back on failure so the session stays usable.
        try:
            db.commit()
            logger.info(f"✅ Saved {saved_count} new tweets to database")
        except Exception as e:
            db.rollback()
            logger.error(f"❌ Failed to save tweets to database: {e}")
            raise

    def scrape_and_save(
        self,
        match_id: int,
        keywords: List[str],
        db: Session,
        max_results: int = 100
    ) -> List[TweetData]:
        """
        Scrape tweets for a match and save to database.

        Args:
            match_id: Match identifier
            keywords: List of keywords to search
            db: SQLAlchemy database session
            max_results: Maximum number of tweets to retrieve

        Returns:
            List of TweetData objects

        Raises:
            ValueError: From scrape_twitter_match when degraded mode blocks the match.
            Exception: Any scraping or persistence error, logged then re-raised.
        """
        try:
            # Scrape tweets
            tweets = self.scrape_twitter_match(match_id, keywords, max_results)

            # Save to database only when something was collected.
            if tweets:
                self.save_tweets_to_db(tweets, db)

            return tweets
        except Exception as e:
            logger.error(f"❌ Failed to scrape and save tweets for match {match_id}: {e}")
            raise


def create_twitter_scraper(
    bearer_token: str,
    vip_match_ids: Optional[List[int]] = None
) -> TwitterScraper:
    """
    Factory function to create a Twitter scraper instance.

    Args:
        bearer_token: Twitter API bearer token
        vip_match_ids: Optional list of VIP match IDs

    Returns:
        Configured TwitterScraper instance
    """
    # TODO: Load from environment variables or config file
    max_tweets_per_hour = 1000
    rate_limit_alert_threshold = 0.9

    scraper = TwitterScraper(
        bearer_token=bearer_token,
        max_tweets_per_hour=max_tweets_per_hour,
        rate_limit_alert_threshold=rate_limit_alert_threshold,
        vip_match_ids=vip_match_ids or []
    )

    return scraper