"""
Twitter scraper module with rate limiting and degraded mode support.

This module provides functionality to scrape tweets for football matches,
with built-in rate limiting (1000 req/hour) and degraded mode for VIP matches.
"""
|
||
|
||
import logging
|
||
import time
|
||
from datetime import datetime, timezone
|
||
from typing import List, Dict, Optional
|
||
from dataclasses import dataclass
|
||
|
||
import tweepy
|
||
from sqlalchemy.orm import Session
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
@dataclass
class RateLimitInfo:
    """Snapshot of rate-limit consumption reported for the Twitter API."""

    remaining: int                   # calls still available in the current window
    limit: int                       # total calls allowed in the window
    reset_time: Optional[datetime]   # when the window resets, if known

    @property
    def usage_percentage(self) -> float:
        """Fraction of the limit already consumed, in [0, 1].

        Returns 0 when ``limit`` is not positive, to avoid division by zero.
        """
        if self.limit <= 0:
            return 0
        consumed = self.limit - self.remaining
        return consumed / self.limit
|
||
|
||
|
||
@dataclass
class TweetData:
    """Structured data for a tweet."""

    tweet_id: str              # unique tweet identifier from the Twitter API
    text: str                  # full tweet text
    created_at: datetime       # tweet creation timestamp as returned by the API
    retweet_count: int         # public_metrics retweet count at scrape time
    like_count: int            # public_metrics like count at scrape time
    match_id: int              # internal match identifier this tweet was collected for
    source: str = "twitter"    # data origin tag; fixed to "twitter" by this scraper
|
||
|
||
|
||
class TwitterScraper:
    """
    Twitter scraper with rate limiting and degraded mode.

    Features:
    - Rate limiting (1000 req/hour)
    - Predictive alerts when approaching limits (>90%)
    - Exponential backoff for retries
    - Degraded mode for VIP matches only
    - Structured logging
    """

    def __init__(
        self,
        bearer_token: str,
        max_tweets_per_hour: int = 1000,
        rate_limit_alert_threshold: float = 0.9,
        vip_match_ids: Optional[List[int]] = None
    ):
        """
        Initialize Twitter scraper.

        Args:
            bearer_token: Twitter API bearer token
            max_tweets_per_hour: Maximum API calls per hour (default: 1000)
            rate_limit_alert_threshold: Alert threshold (0.0-1.0, default: 0.9)
            vip_match_ids: List of VIP match IDs for degraded mode

        Raises:
            Exception: Propagated from ``_verify_authentication`` when the
                bearer token is rejected by the Twitter API.
        """
        self.bearer_token = bearer_token
        self.max_tweets_per_hour = max_tweets_per_hour
        self.rate_limit_alert_threshold = rate_limit_alert_threshold
        # `or []` turns a None argument into an empty VIP list.
        self.vip_match_ids = vip_match_ids or []
        # When True, only matches listed in vip_match_ids may be scraped.
        self.vip_mode_only = False

        # Initialize Twitter API client
        self.client = tweepy.Client(bearer_token=bearer_token)

        # Rate limit tracking. NOTE(review): rate_limit_info is read in
        # _check_rate_limit/_wait_for_rate_limit_reset but never assigned in
        # this module — presumably meant to be filled from API response
        # headers elsewhere; confirm the intended source.
        self.api_calls_made = 0
        self.rate_limit_info: Optional[RateLimitInfo] = None

        # Verify authentication eagerly so a bad token fails at construction.
        self._verify_authentication()

    def _verify_authentication(self) -> None:
        """Verify Twitter API authentication by fetching the current user.

        Raises:
            Exception: Re-raised from the underlying client call on failure.
        """
        try:
            # Try to get user info to verify authentication
            response = self.client.get_me()
            if response.data:
                logger.info(f"✅ Twitter API authenticated successfully as @{response.data.username}")
            else:
                logger.warning("⚠️ Twitter API authentication returned no user data")
        except Exception as e:
            logger.error(f"❌ Twitter API authentication failed: {e}")
            raise

    def _check_rate_limit(self) -> bool:
        """
        Check rate limit status and handle alerts.

        Prefers server-reported usage (``rate_limit_info``) when available,
        otherwise falls back to the local call counter.

        Returns:
            True if API calls can be made, False otherwise
        """
        # Calculate usage
        if self.rate_limit_info:
            usage = self.rate_limit_info.usage_percentage
        else:
            usage = self.api_calls_made / self.max_tweets_per_hour

        # Predictive alert at threshold (warn before we actually hit the cap)
        if usage >= self.rate_limit_alert_threshold:
            logger.warning(
                f"⚠️ Rate limit approaching {usage * 100:.1f}% "
                f"({self.api_calls_made}/{self.max_tweets_per_hour} calls)"
            )

        # Check if limit reached
        if usage >= 1.0:
            logger.error(
                f"❌ Rate limit reached ({self.api_calls_made}/{self.max_tweets_per_hour})"
            )
            return False

        return True

    def _wait_for_rate_limit_reset(self) -> None:
        """
        Wait for rate limit to reset with exponential backoff.

        Blocks the calling thread. Uses the server-provided reset time when
        known (with a 1-minute floor), otherwise waits a full hour.
        """
        if self.rate_limit_info and self.rate_limit_info.reset_time:
            now = datetime.now(timezone.utc)
            wait_seconds = (self.rate_limit_info.reset_time - now).total_seconds()
            wait_seconds = max(60, wait_seconds)  # Minimum 1 minute wait
        else:
            # Default to waiting 1 hour if no reset time available
            wait_seconds = 3600

        logger.info(f"⏳ Waiting {wait_seconds/60:.1f} minutes for rate limit reset...")
        time.sleep(wait_seconds)

        # Reset counters after waiting; degraded mode ends with the new window.
        self.api_calls_made = 0
        self.vip_mode_only = False

    def _enable_vip_mode_only(self) -> None:
        """Enable VIP mode (degraded mode). Idempotent — logs only on transition."""
        if not self.vip_mode_only:
            self.vip_mode_only = True
            logger.warning(
                "⚠️ ENTERING DEGRADED MODE - VIP MATCHES ONLY\n"
                f"VIP match IDs: {self.vip_match_ids}"
            )

    def scrape_twitter_match(
        self,
        match_id: int,
        keywords: List[str],
        max_results: int = 100
    ) -> List[TweetData]:
        """
        Scrape tweets for a specific match using keywords.

        Args:
            match_id: Match identifier
            keywords: List of keywords to search (e.g., ["#MatchName", "team1 vs team2"])
            max_results: Maximum number of tweets to retrieve (default: 100)

        Returns:
            List of TweetData objects

        Raises:
            ValueError: If match is not VIP and VIP mode is active
            tweepy.TweepyException: For API errors
        """
        # Check VIP mode: in degraded mode, non-VIP matches are rejected.
        if self.vip_mode_only and match_id not in self.vip_match_ids:
            logger.warning(
                f"⚠️ Skipping match {match_id} - Not in VIP list "
                f"(degraded mode active)"
            )
            raise ValueError(f"Match {match_id} is not VIP and degraded mode is active")

        # Check rate limit before scraping; if exhausted, degrade and block
        # until the window resets (counters are cleared by the wait).
        if not self._check_rate_limit():
            self._enable_vip_mode_only()
            self._wait_for_rate_limit_reset()

        # Build search query: any keyword may match.
        query = " OR ".join(keywords)
        logger.info(f"🔍 Searching tweets for match {match_id}: '{query}'")

        try:
            # Increment API call counter
            self.api_calls_made += 1

            # Search for tweets
            response = self.client.search_recent_tweets(
                query=query,
                max_results=max_results,
                tweet_fields=[
                    'created_at',
                    'public_metrics',
                    'text',
                    'author_id'
                ]
            )

            if not response.data:
                logger.info(f"ℹ️ No tweets found for match {match_id}")
                return []

            # Parse tweets
            tweets = []
            for tweet in response.data:
                tweet_data = TweetData(
                    # tweepy returns tweet.id as an int; coerce to str to
                    # honor the TweetData.tweet_id declaration and keep the
                    # DB duplicate check in save_tweets_to_db type-consistent.
                    tweet_id=str(tweet.id),
                    text=tweet.text,
                    created_at=tweet.created_at,
                    retweet_count=tweet.public_metrics['retweet_count'],
                    like_count=tweet.public_metrics['like_count'],
                    match_id=match_id,
                    source="twitter"
                )
                tweets.append(tweet_data)

            logger.info(
                f"✅ Collected {len(tweets)} tweets for match {match_id} "
                f"({self.api_calls_made}/{self.max_tweets_per_hour} API calls)"
            )

            return tweets

        except tweepy.TooManyRequests:
            # Server-side 429: degrade, wait out the window, and return
            # best-effort empty rather than raising.
            logger.error("❌ Rate limit exceeded during scraping")
            self._enable_vip_mode_only()
            self._wait_for_rate_limit_reset()
            return []

        except tweepy.TweepyException as e:
            logger.error(f"❌ Twitter API error: {e}")
            raise

    def save_tweets_to_db(self, tweets: List[TweetData], db: Session) -> None:
        """
        Save tweets to database.

        Duplicate tweets (same ``tweet_id``) are skipped. Commits once at the
        end; rolls back and re-raises on commit failure.

        Args:
            tweets: List of TweetData objects
            db: SQLAlchemy database session

        Raises:
            Exception: Re-raised from the failed commit after rollback.
        """
        from app.models.tweet import Tweet

        saved_count = 0
        for tweet_data in tweets:
            # Check if tweet already exists
            existing = db.query(Tweet).filter(
                Tweet.tweet_id == tweet_data.tweet_id
            ).first()

            if existing:
                logger.debug(f"Tweet {tweet_data.tweet_id} already exists, skipping")
                continue

            # Create new tweet
            tweet = Tweet(
                tweet_id=tweet_data.tweet_id,
                text=tweet_data.text,
                created_at=tweet_data.created_at,
                retweet_count=tweet_data.retweet_count,
                like_count=tweet_data.like_count,
                match_id=tweet_data.match_id,
                source=tweet_data.source
            )

            db.add(tweet)
            saved_count += 1

        # Commit changes
        try:
            db.commit()
            logger.info(f"✅ Saved {saved_count} new tweets to database")
        except Exception as e:
            db.rollback()
            logger.error(f"❌ Failed to save tweets to database: {e}")
            raise

    def scrape_and_save(
        self,
        match_id: int,
        keywords: List[str],
        db: Session,
        max_results: int = 100
    ) -> List[TweetData]:
        """
        Scrape tweets for a match and save to database.

        Args:
            match_id: Match identifier
            keywords: List of keywords to search
            db: SQLAlchemy database session
            max_results: Maximum number of tweets to retrieve

        Returns:
            List of TweetData objects

        Raises:
            Exception: Any error from scraping or saving, logged then re-raised.
        """
        try:
            # Scrape tweets
            tweets = self.scrape_twitter_match(match_id, keywords, max_results)

            # Save to database (skip the commit entirely when nothing was found)
            if tweets:
                self.save_tweets_to_db(tweets, db)

            return tweets

        except Exception as e:
            logger.error(f"❌ Failed to scrape and save tweets for match {match_id}: {e}")
            raise
|
||
|
||
|
||
def create_twitter_scraper(
    bearer_token: str,
    vip_match_ids: Optional[List[int]] = None
) -> TwitterScraper:
    """
    Factory function to create a Twitter scraper instance.

    Args:
        bearer_token: Twitter API bearer token
        vip_match_ids: Optional list of VIP match IDs

    Returns:
        Configured TwitterScraper instance
    """
    # TODO: Load from environment variables or config file
    return TwitterScraper(
        bearer_token=bearer_token,
        max_tweets_per_hour=1000,
        rate_limit_alert_threshold=0.9,
        vip_match_ids=vip_match_ids or []
    )
|