chartbastan/backend/app/scrapers/twitter_scraper.py
2026-02-01 09:31:38 +01:00

352 lines
11 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Twitter scraper module with rate limiting and degraded mode support.
This module provides functionality to scrape tweets for football matches,
with built-in rate limiting (1000 req/hour) and degraded mode for VIP matches.
"""
import logging
import time
from datetime import datetime, timezone
from typing import List, Dict, Optional
from dataclasses import dataclass
import tweepy
from sqlalchemy.orm import Session
logger = logging.getLogger(__name__)
@dataclass
class RateLimitInfo:
    """Snapshot of the Twitter API rate-limit state."""

    remaining: int  # calls still available in the current window
    limit: int  # total calls allowed per window
    reset_time: Optional[datetime]  # when the window resets, if reported by the API

    @property
    def usage_percentage(self) -> float:
        """Fraction of the window already consumed (0 when no limit is set)."""
        if self.limit > 0:
            return (self.limit - self.remaining) / self.limit
        return 0
@dataclass
class TweetData:
    """A single collected tweet, normalized for storage."""

    tweet_id: str  # Twitter's unique identifier for the tweet
    text: str  # raw tweet text
    created_at: datetime  # tweet publication timestamp
    retweet_count: int  # public retweet metric at collection time
    like_count: int  # public like metric at collection time
    match_id: int  # internal match the tweet was collected for
    source: str = "twitter"  # origin tag; always "twitter" for this scraper
class TwitterScraper:
    """
    Twitter scraper with rate limiting and degraded mode.

    Features:
    - Rate limiting (1000 req/hour)
    - Predictive alerts when approaching limits (>90%)
    - Degraded mode for VIP matches only
    - Structured logging
    """

    def __init__(
        self,
        bearer_token: str,
        max_tweets_per_hour: int = 1000,
        rate_limit_alert_threshold: float = 0.9,
        vip_match_ids: Optional[List[int]] = None
    ):
        """
        Initialize Twitter scraper.

        Args:
            bearer_token: Twitter API bearer token
            max_tweets_per_hour: Maximum API calls per hour (default: 1000)
            rate_limit_alert_threshold: Alert threshold (0.0-1.0, default: 0.9)
            vip_match_ids: List of VIP match IDs for degraded mode

        Raises:
            Exception: propagated from the authentication check when the
                token is rejected by the Twitter API.
        """
        self.bearer_token = bearer_token
        self.max_tweets_per_hour = max_tweets_per_hour
        self.rate_limit_alert_threshold = rate_limit_alert_threshold
        self.vip_match_ids = vip_match_ids or []
        # Degraded-mode flag: when True, only matches in vip_match_ids are scraped.
        self.vip_mode_only = False

        # Initialize Twitter API client
        self.client = tweepy.Client(bearer_token=bearer_token)

        # Rate limit tracking: local call counter, plus the API-reported
        # limit info when available (takes precedence in _check_rate_limit).
        self.api_calls_made = 0
        self.rate_limit_info: Optional[RateLimitInfo] = None

        # Verify authentication (fail fast on a bad token)
        self._verify_authentication()

    def _verify_authentication(self) -> None:
        """Verify Twitter API authentication, raising on failure.

        NOTE(review): get_me() requires user-context authentication; with an
        app-only bearer token this call may fail — confirm the token type
        used in production.
        """
        try:
            # Try to get user info to verify authentication
            response = self.client.get_me()
            if response.data:
                logger.info(f"✅ Twitter API authenticated successfully as @{response.data.username}")
            else:
                logger.warning("⚠️ Twitter API authentication returned no user data")
        except Exception as e:
            logger.error(f"❌ Twitter API authentication failed: {e}")
            raise

    def _check_rate_limit(self) -> bool:
        """
        Check rate limit status and handle alerts.

        Prefers the API-reported RateLimitInfo when present; otherwise
        estimates usage from the local call counter.

        Returns:
            True if API calls can be made, False otherwise
        """
        if self.rate_limit_info:
            usage = self.rate_limit_info.usage_percentage
        else:
            usage = self.api_calls_made / self.max_tweets_per_hour

        # Predictive alert once the configured threshold is crossed.
        if usage >= self.rate_limit_alert_threshold:
            logger.warning(
                f"⚠️ Rate limit approaching {usage * 100:.1f}% "
                f"({self.api_calls_made}/{self.max_tweets_per_hour} calls)"
            )
            # Check if limit reached
            if usage >= 1.0:
                logger.error(
                    f"❌ Rate limit reached ({self.api_calls_made}/{self.max_tweets_per_hour})"
                )
                return False
        return True

    def _wait_for_rate_limit_reset(self) -> None:
        """
        Block until the rate-limit window resets.

        Sleeps until the API-reported reset time (minimum 1 minute), or a
        full hour when no reset time is known, then clears the counters and
        leaves degraded mode.
        """
        if self.rate_limit_info and self.rate_limit_info.reset_time:
            now = datetime.now(timezone.utc)
            wait_seconds = (self.rate_limit_info.reset_time - now).total_seconds()
            wait_seconds = max(60, wait_seconds)  # Minimum 1 minute wait
        else:
            # Default to waiting 1 hour if no reset time available
            wait_seconds = 3600

        logger.info(f"⏳ Waiting {wait_seconds/60:.1f} minutes for rate limit reset...")
        time.sleep(wait_seconds)

        # Reset counters after waiting
        self.api_calls_made = 0
        self.vip_mode_only = False

    def _enable_vip_mode_only(self) -> None:
        """Enable VIP mode (degraded mode); idempotent, logs once."""
        if not self.vip_mode_only:
            self.vip_mode_only = True
            logger.warning(
                "⚠️ ENTERING DEGRADED MODE - VIP MATCHES ONLY\n"
                f"VIP match IDs: {self.vip_match_ids}"
            )

    def scrape_twitter_match(
        self,
        match_id: int,
        keywords: List[str],
        max_results: int = 100
    ) -> List[TweetData]:
        """
        Scrape tweets for a specific match using keywords.

        Args:
            match_id: Match identifier
            keywords: List of keywords to search (e.g., ["#MatchName", "team1 vs team2"])
            max_results: Maximum number of tweets to retrieve (default: 100).
                NOTE(review): the recent-search endpoint accepts 10-100 only;
                out-of-range values are rejected by the API.

        Returns:
            List of TweetData objects

        Raises:
            ValueError: If match is not VIP and VIP mode is active
            tweepy.TweepyException: For API errors
        """
        # Degraded mode: refuse non-VIP matches outright.
        if self.vip_mode_only and match_id not in self.vip_match_ids:
            logger.warning(
                f"⚠️ Skipping match {match_id} - Not in VIP list "
                f"(degraded mode active)"
            )
            raise ValueError(f"Match {match_id} is not VIP and degraded mode is active")

        # If the budget is exhausted, degrade to VIP-only and block until
        # the window resets (the wait clears the counters).
        if not self._check_rate_limit():
            self._enable_vip_mode_only()
            self._wait_for_rate_limit_reset()

        # Build search query: any keyword may match.
        query = " OR ".join(keywords)
        logger.info(f"🔍 Searching tweets for match {match_id}: '{query}'")

        try:
            # Count the call before issuing it so the counter stays correct
            # even if the request raises.
            self.api_calls_made += 1

            response = self.client.search_recent_tweets(
                query=query,
                max_results=max_results,
                tweet_fields=[
                    'created_at',
                    'public_metrics',
                    'text',
                    'author_id'
                ]
            )

            if not response.data:
                logger.info(f" No tweets found for match {match_id}")
                return []

            # Parse tweets into storage-ready records.
            tweets = []
            for tweet in response.data:
                # Default missing engagement counters to 0 so one malformed
                # tweet cannot abort the whole batch.
                metrics = tweet.public_metrics or {}
                tweets.append(TweetData(
                    # tweepy returns the ID as an int; TweetData (and the DB
                    # dedup check) expect a string.
                    tweet_id=str(tweet.id),
                    text=tweet.text,
                    created_at=tweet.created_at,
                    retweet_count=metrics.get('retweet_count', 0),
                    like_count=metrics.get('like_count', 0),
                    match_id=match_id,
                    source="twitter"
                ))

            logger.info(
                f"✅ Collected {len(tweets)} tweets for match {match_id} "
                f"({self.api_calls_made}/{self.max_tweets_per_hour} API calls)"
            )
            return tweets

        except tweepy.TooManyRequests:
            # Hard 429 from the API: degrade to VIP-only and wait it out.
            logger.error("❌ Rate limit exceeded during scraping")
            self._enable_vip_mode_only()
            self._wait_for_rate_limit_reset()
            return []
        except tweepy.TweepyException as e:
            logger.error(f"❌ Twitter API error: {e}")
            raise

    def save_tweets_to_db(self, tweets: List[TweetData], db: Session) -> None:
        """
        Save tweets to database, skipping tweets that already exist.

        Args:
            tweets: List of TweetData objects
            db: SQLAlchemy database session

        Raises:
            Exception: re-raised after rollback if the commit fails.
        """
        from app.models.tweet import Tweet

        # Fetch all already-stored IDs in ONE query instead of one existence
        # check per tweet (avoids N+1 round trips to the database).
        candidate_ids = [t.tweet_id for t in tweets]
        if candidate_ids:
            existing_ids = {
                row[0]
                for row in db.query(Tweet.tweet_id).filter(
                    Tweet.tweet_id.in_(candidate_ids)
                )
            }
        else:
            existing_ids = set()

        saved_count = 0
        for tweet_data in tweets:
            if tweet_data.tweet_id in existing_ids:
                logger.debug(f"Tweet {tweet_data.tweet_id} already exists, skipping")
                continue

            db.add(Tweet(
                tweet_id=tweet_data.tweet_id,
                text=tweet_data.text,
                created_at=tweet_data.created_at,
                retweet_count=tweet_data.retweet_count,
                like_count=tweet_data.like_count,
                match_id=tweet_data.match_id,
                source=tweet_data.source
            ))
            # Track the new ID so duplicates within this batch are also skipped.
            existing_ids.add(tweet_data.tweet_id)
            saved_count += 1

        # Commit all new rows atomically.
        try:
            db.commit()
            logger.info(f"✅ Saved {saved_count} new tweets to database")
        except Exception as e:
            db.rollback()
            logger.error(f"❌ Failed to save tweets to database: {e}")
            raise

    def scrape_and_save(
        self,
        match_id: int,
        keywords: List[str],
        db: Session,
        max_results: int = 100
    ) -> List[TweetData]:
        """
        Scrape tweets for a match and save to database.

        Args:
            match_id: Match identifier
            keywords: List of keywords to search
            db: SQLAlchemy database session
            max_results: Maximum number of tweets to retrieve

        Returns:
            List of TweetData objects (possibly empty; nothing is written
            to the database when no tweets were collected)
        """
        try:
            tweets = self.scrape_twitter_match(match_id, keywords, max_results)
            if tweets:
                self.save_tweets_to_db(tweets, db)
            return tweets
        except Exception as e:
            logger.error(f"❌ Failed to scrape and save tweets for match {match_id}: {e}")
            raise
def create_twitter_scraper(
    bearer_token: str,
    vip_match_ids: Optional[List[int]] = None,
    max_tweets_per_hour: int = 1000,
    rate_limit_alert_threshold: float = 0.9
) -> TwitterScraper:
    """
    Factory function to create a Twitter scraper instance.

    Args:
        bearer_token: Twitter API bearer token
        vip_match_ids: Optional list of VIP match IDs
        max_tweets_per_hour: API call budget per hour (default: 1000)
        rate_limit_alert_threshold: Usage fraction (0.0-1.0) at which
            predictive alerts are logged (default: 0.9)

    Returns:
        Configured TwitterScraper instance
    """
    # The former hard-coded limits are now overridable keyword arguments
    # (backward compatible), so a config/env layer can tune them without
    # editing this module.
    return TwitterScraper(
        bearer_token=bearer_token,
        max_tweets_per_hour=max_tweets_per_hour,
        rate_limit_alert_threshold=rate_limit_alert_threshold,
        vip_match_ids=vip_match_ids or []
    )