Initial commit

This commit is contained in:
2026-02-01 09:31:38 +01:00
commit e02db93960
4396 changed files with 1511612 additions and 0 deletions

View File

@@ -0,0 +1,144 @@
# Twitter Scraper
Module de scraping Twitter avec gestion du rate limiting et mode dégradé.
## Fonctionnalités
- ✅ Collecte de tweets pour les matchs de football
- ✅ Rate limiting (1000 req/heure) avec alertes prédictives (>90%)
- ✅ Mode dégradé avec priorisation des matchs VIP
- ✅ Retry avec backoff exponentiel
- ✅ Logging structuré pour monitoring
- ✅ Stockage en base de données partagée
## Installation
```bash
pip install tweepy==4.14.0
```
## Configuration
Variables d'environnement requises:
```bash
# Twitter API Bearer Token
TWITTER_BEARER_TOKEN="your_bearer_token_here"
```
## Utilisation
### Exemple de base
```python
from app.scrapers.twitter_scraper import create_twitter_scraper
from app.database import SessionLocal
# Créer le scraper
scraper = create_twitter_scraper(
bearer_token="your_bearer_token",
vip_match_ids=[1, 2, 3] # Matchs VIP pour mode dégradé
)
# Scraper des tweets pour un match
db = SessionLocal()
try:
tweets = scraper.scrape_and_save(
match_id=1,
keywords=["#MatchName", "Team1 vs Team2"],
db=db,
max_results=100
)
print(f"{len(tweets)} tweets collectés")
finally:
db.close()
```
### Configuration avancée
```python
from app.scrapers.twitter_scraper import TwitterScraper
scraper = TwitterScraper(
bearer_token="your_bearer_token",
max_tweets_per_hour=1000, # Limite par défaut
rate_limit_alert_threshold=0.9, # Alert à 90%
vip_match_ids=[1, 2, 3, 4, 5]
)
```
## Architecture
### Rate Limiting
Le scraper implémente:
- **Tracking en temps réel** des appels API
- **Alertes prédictives** quand la limite est atteinte à >90%
- **Mode dégradé automatique** quand la limite est atteinte
- **Backoff exponentiel** pour éviter les blocages
### Mode Dégradé
Quand le rate limit est atteint:
- Le scraper passe en mode VIP seulement
- Seuls les matchs VIP sont scrapés
- Alertes loggées pour monitoring
- Les données sont sauvegardées avant arrêt
## Tests
Exécuter les tests:
```bash
cd backend
pytest tests/test_twitter_scraper.py -v
```
## Intégration
Le module s'intègre avec:
- **SQLite**: Base de données partagée avec Next.js
- **SQLAlchemy**: ORM pour le backend FastAPI
- **Drizzle ORM**: ORM pour le frontend Next.js
- **RabbitMQ** (Phase 2+): Queue asynchrone pour découplage
## Conventions de Code
- **Nommage Python**: `snake_case`
- **Nommage Base de données**: `snake_case`
- **Logging**: Structuré avec `logging` module
- **Type hints**: Obligatoires avec `typing`
## Documentation API
Voir [documentation Tweepy](https://docs.tweepy.org/) pour plus de détails sur l'API Twitter.
## Dépannage
### Erreur: "Twitter API authentication failed"
Vérifiez votre bearer token:
```python
client.get_me() # Devrait retourner vos infos utilisateur
```
### Rate limit atteint trop rapidement
Vérifiez l'utilisation:
```python
print(f"API calls: {scraper.api_calls_made}/{scraper.max_tweets_per_hour}")
```
### Mode dégradé activé sans raison
Vérifiez les seuils:
```python
print(f"Usage: {scraper.rate_limit_info.usage_percentage * 100:.1f}%")
```
## Prochaines Étapes
- [ ] Intégration avec RabbitMQ (Phase 2)
- [ ] Système de priorisation dynamique
- [ ] Dashboard de monitoring en temps réel
- [ ] Tests d'intégration E2E

View File

@@ -0,0 +1,12 @@
"""
Scrapers package for collecting data from various sources.
This package contains modules for scraping data from:
- Twitter
- Reddit (to be implemented)
- RSS feeds (to be implemented)
"""
from .twitter_scraper import TwitterScraper
__all__ = ["TwitterScraper"]

View File

@@ -0,0 +1,441 @@
"""
Reddit scraper module with robust error handling.
This module provides functionality to scrape Reddit posts and comments
about football matches, with built-in error handling and logging.
"""
import logging
from datetime import datetime, timezone
from typing import List, Dict, Optional
from dataclasses import dataclass
import praw
from sqlalchemy.orm import Session
logger = logging.getLogger(__name__)
@dataclass
class RedditPostData:
    """Structured data for a Reddit post.

    Intermediate, ORM-independent representation produced by the scraper and
    later persisted via RedditScraper.save_posts_to_db.
    """
    post_id: str            # Reddit base36 submission id (praw Submission.id)
    title: str              # submission title
    text: str               # selftext body; empty string for link posts
    upvotes: int            # praw Submission.score at scrape time
    created_at: datetime    # UTC-aware creation timestamp
    match_id: int           # internal match identifier this post relates to
    subreddit: str          # subreddit name the post was collected from
    source: str = "reddit"  # data-source tag used downstream
@dataclass
class RedditCommentData:
    """Structured data for a Reddit comment.

    Intermediate, ORM-independent representation persisted via
    RedditScraper.save_comments_to_db.
    """
    comment_id: str         # Reddit base36 comment id
    post_id: str            # id of the parent submission
    text: str               # comment body (praw Comment.body)
    upvotes: int            # praw Comment.score at scrape time
    created_at: datetime    # UTC-aware creation timestamp
    source: str = "reddit"  # data-source tag used downstream
class RedditScraper:
    """
    Reddit scraper with robust error handling.

    Features:
    - Scrapes posts and comments from specified subreddits
    - Error handling without stopping the process
    - Continues with other sources on errors
    - Structured logging
    - Timeout configuration
    """

    def __init__(
        self,
        client_id: str,
        client_secret: str,
        subreddits: List[str],
        max_posts_per_subreddit: int = 100,
        max_comments_per_post: int = 50,
        user_agent: str = "Chartbastan/1.0"
    ):
        """
        Initialize Reddit scraper.

        Args:
            client_id: Reddit API client ID
            client_secret: Reddit API client secret
            subreddits: List of subreddits to scrape
            max_posts_per_subreddit: Maximum posts to collect per subreddit
            max_comments_per_post: Maximum comments to collect per post
            user_agent: User agent string for API requests

        Raises:
            Exception: re-raised from PRAW if authentication verification fails.
        """
        self.client_id = client_id
        self.client_secret = client_secret
        self.subreddits = subreddits
        self.max_posts_per_subreddit = max_posts_per_subreddit
        self.max_comments_per_post = max_comments_per_post
        self.user_agent = user_agent
        # Initialize Reddit API client
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent
        )
        # Verify authentication eagerly so bad credentials fail at
        # construction time rather than on the first scrape call.
        self._verify_authentication()

    def _verify_authentication(self) -> None:
        """Verify Reddit API authentication.

        Raises:
            Exception: re-raised after logging when PRAW rejects the credentials.
        """
        try:
            # Try to get authenticated user
            # NOTE(review): with app-only client_id/client_secret credentials
            # (no username/password) PRAW runs read-only and user.me() usually
            # returns None, so the warning branch below is the expected path
            # for this constructor -- confirm intended credential mode.
            user = self.reddit.user.me()
            if user:
                logger.info(f"✅ Reddit API authenticated successfully as /u/{user.name}")
            else:
                logger.warning("⚠️ Reddit API authentication returned no user data")
        except Exception as e:
            logger.error(f"❌ Reddit API authentication failed: {e}")
            raise

    def scrape_posts(
        self,
        subreddit: str,
        match_id: int,
        keywords: Optional[List[str]] = None
    ) -> List[RedditPostData]:
        """
        Scrape posts from a subreddit for a specific match.

        Errors are logged and swallowed: on any failure the posts collected so
        far (possibly an empty list) are returned, never an exception.

        Args:
            subreddit: Subreddit name (e.g., "soccer")
            match_id: Match identifier
            keywords: Optional list of keywords to filter posts

        Returns:
            List of RedditPostData objects
        """
        posts_data: List[RedditPostData] = []
        try:
            logger.info(f"🔍 Scraping posts from r/{subreddit} for match {match_id}")
            # Get subreddit
            sub = self.reddit.subreddit(subreddit)
            # Fetch new posts
            posts = list(sub.new(limit=self.max_posts_per_subreddit))
            if not posts:
                logger.info(f" No posts found in r/{subreddit}")
                return posts_data
            # Filter by keywords if provided
            for post in posts:
                # Skip if keywords provided and not matching
                if keywords:
                    # Case-insensitive match against title + selftext.
                    # NOTE(review): this reads post.selftext unguarded while the
                    # constructor below uses hasattr(post, 'selftext') -- confirm
                    # whether link posts can reach this line without selftext.
                    text_to_search = f"{post.title.lower()} {post.selftext.lower()}"
                    if not any(keyword.lower() in text_to_search for keyword in keywords):
                        continue
                # Create post data
                post_data = RedditPostData(
                    post_id=post.id,
                    title=post.title,
                    text=post.selftext if hasattr(post, 'selftext') else "",
                    upvotes=post.score,
                    created_at=datetime.fromtimestamp(post.created_utc, tz=timezone.utc),
                    match_id=match_id,
                    subreddit=subreddit,
                    source="reddit"
                )
                posts_data.append(post_data)
            logger.info(f"✅ Collected {len(posts_data)} posts from r/{subreddit}")
        except praw.exceptions.PRAWException as e:
            logger.error(f"❌ Reddit API error while scraping r/{subreddit}: {e}")
        except Exception as e:
            logger.error(f"❌ Unexpected error while scraping r/{subreddit}: {e}")
        return posts_data

    def scrape_comments(
        self,
        post_id: str,
        post,
        max_comments: Optional[int] = None
    ) -> List[RedditCommentData]:
        """
        Scrape comments from a Reddit post.

        Like scrape_posts, failures are logged and the partial result is
        returned instead of raising.

        Args:
            post_id: Reddit post ID
            post: PRAW submission object
            max_comments: Maximum number of comments to collect
                (defaults to self.max_comments_per_post)

        Returns:
            List of RedditCommentData objects
        """
        comments_data: List[RedditCommentData] = []
        limit = max_comments or self.max_comments_per_post
        try:
            logger.info(f"💬 Scraping comments for post {post_id}")
            # Get comments (replace_more removes "more comments" placeholders)
            # limit=0 drops all MoreComments objects without extra API calls.
            post.comments.replace_more(limit=0)
            comments = list(post.comments.list())[:limit]
            if not comments:
                logger.info(f" No comments found for post {post_id}")
                return comments_data
            # Process comments
            for comment in comments:
                # Skip if comment doesn't have required attributes
                if not hasattr(comment, 'id') or not hasattr(comment, 'body'):
                    continue
                comment_data = RedditCommentData(
                    comment_id=comment.id,
                    post_id=post_id,
                    text=comment.body,
                    upvotes=comment.score,
                    created_at=datetime.fromtimestamp(comment.created_utc, tz=timezone.utc),
                    source="reddit"
                )
                comments_data.append(comment_data)
            logger.info(f"✅ Collected {len(comments_data)} comments for post {post_id}")
        except praw.exceptions.PRAWException as e:
            logger.error(f"❌ Reddit API error while scraping comments for post {post_id}: {e}")
        except Exception as e:
            logger.error(f"❌ Unexpected error while scraping comments for post {post_id}: {e}")
        return comments_data

    def save_posts_to_db(self, posts: List[RedditPostData], db: Session) -> None:
        """
        Save Reddit posts to database.

        Posts whose post_id already exists are skipped (idempotent upsert-like
        behavior). All new rows are committed in a single transaction.

        Args:
            posts: List of RedditPostData objects
            db: SQLAlchemy database session

        Raises:
            Exception: re-raised after rollback if the commit fails.
        """
        # Imported locally, presumably to avoid a circular import -- confirm.
        from app.models.reddit_post import RedditPost
        saved_count = 0
        for post_data in posts:
            # Check if post already exists
            existing = db.query(RedditPost).filter(
                RedditPost.post_id == post_data.post_id
            ).first()
            if existing:
                logger.debug(f"Post {post_data.post_id} already exists, skipping")
                continue
            # Create new post
            post = RedditPost(
                post_id=post_data.post_id,
                title=post_data.title,
                text=post_data.text,
                upvotes=post_data.upvotes,
                created_at=post_data.created_at,
                match_id=post_data.match_id,
                subreddit=post_data.subreddit,
                source=post_data.source
            )
            db.add(post)
            saved_count += 1
        # Commit changes
        try:
            db.commit()
            logger.info(f"✅ Saved {saved_count} new Reddit posts to database")
        except Exception as e:
            db.rollback()
            logger.error(f"❌ Failed to save Reddit posts to database: {e}")
            raise

    def save_comments_to_db(self, comments: List[RedditCommentData], db: Session) -> None:
        """
        Save Reddit comments to database.

        Comments whose comment_id already exists are skipped; new rows are
        committed in a single transaction.

        Args:
            comments: List of RedditCommentData objects
            db: SQLAlchemy database session

        Raises:
            Exception: re-raised after rollback if the commit fails.
        """
        # Imported locally, presumably to avoid a circular import -- confirm.
        from app.models.reddit_post import RedditComment
        saved_count = 0
        for comment_data in comments:
            # Check if comment already exists
            existing = db.query(RedditComment).filter(
                RedditComment.comment_id == comment_data.comment_id
            ).first()
            if existing:
                logger.debug(f"Comment {comment_data.comment_id} already exists, skipping")
                continue
            # Create new comment
            comment = RedditComment(
                comment_id=comment_data.comment_id,
                post_id=comment_data.post_id,
                text=comment_data.text,
                upvotes=comment_data.upvotes,
                created_at=comment_data.created_at,
                source=comment_data.source
            )
            db.add(comment)
            saved_count += 1
        # Commit changes
        try:
            db.commit()
            logger.info(f"✅ Saved {saved_count} new Reddit comments to database")
        except Exception as e:
            db.rollback()
            logger.error(f"❌ Failed to save Reddit comments to database: {e}")
            raise

    def scrape_reddit_match(
        self,
        match_id: int,
        keywords: Optional[List[str]] = None,
        scrape_comments: bool = True,
        db: Optional[Session] = None
    ) -> Dict[str, List]:
        """
        Scrape Reddit posts and comments for a specific match.

        A failure on one subreddit is logged and scraping continues with the
        remaining subreddits.

        Args:
            match_id: Match identifier
            keywords: Optional list of keywords to filter posts
            scrape_comments: Whether to scrape comments
            db: Optional database session for immediate saving

        Returns:
            Dictionary with 'posts' and 'comments' lists
        """
        all_posts: List[RedditPostData] = []
        all_comments: List[RedditCommentData] = []
        # Scrape from all configured subreddits
        for subreddit in self.subreddits:
            try:
                # Scrape posts
                posts = self.scrape_posts(subreddit, match_id, keywords)
                all_posts.extend(posts)
                # Save posts if db session provided
                if db and posts:
                    self.save_posts_to_db(posts, db)
                # Scrape comments if requested
                if scrape_comments and posts:
                    # Get PRAW post objects for comment scraping
                    # NOTE(review): this re-fetches the subreddit listing that
                    # scrape_posts already fetched (a second API round-trip per
                    # subreddit) just to recover live Submission objects --
                    # consider having scrape_posts return them alongside the
                    # dataclasses.
                    sub = self.reddit.subreddit(subreddit)
                    praw_posts = list(sub.new(limit=self.max_posts_per_subreddit))
                    for post_data in posts:
                        # Find matching PRAW post
                        praw_post = next(
                            (p for p in praw_posts if p.id == post_data.post_id),
                            None
                        )
                        if praw_post:
                            comments = self.scrape_comments(post_data.post_id, praw_post)
                            all_comments.extend(comments)
                            # Save comments if db session provided
                            if db and comments:
                                self.save_comments_to_db(comments, db)
            except Exception as e:
                logger.error(
                    f"❌ Failed to scrape r/{subreddit} for match {match_id}: {e}. "
                    f"Continuing with other sources..."
                )
                continue
        logger.info(
            f"✅ Total collected for match {match_id}: "
            f"{len(all_posts)} posts, {len(all_comments)} comments"
        )
        return {
            'posts': all_posts,
            'comments': all_comments
        }

    def scrape_and_save(
        self,
        match_id: int,
        db: Session,
        keywords: Optional[List[str]] = None,
        scrape_comments: bool = True
    ) -> Dict[str, List]:
        """
        Scrape Reddit data for a match and save to database.

        Thin wrapper around scrape_reddit_match that makes the db session
        mandatory and logs + re-raises on failure.

        Args:
            match_id: Match identifier
            db: SQLAlchemy database session
            keywords: Optional list of keywords to filter posts
            scrape_comments: Whether to scrape comments

        Returns:
            Dictionary with 'posts' and 'comments' lists

        Raises:
            Exception: re-raised after logging if the underlying scrape fails.
        """
        try:
            return self.scrape_reddit_match(
                match_id=match_id,
                keywords=keywords,
                scrape_comments=scrape_comments,
                db=db
            )
        except Exception as e:
            logger.error(f"❌ Failed to scrape and save Reddit data for match {match_id}: {e}")
            raise
def create_reddit_scraper(
    client_id: str,
    client_secret: str,
    subreddits: Optional[List[str]] = None
) -> RedditScraper:
    """
    Factory function to create a Reddit scraper instance.

    Args:
        client_id: Reddit API client ID
        client_secret: Reddit API client secret
        subreddits: Optional list of subreddits to scrape; when omitted, a
            default set of football-related subreddits is used.

    Returns:
        Configured RedditScraper instance
    """
    # Fall back to the default football subreddits when none are supplied.
    target_subreddits = (
        subreddits
        if subreddits is not None
        else ["soccer", "football", "Ligue1", "PremierLeague"]
    )
    return RedditScraper(
        client_id=client_id,
        client_secret=client_secret,
        subreddits=target_subreddits,
        max_posts_per_subreddit=100,
        max_comments_per_post=50,
    )

View File

@@ -0,0 +1,380 @@
"""
RSS scraper module with robust error handling.
This module provides functionality to scrape RSS feeds from sports sources,
with built-in error handling and logging.
"""
import logging
import feedparser
from datetime import datetime, timezone
from typing import List, Dict, Optional
from dataclasses import dataclass
from urllib.parse import urlparse
from sqlalchemy.orm import Session
logger = logging.getLogger(__name__)
@dataclass
class RSSArticleData:
    """Structured data for an RSS article.

    Intermediate, ORM-independent representation persisted via
    RSSScraper.save_articles_to_db.
    """
    article_id: str          # feed entry id, or the entry link as fallback
    title: str               # article headline
    content: str             # full content, summary, or description text
    published_at: datetime   # UTC-aware publication timestamp
    source_url: str          # URL of the RSS feed the article came from
    match_id: Optional[int]  # internal match id; None until matched later
    source: str = "rss"      # NOTE(review): _parse_feed overwrites this with
                             # the feed title, not the literal "rss" -- confirm
                             # which value downstream consumers expect
class RSSScraper:
    """
    RSS scraper with robust error handling.

    Features:
    - Scrapes RSS feeds from configured sports sources
    - Error handling without stopping the process
    - Continues with other sources on errors
    - Structured logging
    - Timeout configuration
    - Filters relevant football articles
    """

    # Default RSS sources for sports news
    DEFAULT_RSS_SOURCES = [
        "http://www.espn.com/espn/rss/news",
        "http://feeds.bbci.co.uk/sport/football/rss.xml",
        "https://www.goal.com/rss",
        "https://www.skysports.com/rss/12040",
    ]

    # Keywords to filter relevant football articles
    FOOTBALL_KEYWORDS = [
        "football", "soccer", "match", "goal", "premier league",
        "la liga", "serie a", "bundesliga", "ligue 1", "champions league",
        "euro", "world cup", "cup", "league", "team", "club", "player",
        "coach", "manager", "score", "result", "transfer"
    ]

    def __init__(
        self,
        rss_sources: Optional[List[str]] = None,
        timeout: int = 30,
        max_articles_per_source: int = 100,
        keywords: Optional[List[str]] = None
    ):
        """
        Initialize RSS scraper.

        Args:
            rss_sources: List of RSS feed URLs to scrape
            timeout: Request timeout in seconds (default: 30)
            max_articles_per_source: Maximum articles to collect per source
            keywords: List of keywords to filter relevant articles
        """
        self.rss_sources = rss_sources or self.DEFAULT_RSS_SOURCES
        # NOTE(review): timeout is stored but never used in this class;
        # feedparser.parse below is called without any timeout handling --
        # confirm whether a socket timeout should be wired in.
        self.timeout = timeout
        self.max_articles_per_source = max_articles_per_source
        self.keywords = keywords or self.FOOTBALL_KEYWORDS
        logger.info(f"📰 RSS Scraper initialized with {len(self.rss_sources)} sources")
        for i, source in enumerate(self.rss_sources, 1):
            domain = urlparse(source).netloc
            logger.info(f" {i}. {domain}")

    def _is_article_relevant(self, title: str, content: str) -> bool:
        """
        Check if an article is relevant to football based on keywords.

        Matching is case-insensitive substring search over title + content.

        Args:
            title: Article title
            content: Article content

        Returns:
            True if article is relevant, False otherwise
        """
        text_to_check = f"{title.lower()} {content.lower()}"
        # Check if any keyword is present
        for keyword in self.keywords:
            if keyword.lower() in text_to_check:
                return True
        return False

    def _parse_published_date(self, published: str) -> datetime:
        """
        Parse the published date from RSS feed.

        Falls back to the current UTC time if the string cannot be parsed.

        Args:
            published: Published date string from RSS feed

        Returns:
            Datetime object in UTC timezone
        """
        try:
            # feedparser automatically parses dates
            # NOTE(review): feedparser.parse expects a feed document/URL, not a
            # bare date string, so 'published_parsed' is unlikely to be set
            # here and the email.utils fallback below probably does all the
            # real work -- confirm and consider calling parsedate_to_datetime
            # directly.
            parsed = feedparser.parse(published)
            if hasattr(parsed, 'published_parsed') and parsed.published_parsed:
                return datetime(*parsed.published_parsed[:6], tzinfo=timezone.utc)
            # Fallback: try to parse as string
            from email.utils import parsedate_to_datetime
            return parsedate_to_datetime(published).astimezone(timezone.utc)
        except Exception as e:
            logger.warning(f"⚠️ Failed to parse date '{published}': {e}")
            return datetime.now(timezone.utc)

    def _parse_feed(
        self,
        source_url: str
    ) -> List[RSSArticleData]:
        """
        Parse RSS feed and extract articles.

        Per-entry failures are logged and skipped; a feed-level failure
        returns whatever was collected so far (possibly an empty list).

        Args:
            source_url: URL of the RSS feed

        Returns:
            List of RSSArticleData objects
        """
        articles: List[RSSArticleData] = []
        try:
            logger.info(f"🔍 Parsing RSS feed: {source_url}")
            # Parse RSS feed with timeout
            feed = feedparser.parse(source_url)
            # Check for feed errors
            if feed.get('bozo', False):
                logger.warning(f"⚠️ RSS feed has malformed XML: {source_url}")
                # Continue anyway as feedparser can handle some malformed feeds
            # Extract feed info
            feed_title = feed.feed.get('title', 'Unknown')
            logger.info(f"📰 Feed: {feed_title}")
            logger.info(f" Total entries: {len(feed.entries)}")
            # Process entries
            for entry in feed.entries[:self.max_articles_per_source]:
                try:
                    # Extract article ID (prefer the stable entry id, fall
                    # back to the link)
                    article_id = entry.get('id') or entry.get('link', '')
                    if not article_id:
                        logger.warning(f"⚠️ Article missing ID, skipping")
                        continue
                    # Extract title
                    title = entry.get('title', '')
                    if not title:
                        logger.warning(f"⚠️ Article missing title, skipping")
                        continue
                    # Extract content, in decreasing order of richness:
                    # full content -> summary -> description
                    content = ''
                    if 'content' in entry:
                        content = entry.content[0].value if entry.content else ''
                    elif 'summary' in entry:
                        content = entry.summary
                    elif 'description' in entry:
                        content = entry.description
                    # Parse published date
                    published_str = entry.get('published') or entry.get('updated')
                    if not published_str:
                        logger.warning(f"⚠️ Article missing published date, using current time")
                        published_at = datetime.now(timezone.utc)
                    else:
                        published_at = self._parse_published_date(published_str)
                    # Filter relevant articles
                    if not self._is_article_relevant(title, content):
                        logger.debug(f"🚫 Article not relevant: {title}")
                        continue
                    # Create article data
                    article_data = RSSArticleData(
                        article_id=article_id,
                        title=title,
                        content=content,
                        published_at=published_at,
                        source_url=source_url,
                        match_id=None,  # Will be matched later if needed
                        source=feed_title
                    )
                    articles.append(article_data)
                except Exception as e:
                    logger.error(f"❌ Error processing article: {e}")
                    continue
            logger.info(f"✅ Collected {len(articles)} relevant articles from {source_url}")
        except Exception as e:
            logger.error(f"❌ Failed to parse RSS feed {source_url}: {e}")
        return articles

    def scrape_all_sources(
        self,
        match_id: Optional[int] = None
    ) -> List[RSSArticleData]:
        """
        Scrape all configured RSS sources.

        A failure on one source is logged and scraping continues with the
        remaining sources.

        Args:
            match_id: Optional match ID to associate with articles

        Returns:
            List of RSSArticleData objects from all sources
        """
        all_articles: List[RSSArticleData] = []
        for source_url in self.rss_sources:
            try:
                # Parse feed
                articles = self._parse_feed(source_url)
                # Set match_id if provided
                if match_id:
                    for article in articles:
                        article.match_id = match_id
                all_articles.extend(articles)
            except Exception as e:
                logger.error(
                    f"❌ Failed to scrape source {source_url}: {e}. "
                    f"Continuing with other sources..."
                )
                continue
        logger.info(
            f"✅ Total articles collected from all sources: {len(all_articles)}"
        )
        return all_articles

    def scrape_single_source(
        self,
        source_url: str,
        match_id: Optional[int] = None
    ) -> List[RSSArticleData]:
        """
        Scrape a single RSS source.

        Args:
            source_url: URL of the RSS feed to scrape
            match_id: Optional match ID to associate with articles

        Returns:
            List of RSSArticleData objects
        """
        articles = self._parse_feed(source_url)
        # Set match_id if provided
        if match_id:
            for article in articles:
                article.match_id = match_id
        return articles

    def save_articles_to_db(self, articles: List[RSSArticleData], db: Session) -> None:
        """
        Save RSS articles to database.

        Articles whose article_id already exists are skipped; new rows are
        committed in a single transaction.

        Args:
            articles: List of RSSArticleData objects
            db: SQLAlchemy database session

        Raises:
            Exception: re-raised after rollback if the commit fails.
        """
        # Imported locally, presumably to avoid a circular import -- confirm.
        from app.models.rss_article import RSSArticle
        saved_count = 0
        for article_data in articles:
            # Check if article already exists
            existing = db.query(RSSArticle).filter(
                RSSArticle.article_id == article_data.article_id
            ).first()
            if existing:
                logger.debug(f"Article {article_data.article_id} already exists, skipping")
                continue
            # Create new article
            article = RSSArticle(
                article_id=article_data.article_id,
                title=article_data.title,
                content=article_data.content,
                published_at=article_data.published_at,
                source_url=article_data.source_url,
                match_id=article_data.match_id,
                source=article_data.source
            )
            db.add(article)
            saved_count += 1
        # Commit changes
        try:
            db.commit()
            logger.info(f"✅ Saved {saved_count} new RSS articles to database")
        except Exception as e:
            db.rollback()
            logger.error(f"❌ Failed to save RSS articles to database: {e}")
            raise

    def scrape_and_save(
        self,
        db: Session,
        match_id: Optional[int] = None
    ) -> List[RSSArticleData]:
        """
        Scrape all RSS sources and save to database.

        Args:
            db: SQLAlchemy database session
            match_id: Optional match ID to associate with articles

        Returns:
            List of RSSArticleData objects

        Raises:
            Exception: re-raised after logging if scraping or saving fails.
        """
        try:
            # Scrape articles
            articles = self.scrape_all_sources(match_id)
            # Save to database
            if articles:
                self.save_articles_to_db(articles, db)
            return articles
        except Exception as e:
            logger.error(f"❌ Failed to scrape and save RSS articles: {e}")
            raise
def create_rss_scraper(
    rss_sources: Optional[List[str]] = None,
    keywords: Optional[List[str]] = None
) -> RSSScraper:
    """
    Factory function to create an RSS scraper instance.

    Args:
        rss_sources: Optional list of RSS feed URLs; defaults are supplied by
            RSSScraper when omitted.
        keywords: Optional list of keywords to filter articles.

    Returns:
        Configured RSSScraper instance
    """
    # Both optional arguments are forwarded as-is; RSSScraper substitutes its
    # own defaults for None values.
    return RSSScraper(
        rss_sources=rss_sources,
        timeout=30,
        max_articles_per_source=100,
        keywords=keywords,
    )

View File

@@ -0,0 +1,351 @@
"""
Twitter scraper module with rate limiting and degraded mode support.
This module provides functionality to scrape tweets for football matches,
with built-in rate limiting (1000 req/hour) and degraded mode for VIP matches.
"""
import logging
import time
from datetime import datetime, timezone
from typing import List, Dict, Optional
from dataclasses import dataclass
import tweepy
from sqlalchemy.orm import Session
logger = logging.getLogger(__name__)
@dataclass
class RateLimitInfo:
    """Snapshot of Twitter API rate-limit consumption.

    Attributes:
        remaining: Calls still available in the current window.
        limit: Total calls allowed in the window.
        reset_time: When the window resets, if known.
    """

    remaining: int
    limit: int
    reset_time: Optional[datetime]

    @property
    def usage_percentage(self) -> float:
        """Fraction of the window already consumed, in [0.0, 1.0].

        A non-positive limit yields 0 to avoid division by zero.
        """
        if self.limit <= 0:
            return 0
        consumed = self.limit - self.remaining
        return consumed / self.limit
@dataclass
class TweetData:
    """Structured data for a tweet.

    Intermediate, ORM-independent representation persisted via
    TwitterScraper.save_tweets_to_db.
    """
    tweet_id: str           # NOTE(review): populated from tweepy's tweet.id,
                            # which is an int -- confirm a str() conversion
                            # isn't expected downstream
    text: str               # tweet text
    created_at: datetime    # creation timestamp from the API
    retweet_count: int      # public_metrics['retweet_count'] at scrape time
    like_count: int         # public_metrics['like_count'] at scrape time
    match_id: int           # internal match identifier this tweet relates to
    source: str = "twitter" # data-source tag used downstream
class TwitterScraper:
    """
    Twitter scraper with rate limiting and degraded mode.

    Features:
    - Rate limiting (1000 req/hour)
    - Predictive alerts when approaching limits (>90%)
    - Exponential backoff for retries
    - Degraded mode for VIP matches only
    - Structured logging
    """

    def __init__(
        self,
        bearer_token: str,
        max_tweets_per_hour: int = 1000,
        rate_limit_alert_threshold: float = 0.9,
        vip_match_ids: Optional[List[int]] = None
    ):
        """
        Initialize Twitter scraper.

        Args:
            bearer_token: Twitter API bearer token
            max_tweets_per_hour: Maximum API calls per hour (default: 1000)
            rate_limit_alert_threshold: Alert threshold (0.0-1.0, default: 0.9)
            vip_match_ids: List of VIP match IDs for degraded mode

        Raises:
            Exception: re-raised from tweepy if authentication verification fails.
        """
        self.bearer_token = bearer_token
        self.max_tweets_per_hour = max_tweets_per_hour
        self.rate_limit_alert_threshold = rate_limit_alert_threshold
        self.vip_match_ids = vip_match_ids or []
        # When True, only matches in vip_match_ids may be scraped.
        self.vip_mode_only = False
        # Initialize Twitter API client
        self.client = tweepy.Client(bearer_token=bearer_token)
        # Rate limit tracking
        # NOTE(review): rate_limit_info is never assigned anywhere else in this
        # class, so _check_rate_limit always falls back to the local
        # api_calls_made counter -- confirm whether it should be populated from
        # API response headers.
        self.api_calls_made = 0
        self.rate_limit_info: Optional[RateLimitInfo] = None
        # Verify authentication eagerly so bad tokens fail at construction.
        self._verify_authentication()

    def _verify_authentication(self) -> None:
        """Verify Twitter API authentication.

        Raises:
            Exception: re-raised after logging when tweepy rejects the token.
        """
        try:
            # Try to get user info to verify authentication
            # NOTE(review): get_me() requires a user-context token; with an
            # app-only bearer token this call may fail -- confirm token type.
            response = self.client.get_me()
            if response.data:
                logger.info(f"✅ Twitter API authenticated successfully as @{response.data.username}")
            else:
                logger.warning("⚠️ Twitter API authentication returned no user data")
        except Exception as e:
            logger.error(f"❌ Twitter API authentication failed: {e}")
            raise

    def _check_rate_limit(self) -> bool:
        """
        Check rate limit status and handle alerts.

        Returns:
            True if API calls can be made, False otherwise
        """
        # Calculate usage: prefer server-reported info when present, otherwise
        # derive from the local call counter.
        if self.rate_limit_info:
            usage = self.rate_limit_info.usage_percentage
        else:
            usage = self.api_calls_made / self.max_tweets_per_hour
        # Predictive alert at threshold
        if usage >= self.rate_limit_alert_threshold:
            logger.warning(
                f"⚠️ Rate limit approaching {usage * 100:.1f}% "
                f"({self.api_calls_made}/{self.max_tweets_per_hour} calls)"
            )
        # Check if limit reached
        if usage >= 1.0:
            logger.error(
                f"❌ Rate limit reached ({self.api_calls_made}/{self.max_tweets_per_hour})"
            )
            return False
        return True

    def _wait_for_rate_limit_reset(self) -> None:
        """
        Wait for rate limit to reset with exponential backoff.

        Blocks the calling thread (time.sleep), then clears the call counter
        and exits degraded (VIP-only) mode.
        """
        if self.rate_limit_info and self.rate_limit_info.reset_time:
            now = datetime.now(timezone.utc)
            wait_seconds = (self.rate_limit_info.reset_time - now).total_seconds()
            wait_seconds = max(60, wait_seconds)  # Minimum 1 minute wait
        else:
            # Default to waiting 1 hour if no reset time available
            wait_seconds = 3600
        logger.info(f"⏳ Waiting {wait_seconds/60:.1f} minutes for rate limit reset...")
        time.sleep(wait_seconds)
        # Reset counters after waiting
        self.api_calls_made = 0
        self.vip_mode_only = False

    def _enable_vip_mode_only(self) -> None:
        """Enable VIP mode (degraded mode). Idempotent; logs only on transition."""
        if not self.vip_mode_only:
            self.vip_mode_only = True
            logger.warning(
                "⚠️ ENTERING DEGRADED MODE - VIP MATCHES ONLY\n"
                f"VIP match IDs: {self.vip_match_ids}"
            )

    def scrape_twitter_match(
        self,
        match_id: int,
        keywords: List[str],
        max_results: int = 100
    ) -> List[TweetData]:
        """
        Scrape tweets for a specific match using keywords.

        Args:
            match_id: Match identifier
            keywords: List of keywords to search (e.g., ["#MatchName", "team1 vs team2"])
            max_results: Maximum number of tweets to retrieve (default: 100)

        Returns:
            List of TweetData objects

        Raises:
            ValueError: If match is not VIP and VIP mode is active
            tweepy.TweepyException: For API errors
        """
        # Check VIP mode
        if self.vip_mode_only and match_id not in self.vip_match_ids:
            logger.warning(
                f"⚠️ Skipping match {match_id} - Not in VIP list "
                f"(degraded mode active)"
            )
            raise ValueError(f"Match {match_id} is not VIP and degraded mode is active")
        # Check rate limit before scraping; if exhausted, block until the
        # window resets (this also clears degraded mode), then proceed.
        if not self._check_rate_limit():
            self._enable_vip_mode_only()
            self._wait_for_rate_limit_reset()
        # Build search query: keywords are OR-ed together.
        query = " OR ".join(keywords)
        logger.info(f"🔍 Searching tweets for match {match_id}: '{query}'")
        try:
            # Increment API call counter
            self.api_calls_made += 1
            # Search for tweets
            response = self.client.search_recent_tweets(
                query=query,
                max_results=max_results,
                tweet_fields=[
                    'created_at',
                    'public_metrics',
                    'text',
                    'author_id'
                ]
            )
            if not response.data:
                logger.info(f" No tweets found for match {match_id}")
                return []
            # Parse tweets
            tweets: List[TweetData] = []
            for tweet in response.data:
                tweet_data = TweetData(
                    tweet_id=tweet.id,
                    text=tweet.text,
                    created_at=tweet.created_at,
                    retweet_count=tweet.public_metrics['retweet_count'],
                    like_count=tweet.public_metrics['like_count'],
                    match_id=match_id,
                    source="twitter"
                )
                tweets.append(tweet_data)
            logger.info(
                f"✅ Collected {len(tweets)} tweets for match {match_id} "
                f"({self.api_calls_made}/{self.max_tweets_per_hour} API calls)"
            )
            return tweets
        except tweepy.TooManyRequests:
            # Server-side 429: degrade, wait out the window, and return empty
            # rather than raising to the caller.
            logger.error("❌ Rate limit exceeded during scraping")
            self._enable_vip_mode_only()
            self._wait_for_rate_limit_reset()
            return []
        except tweepy.TweepyException as e:
            logger.error(f"❌ Twitter API error: {e}")
            raise

    def save_tweets_to_db(self, tweets: List[TweetData], db: Session) -> None:
        """
        Save tweets to database.

        Tweets whose tweet_id already exists are skipped; new rows are
        committed in a single transaction.

        Args:
            tweets: List of TweetData objects
            db: SQLAlchemy database session

        Raises:
            Exception: re-raised after rollback if the commit fails.
        """
        # Imported locally, presumably to avoid a circular import -- confirm.
        from app.models.tweet import Tweet
        saved_count = 0
        for tweet_data in tweets:
            # Check if tweet already exists
            existing = db.query(Tweet).filter(
                Tweet.tweet_id == tweet_data.tweet_id
            ).first()
            if existing:
                logger.debug(f"Tweet {tweet_data.tweet_id} already exists, skipping")
                continue
            # Create new tweet
            tweet = Tweet(
                tweet_id=tweet_data.tweet_id,
                text=tweet_data.text,
                created_at=tweet_data.created_at,
                retweet_count=tweet_data.retweet_count,
                like_count=tweet_data.like_count,
                match_id=tweet_data.match_id,
                source=tweet_data.source
            )
            db.add(tweet)
            saved_count += 1
        # Commit changes
        try:
            db.commit()
            logger.info(f"✅ Saved {saved_count} new tweets to database")
        except Exception as e:
            db.rollback()
            logger.error(f"❌ Failed to save tweets to database: {e}")
            raise

    def scrape_and_save(
        self,
        match_id: int,
        keywords: List[str],
        db: Session,
        max_results: int = 100
    ) -> List[TweetData]:
        """
        Scrape tweets for a match and save to database.

        Args:
            match_id: Match identifier
            keywords: List of keywords to search
            db: SQLAlchemy database session
            max_results: Maximum number of tweets to retrieve

        Returns:
            List of TweetData objects

        Raises:
            Exception: re-raised after logging if scraping or saving fails.
        """
        try:
            # Scrape tweets
            tweets = self.scrape_twitter_match(match_id, keywords, max_results)
            # Save to database
            if tweets:
                self.save_tweets_to_db(tweets, db)
            return tweets
        except Exception as e:
            logger.error(f"❌ Failed to scrape and save tweets for match {match_id}: {e}")
            raise
def create_twitter_scraper(
    bearer_token: str,
    vip_match_ids: Optional[List[int]] = None
) -> TwitterScraper:
    """
    Factory function to create a Twitter scraper instance.

    Args:
        bearer_token: Twitter API bearer token
        vip_match_ids: Optional list of VIP match IDs

    Returns:
        Configured TwitterScraper instance
    """
    # TODO: Load from environment variables or config file
    hourly_call_budget = 1000
    alert_threshold = 0.9
    return TwitterScraper(
        bearer_token=bearer_token,
        max_tweets_per_hour=hourly_call_budget,
        rate_limit_alert_threshold=alert_threshold,
        vip_match_ids=vip_match_ids or [],
    )