"""
Reddit scraper module with robust error handling.

This module provides functionality to scrape Reddit posts and comments
about football matches, with built-in error handling and logging.
"""
||
import logging
|
||
from datetime import datetime, timezone
|
||
from typing import List, Dict, Optional
|
||
from dataclasses import dataclass
|
||
|
||
import praw
|
||
from sqlalchemy.orm import Session
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
@dataclass
|
||
class RedditPostData:
|
||
"""Structured data for a Reddit post."""
|
||
post_id: str
|
||
title: str
|
||
text: str
|
||
upvotes: int
|
||
created_at: datetime
|
||
match_id: int
|
||
subreddit: str
|
||
source: str = "reddit"
|
||
|
||
|
||
@dataclass
|
||
class RedditCommentData:
|
||
"""Structured data for a Reddit comment."""
|
||
comment_id: str
|
||
post_id: str
|
||
text: str
|
||
upvotes: int
|
||
created_at: datetime
|
||
source: str = "reddit"
|
||
|
||
|
||
class RedditScraper:
|
||
"""
|
||
Reddit scraper with robust error handling.
|
||
|
||
Features:
|
||
- Scrapes posts and comments from specified subreddits
|
||
- Error handling without stopping the process
|
||
- Continues with other sources on errors
|
||
- Structured logging
|
||
- Timeout configuration
|
||
"""
|
||
|
||
def __init__(
|
||
self,
|
||
client_id: str,
|
||
client_secret: str,
|
||
subreddits: List[str],
|
||
max_posts_per_subreddit: int = 100,
|
||
max_comments_per_post: int = 50,
|
||
user_agent: str = "Chartbastan/1.0"
|
||
):
|
||
"""
|
||
Initialize Reddit scraper.
|
||
|
||
Args:
|
||
client_id: Reddit API client ID
|
||
client_secret: Reddit API client secret
|
||
subreddits: List of subreddits to scrape
|
||
max_posts_per_subreddit: Maximum posts to collect per subreddit
|
||
max_comments_per_post: Maximum comments to collect per post
|
||
user_agent: User agent string for API requests
|
||
"""
|
||
self.client_id = client_id
|
||
self.client_secret = client_secret
|
||
self.subreddits = subreddits
|
||
self.max_posts_per_subreddit = max_posts_per_subreddit
|
||
self.max_comments_per_post = max_comments_per_post
|
||
self.user_agent = user_agent
|
||
|
||
# Initialize Reddit API client
|
||
self.reddit = praw.Reddit(
|
||
client_id=client_id,
|
||
client_secret=client_secret,
|
||
user_agent=user_agent
|
||
)
|
||
|
||
# Verify authentication
|
||
self._verify_authentication()
|
||
|
||
def _verify_authentication(self) -> None:
|
||
"""Verify Reddit API authentication."""
|
||
try:
|
||
# Try to get authenticated user
|
||
user = self.reddit.user.me()
|
||
if user:
|
||
logger.info(f"✅ Reddit API authenticated successfully as /u/{user.name}")
|
||
else:
|
||
logger.warning("⚠️ Reddit API authentication returned no user data")
|
||
except Exception as e:
|
||
logger.error(f"❌ Reddit API authentication failed: {e}")
|
||
raise
|
||
|
||
def scrape_posts(
|
||
self,
|
||
subreddit: str,
|
||
match_id: int,
|
||
keywords: Optional[List[str]] = None
|
||
) -> List[RedditPostData]:
|
||
"""
|
||
Scrape posts from a subreddit for a specific match.
|
||
|
||
Args:
|
||
subreddit: Subreddit name (e.g., "soccer")
|
||
match_id: Match identifier
|
||
keywords: Optional list of keywords to filter posts
|
||
|
||
Returns:
|
||
List of RedditPostData objects
|
||
"""
|
||
posts_data = []
|
||
|
||
try:
|
||
logger.info(f"🔍 Scraping posts from r/{subreddit} for match {match_id}")
|
||
|
||
# Get subreddit
|
||
sub = self.reddit.subreddit(subreddit)
|
||
|
||
# Fetch new posts
|
||
posts = list(sub.new(limit=self.max_posts_per_subreddit))
|
||
|
||
if not posts:
|
||
logger.info(f"ℹ️ No posts found in r/{subreddit}")
|
||
return posts_data
|
||
|
||
# Filter by keywords if provided
|
||
for post in posts:
|
||
# Skip if keywords provided and not matching
|
||
if keywords:
|
||
text_to_search = f"{post.title.lower()} {post.selftext.lower()}"
|
||
if not any(keyword.lower() in text_to_search for keyword in keywords):
|
||
continue
|
||
|
||
# Create post data
|
||
post_data = RedditPostData(
|
||
post_id=post.id,
|
||
title=post.title,
|
||
text=post.selftext if hasattr(post, 'selftext') else "",
|
||
upvotes=post.score,
|
||
created_at=datetime.fromtimestamp(post.created_utc, tz=timezone.utc),
|
||
match_id=match_id,
|
||
subreddit=subreddit,
|
||
source="reddit"
|
||
)
|
||
posts_data.append(post_data)
|
||
|
||
logger.info(f"✅ Collected {len(posts_data)} posts from r/{subreddit}")
|
||
|
||
except praw.exceptions.PRAWException as e:
|
||
logger.error(f"❌ Reddit API error while scraping r/{subreddit}: {e}")
|
||
except Exception as e:
|
||
logger.error(f"❌ Unexpected error while scraping r/{subreddit}: {e}")
|
||
|
||
return posts_data
|
||
|
||
def scrape_comments(
|
||
self,
|
||
post_id: str,
|
||
post,
|
||
max_comments: Optional[int] = None
|
||
) -> List[RedditCommentData]:
|
||
"""
|
||
Scrape comments from a Reddit post.
|
||
|
||
Args:
|
||
post_id: Reddit post ID
|
||
post: PRAW submission object
|
||
max_comments: Maximum number of comments to collect
|
||
|
||
Returns:
|
||
List of RedditCommentData objects
|
||
"""
|
||
comments_data = []
|
||
limit = max_comments or self.max_comments_per_post
|
||
|
||
try:
|
||
logger.info(f"💬 Scraping comments for post {post_id}")
|
||
|
||
# Get comments (replace_more removes "more comments" placeholders)
|
||
post.comments.replace_more(limit=0)
|
||
comments = list(post.comments.list())[:limit]
|
||
|
||
if not comments:
|
||
logger.info(f"ℹ️ No comments found for post {post_id}")
|
||
return comments_data
|
||
|
||
# Process comments
|
||
for comment in comments:
|
||
# Skip if comment doesn't have required attributes
|
||
if not hasattr(comment, 'id') or not hasattr(comment, 'body'):
|
||
continue
|
||
|
||
comment_data = RedditCommentData(
|
||
comment_id=comment.id,
|
||
post_id=post_id,
|
||
text=comment.body,
|
||
upvotes=comment.score,
|
||
created_at=datetime.fromtimestamp(comment.created_utc, tz=timezone.utc),
|
||
source="reddit"
|
||
)
|
||
comments_data.append(comment_data)
|
||
|
||
logger.info(f"✅ Collected {len(comments_data)} comments for post {post_id}")
|
||
|
||
except praw.exceptions.PRAWException as e:
|
||
logger.error(f"❌ Reddit API error while scraping comments for post {post_id}: {e}")
|
||
except Exception as e:
|
||
logger.error(f"❌ Unexpected error while scraping comments for post {post_id}: {e}")
|
||
|
||
return comments_data
|
||
|
||
def save_posts_to_db(self, posts: List[RedditPostData], db: Session) -> None:
|
||
"""
|
||
Save Reddit posts to database.
|
||
|
||
Args:
|
||
posts: List of RedditPostData objects
|
||
db: SQLAlchemy database session
|
||
"""
|
||
from app.models.reddit_post import RedditPost
|
||
|
||
saved_count = 0
|
||
for post_data in posts:
|
||
# Check if post already exists
|
||
existing = db.query(RedditPost).filter(
|
||
RedditPost.post_id == post_data.post_id
|
||
).first()
|
||
|
||
if existing:
|
||
logger.debug(f"Post {post_data.post_id} already exists, skipping")
|
||
continue
|
||
|
||
# Create new post
|
||
post = RedditPost(
|
||
post_id=post_data.post_id,
|
||
title=post_data.title,
|
||
text=post_data.text,
|
||
upvotes=post_data.upvotes,
|
||
created_at=post_data.created_at,
|
||
match_id=post_data.match_id,
|
||
subreddit=post_data.subreddit,
|
||
source=post_data.source
|
||
)
|
||
|
||
db.add(post)
|
||
saved_count += 1
|
||
|
||
# Commit changes
|
||
try:
|
||
db.commit()
|
||
logger.info(f"✅ Saved {saved_count} new Reddit posts to database")
|
||
except Exception as e:
|
||
db.rollback()
|
||
logger.error(f"❌ Failed to save Reddit posts to database: {e}")
|
||
raise
|
||
|
||
def save_comments_to_db(self, comments: List[RedditCommentData], db: Session) -> None:
|
||
"""
|
||
Save Reddit comments to database.
|
||
|
||
Args:
|
||
comments: List of RedditCommentData objects
|
||
db: SQLAlchemy database session
|
||
"""
|
||
from app.models.reddit_post import RedditComment
|
||
|
||
saved_count = 0
|
||
for comment_data in comments:
|
||
# Check if comment already exists
|
||
existing = db.query(RedditComment).filter(
|
||
RedditComment.comment_id == comment_data.comment_id
|
||
).first()
|
||
|
||
if existing:
|
||
logger.debug(f"Comment {comment_data.comment_id} already exists, skipping")
|
||
continue
|
||
|
||
# Create new comment
|
||
comment = RedditComment(
|
||
comment_id=comment_data.comment_id,
|
||
post_id=comment_data.post_id,
|
||
text=comment_data.text,
|
||
upvotes=comment_data.upvotes,
|
||
created_at=comment_data.created_at,
|
||
source=comment_data.source
|
||
)
|
||
|
||
db.add(comment)
|
||
saved_count += 1
|
||
|
||
# Commit changes
|
||
try:
|
||
db.commit()
|
||
logger.info(f"✅ Saved {saved_count} new Reddit comments to database")
|
||
except Exception as e:
|
||
db.rollback()
|
||
logger.error(f"❌ Failed to save Reddit comments to database: {e}")
|
||
raise
|
||
|
||
def scrape_reddit_match(
|
||
self,
|
||
match_id: int,
|
||
keywords: Optional[List[str]] = None,
|
||
scrape_comments: bool = True,
|
||
db: Optional[Session] = None
|
||
) -> Dict[str, List]:
|
||
"""
|
||
Scrape Reddit posts and comments for a specific match.
|
||
|
||
Args:
|
||
match_id: Match identifier
|
||
keywords: Optional list of keywords to filter posts
|
||
scrape_comments: Whether to scrape comments
|
||
db: Optional database session for immediate saving
|
||
|
||
Returns:
|
||
Dictionary with 'posts' and 'comments' lists
|
||
"""
|
||
all_posts = []
|
||
all_comments = []
|
||
|
||
# Scrape from all configured subreddits
|
||
for subreddit in self.subreddits:
|
||
try:
|
||
# Scrape posts
|
||
posts = self.scrape_posts(subreddit, match_id, keywords)
|
||
all_posts.extend(posts)
|
||
|
||
# Save posts if db session provided
|
||
if db and posts:
|
||
self.save_posts_to_db(posts, db)
|
||
|
||
# Scrape comments if requested
|
||
if scrape_comments and posts:
|
||
# Get PRAW post objects for comment scraping
|
||
sub = self.reddit.subreddit(subreddit)
|
||
praw_posts = list(sub.new(limit=self.max_posts_per_subreddit))
|
||
|
||
for post_data in posts:
|
||
# Find matching PRAW post
|
||
praw_post = next(
|
||
(p for p in praw_posts if p.id == post_data.post_id),
|
||
None
|
||
)
|
||
|
||
if praw_post:
|
||
comments = self.scrape_comments(post_data.post_id, praw_post)
|
||
all_comments.extend(comments)
|
||
|
||
# Save comments if db session provided
|
||
if db and comments:
|
||
self.save_comments_to_db(comments, db)
|
||
|
||
except Exception as e:
|
||
logger.error(
|
||
f"❌ Failed to scrape r/{subreddit} for match {match_id}: {e}. "
|
||
f"Continuing with other sources..."
|
||
)
|
||
continue
|
||
|
||
logger.info(
|
||
f"✅ Total collected for match {match_id}: "
|
||
f"{len(all_posts)} posts, {len(all_comments)} comments"
|
||
)
|
||
|
||
return {
|
||
'posts': all_posts,
|
||
'comments': all_comments
|
||
}
|
||
|
||
def scrape_and_save(
|
||
self,
|
||
match_id: int,
|
||
db: Session,
|
||
keywords: Optional[List[str]] = None,
|
||
scrape_comments: bool = True
|
||
) -> Dict[str, List]:
|
||
"""
|
||
Scrape Reddit data for a match and save to database.
|
||
|
||
Args:
|
||
match_id: Match identifier
|
||
db: SQLAlchemy database session
|
||
keywords: Optional list of keywords to filter posts
|
||
scrape_comments: Whether to scrape comments
|
||
|
||
Returns:
|
||
Dictionary with 'posts' and 'comments' lists
|
||
"""
|
||
try:
|
||
return self.scrape_reddit_match(
|
||
match_id=match_id,
|
||
keywords=keywords,
|
||
scrape_comments=scrape_comments,
|
||
db=db
|
||
)
|
||
except Exception as e:
|
||
logger.error(f"❌ Failed to scrape and save Reddit data for match {match_id}: {e}")
|
||
raise
|
||
|
||
|
||
def create_reddit_scraper(
    client_id: str,
    client_secret: str,
    subreddits: Optional[List[str]] = None,
    max_posts_per_subreddit: int = 100,
    max_comments_per_post: int = 50
) -> RedditScraper:
    """
    Factory function to create a Reddit scraper instance.

    Args:
        client_id: Reddit API client ID
        client_secret: Reddit API client secret
        subreddits: Optional list of subreddits to scrape
        max_posts_per_subreddit: Maximum posts to collect per subreddit
            (previously hard-coded; default unchanged)
        max_comments_per_post: Maximum comments to collect per post
            (previously hard-coded; default unchanged)

    Returns:
        Configured RedditScraper instance
    """
    # Football-oriented defaults when the caller doesn't specify
    if subreddits is None:
        subreddits = ["soccer", "football", "Ligue1", "PremierLeague"]

    return RedditScraper(
        client_id=client_id,
        client_secret=client_secret,
        subreddits=subreddits,
        max_posts_per_subreddit=max_posts_per_subreddit,
        max_comments_per_post=max_comments_per_post
    )