"""
Reddit scraper module with robust error handling.

This module provides functionality to scrape Reddit posts and comments
about football matches, with built-in error handling and logging.
"""

import logging
from datetime import datetime, timezone
from typing import List, Dict, Optional
from dataclasses import dataclass

import praw
from sqlalchemy.orm import Session

logger = logging.getLogger(__name__)


@dataclass
class RedditPostData:
    """Structured data for a Reddit post."""

    post_id: str
    title: str
    text: str
    upvotes: int
    created_at: datetime
    match_id: int
    subreddit: str
    source: str = "reddit"


@dataclass
class RedditCommentData:
    """Structured data for a Reddit comment."""

    comment_id: str
    post_id: str
    text: str
    upvotes: int
    created_at: datetime
    source: str = "reddit"


class RedditScraper:
    """
    Reddit scraper with robust error handling.

    Features:
    - Scrapes posts and comments from the configured subreddits
    - Per-subreddit / per-post errors are logged instead of aborting the run
    - Continues with the remaining sources after an error
    - Optional persistence of the scraped records through a SQLAlchemy session
    """

    def __init__(
        self,
        client_id: str,
        client_secret: str,
        subreddits: List[str],
        max_posts_per_subreddit: int = 100,
        max_comments_per_post: int = 50,
        user_agent: str = "Chartbastan/1.0"
    ):
        """
        Initialize Reddit scraper.

        Builds the PRAW client and immediately verifies authentication.

        Args:
            client_id: Reddit API client ID
            client_secret: Reddit API client secret
            subreddits: List of subreddits to scrape
            max_posts_per_subreddit: Maximum posts to collect per subreddit
            max_comments_per_post: Maximum comments to collect per post
            user_agent: User agent string for API requests

        Raises:
            Exception: Whatever the authentication check raised.
        """
        self.client_id = client_id
        self.client_secret = client_secret
        self.subreddits = subreddits
        self.max_posts_per_subreddit = max_posts_per_subreddit
        self.max_comments_per_post = max_comments_per_post
        self.user_agent = user_agent

        # Initialize Reddit API client
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent
        )

        # Verify authentication
        self._verify_authentication()

    def _verify_authentication(self) -> None:
        """
        Verify Reddit API authentication.

        Logs the authenticated user name, or a warning when the API returns
        no user. Any exception raised by the API call is logged and re-raised.
        """
        try:
            # NOTE(review): the client is built from client_id/client_secret
            # only; PRAW presumably runs read-only in that case and returns no
            # user here -- confirm against the PRAW version in use.
            user = self.reddit.user.me()
            if user:
                logger.info(f"✅ Reddit API authenticated successfully as /u/{user.name}")
            else:
                logger.warning("⚠️ Reddit API authentication returned no user data")
        except Exception as e:
            logger.error(f"❌ Reddit API authentication failed: {e}")
            raise

    def scrape_posts(
        self,
        subreddit: str,
        match_id: int,
        keywords: Optional[List[str]] = None
    ) -> List[RedditPostData]:
        """
        Scrape posts from a subreddit for a specific match.

        Fetches the newest posts (up to ``max_posts_per_subreddit``) and, when
        keywords are given, keeps only posts whose title or body contains at
        least one keyword (case-insensitive substring match). Errors are
        logged, and whatever was collected before the error is returned.

        Args:
            subreddit: Subreddit name (e.g., "soccer")
            match_id: Match identifier
            keywords: Optional list of keywords to filter posts

        Returns:
            List of RedditPostData objects
        """
        posts_data: List[RedditPostData] = []

        try:
            logger.info(f"🔍 Scraping posts from r/{subreddit} for match {match_id}")

            # Get subreddit
            sub = self.reddit.subreddit(subreddit)

            # Fetch new posts
            posts = list(sub.new(limit=self.max_posts_per_subreddit))

            if not posts:
                logger.info(f"ℹ️ No posts found in r/{subreddit}")
                return posts_data

            # Lower-case the keywords once instead of once per post.
            lowered_keywords = (
                [keyword.lower() for keyword in keywords] if keywords else []
            )

            for post in posts:
                # One guarded read of the body, shared by the keyword filter
                # and the stored record, so a post without ``selftext`` cannot
                # break the filter.
                body = getattr(post, "selftext", "") or ""

                # Skip if keywords provided and not matching
                if lowered_keywords:
                    text_to_search = f"{post.title.lower()} {body.lower()}"
                    if not any(keyword in text_to_search for keyword in lowered_keywords):
                        continue

                # Create post data
                posts_data.append(
                    RedditPostData(
                        post_id=post.id,
                        title=post.title,
                        text=body,
                        upvotes=post.score,
                        created_at=datetime.fromtimestamp(post.created_utc, tz=timezone.utc),
                        match_id=match_id,
                        subreddit=subreddit,
                        source="reddit"
                    )
                )

            logger.info(f"✅ Collected {len(posts_data)} posts from r/{subreddit}")

        except praw.exceptions.PRAWException as e:
            logger.error(f"❌ Reddit API error while scraping r/{subreddit}: {e}")
        except Exception as e:
            logger.error(f"❌ Unexpected error while scraping r/{subreddit}: {e}")

        return posts_data

    def scrape_comments(
        self,
        post_id: str,
        post,
        max_comments: Optional[int] = None
    ) -> List[RedditCommentData]:
        """
        Scrape comments from a Reddit post.

        Errors are logged, and whatever was collected before the error is
        returned.

        Args:
            post_id: Reddit post ID
            post: PRAW submission object
            max_comments: Maximum number of comments to collect. ``None``
                falls back to ``max_comments_per_post``; ``0`` collects
                nothing.

        Returns:
            List of RedditCommentData objects
        """
        comments_data: List[RedditCommentData] = []

        # ``is None`` (not ``or``) so that an explicit 0 is honoured; clamp at
        # 0 so a negative value cannot turn the slice into "all but the last N".
        limit = self.max_comments_per_post if max_comments is None else max_comments
        limit = max(limit, 0)

        try:
            logger.info(f"💬 Scraping comments for post {post_id}")

            # Get comments (replace_more removes "more comments" placeholders)
            post.comments.replace_more(limit=0)
            comments = list(post.comments.list())[:limit]

            if not comments:
                logger.info(f"ℹ️ No comments found for post {post_id}")
                return comments_data

            # Process comments
            for comment in comments:
                # Skip if comment doesn't have required attributes
                if not hasattr(comment, 'id') or not hasattr(comment, 'body'):
                    continue

                comments_data.append(
                    RedditCommentData(
                        comment_id=comment.id,
                        post_id=post_id,
                        text=comment.body,
                        upvotes=comment.score,
                        created_at=datetime.fromtimestamp(comment.created_utc, tz=timezone.utc),
                        source="reddit"
                    )
                )

            logger.info(f"✅ Collected {len(comments_data)} comments for post {post_id}")

        except praw.exceptions.PRAWException as e:
            logger.error(f"❌ Reddit API error while scraping comments for post {post_id}: {e}")
        except Exception as e:
            logger.error(f"❌ Unexpected error while scraping comments for post {post_id}: {e}")

        return comments_data

    def save_posts_to_db(self, posts: List[RedditPostData], db: Session) -> None:
        """
        Save Reddit posts to database.

        Posts whose ``post_id`` is already stored are skipped. All new rows
        are committed together; on failure the session is rolled back and the
        exception is re-raised.

        Args:
            posts: List of RedditPostData objects
            db: SQLAlchemy database session

        Raises:
            Exception: Whatever the commit raised, after rollback.
        """
        # Imported here rather than at module level, as in the original code.
        from app.models.reddit_post import RedditPost

        saved_count = 0
        for post_data in posts:
            # Check if post already exists
            existing = db.query(RedditPost).filter(
                RedditPost.post_id == post_data.post_id
            ).first()

            if existing:
                logger.debug(f"Post {post_data.post_id} already exists, skipping")
                continue

            # Create new post
            post = RedditPost(
                post_id=post_data.post_id,
                title=post_data.title,
                text=post_data.text,
                upvotes=post_data.upvotes,
                created_at=post_data.created_at,
                match_id=post_data.match_id,
                subreddit=post_data.subreddit,
                source=post_data.source
            )

            db.add(post)
            saved_count += 1

        # Commit changes
        try:
            db.commit()
            logger.info(f"✅ Saved {saved_count} new Reddit posts to database")
        except Exception as e:
            db.rollback()
            logger.error(f"❌ Failed to save Reddit posts to database: {e}")
            raise

    def save_comments_to_db(self, comments: List[RedditCommentData], db: Session) -> None:
        """
        Save Reddit comments to database.

        Comments whose ``comment_id`` is already stored are skipped. All new
        rows are committed together; on failure the session is rolled back and
        the exception is re-raised.

        Args:
            comments: List of RedditCommentData objects
            db: SQLAlchemy database session

        Raises:
            Exception: Whatever the commit raised, after rollback.
        """
        # Imported here rather than at module level, as in the original code.
        from app.models.reddit_post import RedditComment

        saved_count = 0
        for comment_data in comments:
            # Check if comment already exists
            existing = db.query(RedditComment).filter(
                RedditComment.comment_id == comment_data.comment_id
            ).first()

            if existing:
                logger.debug(f"Comment {comment_data.comment_id} already exists, skipping")
                continue

            # Create new comment
            comment = RedditComment(
                comment_id=comment_data.comment_id,
                post_id=comment_data.post_id,
                text=comment_data.text,
                upvotes=comment_data.upvotes,
                created_at=comment_data.created_at,
                source=comment_data.source
            )

            db.add(comment)
            saved_count += 1

        # Commit changes
        try:
            db.commit()
            logger.info(f"✅ Saved {saved_count} new Reddit comments to database")
        except Exception as e:
            db.rollback()
            logger.error(f"❌ Failed to save Reddit comments to database: {e}")
            raise

    def scrape_reddit_match(
        self,
        match_id: int,
        keywords: Optional[List[str]] = None,
        scrape_comments: bool = True,
        db: Optional[Session] = None
    ) -> Dict[str, List]:
        """
        Scrape Reddit posts and comments for a specific match.

        Iterates over every configured subreddit. A failure in one subreddit
        (including a failed database save) is logged and the loop moves on to
        the next one.

        Args:
            match_id: Match identifier
            keywords: Optional list of keywords to filter posts
            scrape_comments: Whether to scrape comments
            db: Optional database session for immediate saving

        Returns:
            Dictionary with 'posts' and 'comments' lists
        """
        all_posts: List[RedditPostData] = []
        all_comments: List[RedditCommentData] = []

        # Scrape from all configured subreddits
        for subreddit in self.subreddits:
            try:
                # Scrape posts
                posts = self.scrape_posts(subreddit, match_id, keywords)
                all_posts.extend(posts)

                # Save posts if db session provided
                if db is not None and posts:
                    self.save_posts_to_db(posts, db)

                # Scrape comments if requested.
                # NOTE: the ``scrape_comments`` flag shadows the method name
                # locally, so the method is reached through ``self``.
                if scrape_comments and posts:
                    for post_data in posts:
                        # Build the submission straight from its ID instead of
                        # listing the subreddit a second time: no redundant
                        # listing request, and no post is skipped because it
                        # dropped out of the "new" listing in the meantime.
                        praw_post = self.reddit.submission(id=post_data.post_id)

                        comments = self.scrape_comments(post_data.post_id, praw_post)
                        all_comments.extend(comments)

                        # Save comments if db session provided
                        if db is not None and comments:
                            self.save_comments_to_db(comments, db)

            except Exception as e:
                logger.error(
                    f"❌ Failed to scrape r/{subreddit} for match {match_id}: {e}. "
                    f"Continuing with other sources..."
                )
                continue

        logger.info(
            f"✅ Total collected for match {match_id}: "
            f"{len(all_posts)} posts, {len(all_comments)} comments"
        )

        return {
            'posts': all_posts,
            'comments': all_comments
        }

    def scrape_and_save(
        self,
        match_id: int,
        db: Session,
        keywords: Optional[List[str]] = None,
        scrape_comments: bool = True
    ) -> Dict[str, List]:
        """
        Scrape Reddit data for a match and save to database.

        Thin wrapper around ``scrape_reddit_match`` with a mandatory session.

        Args:
            match_id: Match identifier
            db: SQLAlchemy database session
            keywords: Optional list of keywords to filter posts
            scrape_comments: Whether to scrape comments

        Returns:
            Dictionary with 'posts' and 'comments' lists

        Raises:
            Exception: Whatever ``scrape_reddit_match`` raised, after logging.
        """
        try:
            return self.scrape_reddit_match(
                match_id=match_id,
                keywords=keywords,
                scrape_comments=scrape_comments,
                db=db
            )
        except Exception as e:
            logger.error(f"❌ Failed to scrape and save Reddit data for match {match_id}: {e}")
            raise


def create_reddit_scraper(
    client_id: str,
    client_secret: str,
    subreddits: Optional[List[str]] = None
) -> RedditScraper:
    """
    Factory function to create a Reddit scraper instance.

    Args:
        client_id: Reddit API client ID
        client_secret: Reddit API client secret
        subreddits: Optional list of subreddits to scrape; defaults to
            ["soccer", "football", "Ligue1", "PremierLeague"]

    Returns:
        Configured RedditScraper instance
    """
    # Default subreddits if not provided
    if subreddits is None:
        subreddits = ["soccer", "football", "Ligue1", "PremierLeague"]

    scraper = RedditScraper(
        client_id=client_id,
        client_secret=client_secret,
        subreddits=subreddits,
        max_posts_per_subreddit=100,
        max_comments_per_post=50
    )

    return scraper