chartbastan/backend/app/scrapers/reddit_scraper.py
2026-02-01 09:31:38 +01:00

442 lines
14 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Reddit scraper module with robust error handling.
This module provides functionality to scrape Reddit posts and comments
about football matches, with built-in error handling and logging.
"""
import logging
from datetime import datetime, timezone
from typing import List, Dict, Optional
from dataclasses import dataclass
import praw
from sqlalchemy.orm import Session
logger = logging.getLogger(__name__)
@dataclass
class RedditPostData:
    """Structured data for a Reddit post."""

    post_id: str            # Reddit submission id (base36 string, e.g. "abc123")
    title: str              # Submission title
    text: str               # Self-text body; empty string for link posts
    upvotes: int            # Submission score at scrape time
    created_at: datetime    # Creation time, UTC-aware (from created_utc)
    match_id: int           # Internal match identifier this post was collected for
    subreddit: str          # Subreddit name without the "r/" prefix
    source: str = "reddit"  # Data-source tag used downstream
@dataclass
class RedditCommentData:
    """Structured data for a Reddit comment."""

    comment_id: str         # Reddit comment id (base36 string)
    post_id: str            # Id of the parent submission
    text: str               # Comment body (markdown)
    upvotes: int            # Comment score at scrape time
    created_at: datetime    # Creation time, UTC-aware (from created_utc)
    source: str = "reddit"  # Data-source tag used downstream
class RedditScraper:
    """
    Reddit scraper with robust error handling.

    Features:
    - Scrapes posts and comments from specified subreddits
    - Error handling without stopping the process
    - Continues with other sources on errors
    - Structured logging
    - Timeout configuration
    """

    def __init__(
        self,
        client_id: str,
        client_secret: str,
        subreddits: List[str],
        max_posts_per_subreddit: int = 100,
        max_comments_per_post: int = 50,
        user_agent: str = "Chartbastan/1.0"
    ):
        """
        Initialize Reddit scraper.

        Args:
            client_id: Reddit API client ID
            client_secret: Reddit API client secret
            subreddits: List of subreddits to scrape
            max_posts_per_subreddit: Maximum posts to collect per subreddit
            max_comments_per_post: Maximum comments to collect per post
            user_agent: User agent string for API requests

        Raises:
            Exception: propagated from authentication verification failure.
        """
        self.client_id = client_id
        self.client_secret = client_secret
        self.subreddits = subreddits
        self.max_posts_per_subreddit = max_posts_per_subreddit
        self.max_comments_per_post = max_comments_per_post
        self.user_agent = user_agent
        # Initialize Reddit API client
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent
        )
        # Verify authentication
        self._verify_authentication()

    def _verify_authentication(self) -> None:
        """Verify Reddit API authentication.

        Raises:
            Exception: re-raised if the authentication probe fails.
        """
        # NOTE(review): with only client_id/client_secret (no username/password)
        # PRAW runs in read-only mode, where `user.me()` may raise — confirm the
        # deployment supplies script-app credentials before relying on this probe.
        try:
            # Try to get authenticated user
            user = self.reddit.user.me()
            if user:
                logger.info(f"✅ Reddit API authenticated successfully as /u/{user.name}")
            else:
                logger.warning("⚠️ Reddit API authentication returned no user data")
        except Exception as e:
            logger.error(f"❌ Reddit API authentication failed: {e}")
            raise

    def scrape_posts(
        self,
        subreddit: str,
        match_id: int,
        keywords: Optional[List[str]] = None
    ) -> List[RedditPostData]:
        """
        Scrape posts from a subreddit for a specific match.

        Args:
            subreddit: Subreddit name (e.g., "soccer")
            match_id: Match identifier
            keywords: Optional list of keywords to filter posts

        Returns:
            List of RedditPostData objects (empty on any API error —
            errors are logged, never raised, so other sources continue).
        """
        posts_data = []
        try:
            logger.info(f"🔍 Scraping posts from r/{subreddit} for match {match_id}")
            # Get subreddit
            sub = self.reddit.subreddit(subreddit)
            # Fetch new posts
            posts = list(sub.new(limit=self.max_posts_per_subreddit))
            if not posts:
                logger.info(f" No posts found in r/{subreddit}")
                return posts_data
            # Filter by keywords if provided
            for post in posts:
                # Link posts may lack selftext; getattr keeps the filter from
                # raising AttributeError (previously only the constructor guarded this).
                selftext = getattr(post, 'selftext', "")
                # Skip if keywords provided and not matching
                if keywords:
                    text_to_search = f"{post.title.lower()} {selftext.lower()}"
                    if not any(keyword.lower() in text_to_search for keyword in keywords):
                        continue
                # Create post data
                post_data = RedditPostData(
                    post_id=post.id,
                    title=post.title,
                    text=selftext,
                    upvotes=post.score,
                    created_at=datetime.fromtimestamp(post.created_utc, tz=timezone.utc),
                    match_id=match_id,
                    subreddit=subreddit,
                    source="reddit"
                )
                posts_data.append(post_data)
            logger.info(f"✅ Collected {len(posts_data)} posts from r/{subreddit}")
        except praw.exceptions.PRAWException as e:
            logger.error(f"❌ Reddit API error while scraping r/{subreddit}: {e}")
        except Exception as e:
            logger.error(f"❌ Unexpected error while scraping r/{subreddit}: {e}")
        return posts_data

    def scrape_comments(
        self,
        post_id: str,
        post,
        max_comments: Optional[int] = None
    ) -> List[RedditCommentData]:
        """
        Scrape comments from a Reddit post.

        Args:
            post_id: Reddit post ID
            post: PRAW submission object
            max_comments: Maximum number of comments to collect
                (defaults to ``self.max_comments_per_post``)

        Returns:
            List of RedditCommentData objects (empty on error — errors
            are logged, never raised).
        """
        comments_data = []
        limit = max_comments or self.max_comments_per_post
        try:
            logger.info(f"💬 Scraping comments for post {post_id}")
            # Get comments (replace_more removes "more comments" placeholders)
            post.comments.replace_more(limit=0)
            comments = list(post.comments.list())[:limit]
            if not comments:
                logger.info(f" No comments found for post {post_id}")
                return comments_data
            # Process comments
            for comment in comments:
                # Skip if comment doesn't have required attributes
                # (e.g. deleted/removed placeholders)
                if not hasattr(comment, 'id') or not hasattr(comment, 'body'):
                    continue
                comment_data = RedditCommentData(
                    comment_id=comment.id,
                    post_id=post_id,
                    text=comment.body,
                    upvotes=comment.score,
                    created_at=datetime.fromtimestamp(comment.created_utc, tz=timezone.utc),
                    source="reddit"
                )
                comments_data.append(comment_data)
            logger.info(f"✅ Collected {len(comments_data)} comments for post {post_id}")
        except praw.exceptions.PRAWException as e:
            logger.error(f"❌ Reddit API error while scraping comments for post {post_id}: {e}")
        except Exception as e:
            logger.error(f"❌ Unexpected error while scraping comments for post {post_id}: {e}")
        return comments_data

    def save_posts_to_db(self, posts: List[RedditPostData], db: Session) -> None:
        """
        Save Reddit posts to database.

        Already-existing posts (matched on ``post_id``) are skipped.

        Args:
            posts: List of RedditPostData objects
            db: SQLAlchemy database session

        Raises:
            Exception: re-raised after rollback if the commit fails.
        """
        from app.models.reddit_post import RedditPost
        saved_count = 0
        for post_data in posts:
            # Check if post already exists
            existing = db.query(RedditPost).filter(
                RedditPost.post_id == post_data.post_id
            ).first()
            if existing:
                logger.debug(f"Post {post_data.post_id} already exists, skipping")
                continue
            # Create new post
            post = RedditPost(
                post_id=post_data.post_id,
                title=post_data.title,
                text=post_data.text,
                upvotes=post_data.upvotes,
                created_at=post_data.created_at,
                match_id=post_data.match_id,
                subreddit=post_data.subreddit,
                source=post_data.source
            )
            db.add(post)
            saved_count += 1
        # Commit changes
        try:
            db.commit()
            logger.info(f"✅ Saved {saved_count} new Reddit posts to database")
        except Exception as e:
            db.rollback()
            logger.error(f"❌ Failed to save Reddit posts to database: {e}")
            raise

    def save_comments_to_db(self, comments: List[RedditCommentData], db: Session) -> None:
        """
        Save Reddit comments to database.

        Already-existing comments (matched on ``comment_id``) are skipped.

        Args:
            comments: List of RedditCommentData objects
            db: SQLAlchemy database session

        Raises:
            Exception: re-raised after rollback if the commit fails.
        """
        from app.models.reddit_post import RedditComment
        saved_count = 0
        for comment_data in comments:
            # Check if comment already exists
            existing = db.query(RedditComment).filter(
                RedditComment.comment_id == comment_data.comment_id
            ).first()
            if existing:
                logger.debug(f"Comment {comment_data.comment_id} already exists, skipping")
                continue
            # Create new comment
            comment = RedditComment(
                comment_id=comment_data.comment_id,
                post_id=comment_data.post_id,
                text=comment_data.text,
                upvotes=comment_data.upvotes,
                created_at=comment_data.created_at,
                source=comment_data.source
            )
            db.add(comment)
            saved_count += 1
        # Commit changes
        try:
            db.commit()
            logger.info(f"✅ Saved {saved_count} new Reddit comments to database")
        except Exception as e:
            db.rollback()
            logger.error(f"❌ Failed to save Reddit comments to database: {e}")
            raise

    def scrape_reddit_match(
        self,
        match_id: int,
        keywords: Optional[List[str]] = None,
        scrape_comments: bool = True,
        db: Optional[Session] = None
    ) -> Dict[str, List]:
        """
        Scrape Reddit posts and comments for a specific match.

        Per-subreddit failures are logged and skipped so remaining
        subreddits are still processed.

        Args:
            match_id: Match identifier
            keywords: Optional list of keywords to filter posts
            scrape_comments: Whether to scrape comments
            db: Optional database session for immediate saving

        Returns:
            Dictionary with 'posts' and 'comments' lists
        """
        all_posts = []
        all_comments = []
        # Scrape from all configured subreddits
        for subreddit in self.subreddits:
            try:
                # Scrape posts
                posts = self.scrape_posts(subreddit, match_id, keywords)
                all_posts.extend(posts)
                # Save posts if db session provided
                if db and posts:
                    self.save_posts_to_db(posts, db)
                # Scrape comments if requested
                if scrape_comments and posts:
                    for post_data in posts:
                        # Fetch the submission lazily by id instead of re-listing
                        # the whole subreddit (the previous approach doubled the
                        # listing requests and silently dropped posts if the
                        # listing shifted between the two fetches).
                        praw_post = self.reddit.submission(id=post_data.post_id)
                        comments = self.scrape_comments(post_data.post_id, praw_post)
                        all_comments.extend(comments)
                        # Save comments if db session provided
                        if db and comments:
                            self.save_comments_to_db(comments, db)
            except Exception as e:
                logger.error(
                    f"❌ Failed to scrape r/{subreddit} for match {match_id}: {e}. "
                    f"Continuing with other sources..."
                )
                continue
        logger.info(
            f"✅ Total collected for match {match_id}: "
            f"{len(all_posts)} posts, {len(all_comments)} comments"
        )
        return {
            'posts': all_posts,
            'comments': all_comments
        }

    def scrape_and_save(
        self,
        match_id: int,
        db: Session,
        keywords: Optional[List[str]] = None,
        scrape_comments: bool = True
    ) -> Dict[str, List]:
        """
        Scrape Reddit data for a match and save to database.

        Args:
            match_id: Match identifier
            db: SQLAlchemy database session
            keywords: Optional list of keywords to filter posts
            scrape_comments: Whether to scrape comments

        Returns:
            Dictionary with 'posts' and 'comments' lists

        Raises:
            Exception: re-raised from the underlying scrape/save failure.
        """
        try:
            return self.scrape_reddit_match(
                match_id=match_id,
                keywords=keywords,
                scrape_comments=scrape_comments,
                db=db
            )
        except Exception as e:
            logger.error(f"❌ Failed to scrape and save Reddit data for match {match_id}: {e}")
            raise
def create_reddit_scraper(
    client_id: str,
    client_secret: str,
    subreddits: Optional[List[str]] = None
) -> RedditScraper:
    """
    Factory function to create a Reddit scraper instance.

    Args:
        client_id: Reddit API client ID
        client_secret: Reddit API client secret
        subreddits: Optional list of subreddits to scrape; when omitted,
            a default football-oriented set is used.

    Returns:
        Configured RedditScraper instance
    """
    # Fall back to the default football subreddits only when the caller
    # passed nothing at all (an explicit empty list is honored as-is).
    default_subreddits = ["soccer", "football", "Ligue1", "PremierLeague"]
    chosen = default_subreddits if subreddits is None else subreddits
    return RedditScraper(
        client_id=client_id,
        client_secret=client_secret,
        subreddits=chosen,
        max_posts_per_subreddit=100,
        max_comments_per_post=50,
    )