chartbastan/backend/app/scrapers/reddit_scraper.py
2026-02-01 09:31:38 +01:00

442 lines
14 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Reddit scraper module with robust error handling.
This module provides functionality to scrape Reddit posts and comments
about football matches, with built-in error handling and logging.
"""
import logging
from datetime import datetime, timezone
from typing import List, Dict, Optional
from dataclasses import dataclass
import praw
from sqlalchemy.orm import Session
logger = logging.getLogger(__name__)
@dataclass
class RedditPostData:
    """Structured data for a Reddit post."""

    post_id: str            # Reddit submission id (base36 string, e.g. "abc123")
    title: str              # Submission title
    text: str               # Self-text body; empty string for link posts
    upvotes: int            # Submission score at scrape time
    created_at: datetime    # Creation time, UTC-aware (from created_utc)
    match_id: int           # Internal match identifier this post was collected for
    subreddit: str          # Subreddit name without the "r/" prefix
    source: str = "reddit"  # Data-source tag used downstream
@dataclass
class RedditCommentData:
    """Structured data for a Reddit comment."""

    comment_id: str         # Reddit comment id (base36 string)
    post_id: str            # Id of the parent submission
    text: str               # Comment body (markdown)
    upvotes: int            # Comment score at scrape time
    created_at: datetime    # Creation time, UTC-aware (from created_utc)
    source: str = "reddit"  # Data-source tag used downstream
class RedditScraper:
    """
    Reddit scraper with robust error handling.

    Features:
    - Scrapes posts and comments from specified subreddits
    - Error handling without stopping the process
    - Continues with other sources on errors
    - Structured logging
    - Timeout configuration
    """

    def __init__(
        self,
        client_id: str,
        client_secret: str,
        subreddits: List[str],
        max_posts_per_subreddit: int = 100,
        max_comments_per_post: int = 50,
        user_agent: str = "Chartbastan/1.0"
    ):
        """
        Initialize Reddit scraper.

        Args:
            client_id: Reddit API client ID
            client_secret: Reddit API client secret
            subreddits: List of subreddits to scrape
            max_posts_per_subreddit: Maximum posts to collect per subreddit
            max_comments_per_post: Maximum comments to collect per post
            user_agent: User agent string for API requests

        Raises:
            Exception: propagated from authentication verification failure.
        """
        self.client_id = client_id
        self.client_secret = client_secret
        self.subreddits = subreddits
        self.max_posts_per_subreddit = max_posts_per_subreddit
        self.max_comments_per_post = max_comments_per_post
        self.user_agent = user_agent
        # Initialize Reddit API client
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent
        )
        # Verify authentication
        self._verify_authentication()

    def _verify_authentication(self) -> None:
        """Verify Reddit API authentication.

        Raises:
            Exception: re-raised if the authentication probe fails.
        """
        # NOTE(review): with only client_id/client_secret (no username/password)
        # PRAW runs in read-only mode, where `user.me()` may raise — confirm the
        # deployment supplies script-app credentials before relying on this probe.
        try:
            # Try to get authenticated user
            user = self.reddit.user.me()
            if user:
                logger.info(f"✅ Reddit API authenticated successfully as /u/{user.name}")
            else:
                logger.warning("⚠️ Reddit API authentication returned no user data")
        except Exception as e:
            logger.error(f"❌ Reddit API authentication failed: {e}")
            raise

    def scrape_posts(
        self,
        subreddit: str,
        match_id: int,
        keywords: Optional[List[str]] = None
    ) -> List[RedditPostData]:
        """
        Scrape posts from a subreddit for a specific match.

        Args:
            subreddit: Subreddit name (e.g., "soccer")
            match_id: Match identifier
            keywords: Optional list of keywords to filter posts

        Returns:
            List of RedditPostData objects (empty on any API error —
            errors are logged, never raised, so other sources continue).
        """
        posts_data = []
        try:
            logger.info(f"🔍 Scraping posts from r/{subreddit} for match {match_id}")
            # Get subreddit
            sub = self.reddit.subreddit(subreddit)
            # Fetch new posts
            posts = list(sub.new(limit=self.max_posts_per_subreddit))
            if not posts:
                logger.info(f" No posts found in r/{subreddit}")
                return posts_data
            # Filter by keywords if provided
            for post in posts:
                # Link posts may lack selftext; getattr keeps the filter from
                # raising AttributeError (previously only the constructor guarded this).
                selftext = getattr(post, 'selftext', "")
                # Skip if keywords provided and not matching
                if keywords:
                    text_to_search = f"{post.title.lower()} {selftext.lower()}"
                    if not any(keyword.lower() in text_to_search for keyword in keywords):
                        continue
                # Create post data
                post_data = RedditPostData(
                    post_id=post.id,
                    title=post.title,
                    text=selftext,
                    upvotes=post.score,
                    created_at=datetime.fromtimestamp(post.created_utc, tz=timezone.utc),
                    match_id=match_id,
                    subreddit=subreddit,
                    source="reddit"
                )
                posts_data.append(post_data)
            logger.info(f"✅ Collected {len(posts_data)} posts from r/{subreddit}")
        except praw.exceptions.PRAWException as e:
            logger.error(f"❌ Reddit API error while scraping r/{subreddit}: {e}")
        except Exception as e:
            logger.error(f"❌ Unexpected error while scraping r/{subreddit}: {e}")
        return posts_data

    def scrape_comments(
        self,
        post_id: str,
        post,
        max_comments: Optional[int] = None
    ) -> List[RedditCommentData]:
        """
        Scrape comments from a Reddit post.

        Args:
            post_id: Reddit post ID
            post: PRAW submission object
            max_comments: Maximum number of comments to collect
                (defaults to ``self.max_comments_per_post``)

        Returns:
            List of RedditCommentData objects (empty on error — errors
            are logged, never raised).
        """
        comments_data = []
        limit = max_comments or self.max_comments_per_post
        try:
            logger.info(f"💬 Scraping comments for post {post_id}")
            # Get comments (replace_more removes "more comments" placeholders)
            post.comments.replace_more(limit=0)
            comments = list(post.comments.list())[:limit]
            if not comments:
                logger.info(f" No comments found for post {post_id}")
                return comments_data
            # Process comments
            for comment in comments:
                # Skip if comment doesn't have required attributes
                # (e.g. deleted/removed placeholders)
                if not hasattr(comment, 'id') or not hasattr(comment, 'body'):
                    continue
                comment_data = RedditCommentData(
                    comment_id=comment.id,
                    post_id=post_id,
                    text=comment.body,
                    upvotes=comment.score,
                    created_at=datetime.fromtimestamp(comment.created_utc, tz=timezone.utc),
                    source="reddit"
                )
                comments_data.append(comment_data)
            logger.info(f"✅ Collected {len(comments_data)} comments for post {post_id}")
        except praw.exceptions.PRAWException as e:
            logger.error(f"❌ Reddit API error while scraping comments for post {post_id}: {e}")
        except Exception as e:
            logger.error(f"❌ Unexpected error while scraping comments for post {post_id}: {e}")
        return comments_data

    def save_posts_to_db(self, posts: List[RedditPostData], db: Session) -> None:
        """
        Save Reddit posts to database.

        Already-existing posts (matched on ``post_id``) are skipped.

        Args:
            posts: List of RedditPostData objects
            db: SQLAlchemy database session

        Raises:
            Exception: re-raised after rollback if the commit fails.
        """
        from app.models.reddit_post import RedditPost
        saved_count = 0
        for post_data in posts:
            # Check if post already exists
            existing = db.query(RedditPost).filter(
                RedditPost.post_id == post_data.post_id
            ).first()
            if existing:
                logger.debug(f"Post {post_data.post_id} already exists, skipping")
                continue
            # Create new post
            post = RedditPost(
                post_id=post_data.post_id,
                title=post_data.title,
                text=post_data.text,
                upvotes=post_data.upvotes,
                created_at=post_data.created_at,
                match_id=post_data.match_id,
                subreddit=post_data.subreddit,
                source=post_data.source
            )
            db.add(post)
            saved_count += 1
        # Commit changes
        try:
            db.commit()
            logger.info(f"✅ Saved {saved_count} new Reddit posts to database")
        except Exception as e:
            db.rollback()
            logger.error(f"❌ Failed to save Reddit posts to database: {e}")
            raise

    def save_comments_to_db(self, comments: List[RedditCommentData], db: Session) -> None:
        """
        Save Reddit comments to database.

        Already-existing comments (matched on ``comment_id``) are skipped.

        Args:
            comments: List of RedditCommentData objects
            db: SQLAlchemy database session

        Raises:
            Exception: re-raised after rollback if the commit fails.
        """
        from app.models.reddit_post import RedditComment
        saved_count = 0
        for comment_data in comments:
            # Check if comment already exists
            existing = db.query(RedditComment).filter(
                RedditComment.comment_id == comment_data.comment_id
            ).first()
            if existing:
                logger.debug(f"Comment {comment_data.comment_id} already exists, skipping")
                continue
            # Create new comment
            comment = RedditComment(
                comment_id=comment_data.comment_id,
                post_id=comment_data.post_id,
                text=comment_data.text,
                upvotes=comment_data.upvotes,
                created_at=comment_data.created_at,
                source=comment_data.source
            )
            db.add(comment)
            saved_count += 1
        # Commit changes
        try:
            db.commit()
            logger.info(f"✅ Saved {saved_count} new Reddit comments to database")
        except Exception as e:
            db.rollback()
            logger.error(f"❌ Failed to save Reddit comments to database: {e}")
            raise

    def scrape_reddit_match(
        self,
        match_id: int,
        keywords: Optional[List[str]] = None,
        scrape_comments: bool = True,
        db: Optional[Session] = None
    ) -> Dict[str, List]:
        """
        Scrape Reddit posts and comments for a specific match.

        Per-subreddit failures are logged and skipped so remaining
        subreddits are still processed.

        Args:
            match_id: Match identifier
            keywords: Optional list of keywords to filter posts
            scrape_comments: Whether to scrape comments
            db: Optional database session for immediate saving

        Returns:
            Dictionary with 'posts' and 'comments' lists
        """
        all_posts = []
        all_comments = []
        # Scrape from all configured subreddits
        for subreddit in self.subreddits:
            try:
                # Scrape posts
                posts = self.scrape_posts(subreddit, match_id, keywords)
                all_posts.extend(posts)
                # Save posts if db session provided
                if db and posts:
                    self.save_posts_to_db(posts, db)
                # Scrape comments if requested
                if scrape_comments and posts:
                    for post_data in posts:
                        # Fetch the submission lazily by id instead of re-listing
                        # the whole subreddit (the previous approach doubled the
                        # listing requests and silently dropped posts if the
                        # listing shifted between the two fetches).
                        praw_post = self.reddit.submission(id=post_data.post_id)
                        comments = self.scrape_comments(post_data.post_id, praw_post)
                        all_comments.extend(comments)
                        # Save comments if db session provided
                        if db and comments:
                            self.save_comments_to_db(comments, db)
            except Exception as e:
                logger.error(
                    f"❌ Failed to scrape r/{subreddit} for match {match_id}: {e}. "
                    f"Continuing with other sources..."
                )
                continue
        logger.info(
            f"✅ Total collected for match {match_id}: "
            f"{len(all_posts)} posts, {len(all_comments)} comments"
        )
        return {
            'posts': all_posts,
            'comments': all_comments
        }

    def scrape_and_save(
        self,
        match_id: int,
        db: Session,
        keywords: Optional[List[str]] = None,
        scrape_comments: bool = True
    ) -> Dict[str, List]:
        """
        Scrape Reddit data for a match and save to database.

        Args:
            match_id: Match identifier
            db: SQLAlchemy database session
            keywords: Optional list of keywords to filter posts
            scrape_comments: Whether to scrape comments

        Returns:
            Dictionary with 'posts' and 'comments' lists

        Raises:
            Exception: re-raised from the underlying scrape/save failure.
        """
        try:
            return self.scrape_reddit_match(
                match_id=match_id,
                keywords=keywords,
                scrape_comments=scrape_comments,
                db=db
            )
        except Exception as e:
            logger.error(f"❌ Failed to scrape and save Reddit data for match {match_id}: {e}")
            raise
def create_reddit_scraper(
    client_id: str,
    client_secret: str,
    subreddits: Optional[List[str]] = None
) -> RedditScraper:
    """
    Factory function to create a Reddit scraper instance.

    Args:
        client_id: Reddit API client ID
        client_secret: Reddit API client secret
        subreddits: Optional list of subreddits to scrape; when omitted,
            a default football-oriented set is used.

    Returns:
        Configured RedditScraper instance
    """
    # Fall back to the default football subreddits only when the caller
    # passed nothing at all (an explicit empty list is honored as-is).
    default_subreddits = ["soccer", "football", "Ligue1", "PremierLeague"]
    chosen = default_subreddits if subreddits is None else subreddits
    return RedditScraper(
        client_id=client_id,
        client_secret=client_secret,
        subreddits=chosen,
        max_posts_per_subreddit=100,
        max_comments_per_post=50,
    )