import io
import logging
import os
import re
import tempfile

import fitz  # PyMuPDF
from PIL import Image
from pix2text import Pix2Text

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)


class LatexExtractor:
    """Extract LaTeX equations from images and PDF pages using Pix2Text."""

    def __init__(self):
        """Create the Pix2Text recognizer used by all extraction methods."""
        # NOTE(review): the keyword arguments are passed through unchanged;
        # confirm they match the installed Pix2Text version's constructor.
        self.p2t = Pix2Text(math_engine='mfd', math_dpi=150)
        logger.info("Initialized Pix2Text with LaTeX OCR engine")

    def detect_equations_from_image(self, image_path):
        """Detect and extract LaTeX equations from an image.

        Args:
            image_path: Path of the image file handed to ``Pix2Text.recognize``.

        Returns:
            A list with the ``text`` of every recognized item whose ``type``
            is ``'math'`` and whose text is non-empty. Any exception raised
            during recognition is logged (with traceback) and an empty list
            is returned instead, so callers never see the failure directly.
        """
        logger.info("Processing image: %s", image_path)
        try:
            # assumes recognize() yields dict-like items with 'type'/'text'
            # keys -- TODO confirm against the installed Pix2Text version
            result = self.p2t.recognize(image_path)
            math_blocks = [
                item.get('text')
                for item in result
                if item.get('type') == 'math' and item.get('text')
            ]
        except Exception as e:
            # Deliberate best-effort boundary: one unreadable page must not
            # abort a whole document. logger.exception keeps the traceback.
            logger.exception("Error extracting LaTeX from image: %s", e)
            return []
        logger.info("Extracted %d LaTeX equations from image", len(math_blocks))
        return math_blocks

    def extract_equations_from_pdf(self, pdf_path, output_dir=None):
        """Extract LaTeX equations from each page of a PDF.

        Every page is rendered to a temporary PNG, run through
        ``detect_equations_from_image``, and the results are written to
        ``<output_dir>/<pdf basename>_equations.md``.

        Args:
            pdf_path: Path to the PDF file.
            output_dir: Directory for the Markdown output. Defaults to an
                ``equations`` directory next to the PDF. Created if missing.

        Returns:
            A list of dicts with keys ``page`` (1-based page number),
            ``index`` (1-based position of the equation on its page) and
            ``latex`` (the equation text).
        """
        logger.info("Processing PDF: %s", pdf_path)

        if output_dir is None:
            output_dir = os.path.join(os.path.dirname(pdf_path), "equations")
        os.makedirs(output_dir, exist_ok=True)

        all_equations = []
        # The context manager guarantees the document handle is released
        # even if rendering or OCR of some page raises.
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)
            logger.info(
                "PDF opened successfully. Document has %d pages", page_count
            )
            for page_num, page in enumerate(doc, 1):
                logger.info("Processing page %d/%d", page_num, page_count)
                page_equations = self._extract_page_equations(page)
                all_equations.extend(
                    {"page": page_num, "index": i, "latex": eq}
                    for i, eq in enumerate(page_equations, 1)
                )

        md_path = self._write_markdown(pdf_path, output_dir, all_equations)
        logger.info(
            "Extracted %d equations. Saved to %s", len(all_equations), md_path
        )
        return all_equations

    def _extract_page_equations(self, page):
        """Render one PDF page to a temporary PNG and return its equations."""
        # 2x zoom: higher resolution for better OCR
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))

        # Only reserve a unique path here; the handle is closed before the
        # image is written because an open NamedTemporaryFile cannot be
        # reopened by name on Windows.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            tmp_path = tmp.name
        try:
            pix.save(tmp_path)
            return self.detect_equations_from_image(tmp_path)
        finally:
            # Always remove the temporary image, including on failure.
            os.unlink(tmp_path)

    @staticmethod
    def _write_markdown(pdf_path, output_dir, equations):
        """Write the equations to a Markdown file and return its path."""
        pdf_name = os.path.basename(pdf_path)
        md_path = os.path.join(
            output_dir, f"{os.path.splitext(pdf_name)[0]}_equations.md"
        )
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(f"# Equations from {pdf_name}\n\n")
            for eq in equations:
                f.write(f"## Page {eq['page']} - Equation {eq['index']}\n\n")
                f.write(f"$$\n{eq['latex']}\n$$\n\n")
        return md_path


def main():
    """Command-line entry point: extract equations from the given PDF."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Extract LaTeX equations from PDF documents"
    )
    parser.add_argument("pdf_path", help="Path to the PDF file")
    parser.add_argument(
        "--output_dir",
        help="Directory to save extracted equations",
        default=None,
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose logging (DEBUG level)",
    )
    args = parser.parse_args()

    # Set log level
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
        logger.info("Verbose logging enabled")

    # Extract equations
    extractor = LatexExtractor()
    extractor.extract_equations_from_pdf(args.pdf_path, args.output_dir)


if __name__ == "__main__":
    main()