# rag/pdf_to_latex.py

import os
import fitz  # PyMuPDF
import logging
from PIL import Image
import io
import tempfile
from pix2text import Pix2Text
import re

# Root logging setup: INFO and above, timestamped "<time> - <level> - <message>" lines.
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger named after this module; used by every function and method below.
logger = logging.getLogger(__name__)

class LatexExtractor:
    """Extract LaTeX equations from images and PDF pages via Pix2Text."""

    def __init__(self):
        """Create the Pix2Text recogniser shared by all extraction calls."""
        # NOTE(review): these keyword arguments are passed through unchanged;
        # confirm they are accepted by the installed Pix2Text version.
        self.p2t = Pix2Text(math_engine='mfd', math_dpi=150)
        logger.info("Initialized Pix2Text with LaTeX OCR engine")

    def detect_equations_from_image(self, image_path):
        """Detect and extract LaTeX equations from an image.

        Args:
            image_path: Image location, handed as-is to ``self.p2t.recognize``.

        Returns:
            A list with the ``'text'`` value of every recognised item whose
            ``'type'`` is ``'math'`` and whose text is non-empty, in the
            order the recogniser returned them. Any exception raised while
            recognising or filtering is logged and an empty list is returned
            instead (best-effort behaviour).
        """
        logger.info(f"Processing image: {image_path}")

        try:
            result = self.p2t.recognize(image_path)

            # NOTE(review): assumes each result item is a dict carrying
            # 'type' and 'text' keys -- confirm against the Pix2Text version
            # in use.
            math_blocks = []
            for item in result:
                text = item.get('text')
                if item.get('type') == 'math' and text:
                    math_blocks.append(text)

            logger.info(f"Extracted {len(math_blocks)} LaTeX equations from image")
            return math_blocks

        except Exception as e:
            # Deliberate best-effort: a failing page must not abort the caller.
            logger.error(f"Error extracting LaTeX from image: {str(e)}")
            return []

    def _extract_page_equations(self, page):
        """Render one PDF page to a temporary PNG and return its equations."""
        # 2x zoom in both directions: higher resolution for better OCR.
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))

        # Only reserve a unique path here; the handle is closed before the
        # pixmap is written because reopening a still-open temporary file by
        # name is not portable (it fails on Windows).
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            tmp_path = tmp.name

        try:
            pix.save(tmp_path)
            return self.detect_equations_from_image(tmp_path)
        finally:
            # Always remove the temporary image, even if saving failed.
            os.unlink(tmp_path)

    def extract_equations_from_pdf(self, pdf_path, output_dir=None):
        """Extract LaTeX equations from each page of a PDF.

        Every page is rendered to an image and passed through
        ``detect_equations_from_image``. All equations found are also written
        to ``<output_dir>/<pdf base name>_equations.md``.

        Args:
            pdf_path: Path of the PDF file to process.
            output_dir: Directory for the Markdown file. When ``None``, an
                ``equations`` directory next to the PDF is used. The
                directory is created if it does not exist.

        Returns:
            A list of dicts, one per equation, with keys ``"page"`` (1-based
            page number), ``"index"`` (1-based position within that page) and
            ``"latex"`` (the recognised text).

        Raises:
            Whatever ``fitz.open``, page rendering or file I/O raise; only
            recognition errors are swallowed (inside
            ``detect_equations_from_image``).
        """
        logger.info(f"Processing PDF: {pdf_path}")

        if output_dir is None:
            output_dir = os.path.join(os.path.dirname(pdf_path), "equations")

        os.makedirs(output_dir, exist_ok=True)

        all_equations = []

        doc = fitz.open(pdf_path)
        try:
            page_count = len(doc)
            logger.info(f"PDF opened successfully. Document has {page_count} pages")

            for page_num, page in enumerate(doc, 1):
                logger.info(f"Processing page {page_num}/{page_count}")

                page_equations = self._extract_page_equations(page)

                # Tag every equation with where it was found.
                all_equations.extend(
                    {"page": page_num, "index": index, "latex": eq}
                    for index, eq in enumerate(page_equations, 1)
                )
        finally:
            # Release the document even when a page fails to render.
            doc.close()

        # Save all equations to a Markdown file
        pdf_name = os.path.basename(pdf_path)
        md_path = os.path.join(output_dir, f"{os.path.splitext(pdf_name)[0]}_equations.md")
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(f"# Equations from {pdf_name}\n\n")

            for eq in all_equations:
                f.write(f"## Page {eq['page']} - Equation {eq['index']}\n\n")
                f.write(f"$$\n{eq['latex']}\n$$\n\n")

        logger.info(f"Extracted {len(all_equations)} equations. Saved to {md_path}")
        return all_equations

def main():
    """Command-line entry point: parse the arguments and run the extraction.

    Takes a positional ``pdf_path``, an optional ``--output_dir`` and a
    ``--verbose``/``-v`` flag that switches the root logger to DEBUG.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Extract LaTeX equations from PDF documents")
    cli.add_argument("pdf_path", help="Path to the PDF file")
    cli.add_argument("--output_dir", default=None, help="Directory to save extracted equations")
    cli.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging (DEBUG level)",
    )
    opts = cli.parse_args()

    # Raise verbosity on the root logger so every module's logger is affected.
    if opts.verbose:
        root_logger = logging.getLogger()
        root_logger.setLevel(logging.DEBUG)
        logger.info("Verbose logging enabled")

    # Build the extractor and process the requested PDF.
    LatexExtractor().extract_equations_from_pdf(opts.pdf_path, opts.output_dir)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()