# rag/pdf_to_latex.py
# 2025-04-21 15:27:32 +02:00
#
# 118 lines
# 4.3 KiB
# Python

import os
import fitz # PyMuPDF
import logging
from PIL import Image
import io
import tempfile
from pix2text import Pix2Text
import re
# Module-wide logging setup: records are emitted at INFO and above, formatted
# as "<timestamp> - <LEVEL> - <message>".
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Logger named after this module; used by the extractor class and the CLI below.
logger = logging.getLogger(__name__)
class LatexExtractor:
    """Extract LaTeX equations from images and from the pages of a PDF.

    Recognition is delegated to a ``Pix2Text`` instance created in
    ``__init__``; PDF pages are rasterized with PyMuPDF (``fitz``) and fed
    through the same image path.
    """

    def __init__(self):
        """Create the Pix2Text recognizer used by all extraction methods."""
        # NOTE(review): the 'mfd' engine name and math_dpi value are passed
        # straight through to Pix2Text — confirm they match the constructor
        # of the installed Pix2Text version.
        self.p2t = Pix2Text(math_engine='mfd', math_dpi=150)
        logger.info("Initialized Pix2Text with LaTeX OCR engine")

    def detect_equations_from_image(self, image_path):
        """Detect and extract LaTeX equations from an image.

        Args:
            image_path: Path of the image file handed to ``Pix2Text.recognize``.

        Returns:
            A list with the ``'text'`` value of every recognized item whose
            ``'type'`` is ``'math'`` and whose text is non-empty. An empty
            list is returned when recognition fails for any reason; the
            failure is logged (with traceback) rather than raised, so one
            bad page does not abort a whole document.
        """
        logger.info(f"Processing image: {image_path}")
        try:
            # Assumes recognize() yields dict-like items exposing .get()
            # with 'type' and 'text' keys — TODO confirm against Pix2Text.
            result = self.p2t.recognize(image_path)
            math_blocks = [
                item.get('text')
                for item in result
                if item.get('type') == 'math' and item.get('text')
            ]
            logger.info(f"Extracted {len(math_blocks)} LaTeX equations from image")
            return math_blocks
        except Exception as e:
            # Deliberate best-effort boundary: log with traceback and carry on.
            logger.exception(f"Error extracting LaTeX from image: {str(e)}")
            return []

    def extract_equations_from_pdf(self, pdf_path, output_dir=None):
        """Extract LaTeX equations from each page of a PDF.

        Every page is rendered to a temporary PNG, passed through
        ``detect_equations_from_image``, and the results are written to
        ``<output_dir>/<pdf basename>_equations.md``.

        Args:
            pdf_path: Path to the PDF file.
            output_dir: Directory for the Markdown report. When ``None``, an
                ``equations`` directory next to the PDF is used. The
                directory is created if it does not exist.

        Returns:
            A list of dicts, one per equation, with keys ``"page"``
            (1-based page number), ``"index"`` (1-based position within the
            page) and ``"latex"`` (the recognized text).

        Raises:
            Whatever ``fitz.open``, page rendering, or file I/O raise; only
            recognition errors are swallowed (inside
            ``detect_equations_from_image``).
        """
        logger.info(f"Processing PDF: {pdf_path}")
        if output_dir is None:
            output_dir = os.path.join(os.path.dirname(pdf_path), "equations")
        os.makedirs(output_dir, exist_ok=True)

        all_equations = []
        # The context manager guarantees the document is closed even when a
        # page fails to render.
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)
            logger.info(f"PDF opened successfully. Document has {page_count} pages")

            for page_num, page in enumerate(doc, 1):
                logger.info(f"Processing page {page_num}/{page_count}")
                # Render page to image; 2x zoom gives higher resolution for better OCR
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))

                # Reserve a unique temp path and release the handle before
                # writing to it by name.
                with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                    tmp_path = tmp.name
                try:
                    pix.save(tmp_path)
                    page_equations = self.detect_equations_from_image(tmp_path)
                finally:
                    # Always remove the temp image, including on render/save errors.
                    os.unlink(tmp_path)

                # Tag each equation with its page and 1-based position on that page.
                all_equations.extend(
                    {"page": page_num, "index": eq_index, "latex": eq}
                    for eq_index, eq in enumerate(page_equations, 1)
                )

        # Save all equations to a Markdown file
        pdf_name = os.path.basename(pdf_path)
        md_path = os.path.join(output_dir, f"{os.path.splitext(pdf_name)[0]}_equations.md")
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(f"# Equations from {pdf_name}\n\n")
            for eq in all_equations:
                f.write(f"## Page {eq['page']} - Equation {eq['index']}\n\n")
                f.write(f"$$\n{eq['latex']}\n$$\n\n")

        logger.info(f"Extracted {len(all_equations)} equations. Saved to {md_path}")
        return all_equations
def main():
    """Command-line entry point.

    Parses ``pdf_path``, ``--output_dir`` and ``--verbose``/``-v`` from the
    command line, optionally raises the root logger to DEBUG, and runs
    ``LatexExtractor.extract_equations_from_pdf`` on the given PDF.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Extract LaTeX equations from PDF documents")
    cli.add_argument("pdf_path", help="Path to the PDF file")
    cli.add_argument("--output_dir", default=None, help="Directory to save extracted equations")
    cli.add_argument(
        "--verbose", "-v",
        help="Enable verbose logging (DEBUG level)",
        action="store_true",
    )
    options = cli.parse_args()

    # Verbose mode lowers the root logger threshold to DEBUG.
    if options.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
        logger.info("Verbose logging enabled")

    # Build the extractor and process the requested PDF.
    LatexExtractor().extract_equations_from_pdf(options.pdf_path, options.output_dir)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()