118 lines
4.3 KiB
Python
118 lines
4.3 KiB
Python
import os
|
|
import fitz # PyMuPDF
|
|
import logging
|
|
from PIL import Image
|
|
import io
|
|
import tempfile
|
|
from pix2text import Pix2Text
|
|
import re
|
|
|
|
# Root logging setup: INFO-level records prefixed with a timestamp and the
# level name. Applied once, when this module is first imported.
_LOG_SETTINGS = {
    "level": logging.INFO,
    "format": '%(asctime)s - %(levelname)s - %(message)s',
    "datefmt": '%Y-%m-%d %H:%M:%S',
}
logging.basicConfig(**_LOG_SETTINGS)

# Module-scoped logger used by everything in this file.
logger = logging.getLogger(__name__)
|
|
|
|
class LatexExtractor:
    """Extract LaTeX equations from images and PDF pages with Pix2Text.

    Each PDF page is rendered to a temporary PNG, passed through the
    Pix2Text recognizer, and the recognized equations are collected into a
    single Markdown report.
    """

    def __init__(self):
        """Create the Pix2Text recognizer shared by all extraction calls."""
        # NOTE(review): the keyword arguments are forwarded unchanged; confirm
        # that `math_engine` / `math_dpi` are accepted by the installed
        # Pix2Text version.
        self.p2t = Pix2Text(math_engine='mfd', math_dpi=150)
        logger.info("Initialized Pix2Text with LaTeX OCR engine")

    def detect_equations_from_image(self, image_path):
        """Detect and extract LaTeX equations from an image.

        Args:
            image_path: Path of the image passed to ``Pix2Text.recognize``.

        Returns:
            A list with the ``text`` of every recognized item whose ``type``
            is ``'math'`` and whose text is non-empty. An empty list is
            returned when recognition raises; the error is logged, not
            propagated, so one bad page does not abort a whole document.
        """
        logger.info(f"Processing image: {image_path}")

        try:
            result = self.p2t.recognize(image_path)

            # NOTE(review): assumes recognize() yields dict-like items and
            # labels formulas with type 'math' -- confirm against the
            # installed Pix2Text version.
            math_blocks = [
                item.get('text')
                for item in result
                if item.get('type') == 'math' and item.get('text')
            ]

            logger.info(f"Extracted {len(math_blocks)} LaTeX equations from image")
            return math_blocks

        except Exception as e:
            # Best-effort by design: report the failure (with traceback) and
            # let the caller continue with the remaining pages.
            logger.exception(f"Error extracting LaTeX from image: {e}")
            return []

    def _extract_page_equations(self, page):
        """Render one PDF page to a temporary PNG and detect its equations."""
        # Higher resolution for better OCR.
        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))

        # Reserve a unique file name, then close the handle before writing so
        # the file is only ever opened by one writer at a time.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            tmp_path = tmp.name

        try:
            pix.save(tmp_path)
            return self.detect_equations_from_image(tmp_path)
        finally:
            # Always remove the temporary image, even if saving failed.
            os.unlink(tmp_path)

    def extract_equations_from_pdf(self, pdf_path, output_dir=None):
        """Extract LaTeX equations from each page of a PDF.

        Args:
            pdf_path: Path to the PDF file to process.
            output_dir: Directory for the Markdown report. Defaults to an
                ``equations`` directory next to the PDF. Created if missing.

        Returns:
            A list of dicts, one per equation, with the keys ``"page"``
            (1-based page number), ``"index"`` (1-based position on that
            page) and ``"latex"`` (the recognized LaTeX string).

        Raises:
            Whatever ``fitz.open``, page rendering, or writing the report
            raises; only recognition errors are swallowed (per page).
        """
        logger.info(f"Processing PDF: {pdf_path}")

        if output_dir is None:
            output_dir = os.path.join(os.path.dirname(pdf_path), "equations")

        os.makedirs(output_dir, exist_ok=True)

        all_equations = []

        doc = fitz.open(pdf_path)
        try:
            page_count = len(doc)
            logger.info(f"PDF opened successfully. Document has {page_count} pages")

            for page_num, page in enumerate(doc, 1):
                logger.info(f"Processing page {page_num}/{page_count}")

                page_equations = self._extract_page_equations(page)
                for index, eq in enumerate(page_equations, 1):
                    all_equations.append({
                        "page": page_num,
                        "index": index,
                        "latex": eq,
                    })
        finally:
            # Release the document handle even when a page fails to render.
            doc.close()

        # Save all equations to a Markdown file named after the PDF.
        pdf_name = os.path.basename(pdf_path)
        md_path = os.path.join(
            output_dir, f"{os.path.splitext(pdf_name)[0]}_equations.md"
        )
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(f"# Equations from {pdf_name}\n\n")

            for eq in all_equations:
                f.write(f"## Page {eq['page']} - Equation {eq['index']}\n\n")
                f.write(f"$$\n{eq['latex']}\n$$\n\n")

        logger.info(f"Extracted {len(all_equations)} equations. Saved to {md_path}")
        return all_equations
|
|
|
|
def main(argv=None):
    """Command-line entry point: extract the equations of one PDF.

    Args:
        argv: Optional list of command-line argument strings. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, so calling
            ``main()`` with no arguments behaves as before.

    Raises:
        SystemExit: Raised by argparse on ``--help`` or invalid arguments.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Extract LaTeX equations from PDF documents")
    parser.add_argument("pdf_path", help="Path to the PDF file")
    parser.add_argument("--output_dir", help="Directory to save extracted equations", default=None)
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging (DEBUG level)")

    args = parser.parse_args(argv)

    # Raise the root logger to DEBUG so every logger in the process is verbose.
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
        logger.info("Verbose logging enabled")

    # Run the extraction; the report is written inside the extractor.
    extractor = LatexExtractor()
    extractor.extract_equations_from_pdf(args.pdf_path, args.output_dir)
|
|
|
|
# Run the command-line interface only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()