From 74c720c7ba95101e507cfe3a8c3919c6912d2cf2 Mon Sep 17 00:00:00 2001
From: sepehr
Date: Mon, 21 Apr 2025 15:27:32 +0200
Subject: [PATCH] add pdf parsing
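
Add two standalone scripts for pulling content out of PDFs:

- pdf_to_latex.py renders each page with PyMuPDF and runs Pix2Text on the
  rendered image to collect LaTeX equations into a Markdown file.
- pdf_to_markdown.py converts a whole PDF to Markdown: page text, page and
  region images described through a local Ollama multimodal model, and
  LaTeX equations (Pix2Text when installed, otherwise a regex fallback).

Illustrative invocations (the PDF path and output locations below are
placeholders, not files in this repository):

    python pdf_to_latex.py paper.pdf --output_dir equations -v
    python pdf_to_markdown.py paper.pdf --output_md paper.md \
        --output_dir assets --language english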
---
 pdf_to_latex.py    | 118 +++++++++++++
 pdf_to_markdown.py | 434 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 552 insertions(+)
 create mode 100644 pdf_to_latex.py
 create mode 100644 pdf_to_markdown.py

diff --git a/pdf_to_latex.py b/pdf_to_latex.py
new file mode 100644
index 0000000..8ff92df
--- /dev/null
+++ b/pdf_to_latex.py
@@ -0,0 +1,118 @@
+import os
+import fitz  # PyMuPDF
+import logging
+from PIL import Image
+import io
+import tempfile
+from pix2text import Pix2Text
+import re
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+logger = logging.getLogger(__name__)
+
+class LatexExtractor:
+    def __init__(self):
+        # Initialize Pix2Text with LaTeX OCR engine
+        self.p2t = Pix2Text(math_engine='mfd', math_dpi=150)
+        logger.info("Initialized Pix2Text with LaTeX OCR engine")
+
+    def detect_equations_from_image(self, image_path):
+        """Detect and extract LaTeX equations from an image"""
+        logger.info(f"Processing image: {image_path}")
+
+        try:
+            # Process image with Pix2Text
+            result = self.p2t.recognize(image_path)
+
+            # Extract math blocks (LaTeX equations)
+            math_blocks = []
+            for item in result:
+                if item.get('type') == 'math' and item.get('text'):
+                    math_blocks.append(item.get('text'))
+
+            logger.info(f"Extracted {len(math_blocks)} LaTeX equations from image")
+            return math_blocks
+
+        except Exception as e:
+            logger.error(f"Error extracting LaTeX from image: {str(e)}")
+            return []
+
+    def extract_equations_from_pdf(self, pdf_path, output_dir=None):
+        """Extract LaTeX equations from each page of a PDF"""
+        logger.info(f"Processing PDF: {pdf_path}")
+
+        if output_dir is None:
+            output_dir = os.path.join(os.path.dirname(pdf_path), "equations")
+
+        os.makedirs(output_dir, exist_ok=True)
+
+        # Open the PDF
+        doc = fitz.open(pdf_path)
+        logger.info(f"PDF opened successfully. Document has {len(doc)} pages")
+
+        all_equations = []
+
+        # Process each page
+        for page_num, page in enumerate(doc, 1):
+            logger.info(f"Processing page {page_num}/{len(doc)}")
+
+            # Render page to image
+            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # Higher resolution for better OCR
+
+            # Save the page image to a temporary file
+            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
+                pix.save(tmp.name)
+                tmp_path = tmp.name
+
+            # Process the page image to extract equations
+            page_equations = self.detect_equations_from_image(tmp_path)
+
+            # Add page number information to each equation
+            for i, eq in enumerate(page_equations):
+                all_equations.append({
+                    "page": page_num,
+                    "index": i+1,
+                    "latex": eq
+                })
+
+            # Clean up temporary file
+            os.unlink(tmp_path)
+
+        # Save all equations to a Markdown file
+        md_path = os.path.join(output_dir, f"{os.path.splitext(os.path.basename(pdf_path))[0]}_equations.md")
+        with open(md_path, "w", encoding="utf-8") as f:
+            f.write(f"# Equations from {os.path.basename(pdf_path)}\n\n")
+
+            for eq in all_equations:
+                f.write(f"## Page {eq['page']} - Equation {eq['index']}\n\n")
+                f.write(f"$$\n{eq['latex']}\n$$\n\n")
+
+        logger.info(f"Extracted {len(all_equations)} equations. Saved to {md_path}")
+        return all_equations
+
+def main():
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Extract LaTeX equations from PDF documents")
+    parser.add_argument("pdf_path", help="Path to the PDF file")
+    parser.add_argument("--output_dir", help="Directory to save extracted equations", default=None)
+    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging (DEBUG level)")
+
+    args = parser.parse_args()
+
+    # Set log level
+    if args.verbose:
+        logging.getLogger().setLevel(logging.DEBUG)
+        logger.info("Verbose logging enabled")
+
+    # Extract equations
+    extractor = LatexExtractor()
+    extractor.extract_equations_from_pdf(args.pdf_path, args.output_dir)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/pdf_to_markdown.py b/pdf_to_markdown.py
new file mode 100644
index 0000000..73948d8
--- /dev/null
+++ b/pdf_to_markdown.py
@@ -0,0 +1,434 @@
+import os
+import pymupdf  # PyMuPDF
+import re
+import logging
+import fitz  # PyMuPDF
+import tempfile
+from PIL import Image
+import requests
+import base64
+import io
+from pathlib import Path
+import importlib.util
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+logger = logging.getLogger(__name__)
+
+# Configuration for the Ollama API (or another AI model)
+OLLAMA_API_URL = "http://localhost:11434/api/generate"
+OLLAMA_MODEL = "PetrosStav/gemma3-tools:4b"  # or another multimodal model
+
+# Check if pix2text is available for LaTeX extraction
+try:
+    from pix2text import Pix2Text
+    PIX2TEXT_AVAILABLE = True
+    logger.info("Pix2Text is available - will use it for LaTeX equations")
+except ImportError:
+    PIX2TEXT_AVAILABLE = False
+    logger.warning("Pix2Text not found - LaTeX equations will be extracted using basic methods")
+
+def image_to_base64(image_path):
+    """Convert an image to base64 for the Ollama API"""
+    with open(image_path, "rb") as img_file:
+        return base64.b64encode(img_file.read()).decode("utf-8")
+
+def get_image_description(image_path, language="english"):
+    """Use the Ollama API to describe an image in the requested language"""
+    try:
+        base64_image = image_to_base64(image_path)
+
+        # Adjust prompt based on language
+        if language.lower() == "french":
+            prompt = "Décris cette image en détail. S'il s'agit d'un graphique, d'un diagramme ou d'une figure, explique ce qu'elle représente avec précision."
+        else:
+            prompt = "Describe this image in detail. If this is a chart, diagram, or figure, explain what it represents precisely."
+
+        response = requests.post(
+            OLLAMA_API_URL,
+            json={
+                "model": OLLAMA_MODEL,
+                "prompt": prompt,
+                "images": [base64_image],
+                "stream": False
+            }
+        )
+
+        if response.status_code == 200:
+            return response.json()["response"].strip()
+        else:
+            logger.error(f"Ollama API error: {response.status_code} - {response.text}")
+            return "Error generating description from image."
+    except Exception as e:
+        logger.error(f"Error in image description generation: {str(e)}")
+        return f"Description not available: {str(e)}"
+
+def extract_images_from_page(page, output_dir, pdf_name, page_num):
+    """Extract the images of a PDF page by rendering the page,
+    instead of directly extracting the embedded images (which can come out black)"""
+    images_paths = []
+
+    # Method 1: Direct extraction (can yield black images)
+    try:
+        embedded_images = page.get_images(full=True)
+        logger.info(f"Found {len(embedded_images)} embedded images on page {page_num}")
+
+        for img_index, img in enumerate(embedded_images, 1):
+            try:
+                xref = img[0]
+                base_image = page.parent.extract_image(xref)
+                if base_image:
+                    image_bytes = base_image["image"]
+                    ext = base_image["ext"]
+
+                    # Image path
+                    image_filename = f"{pdf_name}-page{page_num}-embed{img_index}.{ext}"
+                    image_path = os.path.join(output_dir, image_filename)
+
+                    # Save the image
+                    with open(image_path, "wb") as img_file:
+                        img_file.write(image_bytes)
+                    logger.info(f"Embedded image saved: {image_path}")
+
+                    # Check that the image is not black
+                    pil_img = Image.open(image_path)
+                    if is_image_mostly_black(pil_img):
+                        logger.warning(f"Image {image_path} appears to be mostly black, will be ignored")
+                    else:
+                        images_paths.append(image_path)
+            except Exception as e:
+                logger.error(f"Error extracting embedded image {img_index} on page {page_num}: {str(e)}")
+    except Exception as e:
+        logger.error(f"Error extracting embedded images from page {page_num}: {str(e)}")
+
+    # Method 2: Render the full page (better quality, works even when embedded images are black)
+    try:
+        # Render the whole page at high resolution
+        zoom = 2  # Zoom factor for better resolution
+        mat = fitz.Matrix(zoom, zoom)
+        pix = page.get_pixmap(matrix=mat)
+
+        # Save the full-page image
+        page_image_filename = f"{pdf_name}-page{page_num}-full.png"
+        page_image_path = os.path.join(output_dir, page_image_filename)
+        pix.save(page_image_path)
+        logger.info(f"Full page image saved: {page_image_path}")
+
+        # Add the full-page image path
+        images_paths.append(page_image_path)
+
+        # Method 3: Extract image areas from the page
+        # This method uses a heuristic to detect rectangular areas
+        # that may contain images, charts, or diagrams
+        rect_areas = detect_image_areas(page)
+
+        for i, rect in enumerate(rect_areas, 1):
+            try:
+                # Clip a region of the page
+                clip_pix = page.get_pixmap(matrix=mat, clip=rect)
+
+                # Save the clipped image
+                clip_filename = f"{pdf_name}-page{page_num}-clip{i}.png"
+                clip_path = os.path.join(output_dir, clip_filename)
+                clip_pix.save(clip_path)
+
+                # Check that the image is not black and is large enough
+                pil_img = Image.open(clip_path)
+                if pil_img.width > 100 and pil_img.height > 100 and not is_image_mostly_black(pil_img):
+                    logger.info(f"Detected image area saved: {clip_path}")
+                    images_paths.append(clip_path)
+                else:
+                    # Remove small or black areas
+                    os.remove(clip_path)
+                    logger.info(f"Image area too small or black, ignored: {clip_path}")
+            except Exception as e:
+                logger.error(f"Error extracting image area {i} on page {page_num}: {str(e)}")
+    except Exception as e:
+        logger.error(f"Error rendering page {page_num}: {str(e)}")
+
+    return images_paths
+
+def is_image_mostly_black(image, threshold=0.95):
+    """Check whether an image is mostly black"""
+    # Convert to grayscale
+    if image.mode != 'L':
+        image = image.convert('L')
+
+    # Count black pixels
+    pixels = image.getdata()
+    black_pixels = sum(1 for pixel in pixels if pixel < 20)
+    total_pixels = len(pixels)
+
+    # Check the ratio of black pixels
+    return black_pixels / total_pixels > threshold
+
+def detect_image_areas(page):
+    """Detect potential image areas on a page"""
+    # This function is a simple heuristic to detect areas
+    # that may contain images, charts, or diagrams
+
+    # Get the page blocks
+    blocks = page.get_text("dict")["blocks"]
+
+    # Keep only the blocks that are not text
+    image_areas = []
+
+    for block in blocks:
+        # Image blocks generally have a type other than 0 (text)
+        if block["type"] != 0:
+            rect = fitz.Rect(block["bbox"])
+            # Ignore areas that are too small
+            if rect.width > 50 and rect.height > 50:
+                image_areas.append(rect)
+
+    # If no area is detected, try a different approach
+    if not image_areas:
+        # Split the page into sections and treat the sections
+        # that contain no text as potential candidates
+        page_rect = page.rect
+        text_areas = []
+
+        # Get the text areas
+        for block in blocks:
+            if block["type"] == 0:  # Text block
+                text_areas.append(fitz.Rect(block["bbox"]))
+
+        # If we have text areas, treat the rest as potential image areas
+        if text_areas:
+            # A simple heuristic: split the page into 4 quadrants
+            mid_x = page_rect.width / 2
+            mid_y = page_rect.height / 2
+
+            quadrants = [
+                fitz.Rect(0, 0, mid_x, mid_y),
+                fitz.Rect(mid_x, 0, page_rect.width, mid_y),
+                fitz.Rect(0, mid_y, mid_x, page_rect.height),
+                fitz.Rect(mid_x, mid_y, page_rect.width, page_rect.height)
+            ]
+
+            # Check each quadrant
+            for quad in quadrants:
+                # Check whether the quadrant contains text
+                contains_text = any(quad.intersects(text_area) for text_area in text_areas)
+
+                if not contains_text and quad.width > 100 and quad.height > 100:
+                    image_areas.append(quad)
+
+    return image_areas
+
+def extract_latex_from_text(text):
+    """Extract and enhance mathematical equations from text using basic pattern matching"""
+    # Find potential equations in the text
+    equation_patterns = [
+        # Expressions containing these characters are likely equations
+        r'[=<>+\-*/±≈≤≥]',
+        # Common mathematical notations
+        r'[a-zA-Z][_^]',
+        # Fractions, integrals, etc.
+        r'\\frac|\\int|\\sum|\\prod|\\sqrt',
+        # Greek letters
+        r'\\alpha|\\beta|\\gamma|\\delta|\\epsilon|\\theta|\\lambda|\\mu|\\pi',
+        # Already formatted LaTeX
+        r'\$\$.*?\$\$|\$.*?\$'
+    ]
+
+    # Search line by line
+    lines = text.splitlines()
+    latex_chunks = []
+
+    for line in lines:
+        line = line.strip()
+        # Skip lines that are too long (probably not equations)
+        if len(line) > 150:
+            continue
+
+        # Check if the line contains a potential equation
+        is_equation = False
+        for pattern in equation_patterns:
+            if re.search(pattern, line):
+                is_equation = True
+                break
+
+        if is_equation:
+            # Clean the equation
+            eq = line.replace('$$', '').replace('$', '')
+
+            # Improve LaTeX formatting
+            eq = format_equation_for_latex(eq)
+            latex_chunks.append(eq)
+
+    return latex_chunks
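+
+# Illustrative example: a short line such as "F = ma" matches the operator
+# pattern above, so extract_latex_from_text() keeps it as a candidate
+# equation, while lines longer than 150 characters are skipped as prose.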
+
+def extract_latex_with_pix2text(page_image_path):
+    """Extract LaTeX equations from an image using Pix2Text"""
+    if not PIX2TEXT_AVAILABLE:
+        logger.warning("Pix2Text is not available. Install it with: pip install pix2text")
+        return []
+
+    try:
+        # Initialize Pix2Text with LaTeX OCR capabilities
+        p2t = Pix2Text(math_engine='mfd')
+
+        # Process the image
+        result = p2t.recognize(page_image_path)
+
+        # Extract math blocks
+        equations = []
+        for item in result:
+            if item.get('type') == 'math' and item.get('text'):
+                equations.append(item.get('text'))
+
+        logger.info(f"Extracted {len(equations)} equations using Pix2Text")
+        return equations
+    except Exception as e:
+        logger.error(f"Error extracting equations with Pix2Text: {str(e)}")
+        return []
+
+def format_equation_for_latex(eq_text):
+    """Improves LaTeX formatting of equations"""
+    # 1. Fix subscripts (wrap multi-character subscripts in braces)
+    eq_text = re.sub(r'([a-zA-Z])_([a-zA-Z0-9]+)', r'\1_{\2}', eq_text)
+
+    # 2. Fix superscripts (wrap multi-character superscripts in braces)
+    eq_text = re.sub(r'([a-zA-Z0-9])(\^)([a-zA-Z0-9]+)', r'\1\2{\3}', eq_text)
+
+    # 3. Remove equation numbers
+    eq_text = re.sub(r'\((\d+)\)$', r'', eq_text).strip()
+
+    # 4. Convert simple fractions to \frac
+    fraction_match = re.search(r'([a-zA-Z0-9]+)\s*/\s*([a-zA-Z0-9]+)', eq_text)
+    if fraction_match:
+        numerator, denominator = fraction_match.groups()
+        eq_text = eq_text.replace(f"{numerator}/{denominator}", f"\\frac{{{numerator}}}{{{denominator}}}")
+
+    # 5. Add spaces around operators
+    operators = ['+', '-', '=', '<', '>', '\\approx', '\\sim', '\\equiv']
+    for op in operators:
+        if op != '-':  # Avoid modifying negative signs
+            eq_text = eq_text.replace(op, f" {op} ")
+
+    # Remove double spaces
+    while '  ' in eq_text:
+        eq_text = eq_text.replace('  ', ' ')
+
+    return eq_text.strip()
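+
+# Illustrative example: format_equation_for_latex("E=mc^2 (3)") should return
+# "E = mc^{2}" -- the superscript is braced, the trailing equation number
+# "(3)" is dropped, and spaces are added around the "=" operator.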
+
+def process_pdf_to_markdown(pdf_path, output_md_path, output_dir="output", lang="english"):
+    """Process a PDF and generate a Markdown file with text, images, and LaTeX equations"""
+    logger.info(f"Processing PDF: {pdf_path}")
+
+    # Create output directory if it doesn't exist
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Open the PDF with PyMuPDF
+    doc = fitz.open(pdf_path)
+    logger.info(f"PDF opened successfully. Document has {len(doc)} pages")
+
+    # Initialize Markdown content
+    md_content = []
+    md_content.append(f"# {os.path.splitext(os.path.basename(pdf_path))[0]}\n")
+
+    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
+
+    # Process each page for text, images, and equations
+    for page_num, page in enumerate(doc, 1):
+        logger.info(f"Processing page {page_num}/{len(doc)}")
+
+        # Extract text
+        text = page.get_text("text")
+        logger.info(f"Extracted {len(text)} characters of text from page {page_num}")
+
+        # Add text to Markdown
+        md_content.append(f"## Page {page_num}\n")
+        md_content.append(f"{text.strip()}\n")
+
+        # Extract images using multiple methods to ensure they're not black
+        image_paths = extract_images_from_page(page, output_dir, pdf_name, page_num)
+        logger.info(f"Extracted {len(image_paths)} images from page {page_num}")
+
+        # Process each extracted image
+        for img_index, image_path in enumerate(image_paths, 1):
+            try:
+                # Generate image description
+                logger.info(f"Generating description for image {img_index} on page {page_num}")
+                description = get_image_description(image_path, language=lang)
+                logger.info(f"Description generated: {description[:50]}..." if len(description) > 50 else f"Description generated: {description}")
+
+                # Add image and description to Markdown
+                md_content.append(f"\n![Image {page_num}-{img_index}]({image_path})\n")
+                md_content.append(f"**Description:** {description}\n")
+            except Exception as e:
+                logger.error(f"Error processing image {img_index} on page {page_num}: {str(e)}")
+
+        # Extract and enhance equations - use Pix2Text if available
+        logger.info(f"Extracting equations from page {page_num}")
+        latex_equations = []
+
+        if PIX2TEXT_AVAILABLE:
+            # Render the page to an image for Pix2Text processing
+            temp_pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
+            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
+                temp_pix.save(tmp.name)
+                temp_path = tmp.name
+
+            # Extract equations using Pix2Text
+            latex_equations = extract_latex_with_pix2text(temp_path)
+
+            # Clean up temp file
+            os.unlink(temp_path)
+        else:
+            # Fallback to basic extraction
+            latex_equations = extract_latex_from_text(text)
+
+        logger.info(f"Found {len(latex_equations)} potential equations on page {page_num}")
+
+        for i, eq in enumerate(latex_equations, 1):
+            try:
+                logger.info(f"Equation {i}: {eq[:30]}..." if len(eq) > 30 else f"Equation {i}: {eq}")
+
+                # Add equation to Markdown
+                md_content.append(f"\n$$\n{eq}\n$$\n")
+            except Exception as e:
+                logger.error(f"Error formatting equation {i} on page {page_num}: {str(e)}")
+
+    # Write content to Markdown file
+    with open(output_md_path, "w", encoding="utf-8") as md_file:
+        md_file.write("\n".join(md_content))
+
+    logger.info(f"Markdown file generated: {output_md_path}")
+    print(f"Conversion complete. Markdown file generated: {output_md_path}")
+
+    return output_md_path
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Convert PDF to Markdown with text, images, and LaTeX equations")
+    parser.add_argument("pdf_path", help="Path to the PDF file")
+    parser.add_argument("--output_md", default="output.md", help="Path to output Markdown file")
+    parser.add_argument("--output_dir", default="output", help="Directory for extracted images")
+    parser.add_argument("--language", default="english", choices=["english", "french"],
+                        help="Language for image descriptions (english or french)")
+    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging (DEBUG level)")
+
+    args = parser.parse_args()
+
+    # Set log level
+    if args.verbose:
+        logging.getLogger().setLevel(logging.DEBUG)
+        logger.info("Verbose logging enabled")
+
+    # Process PDF
+    process_pdf_to_markdown(args.pdf_path, args.output_md, args.output_dir, args.language)
\ No newline at end of file