"""Convert a PDF into a Markdown file.

For every page the script writes the plain text, saves images found on the
page (embedded images, a full-page render and clipped image regions), asks
an Ollama model to describe each saved image, and appends equations either
recognised by Pix2Text (when installed) or guessed from the page text.
"""

import base64
import importlib.util
import io
import logging
import os
import re
import tempfile
from pathlib import Path
from typing import List

import fitz  # PyMuPDF
import pymupdf  # PyMuPDF
import requests
from PIL import Image

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Configuration for the Ollama API (or another AI model endpoint)
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "PetrosStav/gemma3-tools:4b"  # or another multimodal model
# (connect, read) timeout in seconds. Generation can be slow on a local
# model, so the read timeout is generous; it only exists so that a stalled
# server cannot hang the conversion forever.
OLLAMA_TIMEOUT = (10, 300)

# Check if pix2text is available for LaTeX extraction
try:
    from pix2text import Pix2Text
    PIX2TEXT_AVAILABLE = True
    logger.info("Pix2Text is available - will use it for LaTeX equations")
except ImportError:
    PIX2TEXT_AVAILABLE = False
    logger.warning("Pix2Text not found - LaTeX equations will be extracted using basic methods")

# Zoom factor applied when rendering pages or page regions to pixmaps.
_RENDER_ZOOM = 2

# Lazily created Pix2Text instance, shared by all pages (see _get_pix2text).
_pix2text_instance = None

# Heuristics used by extract_latex_from_text: a line matching any of these
# is treated as a potential equation.
_EQUATION_PATTERNS = [
    re.compile(pattern)
    for pattern in (
        # Expressions containing these characters are likely equations
        r'[=<>+\-*/±≈≤≥]',
        # Common mathematical notations
        r'[a-zA-Z][_^]',
        # Fractions, integrals, etc.
        r'\\frac|\\int|\\sum|\\prod|\\sqrt',
        # Greek letters
        r'\\alpha|\\beta|\\gamma|\\delta|\\epsilon|\\theta|\\lambda|\\mu|\\pi',
        # Already formatted LaTeX
        r'\$\$.*?\$\$|\$.*?\$',
    )
]

# "numerator / denominator" made of plain alphanumeric tokens.
_SIMPLE_FRACTION_RE = re.compile(r'([a-zA-Z0-9]+)\s*/\s*([a-zA-Z0-9]+)')

# Operators that get surrounding spaces. '-' is deliberately absent so that
# negative signs are not modified.
_SPACED_OPERATORS = ('+', '=', '<', '>', '\\approx', '\\sim', '\\equiv')


def image_to_base64(image_path):
    """Read the file at *image_path* and return its content as a base64 string."""
    with open(image_path, "rb") as img_file:
        return base64.b64encode(img_file.read()).decode("utf-8")


def get_image_description(image_path, language="english"):
    """Ask the Ollama API to describe an image.

    Args:
        image_path: Path of the image file to describe.
        language: "french" (case-insensitive) selects the French prompt;
            any other value selects the English prompt.

    Returns:
        The stripped "response" field of the API reply on HTTP 200. On any
        other status or on any exception a human-readable error string is
        returned instead; this function does not raise.
    """
    try:
        base64_image = image_to_base64(image_path)

        # Adjust prompt based on language
        if language.lower() == "french":
            prompt = "Décris cette image en détail. S'il s'agit d'un graphique, d'un diagramme ou d'une figure, explique ce qu'elle représente avec précision."
        else:
            prompt = "Describe this image in detail. If this is a chart, diagram, or figure, explain what it represents precisely."

        response = requests.post(
            OLLAMA_API_URL,
            json={
                "model": OLLAMA_MODEL,
                "prompt": prompt,
                "images": [base64_image],
                "stream": False
            },
            # Without a timeout a stalled server blocks the whole run.
            timeout=OLLAMA_TIMEOUT,
        )

        if response.status_code == 200:
            return response.json()["response"].strip()

        logger.error(f"Ollama API error: {response.status_code} - {response.text}")
        return "Error generating description from image."
    except Exception as e:
        # Best effort: a failed description must not abort the conversion.
        logger.error(f"Error in image description generation: {str(e)}")
        return f"Description not available: {str(e)}"


def _save_embedded_images(page, output_dir, pdf_name, page_num) -> List[str]:
    """Save the images embedded in *page* and return the paths worth keeping.

    Every embedded image is written to *output_dir*; images that are mostly
    black stay on disk but are left out of the returned list.
    """
    kept_paths = []
    embedded_images = page.get_images(full=True)
    logger.info(f"Found {len(embedded_images)} embedded images on page {page_num}")

    for img_index, img in enumerate(embedded_images, 1):
        try:
            xref = img[0]
            base_image = page.parent.extract_image(xref)
            if not base_image:
                continue

            image_bytes = base_image["image"]
            ext = base_image["ext"]

            # Image path
            image_filename = f"{pdf_name}-page{page_num}-embed{img_index}.{ext}"
            image_path = os.path.join(output_dir, image_filename)

            # Save the image
            with open(image_path, "wb") as img_file:
                img_file.write(image_bytes)
            logger.info(f"Embedded image saved: {image_path}")

            # Check that the image is not black; close the file handle
            # as soon as the check is done.
            with Image.open(image_path) as pil_img:
                mostly_black = is_image_mostly_black(pil_img)

            if mostly_black:
                logger.warning(f"Image {image_path} appears to be mostly black, will be ignored")
            else:
                kept_paths.append(image_path)
        except Exception as e:
            logger.error(f"Error extracting embedded image {img_index} on page {page_num}: {str(e)}")

    return kept_paths


def _save_clip_if_useful(page, matrix, rect, clip_path) -> bool:
    """Render *rect* of *page* to *clip_path*; keep it only if large and not black.

    Returns True when the file was kept, False when it was deleted again.
    """
    # Cut out a region of the page and save it
    page.get_pixmap(matrix=matrix, clip=rect).save(clip_path)

    # Check that the image is not black and that it is large enough. The
    # handle is closed before a possible os.remove, which would otherwise
    # fail on Windows.
    with Image.open(clip_path) as pil_img:
        useful = (
            pil_img.width > 100
            and pil_img.height > 100
            and not is_image_mostly_black(pil_img)
        )

    if useful:
        logger.info(f"Detected image area saved: {clip_path}")
        return True

    # Delete small or black areas
    os.remove(clip_path)
    logger.info(f"Image area too small or black, ignored: {clip_path}")
    return False


def extract_images_from_page(page, output_dir, pdf_name, page_num):
    """Extract images from a PDF page using three complementary methods.

    Directly extracted embedded images can come out black, so the page is
    also rendered as a whole and candidate image regions are clipped out.

    Args:
        page: PyMuPDF page to process.
        output_dir: Existing directory in which image files are written.
        pdf_name: Base name used as prefix of the generated file names.
        page_num: Page number used in file names and log messages.

    Returns:
        List of paths of the saved images, in this order: usable embedded
        images, the full-page render, then the kept clipped regions. Errors
        are logged and never raised.
    """
    images_paths = []

    # Method 1: direct extraction (may yield black images)
    try:
        images_paths.extend(_save_embedded_images(page, output_dir, pdf_name, page_num))
    except Exception as e:
        logger.error(f"Error extracting embedded images from page {page_num}: {str(e)}")

    # Method 2: render the whole page (better quality, works even when the
    # embedded images are black)
    try:
        # Render the whole page at high resolution
        mat = fitz.Matrix(_RENDER_ZOOM, _RENDER_ZOOM)
        pix = page.get_pixmap(matrix=mat)

        # Save the image of the whole page
        page_image_filename = f"{pdf_name}-page{page_num}-full.png"
        page_image_path = os.path.join(output_dir, page_image_filename)
        pix.save(page_image_path)
        logger.info(f"Full page image saved: {page_image_path}")

        # Add the path of the whole-page image
        images_paths.append(page_image_path)

        # Method 3: extract image areas of the page. A heuristic detects
        # rectangular areas that may contain images, charts or diagrams.
        for i, rect in enumerate(detect_image_areas(page), 1):
            try:
                clip_filename = f"{pdf_name}-page{page_num}-clip{i}.png"
                clip_path = os.path.join(output_dir, clip_filename)
                if _save_clip_if_useful(page, mat, rect, clip_path):
                    images_paths.append(clip_path)
            except Exception as e:
                logger.error(f"Error extracting image area {i} on page {page_num}: {str(e)}")
    except Exception as e:
        logger.error(f"Error rendering page {page_num}: {str(e)}")

    return images_paths


def is_image_mostly_black(image, threshold=0.95):
    """Return True when more than *threshold* of the pixels are near black.

    Args:
        image: PIL image; it is converted to greyscale when necessary.
        threshold: Fraction of dark pixels (grey level below 20) above which
            the image counts as black.

    Returns:
        True when the dark-pixel ratio exceeds *threshold*. An image without
        any pixel is reported as black because it shows nothing.
    """
    # Convert to greyscale
    if image.mode != 'L':
        image = image.convert('L')

    total_pixels = image.width * image.height
    if total_pixels == 0:
        # Avoids a division by zero on degenerate images.
        return True

    # For an 'L' image the histogram has one bucket per grey level, so the
    # first 20 buckets count the pixels with a value below 20.
    black_pixels = sum(image.histogram()[:20])

    # Check the ratio of black pixels
    return black_pixels / total_pixels > threshold


def detect_image_areas(page):
    """Detect areas of a page that may contain images, charts or diagrams.

    Simple heuristic: every non-text block larger than 50x50 is a candidate.
    When there is none but the page has text, the page is split into four
    quadrants and every quadrant that intersects no text block is returned.

    Args:
        page: PyMuPDF page to analyse.

    Returns:
        List of fitz.Rect candidate areas (possibly empty).
    """
    # Get the blocks of the page
    blocks = page.get_text("dict")["blocks"]

    # Keep the blocks that are not text (type 0), ignoring areas that are
    # too small
    image_areas = []
    for block in blocks:
        if block["type"] != 0:
            rect = fitz.Rect(block["bbox"])
            if rect.width > 50 and rect.height > 50:
                image_areas.append(rect)

    if image_areas:
        return image_areas

    # No area detected: try a different approach based on where the text is
    text_areas = [fitz.Rect(block["bbox"]) for block in blocks if block["type"] == 0]
    if not text_areas:
        return image_areas

    # Simple heuristic: split the page into 4 quadrants
    page_rect = page.rect
    mid_x = page_rect.width / 2
    mid_y = page_rect.height / 2
    quadrants = [
        fitz.Rect(0, 0, mid_x, mid_y),
        fitz.Rect(mid_x, 0, page_rect.width, mid_y),
        fitz.Rect(0, mid_y, mid_x, page_rect.height),
        fitz.Rect(mid_x, mid_y, page_rect.width, page_rect.height)
    ]

    # Keep each quadrant that contains no text and is large enough
    for quad in quadrants:
        contains_text = any(quad.intersects(text_area) for text_area in text_areas)
        if not contains_text and quad.width > 100 and quad.height > 100:
            image_areas.append(quad)

    return image_areas


def extract_latex_from_text(text):
    """Extract likely equations from text using basic pattern matching.

    Each line is stripped; lines longer than 150 characters are skipped as
    unlikely equations. A line matching any equation heuristic has its '$'
    delimiters removed and is passed through format_equation_for_latex.

    Args:
        text: Plain text, typically of one page.

    Returns:
        List of formatted equation strings, in order of appearance.
    """
    latex_chunks = []

    # Search line by line
    for line in text.splitlines():
        line = line.strip()

        # Skip lines that are too long (probably not equations)
        if len(line) > 150:
            continue

        # Check if the line contains a potential equation
        if any(pattern.search(line) for pattern in _EQUATION_PATTERNS):
            # Clean the equation, then improve its LaTeX formatting
            eq = line.replace('$$', '').replace('$', '')
            latex_chunks.append(format_equation_for_latex(eq))

    return latex_chunks


def _get_pix2text():
    """Return the shared Pix2Text instance, creating it on first use."""
    # NOTE(review): assumes one Pix2Text instance can be reused for several
    # images; building it for every page repeats the model setup each time.
    global _pix2text_instance
    if _pix2text_instance is None:
        # Initialize Pix2Text with LaTeX OCR capabilities
        _pix2text_instance = Pix2Text(math_engine='mfd')
    return _pix2text_instance


def extract_latex_with_pix2text(page_image_path):
    """Extract LaTeX equations from an image using Pix2Text.

    Args:
        page_image_path: Path of the image to analyse.

    Returns:
        List of equation strings. Empty when Pix2Text is not installed or
        when recognition fails (the error is logged, not raised).
    """
    if not PIX2TEXT_AVAILABLE:
        logger.warning("Pix2Text is not available. Install it with: pip install pix2text")
        return []

    try:
        # Process the image
        result = _get_pix2text().recognize(page_image_path)

        # Extract math blocks.
        # NOTE(review): presumes the result is an iterable of dicts with
        # 'type' and 'text' keys - confirm against the installed version.
        equations = [
            item.get('text')
            for item in result
            if item.get('type') == 'math' and item.get('text')
        ]

        logger.info(f"Extracted {len(equations)} equations using Pix2Text")
        return equations
    except Exception as e:
        logger.error(f"Error extracting equations with Pix2Text: {str(e)}")
        return []


def format_equation_for_latex(eq_text):
    r"""Improve the LaTeX formatting of an equation given as plain text.

    Steps, in order: brace subscripts and superscripts, drop a trailing
    equation number such as "(12)", turn simple "a/b" fractions into
    \frac{a}{b}, put spaces around operators (except '-') and collapse runs
    of spaces.

    Args:
        eq_text: Equation text without '$' delimiters.

    Returns:
        The reformatted equation, stripped of surrounding whitespace.
    """
    # 1. Fix subscripts
    # NOTE(review): the parentheses are emitted literally (x_1 becomes
    # x_{(1)}); kept as in the original - confirm this is intended.
    eq_text = re.sub(r'([a-zA-Z])_([a-zA-Z0-9]+)', r'\1_{(\2)}', eq_text)

    # 2. Fix superscripts
    eq_text = re.sub(r'([a-zA-Z0-9])(\^)([a-zA-Z0-9]+)', r'\1\2{(\3)}', eq_text)

    # 3. Remove equation numbers
    eq_text = re.sub(r'\((\d+)\)$', r'', eq_text).strip()

    # 4. Convert simple fractions to \frac. Substituting on the match itself
    # also handles "a / b" with spaces around the slash, and every fraction
    # of the line rather than only the first one.
    eq_text = _SIMPLE_FRACTION_RE.sub(
        lambda m: "\\frac{" + m.group(1) + "}{" + m.group(2) + "}",
        eq_text,
    )

    # 5. Add spaces around operators
    for op in _SPACED_OPERATORS:
        eq_text = eq_text.replace(op, f" {op} ")

    # Collapse runs of spaces introduced above into a single space
    eq_text = re.sub(r' {2,}', ' ', eq_text)

    return eq_text.strip()


def _extract_page_equations(page, text):
    """Return the equations of one page.

    Uses Pix2Text on a temporary render of the page when it is available,
    otherwise falls back to pattern matching on the page text.
    """
    if not PIX2TEXT_AVAILABLE:
        # Fallback to basic extraction
        return extract_latex_from_text(text)

    # Reserve a temp file name and close the handle before PyMuPDF writes to
    # it: writing to a still-open NamedTemporaryFile fails on Windows.
    tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
    tmp.close()
    temp_path = tmp.name
    try:
        # Render the page to an image for Pix2Text processing
        page.get_pixmap(matrix=fitz.Matrix(_RENDER_ZOOM, _RENDER_ZOOM)).save(temp_path)
        return extract_latex_with_pix2text(temp_path)
    finally:
        # Clean up the temp file even when rendering failed
        try:
            os.unlink(temp_path)
        except OSError as e:
            logger.warning(f"Could not remove temporary file {temp_path}: {str(e)}")


def process_pdf_to_markdown(pdf_path, output_md_path, output_dir="output", lang="english"):
    """Process a PDF and generate a Markdown file with text, images and equations.

    Args:
        pdf_path: Path of the PDF to convert.
        output_md_path: Path of the Markdown file to write.
        output_dir: Directory for extracted images; created when missing.
        lang: Language passed to get_image_description.

    Returns:
        *output_md_path*, once the file has been written.
    """
    logger.info(f"Processing PDF: {pdf_path}")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

    # Initialize Markdown content
    md_content = [f"# {pdf_name}\n"]

    # Open the PDF with PyMuPDF; the context manager closes the document
    # even if a page fails.
    with fitz.open(pdf_path) as doc:
        page_count = len(doc)
        logger.info(f"PDF opened successfully. Document has {page_count} pages")

        # Process each page for text, images, and equations
        for page_num, page in enumerate(doc, 1):
            logger.info(f"Processing page {page_num}/{page_count}")

            # Extract text
            text = page.get_text("text")
            logger.info(f"Extracted {len(text)} characters of text from page {page_num}")

            # Add text to Markdown
            md_content.append(f"## Page {page_num}\n")
            md_content.append(f"{text.strip()}\n")

            # Extract images using multiple methods to ensure they're not black
            image_paths = extract_images_from_page(page, output_dir, pdf_name, page_num)
            logger.info(f"Extracted {len(image_paths)} images from page {page_num}")

            # Process each extracted image
            for img_index, image_path in enumerate(image_paths, 1):
                try:
                    # Generate image description
                    logger.info(f"Generating description for image {img_index} on page {page_num}")
                    description = get_image_description(image_path, language=lang)
                    logger.info(f"Description generated: {description[:50]}..." if len(description) > 50 else f"Description generated: {description}")

                    # Add image and description to Markdown
                    md_content.append(f"\n![Image {page_num}-{img_index}]({image_path})\n")
                    md_content.append(f"**Description:** {description}\n")
                except Exception as e:
                    logger.error(f"Error processing image {img_index} on page {page_num}: {str(e)}")

            # Extract and enhance equations - use Pix2Text if available
            logger.info(f"Extracting equations from page {page_num}")
            latex_equations = _extract_page_equations(page, text)
            logger.info(f"Found {len(latex_equations)} potential equations on page {page_num}")

            for i, eq in enumerate(latex_equations, 1):
                try:
                    logger.info(f"Equation {i}: {eq[:30]}..." if len(eq) > 30 else f"Equation {i}: {eq}")

                    # Add equation to Markdown
                    md_content.append(f"\n$$\n{eq}\n$$\n")
                except Exception as e:
                    logger.error(f"Error formatting equation {i} on page {page_num}: {str(e)}")

    # Write content to Markdown file
    with open(output_md_path, "w", encoding="utf-8") as md_file:
        md_file.write("\n".join(md_content))

    logger.info(f"Markdown file generated: {output_md_path}")
    print(f"Conversion complete. Markdown file generated: {output_md_path}")

    return output_md_path


def main():
    """Parse the command line and run the conversion."""
    import argparse

    parser = argparse.ArgumentParser(description="Convert PDF to Markdown with text, images, and LaTeX equations")
    parser.add_argument("pdf_path", help="Path to the PDF file")
    parser.add_argument("--output_md", default="output.md", help="Path to output Markdown file")
    parser.add_argument("--output_dir", default="output", help="Directory for extracted images")
    parser.add_argument("--language", default="english", choices=["english", "french"],
                        help="Language for image descriptions (english or french)")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging (DEBUG level)")

    args = parser.parse_args()

    # Set log level
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
        logger.info("Verbose logging enabled")

    # Process PDF
    process_pdf_to_markdown(args.pdf_path, args.output_md, args.output_dir, args.language)


if __name__ == "__main__":
    main()