# rag/pdf_to_markdown.py

import os
import pymupdf  # PyMuPDF
import re
import logging
import fitz  # PyMuPDF
import tempfile
from PIL import Image
import requests
import base64
import io
from pathlib import Path
import importlib.util

# Configure logging for the whole process at import time (INFO level,
# timestamped records).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Configuration for the Ollama API (or another AI model backend)
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "PetrosStav/gemma3-tools:4b"  # or another multimodal model

# Check if pix2text is available for LaTeX extraction.
# PIX2TEXT_AVAILABLE records the outcome so later code can pick a fallback.
try:
    from pix2text import Pix2Text
    PIX2TEXT_AVAILABLE = True
    logger.info("Pix2Text is available - will use it for LaTeX equations")
except ImportError:
    PIX2TEXT_AVAILABLE = False
    logger.warning("Pix2Text not found - LaTeX equations will be extracted using basic methods")

def image_to_base64(image_path):
    """Return the contents of the file at *image_path* as a base64 text string.

    The file is read in binary mode and the base64 bytes are decoded as
    UTF-8, which is the form sent to the Ollama API in the ``images`` field.
    """
    with open(image_path, "rb") as stream:
        raw_bytes = stream.read()

    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

def get_image_description(image_path, language="english", timeout=300):
    """Ask the Ollama API to describe an image in the requested language.

    Args:
        image_path: Path of the image file to describe.
        language: ``"french"`` (case-insensitive) selects the French prompt;
            any other value selects the English prompt.
        timeout: Seconds passed to ``requests.post`` as its timeout. Without
            it, a stalled or unreachable server would block the conversion
            indefinitely.

    Returns:
        The stripped ``"response"`` field of the API reply on HTTP 200.
        On a non-200 status, a fixed error string. On any exception
        (unreadable file, connection error, timeout, malformed reply), a
        string starting with ``"Description not available:"``. This function
        never raises; failures are logged.
    """
    try:
        base64_image = image_to_base64(image_path)

        # Adjust prompt based on language
        if language.lower() == "french":
            prompt = "Décris cette image en détail. S'il s'agit d'un graphique, d'un diagramme ou d'une figure, explique ce qu'elle représente avec précision."
        else:
            prompt = "Describe this image in detail. If this is a chart, diagram, or figure, explain what it represents precisely."

        response = requests.post(
            OLLAMA_API_URL,
            json={
                "model": OLLAMA_MODEL,
                "prompt": prompt,
                "images": [base64_image],
                "stream": False
            },
            # Bound the wait so one stuck request cannot hang the whole run.
            timeout=timeout,
        )

        if response.status_code == 200:
            return response.json()["response"].strip()

        logger.error(f"Ollama API error: {response.status_code} - {response.text}")
        return "Error generating description from image."
    except Exception as e:
        # Deliberate best-effort boundary: a failed description must not
        # abort the conversion of the rest of the document.
        logger.error(f"Error in image description generation: {str(e)}")
        return f"Description not available: {str(e)}"

def extract_images_from_page(page, output_dir, pdf_name, page_num):
    """Extract candidate images from one PDF page and return their file paths.

    Three strategies are applied in order, and their results are appended to
    the returned list in that same order:

    1. Embedded images are extracted directly from the document and written
       to ``{pdf_name}-page{page_num}-embed{i}.{ext}``. Direct extraction can
       yield black images, so mostly-black ones are not returned (the file
       is left on disk).
    2. The whole page is rendered at 2x zoom to
       ``{pdf_name}-page{page_num}-full.png`` and always returned.
    3. Each rectangle reported by ``detect_image_areas`` is rendered at 2x
       zoom to ``{pdf_name}-page{page_num}-clip{i}.png``. A clip is kept only
       when it is larger than 100x100 pixels and not mostly black; otherwise
       its file is deleted.

    Args:
        page: PyMuPDF page object to process.
        output_dir: Existing directory in which image files are written.
        pdf_name: Prefix used for the generated file names.
        page_num: Page number used in file names and log messages.

    Returns:
        List of paths of the images that were kept. Errors are logged and
        never raised; a failing step simply contributes no paths.
    """
    images_paths = []

    # Method 1: direct extraction (may produce black images)
    try:
        embedded_images = page.get_images(full=True)
        logger.info(f"Found {len(embedded_images)} embedded images on page {page_num}")

        for img_index, img in enumerate(embedded_images, 1):
            try:
                xref = img[0]
                base_image = page.parent.extract_image(xref)
                if base_image:
                    image_bytes = base_image["image"]
                    ext = base_image["ext"]

                    image_filename = f"{pdf_name}-page{page_num}-embed{img_index}.{ext}"
                    image_path = os.path.join(output_dir, image_filename)

                    with open(image_path, "wb") as img_file:
                        img_file.write(image_bytes)
                    logger.info(f"Embedded image saved: {image_path}")

                    # Open inside a context manager so the file handle is
                    # released as soon as the check is done.
                    with Image.open(image_path) as pil_img:
                        mostly_black = is_image_mostly_black(pil_img)

                    if mostly_black:
                        logger.warning(f"Image {image_path} appears to be mostly black, will be ignored")
                    else:
                        images_paths.append(image_path)
            except Exception as e:
                logger.error(f"Error extracting embedded image {img_index} on page {page_num}: {str(e)}")
    except Exception as e:
        logger.error(f"Error extracting embedded images from page {page_num}: {str(e)}")

    # Method 2: render the whole page (works even when embedded images are black)
    try:
        zoom = 2  # zoom factor for a higher-resolution render
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)

        page_image_filename = f"{pdf_name}-page{page_num}-full.png"
        page_image_path = os.path.join(output_dir, page_image_filename)
        pix.save(page_image_path)
        logger.info(f"Full page image saved: {page_image_path}")

        images_paths.append(page_image_path)

        # Method 3: render the rectangular regions that the heuristic in
        # detect_image_areas flags as possible images, charts or diagrams.
        rect_areas = detect_image_areas(page)

        for i, rect in enumerate(rect_areas, 1):
            try:
                clip_pix = page.get_pixmap(matrix=mat, clip=rect)

                clip_filename = f"{pdf_name}-page{page_num}-clip{i}.png"
                clip_path = os.path.join(output_dir, clip_filename)
                clip_pix.save(clip_path)

                # Decide while the file is open, act after it is closed:
                # deleting a file that is still open fails on Windows.
                with Image.open(clip_path) as pil_img:
                    keep_clip = (
                        pil_img.width > 100
                        and pil_img.height > 100
                        and not is_image_mostly_black(pil_img)
                    )

                if keep_clip:
                    logger.info(f"Detected image area saved: {clip_path}")
                    images_paths.append(clip_path)
                else:
                    # Discard regions that are too small or mostly black
                    os.remove(clip_path)
                    logger.info(f"Image area too small or black, ignored: {clip_path}")
            except Exception as e:
                logger.error(f"Error extracting image area {i} on page {page_num}: {str(e)}")
    except Exception as e:
        logger.error(f"Error rendering page {page_num}: {str(e)}")

    return images_paths

def is_image_mostly_black(image, threshold=0.95):
    """Return True when more than *threshold* of the pixels are near-black.

    Args:
        image: PIL image; converted to 8-bit grayscale (mode ``'L'``) when it
            is in any other mode.
        threshold: Fraction of near-black pixels above which the image counts
            as mostly black.

    Returns:
        ``True`` if the share of pixels with a gray level below 20 is
        strictly greater than *threshold*. An image with no pixels returns
        ``False`` instead of dividing by zero.
    """
    if image.mode != 'L':
        image = image.convert('L')

    # The histogram gives one count per gray level, so the tally is done by
    # the imaging library rather than a Python loop over every pixel.
    histogram = image.histogram()
    total_pixels = sum(histogram)
    if total_pixels == 0:
        return False

    # Gray levels 0..19 are the ones the check treats as black.
    black_pixels = sum(histogram[:20])

    return black_pixels / total_pixels > threshold

def detect_image_areas(page):
    """Return rectangles on *page* that may contain images or figures.

    Simple heuristic in two stages:

    * Every non-text block (block type other than 0) wider and taller than
      50 units is reported.
    * If that finds nothing and the page has text blocks, the page is split
      into four quadrants; each quadrant that touches no text block and is
      larger than 100x100 units is reported.
    """
    blocks = page.get_text("dict")["blocks"]

    candidates = []
    for blk in blocks:
        if blk["type"] == 0:
            continue  # text block, not an image candidate
        box = fitz.Rect(blk["bbox"])
        if box.width > 50 and box.height > 50:
            candidates.append(box)

    if candidates:
        return candidates

    # Fallback: look for page quadrants that contain no text at all.
    bounds = page.rect
    text_boxes = [fitz.Rect(blk["bbox"]) for blk in blocks if blk["type"] == 0]
    if not text_boxes:
        return candidates

    full_w = bounds.width
    full_h = bounds.height
    half_w = full_w / 2
    half_h = full_h / 2

    quadrants = (
        fitz.Rect(0, 0, half_w, half_h),            # top-left
        fitz.Rect(half_w, 0, full_w, half_h),       # top-right
        fitz.Rect(0, half_h, half_w, full_h),       # bottom-left
        fitz.Rect(half_w, half_h, full_w, full_h),  # bottom-right
    )

    for quadrant in quadrants:
        overlaps_text = any(quadrant.intersects(box) for box in text_boxes)
        if overlaps_text:
            continue
        if quadrant.width > 100 and quadrant.height > 100:
            candidates.append(quadrant)

    return candidates

def extract_latex_from_text(text):
    """Collect lines of *text* that look like equations, formatted for LaTeX.

    Each line is stripped; lines longer than 150 characters are skipped as
    unlikely to be equations. A line is treated as an equation when any of
    the marker patterns below matches it. Matching lines have their ``$``
    delimiters removed and are passed through ``format_equation_for_latex``.

    Returns the formatted equations in their order of appearance.
    """
    equation_markers = (
        r'[=<>+\-*/±≈≤≥]',                        # operator-like characters
        r'[a-zA-Z][_^]',                          # sub/superscript notation
        r'\\frac|\\int|\\sum|\\prod|\\sqrt',      # fractions, integrals, ...
        r'\\alpha|\\beta|\\gamma|\\delta|\\epsilon|\\theta|\\lambda|\\mu|\\pi',  # Greek letters
        r'\$\$.*?\$\$|\$.*?\$',                   # already-delimited LaTeX
    )

    found = []
    for raw_line in text.splitlines():
        candidate = raw_line.strip()

        if len(candidate) > 150:
            continue
        if not any(re.search(marker, candidate) for marker in equation_markers):
            continue

        without_delimiters = candidate.replace('$$', '').replace('$', '')
        found.append(format_equation_for_latex(without_delimiters))

    return found

# Pix2Text engine shared by all calls; created on first use so its models
# are loaded once instead of once per page.
_PIX2TEXT_ENGINE = None


def extract_latex_with_pix2text(page_image_path):
    """Extract LaTeX equations from a page image using Pix2Text.

    Args:
        page_image_path: Path of the rendered page image to analyse.

    Returns:
        List of the ``'text'`` values of every recognised item whose
        ``'type'`` is ``'math'`` and whose text is non-empty. Returns an
        empty list when Pix2Text is not installed or when recognition fails;
        errors are logged, never raised.
    """
    global _PIX2TEXT_ENGINE

    if not PIX2TEXT_AVAILABLE:
        logger.warning("Pix2Text is not available. Install it with: pip install pix2text")
        return []

    try:
        # Build the engine only once. If construction raises, the cache stays
        # empty and the next call tries again.
        if _PIX2TEXT_ENGINE is None:
            _PIX2TEXT_ENGINE = Pix2Text(math_engine='mfd')

        result = _PIX2TEXT_ENGINE.recognize(page_image_path)

        # Keep only the math blocks that carry text
        equations = [
            item.get('text')
            for item in result
            if item.get('type') == 'math' and item.get('text')
        ]

        logger.info(f"Extracted {len(equations)} equations using Pix2Text")
        return equations
    except Exception as e:
        logger.error(f"Error extracting equations with Pix2Text: {str(e)}")
        return []

def format_equation_for_latex(eq_text):
    """Normalise a plain-text equation into cleaner LaTeX.

    Steps, applied in order:

    1. Brace subscripts: ``x_i`` becomes ``x_{i}``.
    2. Brace superscripts: ``x^2`` becomes ``x^{2}``.
    3. Drop a trailing equation number such as ``(12)``.
    4. Convert every simple ``a/b`` fraction (spaces around the slash are
       allowed) to ``\\frac{a}{b}``.
    5. Put single spaces around ``+ = < >`` and ``\\approx``, ``\\sim``,
       ``\\equiv``. The minus sign is left alone so negative numbers are not
       split.

    Args:
        eq_text: Equation text without ``$`` delimiters.

    Returns:
        The reformatted equation, stripped of surrounding whitespace.
    """
    # 1. Subscripts: wrap in braces only. Adding parentheses as well would
    #    change the meaning of the expression.
    eq_text = re.sub(r'([a-zA-Z])_([a-zA-Z0-9]+)', r'\1_{\2}', eq_text)

    # 2. Superscripts: same treatment.
    eq_text = re.sub(r'([a-zA-Z0-9])\^([a-zA-Z0-9]+)', r'\1^{\2}', eq_text)

    # 3. Remove a trailing equation number
    eq_text = re.sub(r'\((\d+)\)$', r'', eq_text).strip()

    # 4. Fractions: substitute at the matched positions. A literal
    #    str.replace would miss "a / b" and would also rewrite the inside of
    #    longer tokens (the "1/2" inside "11/22").
    eq_text = re.sub(
        r'([a-zA-Z0-9]+)\s*/\s*([a-zA-Z0-9]+)',
        lambda match: f"\\frac{{{match.group(1)}}}{{{match.group(2)}}}",
        eq_text,
    )

    # 5. Spaces around operators ('-' deliberately excluded)
    for op in ('+', '=', '<', '>', '\\approx', '\\sim', '\\equiv'):
        eq_text = eq_text.replace(op, f" {op} ")

    # Collapse the runs of spaces that step 5 can introduce
    eq_text = re.sub(r' {2,}', ' ', eq_text)

    return eq_text.strip()

def process_pdf_to_markdown(pdf_path, output_md_path, output_dir="output", lang="english"):
    """Convert a PDF into a Markdown file with text, images and equations.

    For every page the function appends, in order: a ``## Page N`` heading,
    the page's plain text, one image link plus generated description per
    extracted image, and one ``$$ ... $$`` block per detected equation.
    Equations come from Pix2Text when it is installed, otherwise from
    pattern matching on the page text.

    Args:
        pdf_path: Path of the PDF to convert.
        output_md_path: Path of the Markdown file to write (UTF-8).
        output_dir: Directory for extracted images; created if missing.
        lang: Language passed to ``get_image_description``.

    Returns:
        ``output_md_path``.
    """
    logger.info(f"Processing PDF: {pdf_path}")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    md_content = [f"# {pdf_name}\n"]

    # The context manager closes the document even if a page fails.
    with fitz.open(pdf_path) as doc:
        page_count = len(doc)
        logger.info(f"PDF opened successfully. Document has {page_count} pages")

        for page_num, page in enumerate(doc, 1):
            logger.info(f"Processing page {page_num}/{page_count}")

            # Extract text
            text = page.get_text("text")
            logger.info(f"Extracted {len(text)} characters of text from page {page_num}")

            md_content.append(f"## Page {page_num}\n")
            md_content.append(f"{text.strip()}\n")

            # Extract images using multiple methods to ensure they're not black
            image_paths = extract_images_from_page(page, output_dir, pdf_name, page_num)
            logger.info(f"Extracted {len(image_paths)} images from page {page_num}")

            for img_index, image_path in enumerate(image_paths, 1):
                try:
                    logger.info(f"Generating description for image {img_index} on page {page_num}")
                    description = get_image_description(image_path, language=lang)
                    logger.info(f"Description generated: {description[:50]}..." if len(description) > 50 else f"Description generated: {description}")

                    md_content.append(f"\n![Image {page_num}-{img_index}]({image_path})\n")
                    md_content.append(f"**Description:** {description}\n")
                except Exception as e:
                    logger.error(f"Error processing image {img_index} on page {page_num}: {str(e)}")

            # Extract and enhance equations - use Pix2Text if available
            logger.info(f"Extracting equations from page {page_num}")
            latex_equations = []

            if PIX2TEXT_AVAILABLE:
                # Render the page to a temporary PNG for Pix2Text
                temp_pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                # Only reserve the name here; the image is written after the
                # handle is closed so the file is never open twice.
                with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                    temp_path = tmp.name

                try:
                    temp_pix.save(temp_path)
                    latex_equations = extract_latex_with_pix2text(temp_path)
                finally:
                    # Remove the temp file even when saving or OCR raises
                    os.unlink(temp_path)
            else:
                # Fallback to basic extraction
                latex_equations = extract_latex_from_text(text)

            logger.info(f"Found {len(latex_equations)} potential equations on page {page_num}")

            for i, eq in enumerate(latex_equations, 1):
                try:
                    logger.info(f"Equation {i}: {eq[:30]}..." if len(eq) > 30 else f"Equation {i}: {eq}")
                    md_content.append(f"\n$$\n{eq}\n$$\n")
                except Exception as e:
                    logger.error(f"Error formatting equation {i} on page {page_num}: {str(e)}")

    # Write content to Markdown file
    with open(output_md_path, "w", encoding="utf-8") as md_file:
        md_file.write("\n".join(md_content))

    logger.info(f"Markdown file generated: {output_md_path}")
    print(f"Conversion complete. Markdown file generated: {output_md_path}")

    return output_md_path

if __name__ == "__main__":
    import argparse

    # Command-line interface: one positional PDF path plus optional outputs.
    cli = argparse.ArgumentParser(
        description="Convert PDF to Markdown with text, images, and LaTeX equations"
    )
    cli.add_argument("pdf_path", help="Path to the PDF file")
    cli.add_argument(
        "--output_md",
        default="output.md",
        help="Path to output Markdown file",
    )
    cli.add_argument(
        "--output_dir",
        default="output",
        help="Directory for extracted images",
    )
    cli.add_argument(
        "--language",
        default="english",
        choices=["english", "french"],
        help="Language for image descriptions (english or french)",
    )
    cli.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose logging (DEBUG level)",
    )

    options = cli.parse_args()

    # Raise the root logger to DEBUG when verbose output is requested
    if options.verbose:
        root_logger = logging.getLogger()
        root_logger.setLevel(logging.DEBUG)
        logger.info("Verbose logging enabled")

    # Run the conversion
    process_pdf_to_markdown(
        options.pdf_path,
        options.output_md,
        options.output_dir,
        options.language,
    )