Compare commits
4 Commits
Comparing 2275e8b5be...main

| Author | SHA1 | Date |
|---|---|---|
| | 74c720c7ba | |
| | e44c929ce7 | |
| | b7e2ded889 | |
| | 8292dc15b3 | |

pdf_to_latex.py (new file, 118 lines)
@@ -0,0 +1,118 @@
import os
import fitz  # PyMuPDF
import logging
from PIL import Image
import io
import tempfile
from pix2text import Pix2Text
import re


# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)


class LatexExtractor:
    def __init__(self):
        # Initialize Pix2Text with LaTeX OCR engine
        self.p2t = Pix2Text(math_engine='mfd', math_dpi=150)
        logger.info("Initialized Pix2Text with LaTeX OCR engine")

    def detect_equations_from_image(self, image_path):
        """Detect and extract LaTeX equations from an image"""
        logger.info(f"Processing image: {image_path}")

        try:
            # Process image with Pix2Text
            result = self.p2t.recognize(image_path)

            # Extract math blocks (LaTeX equations)
            math_blocks = []
            for item in result:
                if item.get('type') == 'math' and item.get('text'):
                    math_blocks.append(item.get('text'))

            logger.info(f"Extracted {len(math_blocks)} LaTeX equations from image")
            return math_blocks

        except Exception as e:
            logger.error(f"Error extracting LaTeX from image: {str(e)}")
            return []

    def extract_equations_from_pdf(self, pdf_path, output_dir=None):
        """Extract LaTeX equations from each page of a PDF"""
        logger.info(f"Processing PDF: {pdf_path}")

        if output_dir is None:
            output_dir = os.path.join(os.path.dirname(pdf_path), "equations")

        os.makedirs(output_dir, exist_ok=True)

        # Open the PDF
        doc = fitz.open(pdf_path)
        logger.info(f"PDF opened successfully. Document has {len(doc)} pages")

        all_equations = []

        # Process each page
        for page_num, page in enumerate(doc, 1):
            logger.info(f"Processing page {page_num}/{len(doc)}")

            # Render page to image
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # Higher resolution for better OCR

            # Save the page image to a temporary file
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                pix.save(tmp.name)
                tmp_path = tmp.name

            # Process the page image to extract equations
            page_equations = self.detect_equations_from_image(tmp_path)

            # Add page number information to each equation
            for i, eq in enumerate(page_equations):
                all_equations.append({
                    "page": page_num,
                    "index": i + 1,
                    "latex": eq
                })

            # Clean up temporary file
            os.unlink(tmp_path)

        # Save all equations to a Markdown file
        md_path = os.path.join(output_dir, f"{os.path.splitext(os.path.basename(pdf_path))[0]}_equations.md")
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(f"# Equations from {os.path.basename(pdf_path)}\n\n")

            for eq in all_equations:
                f.write(f"## Page {eq['page']} - Equation {eq['index']}\n\n")
                f.write(f"$$\n{eq['latex']}\n$$\n\n")

        logger.info(f"Extracted {len(all_equations)} equations. Saved to {md_path}")
        return all_equations


def main():
    import argparse

    parser = argparse.ArgumentParser(description="Extract LaTeX equations from PDF documents")
    parser.add_argument("pdf_path", help="Path to the PDF file")
    parser.add_argument("--output_dir", help="Directory to save extracted equations", default=None)
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging (DEBUG level)")

    args = parser.parse_args()

    # Set log level
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
        logger.info("Verbose logging enabled")

    # Extract equations
    extractor = LatexExtractor()
    extractor.extract_equations_from_pdf(args.pdf_path, args.output_dir)


if __name__ == "__main__":
    main()
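
For reviewers who want to try the new script, a minimal usage sketch follows (hypothetical, not part of the commit; "paper.pdf" is a placeholder). The equivalent command line, per the argparse block above, would be python pdf_to_latex.py paper.pdf -v.

# Hypothetical usage sketch, not part of the committed file; the PDF path is a placeholder.
from pdf_to_latex import LatexExtractor

extractor = LatexExtractor()
# Also writes <name>_equations.md into an "equations/" folder next to the PDF
equations = extractor.extract_equations_from_pdf("paper.pdf")
for eq in equations:
    print(f"page {eq['page']}, equation {eq['index']}: {eq['latex']}")
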
pdf_to_markdown.py (new file, 426 lines)
@@ -0,0 +1,426 @@
import os
import pymupdf  # PyMuPDF
import re
import logging
import fitz  # PyMuPDF
import tempfile
from PIL import Image
import requests
import base64
import io
from pathlib import Path
import importlib.util


# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)


# Configuration for the Ollama API (or another AI model)
OLLAMA_API_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "PetrosStav/gemma3-tools:4b"  # or another multimodal model


# Check if pix2text is available for LaTeX extraction
try:
    from pix2text import Pix2Text
    PIX2TEXT_AVAILABLE = True
    logger.info("Pix2Text is available - will use it for LaTeX equations")
except ImportError:
    PIX2TEXT_AVAILABLE = False
    logger.warning("Pix2Text not found - LaTeX equations will be extracted using basic methods")
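
Before running the converter it can help to confirm that the Ollama server the constants above point to is reachable. A quick, hypothetical pre-flight check (not part of the committed file), using the same endpoint and payload shape that get_image_description below relies on:

# Hypothetical pre-flight check, not part of the committed file.
import requests

resp = requests.post(
    "http://localhost:11434/api/generate",
    json={"model": "PetrosStav/gemma3-tools:4b", "prompt": "ping", "stream": False},
    timeout=120,
)
print(resp.status_code, resp.json().get("response", "")[:80])
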
def image_to_base64(image_path):
    """Convert an image to base64 for the Ollama API"""
    with open(image_path, "rb") as img_file:
        return base64.b64encode(img_file.read()).decode("utf-8")


def get_image_description(image_path, language="english"):
    """Use the Ollama API to describe an image in the requested language"""
    try:
        base64_image = image_to_base64(image_path)

        # Adjust prompt based on language
        if language.lower() == "french":
            prompt = "Décris cette image en détail. S'il s'agit d'un graphique, d'un diagramme ou d'une figure, explique ce qu'elle représente avec précision."
        else:
            prompt = "Describe this image in detail. If this is a chart, diagram, or figure, explain what it represents precisely."

        response = requests.post(
            OLLAMA_API_URL,
            json={
                "model": OLLAMA_MODEL,
                "prompt": prompt,
                "images": [base64_image],
                "stream": False
            }
        )

        if response.status_code == 200:
            return response.json()["response"].strip()
        else:
            logger.error(f"Ollama API error: {response.status_code} - {response.text}")
            return "Error generating description from image."
    except Exception as e:
        logger.error(f"Error in image description generation: {str(e)}")
        return f"Description not available: {str(e)}"


def extract_images_from_page(page, output_dir, pdf_name, page_num):
    """Extract images from a PDF page by rendering the page
    instead of directly extracting the embedded images (which can come out black)"""
    images_paths = []

    # Method 1: direct extraction (can produce black images)
    try:
        embedded_images = page.get_images(full=True)
        logger.info(f"Found {len(embedded_images)} embedded images on page {page_num}")

        for img_index, img in enumerate(embedded_images, 1):
            try:
                xref = img[0]
                base_image = page.parent.extract_image(xref)
                if base_image:
                    image_bytes = base_image["image"]
                    ext = base_image["ext"]

                    # Image path
                    image_filename = f"{pdf_name}-page{page_num}-embed{img_index}.{ext}"
                    image_path = os.path.join(output_dir, image_filename)

                    # Save the image
                    with open(image_path, "wb") as img_file:
                        img_file.write(image_bytes)
                    logger.info(f"Embedded image saved: {image_path}")

                    # Check that the image is not black
                    pil_img = Image.open(image_path)
                    if is_image_mostly_black(pil_img):
                        logger.warning(f"Image {image_path} appears to be mostly black, will be ignored")
                    else:
                        images_paths.append(image_path)
            except Exception as e:
                logger.error(f"Error extracting embedded image {img_index} on page {page_num}: {str(e)}")
    except Exception as e:
        logger.error(f"Error extracting embedded images from page {page_num}: {str(e)}")

    # Method 2: render the full page (better quality, works even when embedded images are black)
    try:
        # Render the full page at high resolution
        zoom = 2  # Zoom factor for better resolution
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)

        # Save the full-page image
        page_image_filename = f"{pdf_name}-page{page_num}-full.png"
        page_image_path = os.path.join(output_dir, page_image_filename)
        pix.save(page_image_path)
        logger.info(f"Full page image saved: {page_image_path}")

        # Add the full-page image path
        images_paths.append(page_image_path)

        # Method 3: extract image areas from the page
        # This method uses a heuristic to detect rectangular areas
        # that may contain images, charts, or diagrams
        rect_areas = detect_image_areas(page)

        for i, rect in enumerate(rect_areas, 1):
            try:
                # Clip a region of the page
                clip_pix = page.get_pixmap(matrix=mat, clip=rect)

                # Save the clipped image
                clip_filename = f"{pdf_name}-page{page_num}-clip{i}.png"
                clip_path = os.path.join(output_dir, clip_filename)
                clip_pix.save(clip_path)

                # Check that the image is not black and is large enough
                pil_img = Image.open(clip_path)
                if pil_img.width > 100 and pil_img.height > 100 and not is_image_mostly_black(pil_img):
                    logger.info(f"Detected image area saved: {clip_path}")
                    images_paths.append(clip_path)
                else:
                    # Remove areas that are too small or black
                    os.remove(clip_path)
                    logger.info(f"Image area too small or black, ignored: {clip_path}")
            except Exception as e:
                logger.error(f"Error extracting image area {i} on page {page_num}: {str(e)}")
    except Exception as e:
        logger.error(f"Error rendering page {page_num}: {str(e)}")

    return images_paths
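
A quick way to exercise extract_images_from_page on a single page, for review purposes only (hypothetical snippet, not part of the committed file; the PDF path is a placeholder). Note that the function writes into output_dir but does not create it, so the caller has to:

# Hypothetical snippet, not part of the committed file.
import os
import fitz
from pdf_to_markdown import extract_images_from_page

os.makedirs("output", exist_ok=True)   # the function expects the directory to exist
doc = fitz.open("report.pdf")          # placeholder path
paths = extract_images_from_page(doc[0], "output", "report", 1)
print(f"{len(paths)} image files written to output/")
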
def is_image_mostly_black(image, threshold=0.95):
    """Check whether an image is mostly black"""
    # Convert to grayscale
    if image.mode != 'L':
        image = image.convert('L')

    # Count black pixels
    pixels = image.getdata()
    black_pixels = sum(1 for pixel in pixels if pixel < 20)
    total_pixels = len(pixels)

    # Check the ratio of black pixels
    return black_pixels / total_pixels > threshold


def detect_image_areas(page):
    """Detect potential image areas on a page"""
    # This function is a simple heuristic to detect areas
    # that may contain images, charts, or diagrams

    # Get the page blocks
    blocks = page.get_text("dict")["blocks"]

    # Filter out blocks that are not text
    image_areas = []

    for block in blocks:
        # Image blocks generally have a type other than 0 (text)
        if block["type"] != 0:
            rect = fitz.Rect(block["bbox"])
            # Ignore areas that are too small
            if rect.width > 50 and rect.height > 50:
                image_areas.append(rect)

    # If no area is detected, try a different approach
    if not image_areas:
        # Split the page into sections and treat the sections
        # that contain no text as potential candidates
        page_rect = page.rect
        text_areas = []

        # Get the text areas
        for block in blocks:
            if block["type"] == 0:  # Text block
                text_areas.append(fitz.Rect(block["bbox"]))

        # If we have text areas, treat the rest as potential image areas
        if text_areas:
            # A simple heuristic: split the page into 4 quadrants
            mid_x = page_rect.width / 2
            mid_y = page_rect.height / 2

            quadrants = [
                fitz.Rect(0, 0, mid_x, mid_y),
                fitz.Rect(mid_x, 0, page_rect.width, mid_y),
                fitz.Rect(0, mid_y, mid_x, page_rect.height),
                fitz.Rect(mid_x, mid_y, page_rect.width, page_rect.height)
            ]

            # Check each quadrant
            for quad in quadrants:
                # Check whether the quadrant contains text
                contains_text = any(quad.intersects(text_area) for text_area in text_areas)

                if not contains_text and quad.width > 100 and quad.height > 100:
                    image_areas.append(quad)

    return image_areas


def extract_latex_from_text(text):
    """Extract and enhance mathematical equations from text using basic pattern matching"""
    # Find potential equations in the text
    equation_patterns = [
        # Expressions containing these characters are likely equations
        r'[=<>+\-*/±≈≤≥]',
        # Common mathematical notations
        r'[a-zA-Z][_^]',
        # Fractions, integrals, etc.
        r'\\frac|\\int|\\sum|\\prod|\\sqrt',
        # Greek letters
        r'\\alpha|\\beta|\\gamma|\\delta|\\epsilon|\\theta|\\lambda|\\mu|\\pi',
        # Already formatted LaTeX
        r'\$\$.*?\$\$|\$.*?\$'
    ]

    # Search line by line
    lines = text.splitlines()
    latex_chunks = []

    for line in lines:
        line = line.strip()
        # Skip lines that are too long (probably not equations)
        if len(line) > 150:
            continue

        # Check if the line contains a potential equation
        is_equation = False
        for pattern in equation_patterns:
            if re.search(pattern, line):
                is_equation = True
                break

        if is_equation:
            # Clean the equation
            eq = line.replace('$$', '').replace('$', '')

            # Improve LaTeX formatting
            eq = format_equation_for_latex(eq)
            latex_chunks.append(eq)

    return latex_chunks


def extract_latex_with_pix2text(page_image_path):
    """Extract LaTeX equations from an image using Pix2Text"""
    if not PIX2TEXT_AVAILABLE:
        logger.warning("Pix2Text is not available. Install it with: pip install pix2text")
        return []

    try:
        # Initialize Pix2Text with LaTeX OCR capabilities
        p2t = Pix2Text(math_engine='mfd')

        # Process the image
        result = p2t.recognize(page_image_path)

        # Extract math blocks
        equations = []
        for item in result:
            if item.get('type') == 'math' and item.get('text'):
                equations.append(item.get('text'))

        logger.info(f"Extracted {len(equations)} equations using Pix2Text")
        return equations
    except Exception as e:
        logger.error(f"Error extracting equations with Pix2Text: {str(e)}")
        return []


def format_equation_for_latex(eq_text):
    """Improves LaTeX formatting of equations"""
    # 1. Fix subscripts
    eq_text = re.sub(r'([a-zA-Z])_([a-zA-Z0-9]+)', r'\1_{(\2)}', eq_text)

    # 2. Fix superscripts
    eq_text = re.sub(r'([a-zA-Z0-9])(\^)([a-zA-Z0-9]+)', r'\1\2{(\3)}', eq_text)

    # 3. Remove equation numbers
    eq_text = re.sub(r'\((\d+)\)$', r'', eq_text).strip()

    # 4. Convert simple fractions to \frac
    fraction_match = re.search(r'([a-zA-Z0-9]+)\s*/\s*([a-zA-Z0-9]+)', eq_text)
    if fraction_match:
        numerator, denominator = fraction_match.groups()
        eq_text = eq_text.replace(f"{numerator}/{denominator}", f"\\frac{{{numerator}}}{{{denominator}}}")

    # 5. Add spaces around operators
    operators = ['+', '-', '=', '<', '>', '\\approx', '\\sim', '\\equiv']
    for op in operators:
        if op != '-':  # Avoid modifying negative signs
            eq_text = eq_text.replace(op, f" {op} ")

    # Remove double spaces
    while '  ' in eq_text:
        eq_text = eq_text.replace('  ', ' ')

    return eq_text.strip()
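
Purely to illustrate what the regex passes in format_equation_for_latex do to a typical raw line (editor's example, not in the committed file): subscripts get wrapped, a trailing equation number is stripped, and a plain a/b is promoted to \frac.

# Illustrative only, not part of the committed file.
from pdf_to_markdown import format_equation_for_latex

print(format_equation_for_latex("x_i = a/b (3)"))
# expected output: x_{(i)} = \frac{a}{b}
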
def process_pdf_to_markdown(pdf_path, output_md_path, output_dir="output", lang="english"):
    """Process a PDF and generate a Markdown file with text, images, and LaTeX equations"""
    logger.info(f"Processing PDF: {pdf_path}")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Open the PDF with PyMuPDF
    doc = fitz.open(pdf_path)
    logger.info(f"PDF opened successfully. Document has {len(doc)} pages")

    # Initialize Markdown content
    md_content = []
    md_content.append(f"# {os.path.splitext(os.path.basename(pdf_path))[0]}\n")

    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

    # Process each page for text, images, and equations
    for page_num, page in enumerate(doc, 1):
        logger.info(f"Processing page {page_num}/{len(doc)}")

        # Extract text
        text = page.get_text("text")
        logger.info(f"Extracted {len(text)} characters of text from page {page_num}")

        # Add text to Markdown
        md_content.append(f"## Page {page_num}\n")
        md_content.append(f"{text.strip()}\n")

        # Extract images using multiple methods to ensure they're not black
        image_paths = extract_images_from_page(page, output_dir, pdf_name, page_num)
        logger.info(f"Extracted {len(image_paths)} images from page {page_num}")

        # Process each extracted image
        for img_index, image_path in enumerate(image_paths, 1):
            try:
                # Generate image description
                logger.info(f"Generating description for image {img_index} on page {page_num}")
                description = get_image_description(image_path, language=lang)
                logger.info(f"Description generated: {description[:50]}..." if len(description) > 50 else f"Description generated: {description}")

                # Add image and description to Markdown
                md_content.append(f"\n\n")
                md_content.append(f"**Description:** {description}\n")
            except Exception as e:
                logger.error(f"Error processing image {img_index} on page {page_num}: {str(e)}")

        # Extract and enhance equations - use Pix2Text if available
        logger.info(f"Extracting equations from page {page_num}")
        latex_equations = []

        if PIX2TEXT_AVAILABLE:
            # Render the page to an image for Pix2Text processing
            temp_pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                temp_pix.save(tmp.name)
                temp_path = tmp.name

            # Extract equations using Pix2Text
            latex_equations = extract_latex_with_pix2text(temp_path)

            # Clean up temp file
            os.unlink(temp_path)
        else:
            # Fallback to basic extraction
            latex_equations = extract_latex_from_text(text)

        logger.info(f"Found {len(latex_equations)} potential equations on page {page_num}")

        for i, eq in enumerate(latex_equations, 1):
            try:
                logger.info(f"Equation {i}: {eq[:30]}..." if len(eq) > 30 else f"Equation {i}: {eq}")

                # Add equation to Markdown
                md_content.append(f"\n$$\n{eq}\n$$\n")
            except Exception as e:
                logger.error(f"Error formatting equation {i} on page {page_num}: {str(e)}")

    # Write content to Markdown file
    with open(output_md_path, "w", encoding="utf-8") as md_file:
        md_file.write("\n".join(md_content))

    logger.info(f"Markdown file generated: {output_md_path}")
    print(f"Conversion complete. Markdown file generated: {output_md_path}")

    return output_md_path


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Convert PDF to Markdown with text, images, and LaTeX equations")
    parser.add_argument("pdf_path", help="Path to the PDF file")
    parser.add_argument("--output_md", default="output.md", help="Path to output Markdown file")
    parser.add_argument("--output_dir", default="output", help="Directory for extracted images")
    parser.add_argument("--language", default="english", choices=["english", "french"],
                        help="Language for image descriptions (english or french)")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging (DEBUG level)")

    args = parser.parse_args()

    # Set log level
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
        logger.info("Verbose logging enabled")

    # Process PDF
    process_pdf_to_markdown(args.pdf_path, args.output_md, args.output_dir, args.language)
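
End to end, the converter can be driven from the command line (python pdf_to_markdown.py report.pdf --output_md report.md --language french) or directly from Python. A minimal sketch, assuming an Ollama server is running at the URL in OLLAMA_API_URL with the configured model pulled; the file names are placeholders, not from the commit:

# Hypothetical end-to-end call, not part of the committed file.
from pdf_to_markdown import process_pdf_to_markdown

md_path = process_pdf_to_markdown(
    "report.pdf",          # placeholder input PDF
    "report.md",           # Markdown output path
    output_dir="output",   # where page renders and clipped images are written
    lang="french",         # image descriptions in French; default is "english"
)
print(md_path)
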
requirements.txt (274 changed lines)
@@ -1,42 +1,232 @@
Removed:
# Core RAG and LLM libraries
langchain>=0.0.267
langchain-community>=0.0.10
transformers>=4.30.0
langchain_community

# Document processing
unstructured>=0.10.0
pdf2image>=1.16.3
pypdf2>=3.0.0
pdfminer.six>=20221105

# OCR and image processing
pytesseract>=0.3.10
Pillow>=9.5.0
opencv-python>=4.8.0

# Table extraction
camelot-py>=0.11.0
tabula-py>=2.7.0

# Data manipulation
pandas>=2.0.0
numpy

# Visualization
matplotlib>=3.7.0

# Optional but commonly used with RAG
scikit-learn>=1.2.0
sentence-transformers>=2.2.2

# Vector database connections (common choices, uncomment as needed)
# chromadb>=0.4.6
# pinecone-client>=2.2.2
# qdrant-client>=1.3.0
# faiss-cpu>=1.7.4

# Utilities
tqdm>=4.65.0
python-dotenv>=1.0.0
pi_heif

Added (pinned versions):
acres==0.3.0
aiofiles==24.1.0
aiohappyeyeballs==2.4.6
aiohttp==3.11.13
aiosignal==1.3.2
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.8.0
asttokens==3.0.0
attrs==25.1.0
backoff==2.2.1
beautifulsoup4==4.13.3
cachetools==5.5.2
camelot-py==1.0.0
certifi==2025.1.31
cffi==1.17.1
chardet==5.2.0
charset-normalizer==3.4.1
ci-info==0.3.0
click==8.1.8
coloredlogs==15.0.1
comm==0.2.2
configobj==5.0.9
configparser==7.1.0
contourpy==1.3.1
cryptography==44.0.1
cycler==0.12.1
dataclasses-json==0.6.7
debugpy==1.8.12
decorator==5.2.1
Deprecated==1.2.18
distro==1.9.0
effdet==0.4.1
emoji==2.14.1
et_xmlfile==2.0.0
etelemetry==0.3.1
eval_type_backport==0.2.2
executing==2.2.0
filelock==3.17.0
filetype==1.2.0
flatbuffers==25.2.10
fonttools==4.56.0
frontend==0.0.3
frozenlist==1.5.0
fsspec==2025.2.0
google-api-core==2.24.1
google-auth==2.38.0
google-cloud-vision==3.10.0
googleapis-common-protos==1.68.0
greenlet==3.1.1
grpcio==1.71.0rc2
grpcio-status==1.71.0rc2
grpcio-tools==1.70.0
h11==0.14.0
h2==4.2.0
hpack==4.1.0
html5lib==1.1
httpcore==1.0.7
httplib2==0.22.0
httpx==0.28.1
httpx-sse==0.4.0
huggingface-hub==0.29.1
humanfriendly==10.0
hyperframe==6.1.0
idna==3.10
ipykernel==6.29.5
ipython==9.0.0
ipython_pygments_lexers==1.1.1
ipywidgets==8.1.5
isodate==0.6.1
itsdangerous==2.2.0
jedi==0.19.2
Jinja2==3.1.5
jiter==0.8.2
joblib==1.4.2
jsonpatch==1.33
jsonpointer==3.0.0
jupyter_client==8.6.3
jupyter_core==5.7.2
jupyterlab_widgets==3.0.13
kiwisolver==1.4.8
langchain==0.3.19
langchain-community==0.3.18
langchain-core==0.3.40
langchain-deepseek==0.1.2
langchain-ollama==0.2.3
langchain-openai==0.3.7
langchain-qdrant==0.2.0
langchain-text-splitters==0.3.6
langdetect==1.0.9
langsmith==0.3.11
looseversion==1.3.0
lxml==5.3.1
Markdown==3.7
MarkupSafe==3.0.2
marshmallow==3.26.1
matplotlib==3.10.1
matplotlib-inline==0.1.7
mpmath==1.3.0
multidict==6.1.0
mypy-extensions==1.0.0
nest-asyncio==1.6.0
networkx==3.4.2
nibabel==5.3.2
nipype==1.9.2
nltk==3.9.1
numpy==1.26.4
nvidia-cublas-cu12==12.4.5.8
nvidia-cuda-cupti-cu12==12.4.127
nvidia-cuda-nvrtc-cu12==12.4.127
nvidia-cuda-runtime-cu12==12.4.127
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.2.1.3
nvidia-curand-cu12==10.3.5.147
nvidia-cusolver-cu12==11.6.1.9
nvidia-cusparse-cu12==12.3.1.170
nvidia-cusparselt-cu12==0.6.2
nvidia-nccl-cu12==2.21.5
nvidia-nvjitlink-cu12==12.4.127
nvidia-nvtx-cu12==12.4.127
olefile==0.47
ollama==0.4.7
omegaconf==2.3.0
onnx==1.17.0
onnxruntime==1.20.1
openai==1.65.2
opencv-python==4.11.0.86
opencv-python-headless==4.11.0.86
openpyxl==3.1.5
orjson==3.10.15
packaging==24.2
pandas==2.2.3
parso==0.8.4
pathlib==1.0.1
pdf2image==1.17.0
pdfminer.six==20240706
pexpect==4.9.0
pi_heif==0.21.0
pikepdf==9.5.2
pillow==11.1.0
platformdirs==4.3.6
portalocker==2.10.1
prompt_toolkit==3.0.50
propcache==0.3.0
proto-plus==1.26.0
protobuf==5.29.3
prov==2.0.1
psutil==7.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
puremagic==1.28
pyasn1==0.6.1
pyasn1_modules==0.4.1
pycocotools==2.0.8
pycparser==2.22
pydantic==2.10.6
pydantic-settings==2.8.1
pydantic_core==2.27.2
pydot==3.0.4
Pygments==2.19.1
PyMuPDF==1.25.3
pymupdf4llm==0.0.17
pypandoc==1.15
pyparsing==3.2.1
pypdf==5.3.0
PyPDF2==3.0.1
pypdfium2==4.30.1
pytesseract==0.3.13
python-dateutil==2.9.0.post0
python-docx==1.1.2
python-dotenv==1.0.1
python-iso639==2025.2.18
python-magic==0.4.27
python-multipart==0.0.20
python-oxmsg==0.0.2
python-pptx==1.0.2
pytz==2025.1
pyxnat==1.6.3
PyYAML==6.0.2
pyzmq==26.2.1
qdrant-client==1.13.2
RapidFuzz==3.12.1
rdflib==6.3.2
regex==2024.11.6
requests==2.32.3
requests-toolbelt==1.0.0
rsa==4.9
safetensors==0.5.3
scikit-learn==1.6.1
scipy==1.15.2
sentence-transformers==3.4.1
setuptools==75.8.2
simplejson==3.20.1
six==1.17.0
sniffio==1.3.1
soupsieve==2.6
SQLAlchemy==2.0.38
stack-data==0.6.3
starlette==0.46.0
sympy==1.13.1
tabula-py==2.10.0
tabulate==0.9.0
tenacity==9.0.0
threadpoolctl==3.5.0
tiktoken==0.9.0
timm==1.0.15
tokenizers==0.21.0
torch==2.6.0
torchvision==0.21.0
tornado==6.4.2
tqdm==4.67.1
traitlets==5.14.3
traits==7.0.2
transformers==4.49.0
triton==3.2.0
typing-inspect==0.9.0
typing_extensions==4.12.2
tzdata==2025.1
unstructured==0.16.23
unstructured-client==0.30.6
unstructured-inference==0.8.7
unstructured.pytesseract==0.3.13
urllib3==2.3.0
uvicorn==0.34.0
wcwidth==0.2.13
webencodings==0.5.1
widgetsnbextension==4.0.13
wrapt==1.17.2
xlrd==2.0.1
XlsxWriter==3.2.2
yarl==1.18.3
zstandard==0.23.0

File diff suppressed because one or more lines are too long