Initial commit
This commit is contained in:
commit
0d396d9bd9
5
.vscode/settings.json
vendored
Normal file
5
.vscode/settings.json
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
{
|
||||
"python.analysis.extraPaths": [
|
||||
"./src/document_processing"
|
||||
]
|
||||
}
|
||||
39
README.md
Normal file
39
README.md
Normal file
@ -0,0 +1,39 @@
|
||||
# RAG Modeling Project
|
||||
|
||||
## Overview
|
||||
|
||||
RAG Modeling is an advanced Retrieval-Augmented Generation system with comprehensive document processing capabilities. The system focuses on extracting high-quality data from PDF documents including text, images, and tables to build robust RAG applications.
|
||||
|
||||
## Features
|
||||
|
||||
- **Advanced PDF Processing**:
|
||||
- Multiple extraction methods for maximum text coverage
|
||||
- Image extraction with OCR capabilities
|
||||
- Table detection and extraction
|
||||
- Structured document parsing
|
||||
|
||||
- **Text Processing Pipeline**:
|
||||
- Intelligent text chunking for optimal context management
|
||||
- Support for multiple languages
|
||||
- Metadata preservation
|
||||
|
||||
- **Modular Architecture**:
|
||||
- Component-based design for easy extension
|
||||
- Configurable processing parameters
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
# Clone the repository
|
||||
git clone https://gitea.parsanet.org/sepehr/rag.git
|
||||
cd rag
|
||||
|
||||
# Create a virtual environment (optional but recommended)
|
||||
python -m venv venv
|
||||
source venv/bin/activate # On Windows: venv\Scripts\activate
|
||||
|
||||
# Install dependencies
|
||||
pip install -r requirements.txt
|
||||
|
||||
# Install additional dependencies
|
||||
pip install unstructured pytesseract camelot-py opencv-python pandas
|
||||
BIN
document/04Extrait_Methodologie_Experimentale.pdf
Normal file
BIN
document/04Extrait_Methodologie_Experimentale.pdf
Normal file
Binary file not shown.
BIN
document/11_chapitre3.pdf
Normal file
BIN
document/11_chapitre3.pdf
Normal file
Binary file not shown.
BIN
document/Echangeurs.pdf
Normal file
BIN
document/Echangeurs.pdf
Normal file
Binary file not shown.
BIN
document/TEST.png
Normal file
BIN
document/TEST.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 91 KiB |
BIN
document/synthese.doc
Normal file
BIN
document/synthese.doc
Normal file
Binary file not shown.
0
src/document_processing/__init__.py
Normal file
0
src/document_processing/__init__.py
Normal file
Binary file not shown.
Binary file not shown.
0
src/document_processing/docx_processor.py
Normal file
0
src/document_processing/docx_processor.py
Normal file
315
src/document_processing/pdf_processor.py
Normal file
315
src/document_processing/pdf_processor.py
Normal file
@ -0,0 +1,315 @@
|
||||
import os
|
||||
import logging
|
||||
from typing import Dict, List, Optional, Tuple, Union, Any
|
||||
import tempfile
|
||||
|
||||
# LangChain imports
|
||||
from langchain_community.document_loaders import PyPDFLoader, UnstructuredPDFLoader
|
||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||
from langchain_community.document_loaders.pdf import PDFMinerLoader
|
||||
from langchain_community.document_loaders import PyPDFDirectoryLoader
|
||||
|
||||
# Image processing
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
# Point pytesseract at the standard Windows install only when that file really
# exists; on every other machine keep pytesseract's default, which resolves the
# "tesseract" binary through PATH.
_WINDOWS_TESSERACT_EXE = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
if os.path.isfile(_WINDOWS_TESSERACT_EXE):
    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_EXE
|
||||
# Table extraction
|
||||
import camelot
|
||||
import pandas as pd
|
||||
|
||||
# For unstructured data
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
# from unstructured.partition.auto import partition
|
||||
# unstructured expects OCR_AGENT to be the dotted path of an OCR agent class
# ("<whitelisted module>.<ClassName>"), not the path of the tesseract binary.
# setdefault keeps any agent the user has already configured in the environment.
os.environ.setdefault(
    'OCR_AGENT',
    'unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract',
)
|
||||
# Setup logging
|
||||
# Root logging is configured at import time (INFO level, timestamped records);
# the module then logs through its own named logger.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)
|
||||
|
||||
class AdvancedPDFProcessor:
    """
    Process PDF documents with extraction of text, tables and images, using
    LangChain loaders, Camelot, unstructured and pytesseract.
    """

    # Tesseract location of a standard Windows install. It is applied only when
    # the file exists, so other platforms keep pytesseract's PATH lookup.
    DEFAULT_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

    def __init__(self,
                 ocr_enabled: bool = True,
                 extract_tables: bool = True,
                 extract_images: bool = True,
                 chunk_size: int = 1000,
                 chunk_overlap: int = 200,
                 tesseract_cmd: Optional[str] = None,
                 image_output_dir: Optional[str] = None):
        """
        Initialise the PDF processor with the configured options.

        Args:
            ocr_enabled: If True, run OCR on the extracted images.
            extract_tables: If True, try to extract tables.
            extract_images: If True, extract the images embedded in the PDF.
            chunk_size: Size of the chunks produced when splitting the text.
            chunk_overlap: Overlap between consecutive chunks.
            tesseract_cmd: Explicit path of the Tesseract executable. When
                omitted, DEFAULT_TESSERACT_CMD is used if that file exists;
                otherwise pytesseract's current setting is left untouched.
            image_output_dir: Directory in which extracted images are kept.
                When omitted, images are written to a temporary directory that
                is deleted before the results are returned, so the "path" of
                each image entry no longer exists on disk.
        """
        self.ocr_enabled = ocr_enabled
        self.extract_tables = extract_tables
        self.extract_images = extract_images
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.image_output_dir = image_output_dir

        # Configure pytesseract only when OCR is requested.
        if ocr_enabled:
            if tesseract_cmd is not None:
                pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
            elif os.path.isfile(self.DEFAULT_TESSERACT_CMD):
                pytesseract.pytesseract.tesseract_cmd = self.DEFAULT_TESSERACT_CMD

    def process_pdf(self, pdf_path: str) -> Dict[str, Any]:
        """
        Process a PDF file and extract its content in a structured way.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            Dictionary with the keys "text" (str), "chunks" (list of str),
            "tables" (list of dict), "images" (list of dict) and "metadata".

        Raises:
            FileNotFoundError: If pdf_path does not exist.
        """
        logger.info(f"Début du traitement du fichier PDF: {pdf_path}")

        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"Le fichier {pdf_path} n'existe pas")

        result: Dict[str, Any] = {
            "text": "",
            "chunks": [],
            "tables": [],
            "images": [],
            "metadata": {
                "filename": os.path.basename(pdf_path),
                "path": pdf_path,
                "size_bytes": os.path.getsize(pdf_path),
            },
        }

        # 1. Text extraction, trying several methods to maximise coverage.
        result["text"] = self._extract_text(pdf_path)

        # 2. Split the text into chunks that are easier for LLMs to handle.
        result["chunks"] = self._chunk_text(result["text"])

        # 3. Table extraction, when enabled.
        if self.extract_tables:
            result["tables"] = self._extract_tables(pdf_path)

        # 4. Image extraction (and OCR), when enabled.
        if self.extract_images:
            result["images"] = self._extract_images(pdf_path)

        logger.info("Traitement du PDF terminé: %d chunks, %d tableaux, %d images",
                    len(result['chunks']), len(result['tables']), len(result['images']))

        return result

    def _extract_text(self, pdf_path: str) -> str:
        """
        Extract the text of the PDF, trying several extractors in turn.

        Each extractor is best-effort: a failure is logged as a warning and the
        next one is tried. Returns an empty string when every extractor fails.
        """
        text_content = ""

        # Method 1: LangChain's PyPDFLoader.
        try:
            logger.info("Extraction de texte avec PyPDFLoader")
            documents = PyPDFLoader(pdf_path).load()
            text_content = "\n".join(doc.page_content for doc in documents)
        except Exception as e:
            logger.warning(f"Erreur avec PyPDFLoader: {e}")

        # Method 2: PDFMinerLoader, used only as a fallback. Its output was
        # never used when method 1 produced text, so it is skipped in that case.
        if not text_content:
            try:
                logger.info("Extraction de texte avec PDFMinerLoader")
                miner_docs = PDFMinerLoader(pdf_path).load()
                text_content = "\n".join(doc.page_content for doc in miner_docs)
            except Exception as e:
                logger.warning(f"Erreur avec PDFMinerLoader: {e}")

        # Method 3: unstructured. It always runs, and wins when it returns more
        # text than the previous methods.
        try:
            logger.info("Extraction de texte avec Unstructured")
            elements = partition_pdf(pdf_path, extract_images_in_pdf=False, infer_table_structure=False)
            unstructured_text = "\n".join(str(element) for element in elements)

            if not text_content or len(unstructured_text) > len(text_content):
                text_content = unstructured_text
        except Exception as e:
            logger.warning(f"Erreur avec Unstructured: {e}")

        return text_content

    def _chunk_text(self, text: str) -> List[str]:
        """Split the text into chunks of chunk_size characters with chunk_overlap overlap."""
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
        )
        return text_splitter.split_text(text)

    def _extract_tables(self, pdf_path: str) -> List[Dict[str, Union[str, pd.DataFrame]]]:
        """
        Extract the tables of the PDF with Camelot.

        Both Camelot flavors are tried, "stream" first and then "lattice". Each
        flavor is guarded separately so that a failure of one does not discard
        the tables found by the other.
        """
        tables_data = []

        logger.info("Extraction des tableaux avec Camelot")
        for flavor in ("stream", "lattice"):
            try:
                tables = camelot.read_pdf(pdf_path, pages='all', flavor=flavor)
                for i, table in enumerate(tables):
                    if table.df.size > 0:  # keep only tables that contain data
                        tables_data.append({
                            "page": table.page,
                            "type": flavor,
                            "data": table.df,
                            "accuracy": table.accuracy,
                            "description": f"Table {i+1} ({flavor.capitalize()}) de la page {table.page}"
                        })
            except Exception as e:
                logger.warning(f"Erreur lors de l'extraction des tableaux: {e}")

        return tables_data

    def _extract_images(self, pdf_path: str) -> List[Dict[str, Any]]:
        """
        Extract the images of the PDF and, when OCR is enabled, their text.

        Returns:
            One dictionary per image with the keys "page", "path", "position"
            and "text". Extraction is best-effort: on failure a warning is
            logged and the entries collected so far are returned.
        """
        images_data: List[Dict[str, Any]] = []

        try:
            logger.info("Extraction des images avec Unstructured")
            if self.image_output_dir:
                # Persistent output: the returned paths stay valid.
                os.makedirs(self.image_output_dir, exist_ok=True)
                self._collect_images(pdf_path, self.image_output_dir, images_data)
            else:
                # OCR must run while the temporary directory still exists.
                with tempfile.TemporaryDirectory() as temp_dir:
                    self._collect_images(pdf_path, temp_dir, images_data)
        except Exception as e:
            logger.warning(f"Erreur lors de l'extraction des images: {e}")

        return images_data

    def _collect_images(self, pdf_path: str, output_dir: str,
                        images_data: List[Dict[str, Any]]) -> None:
        """Partition the PDF into output_dir and append one entry per image to images_data."""
        elements = partition_pdf(
            pdf_path,
            extract_images_in_pdf=True,
            # partition_pdf takes the image directory under this keyword.
            extract_image_block_output_dir=output_dir
        )

        # Keep only the elements that carry the path of an extracted image.
        image_elements = [el for el in elements if self._element_field(el, 'image_path')]

        for i, img_element in enumerate(image_elements):
            img_path = self._element_field(img_element, 'image_path')
            img_data = {
                "page": self._element_field(img_element, 'page_number'),
                "path": img_path,
                "position": self._element_field(img_element, 'coordinates'),
                "text": None  # filled in by OCR when enabled
            }

            if self.ocr_enabled and os.path.exists(img_path):
                try:
                    logger.info("Extraction des textes avec ocr avec Unstructured")
                    # Close the file promptly so the temporary directory can be removed.
                    with Image.open(img_path) as img:
                        ocr_text = pytesseract.image_to_string(img)
                    img_data["text"] = ocr_text.strip()
                except Exception as e:
                    logger.warning(f"Erreur OCR sur l'image {i}: {e}")

            images_data.append(img_data)

    @staticmethod
    def _element_field(element: Any, name: str) -> Any:
        """Return field name from the element's metadata, else from the element itself, else None."""
        value = getattr(getattr(element, 'metadata', None), name, None)
        if value is None:
            value = getattr(element, name, None)
        return value
|
||||
|
||||
|
||||
def process_pdf_document(pdf_path: str, **kwargs) -> Dict[str, Any]:
    """
    Convenience wrapper: build an AdvancedPDFProcessor and run it on one PDF.

    Args:
        pdf_path: Path of the PDF file to process.
        **kwargs: Configuration options forwarded to the AdvancedPDFProcessor
            constructor.

    Returns:
        Dictionary holding the data extracted from the PDF, as returned by
        AdvancedPDFProcessor.process_pdf.
    """
    return AdvancedPDFProcessor(**kwargs).process_pdf(pdf_path)
|
||||
|
||||
def process_pdf_with_unstructured_loader(pdf_path: str, **kwargs) -> Dict[str, Any]:
    """
    Extract the content of a PDF with LangChain's UnstructuredPDFLoader.

    Args:
        pdf_path: Path of the PDF file to process.
        **kwargs: "chunk_size" (default 1000) and "chunk_overlap" (default 200)
            configure the text splitter and are not forwarded to the loader.
            Every other option is passed to UnstructuredPDFLoader; "mode"
            defaults to "elements" and "strategy" to "fast", and both may be
            overridden by the caller.

    Returns:
        Dictionary with the keys "text", "chunks", "metadata" and "elements".
        Loading is best-effort: on failure a warning is logged and "text",
        "chunks" and "elements" keep whatever was filled in before the error.

    Raises:
        FileNotFoundError: If pdf_path does not exist.
    """
    logger.info(f"Traitement du PDF avec UnstructuredPDFLoader: {pdf_path}")

    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"Le fichier {pdf_path} n'existe pas")

    # Separate the splitter options from the loader options. Work on a copy so
    # the defaults below never clash with a keyword supplied by the caller.
    loader_options = dict(kwargs)
    chunk_size = loader_options.pop("chunk_size", 1000)
    chunk_overlap = loader_options.pop("chunk_overlap", 200)
    loader_options.setdefault("mode", "elements")  # structured, per-element extraction
    loader_options.setdefault("strategy", "fast")

    result = {
        "text": "",
        "chunks": [],
        "metadata": {
            "filename": os.path.basename(pdf_path),
            "path": pdf_path,
            "size_bytes": os.path.getsize(pdf_path),
        },
        "elements": []
    }

    try:
        documents = UnstructuredPDFLoader(pdf_path, **loader_options).load()

        # Raw text: one line per extracted element.
        result["text"] = "\n".join(doc.page_content for doc in documents)

        # Keep every element together with its metadata.
        result["elements"] = [
            {"content": doc.page_content, "metadata": doc.metadata}
            for doc in documents
        ]

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )
        result["chunks"] = text_splitter.split_text(result["text"])

        logger.info(f"UnstructuredPDFLoader: extrait {len(documents)} éléments et {len(result['chunks'])} chunks")

    except Exception as e:
        logger.warning(f"Erreur lors du traitement avec UnstructuredPDFLoader: {e}")

    return result
|
||||
19
test/pdf_processing_test.py
Normal file
19
test/pdf_processing_test.py
Normal file
@ -0,0 +1,19 @@
|
||||
# Usage example
import sys
import os

# Directory of this script; every other path is resolved from it so the example
# runs from any checkout location and on any operating system.
_TEST_DIR = os.path.dirname(os.path.abspath(__file__))

sys.path.insert(0, os.path.abspath(os.path.join(_TEST_DIR, '../src/document_processing')))
from pdf_processor import process_pdf_document

# Sample PDF shipped in the repository's "document" folder.
pdf_path = os.path.abspath(
    os.path.join(_TEST_DIR, '..', 'document', '04Extrait_Methodologie_Experimentale.pdf')
)

result = process_pdf_document(
    pdf_path,
    ocr_enabled=True,
    extract_tables=True,
    extract_images=True,
    chunk_size=1000,
    chunk_overlap=200,
)

# Access the different parts of the result
text_chunks = result["chunks"]
tables = result["tables"]
images = result["images"]
|
||||
311
test_processing.ipynb
Normal file
311
test_processing.ipynb
Normal file
@ -0,0 +1,311 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"c:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\pypdf\\_crypt_providers\\_cryptography.py:32: CryptographyDeprecationWarning: ARC4 has been moved to cryptography.hazmat.decrepit.ciphers.algorithms.ARC4 and will be removed from cryptography.hazmat.primitives.ciphers.algorithms in 48.0.0.\n",
|
||||
" from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4\n",
|
||||
"c:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import sys\n",
|
||||
"import os\n",
|
||||
"sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), './src/document_processing')))\n",
|
||||
"from pdf_processor import process_pdf_document\n",
|
||||
"from pdf_processor import process_pdf_with_unstructured_loader\n",
|
||||
"pdf_path = r\"F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf\"\n",
|
||||
"from PIL import Image\n",
|
||||
"import pytesseract \n",
|
||||
"pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2025-02-28 22:14:30,067 - pdf_processor - INFO - Début du traitement du fichier PDF: F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf\n",
|
||||
"2025-02-28 22:14:30,068 - pdf_processor - INFO - Extraction de texte avec PyPDFLoader\n",
|
||||
"2025-02-28 22:14:30,355 - pdf_processor - INFO - Extraction de texte avec PDFMinerLoader\n",
|
||||
"2025-02-28 22:14:31,675 - pdf_processor - WARNING - Erreur avec PDFMinerLoader: The PDF parser must valorize the standard metadata.\n",
|
||||
"2025-02-28 22:14:31,678 - pdf_processor - INFO - Extraction de texte avec Unstructured\n",
|
||||
"2025-02-28 22:14:31,680 - unstructured - INFO - PDF text extraction failed, skip text extraction...\n",
|
||||
"2025-02-28 22:14:31,682 - unstructured - WARNING - pytesseract is not installed. Cannot use the ocr_only partitioning strategy. Falling back to partitioning with another strategy.\n",
|
||||
"2025-02-28 22:14:31,682 - unstructured - WARNING - Falling back to partitioning with hi_res.\n",
|
||||
"2025-02-28 22:14:31,683 - unstructured_inference - INFO - Reading PDF for file: F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf ...\n",
|
||||
"2025-02-28 22:14:45,041 - pdf_processor - WARNING - Erreur avec Unstructured: Environment variable OCR_AGENT module name C:\\Program Files\\Tesseract-OCR\\tesseract must be set to a whitelisted module part of ['unstructured.partition.utils.ocr_models.tesseract_ocr', 'unstructured.partition.utils.ocr_models.paddle_ocr', 'unstructured.partition.utils.ocr_models.google_vision_ocr'].\n",
|
||||
"2025-02-28 22:14:45,044 - pdf_processor - INFO - Extraction des tableaux avec Camelot\n",
|
||||
"2025-02-28 22:14:56,714 - pdf_processor - INFO - Extraction des images avec Unstructured\n",
|
||||
"2025-02-28 22:14:56,717 - unstructured - INFO - PDF text extraction failed, skip text extraction...\n",
|
||||
"2025-02-28 22:14:56,719 - unstructured_inference - INFO - Reading PDF for file: F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf ...\n",
|
||||
"2025-02-28 22:15:09,709 - pdf_processor - WARNING - Erreur lors de l'extraction des images: Environment variable OCR_AGENT module name C:\\Program Files\\Tesseract-OCR\\tesseract must be set to a whitelisted module part of ['unstructured.partition.utils.ocr_models.tesseract_ocr', 'unstructured.partition.utils.ocr_models.paddle_ocr', 'unstructured.partition.utils.ocr_models.google_vision_ocr'].\n",
|
||||
"2025-02-28 22:15:09,711 - pdf_processor - INFO - Traitement du PDF terminé: 30 chunks, 18 tableaux, 0 images\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"result = process_pdf_document(\n",
|
||||
" pdf_path,\n",
|
||||
" ocr_enabled=True,\n",
|
||||
" extract_tables=True,\n",
|
||||
" extract_images=True,\n",
|
||||
" chunk_size=1000,\n",
|
||||
" chunk_overlap=200\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Accès aux différentes parties du résultat\n",
|
||||
"text_chunks = result[\"chunks\"]\n",
|
||||
"tables = result[\"tables\"]\n",
|
||||
"images = result[\"images\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2025-02-28 22:23:44,919 - pdf_processor - INFO - Traitement du PDF avec UnstructuredPDFLoader: F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf\n",
|
||||
"2025-02-28 22:23:44,921 - unstructured - INFO - PDF text extraction failed, skip text extraction...\n",
|
||||
"2025-02-28 22:23:44,923 - pdf_processor - INFO - UnstructuredPDFLoader: extrait 0 éléments et 0 chunks\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"result = process_pdf_with_unstructured_loader(\n",
|
||||
" pdf_path,\n",
|
||||
" chunk_size=1000,\n",
|
||||
" chunk_overlap=200,\n",
|
||||
" # Vous pouvez passer des options spécifiques à UnstructuredPDFLoader:\n",
|
||||
" \n",
|
||||
" include_page_breaks=True # Pour inclure les sauts de page\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Accéder aux résultats\n",
|
||||
"text = result[\"text\"]\n",
|
||||
"chunks = result[\"chunks\"]\n",
|
||||
"elements = result[\"elements\"] \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"# Correction de la configuration OCR\n",
|
||||
"import pytesseract \n",
|
||||
"import os\n",
|
||||
"pytesseract.pytesseract.tesseract_cmd = r\"C:\\Program Files\\Tesseract-OCR\\tesseract.exe\"\n",
|
||||
"os.environ['TESSDATA_PREFIX'] = os.environ['TESSDATA_PREFIX'] = r\"C:\\Program Files\\Tesseract-OCR\\tessdata\"\n",
|
||||
"# Au lieu du chemin vers l'exécutable, utilisez le nom de module approprié\n",
|
||||
"os.environ['OCR_AGENT'] = r\"C:\\Program Files\\Tesseract-OCR\\tessdata\"\n",
|
||||
"from langchain_community.document_loaders import UnstructuredPDFLoader\n",
|
||||
"\n",
|
||||
"pdf_path = r\"F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf\"\n",
|
||||
"loader = UnstructuredPDFLoader(pdf_path)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"c:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\pypdf\\_crypt_providers\\_cryptography.py:32: CryptographyDeprecationWarning: ARC4 has been moved to cryptography.hazmat.decrepit.ciphers.algorithms.ARC4 and will be removed from cryptography.hazmat.primitives.ciphers.algorithms in 48.0.0.\n",
|
||||
" from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4\n",
|
||||
"c:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n",
|
||||
"pytesseract is not installed. Cannot use the ocr_only partitioning strategy. Falling back to partitioning with another strategy.\n",
|
||||
"Falling back to partitioning with hi_res.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"je suis ici dans OCR_AGENT\n",
|
||||
"C:\\Program Files\\Tesseract-OCR\\tessdata\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "ValueError",
|
||||
"evalue": "not enough values to unpack (expected 2, got 1)",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m docs \u001b[38;5;241m=\u001b[39m \u001b[43mloader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m docs[\u001b[38;5;241m0\u001b[39m]\n",
|
||||
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\langchain_core\\document_loaders\\base.py:31\u001b[0m, in \u001b[0;36mBaseLoader.load\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 29\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mload\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m[Document]:\n\u001b[0;32m 30\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Load data into Document objects.\"\"\"\u001b[39;00m\n\u001b[1;32m---> 31\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlazy_load\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\langchain_community\\document_loaders\\unstructured.py:107\u001b[0m, in \u001b[0;36mUnstructuredBaseLoader.lazy_load\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 105\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mlazy_load\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Iterator[Document]:\n\u001b[0;32m 106\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Load file.\"\"\"\u001b[39;00m\n\u001b[1;32m--> 107\u001b[0m elements \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_elements\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 108\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_post_process_elements(elements)\n\u001b[0;32m 109\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmode \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124melements\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
|
||||
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\langchain_community\\document_loaders\\pdf.py:94\u001b[0m, in \u001b[0;36mUnstructuredPDFLoader._get_elements\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 91\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_get_elements\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m:\n\u001b[0;32m 92\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01munstructured\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpartition\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpdf\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m partition_pdf\n\u001b[1;32m---> 94\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m partition_pdf(filename\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_path, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39munstructured_kwargs)\n",
|
||||
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\documents\\elements.py:581\u001b[0m, in \u001b[0;36mprocess_metadata.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 579\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[0;32m 580\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;241m*\u001b[39margs: _P\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: _P\u001b[38;5;241m.\u001b[39mkwargs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m[Element]:\n\u001b[1;32m--> 581\u001b[0m elements \u001b[38;5;241m=\u001b[39m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 582\u001b[0m call_args \u001b[38;5;241m=\u001b[39m get_call_args_applying_defaults(func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 584\u001b[0m unique_element_ids: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m call_args\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124munique_element_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m)\n",
|
||||
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\file_utils\\filetype.py:815\u001b[0m, in \u001b[0;36madd_filetype.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 813\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[0;32m 814\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;241m*\u001b[39margs: _P\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: _P\u001b[38;5;241m.\u001b[39mkwargs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m[Element]:\n\u001b[1;32m--> 815\u001b[0m elements \u001b[38;5;241m=\u001b[39m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 817\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m element \u001b[38;5;129;01min\u001b[39;00m elements:\n\u001b[0;32m 818\u001b[0m \u001b[38;5;66;03m# NOTE(robinson) - Attached files have already run through this logic\u001b[39;00m\n\u001b[0;32m 819\u001b[0m \u001b[38;5;66;03m# in their own partitioning function\u001b[39;00m\n\u001b[0;32m 820\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m element\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mattached_to_filename \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
|
||||
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\file_utils\\filetype.py:773\u001b[0m, in \u001b[0;36madd_metadata.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 771\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[0;32m 772\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;241m*\u001b[39margs: _P\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: _P\u001b[38;5;241m.\u001b[39mkwargs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m[Element]:\n\u001b[1;32m--> 773\u001b[0m elements \u001b[38;5;241m=\u001b[39m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 774\u001b[0m call_args \u001b[38;5;241m=\u001b[39m get_call_args_applying_defaults(func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 776\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m call_args\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmetadata_filename\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n",
|
||||
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\chunking\\dispatch.py:74\u001b[0m, in \u001b[0;36madd_chunking_strategy.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 71\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"The decorated function is replaced with this one.\"\"\"\u001b[39;00m\n\u001b[0;32m 73\u001b[0m \u001b[38;5;66;03m# -- call the partitioning function to get the elements --\u001b[39;00m\n\u001b[1;32m---> 74\u001b[0m elements \u001b[38;5;241m=\u001b[39m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 76\u001b[0m \u001b[38;5;66;03m# -- look for a chunking-strategy argument --\u001b[39;00m\n\u001b[0;32m 77\u001b[0m call_args \u001b[38;5;241m=\u001b[39m get_call_args_applying_defaults(func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
|
||||
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\pdf.py:229\u001b[0m, in \u001b[0;36mpartition_pdf\u001b[1;34m(filename, file, include_page_breaks, strategy, infer_table_structure, ocr_languages, languages, metadata_filename, metadata_last_modified, chunking_strategy, hi_res_model_name, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, starting_page_number, extract_forms, form_extraction_skip_tables, password, pdfminer_line_margin, pdfminer_char_margin, pdfminer_line_overlap, pdfminer_word_margin, **kwargs)\u001b[0m\n\u001b[0;32m 226\u001b[0m exactly_one(filename\u001b[38;5;241m=\u001b[39mfilename, file\u001b[38;5;241m=\u001b[39mfile)\n\u001b[0;32m 228\u001b[0m languages \u001b[38;5;241m=\u001b[39m check_language_args(languages \u001b[38;5;129;01mor\u001b[39;00m [], ocr_languages)\n\u001b[1;32m--> 229\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m partition_pdf_or_image(\n\u001b[0;32m 230\u001b[0m filename\u001b[38;5;241m=\u001b[39mfilename,\n\u001b[0;32m 231\u001b[0m file\u001b[38;5;241m=\u001b[39mfile,\n\u001b[0;32m 232\u001b[0m include_page_breaks\u001b[38;5;241m=\u001b[39minclude_page_breaks,\n\u001b[0;32m 233\u001b[0m strategy\u001b[38;5;241m=\u001b[39mstrategy,\n\u001b[0;32m 234\u001b[0m infer_table_structure\u001b[38;5;241m=\u001b[39minfer_table_structure,\n\u001b[0;32m 235\u001b[0m languages\u001b[38;5;241m=\u001b[39mlanguages,\n\u001b[0;32m 236\u001b[0m metadata_last_modified\u001b[38;5;241m=\u001b[39mmetadata_last_modified,\n\u001b[0;32m 237\u001b[0m hi_res_model_name\u001b[38;5;241m=\u001b[39mhi_res_model_name,\n\u001b[0;32m 238\u001b[0m extract_images_in_pdf\u001b[38;5;241m=\u001b[39mextract_images_in_pdf,\n\u001b[0;32m 239\u001b[0m extract_image_block_types\u001b[38;5;241m=\u001b[39mextract_image_block_types,\n\u001b[0;32m 240\u001b[0m 
extract_image_block_output_dir\u001b[38;5;241m=\u001b[39mextract_image_block_output_dir,\n\u001b[0;32m 241\u001b[0m extract_image_block_to_payload\u001b[38;5;241m=\u001b[39mextract_image_block_to_payload,\n\u001b[0;32m 242\u001b[0m starting_page_number\u001b[38;5;241m=\u001b[39mstarting_page_number,\n\u001b[0;32m 243\u001b[0m extract_forms\u001b[38;5;241m=\u001b[39mextract_forms,\n\u001b[0;32m 244\u001b[0m form_extraction_skip_tables\u001b[38;5;241m=\u001b[39mform_extraction_skip_tables,\n\u001b[0;32m 245\u001b[0m password\u001b[38;5;241m=\u001b[39mpassword,\n\u001b[0;32m 246\u001b[0m pdfminer_line_margin\u001b[38;5;241m=\u001b[39mpdfminer_line_margin,\n\u001b[0;32m 247\u001b[0m pdfminer_char_margin\u001b[38;5;241m=\u001b[39mpdfminer_char_margin,\n\u001b[0;32m 248\u001b[0m pdfminer_line_overlap\u001b[38;5;241m=\u001b[39mpdfminer_line_overlap,\n\u001b[0;32m 249\u001b[0m pdfminer_word_margin\u001b[38;5;241m=\u001b[39mpdfminer_word_margin,\n\u001b[0;32m 250\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 251\u001b[0m )\n",
|
||||
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\pdf.py:342\u001b[0m, in \u001b[0;36mpartition_pdf_or_image\u001b[1;34m(filename, file, is_image, include_page_breaks, strategy, infer_table_structure, languages, metadata_last_modified, hi_res_model_name, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, starting_page_number, extract_forms, form_extraction_skip_tables, password, pdfminer_line_margin, pdfminer_char_margin, pdfminer_line_overlap, pdfminer_word_margin, **kwargs)\u001b[0m\n\u001b[0;32m 340\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m warnings\u001b[38;5;241m.\u001b[39mcatch_warnings():\n\u001b[0;32m 341\u001b[0m warnings\u001b[38;5;241m.\u001b[39msimplefilter(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m--> 342\u001b[0m elements \u001b[38;5;241m=\u001b[39m _partition_pdf_or_image_local(\n\u001b[0;32m 343\u001b[0m filename\u001b[38;5;241m=\u001b[39mfilename,\n\u001b[0;32m 344\u001b[0m file\u001b[38;5;241m=\u001b[39mspooled_to_bytes_io_if_needed(file),\n\u001b[0;32m 345\u001b[0m is_image\u001b[38;5;241m=\u001b[39mis_image,\n\u001b[0;32m 346\u001b[0m infer_table_structure\u001b[38;5;241m=\u001b[39minfer_table_structure,\n\u001b[0;32m 347\u001b[0m include_page_breaks\u001b[38;5;241m=\u001b[39minclude_page_breaks,\n\u001b[0;32m 348\u001b[0m languages\u001b[38;5;241m=\u001b[39mlanguages,\n\u001b[0;32m 349\u001b[0m ocr_languages\u001b[38;5;241m=\u001b[39mocr_languages,\n\u001b[0;32m 350\u001b[0m metadata_last_modified\u001b[38;5;241m=\u001b[39mmetadata_last_modified \u001b[38;5;129;01mor\u001b[39;00m last_modified,\n\u001b[0;32m 351\u001b[0m hi_res_model_name\u001b[38;5;241m=\u001b[39mhi_res_model_name,\n\u001b[0;32m 352\u001b[0m pdf_text_extractable\u001b[38;5;241m=\u001b[39mpdf_text_extractable,\n\u001b[0;32m 353\u001b[0m 
extract_images_in_pdf\u001b[38;5;241m=\u001b[39mextract_images_in_pdf,\n\u001b[0;32m 354\u001b[0m extract_image_block_types\u001b[38;5;241m=\u001b[39mextract_image_block_types,\n\u001b[0;32m 355\u001b[0m extract_image_block_output_dir\u001b[38;5;241m=\u001b[39mextract_image_block_output_dir,\n\u001b[0;32m 356\u001b[0m extract_image_block_to_payload\u001b[38;5;241m=\u001b[39mextract_image_block_to_payload,\n\u001b[0;32m 357\u001b[0m starting_page_number\u001b[38;5;241m=\u001b[39mstarting_page_number,\n\u001b[0;32m 358\u001b[0m extract_forms\u001b[38;5;241m=\u001b[39mextract_forms,\n\u001b[0;32m 359\u001b[0m form_extraction_skip_tables\u001b[38;5;241m=\u001b[39mform_extraction_skip_tables,\n\u001b[0;32m 360\u001b[0m password\u001b[38;5;241m=\u001b[39mpassword,\n\u001b[0;32m 361\u001b[0m pdfminer_config\u001b[38;5;241m=\u001b[39mpdfminer_config,\n\u001b[0;32m 362\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 363\u001b[0m )\n\u001b[0;32m 364\u001b[0m out_elements \u001b[38;5;241m=\u001b[39m _process_uncategorized_text_elements(elements)\n\u001b[0;32m 366\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m strategy \u001b[38;5;241m==\u001b[39m PartitionStrategy\u001b[38;5;241m.\u001b[39mFAST:\n",
|
||||
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\utils.py:216\u001b[0m, in \u001b[0;36mrequires_dependencies.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 213\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;241m*\u001b[39margs: _P\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: _P\u001b[38;5;241m.\u001b[39mkwargs):\n\u001b[0;32m 215\u001b[0m run_check()\n\u001b[1;32m--> 216\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
|
||||
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\pdf.py:687\u001b[0m, in \u001b[0;36m_partition_pdf_or_image_local\u001b[1;34m(filename, file, is_image, infer_table_structure, include_page_breaks, languages, ocr_languages, ocr_mode, model_name, hi_res_model_name, pdf_image_dpi, metadata_last_modified, pdf_text_extractable, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, analysis, analyzed_image_output_dir_path, starting_page_number, extract_forms, form_extraction_skip_tables, pdf_hi_res_max_pages, password, pdfminer_config, **kwargs)\u001b[0m\n\u001b[0;32m 680\u001b[0m \u001b[38;5;66;03m# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout\u001b[39;00m\n\u001b[0;32m 681\u001b[0m merged_document_layout \u001b[38;5;241m=\u001b[39m merge_inferred_with_extracted_layout(\n\u001b[0;32m 682\u001b[0m inferred_document_layout\u001b[38;5;241m=\u001b[39minferred_document_layout,\n\u001b[0;32m 683\u001b[0m extracted_layout\u001b[38;5;241m=\u001b[39mextracted_layout,\n\u001b[0;32m 684\u001b[0m hi_res_model_name\u001b[38;5;241m=\u001b[39mhi_res_model_name,\n\u001b[0;32m 685\u001b[0m )\n\u001b[1;32m--> 687\u001b[0m final_document_layout \u001b[38;5;241m=\u001b[39m \u001b[43mprocess_file_with_ocr\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 688\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 689\u001b[0m \u001b[43m \u001b[49m\u001b[43mmerged_document_layout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 690\u001b[0m \u001b[43m \u001b[49m\u001b[43mextracted_layout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextracted_layout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 691\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_image\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_image\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 692\u001b[0m \u001b[43m 
\u001b[49m\u001b[43minfer_table_structure\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minfer_table_structure\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 693\u001b[0m \u001b[43m \u001b[49m\u001b[43mocr_languages\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_languages\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 694\u001b[0m \u001b[43m \u001b[49m\u001b[43mocr_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 695\u001b[0m \u001b[43m \u001b[49m\u001b[43mpdf_image_dpi\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpdf_image_dpi\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 696\u001b[0m \u001b[43m \u001b[49m\u001b[43mocr_layout_dumper\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_layout_dumper\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 697\u001b[0m \u001b[43m \u001b[49m\u001b[43mpassword\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpassword\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 698\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 699\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 700\u001b[0m inferred_document_layout \u001b[38;5;241m=\u001b[39m process_data_with_model(\n\u001b[0;32m 701\u001b[0m file,\n\u001b[0;32m 702\u001b[0m is_image\u001b[38;5;241m=\u001b[39mis_image,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 705\u001b[0m password\u001b[38;5;241m=\u001b[39mpassword,\n\u001b[0;32m 706\u001b[0m )\n",
|
||||
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\utils.py:216\u001b[0m, in \u001b[0;36mrequires_dependencies.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 213\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;241m*\u001b[39margs: _P\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: _P\u001b[38;5;241m.\u001b[39mkwargs):\n\u001b[0;32m 215\u001b[0m run_check()\n\u001b[1;32m--> 216\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
|
||||
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\pdf_image\\ocr.py:190\u001b[0m, in \u001b[0;36mprocess_file_with_ocr\u001b[1;34m(filename, out_layout, extracted_layout, is_image, infer_table_structure, ocr_languages, ocr_mode, pdf_image_dpi, ocr_layout_dumper, password)\u001b[0m\n\u001b[0;32m 188\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 189\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misdir(filename) \u001b[38;5;129;01mor\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(filename):\n\u001b[1;32m--> 190\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[0;32m 191\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 192\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mFile \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfilename\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m not found!\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01me\u001b[39;00m\n",
|
||||
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\pdf_image\\ocr.py:177\u001b[0m, in \u001b[0;36mprocess_file_with_ocr\u001b[1;34m(filename, out_layout, extracted_layout, is_image, infer_table_structure, ocr_languages, ocr_mode, pdf_image_dpi, ocr_layout_dumper, password)\u001b[0m\n\u001b[0;32m 175\u001b[0m extracted_regions \u001b[38;5;241m=\u001b[39m extracted_layout[i] \u001b[38;5;28;01mif\u001b[39;00m i \u001b[38;5;241m<\u001b[39m \u001b[38;5;28mlen\u001b[39m(extracted_layout) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 176\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m PILImage\u001b[38;5;241m.\u001b[39mopen(image_path) \u001b[38;5;28;01mas\u001b[39;00m image:\n\u001b[1;32m--> 177\u001b[0m merged_page_layout \u001b[38;5;241m=\u001b[39m \u001b[43msupplement_page_layout_with_ocr\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 178\u001b[0m \u001b[43m \u001b[49m\u001b[43mpage_layout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mout_layout\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpages\u001b[49m\u001b[43m[\u001b[49m\u001b[43mi\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 179\u001b[0m \u001b[43m \u001b[49m\u001b[43mimage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mimage\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 180\u001b[0m \u001b[43m \u001b[49m\u001b[43minfer_table_structure\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minfer_table_structure\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 181\u001b[0m \u001b[43m \u001b[49m\u001b[43mocr_languages\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_languages\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 182\u001b[0m \u001b[43m \u001b[49m\u001b[43mocr_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 183\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mextracted_regions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextracted_regions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 184\u001b[0m \u001b[43m \u001b[49m\u001b[43mocr_layout_dumper\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_layout_dumper\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 185\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 186\u001b[0m merged_page_layouts\u001b[38;5;241m.\u001b[39mappend(merged_page_layout)\n\u001b[0;32m 187\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DocumentLayout\u001b[38;5;241m.\u001b[39mfrom_pages(merged_page_layouts)\n",
|
||||
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\utils.py:216\u001b[0m, in \u001b[0;36mrequires_dependencies.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 213\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;241m*\u001b[39margs: _P\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: _P\u001b[38;5;241m.\u001b[39mkwargs):\n\u001b[0;32m 215\u001b[0m run_check()\n\u001b[1;32m--> 216\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
|
||||
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\pdf_image\\ocr.py:213\u001b[0m, in \u001b[0;36msupplement_page_layout_with_ocr\u001b[1;34m(page_layout, image, infer_table_structure, ocr_languages, ocr_mode, extracted_regions, ocr_layout_dumper)\u001b[0m\n\u001b[0;32m 195\u001b[0m \u001b[38;5;129m@requires_dependencies\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124munstructured_inference\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 196\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21msupplement_page_layout_with_ocr\u001b[39m(\n\u001b[0;32m 197\u001b[0m page_layout: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPageLayout\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 203\u001b[0m ocr_layout_dumper: Optional[OCRLayoutDumper] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m 204\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPageLayout\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m 205\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 206\u001b[0m \u001b[38;5;124;03m Supplement an PageLayout with OCR results depending on OCR mode.\u001b[39;00m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;124;03m If mode is \"entire_page\", we get the OCR layout for the entire image and\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 210\u001b[0m \u001b[38;5;124;03m with no text and add text from OCR to each element.\u001b[39;00m\n\u001b[0;32m 211\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 213\u001b[0m ocr_agent \u001b[38;5;241m=\u001b[39m \u001b[43mOCRAgent\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_agent\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlanguage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_languages\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 214\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m ocr_mode \u001b[38;5;241m==\u001b[39m OCRMode\u001b[38;5;241m.\u001b[39mFULL_PAGE\u001b[38;5;241m.\u001b[39mvalue:\n\u001b[0;32m 215\u001b[0m ocr_layout \u001b[38;5;241m=\u001b[39m ocr_agent\u001b[38;5;241m.\u001b[39mget_layout_from_image(image)\n",
|
||||
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\utils\\ocr_models\\ocr_interface.py:34\u001b[0m, in \u001b[0;36mOCRAgent.get_agent\u001b[1;34m(cls, language)\u001b[0m\n\u001b[0;32m 29\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Get the configured OCRAgent instance.\u001b[39;00m\n\u001b[0;32m 30\u001b[0m \n\u001b[0;32m 31\u001b[0m \u001b[38;5;124;03mThe OCR package used by the agent is determined by the `OCR_AGENT` environment variable.\u001b[39;00m\n\u001b[0;32m 32\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 33\u001b[0m ocr_agent_cls_qname \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_get_ocr_agent_cls_qname()\n\u001b[1;32m---> 34\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_instance\u001b[49m\u001b[43m(\u001b[49m\u001b[43mocr_agent_cls_qname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlanguage\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\utils\\ocr_models\\ocr_interface.py:41\u001b[0m, in \u001b[0;36mOCRAgent.get_instance\u001b[1;34m(ocr_agent_module, language)\u001b[0m\n\u001b[0;32m 39\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mje suis ici dans OCR_AGENT\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 40\u001b[0m \u001b[38;5;28mprint\u001b[39m(ocr_agent_module)\n\u001b[1;32m---> 41\u001b[0m module_name, class_name \u001b[38;5;241m=\u001b[39m ocr_agent_module\u001b[38;5;241m.\u001b[39mrsplit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m1\u001b[39m)\n\u001b[0;32m 43\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m module_name \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m OCR_AGENT_MODULES_WHITELIST:\n\u001b[0;32m 44\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 45\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEnvironment variable OCR_AGENT module name \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmodule_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be set to a \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 46\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwhitelisted module part of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mOCR_AGENT_MODULES_WHITELIST\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 47\u001b[0m )\n",
|
||||
"\u001b[1;31mValueError\u001b[0m: not enough values to unpack (expected 2, got 1)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"docs[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"from PIL import Image oa\n",
|
||||
"import pytesseract\n",
|
||||
"\n",
|
||||
"# If you don't have tesseract executable in your PATH, include the following:\n",
|
||||
"pytesseract.pytesseract.tesseract_cmd = r’<full_path_to_your_tesseract_executable>*\n",
|
||||
"‘# Example tesseract_cmd = r’C:\\Program Files (x86)\\Tesseract-OCR\\tesseract’\n",
|
||||
"\n",
|
||||
"‘# Simple image to string\n",
|
||||
"print(pytesseract. image to_string(Image.open( ‘test .png’)))\n",
|
||||
"\n",
|
||||
"# In order to bypass the image conversions of pytesseract, just use relative or absolute image path\n",
|
||||
"# NOTE: In this case you should provide tesseract supported images or tesseract will return error\n",
|
||||
"print (pytesseract.image_to_string(‘test.png\"))\n",
|
||||
"\n",
|
||||
"# List of available languages\n",
|
||||
"\n",
|
||||
"print (pytesseract.get_languages(config=\"*))\n",
|
||||
"\n",
|
||||
"# French text image to string\n",
|
||||
"print (pytesseract. image_to_string(Image.open(‘test-european. jpg’), lang=\"fra’))\n",
|
||||
"\n",
|
||||
"# Batch processing with a single file containing the list of multiple image file paths\n",
|
||||
"print (pytesseract. image_to_string(’images.txt\"))\n",
|
||||
"\n",
|
||||
"# Timeout/terminate the tesseract job after a period of time\n",
|
||||
"try:\n",
|
||||
"\n",
|
||||
"print (pytesseract.image_to_string(‘test. jpg’, timeout-2)) # Timeout after 2 seconds\n",
|
||||
"\n",
|
||||
"print (pytesseract. image to_string(‘test.jpg\", timeout=2.5)) # Timeout after half a second\n",
|
||||
"except Runtime€rror as timeout_error:\n",
|
||||
"\n",
|
||||
"# Tesseract processing is terminated\n",
|
||||
"\n",
|
||||
"pass\n",
|
||||
"\n",
|
||||
"# Get bounding box estimates\n",
|
||||
"print (pytesseract. image_to_boxes(Image.open(‘test.png’)))\n",
|
||||
"\n",
|
||||
"# Get verbose data including boxes, confidences, line and page numbers\n",
|
||||
"print (pytesseract.image_to_data(Inage.open(‘test.png’)))\n",
|
||||
"\n",
|
||||
"# Get information about orientation and script detection\n",
|
||||
"print (pytesseract..image_to_osd(Image.open( ‘test.png\")))\n",
|
||||
"\n",
|
||||
"# Get a searchable PDF\n",
|
||||
"pdf = pytesseract.image_to_pdf_or_hocr(‘test.png’, extension='\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"IMG_path = r\"F:\\Dev\\Rag\\Rag_Modeling\\document\\test.png\"\n",
|
||||
"print(pytesseract.image_to_string(Image.open(IMG_path)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "rag",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user