commit 0d396d9bd95f040cda63dd19fe6816e2f8e4dc08 Author: sepehr Date: Sat Mar 1 08:15:30 2025 +0100 Initial commit diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..1d92a01 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "python.analysis.extraPaths": [ + "./src/document_processing" + ] +} \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..1ad5e16 --- /dev/null +++ b/README.md @@ -0,0 +1,39 @@ +# RAG Modeling Project + +## Overview + +RAG Modeling is an advanced Retrieval-Augmented Generation system with comprehensive document processing capabilities. The system focuses on extracting high-quality data from PDF documents including text, images, and tables to build robust RAG applications. + +## Features + +- **Advanced PDF Processing**: + - Multiple extraction methods for maximum text coverage + - Image extraction with OCR capabilities + - Table detection and extraction + - Structured document parsing + +- **Text Processing Pipeline**: + - Intelligent text chunking for optimal context management + - Support for multiple languages + - Metadata preservation + +- **Modular Architecture**: + - Component-based design for easy extension + - Configurable processing parameters + +## Installation + +```bash +# Clone the repository +git clone https://gitea.parsanet.org/sepehr/rag.git +cd rag + +# Create a virtual environment (optional but recommended) +python -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate + +# Install dependencies +pip install -r requirements.txt + +# Install additional dependencies +pip install unstructured pytesseract camelot-py opencv-python pandas \ No newline at end of file diff --git a/document/04Extrait_Methodologie_Experimentale.pdf b/document/04Extrait_Methodologie_Experimentale.pdf new file mode 100644 index 0000000..2fea11a Binary files /dev/null and b/document/04Extrait_Methodologie_Experimentale.pdf differ diff 
--git a/document/11_chapitre3.pdf b/document/11_chapitre3.pdf
new file mode 100644
index 0000000..d71a745
Binary files /dev/null and b/document/11_chapitre3.pdf differ
diff --git a/document/Echangeurs.pdf b/document/Echangeurs.pdf
new file mode 100644
index 0000000..2fefc5f
Binary files /dev/null and b/document/Echangeurs.pdf differ
diff --git a/document/TEST.png b/document/TEST.png
new file mode 100644
index 0000000..d6eb9cf
Binary files /dev/null and b/document/TEST.png differ
diff --git a/document/synthese.doc b/document/synthese.doc
new file mode 100644
index 0000000..290d072
Binary files /dev/null and b/document/synthese.doc differ
diff --git a/src/document_processing/__init__.py b/src/document_processing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/document_processing/__pycache__/pdf_processor.cpython-310.pyc b/src/document_processing/__pycache__/pdf_processor.cpython-310.pyc
new file mode 100644
index 0000000..b144376
Binary files /dev/null and b/src/document_processing/__pycache__/pdf_processor.cpython-310.pyc differ
diff --git a/src/document_processing/__pycache__/pdf_processor.cpython-313.pyc b/src/document_processing/__pycache__/pdf_processor.cpython-313.pyc
new file mode 100644
index 0000000..5ad2a25
Binary files /dev/null and b/src/document_processing/__pycache__/pdf_processor.cpython-313.pyc differ
diff --git a/src/document_processing/docx_processor.py b/src/document_processing/docx_processor.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/document_processing/pdf_processor.py b/src/document_processing/pdf_processor.py
new file mode 100644
index 0000000..5e61594
--- /dev/null
+++ b/src/document_processing/pdf_processor.py
@@ -0,0 +1,409 @@
+import contextlib
+import os
+import logging
+import shutil
+from typing import Dict, List, Optional, Tuple, Union, Any
+import tempfile
+
+# LangChain imports
+from langchain_community.document_loaders import PyPDFLoader, UnstructuredPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders.pdf import PDFMinerLoader
+from langchain_community.document_loaders import PyPDFDirectoryLoader
+
+# Image processing
+import pytesseract
+from PIL import Image
+
+# Table extraction
+import camelot
+import pandas as pd
+
+# For unstructured data
+from unstructured.partition.pdf import partition_pdf
+
+# Setup logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# NOTE: OCR_AGENT is deliberately not set here. unstructured expects that
+# variable to name one of its OCR agent modules; given the path of the
+# Tesseract executable it raised "must be set to a whitelisted module", which
+# made both the Unstructured text pass and the image extraction fail.
+
+# Install location used by the Windows Tesseract installer. Kept as the last
+# fallback so machines that relied on the former hard-coded path still work.
+_WINDOWS_TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
+
+
+def _configure_tesseract() -> Optional[str]:
+    """
+    Point pytesseract at a Tesseract executable that actually exists.
+
+    Candidates are tried in order: the ``TESSERACT_CMD`` environment variable,
+    a ``tesseract`` binary found on ``PATH``, then the default Windows install
+    location. When none exists, pytesseract's own setting is left untouched.
+
+    Returns:
+        The executable path that was applied, or None if no candidate exists.
+    """
+    candidates = (
+        os.environ.get("TESSERACT_CMD"),
+        shutil.which("tesseract"),
+        _WINDOWS_TESSERACT_CMD,
+    )
+    for candidate in candidates:
+        if candidate and os.path.isfile(candidate):
+            pytesseract.pytesseract.tesseract_cmd = candidate
+            return candidate
+    return None
+
+
+def _element_field(element: Any, name: str) -> Any:
+    """
+    Read ``name`` from an extracted element, falling back to its ``metadata``.
+
+    Returns:
+        The value found on the element, else the one found on
+        ``element.metadata``, else None.
+    """
+    value = getattr(element, name, None)
+    if value is None:
+        value = getattr(getattr(element, "metadata", None), name, None)
+    return value
+
+
+# Same import-time side effect as before (pytesseract is configured as soon as
+# the module is imported), minus the assumption that the host runs Windows.
+_configure_tesseract()
+
+
+class AdvancedPDFProcessor:
+    """
+    Process PDF documents with advanced extraction of text, images and tables,
+    using LangChain and other modern libraries.
+    """
+
+    def __init__(self,
+                 ocr_enabled: bool = True,
+                 extract_tables: bool = True,
+                 extract_images: bool = True,
+                 chunk_size: int = 1000,
+                 chunk_overlap: int = 200,
+                 image_output_dir: Optional[str] = None):
+        """
+        Initialise the PDF processor with the configured options.
+
+        Args:
+            ocr_enabled: If True, run OCR on the extracted images
+            extract_tables: If True, try to extract tables
+            extract_images: If True, extract the images of the PDF
+            chunk_size: Size of the chunks used to split the text
+            chunk_overlap: Overlap between consecutive chunks
+            image_output_dir: Directory in which extracted images are kept.
+                When None, a temporary directory is used and the image files
+                are deleted once extraction has finished.
+        """
+        self.ocr_enabled = ocr_enabled
+        self.extract_tables = extract_tables
+        self.extract_images = extract_images
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.image_output_dir = image_output_dir
+
+        # Locate the Tesseract executable only when OCR is going to be used
+        if ocr_enabled:
+            _configure_tesseract()
+
+    def process_pdf(self, pdf_path: str) -> Dict[str, Any]:
+        """
+        Process a PDF file and extract its content in a structured form.
+
+        Args:
+            pdf_path: Path to the PDF file
+
+        Returns:
+            Dictionary with the keys "text" (str), "chunks", "tables",
+            "images" and "metadata"
+
+        Raises:
+            FileNotFoundError: If pdf_path does not exist
+        """
+        logger.info(f"Début du traitement du fichier PDF: {pdf_path}")
+
+        if not os.path.exists(pdf_path):
+            raise FileNotFoundError(f"Le fichier {pdf_path} n'existe pas")
+
+        result = {
+            "text": "",
+            "chunks": [],
+            "tables": [],
+            "images": [],
+            "metadata": {
+                "filename": os.path.basename(pdf_path),
+                "path": pdf_path,
+                "size_bytes": os.path.getsize(pdf_path),
+            }
+        }
+
+        # 1. Text extraction, trying several methods to maximise coverage
+        result["text"] = self._extract_text(pdf_path)
+
+        # 2. Split the text into chunks that are easier for LLMs to handle
+        result["chunks"] = self._chunk_text(result["text"])
+
+        # 3. Table extraction, if enabled
+        if self.extract_tables:
+            result["tables"] = self._extract_tables(pdf_path)
+
+        # 4. Image extraction and analysis, if enabled
+        if self.extract_images:
+            result["images"] = self._extract_images(pdf_path)
+
+        logger.info("Traitement du PDF terminé: %d chunks, %d tableaux, %d images",
+                    len(result['chunks']), len(result['tables']), len(result['images']))
+
+        return result
+
+    def _extract_text(self, pdf_path: str) -> str:
+        """
+        Extract the text of the PDF, combining several extraction methods.
+
+        PyPDFLoader is tried first and PDFMinerLoader only when it produced
+        nothing. Unstructured always runs and its output replaces the earlier
+        text when it is longer. A failing method is logged and skipped.
+
+        Returns:
+            The extracted text, or an empty string if every method failed
+        """
+        text_content = ""
+
+        # Method 1: LangChain's PyPDFLoader
+        try:
+            logger.info("Extraction de texte avec PyPDFLoader")
+            documents = PyPDFLoader(pdf_path).load()
+            text_content = "\n".join(doc.page_content for doc in documents)
+        except Exception as e:
+            logger.warning(f"Erreur avec PyPDFLoader: {e}")
+
+        # Method 2: PDFMinerLoader. Its output is only ever used as a fallback,
+        # so the parse is skipped when method 1 already returned text.
+        if not text_content:
+            try:
+                logger.info("Extraction de texte avec PDFMinerLoader")
+                miner_docs = PDFMinerLoader(pdf_path).load()
+                text_content = "\n".join(doc.page_content for doc in miner_docs)
+            except Exception as e:
+                logger.warning(f"Erreur avec PDFMinerLoader: {e}")
+
+        # Method 3: Unstructured, for a more advanced extraction
+        try:
+            logger.info("Extraction de texte avec Unstructured")
+            elements = partition_pdf(pdf_path, extract_images_in_pdf=False, infer_table_structure=False)
+            unstructured_text = "\n".join(str(element) for element in elements)
+
+            # Keep whichever extraction recovered the most text
+            if not text_content or len(unstructured_text) > len(text_content):
+                text_content = unstructured_text
+        except Exception as e:
+            logger.warning(f"Erreur avec Unstructured: {e}")
+
+        return text_content
+
+    def _chunk_text(self, text: str) -> List[str]:
+        """
+        Split the text into overlapping chunks.
+
+        Returns:
+            The list of chunks; empty when there is no text to split
+        """
+        if not text:
+            return []
+
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=self.chunk_size,
+            chunk_overlap=self.chunk_overlap,
+            length_function=len,
+        )
+        return text_splitter.split_text(text)
+
+    def _extract_tables(self, pdf_path: str) -> List[Dict[str, Union[str, pd.DataFrame]]]:
+        """
+        Extract the tables of the PDF with Camelot.
+
+        Both Camelot flavors are run over every page, "stream" first and
+        "lattice" second. A flavor that fails is logged and skipped without
+        discarding the tables already collected.
+
+        Returns:
+            One dictionary per non-empty table with the keys "page", "type",
+            "data" (the DataFrame), "accuracy" and "description"
+        """
+        tables_data = []
+
+        logger.info("Extraction des tableaux avec Camelot")
+        for flavor in ("stream", "lattice"):
+            try:
+                tables = camelot.read_pdf(pdf_path, pages='all', flavor=flavor)
+                for i, table in enumerate(tables):
+                    if table.df.size > 0:  # Skip tables that contain no data
+                        tables_data.append({
+                            "page": table.page,
+                            "type": flavor,
+                            "data": table.df,
+                            "accuracy": table.accuracy,
+                            "description": f"Table {i+1} ({flavor.capitalize()}) de la page {table.page}"
+                        })
+            except Exception as e:
+                logger.warning(f"Erreur lors de l'extraction des tableaux: {e}")
+
+        return tables_data
+
+    def _extract_images(self, pdf_path: str) -> List[Dict[str, Any]]:
+        """
+        Extract the images of the PDF and optionally run OCR on them.
+
+        Images are written to ``image_output_dir`` when it was configured,
+        otherwise to a temporary directory that is deleted before returning;
+        in that case the reported "path" no longer exists on disk.
+
+        Returns:
+            One dictionary per image with the keys "page", "path", "position"
+            and "text" (OCR output, or None when OCR is disabled or failed)
+        """
+        images_data = []
+
+        try:
+            logger.info("Extraction des images avec Unstructured")
+            with contextlib.ExitStack() as stack:
+                if self.image_output_dir:
+                    os.makedirs(self.image_output_dir, exist_ok=True)
+                    output_dir = self.image_output_dir
+                else:
+                    output_dir = stack.enter_context(tempfile.TemporaryDirectory())
+
+                # partition_pdf takes the target directory through
+                # extract_image_block_output_dir (it has no images_output_dir)
+                elements = partition_pdf(
+                    pdf_path,
+                    extract_images_in_pdf=True,
+                    extract_image_block_output_dir=output_dir,
+                )
+
+                # Keep only the elements that reference an extracted image file
+                image_elements = [el for el in elements if _element_field(el, "image_path")]
+
+                for i, img_element in enumerate(image_elements):
+                    img_path = _element_field(img_element, "image_path")
+                    img_data = {
+                        "page": _element_field(img_element, "page_number"),
+                        "path": img_path,
+                        "position": _element_field(img_element, "coordinates"),
+                        "text": None  # Filled in by OCR when enabled
+                    }
+
+                    # OCR runs here, before a temporary output directory is removed
+                    if self.ocr_enabled and os.path.exists(img_path):
+                        try:
+                            logger.info("Extraction des textes avec ocr avec Unstructured")
+                            # Close the file promptly so the directory can be removed
+                            with Image.open(img_path) as img:
+                                ocr_text = pytesseract.image_to_string(img)
+                            img_data["text"] = ocr_text.strip()
+                        except Exception as e:
+                            logger.warning(f"Erreur OCR sur l'image {i}: {e}")
+
+                    images_data.append(img_data)
+
+        except Exception as e:
+            logger.warning(f"Erreur lors de l'extraction des images: {e}")
+
+        return images_data
+
+
+def process_pdf_document(pdf_path: str, **kwargs) -> Dict[str, Any]:
+    """
+    Convenience wrapper that processes a PDF document with configurable options.
+
+    Args:
+        pdf_path: Path to the PDF file to process
+        **kwargs: Options forwarded to the AdvancedPDFProcessor constructor
+
+    Returns:
+        Dictionary with the data extracted from the PDF
+    """
+    processor = AdvancedPDFProcessor(**kwargs)
+    return processor.process_pdf(pdf_path)
+
+
+def process_pdf_with_unstructured_loader(pdf_path: str, **kwargs) -> Dict[str, Any]:
+    """
+    Extract the content of a PDF with LangChain's UnstructuredPDFLoader.
+
+    Args:
+        pdf_path: Path to the PDF file to process
+        **kwargs: ``chunk_size`` (default 1000) and ``chunk_overlap``
+            (default 200) configure the text splitter; every other option is
+            forwarded to the loader. ``mode`` defaults to "elements" and
+            ``strategy`` to "fast" unless the caller overrides them.
+
+    Returns:
+        Dictionary with the keys "text", "chunks", "metadata" and "elements".
+        If loading fails, the error is logged and the first, second and last stay empty.
+
+    Raises:
+        FileNotFoundError: If pdf_path does not exist
+    """
+    logger.info(f"Traitement du PDF avec UnstructuredPDFLoader: {pdf_path}")
+
+    if not os.path.exists(pdf_path):
+        raise FileNotFoundError(f"Le fichier {pdf_path} n'existe pas")
+
+    # The splitter settings are not loader options: take them out so they are
+    # not forwarded, and let explicit mode/strategy arguments win over the
+    # defaults instead of colliding with them.
+    loader_kwargs = dict(kwargs)
+    chunk_size = loader_kwargs.pop("chunk_size", 1000)
+    chunk_overlap = loader_kwargs.pop("chunk_overlap", 200)
+    loader_kwargs.setdefault("mode", "elements")  # structured, per-element output
+    loader_kwargs.setdefault("strategy", "fast")
+
+    result = {
+        "text": "",
+        "chunks": [],
+        "metadata": {
+            "filename": os.path.basename(pdf_path),
+            "path": pdf_path,
+            "size_bytes": os.path.getsize(pdf_path),
+        },
+        "elements": []
+    }
+
+    try:
+        loader = UnstructuredPDFLoader(pdf_path, **loader_kwargs)
+        documents = loader.load()
+
+        # Raw text of the whole document
+        result["text"] = "\n".join(doc.page_content for doc in documents)
+
+        # Individual elements together with their metadata
+        result["elements"] = [
+            {
+                "content": doc.page_content,
+                "metadata": doc.metadata
+            } for doc in documents
+        ]
+
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            length_function=len,
+        )
+        result["chunks"] = text_splitter.split_text(result["text"])
+
+        logger.info(f"UnstructuredPDFLoader: extrait {len(documents)} éléments et {len(result['chunks'])} chunks")
+
+    except Exception as e:
+        logger.warning(f"Erreur lors du traitement avec UnstructuredPDFLoader: {e}")
+
+    return result
\ No newline at end of file
diff --git a/test/pdf_processing_test.py b/test/pdf_processing_test.py
new file mode 100644
index 0000000..17c4bbb
--- /dev/null
+++ b/test/pdf_processing_test.py
@@ -0,0 +1,19 @@
+# Exemple d'utilisation
+import sys
+import os
+sys.path.insert(0, 
os.path.abspath(os.path.join(os.path.dirname(__file__), '../src/document_processing'))) +from pdf_processor import process_pdf_document +pdf_path = r"F:\Dev\Rag\Rag_Modeling\document\04Extrait_Methodologie_Experimentale.pdf" +result = process_pdf_document( + pdf_path, + ocr_enabled=True, + extract_tables=True, + extract_images=True, + chunk_size=1000, + chunk_overlap=200 +) + +# Accès aux différentes parties du résultat +text_chunks = result["chunks"] +tables = result["tables"] +images = result["images"] \ No newline at end of file diff --git a/test_processing.ipynb b/test_processing.ipynb new file mode 100644 index 0000000..5f4b0ea --- /dev/null +++ b/test_processing.ipynb @@ -0,0 +1,311 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\pypdf\\_crypt_providers\\_cryptography.py:32: CryptographyDeprecationWarning: ARC4 has been moved to cryptography.hazmat.decrepit.ciphers.algorithms.ARC4 and will be removed from cryptography.hazmat.primitives.ciphers.algorithms in 48.0.0.\n", + " from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4\n", + "c:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import sys\n", + "import os\n", + "sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), './src/document_processing')))\n", + "from pdf_processor import process_pdf_document\n", + "from pdf_processor import process_pdf_with_unstructured_loader\n", + "pdf_path = r\"F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf\"\n", + "from PIL import Image\n", + "import pytesseract \n", + "pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-02-28 22:14:30,067 - pdf_processor - INFO - Début du traitement du fichier PDF: F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf\n", + "2025-02-28 22:14:30,068 - pdf_processor - INFO - Extraction de texte avec PyPDFLoader\n", + "2025-02-28 22:14:30,355 - pdf_processor - INFO - Extraction de texte avec PDFMinerLoader\n", + "2025-02-28 22:14:31,675 - pdf_processor - WARNING - Erreur avec PDFMinerLoader: The PDF parser must valorize the standard metadata.\n", + "2025-02-28 22:14:31,678 - pdf_processor - INFO - Extraction de texte avec Unstructured\n", + "2025-02-28 22:14:31,680 - unstructured - INFO - PDF text extraction failed, skip text extraction...\n", + "2025-02-28 22:14:31,682 - unstructured - WARNING - pytesseract is not installed. Cannot use the ocr_only partitioning strategy. 
Falling back to partitioning with another strategy.\n", + "2025-02-28 22:14:31,682 - unstructured - WARNING - Falling back to partitioning with hi_res.\n", + "2025-02-28 22:14:31,683 - unstructured_inference - INFO - Reading PDF for file: F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf ...\n", + "2025-02-28 22:14:45,041 - pdf_processor - WARNING - Erreur avec Unstructured: Environment variable OCR_AGENT module name C:\\Program Files\\Tesseract-OCR\\tesseract must be set to a whitelisted module part of ['unstructured.partition.utils.ocr_models.tesseract_ocr', 'unstructured.partition.utils.ocr_models.paddle_ocr', 'unstructured.partition.utils.ocr_models.google_vision_ocr'].\n", + "2025-02-28 22:14:45,044 - pdf_processor - INFO - Extraction des tableaux avec Camelot\n", + "2025-02-28 22:14:56,714 - pdf_processor - INFO - Extraction des images avec Unstructured\n", + "2025-02-28 22:14:56,717 - unstructured - INFO - PDF text extraction failed, skip text extraction...\n", + "2025-02-28 22:14:56,719 - unstructured_inference - INFO - Reading PDF for file: F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf ...\n", + "2025-02-28 22:15:09,709 - pdf_processor - WARNING - Erreur lors de l'extraction des images: Environment variable OCR_AGENT module name C:\\Program Files\\Tesseract-OCR\\tesseract must be set to a whitelisted module part of ['unstructured.partition.utils.ocr_models.tesseract_ocr', 'unstructured.partition.utils.ocr_models.paddle_ocr', 'unstructured.partition.utils.ocr_models.google_vision_ocr'].\n", + "2025-02-28 22:15:09,711 - pdf_processor - INFO - Traitement du PDF terminé: 30 chunks, 18 tableaux, 0 images\n" + ] + } + ], + "source": [ + "result = process_pdf_document(\n", + " pdf_path,\n", + " ocr_enabled=True,\n", + " extract_tables=True,\n", + " extract_images=True,\n", + " chunk_size=1000,\n", + " chunk_overlap=200\n", + ")\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ 
+ "# Accès aux différentes parties du résultat\n", + "text_chunks = result[\"chunks\"]\n", + "tables = result[\"tables\"]\n", + "images = result[\"images\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-02-28 22:23:44,919 - pdf_processor - INFO - Traitement du PDF avec UnstructuredPDFLoader: F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf\n", + "2025-02-28 22:23:44,921 - unstructured - INFO - PDF text extraction failed, skip text extraction...\n", + "2025-02-28 22:23:44,923 - pdf_processor - INFO - UnstructuredPDFLoader: extrait 0 éléments et 0 chunks\n" + ] + } + ], + "source": [ + "result = process_pdf_with_unstructured_loader(\n", + " pdf_path,\n", + " chunk_size=1000,\n", + " chunk_overlap=200,\n", + " # Vous pouvez passer des options spécifiques à UnstructuredPDFLoader:\n", + " \n", + " include_page_breaks=True # Pour inclure les sauts de page\n", + ")\n", + "\n", + "# Accéder aux résultats\n", + "text = result[\"text\"]\n", + "chunks = result[\"chunks\"]\n", + "elements = result[\"elements\"] \n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Correction de la configuration OCR\n", + "import pytesseract \n", + "import os\n", + "pytesseract.pytesseract.tesseract_cmd = r\"C:\\Program Files\\Tesseract-OCR\\tesseract.exe\"\n", + "os.environ['TESSDATA_PREFIX'] = os.environ['TESSDATA_PREFIX'] = r\"C:\\Program Files\\Tesseract-OCR\\tessdata\"\n", + "# Au lieu du chemin vers l'exécutable, utilisez le nom de module approprié\n", + "os.environ['OCR_AGENT'] = r\"C:\\Program Files\\Tesseract-OCR\\tessdata\"\n", + "from langchain_community.document_loaders import UnstructuredPDFLoader\n", + "\n", + "pdf_path = r\"F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf\"\n", + "loader = UnstructuredPDFLoader(pdf_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + 
"metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\pypdf\\_crypt_providers\\_cryptography.py:32: CryptographyDeprecationWarning: ARC4 has been moved to cryptography.hazmat.decrepit.ciphers.algorithms.ARC4 and will be removed from cryptography.hazmat.primitives.ciphers.algorithms in 48.0.0.\n", + " from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4\n", + "c:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "pytesseract is not installed. Cannot use the ocr_only partitioning strategy. Falling back to partitioning with another strategy.\n", + "Falling back to partitioning with hi_res.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "je suis ici dans OCR_AGENT\n", + "C:\\Program Files\\Tesseract-OCR\\tessdata\n" + ] + }, + { + "ename": "ValueError", + "evalue": "not enough values to unpack (expected 2, got 1)", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m docs \u001b[38;5;241m=\u001b[39m \u001b[43mloader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m docs[\u001b[38;5;241m0\u001b[39m]\n", + "File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\langchain_core\\document_loaders\\base.py:31\u001b[0m, in \u001b[0;36mBaseLoader.load\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 29\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m 
\u001b[39m\u001b[38;5;21mload\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m[Document]:\n\u001b[0;32m 30\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Load data into Document objects.\"\"\"\u001b[39;00m\n\u001b[1;32m---> 31\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlazy_load\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\langchain_community\\document_loaders\\unstructured.py:107\u001b[0m, in \u001b[0;36mUnstructuredBaseLoader.lazy_load\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 105\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mlazy_load\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Iterator[Document]:\n\u001b[0;32m 106\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Load file.\"\"\"\u001b[39;00m\n\u001b[1;32m--> 107\u001b[0m elements \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_elements\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 108\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_post_process_elements(elements)\n\u001b[0;32m 109\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmode \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124melements\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", + "File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\langchain_community\\document_loaders\\pdf.py:94\u001b[0m, in \u001b[0;36mUnstructuredPDFLoader._get_elements\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 91\u001b[0m 
\u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_get_elements\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m:\n\u001b[0;32m 92\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01munstructured\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpartition\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpdf\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m partition_pdf\n\u001b[1;32m---> 94\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m partition_pdf(filename\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_path, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39munstructured_kwargs)\n", + "File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\documents\\elements.py:581\u001b[0m, in \u001b[0;36mprocess_metadata..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 579\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[0;32m 580\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;241m*\u001b[39margs: _P\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: _P\u001b[38;5;241m.\u001b[39mkwargs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m[Element]:\n\u001b[1;32m--> 581\u001b[0m elements \u001b[38;5;241m=\u001b[39m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 582\u001b[0m call_args \u001b[38;5;241m=\u001b[39m get_call_args_applying_defaults(func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 584\u001b[0m 
unique_element_ids: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m call_args\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124munique_element_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m)\n", + "File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\file_utils\\filetype.py:815\u001b[0m, in \u001b[0;36madd_filetype..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 813\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[0;32m 814\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;241m*\u001b[39margs: _P\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: _P\u001b[38;5;241m.\u001b[39mkwargs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m[Element]:\n\u001b[1;32m--> 815\u001b[0m elements \u001b[38;5;241m=\u001b[39m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 817\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m element \u001b[38;5;129;01min\u001b[39;00m elements:\n\u001b[0;32m 818\u001b[0m \u001b[38;5;66;03m# NOTE(robinson) - Attached files have already run through this logic\u001b[39;00m\n\u001b[0;32m 819\u001b[0m \u001b[38;5;66;03m# in their own partitioning function\u001b[39;00m\n\u001b[0;32m 820\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m element\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mattached_to_filename \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\file_utils\\filetype.py:773\u001b[0m, in \u001b[0;36madd_metadata..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 771\u001b[0m 
\u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[0;32m 772\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;241m*\u001b[39margs: _P\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: _P\u001b[38;5;241m.\u001b[39mkwargs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m[Element]:\n\u001b[1;32m--> 773\u001b[0m elements \u001b[38;5;241m=\u001b[39m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 774\u001b[0m call_args \u001b[38;5;241m=\u001b[39m get_call_args_applying_defaults(func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 776\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m call_args\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmetadata_filename\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n", + "File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\chunking\\dispatch.py:74\u001b[0m, in \u001b[0;36madd_chunking_strategy..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 71\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"The decorated function is replaced with this one.\"\"\"\u001b[39;00m\n\u001b[0;32m 73\u001b[0m \u001b[38;5;66;03m# -- call the partitioning function to get the elements --\u001b[39;00m\n\u001b[1;32m---> 74\u001b[0m elements \u001b[38;5;241m=\u001b[39m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 76\u001b[0m \u001b[38;5;66;03m# -- look for a chunking-strategy argument --\u001b[39;00m\n\u001b[0;32m 77\u001b[0m call_args \u001b[38;5;241m=\u001b[39m get_call_args_applying_defaults(func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", + "File 
\u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\pdf.py:229\u001b[0m, in \u001b[0;36mpartition_pdf\u001b[1;34m(filename, file, include_page_breaks, strategy, infer_table_structure, ocr_languages, languages, metadata_filename, metadata_last_modified, chunking_strategy, hi_res_model_name, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, starting_page_number, extract_forms, form_extraction_skip_tables, password, pdfminer_line_margin, pdfminer_char_margin, pdfminer_line_overlap, pdfminer_word_margin, **kwargs)\u001b[0m\n\u001b[0;32m 226\u001b[0m exactly_one(filename\u001b[38;5;241m=\u001b[39mfilename, file\u001b[38;5;241m=\u001b[39mfile)\n\u001b[0;32m 228\u001b[0m languages \u001b[38;5;241m=\u001b[39m check_language_args(languages \u001b[38;5;129;01mor\u001b[39;00m [], ocr_languages)\n\u001b[1;32m--> 229\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m partition_pdf_or_image(\n\u001b[0;32m 230\u001b[0m filename\u001b[38;5;241m=\u001b[39mfilename,\n\u001b[0;32m 231\u001b[0m file\u001b[38;5;241m=\u001b[39mfile,\n\u001b[0;32m 232\u001b[0m include_page_breaks\u001b[38;5;241m=\u001b[39minclude_page_breaks,\n\u001b[0;32m 233\u001b[0m strategy\u001b[38;5;241m=\u001b[39mstrategy,\n\u001b[0;32m 234\u001b[0m infer_table_structure\u001b[38;5;241m=\u001b[39minfer_table_structure,\n\u001b[0;32m 235\u001b[0m languages\u001b[38;5;241m=\u001b[39mlanguages,\n\u001b[0;32m 236\u001b[0m metadata_last_modified\u001b[38;5;241m=\u001b[39mmetadata_last_modified,\n\u001b[0;32m 237\u001b[0m hi_res_model_name\u001b[38;5;241m=\u001b[39mhi_res_model_name,\n\u001b[0;32m 238\u001b[0m extract_images_in_pdf\u001b[38;5;241m=\u001b[39mextract_images_in_pdf,\n\u001b[0;32m 239\u001b[0m extract_image_block_types\u001b[38;5;241m=\u001b[39mextract_image_block_types,\n\u001b[0;32m 240\u001b[0m extract_image_block_output_dir\u001b[38;5;241m=\u001b[39mextract_image_block_output_dir,\n\u001b[0;32m 
241\u001b[0m extract_image_block_to_payload\u001b[38;5;241m=\u001b[39mextract_image_block_to_payload,\n\u001b[0;32m 242\u001b[0m starting_page_number\u001b[38;5;241m=\u001b[39mstarting_page_number,\n\u001b[0;32m 243\u001b[0m extract_forms\u001b[38;5;241m=\u001b[39mextract_forms,\n\u001b[0;32m 244\u001b[0m form_extraction_skip_tables\u001b[38;5;241m=\u001b[39mform_extraction_skip_tables,\n\u001b[0;32m 245\u001b[0m password\u001b[38;5;241m=\u001b[39mpassword,\n\u001b[0;32m 246\u001b[0m pdfminer_line_margin\u001b[38;5;241m=\u001b[39mpdfminer_line_margin,\n\u001b[0;32m 247\u001b[0m pdfminer_char_margin\u001b[38;5;241m=\u001b[39mpdfminer_char_margin,\n\u001b[0;32m 248\u001b[0m pdfminer_line_overlap\u001b[38;5;241m=\u001b[39mpdfminer_line_overlap,\n\u001b[0;32m 249\u001b[0m pdfminer_word_margin\u001b[38;5;241m=\u001b[39mpdfminer_word_margin,\n\u001b[0;32m 250\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 251\u001b[0m )\n", + "File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\pdf.py:342\u001b[0m, in \u001b[0;36mpartition_pdf_or_image\u001b[1;34m(filename, file, is_image, include_page_breaks, strategy, infer_table_structure, languages, metadata_last_modified, hi_res_model_name, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, starting_page_number, extract_forms, form_extraction_skip_tables, password, pdfminer_line_margin, pdfminer_char_margin, pdfminer_line_overlap, pdfminer_word_margin, **kwargs)\u001b[0m\n\u001b[0;32m 340\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m warnings\u001b[38;5;241m.\u001b[39mcatch_warnings():\n\u001b[0;32m 341\u001b[0m warnings\u001b[38;5;241m.\u001b[39msimplefilter(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m--> 342\u001b[0m elements \u001b[38;5;241m=\u001b[39m _partition_pdf_or_image_local(\n\u001b[0;32m 343\u001b[0m 
filename\u001b[38;5;241m=\u001b[39mfilename,\n\u001b[0;32m 344\u001b[0m file\u001b[38;5;241m=\u001b[39mspooled_to_bytes_io_if_needed(file),\n\u001b[0;32m 345\u001b[0m is_image\u001b[38;5;241m=\u001b[39mis_image,\n\u001b[0;32m 346\u001b[0m infer_table_structure\u001b[38;5;241m=\u001b[39minfer_table_structure,\n\u001b[0;32m 347\u001b[0m include_page_breaks\u001b[38;5;241m=\u001b[39minclude_page_breaks,\n\u001b[0;32m 348\u001b[0m languages\u001b[38;5;241m=\u001b[39mlanguages,\n\u001b[0;32m 349\u001b[0m ocr_languages\u001b[38;5;241m=\u001b[39mocr_languages,\n\u001b[0;32m 350\u001b[0m metadata_last_modified\u001b[38;5;241m=\u001b[39mmetadata_last_modified \u001b[38;5;129;01mor\u001b[39;00m last_modified,\n\u001b[0;32m 351\u001b[0m hi_res_model_name\u001b[38;5;241m=\u001b[39mhi_res_model_name,\n\u001b[0;32m 352\u001b[0m pdf_text_extractable\u001b[38;5;241m=\u001b[39mpdf_text_extractable,\n\u001b[0;32m 353\u001b[0m extract_images_in_pdf\u001b[38;5;241m=\u001b[39mextract_images_in_pdf,\n\u001b[0;32m 354\u001b[0m extract_image_block_types\u001b[38;5;241m=\u001b[39mextract_image_block_types,\n\u001b[0;32m 355\u001b[0m extract_image_block_output_dir\u001b[38;5;241m=\u001b[39mextract_image_block_output_dir,\n\u001b[0;32m 356\u001b[0m extract_image_block_to_payload\u001b[38;5;241m=\u001b[39mextract_image_block_to_payload,\n\u001b[0;32m 357\u001b[0m starting_page_number\u001b[38;5;241m=\u001b[39mstarting_page_number,\n\u001b[0;32m 358\u001b[0m extract_forms\u001b[38;5;241m=\u001b[39mextract_forms,\n\u001b[0;32m 359\u001b[0m form_extraction_skip_tables\u001b[38;5;241m=\u001b[39mform_extraction_skip_tables,\n\u001b[0;32m 360\u001b[0m password\u001b[38;5;241m=\u001b[39mpassword,\n\u001b[0;32m 361\u001b[0m pdfminer_config\u001b[38;5;241m=\u001b[39mpdfminer_config,\n\u001b[0;32m 362\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 363\u001b[0m )\n\u001b[0;32m 364\u001b[0m out_elements \u001b[38;5;241m=\u001b[39m 
_process_uncategorized_text_elements(elements)\n\u001b[0;32m 366\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m strategy \u001b[38;5;241m==\u001b[39m PartitionStrategy\u001b[38;5;241m.\u001b[39mFAST:\n", + "File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\utils.py:216\u001b[0m, in \u001b[0;36mrequires_dependencies..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 213\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;241m*\u001b[39margs: _P\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: _P\u001b[38;5;241m.\u001b[39mkwargs):\n\u001b[0;32m 215\u001b[0m run_check()\n\u001b[1;32m--> 216\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", + "File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\pdf.py:687\u001b[0m, in \u001b[0;36m_partition_pdf_or_image_local\u001b[1;34m(filename, file, is_image, infer_table_structure, include_page_breaks, languages, ocr_languages, ocr_mode, model_name, hi_res_model_name, pdf_image_dpi, metadata_last_modified, pdf_text_extractable, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, analysis, analyzed_image_output_dir_path, starting_page_number, extract_forms, form_extraction_skip_tables, pdf_hi_res_max_pages, password, pdfminer_config, **kwargs)\u001b[0m\n\u001b[0;32m 680\u001b[0m \u001b[38;5;66;03m# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout\u001b[39;00m\n\u001b[0;32m 681\u001b[0m merged_document_layout \u001b[38;5;241m=\u001b[39m merge_inferred_with_extracted_layout(\n\u001b[0;32m 682\u001b[0m 
inferred_document_layout\u001b[38;5;241m=\u001b[39minferred_document_layout,\n\u001b[0;32m 683\u001b[0m extracted_layout\u001b[38;5;241m=\u001b[39mextracted_layout,\n\u001b[0;32m 684\u001b[0m hi_res_model_name\u001b[38;5;241m=\u001b[39mhi_res_model_name,\n\u001b[0;32m 685\u001b[0m )\n\u001b[1;32m--> 687\u001b[0m final_document_layout \u001b[38;5;241m=\u001b[39m \u001b[43mprocess_file_with_ocr\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 688\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 689\u001b[0m \u001b[43m \u001b[49m\u001b[43mmerged_document_layout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 690\u001b[0m \u001b[43m \u001b[49m\u001b[43mextracted_layout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextracted_layout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 691\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_image\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_image\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 692\u001b[0m \u001b[43m \u001b[49m\u001b[43minfer_table_structure\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minfer_table_structure\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 693\u001b[0m \u001b[43m \u001b[49m\u001b[43mocr_languages\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_languages\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 694\u001b[0m \u001b[43m \u001b[49m\u001b[43mocr_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 695\u001b[0m \u001b[43m \u001b[49m\u001b[43mpdf_image_dpi\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpdf_image_dpi\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 696\u001b[0m \u001b[43m \u001b[49m\u001b[43mocr_layout_dumper\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_layout_dumper\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 697\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mpassword\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpassword\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 698\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 699\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 700\u001b[0m inferred_document_layout \u001b[38;5;241m=\u001b[39m process_data_with_model(\n\u001b[0;32m 701\u001b[0m file,\n\u001b[0;32m 702\u001b[0m is_image\u001b[38;5;241m=\u001b[39mis_image,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 705\u001b[0m password\u001b[38;5;241m=\u001b[39mpassword,\n\u001b[0;32m 706\u001b[0m )\n", + "File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\utils.py:216\u001b[0m, in \u001b[0;36mrequires_dependencies..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 213\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;241m*\u001b[39margs: _P\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: _P\u001b[38;5;241m.\u001b[39mkwargs):\n\u001b[0;32m 215\u001b[0m run_check()\n\u001b[1;32m--> 216\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", + "File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\pdf_image\\ocr.py:190\u001b[0m, in \u001b[0;36mprocess_file_with_ocr\u001b[1;34m(filename, out_layout, extracted_layout, is_image, infer_table_structure, ocr_languages, ocr_mode, pdf_image_dpi, ocr_layout_dumper, password)\u001b[0m\n\u001b[0;32m 188\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 189\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misdir(filename) 
\u001b[38;5;129;01mor\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(filename):\n\u001b[1;32m--> 190\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[0;32m 191\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 192\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mFile \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfilename\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m not found!\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01me\u001b[39;00m\n", + "File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\pdf_image\\ocr.py:177\u001b[0m, in \u001b[0;36mprocess_file_with_ocr\u001b[1;34m(filename, out_layout, extracted_layout, is_image, infer_table_structure, ocr_languages, ocr_mode, pdf_image_dpi, ocr_layout_dumper, password)\u001b[0m\n\u001b[0;32m 175\u001b[0m extracted_regions \u001b[38;5;241m=\u001b[39m extracted_layout[i] \u001b[38;5;28;01mif\u001b[39;00m i \u001b[38;5;241m<\u001b[39m \u001b[38;5;28mlen\u001b[39m(extracted_layout) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 176\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m PILImage\u001b[38;5;241m.\u001b[39mopen(image_path) \u001b[38;5;28;01mas\u001b[39;00m image:\n\u001b[1;32m--> 177\u001b[0m merged_page_layout \u001b[38;5;241m=\u001b[39m \u001b[43msupplement_page_layout_with_ocr\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 178\u001b[0m \u001b[43m \u001b[49m\u001b[43mpage_layout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mout_layout\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpages\u001b[49m\u001b[43m[\u001b[49m\u001b[43mi\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 179\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mimage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mimage\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 180\u001b[0m \u001b[43m \u001b[49m\u001b[43minfer_table_structure\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minfer_table_structure\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 181\u001b[0m \u001b[43m \u001b[49m\u001b[43mocr_languages\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_languages\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 182\u001b[0m \u001b[43m \u001b[49m\u001b[43mocr_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 183\u001b[0m \u001b[43m \u001b[49m\u001b[43mextracted_regions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextracted_regions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 184\u001b[0m \u001b[43m \u001b[49m\u001b[43mocr_layout_dumper\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_layout_dumper\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 185\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 186\u001b[0m merged_page_layouts\u001b[38;5;241m.\u001b[39mappend(merged_page_layout)\n\u001b[0;32m 187\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DocumentLayout\u001b[38;5;241m.\u001b[39mfrom_pages(merged_page_layouts)\n", + "File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\utils.py:216\u001b[0m, in \u001b[0;36mrequires_dependencies..decorator..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 213\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;241m*\u001b[39margs: _P\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: _P\u001b[38;5;241m.\u001b[39mkwargs):\n\u001b[0;32m 215\u001b[0m run_check()\n\u001b[1;32m--> 216\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", + "File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\pdf_image\\ocr.py:213\u001b[0m, in \u001b[0;36msupplement_page_layout_with_ocr\u001b[1;34m(page_layout, image, infer_table_structure, ocr_languages, ocr_mode, extracted_regions, ocr_layout_dumper)\u001b[0m\n\u001b[0;32m 195\u001b[0m \u001b[38;5;129m@requires_dependencies\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124munstructured_inference\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 196\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21msupplement_page_layout_with_ocr\u001b[39m(\n\u001b[0;32m 197\u001b[0m page_layout: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPageLayout\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 203\u001b[0m ocr_layout_dumper: Optional[OCRLayoutDumper] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m 204\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPageLayout\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m 205\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 206\u001b[0m \u001b[38;5;124;03m Supplement an PageLayout with OCR results depending on OCR mode.\u001b[39;00m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;124;03m If mode is \"entire_page\", we get the OCR layout for the entire image and\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 210\u001b[0m \u001b[38;5;124;03m with no text and add text from OCR to each element.\u001b[39;00m\n\u001b[0;32m 211\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 213\u001b[0m ocr_agent \u001b[38;5;241m=\u001b[39m 
\u001b[43mOCRAgent\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_agent\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlanguage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_languages\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ocr_mode \u001b[38;5;241m==\u001b[39m OCRMode\u001b[38;5;241m.\u001b[39mFULL_PAGE\u001b[38;5;241m.\u001b[39mvalue:\n\u001b[0;32m 215\u001b[0m ocr_layout \u001b[38;5;241m=\u001b[39m ocr_agent\u001b[38;5;241m.\u001b[39mget_layout_from_image(image)\n", + "File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\utils\\ocr_models\\ocr_interface.py:34\u001b[0m, in \u001b[0;36mOCRAgent.get_agent\u001b[1;34m(cls, language)\u001b[0m\n\u001b[0;32m 29\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Get the configured OCRAgent instance.\u001b[39;00m\n\u001b[0;32m 30\u001b[0m \n\u001b[0;32m 31\u001b[0m \u001b[38;5;124;03mThe OCR package used by the agent is determined by the `OCR_AGENT` environment variable.\u001b[39;00m\n\u001b[0;32m 32\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 33\u001b[0m ocr_agent_cls_qname \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_get_ocr_agent_cls_qname()\n\u001b[1;32m---> 34\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_instance\u001b[49m\u001b[43m(\u001b[49m\u001b[43mocr_agent_cls_qname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlanguage\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\utils\\ocr_models\\ocr_interface.py:41\u001b[0m, in \u001b[0;36mOCRAgent.get_instance\u001b[1;34m(ocr_agent_module, language)\u001b[0m\n\u001b[0;32m 39\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mje suis ici dans 
OCR_AGENT\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 40\u001b[0m \u001b[38;5;28mprint\u001b[39m(ocr_agent_module)\n\u001b[1;32m---> 41\u001b[0m module_name, class_name \u001b[38;5;241m=\u001b[39m ocr_agent_module\u001b[38;5;241m.\u001b[39mrsplit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m1\u001b[39m)\n\u001b[0;32m 43\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m module_name \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m OCR_AGENT_MODULES_WHITELIST:\n\u001b[0;32m 44\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 45\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEnvironment variable OCR_AGENT module name \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmodule_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be set to a \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 46\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwhitelisted module part of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mOCR_AGENT_MODULES_WHITELIST\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 47\u001b[0m )\n", + "\u001b[1;31mValueError\u001b[0m: not enough values to unpack (expected 2, got 1)" + ] + } + ], + "source": [ + "docs = loader.load()\n", + "docs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "from PIL import Image oa\n", + "import pytesseract\n", + "\n", + "# If you don't have tesseract executable in your PATH, include the following:\n", + "pytesseract.pytesseract.tesseract_cmd = r’*\n", + "‘# 
Example tesseract_cmd = r’C:\\Program Files (x86)\\Tesseract-OCR\\tesseract’\n", + "\n", + "‘# Simple image to string\n", + "print(pytesseract. image to_string(Image.open( ‘test .png’)))\n", + "\n", + "# In order to bypass the image conversions of pytesseract, just use relative or absolute image path\n", + "# NOTE: In this case you should provide tesseract supported images or tesseract will return error\n", + "print (pytesseract.image_to_string(‘test.png\"))\n", + "\n", + "# List of available languages\n", + "\n", + "print (pytesseract.get_languages(config=\"*))\n", + "\n", + "# French text image to string\n", + "print (pytesseract. image_to_string(Image.open(‘test-european. jpg’), lang=\"fra’))\n", + "\n", + "# Batch processing with a single file containing the list of multiple image file paths\n", + "print (pytesseract. image_to_string(’images.txt\"))\n", + "\n", + "# Timeout/terminate the tesseract job after a period of time\n", + "try:\n", + "\n", + "print (pytesseract.image_to_string(‘test. jpg’, timeout-2)) # Timeout after 2 seconds\n", + "\n", + "print (pytesseract. image to_string(‘test.jpg\", timeout=2.5)) # Timeout after half a second\n", + "except Runtime€rror as timeout_error:\n", + "\n", + "# Tesseract processing is terminated\n", + "\n", + "pass\n", + "\n", + "# Get bounding box estimates\n", + "print (pytesseract. 
image_to_boxes(Image.open(‘test.png’)))\n", + "\n", + "# Get verbose data including boxes, confidences, line and page numbers\n", + "print (pytesseract.image_to_data(Inage.open(‘test.png’)))\n", + "\n", + "# Get information about orientation and script detection\n", + "print (pytesseract..image_to_osd(Image.open( ‘test.png\")))\n", + "\n", + "# Get a searchable PDF\n", + "pdf = pytesseract.image_to_pdf_or_hocr(‘test.png’, extension='\n", + "\n", + "\n" + ] + } + ], + "source": [ + "IMG_path = r\"F:\\Dev\\Rag\\Rag_Modeling\\document\\test.png\"\n", + "print(pytesseract.image_to_string(Image.open(IMG_path)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "rag", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}