rag/test_processing.ipynb
2025-03-01 08:15:30 +01:00

312 lines
42 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\pypdf\\_crypt_providers\\_cryptography.py:32: CryptographyDeprecationWarning: ARC4 has been moved to cryptography.hazmat.decrepit.ciphers.algorithms.ARC4 and will be removed from cryptography.hazmat.primitives.ciphers.algorithms in 48.0.0.\n",
" from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4\n",
"c:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import sys\n",
"import os\n",
"sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), './src/document_processing')))\n",
"from pdf_processor import process_pdf_document\n",
"from pdf_processor import process_pdf_with_unstructured_loader\n",
"pdf_path = r\"F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf\"\n",
"from PIL import Image\n",
"import pytesseract \n",
"pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-02-28 22:14:30,067 - pdf_processor - INFO - Début du traitement du fichier PDF: F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf\n",
"2025-02-28 22:14:30,068 - pdf_processor - INFO - Extraction de texte avec PyPDFLoader\n",
"2025-02-28 22:14:30,355 - pdf_processor - INFO - Extraction de texte avec PDFMinerLoader\n",
"2025-02-28 22:14:31,675 - pdf_processor - WARNING - Erreur avec PDFMinerLoader: The PDF parser must valorize the standard metadata.\n",
"2025-02-28 22:14:31,678 - pdf_processor - INFO - Extraction de texte avec Unstructured\n",
"2025-02-28 22:14:31,680 - unstructured - INFO - PDF text extraction failed, skip text extraction...\n",
"2025-02-28 22:14:31,682 - unstructured - WARNING - pytesseract is not installed. Cannot use the ocr_only partitioning strategy. Falling back to partitioning with another strategy.\n",
"2025-02-28 22:14:31,682 - unstructured - WARNING - Falling back to partitioning with hi_res.\n",
"2025-02-28 22:14:31,683 - unstructured_inference - INFO - Reading PDF for file: F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf ...\n",
"2025-02-28 22:14:45,041 - pdf_processor - WARNING - Erreur avec Unstructured: Environment variable OCR_AGENT module name C:\\Program Files\\Tesseract-OCR\\tesseract must be set to a whitelisted module part of ['unstructured.partition.utils.ocr_models.tesseract_ocr', 'unstructured.partition.utils.ocr_models.paddle_ocr', 'unstructured.partition.utils.ocr_models.google_vision_ocr'].\n",
"2025-02-28 22:14:45,044 - pdf_processor - INFO - Extraction des tableaux avec Camelot\n",
"2025-02-28 22:14:56,714 - pdf_processor - INFO - Extraction des images avec Unstructured\n",
"2025-02-28 22:14:56,717 - unstructured - INFO - PDF text extraction failed, skip text extraction...\n",
"2025-02-28 22:14:56,719 - unstructured_inference - INFO - Reading PDF for file: F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf ...\n",
"2025-02-28 22:15:09,709 - pdf_processor - WARNING - Erreur lors de l'extraction des images: Environment variable OCR_AGENT module name C:\\Program Files\\Tesseract-OCR\\tesseract must be set to a whitelisted module part of ['unstructured.partition.utils.ocr_models.tesseract_ocr', 'unstructured.partition.utils.ocr_models.paddle_ocr', 'unstructured.partition.utils.ocr_models.google_vision_ocr'].\n",
"2025-02-28 22:15:09,711 - pdf_processor - INFO - Traitement du PDF terminé: 30 chunks, 18 tableaux, 0 images\n"
]
}
],
"source": [
"result = process_pdf_document(\n",
" pdf_path,\n",
" ocr_enabled=True,\n",
" extract_tables=True,\n",
" extract_images=True,\n",
" chunk_size=1000,\n",
" chunk_overlap=200\n",
")\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Pull the three parts of the processing result into their own names.\n",
"text_chunks, tables, images = result[\"chunks\"], result[\"tables\"], result[\"images\"]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2025-02-28 22:23:44,919 - pdf_processor - INFO - Traitement du PDF avec UnstructuredPDFLoader: F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf\n",
"2025-02-28 22:23:44,921 - unstructured - INFO - PDF text extraction failed, skip text extraction...\n",
"2025-02-28 22:23:44,923 - pdf_processor - INFO - UnstructuredPDFLoader: extrait 0 éléments et 0 chunks\n"
]
}
],
"source": [
"result = process_pdf_with_unstructured_loader(\n",
" pdf_path,\n",
" chunk_size=1000,\n",
" chunk_overlap=200,\n",
" # Vous pouvez passer des options spécifiques à UnstructuredPDFLoader:\n",
" \n",
" include_page_breaks=True # Pour inclure les sauts de page\n",
")\n",
"\n",
"# Accéder aux résultats\n",
"text = result[\"text\"]\n",
"chunks = result[\"chunks\"]\n",
"elements = result[\"elements\"] \n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# OCR configuration fix\n",
"import os\n",
"import pytesseract\n",
"\n",
"# pytesseract is pointed at the Tesseract executable; TESSDATA_PREFIX is set to the tessdata directory.\n",
"pytesseract.pytesseract.tesseract_cmd = r\"C:\\Program Files\\Tesseract-OCR\\tesseract.exe\"\n",
"os.environ['TESSDATA_PREFIX'] = r\"C:\\Program Files\\Tesseract-OCR\\tessdata\"\n",
"\n",
"# OCR_AGENT must be a dotted '<module>.<ClassName>' path whose module is on unstructured's whitelist,\n",
"# not a filesystem path: a value without a dot fails with 'not enough values to unpack', and the\n",
"# Tesseract executable path is rejected as a non-whitelisted module.\n",
"os.environ['OCR_AGENT'] = 'unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract'\n",
"\n",
"# Imported once the environment variables above are in place.\n",
"from langchain_community.document_loaders import UnstructuredPDFLoader\n",
"\n",
"pdf_path = r\"F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf\"\n",
"loader = UnstructuredPDFLoader(pdf_path)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\pypdf\\_crypt_providers\\_cryptography.py:32: CryptographyDeprecationWarning: ARC4 has been moved to cryptography.hazmat.decrepit.ciphers.algorithms.ARC4 and will be removed from cryptography.hazmat.primitives.ciphers.algorithms in 48.0.0.\n",
" from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4\n",
"c:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"pytesseract is not installed. Cannot use the ocr_only partitioning strategy. Falling back to partitioning with another strategy.\n",
"Falling back to partitioning with hi_res.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"je suis ici dans OCR_AGENT\n",
"C:\\Program Files\\Tesseract-OCR\\tessdata\n"
]
},
{
"ename": "ValueError",
"evalue": "not enough values to unpack (expected 2, got 1)",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[2], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m docs \u001b[38;5;241m=\u001b[39m \u001b[43mloader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m docs[\u001b[38;5;241m0\u001b[39m]\n",
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\langchain_core\\document_loaders\\base.py:31\u001b[0m, in \u001b[0;36mBaseLoader.load\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 29\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mload\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m[Document]:\n\u001b[0;32m 30\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Load data into Document objects.\"\"\"\u001b[39;00m\n\u001b[1;32m---> 31\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlazy_load\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\langchain_community\\document_loaders\\unstructured.py:107\u001b[0m, in \u001b[0;36mUnstructuredBaseLoader.lazy_load\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 105\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mlazy_load\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Iterator[Document]:\n\u001b[0;32m 106\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Load file.\"\"\"\u001b[39;00m\n\u001b[1;32m--> 107\u001b[0m elements \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_elements\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 108\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_post_process_elements(elements)\n\u001b[0;32m 109\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmode \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124melements\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n",
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\langchain_community\\document_loaders\\pdf.py:94\u001b[0m, in \u001b[0;36mUnstructuredPDFLoader._get_elements\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 91\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_get_elements\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m:\n\u001b[0;32m 92\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01munstructured\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpartition\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpdf\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m partition_pdf\n\u001b[1;32m---> 94\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m partition_pdf(filename\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_path, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39munstructured_kwargs)\n",
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\documents\\elements.py:581\u001b[0m, in \u001b[0;36mprocess_metadata.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 579\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[0;32m 580\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;241m*\u001b[39margs: _P\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: _P\u001b[38;5;241m.\u001b[39mkwargs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m[Element]:\n\u001b[1;32m--> 581\u001b[0m elements \u001b[38;5;241m=\u001b[39m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 582\u001b[0m call_args \u001b[38;5;241m=\u001b[39m get_call_args_applying_defaults(func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 584\u001b[0m unique_element_ids: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m call_args\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124munique_element_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m)\n",
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\file_utils\\filetype.py:815\u001b[0m, in \u001b[0;36madd_filetype.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 813\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[0;32m 814\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;241m*\u001b[39margs: _P\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: _P\u001b[38;5;241m.\u001b[39mkwargs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m[Element]:\n\u001b[1;32m--> 815\u001b[0m elements \u001b[38;5;241m=\u001b[39m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 817\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m element \u001b[38;5;129;01min\u001b[39;00m elements:\n\u001b[0;32m 818\u001b[0m \u001b[38;5;66;03m# NOTE(robinson) - Attached files have already run through this logic\u001b[39;00m\n\u001b[0;32m 819\u001b[0m \u001b[38;5;66;03m# in their own partitioning function\u001b[39;00m\n\u001b[0;32m 820\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m element\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mattached_to_filename \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\file_utils\\filetype.py:773\u001b[0m, in \u001b[0;36madd_metadata.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 771\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[0;32m 772\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;241m*\u001b[39margs: _P\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: _P\u001b[38;5;241m.\u001b[39mkwargs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m[Element]:\n\u001b[1;32m--> 773\u001b[0m elements \u001b[38;5;241m=\u001b[39m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 774\u001b[0m call_args \u001b[38;5;241m=\u001b[39m get_call_args_applying_defaults(func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 776\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m call_args\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmetadata_filename\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n",
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\chunking\\dispatch.py:74\u001b[0m, in \u001b[0;36madd_chunking_strategy.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 71\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"The decorated function is replaced with this one.\"\"\"\u001b[39;00m\n\u001b[0;32m 73\u001b[0m \u001b[38;5;66;03m# -- call the partitioning function to get the elements --\u001b[39;00m\n\u001b[1;32m---> 74\u001b[0m elements \u001b[38;5;241m=\u001b[39m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 76\u001b[0m \u001b[38;5;66;03m# -- look for a chunking-strategy argument --\u001b[39;00m\n\u001b[0;32m 77\u001b[0m call_args \u001b[38;5;241m=\u001b[39m get_call_args_applying_defaults(func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\pdf.py:229\u001b[0m, in \u001b[0;36mpartition_pdf\u001b[1;34m(filename, file, include_page_breaks, strategy, infer_table_structure, ocr_languages, languages, metadata_filename, metadata_last_modified, chunking_strategy, hi_res_model_name, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, starting_page_number, extract_forms, form_extraction_skip_tables, password, pdfminer_line_margin, pdfminer_char_margin, pdfminer_line_overlap, pdfminer_word_margin, **kwargs)\u001b[0m\n\u001b[0;32m 226\u001b[0m exactly_one(filename\u001b[38;5;241m=\u001b[39mfilename, file\u001b[38;5;241m=\u001b[39mfile)\n\u001b[0;32m 228\u001b[0m languages \u001b[38;5;241m=\u001b[39m check_language_args(languages \u001b[38;5;129;01mor\u001b[39;00m [], ocr_languages)\n\u001b[1;32m--> 229\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m partition_pdf_or_image(\n\u001b[0;32m 230\u001b[0m filename\u001b[38;5;241m=\u001b[39mfilename,\n\u001b[0;32m 231\u001b[0m file\u001b[38;5;241m=\u001b[39mfile,\n\u001b[0;32m 232\u001b[0m include_page_breaks\u001b[38;5;241m=\u001b[39minclude_page_breaks,\n\u001b[0;32m 233\u001b[0m strategy\u001b[38;5;241m=\u001b[39mstrategy,\n\u001b[0;32m 234\u001b[0m infer_table_structure\u001b[38;5;241m=\u001b[39minfer_table_structure,\n\u001b[0;32m 235\u001b[0m languages\u001b[38;5;241m=\u001b[39mlanguages,\n\u001b[0;32m 236\u001b[0m metadata_last_modified\u001b[38;5;241m=\u001b[39mmetadata_last_modified,\n\u001b[0;32m 237\u001b[0m hi_res_model_name\u001b[38;5;241m=\u001b[39mhi_res_model_name,\n\u001b[0;32m 238\u001b[0m extract_images_in_pdf\u001b[38;5;241m=\u001b[39mextract_images_in_pdf,\n\u001b[0;32m 239\u001b[0m extract_image_block_types\u001b[38;5;241m=\u001b[39mextract_image_block_types,\n\u001b[0;32m 240\u001b[0m 
extract_image_block_output_dir\u001b[38;5;241m=\u001b[39mextract_image_block_output_dir,\n\u001b[0;32m 241\u001b[0m extract_image_block_to_payload\u001b[38;5;241m=\u001b[39mextract_image_block_to_payload,\n\u001b[0;32m 242\u001b[0m starting_page_number\u001b[38;5;241m=\u001b[39mstarting_page_number,\n\u001b[0;32m 243\u001b[0m extract_forms\u001b[38;5;241m=\u001b[39mextract_forms,\n\u001b[0;32m 244\u001b[0m form_extraction_skip_tables\u001b[38;5;241m=\u001b[39mform_extraction_skip_tables,\n\u001b[0;32m 245\u001b[0m password\u001b[38;5;241m=\u001b[39mpassword,\n\u001b[0;32m 246\u001b[0m pdfminer_line_margin\u001b[38;5;241m=\u001b[39mpdfminer_line_margin,\n\u001b[0;32m 247\u001b[0m pdfminer_char_margin\u001b[38;5;241m=\u001b[39mpdfminer_char_margin,\n\u001b[0;32m 248\u001b[0m pdfminer_line_overlap\u001b[38;5;241m=\u001b[39mpdfminer_line_overlap,\n\u001b[0;32m 249\u001b[0m pdfminer_word_margin\u001b[38;5;241m=\u001b[39mpdfminer_word_margin,\n\u001b[0;32m 250\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 251\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\pdf.py:342\u001b[0m, in \u001b[0;36mpartition_pdf_or_image\u001b[1;34m(filename, file, is_image, include_page_breaks, strategy, infer_table_structure, languages, metadata_last_modified, hi_res_model_name, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, starting_page_number, extract_forms, form_extraction_skip_tables, password, pdfminer_line_margin, pdfminer_char_margin, pdfminer_line_overlap, pdfminer_word_margin, **kwargs)\u001b[0m\n\u001b[0;32m 340\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m warnings\u001b[38;5;241m.\u001b[39mcatch_warnings():\n\u001b[0;32m 341\u001b[0m warnings\u001b[38;5;241m.\u001b[39msimplefilter(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m--> 342\u001b[0m elements \u001b[38;5;241m=\u001b[39m _partition_pdf_or_image_local(\n\u001b[0;32m 343\u001b[0m filename\u001b[38;5;241m=\u001b[39mfilename,\n\u001b[0;32m 344\u001b[0m file\u001b[38;5;241m=\u001b[39mspooled_to_bytes_io_if_needed(file),\n\u001b[0;32m 345\u001b[0m is_image\u001b[38;5;241m=\u001b[39mis_image,\n\u001b[0;32m 346\u001b[0m infer_table_structure\u001b[38;5;241m=\u001b[39minfer_table_structure,\n\u001b[0;32m 347\u001b[0m include_page_breaks\u001b[38;5;241m=\u001b[39minclude_page_breaks,\n\u001b[0;32m 348\u001b[0m languages\u001b[38;5;241m=\u001b[39mlanguages,\n\u001b[0;32m 349\u001b[0m ocr_languages\u001b[38;5;241m=\u001b[39mocr_languages,\n\u001b[0;32m 350\u001b[0m metadata_last_modified\u001b[38;5;241m=\u001b[39mmetadata_last_modified \u001b[38;5;129;01mor\u001b[39;00m last_modified,\n\u001b[0;32m 351\u001b[0m hi_res_model_name\u001b[38;5;241m=\u001b[39mhi_res_model_name,\n\u001b[0;32m 352\u001b[0m pdf_text_extractable\u001b[38;5;241m=\u001b[39mpdf_text_extractable,\n\u001b[0;32m 353\u001b[0m 
extract_images_in_pdf\u001b[38;5;241m=\u001b[39mextract_images_in_pdf,\n\u001b[0;32m 354\u001b[0m extract_image_block_types\u001b[38;5;241m=\u001b[39mextract_image_block_types,\n\u001b[0;32m 355\u001b[0m extract_image_block_output_dir\u001b[38;5;241m=\u001b[39mextract_image_block_output_dir,\n\u001b[0;32m 356\u001b[0m extract_image_block_to_payload\u001b[38;5;241m=\u001b[39mextract_image_block_to_payload,\n\u001b[0;32m 357\u001b[0m starting_page_number\u001b[38;5;241m=\u001b[39mstarting_page_number,\n\u001b[0;32m 358\u001b[0m extract_forms\u001b[38;5;241m=\u001b[39mextract_forms,\n\u001b[0;32m 359\u001b[0m form_extraction_skip_tables\u001b[38;5;241m=\u001b[39mform_extraction_skip_tables,\n\u001b[0;32m 360\u001b[0m password\u001b[38;5;241m=\u001b[39mpassword,\n\u001b[0;32m 361\u001b[0m pdfminer_config\u001b[38;5;241m=\u001b[39mpdfminer_config,\n\u001b[0;32m 362\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 363\u001b[0m )\n\u001b[0;32m 364\u001b[0m out_elements \u001b[38;5;241m=\u001b[39m _process_uncategorized_text_elements(elements)\n\u001b[0;32m 366\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m strategy \u001b[38;5;241m==\u001b[39m PartitionStrategy\u001b[38;5;241m.\u001b[39mFAST:\n",
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\utils.py:216\u001b[0m, in \u001b[0;36mrequires_dependencies.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 213\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;241m*\u001b[39margs: _P\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: _P\u001b[38;5;241m.\u001b[39mkwargs):\n\u001b[0;32m 215\u001b[0m run_check()\n\u001b[1;32m--> 216\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\pdf.py:687\u001b[0m, in \u001b[0;36m_partition_pdf_or_image_local\u001b[1;34m(filename, file, is_image, infer_table_structure, include_page_breaks, languages, ocr_languages, ocr_mode, model_name, hi_res_model_name, pdf_image_dpi, metadata_last_modified, pdf_text_extractable, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, analysis, analyzed_image_output_dir_path, starting_page_number, extract_forms, form_extraction_skip_tables, pdf_hi_res_max_pages, password, pdfminer_config, **kwargs)\u001b[0m\n\u001b[0;32m 680\u001b[0m \u001b[38;5;66;03m# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout\u001b[39;00m\n\u001b[0;32m 681\u001b[0m merged_document_layout \u001b[38;5;241m=\u001b[39m merge_inferred_with_extracted_layout(\n\u001b[0;32m 682\u001b[0m inferred_document_layout\u001b[38;5;241m=\u001b[39minferred_document_layout,\n\u001b[0;32m 683\u001b[0m extracted_layout\u001b[38;5;241m=\u001b[39mextracted_layout,\n\u001b[0;32m 684\u001b[0m hi_res_model_name\u001b[38;5;241m=\u001b[39mhi_res_model_name,\n\u001b[0;32m 685\u001b[0m )\n\u001b[1;32m--> 687\u001b[0m final_document_layout \u001b[38;5;241m=\u001b[39m \u001b[43mprocess_file_with_ocr\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 688\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 689\u001b[0m \u001b[43m \u001b[49m\u001b[43mmerged_document_layout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 690\u001b[0m \u001b[43m \u001b[49m\u001b[43mextracted_layout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextracted_layout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 691\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_image\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_image\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 692\u001b[0m \u001b[43m 
\u001b[49m\u001b[43minfer_table_structure\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minfer_table_structure\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 693\u001b[0m \u001b[43m \u001b[49m\u001b[43mocr_languages\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_languages\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 694\u001b[0m \u001b[43m \u001b[49m\u001b[43mocr_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 695\u001b[0m \u001b[43m \u001b[49m\u001b[43mpdf_image_dpi\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpdf_image_dpi\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 696\u001b[0m \u001b[43m \u001b[49m\u001b[43mocr_layout_dumper\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_layout_dumper\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 697\u001b[0m \u001b[43m \u001b[49m\u001b[43mpassword\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpassword\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 698\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 699\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 700\u001b[0m inferred_document_layout \u001b[38;5;241m=\u001b[39m process_data_with_model(\n\u001b[0;32m 701\u001b[0m file,\n\u001b[0;32m 702\u001b[0m is_image\u001b[38;5;241m=\u001b[39mis_image,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 705\u001b[0m password\u001b[38;5;241m=\u001b[39mpassword,\n\u001b[0;32m 706\u001b[0m )\n",
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\utils.py:216\u001b[0m, in \u001b[0;36mrequires_dependencies.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 213\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;241m*\u001b[39margs: _P\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: _P\u001b[38;5;241m.\u001b[39mkwargs):\n\u001b[0;32m 215\u001b[0m run_check()\n\u001b[1;32m--> 216\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\pdf_image\\ocr.py:190\u001b[0m, in \u001b[0;36mprocess_file_with_ocr\u001b[1;34m(filename, out_layout, extracted_layout, is_image, infer_table_structure, ocr_languages, ocr_mode, pdf_image_dpi, ocr_layout_dumper, password)\u001b[0m\n\u001b[0;32m 188\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 189\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misdir(filename) \u001b[38;5;129;01mor\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misfile(filename):\n\u001b[1;32m--> 190\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[0;32m 191\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 192\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mFile \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfilename\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m not found!\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01me\u001b[39;00m\n",
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\pdf_image\\ocr.py:177\u001b[0m, in \u001b[0;36mprocess_file_with_ocr\u001b[1;34m(filename, out_layout, extracted_layout, is_image, infer_table_structure, ocr_languages, ocr_mode, pdf_image_dpi, ocr_layout_dumper, password)\u001b[0m\n\u001b[0;32m 175\u001b[0m extracted_regions \u001b[38;5;241m=\u001b[39m extracted_layout[i] \u001b[38;5;28;01mif\u001b[39;00m i \u001b[38;5;241m<\u001b[39m \u001b[38;5;28mlen\u001b[39m(extracted_layout) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 176\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m PILImage\u001b[38;5;241m.\u001b[39mopen(image_path) \u001b[38;5;28;01mas\u001b[39;00m image:\n\u001b[1;32m--> 177\u001b[0m merged_page_layout \u001b[38;5;241m=\u001b[39m \u001b[43msupplement_page_layout_with_ocr\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 178\u001b[0m \u001b[43m \u001b[49m\u001b[43mpage_layout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mout_layout\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpages\u001b[49m\u001b[43m[\u001b[49m\u001b[43mi\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 179\u001b[0m \u001b[43m \u001b[49m\u001b[43mimage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mimage\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 180\u001b[0m \u001b[43m \u001b[49m\u001b[43minfer_table_structure\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minfer_table_structure\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 181\u001b[0m \u001b[43m \u001b[49m\u001b[43mocr_languages\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_languages\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 182\u001b[0m \u001b[43m \u001b[49m\u001b[43mocr_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 183\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mextracted_regions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextracted_regions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 184\u001b[0m \u001b[43m \u001b[49m\u001b[43mocr_layout_dumper\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_layout_dumper\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 185\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 186\u001b[0m merged_page_layouts\u001b[38;5;241m.\u001b[39mappend(merged_page_layout)\n\u001b[0;32m 187\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DocumentLayout\u001b[38;5;241m.\u001b[39mfrom_pages(merged_page_layouts)\n",
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\utils.py:216\u001b[0m, in \u001b[0;36mrequires_dependencies.<locals>.decorator.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 213\u001b[0m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[0;32m 214\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;241m*\u001b[39margs: _P\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: _P\u001b[38;5;241m.\u001b[39mkwargs):\n\u001b[0;32m 215\u001b[0m run_check()\n\u001b[1;32m--> 216\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\pdf_image\\ocr.py:213\u001b[0m, in \u001b[0;36msupplement_page_layout_with_ocr\u001b[1;34m(page_layout, image, infer_table_structure, ocr_languages, ocr_mode, extracted_regions, ocr_layout_dumper)\u001b[0m\n\u001b[0;32m 195\u001b[0m \u001b[38;5;129m@requires_dependencies\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124munstructured_inference\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 196\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21msupplement_page_layout_with_ocr\u001b[39m(\n\u001b[0;32m 197\u001b[0m page_layout: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPageLayout\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 203\u001b[0m ocr_layout_dumper: Optional[OCRLayoutDumper] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m 204\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPageLayout\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m 205\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 206\u001b[0m \u001b[38;5;124;03m Supplement an PageLayout with OCR results depending on OCR mode.\u001b[39;00m\n\u001b[0;32m 207\u001b[0m \u001b[38;5;124;03m If mode is \"entire_page\", we get the OCR layout for the entire image and\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 210\u001b[0m \u001b[38;5;124;03m with no text and add text from OCR to each element.\u001b[39;00m\n\u001b[0;32m 211\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m--> 213\u001b[0m ocr_agent \u001b[38;5;241m=\u001b[39m \u001b[43mOCRAgent\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_agent\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlanguage\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mocr_languages\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 214\u001b[0m 
\u001b[38;5;28;01mif\u001b[39;00m ocr_mode \u001b[38;5;241m==\u001b[39m OCRMode\u001b[38;5;241m.\u001b[39mFULL_PAGE\u001b[38;5;241m.\u001b[39mvalue:\n\u001b[0;32m 215\u001b[0m ocr_layout \u001b[38;5;241m=\u001b[39m ocr_agent\u001b[38;5;241m.\u001b[39mget_layout_from_image(image)\n",
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\utils\\ocr_models\\ocr_interface.py:34\u001b[0m, in \u001b[0;36mOCRAgent.get_agent\u001b[1;34m(cls, language)\u001b[0m\n\u001b[0;32m 29\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Get the configured OCRAgent instance.\u001b[39;00m\n\u001b[0;32m 30\u001b[0m \n\u001b[0;32m 31\u001b[0m \u001b[38;5;124;03mThe OCR package used by the agent is determined by the `OCR_AGENT` environment variable.\u001b[39;00m\n\u001b[0;32m 32\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 33\u001b[0m ocr_agent_cls_qname \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_get_ocr_agent_cls_qname()\n\u001b[1;32m---> 34\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_instance\u001b[49m\u001b[43m(\u001b[49m\u001b[43mocr_agent_cls_qname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlanguage\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[1;32mc:\\Users\\ramez\\miniconda3\\envs\\rag\\lib\\site-packages\\unstructured\\partition\\utils\\ocr_models\\ocr_interface.py:41\u001b[0m, in \u001b[0;36mOCRAgent.get_instance\u001b[1;34m(ocr_agent_module, language)\u001b[0m\n\u001b[0;32m 39\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mje suis ici dans OCR_AGENT\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 40\u001b[0m \u001b[38;5;28mprint\u001b[39m(ocr_agent_module)\n\u001b[1;32m---> 41\u001b[0m module_name, class_name \u001b[38;5;241m=\u001b[39m ocr_agent_module\u001b[38;5;241m.\u001b[39mrsplit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m1\u001b[39m)\n\u001b[0;32m 43\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m module_name \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m OCR_AGENT_MODULES_WHITELIST:\n\u001b[0;32m 44\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[0;32m 45\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEnvironment variable OCR_AGENT module name \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmodule_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must be set to a \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 46\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwhitelisted module part of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mOCR_AGENT_MODULES_WHITELIST\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 47\u001b[0m )\n",
"\u001b[1;31mValueError\u001b[0m: not enough values to unpack (expected 2, got 1)"
]
}
],
"source": [
"# unstructured resolves the OCR_AGENT environment variable as a dotted\n",
"# \"package.module.ClassName\" path. A value without a dot makes\n",
"# OCRAgent.get_instance fail with\n",
"# \"ValueError: not enough values to unpack (expected 2, got 1)\".\n",
"# Replace such a value with the fully-qualified Tesseract agent before loading;\n",
"# an unset variable or an already dotted value is left untouched.\n",
"if \".\" not in os.environ.get(\"OCR_AGENT\", \".\"):\n",
"    os.environ[\"OCR_AGENT\"] = (\n",
"        \"unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract\"\n",
"    )\n",
"\n",
"docs = loader.load()\n",
"docs[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"from PIL import Image oa\n",
"import pytesseract\n",
"\n",
"# If you don't have tesseract executable in your PATH, include the following:\n",
"pytesseract.pytesseract.tesseract_cmd = r<full_path_to_your_tesseract_executable>*\n",
"# Example tesseract_cmd = rC:\\Program Files (x86)\\Tesseract-OCR\\tesseract\n",
"\n",
"# Simple image to string\n",
"print(pytesseract. image to_string(Image.open( test .png)))\n",
"\n",
"# In order to bypass the image conversions of pytesseract, just use relative or absolute image path\n",
"# NOTE: In this case you should provide tesseract supported images or tesseract will return error\n",
"print (pytesseract.image_to_string(test.png\"))\n",
"\n",
"# List of available languages\n",
"\n",
"print (pytesseract.get_languages(config=\"*))\n",
"\n",
"# French text image to string\n",
"print (pytesseract. image_to_string(Image.open(test-european. jpg), lang=\"fra))\n",
"\n",
"# Batch processing with a single file containing the list of multiple image file paths\n",
"print (pytesseract. image_to_string(images.txt\"))\n",
"\n",
"# Timeout/terminate the tesseract job after a period of time\n",
"try:\n",
"\n",
"print (pytesseract.image_to_string(test. jpg, timeout-2)) # Timeout after 2 seconds\n",
"\n",
"print (pytesseract. image to_string(test.jpg\", timeout=2.5)) # Timeout after half a second\n",
"except Runtime€rror as timeout_error:\n",
"\n",
"# Tesseract processing is terminated\n",
"\n",
"pass\n",
"\n",
"# Get bounding box estimates\n",
"print (pytesseract. image_to_boxes(Image.open(test.png)))\n",
"\n",
"# Get verbose data including boxes, confidences, line and page numbers\n",
"print (pytesseract.image_to_data(Inage.open(test.png)))\n",
"\n",
"# Get information about orientation and script detection\n",
"print (pytesseract..image_to_osd(Image.open( test.png\")))\n",
"\n",
"# Get a searchable PDF\n",
"pdf = pytesseract.image_to_pdf_or_hocr(test.png, extension='\n",
"\n",
"\n"
]
}
],
"source": [
"# Sanity check that pytesseract can reach the Tesseract binary: OCR one test image.\n",
"IMG_path = r\"F:\\Dev\\Rag\\Rag_Modeling\\document\\test.png\"\n",
"\n",
"# Open the image in a context manager so the file handle is released as soon as\n",
"# OCR finishes (an unclosed handle keeps the file locked on Windows).\n",
"with Image.open(IMG_path) as ocr_image:\n",
"    print(pytesseract.image_to_string(ocr_image))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "rag",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 2
}