803 lines
492 KiB
Plaintext
803 lines
492 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/home/sepehr/dev/rag/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import sys\n",
|
||
"import os\n",
|
||
"\n",
|
||
"# Make the project's pdf_processor module importable. The directory is resolved from the\n",
|
||
"# current working directory, so the kernel must be started where ./src is reachable.\n",
|
||
"sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), './src/document_processing')))\n",
|
||
"from pdf_processor import process_pdf_document\n",
|
||
"from pdf_processor import process_pdf_with_unstructured_loader\n",
|
||
"\n",
|
||
"# Document passed to the extraction calls.\n",
|
||
"pdf_path = \"/home/sepehr/dev/rag/document/Echangeurs.pdf\"\n",
|
||
"\n",
|
||
"from PIL import Image\n",
|
||
"import pytesseract\n",
|
||
"\n",
|
||
"# The Tesseract-OCR install path below only exists on Windows. Apply it there and leave\n",
|
||
"# pytesseract's own setting untouched on other platforms, where this path cannot resolve.\n",
|
||
"if os.name == 'nt':\n",
|
||
"    pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"2025-02-28 22:14:30,067 - pdf_processor - INFO - Début du traitement du fichier PDF: F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf\n",
|
||
"2025-02-28 22:14:30,068 - pdf_processor - INFO - Extraction de texte avec PyPDFLoader\n",
|
||
"2025-02-28 22:14:30,355 - pdf_processor - INFO - Extraction de texte avec PDFMinerLoader\n",
|
||
"2025-02-28 22:14:31,675 - pdf_processor - WARNING - Erreur avec PDFMinerLoader: The PDF parser must valorize the standard metadata.\n",
|
||
"2025-02-28 22:14:31,678 - pdf_processor - INFO - Extraction de texte avec Unstructured\n",
|
||
"2025-02-28 22:14:31,680 - unstructured - INFO - PDF text extraction failed, skip text extraction...\n",
|
||
"2025-02-28 22:14:31,682 - unstructured - WARNING - pytesseract is not installed. Cannot use the ocr_only partitioning strategy. Falling back to partitioning with another strategy.\n",
|
||
"2025-02-28 22:14:31,682 - unstructured - WARNING - Falling back to partitioning with hi_res.\n",
|
||
"2025-02-28 22:14:31,683 - unstructured_inference - INFO - Reading PDF for file: F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf ...\n",
|
||
"2025-02-28 22:14:45,041 - pdf_processor - WARNING - Erreur avec Unstructured: Environment variable OCR_AGENT module name C:\\Program Files\\Tesseract-OCR\\tesseract must be set to a whitelisted module part of ['unstructured.partition.utils.ocr_models.tesseract_ocr', 'unstructured.partition.utils.ocr_models.paddle_ocr', 'unstructured.partition.utils.ocr_models.google_vision_ocr'].\n",
|
||
"2025-02-28 22:14:45,044 - pdf_processor - INFO - Extraction des tableaux avec Camelot\n",
|
||
"2025-02-28 22:14:56,714 - pdf_processor - INFO - Extraction des images avec Unstructured\n",
|
||
"2025-02-28 22:14:56,717 - unstructured - INFO - PDF text extraction failed, skip text extraction...\n",
|
||
"2025-02-28 22:14:56,719 - unstructured_inference - INFO - Reading PDF for file: F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf ...\n",
|
||
"2025-02-28 22:15:09,709 - pdf_processor - WARNING - Erreur lors de l'extraction des images: Environment variable OCR_AGENT module name C:\\Program Files\\Tesseract-OCR\\tesseract must be set to a whitelisted module part of ['unstructured.partition.utils.ocr_models.tesseract_ocr', 'unstructured.partition.utils.ocr_models.paddle_ocr', 'unstructured.partition.utils.ocr_models.google_vision_ocr'].\n",
|
||
"2025-02-28 22:15:09,711 - pdf_processor - INFO - Traitement du PDF terminé: 30 chunks, 18 tableaux, 0 images\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Run the complete pipeline on pdf_path with OCR, table extraction and image extraction\n",
|
||
"# switched on; the text is split with chunk_size=1000 and chunk_overlap=200.\n",
|
||
"result = process_pdf_document(\n",
|
||
"    pdf_path,\n",
|
||
"    chunk_size=1000, chunk_overlap=200,\n",
|
||
"    ocr_enabled=True, extract_tables=True, extract_images=True,\n",
|
||
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Pull the individual parts out of the pipeline result.\n",
|
||
"text_chunks, tables, images = result[\"chunks\"], result[\"tables\"], result[\"images\"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"2025-02-28 22:23:44,919 - pdf_processor - INFO - Traitement du PDF avec UnstructuredPDFLoader: F:\\Dev\\Rag\\Rag_Modeling\\document\\Echangeurs.pdf\n",
|
||
"2025-02-28 22:23:44,921 - unstructured - INFO - PDF text extraction failed, skip text extraction...\n",
|
||
"2025-02-28 22:23:44,923 - pdf_processor - INFO - UnstructuredPDFLoader: extrait 0 éléments et 0 chunks\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Alternative route: process the same PDF through the UnstructuredPDFLoader-based helper.\n",
|
||
"# Options specific to UnstructuredPDFLoader can be passed as extra keyword arguments;\n",
|
||
"# include_page_breaks=True asks for page breaks to be included.\n",
|
||
"# NOTE: this rebinds `result`, replacing the value returned by process_pdf_document.\n",
|
||
"result = process_pdf_with_unstructured_loader(\n",
|
||
"    pdf_path, chunk_size=1000, chunk_overlap=200, include_page_breaks=True\n",
|
||
")\n",
|
||
"\n",
|
||
"# Read the parts of the result.\n",
|
||
"text = result[\"text\"]\n",
|
||
"chunks = result[\"chunks\"]\n",
|
||
"elements = result[\"elements\"]\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# OCR configuration notes, followed by a direct UnstructuredPDFLoader setup.\n",
|
||
"import pytesseract\n",
|
||
"import os\n",
|
||
"# Windows-only overrides, kept disabled for reference:\n",
|
||
"# pytesseract.pytesseract.tesseract_cmd = r\"C:\\Program Files\\Tesseract-OCR\\tesseract.exe\"\n",
|
||
"# os.environ['TESSDATA_PREFIX'] = r\"C:\\Program Files\\Tesseract-OCR\\tessdata\"\n",
|
||
"# OCR_AGENT must name one of unstructured's OCR modules, never a filesystem path: an earlier\n",
|
||
"# run failed with \"must be set to a whitelisted module\" while it held a Tesseract install path.\n",
|
||
"# TODO confirm the exact class name against the installed unstructured version before enabling:\n",
|
||
"# os.environ['OCR_AGENT'] = \"unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract\"\n",
|
||
"from langchain_community.document_loaders import UnstructuredPDFLoader\n",
|
||
"\n",
|
||
"# pdf_path is re-pointed here to a different document than the one set in the first cell.\n",
|
||
"pdf_path = \"/home/sepehr/dev/rag/document/11_chapitre3.pdf\"\n",
|
||
"loader = UnstructuredPDFLoader(pdf_path)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"2025-03-01 10:30:42,989 - unstructured - INFO - PDF text extraction failed, skip text extraction...\n",
|
||
"2025-03-01 10:30:42,991 - unstructured - WARNING - pytesseract is not installed. Cannot use the ocr_only partitioning strategy. Falling back to partitioning with another strategy.\n",
|
||
"2025-03-01 10:30:42,991 - unstructured - WARNING - Falling back to partitioning with hi_res.\n",
|
||
"Error while downloading from https://cdn-lfs.hf.co/repos/d9/51/d951593388d0af1cb4a029c311ba19f9b05090d9acc4606c2b82588297ea4397/134301ca94fb0df8027be9a6dad1908fe6218af8ffa4d34f0819c7c2226195f3?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27yolox_l0.05.onnx%3B+filename%3D%22yolox_l0.05.onnx%22%3B&Expires=1740824527&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0MDgyNDUyN319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy9kOS81MS9kOTUxNTkzMzg4ZDBhZjFjYjRhMDI5YzMxMWJhMTlmOWIwNTA5MGQ5YWNjNDYwNmMyYjgyNTg4Mjk3ZWE0Mzk3LzEzNDMwMWNhOTRmYjBkZjgwMjdiZTlhNmRhZDE5MDhmZTYyMThhZjhmZmE0ZDM0ZjA4MTljN2MyMjI2MTk1ZjM%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=HooWiUcXLJXHPWzVNtKIzsVjxYea8p0iN25xm59JZbS1u1mHwIzHtF1XOEr%7EvHLFS1kUlORUIf-j0127HWbsBbvIw9SFGNYDGPmjZai6%7ExN34mNbLaa6FhFfGZ-N-M1%7EnnKmIyLy1VASx2ut0-NfCBkfIRo%7Ew8oo7XFkArOAwz1OTkopFpIhyuhTWa9igWoJdKLvJWw4NMaDCP00P5ZMP3KJTZoftqMDgL0NAJ2N5AcjMnwR3yoimTCGkdd34SBU9BUnQ1vpCE66JEYkTrgSzUi2TQfEAOFhU8AT97PvqLlwYkwOM%7EZFpMAgjgnV8a76pXRV9%7E99LIRCX1AWCCUpXw__&Key-Pair-Id=K3RPWS32NSSJCE: HTTPSConnectionPool(host='cdn-lfs.hf.co', port=443): Read timed out.\n",
|
||
"Trying to resume download...\n",
|
||
"2025-03-01 10:31:10,693 - huggingface_hub.file_download - WARNING - Error while downloading from https://cdn-lfs.hf.co/repos/d9/51/d951593388d0af1cb4a029c311ba19f9b05090d9acc4606c2b82588297ea4397/134301ca94fb0df8027be9a6dad1908fe6218af8ffa4d34f0819c7c2226195f3?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27yolox_l0.05.onnx%3B+filename%3D%22yolox_l0.05.onnx%22%3B&Expires=1740824527&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0MDgyNDUyN319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy9kOS81MS9kOTUxNTkzMzg4ZDBhZjFjYjRhMDI5YzMxMWJhMTlmOWIwNTA5MGQ5YWNjNDYwNmMyYjgyNTg4Mjk3ZWE0Mzk3LzEzNDMwMWNhOTRmYjBkZjgwMjdiZTlhNmRhZDE5MDhmZTYyMThhZjhmZmE0ZDM0ZjA4MTljN2MyMjI2MTk1ZjM%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=HooWiUcXLJXHPWzVNtKIzsVjxYea8p0iN25xm59JZbS1u1mHwIzHtF1XOEr%7EvHLFS1kUlORUIf-j0127HWbsBbvIw9SFGNYDGPmjZai6%7ExN34mNbLaa6FhFfGZ-N-M1%7EnnKmIyLy1VASx2ut0-NfCBkfIRo%7Ew8oo7XFkArOAwz1OTkopFpIhyuhTWa9igWoJdKLvJWw4NMaDCP00P5ZMP3KJTZoftqMDgL0NAJ2N5AcjMnwR3yoimTCGkdd34SBU9BUnQ1vpCE66JEYkTrgSzUi2TQfEAOFhU8AT97PvqLlwYkwOM%7EZFpMAgjgnV8a76pXRV9%7E99LIRCX1AWCCUpXw__&Key-Pair-Id=K3RPWS32NSSJCE: HTTPSConnectionPool(host='cdn-lfs.hf.co', port=443): Read timed out.\n",
|
||
"Trying to resume download...\n"
|
||
]
|
||
},
|
||
{
|
||
"ename": "ChunkedEncodingError",
|
||
"evalue": "('Connection broken: IncompleteRead(46334378 bytes read, 33976465 more expected)', IncompleteRead(46334378 bytes read, 33976465 more expected))",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||
"\u001b[31mTimeoutError\u001b[39m Traceback (most recent call last)",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/urllib3/response.py:754\u001b[39m, in \u001b[36mHTTPResponse._error_catcher\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 753\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m754\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m\n\u001b[32m 756\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m SocketTimeout \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 757\u001b[39m \u001b[38;5;66;03m# FIXME: Ideally we'd like to include the url in the ReadTimeoutError but\u001b[39;00m\n\u001b[32m 758\u001b[39m \u001b[38;5;66;03m# there is yet no clean way to get at it from this context.\u001b[39;00m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/urllib3/response.py:879\u001b[39m, in \u001b[36mHTTPResponse._raw_read\u001b[39m\u001b[34m(self, amt, read1)\u001b[39m\n\u001b[32m 878\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m._error_catcher():\n\u001b[32m--> \u001b[39m\u001b[32m879\u001b[39m data = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_fp_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mread1\u001b[49m\u001b[43m=\u001b[49m\u001b[43mread1\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m fp_closed \u001b[38;5;28;01melse\u001b[39;00m \u001b[33mb\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 880\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m amt \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m amt != \u001b[32m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m data:\n\u001b[32m 881\u001b[39m \u001b[38;5;66;03m# Platform-specific: Buggy versions of Python.\u001b[39;00m\n\u001b[32m 882\u001b[39m \u001b[38;5;66;03m# Close the connection when no data is returned\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 887\u001b[39m \u001b[38;5;66;03m# not properly close the connection in all cases. There is\u001b[39;00m\n\u001b[32m 888\u001b[39m \u001b[38;5;66;03m# no harm in redundantly calling close.\u001b[39;00m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/urllib3/response.py:862\u001b[39m, in \u001b[36mHTTPResponse._fp_read\u001b[39m\u001b[34m(self, amt, read1)\u001b[39m\n\u001b[32m 860\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 861\u001b[39m \u001b[38;5;66;03m# StringIO doesn't like amt=None\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m862\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_fp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m amt \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m._fp.read()\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m/usr/lib/python3.12/http/client.py:479\u001b[39m, in \u001b[36mHTTPResponse.read\u001b[39m\u001b[34m(self, amt)\u001b[39m\n\u001b[32m 478\u001b[39m amt = \u001b[38;5;28mself\u001b[39m.length\n\u001b[32m--> \u001b[39m\u001b[32m479\u001b[39m s = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mfp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 480\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m s \u001b[38;5;129;01mand\u001b[39;00m amt:\n\u001b[32m 481\u001b[39m \u001b[38;5;66;03m# Ideally, we would raise IncompleteRead if the content-length\u001b[39;00m\n\u001b[32m 482\u001b[39m \u001b[38;5;66;03m# wasn't satisfied, but it might break compatibility.\u001b[39;00m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m/usr/lib/python3.12/socket.py:707\u001b[39m, in \u001b[36mSocketIO.readinto\u001b[39m\u001b[34m(self, b)\u001b[39m\n\u001b[32m 706\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m707\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_sock\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrecv_into\u001b[49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 708\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m timeout:\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m/usr/lib/python3.12/ssl.py:1252\u001b[39m, in \u001b[36mSSLSocket.recv_into\u001b[39m\u001b[34m(self, buffer, nbytes, flags)\u001b[39m\n\u001b[32m 1249\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 1250\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mnon-zero flags not allowed in calls to recv_into() on \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[33m\"\u001b[39m %\n\u001b[32m 1251\u001b[39m \u001b[38;5;28mself\u001b[39m.\u001b[34m__class__\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m1252\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnbytes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1253\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m/usr/lib/python3.12/ssl.py:1104\u001b[39m, in \u001b[36mSSLSocket.read\u001b[39m\u001b[34m(self, len, buffer)\u001b[39m\n\u001b[32m 1103\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m buffer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1104\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_sslobj\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1105\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n",
|
||
"\u001b[31mTimeoutError\u001b[39m: The read operation timed out",
|
||
"\nThe above exception was the direct cause of the following exception:\n",
|
||
"\u001b[31mReadTimeoutError\u001b[39m Traceback (most recent call last)",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/requests/models.py:820\u001b[39m, in \u001b[36mResponse.iter_content.<locals>.generate\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 819\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m820\u001b[39m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m.raw.stream(chunk_size, decode_content=\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[32m 821\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m ProtocolError \u001b[38;5;28;01mas\u001b[39;00m e:\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/urllib3/response.py:1066\u001b[39m, in \u001b[36mHTTPResponse.stream\u001b[39m\u001b[34m(self, amt, decode_content)\u001b[39m\n\u001b[32m 1065\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_fp_closed(\u001b[38;5;28mself\u001b[39m._fp) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m._decoded_buffer) > \u001b[32m0\u001b[39m:\n\u001b[32m-> \u001b[39m\u001b[32m1066\u001b[39m data = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m=\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1068\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m data:\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/urllib3/response.py:955\u001b[39m, in \u001b[36mHTTPResponse.read\u001b[39m\u001b[34m(self, amt, decode_content, cache_content)\u001b[39m\n\u001b[32m 953\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._decoded_buffer.get(amt)\n\u001b[32m--> \u001b[39m\u001b[32m955\u001b[39m data = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_raw_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 957\u001b[39m flush_decoder = amt \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m (amt != \u001b[32m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m data)\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/urllib3/response.py:878\u001b[39m, in \u001b[36mHTTPResponse._raw_read\u001b[39m\u001b[34m(self, amt, read1)\u001b[39m\n\u001b[32m 876\u001b[39m fp_closed = \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m._fp, \u001b[33m\"\u001b[39m\u001b[33mclosed\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[32m--> \u001b[39m\u001b[32m878\u001b[39m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mwith\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_error_catcher\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 879\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_fp_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mread1\u001b[49m\u001b[43m=\u001b[49m\u001b[43mread1\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mfp_closed\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[33;43mb\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m/usr/lib/python3.12/contextlib.py:158\u001b[39m, in \u001b[36m_GeneratorContextManager.__exit__\u001b[39m\u001b[34m(self, typ, value, traceback)\u001b[39m\n\u001b[32m 157\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m158\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mgen\u001b[49m\u001b[43m.\u001b[49m\u001b[43mthrow\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 159\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[32m 160\u001b[39m \u001b[38;5;66;03m# Suppress StopIteration *unless* it's the same exception that\u001b[39;00m\n\u001b[32m 161\u001b[39m \u001b[38;5;66;03m# was passed to throw(). This prevents a StopIteration\u001b[39;00m\n\u001b[32m 162\u001b[39m \u001b[38;5;66;03m# raised inside the \"with\" statement from being suppressed.\u001b[39;00m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/urllib3/response.py:759\u001b[39m, in \u001b[36mHTTPResponse._error_catcher\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 756\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m SocketTimeout \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 757\u001b[39m \u001b[38;5;66;03m# FIXME: Ideally we'd like to include the url in the ReadTimeoutError but\u001b[39;00m\n\u001b[32m 758\u001b[39m \u001b[38;5;66;03m# there is yet no clean way to get at it from this context.\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m759\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m ReadTimeoutError(\u001b[38;5;28mself\u001b[39m._pool, \u001b[38;5;28;01mNone\u001b[39;00m, \u001b[33m\"\u001b[39m\u001b[33mRead timed out.\u001b[39m\u001b[33m\"\u001b[39m) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01me\u001b[39;00m \u001b[38;5;66;03m# type: ignore[arg-type]\u001b[39;00m\n\u001b[32m 761\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m BaseSSLError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 762\u001b[39m \u001b[38;5;66;03m# FIXME: Is there a better way to differentiate between SSLErrors?\u001b[39;00m\n",
|
||
"\u001b[31mReadTimeoutError\u001b[39m: HTTPSConnectionPool(host='cdn-lfs.hf.co', port=443): Read timed out.",
|
||
"\nDuring handling of the above exception, another exception occurred:\n",
|
||
"\u001b[31mConnectionError\u001b[39m Traceback (most recent call last)",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/huggingface_hub/file_download.py:454\u001b[39m, in \u001b[36mhttp_get\u001b[39m\u001b[34m(url, temp_file, proxies, resume_size, headers, expected_size, displayed_filename, _nb_retries, _tqdm_bar)\u001b[39m\n\u001b[32m 453\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m454\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mr\u001b[49m\u001b[43m.\u001b[49m\u001b[43miter_content\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunk_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconstants\u001b[49m\u001b[43m.\u001b[49m\u001b[43mDOWNLOAD_CHUNK_SIZE\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 455\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# filter out keep-alive new chunks\u001b[39;49;00m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/requests/models.py:826\u001b[39m, in \u001b[36mResponse.iter_content.<locals>.generate\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 825\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m ReadTimeoutError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m--> \u001b[39m\u001b[32m826\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m(e)\n\u001b[32m 827\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m SSLError \u001b[38;5;28;01mas\u001b[39;00m e:\n",
|
||
"\u001b[31mConnectionError\u001b[39m: HTTPSConnectionPool(host='cdn-lfs.hf.co', port=443): Read timed out.",
|
||
"\nDuring handling of the above exception, another exception occurred:\n",
|
||
"\u001b[31mIncompleteRead\u001b[39m Traceback (most recent call last)",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/urllib3/response.py:754\u001b[39m, in \u001b[36mHTTPResponse._error_catcher\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 753\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m754\u001b[39m \u001b[38;5;28;01myield\u001b[39;00m\n\u001b[32m 756\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m SocketTimeout \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 757\u001b[39m \u001b[38;5;66;03m# FIXME: Ideally we'd like to include the url in the ReadTimeoutError but\u001b[39;00m\n\u001b[32m 758\u001b[39m \u001b[38;5;66;03m# there is yet no clean way to get at it from this context.\u001b[39;00m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/urllib3/response.py:900\u001b[39m, in \u001b[36mHTTPResponse._raw_read\u001b[39m\u001b[34m(self, amt, read1)\u001b[39m\n\u001b[32m 890\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[32m 891\u001b[39m \u001b[38;5;28mself\u001b[39m.enforce_content_length\n\u001b[32m 892\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m.length_remaining \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 898\u001b[39m \u001b[38;5;66;03m# raised during streaming, so all calls with incorrect\u001b[39;00m\n\u001b[32m 899\u001b[39m \u001b[38;5;66;03m# Content-Length are caught.\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m900\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m IncompleteRead(\u001b[38;5;28mself\u001b[39m._fp_bytes_read, \u001b[38;5;28mself\u001b[39m.length_remaining)\n\u001b[32m 901\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m read1 \u001b[38;5;129;01mand\u001b[39;00m (\n\u001b[32m 902\u001b[39m (amt != \u001b[32m0\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m data) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m.length_remaining == \u001b[38;5;28mlen\u001b[39m(data)\n\u001b[32m 903\u001b[39m ):\n\u001b[32m (...)\u001b[39m\u001b[32m 906\u001b[39m \u001b[38;5;66;03m# `http.client.HTTPResponse`, so we close it here.\u001b[39;00m\n\u001b[32m 907\u001b[39m \u001b[38;5;66;03m# See https://github.com/python/cpython/issues/113199\u001b[39;00m\n",
|
||
"\u001b[31mIncompleteRead\u001b[39m: IncompleteRead(46334378 bytes read, 33976465 more expected)",
|
||
"\nThe above exception was the direct cause of the following exception:\n",
|
||
"\u001b[31mProtocolError\u001b[39m Traceback (most recent call last)",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/requests/models.py:820\u001b[39m, in \u001b[36mResponse.iter_content.<locals>.generate\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 819\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m820\u001b[39m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m.raw.stream(chunk_size, decode_content=\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[32m 821\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m ProtocolError \u001b[38;5;28;01mas\u001b[39;00m e:\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/urllib3/response.py:1066\u001b[39m, in \u001b[36mHTTPResponse.stream\u001b[39m\u001b[34m(self, amt, decode_content)\u001b[39m\n\u001b[32m 1065\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_fp_closed(\u001b[38;5;28mself\u001b[39m._fp) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m._decoded_buffer) > \u001b[32m0\u001b[39m:\n\u001b[32m-> \u001b[39m\u001b[32m1066\u001b[39m data = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m=\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1068\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m data:\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/urllib3/response.py:983\u001b[39m, in \u001b[36mHTTPResponse.read\u001b[39m\u001b[34m(self, amt, decode_content, cache_content)\u001b[39m\n\u001b[32m 979\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m._decoded_buffer) < amt \u001b[38;5;129;01mand\u001b[39;00m data:\n\u001b[32m 980\u001b[39m \u001b[38;5;66;03m# TODO make sure to initially read enough data to get past the headers\u001b[39;00m\n\u001b[32m 981\u001b[39m \u001b[38;5;66;03m# For example, the GZ file header takes 10 bytes, we don't want to read\u001b[39;00m\n\u001b[32m 982\u001b[39m \u001b[38;5;66;03m# it one byte at a time\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m983\u001b[39m data = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_raw_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 984\u001b[39m decoded_data = \u001b[38;5;28mself\u001b[39m._decode(data, decode_content, flush_decoder)\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/urllib3/response.py:878\u001b[39m, in \u001b[36mHTTPResponse._raw_read\u001b[39m\u001b[34m(self, amt, read1)\u001b[39m\n\u001b[32m 876\u001b[39m fp_closed = \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m._fp, \u001b[33m\"\u001b[39m\u001b[33mclosed\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[32m--> \u001b[39m\u001b[32m878\u001b[39m \u001b[43m\u001b[49m\u001b[38;5;28;43;01mwith\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_error_catcher\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 879\u001b[39m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_fp_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mamt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mread1\u001b[49m\u001b[43m=\u001b[49m\u001b[43mread1\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mfp_closed\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[33;43mb\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m/usr/lib/python3.12/contextlib.py:158\u001b[39m, in \u001b[36m_GeneratorContextManager.__exit__\u001b[39m\u001b[34m(self, typ, value, traceback)\u001b[39m\n\u001b[32m 157\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m158\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mgen\u001b[49m\u001b[43m.\u001b[49m\u001b[43mthrow\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 159\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[32m 160\u001b[39m \u001b[38;5;66;03m# Suppress StopIteration *unless* it's the same exception that\u001b[39;00m\n\u001b[32m 161\u001b[39m \u001b[38;5;66;03m# was passed to throw(). This prevents a StopIteration\u001b[39;00m\n\u001b[32m 162\u001b[39m \u001b[38;5;66;03m# raised inside the \"with\" statement from being suppressed.\u001b[39;00m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/urllib3/response.py:778\u001b[39m, in \u001b[36mHTTPResponse._error_catcher\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 777\u001b[39m arg = \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mConnection broken: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m!r}\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m--> \u001b[39m\u001b[32m778\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m ProtocolError(arg, e) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01me\u001b[39;00m\n\u001b[32m 780\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m (HTTPException, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n",
|
||
"\u001b[31mProtocolError\u001b[39m: ('Connection broken: IncompleteRead(46334378 bytes read, 33976465 more expected)', IncompleteRead(46334378 bytes read, 33976465 more expected))",
|
||
"\nDuring handling of the above exception, another exception occurred:\n",
|
||
"\u001b[31mChunkedEncodingError\u001b[39m Traceback (most recent call last)",
|
||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m docs = \u001b[43mloader\u001b[49m\u001b[43m.\u001b[49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 2\u001b[39m docs[\u001b[32m0\u001b[39m]\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/langchain_core/document_loaders/base.py:31\u001b[39m, in \u001b[36mBaseLoader.load\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 29\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mload\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> \u001b[38;5;28mlist\u001b[39m[Document]:\n\u001b[32m 30\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"Load data into Document objects.\"\"\"\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m31\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mlazy_load\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/langchain_community/document_loaders/unstructured.py:107\u001b[39m, in \u001b[36mUnstructuredBaseLoader.lazy_load\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 105\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mlazy_load\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> Iterator[Document]:\n\u001b[32m 106\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"Load file.\"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m107\u001b[39m elements = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_get_elements\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 108\u001b[39m \u001b[38;5;28mself\u001b[39m._post_process_elements(elements)\n\u001b[32m 109\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.mode == \u001b[33m\"\u001b[39m\u001b[33melements\u001b[39m\u001b[33m\"\u001b[39m:\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/langchain_community/document_loaders/pdf.py:94\u001b[39m, in \u001b[36mUnstructuredPDFLoader._get_elements\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 91\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m_get_elements\u001b[39m(\u001b[38;5;28mself\u001b[39m) -> \u001b[38;5;28mlist\u001b[39m:\n\u001b[32m 92\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01munstructured\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mpartition\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mpdf\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m partition_pdf\n\u001b[32m---> \u001b[39m\u001b[32m94\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mpartition_pdf\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mfile_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43munstructured_kwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/unstructured/documents/elements.py:581\u001b[39m, in \u001b[36mprocess_metadata.<locals>.decorator.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 579\u001b[39m \u001b[38;5;129m@functools\u001b[39m.wraps(func)\n\u001b[32m 580\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mwrapper\u001b[39m(*args: _P.args, **kwargs: _P.kwargs) -> \u001b[38;5;28mlist\u001b[39m[Element]:\n\u001b[32m--> \u001b[39m\u001b[32m581\u001b[39m elements = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 582\u001b[39m call_args = get_call_args_applying_defaults(func, *args, **kwargs)\n\u001b[32m 584\u001b[39m unique_element_ids: \u001b[38;5;28mbool\u001b[39m = call_args.get(\u001b[33m\"\u001b[39m\u001b[33munique_element_ids\u001b[39m\u001b[33m\"\u001b[39m, \u001b[38;5;28;01mFalse\u001b[39;00m)\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/unstructured/file_utils/filetype.py:815\u001b[39m, in \u001b[36madd_filetype.<locals>.decorator.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 813\u001b[39m \u001b[38;5;129m@functools\u001b[39m.wraps(func)\n\u001b[32m 814\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mwrapper\u001b[39m(*args: _P.args, **kwargs: _P.kwargs) -> \u001b[38;5;28mlist\u001b[39m[Element]:\n\u001b[32m--> \u001b[39m\u001b[32m815\u001b[39m elements = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 817\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m element \u001b[38;5;129;01min\u001b[39;00m elements:\n\u001b[32m 818\u001b[39m \u001b[38;5;66;03m# NOTE(robinson) - Attached files have already run through this logic\u001b[39;00m\n\u001b[32m 819\u001b[39m \u001b[38;5;66;03m# in their own partitioning function\u001b[39;00m\n\u001b[32m 820\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m element.metadata.attached_to_filename \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/unstructured/file_utils/filetype.py:773\u001b[39m, in \u001b[36madd_metadata.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 771\u001b[39m \u001b[38;5;129m@functools\u001b[39m.wraps(func)\n\u001b[32m 772\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mwrapper\u001b[39m(*args: _P.args, **kwargs: _P.kwargs) -> \u001b[38;5;28mlist\u001b[39m[Element]:\n\u001b[32m--> \u001b[39m\u001b[32m773\u001b[39m elements = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 774\u001b[39m call_args = get_call_args_applying_defaults(func, *args, **kwargs)\n\u001b[32m 776\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m call_args.get(\u001b[33m\"\u001b[39m\u001b[33mmetadata_filename\u001b[39m\u001b[33m\"\u001b[39m):\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/unstructured/chunking/dispatch.py:74\u001b[39m, in \u001b[36madd_chunking_strategy.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 71\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"The decorated function is replaced with this one.\"\"\"\u001b[39;00m\n\u001b[32m 73\u001b[39m \u001b[38;5;66;03m# -- call the partitioning function to get the elements --\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m74\u001b[39m elements = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 76\u001b[39m \u001b[38;5;66;03m# -- look for a chunking-strategy argument --\u001b[39;00m\n\u001b[32m 77\u001b[39m call_args = get_call_args_applying_defaults(func, *args, **kwargs)\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/unstructured/partition/pdf.py:229\u001b[39m, in \u001b[36mpartition_pdf\u001b[39m\u001b[34m(filename, file, include_page_breaks, strategy, infer_table_structure, ocr_languages, languages, metadata_filename, metadata_last_modified, chunking_strategy, hi_res_model_name, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, starting_page_number, extract_forms, form_extraction_skip_tables, password, pdfminer_line_margin, pdfminer_char_margin, pdfminer_line_overlap, pdfminer_word_margin, **kwargs)\u001b[39m\n\u001b[32m 226\u001b[39m exactly_one(filename=filename, file=file)\n\u001b[32m 228\u001b[39m languages = check_language_args(languages \u001b[38;5;129;01mor\u001b[39;00m [], ocr_languages)\n\u001b[32m--> \u001b[39m\u001b[32m229\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mpartition_pdf_or_image\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 230\u001b[39m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 231\u001b[39m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 232\u001b[39m \u001b[43m \u001b[49m\u001b[43minclude_page_breaks\u001b[49m\u001b[43m=\u001b[49m\u001b[43minclude_page_breaks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 233\u001b[39m \u001b[43m \u001b[49m\u001b[43mstrategy\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstrategy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 234\u001b[39m \u001b[43m \u001b[49m\u001b[43minfer_table_structure\u001b[49m\u001b[43m=\u001b[49m\u001b[43minfer_table_structure\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 235\u001b[39m \u001b[43m \u001b[49m\u001b[43mlanguages\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlanguages\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 236\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mmetadata_last_modified\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmetadata_last_modified\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 237\u001b[39m \u001b[43m \u001b[49m\u001b[43mhi_res_model_name\u001b[49m\u001b[43m=\u001b[49m\u001b[43mhi_res_model_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 238\u001b[39m \u001b[43m \u001b[49m\u001b[43mextract_images_in_pdf\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextract_images_in_pdf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 239\u001b[39m \u001b[43m \u001b[49m\u001b[43mextract_image_block_types\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextract_image_block_types\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 240\u001b[39m \u001b[43m \u001b[49m\u001b[43mextract_image_block_output_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextract_image_block_output_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 241\u001b[39m \u001b[43m \u001b[49m\u001b[43mextract_image_block_to_payload\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextract_image_block_to_payload\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 242\u001b[39m \u001b[43m \u001b[49m\u001b[43mstarting_page_number\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstarting_page_number\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 243\u001b[39m \u001b[43m \u001b[49m\u001b[43mextract_forms\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextract_forms\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 244\u001b[39m \u001b[43m \u001b[49m\u001b[43mform_extraction_skip_tables\u001b[49m\u001b[43m=\u001b[49m\u001b[43mform_extraction_skip_tables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 245\u001b[39m \u001b[43m \u001b[49m\u001b[43mpassword\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpassword\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 246\u001b[39m \u001b[43m \u001b[49m\u001b[43mpdfminer_line_margin\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpdfminer_line_margin\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 247\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mpdfminer_char_margin\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpdfminer_char_margin\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 248\u001b[39m \u001b[43m \u001b[49m\u001b[43mpdfminer_line_overlap\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpdfminer_line_overlap\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 249\u001b[39m \u001b[43m \u001b[49m\u001b[43mpdfminer_word_margin\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpdfminer_word_margin\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 250\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 251\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/unstructured/partition/pdf.py:342\u001b[39m, in \u001b[36mpartition_pdf_or_image\u001b[39m\u001b[34m(filename, file, is_image, include_page_breaks, strategy, infer_table_structure, languages, metadata_last_modified, hi_res_model_name, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, starting_page_number, extract_forms, form_extraction_skip_tables, password, pdfminer_line_margin, pdfminer_char_margin, pdfminer_line_overlap, pdfminer_word_margin, **kwargs)\u001b[39m\n\u001b[32m 340\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m warnings.catch_warnings():\n\u001b[32m 341\u001b[39m warnings.simplefilter(\u001b[33m\"\u001b[39m\u001b[33mignore\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m--> \u001b[39m\u001b[32m342\u001b[39m elements = \u001b[43m_partition_pdf_or_image_local\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 343\u001b[39m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 344\u001b[39m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m=\u001b[49m\u001b[43mspooled_to_bytes_io_if_needed\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 345\u001b[39m \u001b[43m \u001b[49m\u001b[43mis_image\u001b[49m\u001b[43m=\u001b[49m\u001b[43mis_image\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 346\u001b[39m \u001b[43m \u001b[49m\u001b[43minfer_table_structure\u001b[49m\u001b[43m=\u001b[49m\u001b[43minfer_table_structure\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 347\u001b[39m \u001b[43m \u001b[49m\u001b[43minclude_page_breaks\u001b[49m\u001b[43m=\u001b[49m\u001b[43minclude_page_breaks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 348\u001b[39m \u001b[43m \u001b[49m\u001b[43mlanguages\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlanguages\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 349\u001b[39m 
\u001b[43m \u001b[49m\u001b[43mocr_languages\u001b[49m\u001b[43m=\u001b[49m\u001b[43mocr_languages\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 350\u001b[39m \u001b[43m \u001b[49m\u001b[43mmetadata_last_modified\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmetadata_last_modified\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mlast_modified\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 351\u001b[39m \u001b[43m \u001b[49m\u001b[43mhi_res_model_name\u001b[49m\u001b[43m=\u001b[49m\u001b[43mhi_res_model_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 352\u001b[39m \u001b[43m \u001b[49m\u001b[43mpdf_text_extractable\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpdf_text_extractable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 353\u001b[39m \u001b[43m \u001b[49m\u001b[43mextract_images_in_pdf\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextract_images_in_pdf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 354\u001b[39m \u001b[43m \u001b[49m\u001b[43mextract_image_block_types\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextract_image_block_types\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 355\u001b[39m \u001b[43m \u001b[49m\u001b[43mextract_image_block_output_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextract_image_block_output_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 356\u001b[39m \u001b[43m \u001b[49m\u001b[43mextract_image_block_to_payload\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextract_image_block_to_payload\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 357\u001b[39m \u001b[43m \u001b[49m\u001b[43mstarting_page_number\u001b[49m\u001b[43m=\u001b[49m\u001b[43mstarting_page_number\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 358\u001b[39m \u001b[43m \u001b[49m\u001b[43mextract_forms\u001b[49m\u001b[43m=\u001b[49m\u001b[43mextract_forms\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 359\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mform_extraction_skip_tables\u001b[49m\u001b[43m=\u001b[49m\u001b[43mform_extraction_skip_tables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 360\u001b[39m \u001b[43m \u001b[49m\u001b[43mpassword\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpassword\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 361\u001b[39m \u001b[43m \u001b[49m\u001b[43mpdfminer_config\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpdfminer_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 362\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 363\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 364\u001b[39m out_elements = _process_uncategorized_text_elements(elements)\n\u001b[32m 366\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m strategy == PartitionStrategy.FAST:\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/unstructured/utils.py:216\u001b[39m, in \u001b[36mrequires_dependencies.<locals>.decorator.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 213\u001b[39m \u001b[38;5;129m@wraps\u001b[39m(func)\n\u001b[32m 214\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mwrapper\u001b[39m(*args: _P.args, **kwargs: _P.kwargs):\n\u001b[32m 215\u001b[39m run_check()\n\u001b[32m--> \u001b[39m\u001b[32m216\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/unstructured/partition/pdf.py:643\u001b[39m, in \u001b[36m_partition_pdf_or_image_local\u001b[39m\u001b[34m(filename, file, is_image, infer_table_structure, include_page_breaks, languages, ocr_languages, ocr_mode, model_name, hi_res_model_name, pdf_image_dpi, metadata_last_modified, pdf_text_extractable, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, analysis, analyzed_image_output_dir_path, starting_page_number, extract_forms, form_extraction_skip_tables, pdf_hi_res_max_pages, password, pdfminer_config, **kwargs)\u001b[39m\n\u001b[32m 640\u001b[39m skip_analysis_dump = env_config.ANALYSIS_DUMP_OD_SKIP\n\u001b[32m 642\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m643\u001b[39m inferred_document_layout = \u001b[43mprocess_file_with_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 644\u001b[39m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 645\u001b[39m \u001b[43m \u001b[49m\u001b[43mis_image\u001b[49m\u001b[43m=\u001b[49m\u001b[43mis_image\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 646\u001b[39m \u001b[43m \u001b[49m\u001b[43mmodel_name\u001b[49m\u001b[43m=\u001b[49m\u001b[43mhi_res_model_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 647\u001b[39m \u001b[43m \u001b[49m\u001b[43mpdf_image_dpi\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpdf_image_dpi\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 648\u001b[39m \u001b[43m \u001b[49m\u001b[43mpassword\u001b[49m\u001b[43m=\u001b[49m\u001b[43mpassword\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 649\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 651\u001b[39m extracted_layout, layouts_links = (\n\u001b[32m 652\u001b[39m process_file_with_pdfminer(\n\u001b[32m 653\u001b[39m filename=filename,\n\u001b[32m 
(...)\u001b[39m\u001b[32m 659\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m ([], [])\n\u001b[32m 660\u001b[39m )\n\u001b[32m 662\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m analysis:\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/unstructured_inference/inference/layout.py:366\u001b[39m, in \u001b[36mprocess_file_with_model\u001b[39m\u001b[34m(filename, model_name, is_image, fixed_layouts, pdf_image_dpi, password, **kwargs)\u001b[39m\n\u001b[32m 354\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mprocess_file_with_model\u001b[39m(\n\u001b[32m 355\u001b[39m filename: \u001b[38;5;28mstr\u001b[39m,\n\u001b[32m 356\u001b[39m model_name: Optional[\u001b[38;5;28mstr\u001b[39m],\n\u001b[32m (...)\u001b[39m\u001b[32m 361\u001b[39m **kwargs: Any,\n\u001b[32m 362\u001b[39m ) -> DocumentLayout:\n\u001b[32m 363\u001b[39m \u001b[38;5;250m \u001b[39m\u001b[33;03m\"\"\"Processes pdf file with name filename into a DocumentLayout by using a model identified by\u001b[39;00m\n\u001b[32m 364\u001b[39m \u001b[33;03m model_name.\"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m366\u001b[39m model = \u001b[43mget_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 367\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(model, UnstructuredObjectDetectionModel):\n\u001b[32m 368\u001b[39m detection_model = model\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/unstructured_inference/models/base.py:74\u001b[39m, in \u001b[36mget_model\u001b[39m\u001b[34m(model_name)\u001b[39m\n\u001b[32m 70\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m UnknownModelException(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mUnknown model type: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmodel_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 72\u001b[39m model: UnstructuredModel = model_class_map[model_name]()\n\u001b[32m---> \u001b[39m\u001b[32m74\u001b[39m model.initialize(**initialize_params)\n\u001b[32m 75\u001b[39m models[model_name] = model\n\u001b[32m 76\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m model\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/unstructured_inference/utils.py:40\u001b[39m, in \u001b[36mLazyDict.__getitem__\u001b[39m\u001b[34m(self, key)\u001b[39m\n\u001b[32m 38\u001b[39m evaluate = value.evaluate\n\u001b[32m 39\u001b[39m args, kwargs = value.info\n\u001b[32m---> \u001b[39m\u001b[32m40\u001b[39m value = \u001b[43mevaluate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 41\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.cache:\n\u001b[32m 42\u001b[39m \u001b[38;5;28mself\u001b[39m._raw_dict[key] = value\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/unstructured_inference/utils.py:115\u001b[39m, in \u001b[36mdownload_if_needed_and_get_local_path\u001b[39m\u001b[34m(path_or_repo, filename, **kwargs)\u001b[39m\n\u001b[32m 113\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m full_path\n\u001b[32m 114\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m115\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mhf_hub_download\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath_or_repo\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/huggingface_hub/utils/_validators.py:114\u001b[39m, in \u001b[36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 111\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m check_use_auth_token:\n\u001b[32m 112\u001b[39m kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.\u001b[34m__name__\u001b[39m, has_token=has_token, kwargs=kwargs)\n\u001b[32m--> \u001b[39m\u001b[32m114\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/huggingface_hub/file_download.py:862\u001b[39m, in \u001b[36mhf_hub_download\u001b[39m\u001b[34m(repo_id, filename, subfolder, repo_type, revision, library_name, library_version, cache_dir, local_dir, user_agent, force_download, proxies, etag_timeout, token, local_files_only, headers, endpoint, resume_download, force_filename, local_dir_use_symlinks)\u001b[39m\n\u001b[32m 842\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m _hf_hub_download_to_local_dir(\n\u001b[32m 843\u001b[39m \u001b[38;5;66;03m# Destination\u001b[39;00m\n\u001b[32m 844\u001b[39m local_dir=local_dir,\n\u001b[32m (...)\u001b[39m\u001b[32m 859\u001b[39m local_files_only=local_files_only,\n\u001b[32m 860\u001b[39m )\n\u001b[32m 861\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m862\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_hf_hub_download_to_cache_dir\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 863\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Destination\u001b[39;49;00m\n\u001b[32m 864\u001b[39m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 865\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# File info\u001b[39;49;00m\n\u001b[32m 866\u001b[39m \u001b[43m \u001b[49m\u001b[43mrepo_id\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrepo_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 867\u001b[39m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 868\u001b[39m \u001b[43m \u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 869\u001b[39m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m=\u001b[49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 870\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# HTTP 
info\u001b[39;49;00m\n\u001b[32m 871\u001b[39m \u001b[43m \u001b[49m\u001b[43mendpoint\u001b[49m\u001b[43m=\u001b[49m\u001b[43mendpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 872\u001b[39m \u001b[43m \u001b[49m\u001b[43metag_timeout\u001b[49m\u001b[43m=\u001b[49m\u001b[43metag_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 873\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43mhf_headers\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 874\u001b[39m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m=\u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 875\u001b[39m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 876\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Additional options\u001b[39;49;00m\n\u001b[32m 877\u001b[39m \u001b[43m \u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlocal_files_only\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 878\u001b[39m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 879\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/huggingface_hub/file_download.py:1011\u001b[39m, in \u001b[36m_hf_hub_download_to_cache_dir\u001b[39m\u001b[34m(cache_dir, repo_id, filename, repo_type, revision, endpoint, etag_timeout, headers, proxies, token, local_files_only, force_download)\u001b[39m\n\u001b[32m 1009\u001b[39m Path(lock_path).parent.mkdir(parents=\u001b[38;5;28;01mTrue\u001b[39;00m, exist_ok=\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[32m 1010\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m WeakFileLock(lock_path):\n\u001b[32m-> \u001b[39m\u001b[32m1011\u001b[39m \u001b[43m_download_to_tmp_and_move\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1012\u001b[39m \u001b[43m \u001b[49m\u001b[43mincomplete_path\u001b[49m\u001b[43m=\u001b[49m\u001b[43mPath\u001b[49m\u001b[43m(\u001b[49m\u001b[43mblob_path\u001b[49m\u001b[43m \u001b[49m\u001b[43m+\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m.incomplete\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1013\u001b[39m \u001b[43m \u001b[49m\u001b[43mdestination_path\u001b[49m\u001b[43m=\u001b[49m\u001b[43mPath\u001b[49m\u001b[43m(\u001b[49m\u001b[43mblob_path\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1014\u001b[39m \u001b[43m \u001b[49m\u001b[43murl_to_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43murl_to_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1015\u001b[39m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m=\u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1016\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1017\u001b[39m \u001b[43m \u001b[49m\u001b[43mexpected_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mexpected_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1018\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1019\u001b[39m \u001b[43m \u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m=\u001b[49m\u001b[43mforce_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1020\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1021\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m os.path.exists(pointer_path):\n\u001b[32m 1022\u001b[39m _create_symlink(blob_path, pointer_path, new_blob=\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/huggingface_hub/file_download.py:1547\u001b[39m, in \u001b[36m_download_to_tmp_and_move\u001b[39m\u001b[34m(incomplete_path, destination_path, url_to_download, proxies, headers, expected_size, filename, force_download)\u001b[39m\n\u001b[32m 1544\u001b[39m _check_disk_space(expected_size, incomplete_path.parent)\n\u001b[32m 1545\u001b[39m _check_disk_space(expected_size, destination_path.parent)\n\u001b[32m-> \u001b[39m\u001b[32m1547\u001b[39m \u001b[43mhttp_get\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1548\u001b[39m \u001b[43m \u001b[49m\u001b[43murl_to_download\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1549\u001b[39m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1550\u001b[39m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m=\u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1551\u001b[39m \u001b[43m \u001b[49m\u001b[43mresume_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mresume_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1552\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1553\u001b[39m \u001b[43m \u001b[49m\u001b[43mexpected_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mexpected_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1554\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1556\u001b[39m logger.info(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mDownload complete. Moving file to \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdestination_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 1557\u001b[39m _chmod_and_move(incomplete_path, destination_path)\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/huggingface_hub/file_download.py:471\u001b[39m, in \u001b[36mhttp_get\u001b[39m\u001b[34m(url, temp_file, proxies, resume_size, headers, expected_size, displayed_filename, _nb_retries, _tqdm_bar)\u001b[39m\n\u001b[32m 469\u001b[39m time.sleep(\u001b[32m1\u001b[39m)\n\u001b[32m 470\u001b[39m reset_sessions() \u001b[38;5;66;03m# In case of SSLError it's best to reset the shared requests.Session objects\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m471\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mhttp_get\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 472\u001b[39m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m=\u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 473\u001b[39m \u001b[43m \u001b[49m\u001b[43mtemp_file\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtemp_file\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 474\u001b[39m \u001b[43m \u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m=\u001b[49m\u001b[43mproxies\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 475\u001b[39m \u001b[43m \u001b[49m\u001b[43mresume_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mnew_resume_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 476\u001b[39m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m=\u001b[49m\u001b[43minitial_headers\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 477\u001b[39m \u001b[43m \u001b[49m\u001b[43mexpected_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mexpected_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 478\u001b[39m \u001b[43m \u001b[49m\u001b[43m_nb_retries\u001b[49m\u001b[43m=\u001b[49m\u001b[43m_nb_retries\u001b[49m\u001b[43m \u001b[49m\u001b[43m-\u001b[49m\u001b[43m \u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 479\u001b[39m \u001b[43m \u001b[49m\u001b[43m_tqdm_bar\u001b[49m\u001b[43m=\u001b[49m\u001b[43m_tqdm_bar\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 480\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 
482\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m expected_size \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m expected_size != temp_file.tell():\n\u001b[32m 483\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mEnvironmentError\u001b[39;00m(\n\u001b[32m 484\u001b[39m consistency_error_message.format(\n\u001b[32m 485\u001b[39m actual_size=temp_file.tell(),\n\u001b[32m 486\u001b[39m )\n\u001b[32m 487\u001b[39m )\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/huggingface_hub/file_download.py:454\u001b[39m, in \u001b[36mhttp_get\u001b[39m\u001b[34m(url, temp_file, proxies, resume_size, headers, expected_size, displayed_filename, _nb_retries, _tqdm_bar)\u001b[39m\n\u001b[32m 452\u001b[39m new_resume_size = resume_size\n\u001b[32m 453\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m454\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mr\u001b[49m\u001b[43m.\u001b[49m\u001b[43miter_content\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunk_size\u001b[49m\u001b[43m=\u001b[49m\u001b[43mconstants\u001b[49m\u001b[43m.\u001b[49m\u001b[43mDOWNLOAD_CHUNK_SIZE\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 455\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# filter out keep-alive new chunks\u001b[39;49;00m\n\u001b[32m 456\u001b[39m \u001b[43m \u001b[49m\u001b[43mprogress\u001b[49m\u001b[43m.\u001b[49m\u001b[43mupdate\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n",
|
||
"\u001b[36mFile \u001b[39m\u001b[32m~/dev/rag/.venv/lib/python3.12/site-packages/requests/models.py:822\u001b[39m, in \u001b[36mResponse.iter_content.<locals>.generate\u001b[39m\u001b[34m()\u001b[39m\n\u001b[32m 820\u001b[39m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[38;5;28mself\u001b[39m.raw.stream(chunk_size, decode_content=\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[32m 821\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m ProtocolError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m--> \u001b[39m\u001b[32m822\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m ChunkedEncodingError(e)\n\u001b[32m 823\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m DecodeError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 824\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m ContentDecodingError(e)\n",
|
||
"\u001b[31mChunkedEncodingError\u001b[39m: ('Connection broken: IncompleteRead(46334378 bytes read, 33976465 more expected)', IncompleteRead(46334378 bytes read, 33976465 more expected))"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"docs = loader.load()\n",
|
||
"docs[0]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"from PIL import Image oa\n",
|
||
"import pytesseract\n",
|
||
"\n",
|
||
"# If you don't have tesseract executable in your PATH, include the following:\n",
|
||
"pytesseract.pytesseract.tesseract_cmd = r’<full_path_to_your_tesseract_executable>*\n",
|
||
"‘# Example tesseract_cmd = r’C:\\Program Files (x86)\\Tesseract-OCR\\tesseract’\n",
|
||
"\n",
|
||
"‘# Simple image to string\n",
|
||
"print(pytesseract. image to_string(Image.open( ‘test .png’)))\n",
|
||
"\n",
|
||
"# In order to bypass the image conversions of pytesseract, just use relative or absolute image path\n",
|
||
"# NOTE: In this case you should provide tesseract supported images or tesseract will return error\n",
|
||
"print (pytesseract.image_to_string(‘test.png\"))\n",
|
||
"\n",
|
||
"# List of available languages\n",
|
||
"\n",
|
||
"print (pytesseract.get_languages(config=\"*))\n",
|
||
"\n",
|
||
"# French text image to string\n",
|
||
"print (pytesseract. image_to_string(Image.open(‘test-european. jpg’), lang=\"fra’))\n",
|
||
"\n",
|
||
"# Batch processing with a single file containing the list of multiple image file paths\n",
|
||
"print (pytesseract. image_to_string(’images.txt\"))\n",
|
||
"\n",
|
||
"# Timeout/terminate the tesseract job after a period of time\n",
|
||
"try:\n",
|
||
"\n",
|
||
"print (pytesseract.image_to_string(‘test. jpg’, timeout-2)) # Timeout after 2 seconds\n",
|
||
"\n",
|
||
"print (pytesseract. image to_string(‘test.jpg\", timeout=2.5)) # Timeout after half a second\n",
|
||
"except Runtime€rror as timeout_error:\n",
|
||
"\n",
|
||
"# Tesseract processing is terminated\n",
|
||
"\n",
|
||
"pass\n",
|
||
"\n",
|
||
"# Get bounding box estimates\n",
|
||
"print (pytesseract. image_to_boxes(Image.open(‘test.png’)))\n",
|
||
"\n",
|
||
"# Get verbose data including boxes, confidences, line and page numbers\n",
|
||
"print (pytesseract.image_to_data(Inage.open(‘test.png’)))\n",
|
||
"\n",
|
||
"# Get information about orientation and script detection\n",
|
||
"print (pytesseract..image_to_osd(Image.open( ‘test.png\")))\n",
|
||
"\n",
|
||
"# Get a searchable PDF\n",
|
||
"pdf = pytesseract.image_to_pdf_or_hocr(‘test.png’, extension='\n",
|
||
"\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"# OCR smoke test: run Tesseract on one image and print the recognised text.\n",
"IMG_path = r\"F:\\Dev\\Rag\\Rag_Modeling\\document\\test.png\"\n",
"\n",
"# Open the image in a context manager so its file handle is closed once OCR is done.\n",
"with Image.open(IMG_path) as ocr_image:\n",
"    print(pytesseract.image_to_string(ocr_image))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/home/sepehr/dev/rag/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||
" from .autonotebook import tqdm as notebook_tqdm\n",
|
||
"The PDF <_io.BufferedReader name='/home/sepehr/dev/rag/document/11_chapitre3.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"33.. CCHHAAPPIITTRREE 33 ::\n",
|
||
"\n",
|
||
"MMOODDÉÉLLIISSAATTIIOONN 11DD DDEESS IINNJJEECCTTEEUURRSS CCOONNDDEENNSSEEUURRSS\n",
|
||
"\n",
|
||
"92\n",
|
||
"\n",
|
||
"Chapitre 3 : Modélisation 1D des injecteurs condenseurs\n",
|
||
"\n",
|
||
"3-1 MODÉLISATION 0D DE L’IC\n",
|
||
"\n",
|
||
"La modélisation 0D consiste à donner une approche théorique de type global des IC en simplifiant au maximum la physique des phénomènes intervenant dans le processus de fonctionnement. Ce modèle est basé sur des bilans globaux entrée/sortie de masse, de quantité de mouvement et d'énergie. Il permet d'estimer les caractéristiques d'un injecteur, en termes de performance et de limites de fonctionnement tout en restant simple avec un volume de calcul réduit. Il nécessite la prise en compte d'une loi de fermeture expérimentale. Cette modélisation a déjà été entreprise auparavant par différents auteurs : [Rose1960] ; [Cattadori1993] ; [Narabayashi1994] ; [Soplenkov1995] ; et [Deberne2000]. Nous partirons de la modélisation décrite dans [Deberne2000], référence à laquelle nous nous reporterons pour plus de détails. Elle traite de l'IC fonctionnant avec une injection de liquide centrale. Beithou [Beithou2000] a proposé aussi un modèle 0D stationnaire et simplifié de la chambre de mélange pour un IC à injection de vapeur centrale, qui donne de bons résultats. Par contre, l'auteur suppose que l'écoulement dans la chambre de mélange de l'IC est isobare, et il ne traite pas l'onde de condensation en supposant que la condensation complète a lieu à la fin de la chambre de mélange (l'auteur impose l'évolution du taux de vide). Pour cela, Beithou considère 2 équations : l'équation de conservation de l'énergie et l'équation de la masse (les indices correspondent aux repères marqués sur la figure 3-1) :\n",
|
||
"\n",
|
||
"⎧ ⎪ ⎪⎪ ⎨ ⎪ ⎪ u ⎪ ⎩\n",
|
||
"\n",
|
||
"( hM 1 V 1 V M +\n",
|
||
"\n",
|
||
"2 50 u, 1 V M\n",
|
||
"\n",
|
||
"+\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"from unstructured.partition.auto import partition\n",
"\n",
"# NOTE(review): this chained assignment rebinds both `filename` and the notebook-level\n",
"# `pdf_path` - confirm that overwriting `pdf_path` is intended.\n",
"filename = pdf_path = \"/home/sepehr/dev/rag/document/11_chapitre3.pdf\"\n",
"elements = partition(filename=filename, content_type=\"application/pdf\")\n",
"\n",
"# Preview the first 10 elements, separated by blank lines.\n",
"# Slice before converting so only those 10 elements are turned into strings.\n",
"print(\"\\n\\n\".join(str(el) for el in elements[:10]))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"datetime.datetime(2021, 3, 26, 11, 4, 9, tzinfo=datetime.timezone(datetime.timedelta(seconds=43200)))"
|
||
]
|
||
},
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
"from unstructured.cleaners.extract import extract_datetimetz\n",
"\n",
"# Sample text that ends with a timestamp carrying a +1200 UTC offset.\n",
"text = \"\"\"from ABC.DEF.local ([ba23::58b5:2236:45g2:88h2]) by\n",
" \\n ABC.DEF.local2 ([ba23::58b5:2236:45g2:88h2%25]) with mapi id\\\n",
" n 32.88.5467.123; Fri, 26 Mar 2021 11:04:09 +1200\"\"\"\n",
"\n",
"# Expected result:\n",
"# datetime.datetime(2021, 3, 26, 11, 4, 9, tzinfo=datetime.timezone(datetime.timedelta(seconds=43200)))\n",
"parsed_timestamp = extract_datetimetz(text)\n",
"parsed_timestamp"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"['me@email.com', 'you@email.com']"
|
||
]
|
||
},
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
"from unstructured.cleaners.extract import extract_email_address\n",
"\n",
"# Sample text containing two e-mail addresses, one of them written with a capital letter.\n",
"text = \"\"\"Me me@email.com and You <You@email.com>\n",
" ([ba23::58b5:2236:45g2:88h2]) (10.0.2.01)\"\"\"\n",
"\n",
"# Expected result: ['me@email.com', 'you@email.com']\n",
"found_addresses = extract_email_address(text)\n",
"found_addresses"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 38,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
"import json\n",
"\n",
"from unstructured.partition.image import partition_image\n",
"\n",
"# Original example image:\n",
"# https://github.com/Unstructured-IO/unstructured-ingest/blob/main/example-docs/img/english-and-korean.png\n",
"# This cell processes a local image instead, given as an absolute path.\n",
"filename = \"/home/sepehr/dev/rag/document/test2.png\"\n",
"\n",
"# Partition the image with the \"ocr_only\" strategy.\n",
"ocr_options = {\n",
"    \"strategy\": \"ocr_only\",\n",
"    \"languages\": [\"eng\", \"fr\"],  # Language codes differ by the OCR agent used.\n",
"}\n",
"elements = partition_image(filename=filename, **ocr_options)\n",
"\n",
"# One plain dictionary per returned element, ready for printing or saving.\n",
"element_dicts = [el.to_dict() for el in elements]\n",
"\n",
"# To inspect or persist the result, pass `element_dicts` to json.dumps / json.dump with indent=2."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 40,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
"from unstructured_inference.inference.layout import DocumentLayout\n",
"from unstructured_inference.models.base import get_model\n",
"\n",
"# Load the \"yolox\" detection model, then build `layout` from the PDF with it.\n",
"model = get_model(\"yolox\")\n",
"layout = DocumentLayout.from_file(\n",
"    \"/home/sepehr/dev/rag/document/04Extrait_Methodologie_Experimentale.pdf\",\n",
"    detection_model=model,\n",
")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 48,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Nombre de pages: 54\n",
|
||
"Éléments sur la première page: 4\n",
|
||
"Picture: 49 elements\n",
|
||
"Caption: 11 elements\n",
|
||
"Text: 191 elements\n",
|
||
"Section-header: 50 elements\n",
|
||
"Page-header: 17 elements\n",
|
||
"Table: 57 elements\n",
|
||
"Title: 24 elements\n",
|
||
"Formula: 1 elements\n",
|
||
"Page-footer: 29 elements\n",
|
||
"List-item: 1 elements\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"from collections import defaultdict\n",
"\n",
"print(f\"Nombre de pages: {len(layout.pages)}\")\n",
"\n",
"# Inspect the first page\n",
"first_page = layout.pages[0]\n",
"print(f\"Éléments sur la première page: {len(first_page.elements)}\")\n",
"\n",
"# Bucket every element of every page under its type\n",
"element_types = defaultdict(list)\n",
"for page in layout.pages:\n",
"    for element in page.elements:\n",
"        element_types[element.type].append(element)\n",
"\n",
"# Report how many elements were found for each type\n",
"for elem_type, elems in element_types.items():\n",
"    print(f\"{elem_type}: {len(elems)} elements\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import matplotlib.pyplot as plt\n",
|
||
"import matplotlib.patches as patches\n",
|
||
"from PIL import Image\n",
|
||
"import numpy as np\n",
|
||
"import fitz # PyMuPDF\n",
|
||
"\n",
"def _bbox_to_xywh(bbox):\n",
"    \"\"\"Return ``(x, y, width, height)`` for a bounding box.\n",
"\n",
"    Handles objects exposing ``x0/y0/x1/y1``, objects exposing ``x1/y1/x2/y2``,\n",
"    and anything that unpacks to ``(left, top, right, bottom)``.\n",
"\n",
"    Raises:\n",
"        TypeError, ValueError: if ``bbox`` has none of those shapes.\n",
"    \"\"\"\n",
"    if hasattr(bbox, 'x0'):\n",
"        return bbox.x0, bbox.y0, bbox.x1 - bbox.x0, bbox.y1 - bbox.y0\n",
"    if all(hasattr(bbox, attr) for attr in ('x1', 'y1', 'x2', 'y2')):\n",
"        # NOTE(review): presumably the corner naming of unstructured_inference's Rectangle - confirm.\n",
"        return bbox.x1, bbox.y1, bbox.x2 - bbox.x1, bbox.y2 - bbox.y1\n",
"    left, top, right, bottom = bbox\n",
"    return left, top, right - left, bottom - top\n",
"\n",
"\n",
"def visualize_layout(layout, page_num=0, pdf_path=None):\n",
"    \"\"\"Render one PDF page and overlay the layout elements detected on it.\n",
"\n",
"    Args:\n",
"        layout: Object with a ``pages`` sequence; each page has ``elements`` that may\n",
"            carry ``bbox`` and ``type`` attributes.\n",
"        page_num: Zero-based index, used for both the PDF page and ``layout.pages``.\n",
"        pdf_path: PDF to render. ``None`` (the default) keeps the path that was\n",
"            previously hard-coded here, so existing calls are unaffected.\n",
"\n",
"    Elements whose box cannot be read are reported on stdout and skipped.\n",
"    \"\"\"\n",
"    if pdf_path is None:\n",
"        pdf_path = \"/home/sepehr/dev/rag/document/04Extrait_Methodologie_Experimentale.pdf\"\n",
"\n",
"    # Render the page with PyMuPDF; the context manager closes the PDF even if rendering fails.\n",
"    with fitz.open(pdf_path) as pdf_document:\n",
"        pix = pdf_document[page_num].get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x zoom for better quality\n",
"        # Convert to numpy array for matplotlib\n",
"        img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)\n",
"        rendered_width, rendered_height = pix.w, pix.h\n",
"\n",
"    page_layout = layout.pages[page_num]\n",
"\n",
"    # The page is rendered at a fixed 2x zoom, which need not match the pixel grid the boxes\n",
"    # were measured on.\n",
"    # NOTE(review): assumes `image_metadata` (when present) holds the width/height of the image\n",
"    # the boxes refer to - confirm. Without it the boxes are drawn unscaled, as before.\n",
"    scale_x = scale_y = 1.0\n",
"    image_metadata = getattr(page_layout, 'image_metadata', None)\n",
"    if isinstance(image_metadata, dict) and image_metadata.get('width') and image_metadata.get('height'):\n",
"        scale_x = rendered_width / image_metadata['width']\n",
"        scale_y = rendered_height / image_metadata['height']\n",
"\n",
"    # Create figure and axis\n",
"    fig, ax = plt.subplots(1, figsize=(12, 16))\n",
"    ax.imshow(img)\n",
"\n",
"    # Colors per element type; any type not listed is drawn in gray.\n",
"    colors = {'Title': 'red', 'Text': 'blue', 'Table': 'green',\n",
"              'Figure': 'orange', 'List': 'purple', 'Header': 'cyan',\n",
"              'Footer': 'magenta',\n",
"              # Labels this notebook's layout actually reports (see the per-type counts cell).\n",
"              'Picture': 'orange', 'List-item': 'purple', 'Page-header': 'cyan',\n",
"              'Page-footer': 'magenta', 'Section-header': 'darkred',\n",
"              'Caption': 'brown', 'Formula': 'olive'}\n",
"\n",
"    # Draw bounding boxes for elements\n",
"    for element in page_layout.elements:\n",
"        if not hasattr(element, 'bbox'):\n",
"            continue\n",
"        try:\n",
"            x, y, width, height = _bbox_to_xywh(element.bbox)\n",
"            x, width = x * scale_x, width * scale_x\n",
"            y, height = y * scale_y, height * scale_y\n",
"\n",
"            elem_type = getattr(element, 'type', 'Unknown')\n",
"            color = colors.get(elem_type, 'gray')\n",
"\n",
"            ax.add_patch(patches.Rectangle((x, y), width, height,\n",
"                                           linewidth=1, edgecolor=color, facecolor='none'))\n",
"\n",
"            # Add label for the element type\n",
"            ax.text(x, y, f\"{elem_type}\", color='white',\n",
"                    backgroundcolor=color, fontsize=8)\n",
"        except (AttributeError, TypeError, ValueError) as e:\n",
"            # Best effort: one unreadable element must not abort the whole figure.\n",
"            print(f\"Error processing element: {e}\")\n",
"\n",
"    ax.set_title(f\"Page {page_num+1} Layout\")\n",
"    fig.tight_layout()\n",
"    plt.show()\n",
|
||
"\n",
|
||
"# Install required package if needed\n",
|
||
"# !pip install pymupdf\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 69,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"ename": "UnboundLocalError",
|
||
"evalue": "cannot access local variable 'element' where it is not associated with a value",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||
"\u001b[31mUnboundLocalError\u001b[39m Traceback (most recent call last)",
|
||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[69]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mvisualize_layout\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlayout\u001b[49m\u001b[43m,\u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m)\u001b[49m\n",
|
||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[68]\u001b[39m\u001b[32m, line 30\u001b[39m, in \u001b[36mvisualize_layout\u001b[39m\u001b[34m(layout, page_num)\u001b[39m\n\u001b[32m 28\u001b[39m \u001b[38;5;66;03m# Draw bounding boxes for elements\u001b[39;00m\n\u001b[32m 29\u001b[39m page = layout.pages[page_num]\n\u001b[32m---> \u001b[39m\u001b[32m30\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[43melement\u001b[49m)\n\u001b[32m 31\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m element \u001b[38;5;129;01min\u001b[39;00m page.elements:\n\u001b[32m 32\u001b[39m \u001b[38;5;66;03m# Access bbox properties correctly based on the Rectangle object structure\u001b[39;00m\n\u001b[32m 33\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m 34\u001b[39m \u001b[38;5;66;03m# Try to access rectangle coordinates\u001b[39;00m\n",
|
||
"\u001b[31mUnboundLocalError\u001b[39m: cannot access local variable 'element' where it is not associated with a value"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"image/png": "",
|
||
"text/plain": [
|
||
"<Figure size 1200x1600 with 1 Axes>"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
"# Draw the detected layout of the second page (page_num is zero-based).\n",
"visualize_layout(layout, page_num=1)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 52,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"INTRODUCTION AUX PLANS D’EXPERIENCES\n",
|
||
"INTERET DES PEX : RAPPELS STATISTIQUES\n",
|
||
"\n",
|
||
"a Distribution normale : courbe représentative = courbe de Gauss variable\n",
|
||
"aléatoire x distribuée normalement avec moyenne = X, écart type = cet\n",
|
||
"Variance : V = 0”\n",
|
||
"\n",
|
||
"9)\n",
|
||
"\n",
|
||
"Re x Bo\n",
|
||
"a Théoréme des variances : variables aléatoires x;, x2, ... X, indépendantes\n",
|
||
"+ relation yr sagt ayX; + aX. +t ot anXy\n",
|
||
"\n",
|
||
"= Variance de y~ — V(y~) = 0 + a V(x1) + ax V(X0)+.... + ay V(Xn)\n",
|
||
"\n",
|
||
"Exemple : X = moyenne de n valeurs de x; ® =In [xp txt... +x,]\n",
|
||
"> V(¥) = In? [V(x)) + V(x2) +... + V(X,)]\n",
|
||
"Si variances égales entre elles : V(x1) = V0) =... = VO&q) =\n",
|
||
"\n",
|
||
"=> V(X)= Ino? dou (o(X) = oy/ vn\n",
|
||
"\n",
|
||
"a ERREUR SUR LA MOYENNE = ERREUR SUR UNE MESURE DIVISEE PAR\n",
|
||
"RACINE CARRREE DE n.\n",
|
||
"\n",
|
||
"\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"import pytesseract\n",
"from PIL import Image\n",
"\n",
"# OCR the image at `filename` and print the recognised text.\n",
"# The context manager closes the image file once the text has been extracted.\n",
"with Image.open(filename) as ocr_image:\n",
"    print(pytesseract.image_to_string(ocr_image))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
"# Partition `filename` again, this time with the \"hi_res\" strategy and the \"yolox\" model.\n",
"hi_res_options = {\"strategy\": \"hi_res\", \"hi_res_model_name\": \"yolox\"}\n",
"elements = partition(filename=filename, **hi_res_options)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"a Distribution normale : courbe représentative = courbe de Gauss variable aléatoire x distribuée normalement avec moyenne = X, écart type = cet Variance : V = 0”\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"# Print the second partitioned element.\n",
"second_element = elements[1]\n",
"print(second_element)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 34,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"ename": "TypeError",
|
||
"evalue": "a bytes-like object is required, not 'Image'",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||
"\u001b[31mTypeError\u001b[39m Traceback (most recent call last)",
|
||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[34]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m 4\u001b[39m \u001b[38;5;66;03m# Assuming element is an instance of unstructured.documents.elements.Image\u001b[39;00m\n\u001b[32m 5\u001b[39m image_data = element \u001b[38;5;66;03m# Convert the element to image data\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m image = Image.open(\u001b[43mio\u001b[49m\u001b[43m.\u001b[49m\u001b[43mBytesIO\u001b[49m\u001b[43m(\u001b[49m\u001b[43mimage_data\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[32m 7\u001b[39m image.show()\n",
|
||
"\u001b[31mTypeError\u001b[39m: a bytes-like object is required, not 'Image'"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"from PIL import Image\n",
"import io\n",
"import os\n",
"\n",
"# `element` is an element object, not image bytes, so feeding it to io.BytesIO raised\n",
"# \"a bytes-like object is required, not 'Image'\". Instead, crop the element's bounding\n",
"# box out of the file it was extracted from.\n",
"# NOTE(review): relies on `element` left over from an earlier cell - confirm it is the intended one.\n",
"element_metadata = element.to_dict()[\"metadata\"]\n",
"corner_points = element_metadata[\"coordinates\"][\"points\"]\n",
"xs = [point[0] for point in corner_points]\n",
"ys = [point[1] for point in corner_points]\n",
"source_image_path = os.path.join(element_metadata[\"file_directory\"], element_metadata[\"filename\"])\n",
"\n",
"# NOTE(review): assumes the coordinates are pixels of that file (system 'PixelSpace') - confirm.\n",
"with Image.open(source_image_path) as page_image:\n",
"    # Crop box is (left, upper, right, lower).\n",
"    image = page_image.crop((min(xs), min(ys), max(xs), max(ys)))\n",
"image.show()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 37,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"{'type': 'Image',\n",
|
||
" 'element_id': '68faf178bda657a69fc87460f25782c1',\n",
|
||
" 'text': '9) Re x Bo',\n",
|
||
" 'metadata': {'detection_class_prob': 0.820507287979126,\n",
|
||
" 'coordinates': {'points': ((99.514404296875, 233.28099060058594),\n",
|
||
" (99.514404296875, 533.5597534179688),\n",
|
||
" (651.2028198242188, 533.5597534179688),\n",
|
||
" (651.2028198242188, 233.28099060058594)),\n",
|
||
" 'system': 'PixelSpace',\n",
|
||
" 'layout_width': 761,\n",
|
||
" 'layout_height': 1096},\n",
|
||
" 'last_modified': '2025-03-01T11:08:55',\n",
|
||
" 'filetype': 'image/png',\n",
|
||
" 'languages': ['eng'],\n",
|
||
" 'page_number': 1,\n",
|
||
" 'file_directory': '/home/sepehr/dev/rag/document',\n",
|
||
" 'filename': 'test2.png'}}"
|
||
]
|
||
},
|
||
"execution_count": 37,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
"# Inspect the element's type, text and metadata as a plain dictionary.\n",
"element_as_dict = element.to_dict()\n",
"element_as_dict"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
"# Partition the sample JPEG with partition_image's default settings.\n",
"elements = partition_image(filename=\"example-docs/img/layout-parser-paper-fast.jpg\")"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": ".venv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.3"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|