Update code from YouTube tutorial

https://www.youtube.com/watch?v=uLrReyH5cu0&t=3558s
This commit is contained in:
sepehr 2025-03-01 23:15:38 +01:00
parent f3704a3aa4
commit 918c9796a2
10 changed files with 1434 additions and 53 deletions

Binary file not shown.

Binary file not shown.

BIN
document/test2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 115 KiB

BIN
page_0.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 163 KiB

BIN
page_1.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 258 KiB

View File

@ -2,6 +2,7 @@
langchain>=0.0.267
langchain-community>=0.0.10
transformers>=4.30.0
langchain_community
# Document processing
unstructured>=0.10.0
@ -20,7 +21,7 @@ tabula-py>=2.7.0
# Data manipulation
pandas>=2.0.0
numpy>=1.24.0
numpy
# Visualization
matplotlib>=3.7.0
@ -37,4 +38,5 @@ sentence-transformers>=2.2.2
# Utilities
tqdm>=4.65.0
python-dotenv>=1.0.0
python-dotenv>=1.0.0
pi_heif

File diff suppressed because one or more lines are too long

25
testunstructuredPDF.py Normal file
View File

@ -0,0 +1,25 @@
from unstructured.partition.pdf import partition_pdf
# Exploratory script: partition a single PDF with unstructured's "hi_res"
# strategy, chunk the result by title, and print the original elements that
# were merged into one of the resulting chunks.

# Only referenced by the disabled `image_output_dir_path` option below.
output_path = "/home/sepehr/dev/rag/document/"
file_path = "/home/sepehr/dev/rag/document/04Extrait_Methodologie_Experimentale.pdf"

# Reference: https://docs.unstructured.io/open-source/core-functionality/chunking
chunks = partition_pdf(
    filename=file_path,
    infer_table_structure=True,  # extract tables
    strategy="hi_res",  # mandatory to infer tables
    extract_image_block_types=["Image"],  # add 'Table' to the list to extract images of tables
    # image_output_dir_path=output_path,  # if None, images and tables are saved as base64
    extract_image_block_to_payload=True,  # if true, extracts base64 for API usage
    chunking_strategy="by_title",  # or 'basic'
    max_characters=10000,  # defaults to 500
    combine_text_under_n_chars=2000,  # defaults to 0
    new_after_n_chars=6000,
    # extract_images_in_pdf=True,  # deprecated
)

# Inspect one chunk. The index is checked first because the number of chunks
# depends on the document: with the large chunk sizes configured above, a
# short PDF can yield fewer chunks than the index requested here, and an
# unguarded lookup would abort the script with an IndexError.
chunk_index = 3
if len(chunks) > chunk_index:
    print(chunks[chunk_index].metadata.orig_elements)
else:
    print(
        f"Only {len(chunks)} chunk(s) were produced; "
        f"there is no chunk at index {chunk_index}."
    )

863
testvideoYoutube.ipynb Normal file

File diff suppressed because one or more lines are too long