Update code from YouTube tutorial
https://www.youtube.com/watch?v=uLrReyH5cu0&t=3558s
This commit is contained in:
parent
f3704a3aa4
commit
918c9796a2
BIN
__pycache__/testunstructuredPDF.cpython-312.pyc
Normal file
BIN
__pycache__/testunstructuredPDF.cpython-312.pyc
Normal file
Binary file not shown.
BIN
document/NIPS-2017-attention-is-all-you-need-Paper.pdf
Normal file
BIN
document/NIPS-2017-attention-is-all-you-need-Paper.pdf
Normal file
Binary file not shown.
BIN
document/test2.png
Normal file
BIN
document/test2.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 115 KiB |
BIN
page_0.png
Normal file
BIN
page_0.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 163 KiB |
BIN
page_1.png
Normal file
BIN
page_1.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 258 KiB |
@ -2,6 +2,7 @@
|
||||
langchain>=0.0.267
|
||||
langchain-community>=0.0.10
|
||||
transformers>=4.30.0
|
||||
langchain_community
|
||||
|
||||
# Document processing
|
||||
unstructured>=0.10.0
|
||||
@ -20,7 +21,7 @@ tabula-py>=2.7.0
|
||||
|
||||
# Data manipulation
|
||||
pandas>=2.0.0
|
||||
numpy>=1.24.0
|
||||
numpy
|
||||
|
||||
# Visualization
|
||||
matplotlib>=3.7.0
|
||||
@ -37,4 +38,5 @@ sentence-transformers>=2.2.2
|
||||
|
||||
# Utilities
|
||||
tqdm>=4.65.0
|
||||
python-dotenv>=1.0.0
|
||||
python-dotenv>=1.0.0
|
||||
pi_heif
|
||||
Binary file not shown.
File diff suppressed because one or more lines are too long
25
testunstructuredPDF.py
Normal file
25
testunstructuredPDF.py
Normal file
@ -0,0 +1,25 @@
|
||||
from unstructured.partition.pdf import partition_pdf
|
||||
|
||||
# Exploratory script: partition one PDF with `unstructured`, chunk it by
# title, and print the original elements that make up one of the chunks.

# Directory for extracted images; only referenced by the commented-out
# `image_output_dir_path` option below.
output_path = "/home/sepehr/dev/rag/document/"
file_path = "/home/sepehr/dev/rag/document/04Extrait_Methodologie_Experimentale.pdf"

# Reference: https://docs.unstructured.io/open-source/core-functionality/chunking
chunks = partition_pdf(
    filename=file_path,
    infer_table_structure=True,  # extract tables
    strategy="hi_res",  # mandatory to infer tables

    extract_image_block_types=["Image"],  # add "Table" to the list to extract images of tables
    # image_output_dir_path=output_path,  # if None, images and tables will be saved in base64

    extract_image_block_to_payload=True,  # if True, will extract base64 for API usage

    chunking_strategy="by_title",  # or "basic"
    max_characters=10000,  # defaults to 500
    combine_text_under_n_chars=2000,  # defaults to 0
    new_after_n_chars=6000,

    # extract_images_in_pdf=True,  # deprecated
)

# Index of the chunk to inspect. The number of chunks depends on the input
# document, so guard the lookup instead of failing with a bare IndexError
# when the PDF produces fewer chunks than expected.
chunk_index = 3
if chunk_index < len(chunks):
    print(chunks[chunk_index].metadata.orig_elements)
else:
    print(f"Only {len(chunks)} chunk(s) produced; no chunk at index {chunk_index}.")
|
||||
863
testvideoYoutube.ipynb
Normal file
863
testvideoYoutube.ipynb
Normal file
File diff suppressed because one or more lines are too long
Loading…
x
Reference in New Issue
Block a user