# File-viewer header (extraction residue): 25 lines, 1.0 KiB, Python
from unstructured.partition.pdf import partition_pdf

# Exploratory script: partition a single PDF with `unstructured`, chunk the
# result by title, and print the original elements that were merged into one
# of the resulting chunks.

# Directory intended for extracted images. It is currently only mentioned by
# the disabled image-output option noted inside the call below.
output_path = "/home/sepehr/dev/rag/document/"
file_path = "/home/sepehr/dev/rag/document/04Extrait_Methodologie_Experimentale.pdf"

# Index of the chunk whose source elements are printed at the end.
CHUNK_INDEX = 3

# Reference: https://docs.unstructured.io/open-source/core-functionality/chunking
chunks = partition_pdf(
    filename=file_path,
    infer_table_structure=True,  # extract tables
    strategy="hi_res",  # mandatory to infer tables
    # Add "Table" to this list to also extract images of tables.
    extract_image_block_types=["Image"],
    # NOTE: an image output directory (`image_output_dir_path=output_path`) was
    # disabled here; without one, images and tables are kept as base64.
    # NOTE(review): confirm the current keyword name for this option.
    extract_image_block_to_payload=True,  # if True, extracts base64 for API usage
    chunking_strategy="by_title",  # or "basic"
    max_characters=10000,  # defaults to 500
    # NOTE(review): the original note says this defaults to 0 — confirm
    # against the chunking documentation referenced above.
    combine_text_under_n_chars=2000,
    new_after_n_chars=6000,
    # NOTE: `extract_images_in_pdf=True` is deprecated and intentionally not passed.
)

# Fail with a descriptive message instead of a bare "list index out of range"
# when the document yields fewer chunks than expected.
if CHUNK_INDEX >= len(chunks):
    raise IndexError(
        f"chunk index {CHUNK_INDEX} is out of range: partition_pdf produced "
        f"{len(chunks)} chunk(s) for {file_path!r}"
    )

print(chunks[CHUNK_INDEX].metadata.orig_elements)