# Partition a PDF into title-based chunks with `unstructured`, then print the
# original elements that were merged into one of the resulting chunks.
from unstructured.partition.pdf import partition_pdf

# Directory for extracted images; only needed if `image_output_dir_path`
# below is enabled.
output_path = "/home/sepehr/dev/rag/document/"
file_path = "/home/sepehr/dev/rag/document/04Extrait_Methodologie_Experimentale.pdf"

# Reference: https://docs.unstructured.io/open-source/core-functionality/chunking
chunks = partition_pdf(
    filename=file_path,
    infer_table_structure=True,  # extract tables
    strategy="hi_res",  # mandatory to infer tables
    # Add "Table" to this list to also extract images of tables.
    extract_image_block_types=["Image"],
    # Optional: image_output_dir_path=output_path
    # (if None, images and tables are saved as base64).
    extract_image_block_to_payload=True,  # if True, extracts base64 for API usage
    chunking_strategy="by_title",  # or "basic"
    max_characters=10000,  # defaults to 500
    combine_text_under_n_chars=2000,  # defaults to 0
    new_after_n_chars=6000,
    # NOTE: `extract_images_in_pdf=True` is deprecated; do not use it.
)

# Index of the chunk to inspect. Guarded because a short document may yield
# fewer chunks than this, and an unguarded `chunks[3]` would raise IndexError.
inspected_index = 3
if len(chunks) > inspected_index:
    print(chunks[inspected_index].metadata.orig_elements)
else:
    print(
        f"Only {len(chunks)} chunk(s) produced; "
        f"no chunk at index {inspected_index}."
    )