Upload files to "/"
add progress bar plus folder inspect for pdf files
This commit is contained in:
parent
0cddd0842f
commit
819d3a0956
230
pdfProcessing.py
230
pdfProcessing.py
@ -7,7 +7,9 @@ from langchain.schema import Document
|
|||||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||||
from langchain_core.prompts import ChatPromptTemplate
|
from langchain_core.prompts import ChatPromptTemplate
|
||||||
from langchain_core.output_parsers import StrOutputParser
|
from langchain_core.output_parsers import StrOutputParser
|
||||||
|
import httpx
|
||||||
|
from tqdm import tqdm
|
||||||
|
http_client = httpx.Client(verify=False)
|
||||||
|
|
||||||
class PdfProcessor:
|
class PdfProcessor:
|
||||||
"""
|
"""
|
||||||
@ -80,6 +82,40 @@ class PdfProcessor:
|
|||||||
if not self.config["openai_api_key"]:
|
if not self.config["openai_api_key"]:
|
||||||
raise ValueError("OpenAI API key is required when using OpenAI models")
|
raise ValueError("OpenAI API key is required when using OpenAI models")
|
||||||
os.environ["OPENAI_API_KEY"] = self.config["openai_api_key"]
|
os.environ["OPENAI_API_KEY"] = self.config["openai_api_key"]
|
||||||
|
|
||||||
|
# Initialize Qdrant client
|
||||||
|
from qdrant_client import QdrantClient
|
||||||
|
from qdrant_client.http import models as rest
|
||||||
|
|
||||||
|
self.qdrant_client = QdrantClient(url=self.config["qdrant_url"])
|
||||||
|
|
||||||
|
# Check if collection exists and create it if not
|
||||||
|
collections = self.qdrant_client.get_collections().collections
|
||||||
|
collection_exists = any(collection.name == self.config["collection_name"] for collection in collections)
|
||||||
|
|
||||||
|
if not collection_exists:
|
||||||
|
# Get vector size based on embedding model
|
||||||
|
if self.config["embedding_provider"] == "ollama":
|
||||||
|
# For OllamaEmbeddings, typically 4096 dimensions for newer models
|
||||||
|
vector_size = 4096
|
||||||
|
else: # OpenAI
|
||||||
|
# OpenAI embedding dimensions vary by model
|
||||||
|
model_dimensions = {
|
||||||
|
"text-embedding-ada-002": 1536,
|
||||||
|
"text-embedding-3-small": 1536,
|
||||||
|
"text-embedding-3-large": 3072
|
||||||
|
}
|
||||||
|
vector_size = model_dimensions.get(self.config["openai_embedding_model"], 1536)
|
||||||
|
|
||||||
|
# Create the collection
|
||||||
|
self.qdrant_client.create_collection(
|
||||||
|
collection_name=self.config["collection_name"],
|
||||||
|
vectors_config=rest.VectorParams(
|
||||||
|
size=vector_size,
|
||||||
|
distance=rest.Distance.COSINE
|
||||||
|
)
|
||||||
|
)
|
||||||
|
print(f"Created new Qdrant collection: {self.config['collection_name']}")
|
||||||
|
|
||||||
def _setup_models(self):
|
def _setup_models(self):
|
||||||
"""Initialize models based on configuration."""
|
"""Initialize models based on configuration."""
|
||||||
@ -106,6 +142,7 @@ class PdfProcessor:
|
|||||||
else: # openai
|
else: # openai
|
||||||
from langchain_openai import ChatOpenAI
|
from langchain_openai import ChatOpenAI
|
||||||
self.summary_model = ChatOpenAI(
|
self.summary_model = ChatOpenAI(
|
||||||
|
http_client=http_client,
|
||||||
model=self.config["openai_summary_model"]
|
model=self.config["openai_summary_model"]
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -134,38 +171,45 @@ class PdfProcessor:
|
|||||||
Returns:
|
Returns:
|
||||||
Dictionary with processing statistics
|
Dictionary with processing statistics
|
||||||
"""
|
"""
|
||||||
# Load and extract content from PDF
|
# Create a master progress bar
|
||||||
print("Loading PDF and extracting elements...")
|
with tqdm(total=5, desc="PDF Processing", position=0) as master_bar:
|
||||||
documents = self._load_pdf(pdf_path)
|
# Load and extract content from PDF
|
||||||
|
master_bar.set_description("Loading PDF")
|
||||||
# Process text chunks
|
documents = self._load_pdf(pdf_path)
|
||||||
print("Processing text chunks...")
|
master_bar.update(1)
|
||||||
title_chunks = self._process_text(documents)
|
|
||||||
text_summaries = self._summarize_text(title_chunks)
|
# Process text chunks
|
||||||
processed_text = self._convert_text_to_documents(title_chunks, text_summaries)
|
master_bar.set_description("Processing text chunks")
|
||||||
|
title_chunks = self._process_text(documents)
|
||||||
# Process images if configured
|
text_summaries = self._summarize_text(title_chunks)
|
||||||
print("Processing images...")
|
processed_text = self._convert_text_to_documents(title_chunks, text_summaries)
|
||||||
processed_images = []
|
master_bar.update(1)
|
||||||
if self.config["extract_images"]:
|
|
||||||
images = self._extract_images(documents)
|
# Process images if configured
|
||||||
image_summaries = self._process_images(images)
|
master_bar.set_description("Processing images")
|
||||||
processed_images = self._convert_images_to_documents(images, image_summaries)
|
processed_images = []
|
||||||
|
if self.config["extract_images"]:
|
||||||
# Process tables if configured
|
images = self._extract_images(documents)
|
||||||
print("Processing tables...")
|
image_summaries = self._process_images(images)
|
||||||
processed_tables = []
|
processed_images = self._convert_images_to_documents(images, image_summaries)
|
||||||
if self.config["extract_tables"]:
|
master_bar.update(1)
|
||||||
tables = self._extract_tables(documents)
|
|
||||||
table_summaries = self._process_tables(tables)
|
# Process tables if configured
|
||||||
processed_tables = self._convert_tables_to_documents(tables, table_summaries)
|
master_bar.set_description("Processing tables")
|
||||||
|
processed_tables = []
|
||||||
print("Storing processed elements in Qdrant...")
|
if self.config["extract_tables"]:
|
||||||
# Combine all processed elements
|
tables = self._extract_tables(documents)
|
||||||
final_documents = processed_text + processed_images + processed_tables
|
table_summaries = self._process_tables(tables)
|
||||||
|
processed_tables = self._convert_tables_to_documents(tables, table_summaries)
|
||||||
# Store in Qdrant
|
master_bar.update(1)
|
||||||
self._store_documents(final_documents)
|
|
||||||
|
master_bar.set_description("Storing in Qdrant")
|
||||||
|
# Combine all processed elements
|
||||||
|
final_documents = processed_text + processed_images + processed_tables
|
||||||
|
|
||||||
|
# Store in Qdrant
|
||||||
|
self._store_documents(final_documents)
|
||||||
|
master_bar.update(1)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"text_chunks": len(processed_text),
|
"text_chunks": len(processed_text),
|
||||||
@ -199,7 +243,15 @@ class PdfProcessor:
|
|||||||
|
|
||||||
def _summarize_text(self, chunks: List[Document]) -> List[str]:
    """
    Generate one summary per text chunk, showing a progress bar.

    The chunks are sent to ``self.summarize_chain`` in small batches so
    that requests inside a batch still run concurrently, while the
    progress bar advances after every batch.

    Args:
        chunks: Text chunks whose ``page_content`` is summarized.

    Returns:
        The summaries, in the same order as ``chunks``. An empty list is
        returned when there is nothing to summarize.
    """
    if not chunks:
        return []

    print(f"Summarizing {len(chunks)} text chunks...")

    # Concurrency level used by the earlier single-batch implementation.
    batch_size = 3
    summaries: List[str] = []

    with tqdm(total=len(chunks), desc="Text summarization", leave=False) as bar:
        for start in range(0, len(chunks), batch_size):
            texts = [chunk.page_content for chunk in chunks[start:start + batch_size]]
            # Relies on batch() returning outputs in input order, exactly
            # as the earlier single-batch version did.
            summaries.extend(
                self.summarize_chain.batch(texts, {"max_concurrency": batch_size})
            )
            bar.update(len(texts))

    return summaries
|
||||||
|
|
||||||
def _extract_images(self, documents: List[Document]) -> List[Dict[str, Any]]:
|
def _extract_images(self, documents: List[Document]) -> List[Dict[str, Any]]:
|
||||||
"""Extract images with captions from documents."""
|
"""Extract images with captions from documents."""
|
||||||
@ -225,12 +277,17 @@ class PdfProcessor:
|
|||||||
|
|
||||||
def _process_images(self, images: List[Dict[str, Any]]) -> List[str]:
|
def _process_images(self, images: List[Dict[str, Any]]) -> List[str]:
|
||||||
"""Generate descriptions for images using configured model."""
|
"""Generate descriptions for images using configured model."""
|
||||||
|
if not images:
|
||||||
|
return []
|
||||||
|
|
||||||
|
print(f"Processing {len(images)} images...")
|
||||||
|
|
||||||
if self.config["image_provider"] == "ollama":
|
if self.config["image_provider"] == "ollama":
|
||||||
from ollama import Client
|
from ollama import Client
|
||||||
client = Client(host=self.config["ollama_image_url"])
|
client = Client(host=self.config["ollama_image_url"])
|
||||||
|
|
||||||
image_summaries = []
|
image_summaries = []
|
||||||
for img in images:
|
for img in tqdm(images, desc="Image processing", leave=False):
|
||||||
prompt = f"Caption of image: {img.get('caption', '')}. Describe this image in detail in {self.config['summary_language']}."
|
prompt = f"Caption of image: {img.get('caption', '')}. Describe this image in detail in {self.config['summary_language']}."
|
||||||
response = client.chat(
|
response = client.chat(
|
||||||
model=self.config["ollama_image_model"],
|
model=self.config["ollama_image_model"],
|
||||||
@ -261,9 +318,17 @@ class PdfProcessor:
|
|||||||
]
|
]
|
||||||
|
|
||||||
prompt = ChatPromptTemplate.from_messages(messages)
|
prompt = ChatPromptTemplate.from_messages(messages)
|
||||||
chain = prompt | ChatOpenAI(model=self.config["openai_image_model"]) | StrOutputParser()
|
chain = prompt | ChatOpenAI(model=self.config["openai_image_model"], http_client=http_client) | StrOutputParser()
|
||||||
|
|
||||||
return chain.batch([{"image_base64": img["image_base64"], "caption": img.get("caption", "")} for img in images])
|
# Process images with progress bar
|
||||||
|
results = []
|
||||||
|
image_data = [{"image_base64": img["image_base64"], "caption": img.get("caption", "")} for img in images]
|
||||||
|
|
||||||
|
for img_data in tqdm(image_data, desc="Image processing", leave=False):
|
||||||
|
result = chain.invoke(img_data)
|
||||||
|
results.append(result)
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
def _extract_tables(self, documents: List[Document]) -> List[Dict[str, Any]]:
|
def _extract_tables(self, documents: List[Document]) -> List[Dict[str, Any]]:
|
||||||
"""Extract tables with captions from documents."""
|
"""Extract tables with captions from documents."""
|
||||||
@ -290,9 +355,13 @@ class PdfProcessor:
|
|||||||
|
|
||||||
def _process_tables(self, tables: List[Dict[str, Any]]) -> List[str]:
|
def _process_tables(self, tables: List[Dict[str, Any]]) -> List[str]:
|
||||||
"""Generate summaries for tables."""
|
"""Generate summaries for tables."""
|
||||||
|
if not tables:
|
||||||
|
return []
|
||||||
|
|
||||||
|
print(f"Processing {len(tables)} tables...")
|
||||||
table_summaries = []
|
table_summaries = []
|
||||||
|
|
||||||
for table in tables:
|
for table in tqdm(tables, desc="Table processing", leave=False):
|
||||||
prompt = f"""Caption of table: {table.get('caption', '')}.
|
prompt = f"""Caption of table: {table.get('caption', '')}.
|
||||||
Describe this table in detail in {self.config['summary_language']}.
|
Describe this table in detail in {self.config['summary_language']}.
|
||||||
Table content: {table.get('table_data', '')}"""
|
Table content: {table.get('table_data', '')}"""
|
||||||
@ -481,11 +550,86 @@ class PdfProcessor:
|
|||||||
final_chunks.extend(sub_chunks)
|
final_chunks.extend(sub_chunks)
|
||||||
|
|
||||||
return final_chunks
|
return final_chunks
|
||||||
|
|
||||||
|
def process_directory(self, directory_path: str) -> Dict[str, Any]:
    """
    Process every PDF file located directly inside a directory.

    Each file is passed to ``process_pdf``. A failure on one file is
    reported and recorded, then processing continues with the next file.

    Args:
        directory_path: Path to the directory containing PDF files.
            Sub-directories are not searched.

    Returns:
        Dictionary with aggregate statistics: ``files_processed``,
        ``files_failed``, ``total_text_chunks``, ``total_image_chunks``,
        ``total_table_chunks``, ``total_chunks``, ``collection_name``,
        ``file_details`` (one entry per successfully processed file) and
        ``errors`` (one entry per failed file). The same keys are present
        even when the directory contains no PDF file.

    Raises:
        ValueError: If ``directory_path`` is not an existing directory.
    """
    if not os.path.isdir(directory_path):
        raise ValueError(f"Directory not found: {directory_path}")

    # Built up front so that every return path yields the same shape.
    overall_stats: Dict[str, Any] = {
        "files_processed": 0,
        "files_failed": 0,
        "total_text_chunks": 0,
        "total_image_chunks": 0,
        "total_table_chunks": 0,
        "total_chunks": 0,
        "collection_name": self.config["collection_name"],
        "file_details": [],
        "errors": [],
    }

    # List the directory rather than globbing on the joined path: glob
    # would interpret "[", "]", "*" or "?" in the directory name as
    # pattern syntax. Matching the extension case-insensitively also picks
    # up "*.PDF", and sorting makes the processing order reproducible.
    # Hidden files (leading dot) are skipped, as "*.pdf" globbing did.
    candidates = (
        os.path.join(directory_path, name)
        for name in sorted(os.listdir(directory_path))
        if name.lower().endswith(".pdf") and not name.startswith(".")
    )
    pdf_files = [path for path in candidates if os.path.isfile(path)]

    if not pdf_files:
        print(f"No PDF files found in {directory_path}")
        return overall_stats

    # Per-file result key -> aggregate counter it feeds.
    counters = (
        ("text_chunks", "total_text_chunks"),
        ("image_chunks", "total_image_chunks"),
        ("table_chunks", "total_table_chunks"),
        ("total_chunks", "total_chunks"),
    )

    print(f"Found {len(pdf_files)} PDF files in {directory_path}")
    for pdf_file in tqdm(pdf_files, desc="Processing PDF files", unit="file"):
        filename = os.path.basename(pdf_file)
        try:
            print(f"\nProcessing: {filename}")
            result = self.process_pdf(pdf_file)
        except Exception as e:
            # Per-file boundary: one unreadable PDF must not abort the
            # whole run, but the failure is kept in the statistics.
            print(f"Error processing {pdf_file}: {str(e)}")
            overall_stats["files_failed"] += 1
            overall_stats["errors"].append({"filename": filename, "error": str(e)})
            continue

        file_detail = {"filename": filename}
        for result_key, total_key in counters:
            count = result.get(result_key, 0)
            file_detail[result_key] = count
            overall_stats[total_key] += count

        overall_stats["files_processed"] += 1
        overall_stats["file_details"].append(file_detail)

        print(f"Completed: {file_detail['filename']} - {file_detail['total_chunks']} chunks processed")

    print("\nDirectory processing complete!")
    print(f"Processed {overall_stats['files_processed']} files")
    if overall_stats["files_failed"]:
        print(f"Failed: {overall_stats['files_failed']} files")
    print(f"Total chunks: {overall_stats['total_chunks']}")
    print(f" - Text chunks: {overall_stats['total_text_chunks']}")
    print(f" - Image chunks: {overall_stats['total_image_chunks']}")
    print(f" - Table chunks: {overall_stats['total_table_chunks']}")
    print(f"All content stored in collection: {overall_stats['collection_name']}")

    return overall_stats
|
||||||
|
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
if __name__ == "__main__":
    # Example run: index every PDF of a local folder into one collection.
    # The guard keeps this from executing when the module is imported.
    #
    # Credentials must never be written in this file. When an OpenAI
    # provider is selected, pass the key from the environment instead, e.g.
    #     "openai_api_key": os.environ["OPENAI_API_KEY"]
    # NOTE(review): a key was previously committed at this spot; it should
    # be revoked and rotated.
    processor = PdfProcessor({
        "collection_name": "my_control_and calibration",
        "summary_language": "English",
    })

    results = processor.process_directory(r"C:\Users\serameza\host-data")
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user