diff --git a/pdfProcessing.py b/pdfProcessing.py
index 42dbb77..a72e065 100644
--- a/pdfProcessing.py
+++ b/pdfProcessing.py
@@ -7,7 +7,11 @@
 from langchain.schema import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
-
+import glob
+import httpx
+from tqdm import tqdm
+# verify=False disables TLS certificate checks; use only behind a trusted proxy.
+http_client = httpx.Client(verify=False)
 class PdfProcessor:
     """
@@ -80,6 +84,40 @@ class PdfProcessor:
         if not self.config["openai_api_key"]:
             raise ValueError("OpenAI API key is required when using OpenAI models")
         os.environ["OPENAI_API_KEY"] = self.config["openai_api_key"]
+
+        # Initialize Qdrant client
+        from qdrant_client import QdrantClient
+        from qdrant_client.http import models as rest
+
+        self.qdrant_client = QdrantClient(url=self.config["qdrant_url"])
+
+        # Check if collection exists and create it if not
+        collections = self.qdrant_client.get_collections().collections
+        collection_exists = any(collection.name == self.config["collection_name"] for collection in collections)
+
+        if not collection_exists:
+            # Get vector size based on embedding model
+            if self.config["embedding_provider"] == "ollama":
+                # For OllamaEmbeddings, typically 4096 dimensions for newer models
+                vector_size = 4096
+            else:  # OpenAI
+                # OpenAI embedding dimensions vary by model
+                model_dimensions = {
+                    "text-embedding-ada-002": 1536,
+                    "text-embedding-3-small": 1536,
+                    "text-embedding-3-large": 3072
+                }
+                vector_size = model_dimensions.get(self.config["openai_embedding_model"], 1536)
+
+            # Create the collection
+            self.qdrant_client.create_collection(
+                collection_name=self.config["collection_name"],
+                vectors_config=rest.VectorParams(
+                    size=vector_size,
+                    distance=rest.Distance.COSINE
+                )
+            )
+            print(f"Created new Qdrant collection: {self.config['collection_name']}")
 
     def _setup_models(self):
         """Initialize models based on configuration."""
@@ -106,6 +144,7 @@ class PdfProcessor:
         else:  # openai
             from langchain_openai import ChatOpenAI
             self.summary_model = ChatOpenAI(
+                http_client=http_client,
                 model=self.config["openai_summary_model"]
             )
 
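
Note on the shared httpx client: rather than disabling TLS verification globally, the client can be pointed at the intercepting proxy's CA bundle. A minimal sketch; the bundle path and environment variable are illustrative assumptions, not part of this patch:

    # Sketch: trust the proxy's CA instead of turning certificate checks off.
    # The bundle path and the environment variable name are assumptions.
    import os
    import httpx

    ca_bundle = os.environ.get("REQUESTS_CA_BUNDLE", "/etc/ssl/certs/corp-proxy-ca.pem")
    http_client = httpx.Client(verify=ca_bundle, timeout=60.0)
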
@@ -134,38 +173,45 @@ class PdfProcessor:
         Returns:
             Dictionary with processing statistics
         """
-        # Load and extract content from PDF
-        print("Loading PDF and extracting elements...")
-        documents = self._load_pdf(pdf_path)
-
-        # Process text chunks
-        print("Processing text chunks...")
-        title_chunks = self._process_text(documents)
-        text_summaries = self._summarize_text(title_chunks)
-        processed_text = self._convert_text_to_documents(title_chunks, text_summaries)
-
-        # Process images if configured
-        print("Processing images...")
-        processed_images = []
-        if self.config["extract_images"]:
-            images = self._extract_images(documents)
-            image_summaries = self._process_images(images)
-            processed_images = self._convert_images_to_documents(images, image_summaries)
-
-        # Process tables if configured
-        print("Processing tables...")
-        processed_tables = []
-        if self.config["extract_tables"]:
-            tables = self._extract_tables(documents)
-            table_summaries = self._process_tables(tables)
-            processed_tables = self._convert_tables_to_documents(tables, table_summaries)
-
-        print("Storing processed elements in Qdrant...")
-        # Combine all processed elements
-        final_documents = processed_text + processed_images + processed_tables
-
-        # Store in Qdrant
-        self._store_documents(final_documents)
+        # Create a master progress bar
+        with tqdm(total=5, desc="PDF Processing", position=0) as master_bar:
+            # Load and extract content from PDF
+            master_bar.set_description("Loading PDF")
+            documents = self._load_pdf(pdf_path)
+            master_bar.update(1)
+
+            # Process text chunks
+            master_bar.set_description("Processing text chunks")
+            title_chunks = self._process_text(documents)
+            text_summaries = self._summarize_text(title_chunks)
+            processed_text = self._convert_text_to_documents(title_chunks, text_summaries)
+            master_bar.update(1)
+
+            # Process images if configured
+            master_bar.set_description("Processing images")
+            processed_images = []
+            if self.config["extract_images"]:
+                images = self._extract_images(documents)
+                image_summaries = self._process_images(images)
+                processed_images = self._convert_images_to_documents(images, image_summaries)
+            master_bar.update(1)
+
+            # Process tables if configured
+            master_bar.set_description("Processing tables")
+            processed_tables = []
+            if self.config["extract_tables"]:
+                tables = self._extract_tables(documents)
+                table_summaries = self._process_tables(tables)
+                processed_tables = self._convert_tables_to_documents(tables, table_summaries)
+            master_bar.update(1)
+
+            master_bar.set_description("Storing in Qdrant")
+            # Combine all processed elements
+            final_documents = processed_text + processed_images + processed_tables
+
+            # Store in Qdrant
+            self._store_documents(final_documents)
+            master_bar.update(1)
 
         return {
             "text_chunks": len(processed_text),
@@ -199,7 +245,15 @@ class PdfProcessor:
 
     def _summarize_text(self, chunks: List[Document]) -> List[str]:
         """Generate summaries for text chunks."""
-        return self.summarize_chain.batch([chunk.page_content for chunk in chunks], {"max_concurrency": 3})
+        if not chunks:
+            return []
+
+        print(f"Summarizing {len(chunks)} text chunks...")
+        results = []
+        for chunk in tqdm(chunks, desc="Text summarization", leave=False):
+            result = self.summarize_chain.invoke(chunk.page_content)
+            results.append(result)
+        return results
 
     def _extract_images(self, documents: List[Document]) -> List[Dict[str, Any]]:
         """Extract images with captions from documents."""
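
The loop above gains a progress bar but gives up the bounded concurrency of the old `batch(..., {"max_concurrency": 3})` call. One way to keep both, sketched under the assumption that `summarize_chain` is a standard LangChain runnable (the helper name is hypothetical):

    # Sketch: batch in slices so tqdm can still report progress.
    # batch_size mirrors the old max_concurrency of 3.
    from tqdm import tqdm

    def summarize_with_progress(chain, chunks, batch_size=3):
        results = []
        with tqdm(total=len(chunks), desc="Text summarization", leave=False) as bar:
            for i in range(0, len(chunks), batch_size):
                batch = [c.page_content for c in chunks[i:i + batch_size]]
                results.extend(chain.batch(batch, {"max_concurrency": batch_size}))
                bar.update(len(batch))
        return results
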
@@ -225,12 +279,17 @@ class PdfProcessor:
 
     def _process_images(self, images: List[Dict[str, Any]]) -> List[str]:
         """Generate descriptions for images using configured model."""
+        if not images:
+            return []
+
+        print(f"Processing {len(images)} images...")
+
         if self.config["image_provider"] == "ollama":
             from ollama import Client
             client = Client(host=self.config["ollama_image_url"])
 
             image_summaries = []
-            for img in images:
+            for img in tqdm(images, desc="Image processing", leave=False):
                 prompt = f"Caption of image: {img.get('caption', '')}. Describe this image in detail in {self.config['summary_language']}."
                 response = client.chat(
                     model=self.config["ollama_image_model"],
@@ -261,9 +320,17 @@ class PdfProcessor:
             ]
 
             prompt = ChatPromptTemplate.from_messages(messages)
-            chain = prompt | ChatOpenAI(model=self.config["openai_image_model"]) | StrOutputParser()
+            chain = prompt | ChatOpenAI(model=self.config["openai_image_model"], http_client=http_client) | StrOutputParser()
 
-            return chain.batch([{"image_base64": img["image_base64"], "caption": img.get("caption", "")} for img in images])
+            # Process images with progress bar
+            results = []
+            image_data = [{"image_base64": img["image_base64"], "caption": img.get("caption", "")} for img in images]
+
+            for img_data in tqdm(image_data, desc="Image processing", leave=False):
+                result = chain.invoke(img_data)
+                results.append(result)
+
+            return results
 
     def _extract_tables(self, documents: List[Document]) -> List[Dict[str, Any]]:
         """Extract tables with captions from documents."""
@@ -290,9 +357,13 @@ class PdfProcessor:
 
     def _process_tables(self, tables: List[Dict[str, Any]]) -> List[str]:
         """Generate summaries for tables."""
+        if not tables:
+            return []
+
+        print(f"Processing {len(tables)} tables...")
         table_summaries = []
 
-        for table in tables:
+        for table in tqdm(tables, desc="Table processing", leave=False):
             prompt = f"""Caption of table: {table.get('caption', '')}. Describe this table in detail in {self.config['summary_language']}.
 
             Table content: {table.get('table_data', '')}"""
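
With every image and table now going through its own `invoke` call, a single transient API error aborts the whole file. A small retry wrapper is one way to harden these loops; the helper name, attempt count, and backoff values are assumptions:

    # Sketch: retry one invocation with linear backoff before giving up.
    import time

    def invoke_with_retry(fn, payload, attempts=3, base_delay=2.0):
        for attempt in range(1, attempts + 1):
            try:
                return fn(payload)
            except Exception:
                if attempt == attempts:
                    raise  # exhausted all attempts
                time.sleep(base_delay * attempt)

    # e.g. inside the loops above:
    #     result = invoke_with_retry(chain.invoke, img_data)
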
@@ -481,11 +552,84 @@ class PdfProcessor:
                     final_chunks.extend(sub_chunks)
 
         return final_chunks
-
+
+    def process_directory(self, directory_path: str) -> Dict[str, Any]:
+        """
+        Process all PDF files in the specified directory.
+
+        Args:
+            directory_path: Path to the directory containing PDF files
+
+        Returns:
+            Dictionary with processing statistics for all files
+        """
+        # Check if directory exists
+        if not os.path.isdir(directory_path):
+            raise ValueError(f"Directory not found: {directory_path}")
+
+        # Find all PDF files in the directory
+        pdf_files = glob.glob(os.path.join(directory_path, "*.pdf"))
+
+        if not pdf_files:
+            print(f"No PDF files found in {directory_path}")
+            return {"files_processed": 0}
+
+        # Track overall statistics
+        overall_stats = {
+            "files_processed": 0,
+            "total_text_chunks": 0,
+            "total_image_chunks": 0,
+            "total_table_chunks": 0,
+            "total_chunks": 0,
+            "collection_name": self.config["collection_name"],
+            "file_details": []
+        }
+
+        # Process each PDF file with a progress bar
+        print(f"Found {len(pdf_files)} PDF files in {directory_path}")
+        for pdf_file in tqdm(pdf_files, desc="Processing PDF files", unit="file"):
+            try:
+                print(f"\nProcessing: {os.path.basename(pdf_file)}")
+                result = self.process_pdf(pdf_file)
+
+                # Update statistics
+                overall_stats["files_processed"] += 1
+                overall_stats["total_text_chunks"] += result.get("text_chunks", 0)
+                overall_stats["total_image_chunks"] += result.get("image_chunks", 0)
+                overall_stats["total_table_chunks"] += result.get("table_chunks", 0)
+                overall_stats["total_chunks"] += result.get("total_chunks", 0)
+
+                # Store individual file results
+                file_detail = {
+                    "filename": os.path.basename(pdf_file),
+                    "text_chunks": result.get("text_chunks", 0),
+                    "image_chunks": result.get("image_chunks", 0),
+                    "table_chunks": result.get("table_chunks", 0),
+                    "total_chunks": result.get("total_chunks", 0)
+                }
+                overall_stats["file_details"].append(file_detail)
+
+                print(f"Completed: {file_detail['filename']} - {file_detail['total_chunks']} chunks processed")
+
+            except Exception as e:
+                # Log the failure and continue with the next file
+                print(f"Error processing {pdf_file}: {str(e)}")
+
+        print("\nDirectory processing complete!")
+        print(f"Processed {overall_stats['files_processed']} files")
+        print(f"Total chunks: {overall_stats['total_chunks']}")
+        print(f"  - Text chunks: {overall_stats['total_text_chunks']}")
+        print(f"  - Image chunks: {overall_stats['total_image_chunks']}")
+        print(f"  - Table chunks: {overall_stats['total_table_chunks']}")
+        print(f"All content stored in collection: {overall_stats['collection_name']}")
+
+        return overall_stats
+
 processor = PdfProcessor({
-    "image_provider": "openai",
-    "openai_api_key": "sk-proj-<REDACTED>",
-    "collection_name": "my_custom_collection",
+    # "image_provider": "openai",
+    # "openai_api_key": "sk-proj-<REDACTED>",  # never commit real keys; read them from the environment
+    "collection_name": "my_control_and_calibration",
     "summary_language": "English"
 })
-result = processor.process_pdf(r"F:\Dev\Rag\chat_bot_rag\T4 Machines thermiques.pdf")
\ No newline at end of file
+
+results = processor.process_directory(r"C:\Users\serameza\host-data")
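
After a directory run, the ingest can be sanity-checked directly against Qdrant. A sketch; the URL is an assumption (use whatever `qdrant_url` the config points at), and the collection name matches the config above:

    # Sketch: count stored points and peek at one payload after ingestion.
    from qdrant_client import QdrantClient

    client = QdrantClient(url="http://localhost:6333")
    collection = "my_control_and_calibration"
    total = client.count(collection_name=collection).count
    points, _ = client.scroll(collection_name=collection, limit=1, with_payload=True)
    print(f"{total} points stored; sample payload: {points[0].payload if points else None}")
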