Upload files to "/"

Add progress bars and directory-wide processing of PDF files
2025-03-09 12:31:02 +01:00 · 2025-03-09 12:31:02 +01:00 · 819d3a0956
commit 819d3a0956
parent 0cddd0842f
1 changed files with 187 additions and 43 deletions
--- a/pdfProcessing.py
+++ b/pdfProcessing.py
@ -7,7 +7,9 @@ from langchain.schema import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
-
+import httpx
+from tqdm import tqdm
+# SECURITY NOTE(review): verify=False switches off TLS certificate verification
+# for every request made through this shared client (it is handed to the
+# ChatOpenAI models below), so the API key and document contents can be
+# intercepted by a man-in-the-middle. If this works around a corporate proxy,
+# prefer trusting that proxy's CA certificate instead of disabling verification.
+http_client = httpx.Client(verify=False)

 class PdfProcessor:
    """
@ -80,6 +82,40 @@ class PdfProcessor:
            if not self.config["openai_api_key"]:
                raise ValueError("OpenAI API key is required when using OpenAI models")
            os.environ["OPENAI_API_KEY"] = self.config["openai_api_key"]
+            
+        # Initialize Qdrant client
+        from qdrant_client import QdrantClient
+        from qdrant_client.http import models as rest
+        
+        self.qdrant_client = QdrantClient(url=self.config["qdrant_url"])
+        
+        # Check if collection exists and create it if not
+        collections = self.qdrant_client.get_collections().collections
+        collection_exists = any(collection.name == self.config["collection_name"] for collection in collections)
+        
+        if not collection_exists:
+            # Get vector size based on embedding model
+            if self.config["embedding_provider"] == "ollama":
+                # For OllamaEmbeddings, typically 4096 dimensions for newer models
+                vector_size = 4096
+            else:  # OpenAI
+                # OpenAI embedding dimensions vary by model
+                model_dimensions = {
+                    "text-embedding-ada-002": 1536,
+                    "text-embedding-3-small": 1536,
+                    "text-embedding-3-large": 3072
+                }
+                vector_size = model_dimensions.get(self.config["openai_embedding_model"], 1536)
+            
+            # Create the collection
+            self.qdrant_client.create_collection(
+                collection_name=self.config["collection_name"],
+                vectors_config=rest.VectorParams(
+                    size=vector_size,
+                    distance=rest.Distance.COSINE
+                )
+            )
+            print(f"Created new Qdrant collection: {self.config['collection_name']}")
    
    def _setup_models(self):
        """Initialize models based on configuration."""
@ -106,6 +142,7 @@ class PdfProcessor:
        else:  # openai
            from langchain_openai import ChatOpenAI
            self.summary_model = ChatOpenAI(
+                http_client=http_client,
                model=self.config["openai_summary_model"]
            )
        
@ -134,38 +171,45 @@ class PdfProcessor:
        Returns:
            Dictionary with processing statistics
        """
-        # Load and extract content from PDF
-        print("Loading PDF and extracting elements...")
-        documents = self._load_pdf(pdf_path)
-        
-        # Process text chunks
-        print("Processing text chunks...")
-        title_chunks = self._process_text(documents)
-        text_summaries = self._summarize_text(title_chunks)
-        processed_text = self._convert_text_to_documents(title_chunks, text_summaries)
-        
-        # Process images if configured
-        print("Processing images...")
-        processed_images = []
-        if self.config["extract_images"]:
-            images = self._extract_images(documents)
-            image_summaries = self._process_images(images)
-            processed_images = self._convert_images_to_documents(images, image_summaries)
-        
-        # Process tables if configured
-        print("Processing tables...")
-        processed_tables = []
-        if self.config["extract_tables"]:
-            tables = self._extract_tables(documents)
-            table_summaries = self._process_tables(tables)
-            processed_tables = self._convert_tables_to_documents(tables, table_summaries)
-        
-        print("Storing processed elements in Qdrant...")
-        # Combine all processed elements
-        final_documents = processed_text + processed_images + processed_tables
-        
-        # Store in Qdrant
-        self._store_documents(final_documents)
+        # Create a master progress bar
+        with tqdm(total=5, desc="PDF Processing", position=0) as master_bar:
+            # Load and extract content from PDF
+            master_bar.set_description("Loading PDF")
+            documents = self._load_pdf(pdf_path)
+            master_bar.update(1)
+            
+            # Process text chunks
+            master_bar.set_description("Processing text chunks")
+            title_chunks = self._process_text(documents)
+            text_summaries = self._summarize_text(title_chunks)
+            processed_text = self._convert_text_to_documents(title_chunks, text_summaries)
+            master_bar.update(1)
+            
+            # Process images if configured
+            master_bar.set_description("Processing images")
+            processed_images = []
+            if self.config["extract_images"]:
+                images = self._extract_images(documents)
+                image_summaries = self._process_images(images)
+                processed_images = self._convert_images_to_documents(images, image_summaries)
+            master_bar.update(1)
+            
+            # Process tables if configured
+            master_bar.set_description("Processing tables")
+            processed_tables = []
+            if self.config["extract_tables"]:
+                tables = self._extract_tables(documents)
+                table_summaries = self._process_tables(tables)
+                processed_tables = self._convert_tables_to_documents(tables, table_summaries)
+            master_bar.update(1)
+            
+            master_bar.set_description("Storing in Qdrant")
+            # Combine all processed elements
+            final_documents = processed_text + processed_images + processed_tables
+            
+            # Store in Qdrant
+            self._store_documents(final_documents)
+            master_bar.update(1)
        
        return {
            "text_chunks": len(processed_text),
@ -199,7 +243,15 @@ class PdfProcessor:
    
    def _summarize_text(self, chunks: List[Document]) -> List[str]:
        """Generate summaries for text chunks."""
-        return self.summarize_chain.batch([chunk.page_content for chunk in chunks], {"max_concurrency": 3})
+        if not chunks:
+            return []
+        
+        print(f"Summarizing {len(chunks)} text chunks...")
+        results = []
+        for chunk in tqdm(chunks, desc="Text summarization", leave=False):
+            result = self.summarize_chain.invoke(chunk.page_content)
+            results.append(result)
+        return results
    
    def _extract_images(self, documents: List[Document]) -> List[Dict[str, Any]]:
        """Extract images with captions from documents."""
@ -225,12 +277,17 @@ class PdfProcessor:
    
    def _process_images(self, images: List[Dict[str, Any]]) -> List[str]:
        """Generate descriptions for images using configured model."""
+        if not images:
+            return []
+        
+        print(f"Processing {len(images)} images...")
+        
        if self.config["image_provider"] == "ollama":
            from ollama import Client
            client = Client(host=self.config["ollama_image_url"])
            
            image_summaries = []
-            for img in images:
+            for img in tqdm(images, desc="Image processing", leave=False):
                prompt = f"Caption of image: {img.get('caption', '')}. Describe this image in detail in {self.config['summary_language']}."
                response = client.chat(
                    model=self.config["ollama_image_model"],
@ -261,9 +318,17 @@ class PdfProcessor:
            ]
            
            prompt = ChatPromptTemplate.from_messages(messages)
-            chain = prompt | ChatOpenAI(model=self.config["openai_image_model"]) | StrOutputParser()
+            chain = prompt | ChatOpenAI(model=self.config["openai_image_model"], http_client=http_client) | StrOutputParser()
            
-            return chain.batch([{"image_base64": img["image_base64"], "caption": img.get("caption", "")} for img in images])
+            # Process images with progress bar
+            results = []
+            image_data = [{"image_base64": img["image_base64"], "caption": img.get("caption", "")} for img in images]
+            
+            for img_data in tqdm(image_data, desc="Image processing", leave=False):
+                result = chain.invoke(img_data)
+                results.append(result)
+                
+            return results
    
    def _extract_tables(self, documents: List[Document]) -> List[Dict[str, Any]]:
        """Extract tables with captions from documents."""
@ -290,9 +355,13 @@ class PdfProcessor:
    
    def _process_tables(self, tables: List[Dict[str, Any]]) -> List[str]:
        """Generate summaries for tables."""
+        if not tables:
+            return []
+            
+        print(f"Processing {len(tables)} tables...")
        table_summaries = []
        
-        for table in tables:
+        for table in tqdm(tables, desc="Table processing", leave=False):
            prompt = f"""Caption of table: {table.get('caption', '')}. 
                      Describe this table in detail in {self.config['summary_language']}.
                      Table content: {table.get('table_data', '')}"""
@ -481,11 +550,86 @@ class PdfProcessor:
                final_chunks.extend(sub_chunks)
        
        return final_chunks
-    
+
+    def process_directory(self, directory_path: str) -> Dict[str, Any]:
+        """
+        Process every PDF file found directly inside a directory.
+
+        Each file is passed to ``process_pdf``. A failure on one file is
+        reported and recorded, and processing continues with the next file.
+
+        Args:
+            directory_path: Path to the directory containing PDF files
+                (sub-directories are not searched)
+
+        Returns:
+            Dictionary with aggregated statistics. The same keys are present
+            whether or not any PDF was found: "files_processed",
+            "files_failed", "total_text_chunks", "total_image_chunks",
+            "total_table_chunks", "total_chunks", "collection_name",
+            "file_details" (per-file counts) and "failed_files" (filename and
+            error message of every file that raised)
+
+        Raises:
+            ValueError: If directory_path is not an existing directory
+        """
+        if not os.path.isdir(directory_path):
+            raise ValueError(f"Directory not found: {directory_path}")
+
+        # Built up front so the "no PDFs" early return has the same shape as
+        # the normal result and callers can index any key unconditionally.
+        overall_stats = {
+            "files_processed": 0,
+            "files_failed": 0,
+            "total_text_chunks": 0,
+            "total_image_chunks": 0,
+            "total_table_chunks": 0,
+            "total_chunks": 0,
+            "collection_name": self.config["collection_name"],
+            "file_details": [],
+            "failed_files": [],
+        }
+
+        # Escape the directory so characters such as "[" in its name are not
+        # read as glob wildcards; compare the extension case-insensitively so
+        # "REPORT.PDF" is also picked up on case-sensitive filesystems; sort
+        # so the processing order is reproducible between runs.
+        candidates = glob.glob(os.path.join(glob.escape(directory_path), "*"))
+        pdf_files = sorted(path for path in candidates if path.lower().endswith(".pdf"))
+
+        if not pdf_files:
+            print(f"No PDF files found in {directory_path}")
+            return overall_stats
+
+        print(f"Found {len(pdf_files)} PDF files in {directory_path}")
+        for pdf_file in tqdm(pdf_files, desc="Processing PDF files", unit="file"):
+            filename = os.path.basename(pdf_file)
+            # tqdm.write prints above the active bar instead of through it
+            tqdm.write(f"\nProcessing: {filename}")
+
+            try:
+                result = self.process_pdf(pdf_file)
+            except Exception as e:
+                # Deliberate best-effort: one bad file must not abort the run,
+                # but the failure is kept in the returned statistics.
+                tqdm.write(f"Error processing {pdf_file}: {str(e)}")
+                overall_stats["files_failed"] += 1
+                overall_stats["failed_files"].append({"filename": filename, "error": str(e)})
+                continue
+
+            file_detail = {
+                "filename": filename,
+                "text_chunks": result.get("text_chunks", 0),
+                "image_chunks": result.get("image_chunks", 0),
+                "table_chunks": result.get("table_chunks", 0),
+                "total_chunks": result.get("total_chunks", 0),
+            }
+            overall_stats["files_processed"] += 1
+            overall_stats["total_text_chunks"] += file_detail["text_chunks"]
+            overall_stats["total_image_chunks"] += file_detail["image_chunks"]
+            overall_stats["total_table_chunks"] += file_detail["table_chunks"]
+            overall_stats["total_chunks"] += file_detail["total_chunks"]
+            overall_stats["file_details"].append(file_detail)
+
+            tqdm.write(f"Completed: {file_detail['filename']} - {file_detail['total_chunks']} chunks processed")
+
+        print("\nDirectory processing complete!")
+        print(f"Processed {overall_stats['files_processed']} files")
+        if overall_stats["files_failed"]:
+            print(f"Failed {overall_stats['files_failed']} files")
+        print(f"Total chunks: {overall_stats['total_chunks']}")
+        print(f"  - Text chunks: {overall_stats['total_text_chunks']}")
+        print(f"  - Image chunks: {overall_stats['total_image_chunks']}")
+        print(f"  - Table chunks: {overall_stats['total_table_chunks']}")
+        print(f"All content stored in collection: {overall_stats['collection_name']}")
+
+        return overall_stats
+
+import glob
+import os
+# SECURITY: an OpenAI API key was previously hard-coded in this call and is
+# part of the commit history; it must be revoked and replaced. Supply the key
+# through the environment instead of writing it into the source.
 processor = PdfProcessor({
-    "image_provider": "openai",
-    "openai_api_key": "<redacted>",
-    "collection_name": "my_custom_collection",
+    # "image_provider": "openai",
+    # "openai_api_key": os.environ["OPENAI_API_KEY"],
+    "collection_name": "my_control_and calibration",
     "summary_language": "English"
 })
-result = processor.process_pdf(r"F:\Dev\Rag\chat_bot_rag\T4 Machines thermiques.pdf")
+
+results = processor.process_directory(r"C:\Users\serameza\host-data")