Compare commits

...

3 Commits

Author SHA1 Message Date
9fd056baaf Enhance chatbot UI by increasing height, adding copy button, and refining image gallery display 2025-03-09 15:30:54 +01:00
819d3a0956 Upload files to "/"
add progress bar plus folder inspect for pdf files
2025-03-09 12:31:02 +01:00
0cddd0842f Upload files to "/"
update code and add progress bar
2025-03-09 12:21:12 +01:00
3 changed files with 288 additions and 158 deletions

View File

@@ -9,6 +9,9 @@ from translations.lang_mappings import LANGUAGE_MAPPING
from utils.image_utils import base64_to_image from utils.image_utils import base64_to_image
from langchain.callbacks.base import BaseCallbackHandler from langchain.callbacks.base import BaseCallbackHandler
import re import re
from typing import List, Union, Dict, Any
# Pour Gradio 4.x
# from gradio.types.message import ImageMessage, HtmlMessage, TextMessage
def clean_llm_response(text): def clean_llm_response(text):
"""Nettoie la réponse du LLM en enlevant les balises de pensée et autres éléments non désirés.""" """Nettoie la réponse du LLM en enlevant les balises de pensée et autres éléments non désirés."""
@@ -53,7 +56,9 @@ def display_images(images_list=None):
for img_data in images_to_use: for img_data in images_to_use:
image = img_data["image"] image = img_data["image"]
if image: if image:
caption = f"{img_data['caption']} (Source: {img_data['source']}, Page: {img_data['page']})" # Supprimer les infos de type "(Texte 5)" dans la caption
caption = re.sub(pattern_texte, '', img_data["caption"])
caption = f"{caption} (Source: {img_data['source']}, Page: {img_data['page']})"
gallery.append((image, caption)) gallery.append((image, caption))
return gallery if gallery else None return gallery if gallery else None
@@ -171,71 +176,75 @@ def convert_to_messages_format(history):
messages.append({"role": "user", "content": user_msg}) messages.append({"role": "user", "content": user_msg})
if assistant_msg: # Éviter les messages vides if assistant_msg: # Éviter les messages vides
messages.append({"role": "assistant", "content": assistant_msg}) messages.append({"role": "assistant", "content": assistant_msg})
except ValueError: except Exception as e:
# Journaliser l'erreur pour le débogage # Journaliser l'erreur pour le débogage
print(f"Format d'historique non reconnu: {history}") print(f"Format d'historique non reconnu: {history}")
print(f"Erreur: {str(e)}")
# Retourner un historique vide en cas d'erreur # Retourner un historique vide en cas d'erreur
return [] return []
return messages return messages
# Définir le pattern de l'expression régulière en dehors de la f-string
pattern_texte = r'\(Texte \d+\)'
def process_query(message, history, streaming, show_sources, max_images, language): def process_query(message, history, streaming, show_sources, max_images, language):
global current_images, current_tables global current_images, current_tables
# Debug plus clair print(f"Language selected for response: {language} -> {LANGUAGE_MAPPING.get(language, 'français')}")
print(f"Langue sélectionnée pour la réponse: {language} -> {LANGUAGE_MAPPING.get(language, 'français')}")
if not message.strip(): if not message.strip():
return history, "", None, None return history, "", None, None
current_images = [] current_images = []
current_tables = [] current_tables = []
print(f"Traitement du message: {message}")
print(f"Streaming: {streaming}")
try: try:
if streaming: # Convert history to messages format
# Convertir history en format messages pour l'affichage
messages_history = convert_to_messages_format(history) messages_history = convert_to_messages_format(history)
if streaming:
# Add user message to history
messages_history.append({"role": "user", "content": message}) messages_history.append({"role": "user", "content": message})
# Add empty message for assistant response
messages_history.append({"role": "assistant", "content": ""}) messages_history.append({"role": "assistant", "content": ""})
# 1. Récupérer les documents pertinents # Get relevant documents
docs = rag_bot._retrieve_relevant_documents(message) docs = rag_bot._retrieve_relevant_documents(message)
# 2. Préparer le contexte et l'historique # Process context and history
context = rag_bot._format_documents(docs) context = rag_bot._format_documents(docs)
history_text = rag_bot._format_chat_history() history_text = rag_bot._format_chat_history()
# 3. Préparer le prompt # Create prompt
prompt_template = ChatPromptTemplate.from_template(""" prompt_template = ChatPromptTemplate.from_template("""
Tu es un assistant documentaire spécialisé qui utilise le contexte fourni. You are a specialized document assistant that uses the provided context.
===== INSTRUCTION CRUCIALE SUR LA LANGUE ===== ===== CRITICAL LANGUAGE INSTRUCTION =====
RÉPONDS UNIQUEMENT EN {language}. C'est une exigence ABSOLUE. RESPOND ONLY IN {language}. This is an ABSOLUTE requirement.
NE RÉPONDS JAMAIS dans une autre langue que {language}, quelle que soit la langue de la question. NEVER RESPOND in any language other than {language}, regardless of question language.
============================================== ==============================================
Instructions spécifiques: Specific instructions:
1. Pour chaque image mentione: inclure la légende, source, page et description 1. For each image mentioned: include caption, source, page and description
2. Pour chaque tableau: inclure titre, source, page et signification 2. For each table: include title, source, page and significance
3. Pour les équations: utiliser la syntaxe LaTeX exacte 3. For equations: use exact LaTeX syntax
4. Ne pas inventer d'informations hors du contexte fourni 4. Don't invent information outside the provided context
5. Citer précisément les sources 5. Cite sources precisely
Historique de conversation: Conversation history:
{chat_history} {chat_history}
Contexte: Context:
{context} {context}
Question: {question} Question: {question}
Réponds de façon structurée en intégrant les images, tableaux et équations disponibles. Respond in a structured way incorporating available images, tables and equations.
TA RÉPONSE DOIT ÊTRE UNIQUEMENT ET ENTIÈREMENT EN {language}. CETTE RÈGLE EST ABSOLUE. YOUR RESPONSE MUST BE SOLELY AND ENTIRELY IN {language}. THIS RULE IS ABSOLUTE.
""") """)
# Assurer que la langue est bien passée dans le format du prompt # Set language for the response
selected_language = LANGUAGE_MAPPING.get(language, "français") selected_language = LANGUAGE_MAPPING.get(language, "français")
messages = prompt_template.format_messages( messages = prompt_template.format_messages(
chat_history=history_text, chat_history=history_text,
@@ -244,10 +253,10 @@ def process_query(message, history, streaming, show_sources, max_images, languag
language=selected_language language=selected_language
) )
# 5. Créer un handler de streaming personnalisé # Create streaming handler
handler = GradioStreamingHandler() handler = GradioStreamingHandler()
# 6. Créer un modèle LLM avec notre handler # Create LLM model with our handler
streaming_llm = ChatOllama( streaming_llm = ChatOllama(
model=rag_bot.llm.model, model=rag_bot.llm.model,
base_url=rag_bot.llm.base_url, base_url=rag_bot.llm.base_url,
@@ -255,87 +264,81 @@ def process_query(message, history, streaming, show_sources, max_images, languag
callbacks=[handler] callbacks=[handler]
) )
# 7. Lancer la génération dans un thread pour ne pas bloquer l'UI # Generate response in a separate thread
def generate_response(): def generate_response():
streaming_llm.invoke(messages) streaming_llm.invoke(messages)
thread = threading.Thread(target=generate_response) thread = threading.Thread(target=generate_response)
thread.start() thread.start()
# 8. Récupérer les tokens et mettre à jour l'interface # Process tokens and update interface
partial_response = "" partial_response = ""
# Attendre les tokens avec un timeout # Wait for tokens with timeout
while thread.is_alive() or not handler.tokens_queue.empty(): while thread.is_alive() or not handler.tokens_queue.empty():
try: try:
token = handler.tokens_queue.get(timeout=0.05) token = handler.tokens_queue.get(timeout=0.05)
partial_response += token partial_response += token
# Nettoyer la réponse uniquement pour l'affichage (pas pour l'historique interne) # Clean response for display
clean_response = clean_llm_response(partial_response) clean_response = clean_llm_response(partial_response)
# Mettre à jour le dernier message (assistant) # Update assistant message - JUST TEXT, not multimodal
messages_history[-1]["content"] = clean_response messages_history[-1]["content"] = clean_response
yield messages_history, "", None, None yield messages_history, "", None, None
except queue.Empty: except queue.Empty:
continue continue
# Après la boucle, nettoyer la réponse complète pour l'historique interne # After loop, clean the complete response for internal history
partial_response = clean_llm_response(partial_response) partial_response = clean_llm_response(partial_response)
rag_bot.chat_history.append({"role": "user", "content": message}) rag_bot.chat_history.append({"role": "user", "content": message})
rag_bot.chat_history.append({"role": "assistant", "content": partial_response}) rag_bot.chat_history.append({"role": "assistant", "content": partial_response})
# 10. Récupérer les sources, images, tableaux # Get sources, images, tables
texts, images, tables = rag_bot._process_documents(docs) texts, images, tables = rag_bot._process_documents(docs)
# Préparer les informations sur les sources # Process sources
source_info = "" source_info = ""
if texts: if texts:
source_info += f"📚 {len(texts)} textes • " clean_texts = [re.sub(pattern_texte, '', t.get("source", "")) for t in texts]
if images: # Remove duplicates and empty items
source_info += f"🖼️ {len(images)} images • " clean_texts = [t for t in clean_texts if t.strip()]
if tables: clean_texts = list(set(clean_texts))
source_info += f"📊 {len(tables)} tableaux" if clean_texts:
source_info += f"📚 Sources: {', '.join(clean_texts)}"
if source_info: # Process images and tables for SEPARATE display only
source_info = "Sources trouvées: " + source_info if show_sources and images and max_images > 0:
for img in images[:max_images]:
# 11. Traiter les images
if show_sources and images:
images = images[:max_images]
for img in images:
img_data = img.get("image_data") img_data = img.get("image_data")
if img_data: if img_data:
image = base64_to_image(img_data) image = base64_to_image(img_data)
if image: if image:
caption = re.sub(pattern_texte, '', img.get("caption", ""))
# Only add to gallery, not to chat messages
current_images.append({ current_images.append({
"image": image, "image": image,
"caption": img.get("caption", ""), "caption": caption,
"source": img.get("source", ""), "source": img.get("source", ""),
"page": img.get("page", ""), "page": img.get("page", "")
"description": img.get("description", "")
}) })
# 12. Traiter les tableaux # Final yield with separate image gallery
if show_sources and tables: yield messages_history, source_info, display_images(), display_tables()
for table in tables:
current_tables.append({
"data": rag_bot.format_table(table.get("table_data", "")),
"caption": table.get("caption", ""),
"source": table.get("source", ""),
"page": table.get("page", ""),
"description": table.get("description", "")
})
# 13. Retourner les résultats finaux
images_display = display_images()
tables_display = display_tables()
yield messages_history, source_info, images_display, tables_display
else: else:
# Version sans streaming # Version non-streaming
print("Mode non-streaming activé") print("Mode non-streaming activé")
source_info = "" source_info = ""
history_tuples = history if isinstance(history, list) else []
# Ajouter le message utilisateur à l'historique au format message
messages_history.append({"role": "user", "content": message})
# Initialize multimodal_content first
multimodal_content = [result["response"]] # Start with text response
# Après avoir obtenu le résultat
result = rag_bot.chat( result = rag_bot.chat(
message, message,
stream=False, stream=False,
@@ -344,12 +347,10 @@ def process_query(message, history, streaming, show_sources, max_images, languag
# Nettoyer la réponse des balises <think> # Nettoyer la réponse des balises <think>
result["response"] = clean_llm_response(result["response"]) result["response"] = clean_llm_response(result["response"])
# Convertir l'historique au format messages # Ajouter la réponse de l'assistant au format message
messages_history = convert_to_messages_format(history)
messages_history.append({"role": "user", "content": message})
messages_history.append({"role": "assistant", "content": result["response"]}) messages_history.append({"role": "assistant", "content": result["response"]})
# Mise à jour de l'historique interne # Mise à jour de l'historique interne du chatbot
rag_bot.chat_history.append({"role": "user", "content": message}) rag_bot.chat_history.append({"role": "user", "content": message})
rag_bot.chat_history.append({"role": "assistant", "content": result["response"]}) rag_bot.chat_history.append({"role": "assistant", "content": result["response"]})
@@ -364,33 +365,23 @@ def process_query(message, history, streaming, show_sources, max_images, languag
if source_info: if source_info:
source_info = "Sources trouvées: " + source_info source_info = "Sources trouvées: " + source_info
# Traiter les images et tableaux # Process images for SEPARATE gallery
if show_sources and "images" in result and result["images"]: if show_sources and "images" in result and result["images"]:
images = result["images"][:max_images] for img in result["images"][:max_images]:
for img in images:
img_data = img.get("image_data") img_data = img.get("image_data")
if img_data: if img_data:
image = base64_to_image(img_data) image = base64_to_image(img_data)
if image: if image:
caption = re.sub(pattern_texte, '', img.get("caption", ""))
# Only add to gallery
current_images.append({ current_images.append({
"image": image, "image": image,
"caption": img.get("caption", ""), "caption": caption,
"source": img.get("source", ""), "source": img.get("source", ""),
"page": img.get("page", ""), "page": img.get("page", "")
"description": img.get("description", "")
})
if show_sources and "tables" in result and result["tables"]:
tables = result["tables"]
for table in tables:
current_tables.append({
"data": rag_bot.format_table(table.get("table_data", "")),
"caption": table.get("caption", ""),
"source": table.get("source", ""),
"page": table.get("page", ""),
"description": table.get("description", "")
}) })
# Final yield with separate displays
yield messages_history, source_info, display_images(), display_tables() yield messages_history, source_info, display_images(), display_tables()
except Exception as e: except Exception as e:
@@ -398,8 +389,13 @@ def process_query(message, history, streaming, show_sources, max_images, languag
traceback_text = traceback.format_exc() traceback_text = traceback.format_exc()
print(error_msg) print(error_msg)
print(traceback_text) print(traceback_text)
history = history + [(message, error_msg)]
yield history, "Erreur lors du traitement de la requête", None, None # Formater l'erreur au format message
error_history = convert_to_messages_format(history)
error_history.append({"role": "user", "content": message})
error_history.append({"role": "assistant", "content": error_msg})
yield error_history, "Erreur lors du traitement de la requête", None, None
# Fonction pour réinitialiser la conversation # Fonction pour réinitialiser la conversation
def reset_conversation(): def reset_conversation():
@@ -410,4 +406,4 @@ def reset_conversation():
rag_bot.clear_history() rag_bot.clear_history()
# Retourner une liste vide au format messages # Retourner une liste vide au format messages
return [], "", None, None return [], "", None, None # Liste vide = pas de messages

View File

@@ -73,11 +73,11 @@ def build_interface(
with gr.Row(): with gr.Row():
with gr.Column(scale=2): with gr.Column(scale=2):
chat_interface = gr.Chatbot( chat_interface = gr.Chatbot(
height=600, height=800,
show_label=False, bubble_full_width=False,
layout="bubble", show_copy_button=True,
elem_id="chatbot", type="messages"
type="messages" # Ajoutez cette ligne # likeable=False,
) )
with gr.Row(): with gr.Row():
@@ -144,17 +144,9 @@ def build_interface(
label=ui_elements['max_images_label'] label=ui_elements['max_images_label']
) )
gr.Markdown("---") # Ne pas supprimer ces lignes dans ui.py
images_title = gr.Markdown(f"### {ui_elements['images_title']}") images_title = gr.Markdown(f"### {ui_elements['images_title']}")
image_gallery = gr.Gallery( image_gallery = gr.Gallery(label="Images")
label=ui_elements['images_title'],
show_label=False,
columns=2,
height=300,
object_fit="contain"
)
tables_title = gr.Markdown(f"### {ui_elements['tables_title']}") tables_title = gr.Markdown(f"### {ui_elements['tables_title']}")
tables_display = gr.HTML() tables_display = gr.HTML()
@@ -190,9 +182,7 @@ def build_interface(
apply_collection_btn, apply_collection_btn,
streaming, streaming,
show_sources, show_sources,
max_images, max_images
images_title,
tables_title
] ]
) )
@@ -215,7 +205,7 @@ def build_interface(
clear_btn.click( clear_btn.click(
reset_conversation_fn, reset_conversation_fn,
outputs=[chat_interface, source_info, image_gallery, tables_display] outputs=[chat_interface, source_info] # Retirer image_gallery et tables_display
) )
# Connecter le changement de modèle # Connecter le changement de modèle
@@ -236,7 +226,7 @@ def build_interface(
gr.Markdown(""" gr.Markdown("""
<style> <style>
.gradio-container {max-width: 1200px !important} .gradio-container {max-width: 1200px !important}
#chatbot {height: 600px; overflow-y: auto;} #chatbot {height: 800px; overflow-y: auto;}
#sources_info {margin-top: 10px; color: #666;} #sources_info {margin-top: 10px; color: #666;}
/* Improved styles for equations */ /* Improved styles for equations */

View File

@@ -7,7 +7,9 @@ from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser from langchain_core.output_parsers import StrOutputParser
import httpx
from tqdm import tqdm
http_client = httpx.Client(verify=False)
class PdfProcessor: class PdfProcessor:
""" """
@@ -81,6 +83,40 @@ class PdfProcessor:
raise ValueError("OpenAI API key is required when using OpenAI models") raise ValueError("OpenAI API key is required when using OpenAI models")
os.environ["OPENAI_API_KEY"] = self.config["openai_api_key"] os.environ["OPENAI_API_KEY"] = self.config["openai_api_key"]
# Initialize Qdrant client
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
self.qdrant_client = QdrantClient(url=self.config["qdrant_url"])
# Check if collection exists and create it if not
collections = self.qdrant_client.get_collections().collections
collection_exists = any(collection.name == self.config["collection_name"] for collection in collections)
if not collection_exists:
# Get vector size based on embedding model
if self.config["embedding_provider"] == "ollama":
# For OllamaEmbeddings, typically 4096 dimensions for newer models
vector_size = 4096
else: # OpenAI
# OpenAI embedding dimensions vary by model
model_dimensions = {
"text-embedding-ada-002": 1536,
"text-embedding-3-small": 1536,
"text-embedding-3-large": 3072
}
vector_size = model_dimensions.get(self.config["openai_embedding_model"], 1536)
# Create the collection
self.qdrant_client.create_collection(
collection_name=self.config["collection_name"],
vectors_config=rest.VectorParams(
size=vector_size,
distance=rest.Distance.COSINE
)
)
print(f"Created new Qdrant collection: {self.config['collection_name']}")
def _setup_models(self): def _setup_models(self):
"""Initialize models based on configuration.""" """Initialize models based on configuration."""
# Set up embedding model # Set up embedding model
@@ -106,6 +142,7 @@ class PdfProcessor:
else: # openai else: # openai
from langchain_openai import ChatOpenAI from langchain_openai import ChatOpenAI
self.summary_model = ChatOpenAI( self.summary_model = ChatOpenAI(
http_client=http_client,
model=self.config["openai_summary_model"] model=self.config["openai_summary_model"]
) )
@@ -134,38 +171,45 @@ class PdfProcessor:
Returns: Returns:
Dictionary with processing statistics Dictionary with processing statistics
""" """
# Create a master progress bar
with tqdm(total=5, desc="PDF Processing", position=0) as master_bar:
# Load and extract content from PDF # Load and extract content from PDF
print("Loading PDF and extracting elements...") master_bar.set_description("Loading PDF")
documents = self._load_pdf(pdf_path) documents = self._load_pdf(pdf_path)
master_bar.update(1)
# Process text chunks # Process text chunks
print("Processing text chunks...") master_bar.set_description("Processing text chunks")
title_chunks = self._process_text(documents) title_chunks = self._process_text(documents)
text_summaries = self._summarize_text(title_chunks) text_summaries = self._summarize_text(title_chunks)
processed_text = self._convert_text_to_documents(title_chunks, text_summaries) processed_text = self._convert_text_to_documents(title_chunks, text_summaries)
master_bar.update(1)
# Process images if configured # Process images if configured
print("Processing images...") master_bar.set_description("Processing images")
processed_images = [] processed_images = []
if self.config["extract_images"]: if self.config["extract_images"]:
images = self._extract_images(documents) images = self._extract_images(documents)
image_summaries = self._process_images(images) image_summaries = self._process_images(images)
processed_images = self._convert_images_to_documents(images, image_summaries) processed_images = self._convert_images_to_documents(images, image_summaries)
master_bar.update(1)
# Process tables if configured # Process tables if configured
print("Processing tables...") master_bar.set_description("Processing tables")
processed_tables = [] processed_tables = []
if self.config["extract_tables"]: if self.config["extract_tables"]:
tables = self._extract_tables(documents) tables = self._extract_tables(documents)
table_summaries = self._process_tables(tables) table_summaries = self._process_tables(tables)
processed_tables = self._convert_tables_to_documents(tables, table_summaries) processed_tables = self._convert_tables_to_documents(tables, table_summaries)
master_bar.update(1)
print("Storing processed elements in Qdrant...") master_bar.set_description("Storing in Qdrant")
# Combine all processed elements # Combine all processed elements
final_documents = processed_text + processed_images + processed_tables final_documents = processed_text + processed_images + processed_tables
# Store in Qdrant # Store in Qdrant
self._store_documents(final_documents) self._store_documents(final_documents)
master_bar.update(1)
return { return {
"text_chunks": len(processed_text), "text_chunks": len(processed_text),
@@ -199,7 +243,15 @@ class PdfProcessor:
def _summarize_text(self, chunks: List[Document]) -> List[str]: def _summarize_text(self, chunks: List[Document]) -> List[str]:
"""Generate summaries for text chunks.""" """Generate summaries for text chunks."""
return self.summarize_chain.batch([chunk.page_content for chunk in chunks], {"max_concurrency": 3}) if not chunks:
return []
print(f"Summarizing {len(chunks)} text chunks...")
results = []
for chunk in tqdm(chunks, desc="Text summarization", leave=False):
result = self.summarize_chain.invoke(chunk.page_content)
results.append(result)
return results
def _extract_images(self, documents: List[Document]) -> List[Dict[str, Any]]: def _extract_images(self, documents: List[Document]) -> List[Dict[str, Any]]:
"""Extract images with captions from documents.""" """Extract images with captions from documents."""
@@ -225,12 +277,17 @@ class PdfProcessor:
def _process_images(self, images: List[Dict[str, Any]]) -> List[str]: def _process_images(self, images: List[Dict[str, Any]]) -> List[str]:
"""Generate descriptions for images using configured model.""" """Generate descriptions for images using configured model."""
if not images:
return []
print(f"Processing {len(images)} images...")
if self.config["image_provider"] == "ollama": if self.config["image_provider"] == "ollama":
from ollama import Client from ollama import Client
client = Client(host=self.config["ollama_image_url"]) client = Client(host=self.config["ollama_image_url"])
image_summaries = [] image_summaries = []
for img in images: for img in tqdm(images, desc="Image processing", leave=False):
prompt = f"Caption of image: {img.get('caption', '')}. Describe this image in detail in {self.config['summary_language']}." prompt = f"Caption of image: {img.get('caption', '')}. Describe this image in detail in {self.config['summary_language']}."
response = client.chat( response = client.chat(
model=self.config["ollama_image_model"], model=self.config["ollama_image_model"],
@@ -261,9 +318,17 @@ class PdfProcessor:
] ]
prompt = ChatPromptTemplate.from_messages(messages) prompt = ChatPromptTemplate.from_messages(messages)
chain = prompt | ChatOpenAI(model=self.config["openai_image_model"]) | StrOutputParser() chain = prompt | ChatOpenAI(model=self.config["openai_image_model"], http_client=http_client) | StrOutputParser()
return chain.batch([{"image_base64": img["image_base64"], "caption": img.get("caption", "")} for img in images]) # Process images with progress bar
results = []
image_data = [{"image_base64": img["image_base64"], "caption": img.get("caption", "")} for img in images]
for img_data in tqdm(image_data, desc="Image processing", leave=False):
result = chain.invoke(img_data)
results.append(result)
return results
def _extract_tables(self, documents: List[Document]) -> List[Dict[str, Any]]: def _extract_tables(self, documents: List[Document]) -> List[Dict[str, Any]]:
"""Extract tables with captions from documents.""" """Extract tables with captions from documents."""
@@ -290,9 +355,13 @@ class PdfProcessor:
def _process_tables(self, tables: List[Dict[str, Any]]) -> List[str]: def _process_tables(self, tables: List[Dict[str, Any]]) -> List[str]:
"""Generate summaries for tables.""" """Generate summaries for tables."""
if not tables:
return []
print(f"Processing {len(tables)} tables...")
table_summaries = [] table_summaries = []
for table in tables: for table in tqdm(tables, desc="Table processing", leave=False):
prompt = f"""Caption of table: {table.get('caption', '')}. prompt = f"""Caption of table: {table.get('caption', '')}.
Describe this table in detail in {self.config['summary_language']}. Describe this table in detail in {self.config['summary_language']}.
Table content: {table.get('table_data', '')}""" Table content: {table.get('table_data', '')}"""
@@ -482,10 +551,85 @@ class PdfProcessor:
return final_chunks return final_chunks
def process_directory(self, directory_path: str) -> Dict[str, Any]:
    """
    Process every PDF file located directly inside a directory.

    Each file is handed to ``self.process_pdf`` and the per-file chunk
    counts it returns are accumulated into one statistics dictionary.
    A failure on one file is printed, recorded under ``failed_files`` and
    does not stop the remaining files from being processed.

    Args:
        directory_path: Path to the directory containing PDF files.
            Sub-directories are not searched.

    Returns:
        Dictionary with the keys ``files_processed``, ``total_text_chunks``,
        ``total_image_chunks``, ``total_table_chunks``, ``total_chunks``,
        ``collection_name``, ``file_details`` (one entry per successfully
        processed file) and ``failed_files`` (one ``{"filename", "error"}``
        entry per file that raised). The same keys are returned when the
        directory contains no PDF files, so callers can read any of them
        unconditionally.

    Raises:
        ValueError: If ``directory_path`` is not an existing directory.
    """
    # Check if directory exists
    if not os.path.isdir(directory_path):
        raise ValueError(f"Directory not found: {directory_path}")

    # Track overall statistics (built up-front so every return path,
    # including the "nothing to do" one, has the same shape)
    overall_stats = {
        "files_processed": 0,
        "total_text_chunks": 0,
        "total_image_chunks": 0,
        "total_table_chunks": 0,
        "total_chunks": 0,
        "collection_name": self.config["collection_name"],
        "file_details": [],
        "failed_files": [],
    }

    # Find all PDF files in the directory. Listing the directory instead of
    # globbing keeps paths containing glob metacharacters ("[", "*", "?")
    # working, lets the extension match case-insensitively (".PDF"), and
    # sorting makes the processing order reproducible. Dot-files are skipped,
    # matching the behaviour of a "*.pdf" glob pattern.
    pdf_files = sorted(
        os.path.join(directory_path, name)
        for name in os.listdir(directory_path)
        if name.lower().endswith(".pdf")
        and not name.startswith(".")
        and os.path.isfile(os.path.join(directory_path, name))
    )

    if not pdf_files:
        print(f"No PDF files found in {directory_path}")
        return overall_stats

    # (key in the per-file result, key in the aggregated statistics)
    counters = (
        ("text_chunks", "total_text_chunks"),
        ("image_chunks", "total_image_chunks"),
        ("table_chunks", "total_table_chunks"),
        ("total_chunks", "total_chunks"),
    )

    # Process each PDF file with a progress bar
    print(f"Found {len(pdf_files)} PDF files in {directory_path}")
    for pdf_file in tqdm(pdf_files, desc="Processing PDF files", unit="file"):
        filename = os.path.basename(pdf_file)
        try:
            print(f"\nProcessing: {filename}")
            result = self.process_pdf(pdf_file)
        except Exception as e:
            # Deliberate best-effort: one broken PDF must not abort the
            # batch, but the failure is kept so the caller can act on it.
            print(f"Error processing {pdf_file}: {str(e)}")
            overall_stats["failed_files"].append(
                {"filename": filename, "error": str(e)}
            )
            continue

        # Update statistics and store individual file results
        overall_stats["files_processed"] += 1
        file_detail = {"filename": filename}
        for result_key, total_key in counters:
            count = result.get(result_key, 0)
            file_detail[result_key] = count
            overall_stats[total_key] += count
        overall_stats["file_details"].append(file_detail)

        print(f"Completed: {file_detail['filename']} - {file_detail['total_chunks']} chunks processed")

    print("\nDirectory processing complete!")
    print(f"Processed {overall_stats['files_processed']} files")
    if overall_stats["failed_files"]:
        print(f"Failed: {len(overall_stats['failed_files'])} files")
    print(f"Total chunks: {overall_stats['total_chunks']}")
    print(f" - Text chunks: {overall_stats['total_text_chunks']}")
    print(f" - Image chunks: {overall_stats['total_image_chunks']}")
    print(f" - Table chunks: {overall_stats['total_table_chunks']}")
    print(f"All content stored in collection: {overall_stats['collection_name']}")

    return overall_stats
import glob
import os
processor = PdfProcessor({ processor = PdfProcessor({
"image_provider": "openai", # "image_provider": "openai",
"openai_api_key": "<REDACTED>", # "openai_api_key": "<REDACTED>",
"collection_name": "my_custom_collection", "collection_name": "my_control_and calibration",
"summary_language": "English" "summary_language": "English"
}) })
result = processor.process_pdf(r"F:\Dev\Rag\chat_bot_rag\T4 Machines thermiques.pdf")
results = processor.process_directory(r"C:\Users\serameza\host-data")