Compare commits

...

3 Commits

Author SHA1 Message Date
9fd056baaf Enhance chatbot UI by increasing height, adding copy button, and refining image gallery display 2025-03-09 15:30:54 +01:00
819d3a0956 Upload files to "/"
add progress bar plus folder inspect for pdf files
2025-03-09 12:31:02 +01:00
0cddd0842f Upload files to "/"
update code and add progress bar
2025-03-09 12:21:12 +01:00
3 changed files with 288 additions and 158 deletions

View File

@@ -9,6 +9,9 @@ from translations.lang_mappings import LANGUAGE_MAPPING
from utils.image_utils import base64_to_image
from langchain.callbacks.base import BaseCallbackHandler
import re
from typing import List, Union, Dict, Any
# Pour Gradio 4.x
# from gradio.types.message import ImageMessage, HtmlMessage, TextMessage
def clean_llm_response(text):
"""Nettoie la réponse du LLM en enlevant les balises de pensée et autres éléments non désirés."""
@@ -53,7 +56,9 @@ def display_images(images_list=None):
for img_data in images_to_use:
image = img_data["image"]
if image:
caption = f"{img_data['caption']} (Source: {img_data['source']}, Page: {img_data['page']})"
# Supprimer les infos de type "(Texte 5)" dans la caption
caption = re.sub(pattern_texte, '', img_data["caption"])
caption = f"{caption} (Source: {img_data['source']}, Page: {img_data['page']})"
gallery.append((image, caption))
return gallery if gallery else None
@@ -171,71 +176,75 @@ def convert_to_messages_format(history):
messages.append({"role": "user", "content": user_msg})
if assistant_msg: # Éviter les messages vides
messages.append({"role": "assistant", "content": assistant_msg})
except ValueError:
except Exception as e:
# Journaliser l'erreur pour le débogage
print(f"Format d'historique non reconnu: {history}")
print(f"Erreur: {str(e)}")
# Retourner un historique vide en cas d'erreur
return []
return messages
# Définir le pattern de l'expression régulière en dehors de la f-string
pattern_texte = r'\(Texte \d+\)'
def process_query(message, history, streaming, show_sources, max_images, language):
global current_images, current_tables
# Debug plus clair
print(f"Langue sélectionnée pour la réponse: {language} -> {LANGUAGE_MAPPING.get(language, 'français')}")
print(f"Language selected for response: {language} -> {LANGUAGE_MAPPING.get(language, 'français')}")
if not message.strip():
return history, "", None, None
current_images = []
current_tables = []
print(f"Traitement du message: {message}")
print(f"Streaming: {streaming}")
try:
if streaming:
# Convertir history en format messages pour l'affichage
# Convert history to messages format
messages_history = convert_to_messages_format(history)
if streaming:
# Add user message to history
messages_history.append({"role": "user", "content": message})
# Add empty message for assistant response
messages_history.append({"role": "assistant", "content": ""})
# 1. Récupérer les documents pertinents
# Get relevant documents
docs = rag_bot._retrieve_relevant_documents(message)
# 2. Préparer le contexte et l'historique
# Process context and history
context = rag_bot._format_documents(docs)
history_text = rag_bot._format_chat_history()
# 3. Préparer le prompt
# Create prompt
prompt_template = ChatPromptTemplate.from_template("""
Tu es un assistant documentaire spécialisé qui utilise le contexte fourni.
You are a specialized document assistant that uses the provided context.
===== INSTRUCTION CRUCIALE SUR LA LANGUE =====
RÉPONDS UNIQUEMENT EN {language}. C'est une exigence ABSOLUE.
NE RÉPONDS JAMAIS dans une autre langue que {language}, quelle que soit la langue de la question.
===== CRITICAL LANGUAGE INSTRUCTION =====
RESPOND ONLY IN {language}. This is an ABSOLUTE requirement.
NEVER RESPOND in any language other than {language}, regardless of question language.
==============================================
Instructions spécifiques:
1. Pour chaque image mentione: inclure la légende, source, page et description
2. Pour chaque tableau: inclure titre, source, page et signification
3. Pour les équations: utiliser la syntaxe LaTeX exacte
4. Ne pas inventer d'informations hors du contexte fourni
5. Citer précisément les sources
Specific instructions:
1. For each image mentioned: include caption, source, page and description
2. For each table: include title, source, page and significance
3. For equations: use exact LaTeX syntax
4. Don't invent information outside the provided context
5. Cite sources precisely
Historique de conversation:
Conversation history:
{chat_history}
Contexte:
Context:
{context}
Question: {question}
Réponds de façon structurée en intégrant les images, tableaux et équations disponibles.
TA RÉPONSE DOIT ÊTRE UNIQUEMENT ET ENTIÈREMENT EN {language}. CETTE RÈGLE EST ABSOLUE.
Respond in a structured way incorporating available images, tables and equations.
YOUR RESPONSE MUST BE SOLELY AND ENTIRELY IN {language}. THIS RULE IS ABSOLUTE.
""")
# Assurer que la langue est bien passée dans le format du prompt
# Set language for the response
selected_language = LANGUAGE_MAPPING.get(language, "français")
messages = prompt_template.format_messages(
chat_history=history_text,
@@ -244,10 +253,10 @@ def process_query(message, history, streaming, show_sources, max_images, languag
language=selected_language
)
# 5. Créer un handler de streaming personnalisé
# Create streaming handler
handler = GradioStreamingHandler()
# 6. Créer un modèle LLM avec notre handler
# Create LLM model with our handler
streaming_llm = ChatOllama(
model=rag_bot.llm.model,
base_url=rag_bot.llm.base_url,
@@ -255,87 +264,81 @@ def process_query(message, history, streaming, show_sources, max_images, languag
callbacks=[handler]
)
# 7. Lancer la génération dans un thread pour ne pas bloquer l'UI
# Generate response in a separate thread
def generate_response():
streaming_llm.invoke(messages)
thread = threading.Thread(target=generate_response)
thread.start()
# 8. Récupérer les tokens et mettre à jour l'interface
# Process tokens and update interface
partial_response = ""
# Attendre les tokens avec un timeout
# Wait for tokens with timeout
while thread.is_alive() or not handler.tokens_queue.empty():
try:
token = handler.tokens_queue.get(timeout=0.05)
partial_response += token
# Nettoyer la réponse uniquement pour l'affichage (pas pour l'historique interne)
# Clean response for display
clean_response = clean_llm_response(partial_response)
# Mettre à jour le dernier message (assistant)
# Update assistant message - JUST TEXT, not multimodal
messages_history[-1]["content"] = clean_response
yield messages_history, "", None, None
except queue.Empty:
continue
# Après la boucle, nettoyer la réponse complète pour l'historique interne
# After loop, clean the complete response for internal history
partial_response = clean_llm_response(partial_response)
rag_bot.chat_history.append({"role": "user", "content": message})
rag_bot.chat_history.append({"role": "assistant", "content": partial_response})
# 10. Récupérer les sources, images, tableaux
# Get sources, images, tables
texts, images, tables = rag_bot._process_documents(docs)
# Préparer les informations sur les sources
# Process sources
source_info = ""
if texts:
source_info += f"📚 {len(texts)} textes • "
if images:
source_info += f"🖼️ {len(images)} images • "
if tables:
source_info += f"📊 {len(tables)} tableaux"
clean_texts = [re.sub(pattern_texte, '', t.get("source", "")) for t in texts]
# Remove duplicates and empty items
clean_texts = [t for t in clean_texts if t.strip()]
clean_texts = list(set(clean_texts))
if clean_texts:
source_info += f"📚 Sources: {', '.join(clean_texts)}"
if source_info:
source_info = "Sources trouvées: " + source_info
# 11. Traiter les images
if show_sources and images:
images = images[:max_images]
for img in images:
# Process images and tables for SEPARATE display only
if show_sources and images and max_images > 0:
for img in images[:max_images]:
img_data = img.get("image_data")
if img_data:
image = base64_to_image(img_data)
if image:
caption = re.sub(pattern_texte, '', img.get("caption", ""))
# Only add to gallery, not to chat messages
current_images.append({
"image": image,
"caption": img.get("caption", ""),
"caption": caption,
"source": img.get("source", ""),
"page": img.get("page", ""),
"description": img.get("description", "")
"page": img.get("page", "")
})
# 12. Traiter les tableaux
if show_sources and tables:
for table in tables:
current_tables.append({
"data": rag_bot.format_table(table.get("table_data", "")),
"caption": table.get("caption", ""),
"source": table.get("source", ""),
"page": table.get("page", ""),
"description": table.get("description", "")
})
# 13. Retourner les résultats finaux
images_display = display_images()
tables_display = display_tables()
yield messages_history, source_info, images_display, tables_display
# Final yield with separate image gallery
yield messages_history, source_info, display_images(), display_tables()
else:
# Version sans streaming
# Version non-streaming
print("Mode non-streaming activé")
source_info = ""
history_tuples = history if isinstance(history, list) else []
# Ajouter le message utilisateur à l'historique au format message
messages_history.append({"role": "user", "content": message})
# Initialize multimodal_content first
multimodal_content = [result["response"]] # Start with text response
# Après avoir obtenu le résultat
result = rag_bot.chat(
message,
stream=False,
@@ -344,12 +347,10 @@ def process_query(message, history, streaming, show_sources, max_images, languag
# Nettoyer la réponse des balises <think>
result["response"] = clean_llm_response(result["response"])
# Convertir l'historique au format messages
messages_history = convert_to_messages_format(history)
messages_history.append({"role": "user", "content": message})
# Ajouter la réponse de l'assistant au format message
messages_history.append({"role": "assistant", "content": result["response"]})
# Mise à jour de l'historique interne
# Mise à jour de l'historique interne du chatbot
rag_bot.chat_history.append({"role": "user", "content": message})
rag_bot.chat_history.append({"role": "assistant", "content": result["response"]})
@@ -364,33 +365,23 @@ def process_query(message, history, streaming, show_sources, max_images, languag
if source_info:
source_info = "Sources trouvées: " + source_info
# Traiter les images et tableaux
# Process images for SEPARATE gallery
if show_sources and "images" in result and result["images"]:
images = result["images"][:max_images]
for img in images:
for img in result["images"][:max_images]:
img_data = img.get("image_data")
if img_data:
image = base64_to_image(img_data)
if image:
caption = re.sub(pattern_texte, '', img.get("caption", ""))
# Only add to gallery
current_images.append({
"image": image,
"caption": img.get("caption", ""),
"caption": caption,
"source": img.get("source", ""),
"page": img.get("page", ""),
"description": img.get("description", "")
})
if show_sources and "tables" in result and result["tables"]:
tables = result["tables"]
for table in tables:
current_tables.append({
"data": rag_bot.format_table(table.get("table_data", "")),
"caption": table.get("caption", ""),
"source": table.get("source", ""),
"page": table.get("page", ""),
"description": table.get("description", "")
"page": img.get("page", "")
})
# Final yield with separate displays
yield messages_history, source_info, display_images(), display_tables()
except Exception as e:
@@ -398,8 +389,13 @@ def process_query(message, history, streaming, show_sources, max_images, languag
traceback_text = traceback.format_exc()
print(error_msg)
print(traceback_text)
history = history + [(message, error_msg)]
yield history, "Erreur lors du traitement de la requête", None, None
# Formater l'erreur au format message
error_history = convert_to_messages_format(history)
error_history.append({"role": "user", "content": message})
error_history.append({"role": "assistant", "content": error_msg})
yield error_history, "Erreur lors du traitement de la requête", None, None
# Fonction pour réinitialiser la conversation
def reset_conversation():
@@ -410,4 +406,4 @@ def reset_conversation():
rag_bot.clear_history()
# Retourner une liste vide au format messages
return [], "", None, None
return [], "", None, None # Liste vide = pas de messages

View File

@@ -73,11 +73,11 @@ def build_interface(
with gr.Row():
with gr.Column(scale=2):
chat_interface = gr.Chatbot(
height=600,
show_label=False,
layout="bubble",
elem_id="chatbot",
type="messages" # Ajoutez cette ligne
height=800,
bubble_full_width=False,
show_copy_button=True,
type="messages"
# likeable=False,
)
with gr.Row():
@@ -144,17 +144,9 @@ def build_interface(
label=ui_elements['max_images_label']
)
gr.Markdown("---")
# Ne pas supprimer ces lignes dans ui.py
images_title = gr.Markdown(f"### {ui_elements['images_title']}")
image_gallery = gr.Gallery(
label=ui_elements['images_title'],
show_label=False,
columns=2,
height=300,
object_fit="contain"
)
image_gallery = gr.Gallery(label="Images")
tables_title = gr.Markdown(f"### {ui_elements['tables_title']}")
tables_display = gr.HTML()
@@ -190,9 +182,7 @@ def build_interface(
apply_collection_btn,
streaming,
show_sources,
max_images,
images_title,
tables_title
max_images
]
)
@@ -215,7 +205,7 @@ def build_interface(
clear_btn.click(
reset_conversation_fn,
outputs=[chat_interface, source_info, image_gallery, tables_display]
outputs=[chat_interface, source_info] # Retirer image_gallery et tables_display
)
# Connecter le changement de modèle
@@ -236,7 +226,7 @@ def build_interface(
gr.Markdown("""
<style>
.gradio-container {max-width: 1200px !important}
#chatbot {height: 600px; overflow-y: auto;}
#chatbot {height: 800px; overflow-y: auto;}
#sources_info {margin-top: 10px; color: #666;}
/* Improved styles for equations */

View File

@@ -7,7 +7,9 @@ from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
import httpx
from tqdm import tqdm
http_client = httpx.Client(verify=False)
class PdfProcessor:
"""
@@ -81,6 +83,40 @@ class PdfProcessor:
raise ValueError("OpenAI API key is required when using OpenAI models")
os.environ["OPENAI_API_KEY"] = self.config["openai_api_key"]
# Initialize Qdrant client
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
self.qdrant_client = QdrantClient(url=self.config["qdrant_url"])
# Check if collection exists and create it if not
collections = self.qdrant_client.get_collections().collections
collection_exists = any(collection.name == self.config["collection_name"] for collection in collections)
if not collection_exists:
# Get vector size based on embedding model
if self.config["embedding_provider"] == "ollama":
# For OllamaEmbeddings, typically 4096 dimensions for newer models
vector_size = 4096
else: # OpenAI
# OpenAI embedding dimensions vary by model
model_dimensions = {
"text-embedding-ada-002": 1536,
"text-embedding-3-small": 1536,
"text-embedding-3-large": 3072
}
vector_size = model_dimensions.get(self.config["openai_embedding_model"], 1536)
# Create the collection
self.qdrant_client.create_collection(
collection_name=self.config["collection_name"],
vectors_config=rest.VectorParams(
size=vector_size,
distance=rest.Distance.COSINE
)
)
print(f"Created new Qdrant collection: {self.config['collection_name']}")
def _setup_models(self):
"""Initialize models based on configuration."""
# Set up embedding model
@@ -106,6 +142,7 @@ class PdfProcessor:
else: # openai
from langchain_openai import ChatOpenAI
self.summary_model = ChatOpenAI(
http_client=http_client,
model=self.config["openai_summary_model"]
)
@@ -134,38 +171,45 @@ class PdfProcessor:
Returns:
Dictionary with processing statistics
"""
# Create a master progress bar
with tqdm(total=5, desc="PDF Processing", position=0) as master_bar:
# Load and extract content from PDF
print("Loading PDF and extracting elements...")
master_bar.set_description("Loading PDF")
documents = self._load_pdf(pdf_path)
master_bar.update(1)
# Process text chunks
print("Processing text chunks...")
master_bar.set_description("Processing text chunks")
title_chunks = self._process_text(documents)
text_summaries = self._summarize_text(title_chunks)
processed_text = self._convert_text_to_documents(title_chunks, text_summaries)
master_bar.update(1)
# Process images if configured
print("Processing images...")
master_bar.set_description("Processing images")
processed_images = []
if self.config["extract_images"]:
images = self._extract_images(documents)
image_summaries = self._process_images(images)
processed_images = self._convert_images_to_documents(images, image_summaries)
master_bar.update(1)
# Process tables if configured
print("Processing tables...")
master_bar.set_description("Processing tables")
processed_tables = []
if self.config["extract_tables"]:
tables = self._extract_tables(documents)
table_summaries = self._process_tables(tables)
processed_tables = self._convert_tables_to_documents(tables, table_summaries)
master_bar.update(1)
print("Storing processed elements in Qdrant...")
master_bar.set_description("Storing in Qdrant")
# Combine all processed elements
final_documents = processed_text + processed_images + processed_tables
# Store in Qdrant
self._store_documents(final_documents)
master_bar.update(1)
return {
"text_chunks": len(processed_text),
@@ -199,7 +243,15 @@ class PdfProcessor:
def _summarize_text(self, chunks: List[Document]) -> List[str]:
    """Generate summaries for text chunks.

    Invokes the summarize chain once per chunk (sequentially) so a tqdm
    progress bar can track progress, instead of batching.

    Args:
        chunks: Text chunks to summarize; may be empty.

    Returns:
        One summary string per input chunk, in input order.

    NOTE(review): a stale line left over from a merge
    (`return self.summarize_chain.batch(...)`) previously sat above this
    implementation and made it unreachable; it has been removed.
    """
    if not chunks:
        return []
    print(f"Summarizing {len(chunks)} text chunks...")
    results = []
    # leave=False keeps the nested bar from cluttering the master bar output.
    for chunk in tqdm(chunks, desc="Text summarization", leave=False):
        results.append(self.summarize_chain.invoke(chunk.page_content))
    return results
def _extract_images(self, documents: List[Document]) -> List[Dict[str, Any]]:
"""Extract images with captions from documents."""
@@ -225,12 +277,17 @@ class PdfProcessor:
def _process_images(self, images: List[Dict[str, Any]]) -> List[str]:
"""Generate descriptions for images using configured model."""
if not images:
return []
print(f"Processing {len(images)} images...")
if self.config["image_provider"] == "ollama":
from ollama import Client
client = Client(host=self.config["ollama_image_url"])
image_summaries = []
for img in images:
for img in tqdm(images, desc="Image processing", leave=False):
prompt = f"Caption of image: {img.get('caption', '')}. Describe this image in detail in {self.config['summary_language']}."
response = client.chat(
model=self.config["ollama_image_model"],
@@ -261,9 +318,17 @@ class PdfProcessor:
]
prompt = ChatPromptTemplate.from_messages(messages)
chain = prompt | ChatOpenAI(model=self.config["openai_image_model"]) | StrOutputParser()
chain = prompt | ChatOpenAI(model=self.config["openai_image_model"], http_client=http_client) | StrOutputParser()
return chain.batch([{"image_base64": img["image_base64"], "caption": img.get("caption", "")} for img in images])
# Process images with progress bar
results = []
image_data = [{"image_base64": img["image_base64"], "caption": img.get("caption", "")} for img in images]
for img_data in tqdm(image_data, desc="Image processing", leave=False):
result = chain.invoke(img_data)
results.append(result)
return results
def _extract_tables(self, documents: List[Document]) -> List[Dict[str, Any]]:
"""Extract tables with captions from documents."""
@@ -290,9 +355,13 @@ class PdfProcessor:
def _process_tables(self, tables: List[Dict[str, Any]]) -> List[str]:
"""Generate summaries for tables."""
if not tables:
return []
print(f"Processing {len(tables)} tables...")
table_summaries = []
for table in tables:
for table in tqdm(tables, desc="Table processing", leave=False):
prompt = f"""Caption of table: {table.get('caption', '')}.
Describe this table in detail in {self.config['summary_language']}.
Table content: {table.get('table_data', '')}"""
@@ -482,10 +551,85 @@ class PdfProcessor:
return final_chunks
def process_directory(self, directory_path: str) -> Dict[str, Any]:
    """
    Process all PDF files in the specified directory.

    Args:
        directory_path: Path to the directory containing PDF files

    Returns:
        Dictionary with processing statistics for all files
    """
    # Fail fast on a non-existent directory.
    if not os.path.isdir(directory_path):
        raise ValueError(f"Directory not found: {directory_path}")

    # Non-recursive: only *.pdf files directly inside the directory.
    pdf_files = glob.glob(os.path.join(directory_path, "*.pdf"))
    if not pdf_files:
        print(f"No PDF files found in {directory_path}")
        return {"files_processed": 0}

    # Aggregate statistics across every file processed.
    stats: Dict[str, Any] = {
        "files_processed": 0,
        "total_text_chunks": 0,
        "total_image_chunks": 0,
        "total_table_chunks": 0,
        "total_chunks": 0,
        "collection_name": self.config["collection_name"],
        "file_details": [],
    }

    print(f"Found {len(pdf_files)} PDF files in {directory_path}")
    for pdf_path in tqdm(pdf_files, desc="Processing PDF files", unit="file"):
        try:
            print(f"\nProcessing: {os.path.basename(pdf_path)}")
            result = self.process_pdf(pdf_path)

            stats["files_processed"] += 1
            # Roll the per-file counters into the running totals.
            for src_key, total_key in (
                ("text_chunks", "total_text_chunks"),
                ("image_chunks", "total_image_chunks"),
                ("table_chunks", "total_table_chunks"),
                ("total_chunks", "total_chunks"),
            ):
                stats[total_key] += result.get(src_key, 0)

            detail = {
                "filename": os.path.basename(pdf_path),
                "text_chunks": result.get("text_chunks", 0),
                "image_chunks": result.get("image_chunks", 0),
                "table_chunks": result.get("table_chunks", 0),
                "total_chunks": result.get("total_chunks", 0),
            }
            stats["file_details"].append(detail)
            print(f"Completed: {detail['filename']} - {detail['total_chunks']} chunks processed")
        except Exception as e:
            # Best-effort batch: report the failure and continue with the next file.
            print(f"Error processing {pdf_path}: {str(e)}")

    print("\nDirectory processing complete!")
    print(f"Processed {stats['files_processed']} files")
    print(f"Total chunks: {stats['total_chunks']}")
    print(f" - Text chunks: {stats['total_text_chunks']}")
    print(f" - Image chunks: {stats['total_image_chunks']}")
    print(f" - Table chunks: {stats['total_table_chunks']}")
    print(f"All content stored in collection: {stats['collection_name']}")
    return stats
# NOTE(review): these imports belong at the top of the file; kept here only to
# preserve the module layout visible in this chunk.
import glob
import os

# Guarded entry point: the original ran this at import time, which is an
# anti-pattern for a module that other code imports.
if __name__ == "__main__":
    # SECURITY: a real OpenAI API key was previously hard-coded here (twice,
    # including in commented-out residue). That key is now exposed in version
    # control history and MUST be revoked/rotated. Read it from the
    # environment instead of embedding it in source.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY environment variable is not set; "
            "refusing to run with a hard-coded credential."
        )

    processor = PdfProcessor({
        "image_provider": "openai",
        "openai_api_key": api_key,
        # NOTE(review): the original dict defined "collection_name" twice;
        # in Python the later value wins, so only that value is kept here.
        "collection_name": "my_control_and calibration",
        "summary_language": "English",
    })

    # Single-file run followed by a directory batch run, as in the original.
    result = processor.process_pdf(r"F:\Dev\Rag\chat_bot_rag\T4 Machines thermiques.pdf")
    results = processor.process_directory(r"C:\Users\serameza\host-data")