Compare commits

..

5 Commits

Author SHA1 Message Date
9fd056baaf Enhance chatbot UI by increasing height, adding copy button, and refining image gallery display 2025-03-09 15:30:54 +01:00
819d3a0956 Upload files to "/" (add progress bar plus folder inspect for pdf files) 2025-03-09 12:31:02 +01:00
0cddd0842f Upload files to "/" (update code and add progress bar) 2025-03-09 12:21:12 +01:00
0a9e2d4567 Refactor language options and enhance table description display in chatbot UI 2025-03-09 09:08:09 +01:00
9d142c269d Update UI language handling and improve .gitignore for Python artifacts 2025-03-09 09:06:54 +01:00
9 changed files with 1044 additions and 266 deletions

.gitignore vendored (8 changes)

@@ -1 +1,7 @@
 apigit.txt
+*.pyc
+*.pyo
+*.pyd
+# Dossier de cache
+__pycache__/

app.py (44 changes)

@@ -1,22 +1,59 @@
 # filepath: f:\Dev\Rag\chat_bot_rag\app.py
 import gradio as gr
+from config.settings import DEFAULT_MODEL, QDRANT_COLLECTION_NAME, AVAILABLE_MODELS
 from services.rag_service import initialize_rag_bot
 from components.chatbot import process_query, reset_conversation, change_model, change_collection
 from components.ui import build_interface, update_ui_language_elements
+from translations.lang_mappings import UI_TRANSLATIONS, UI_SUPPORTED_LANGUAGES, LANGUAGE_MAPPING
+def update_ui_language(language):
+    """Fonction pour mettre à jour la langue de l'interface utilisateur"""
+    if language not in UI_SUPPORTED_LANGUAGES:
+        language = "Français" # Langue par défaut
+    # Récupérer les traductions pour la langue sélectionnée
+    translations = UI_TRANSLATIONS[language]
+    # Afficher un message de débogage
+    print(f"Mise à jour de la langue UI : {language}")
+    print(f"AVAILABLE_MODELS : {AVAILABLE_MODELS}")
+    # Retourner les valeurs mises à jour pour tous les éléments de l'interface
+    return [
+        f"# {translations['title']}", # Titre
+        gr.update(placeholder=translations["placeholder"]), # Placeholder du message
+        gr.update(value=translations["send_btn"]), # Texte du bouton d'envoi
+        gr.update(value=translations["clear_btn"]), # Texte du bouton d'effacement
+        gr.update(label=translations["ui_language_label"], info=translations["ui_language_info"]), # Label sélecteur langue UI
+        # IMPORTANT : Conserver les choices=AVAILABLE_MODELS ici
+        gr.update(label=translations["model_selector"], info=translations["model_info"], choices=AVAILABLE_MODELS),
+        f"{translations['model_current']}: **{DEFAULT_MODEL}**", # Statut du modèle
+        gr.update(label=translations["language_selector"], info=translations["language_info"], choices=list(LANGUAGE_MAPPING.keys())), # Langue réponses
+        gr.update(label=translations["collection_input"], info=translations["collection_info"]), # Label du champ de collection
+        f"{translations['collection_current']}: **{QDRANT_COLLECTION_NAME}**", # Statut de la collection
+        gr.update(value=translations["apply_btn"]), # Texte du bouton d'application
+        gr.update(label=translations["streaming_label"], info=translations["streaming_info"]), # Label du mode streaming
+        gr.update(label=translations["sources_label"]), # Label de l'affichage des sources
+        gr.update(label=translations["max_images_label"]), # Label du nombre max d'images
+        f"### {translations['images_title']}", # Titre des images
+        f"### {translations['tables_title']}" # Titre des tableaux
+    ]
 def main():
     """Main entry point for the chatbot application"""
     # Initialize the RAG chatbot
     initialize_rag_bot()
-    # Construire l'interface
+    # Dans app.py, corriger l'appel à build_interface
     interface = build_interface(
         process_query_fn=process_query,
         reset_conversation_fn=reset_conversation,
         change_model_fn=change_model,
         change_collection_fn=change_collection,
-        update_ui_language_fn=update_ui_language_elements # Ajout du paramètre manquant
+        update_ui_language_fn=update_ui_language # Utiliser update_ui_language, pas update_ui_language_elements
     )
     # Lancer l'appli Gradio
@@ -28,4 +65,5 @@ def main():
     )
 if __name__ == "__main__":
     main()

View File

@@ -9,6 +9,9 @@ from translations.lang_mappings import LANGUAGE_MAPPING
 from utils.image_utils import base64_to_image
 from langchain.callbacks.base import BaseCallbackHandler
 import re
+from typing import List, Union, Dict, Any
+# Pour Gradio 4.x
+# from gradio.types.message import ImageMessage, HtmlMessage, TextMessage
 def clean_llm_response(text):
     """Nettoie la réponse du LLM en enlevant les balises de pensée et autres éléments non désirés."""
@@ -53,7 +56,9 @@ def display_images(images_list=None):
     for img_data in images_to_use:
         image = img_data["image"]
         if image:
-            caption = f"{img_data['caption']} (Source: {img_data['source']}, Page: {img_data['page']})"
+            # Supprimer les infos de type "(Texte 5)" dans la caption
+            caption = re.sub(pattern_texte, '', img_data["caption"])
+            caption = f"{caption} (Source: {img_data['source']}, Page: {img_data['page']})"
             gallery.append((image, caption))
     return gallery if gallery else None
@@ -155,81 +160,103 @@ def change_collection(collection_name, language="Français"):
return f"❌ Erreur: {str(e)}" return f"❌ Erreur: {str(e)}"
# Fonction de traitement de requête # Fonction de traitement de requête
def convert_to_messages_format(history):
"""Convertit différents formats d'historique au format messages."""
messages = []
# Vérifier si nous avons déjà le format messages
if history and isinstance(history[0], dict) and "role" in history[0]:
return history
# Format tuples [(user_msg, assistant_msg), ...]
try:
for item in history:
if isinstance(item, tuple) and len(item) == 2:
user_msg, assistant_msg = item
messages.append({"role": "user", "content": user_msg})
if assistant_msg: # Éviter les messages vides
messages.append({"role": "assistant", "content": assistant_msg})
except Exception as e:
# Journaliser l'erreur pour le débogage
print(f"Format d'historique non reconnu: {history}")
print(f"Erreur: {str(e)}")
# Retourner un historique vide en cas d'erreur
return []
return messages
# Définir le pattern de l'expression régulière en dehors de la f-string
pattern_texte = r'\(Texte \d+\)'
def process_query(message, history, streaming, show_sources, max_images, language): def process_query(message, history, streaming, show_sources, max_images, language):
global current_images, current_tables global current_images, current_tables
print(f"Language selected for response: {language} -> {LANGUAGE_MAPPING.get(language, 'français')}")
if not message.strip(): if not message.strip():
return history, "", None, None return history, "", None, None
current_images = [] current_images = []
current_tables = [] current_tables = []
print(f"Traitement du message: {message}")
print(f"Streaming: {streaming}")
try: try:
# Convert history to messages format
messages_history = convert_to_messages_format(history)
if streaming: if streaming:
# Version avec streaming dans Gradio # Add user message to history
history = history + [(message, "")] messages_history.append({"role": "user", "content": message})
# Add empty message for assistant response
messages_history.append({"role": "assistant", "content": ""})
# 1. Récupérer les documents pertinents # Get relevant documents
docs = rag_bot._retrieve_relevant_documents(message) docs = rag_bot._retrieve_relevant_documents(message)
# 2. Préparer le contexte et l'historique # Process context and history
context = rag_bot._format_documents(docs) context = rag_bot._format_documents(docs)
history_text = rag_bot._format_chat_history() history_text = rag_bot._format_chat_history()
# 3. Préparer le prompt # Create prompt
prompt_template = ChatPromptTemplate.from_template(""" prompt_template = ChatPromptTemplate.from_template("""
Tu es un assistant documentaire spécialisé qui utilise toutes les informations disponibles dans le contexte fourni. You are a specialized document assistant that uses the provided context.
TRÈS IMPORTANT: Tu dois répondre EXCLUSIVEMENT en {language}. Ne réponds JAMAIS dans une autre langue. ===== CRITICAL LANGUAGE INSTRUCTION =====
RESPOND ONLY IN {language}. This is an ABSOLUTE requirement.
NEVER RESPOND in any language other than {language}, regardless of question language.
==============================================
Instructions spécifiques: Specific instructions:
1. Pour chaque image mentionnée dans le contexte, inclue TOUJOURS dans ta réponse: 1. For each image mentioned: include caption, source, page and description
- La légende/caption exacte de l'image 2. For each table: include title, source, page and significance
- La source et le numéro de page 3. For equations: use exact LaTeX syntax
- Une description brève de ce qu'elle montre 4. Don't invent information outside the provided context
5. Cite sources precisely
2. Pour chaque tableau mentionné dans le contexte, inclue TOUJOURS: Conversation history:
- Le titre/caption exact du tableau
- La source et le numéro de page
- Ce que contient et signifie le tableau
3. Lorsque tu cites des équations mathématiques:
- Utilise la syntaxe LaTeX exacte comme dans le document ($...$ ou $$...$$)
- Reproduis-les fidèlement sans modification
4. IMPORTANT: Ne pas inventer d'informations - si une donnée n'est pas explicitement fournie dans le contexte,
indique clairement que cette information n'est pas disponible dans les documents fournis.
5. Cite précisément les sources pour chaque élément d'information (format: [Source, Page]).
6. CRUCIAL: Ta réponse doit être UNIQUEMENT et INTÉGRALEMENT en {language}, quelle que soit la langue de la question.
Historique de conversation:
{chat_history} {chat_history}
Contexte (à utiliser pour répondre): Context:
{context} {context}
Question: {question} Question: {question}
Réponds de façon structurée et précise en intégrant activement les images, tableaux et équations disponibles dans le contexte. Respond in a structured way incorporating available images, tables and equations.
Ta réponse doit être exclusivement en {language}. YOUR RESPONSE MUST BE SOLELY AND ENTIRELY IN {language}. THIS RULE IS ABSOLUTE.
""") """)
# 4. Formater les messages pour le LLM # Set language for the response
selected_language = LANGUAGE_MAPPING.get(language, "français")
messages = prompt_template.format_messages( messages = prompt_template.format_messages(
chat_history=history_text, chat_history=history_text,
context=context, context=context,
question=message, question=message,
language=LANGUAGE_MAPPING.get(language, "français") language=selected_language
) )
# 5. Créer un handler de streaming personnalisé # Create streaming handler
handler = GradioStreamingHandler() handler = GradioStreamingHandler()
# 6. Créer un modèle LLM avec notre handler # Create LLM model with our handler
streaming_llm = ChatOllama( streaming_llm = ChatOllama(
model=rag_bot.llm.model, model=rag_bot.llm.model,
base_url=rag_bot.llm.base_url, base_url=rag_bot.llm.base_url,
@@ -237,92 +264,93 @@ def process_query(message, history, streaming, show_sources, max_images, languag
callbacks=[handler] callbacks=[handler]
) )
# 7. Lancer la génération dans un thread pour ne pas bloquer l'UI # Generate response in a separate thread
def generate_response(): def generate_response():
streaming_llm.invoke(messages) streaming_llm.invoke(messages)
thread = threading.Thread(target=generate_response) thread = threading.Thread(target=generate_response)
thread.start() thread.start()
# 8. Récupérer les tokens et mettre à jour l'interface # Process tokens and update interface
partial_response = "" partial_response = ""
# Attendre les tokens avec un timeout # Wait for tokens with timeout
while thread.is_alive() or not handler.tokens_queue.empty(): while thread.is_alive() or not handler.tokens_queue.empty():
try: try:
token = handler.tokens_queue.get(timeout=0.05) token = handler.tokens_queue.get(timeout=0.05)
partial_response += token partial_response += token
# Nettoyer la réponse uniquement pour l'affichage (pas pour l'historique interne) # Clean response for display
clean_response = clean_llm_response(partial_response) clean_response = clean_llm_response(partial_response)
history[-1] = (message, clean_response) # Update assistant message - JUST TEXT, not multimodal
yield history, "", None, None messages_history[-1]["content"] = clean_response
yield messages_history, "", None, None
except queue.Empty: except queue.Empty:
continue continue
# Après la boucle, nettoyer la réponse complète pour l'historique interne # After loop, clean the complete response for internal history
partial_response = clean_llm_response(partial_response) partial_response = clean_llm_response(partial_response)
rag_bot.chat_history.append({"role": "user", "content": message}) rag_bot.chat_history.append({"role": "user", "content": message})
rag_bot.chat_history.append({"role": "assistant", "content": partial_response}) rag_bot.chat_history.append({"role": "assistant", "content": partial_response})
# 10. Récupérer les sources, images, tableaux # Get sources, images, tables
texts, images, tables = rag_bot._process_documents(docs) texts, images, tables = rag_bot._process_documents(docs)
# Préparer les informations sur les sources # Process sources
source_info = "" source_info = ""
if texts: if texts:
source_info += f"📚 {len(texts)} textes • " clean_texts = [re.sub(pattern_texte, '', t.get("source", "")) for t in texts]
if images: # Remove duplicates and empty items
source_info += f"🖼️ {len(images)} images • " clean_texts = [t for t in clean_texts if t.strip()]
if tables: clean_texts = list(set(clean_texts))
source_info += f"📊 {len(tables)} tableaux" if clean_texts:
source_info += f"📚 Sources: {', '.join(clean_texts)}"
if source_info: # Process images and tables for SEPARATE display only
source_info = "Sources trouvées: " + source_info if show_sources and images and max_images > 0:
for img in images[:max_images]:
# 11. Traiter les images
if show_sources and images:
images = images[:max_images]
for img in images:
img_data = img.get("image_data") img_data = img.get("image_data")
if img_data: if img_data:
image = base64_to_image(img_data) image = base64_to_image(img_data)
if image: if image:
caption = re.sub(pattern_texte, '', img.get("caption", ""))
# Only add to gallery, not to chat messages
current_images.append({ current_images.append({
"image": image, "image": image,
"caption": img.get("caption", ""), "caption": caption,
"source": img.get("source", ""), "source": img.get("source", ""),
"page": img.get("page", ""), "page": img.get("page", "")
"description": img.get("description", "")
}) })
# 12. Traiter les tableaux # Final yield with separate image gallery
if show_sources and tables: yield messages_history, source_info, display_images(), display_tables()
for table in tables:
current_tables.append({
"data": rag_bot.format_table(table.get("table_data", "")),
"caption": table.get("caption", ""),
"source": table.get("source", ""),
"page": table.get("page", ""),
"description": table.get("description", "")
})
# 13. Retourner les résultats finaux
images_display = display_images()
tables_display = display_tables()
yield history, source_info, images_display, tables_display
else: else:
# Version sans streaming # Version non-streaming
print("Mode non-streaming activé") print("Mode non-streaming activé")
source_info = "" source_info = ""
result = rag_bot.chat(message, stream=False) history_tuples = history if isinstance(history, list) else []
# Ajouter le message utilisateur à l'historique au format message
messages_history.append({"role": "user", "content": message})
# Initialize multimodal_content first
multimodal_content = [result["response"]] # Start with text response
# Après avoir obtenu le résultat
result = rag_bot.chat(
message,
stream=False,
language=LANGUAGE_MAPPING.get(language, "français") # Vérifiez que cette ligne existe
)
# Nettoyer la réponse des balises <think> # Nettoyer la réponse des balises <think>
result["response"] = clean_llm_response(result["response"]) result["response"] = clean_llm_response(result["response"])
history = history + [(message, result["response"])]
# Mise à jour de l'historique interne # Ajouter la réponse de l'assistant au format message
messages_history.append({"role": "assistant", "content": result["response"]})
# Mise à jour de l'historique interne du chatbot
rag_bot.chat_history.append({"role": "user", "content": message}) rag_bot.chat_history.append({"role": "user", "content": message})
rag_bot.chat_history.append({"role": "assistant", "content": result["response"]}) rag_bot.chat_history.append({"role": "assistant", "content": result["response"]})
@@ -337,42 +365,37 @@ def process_query(message, history, streaming, show_sources, max_images, languag
if source_info: if source_info:
source_info = "Sources trouvées: " + source_info source_info = "Sources trouvées: " + source_info
# Traiter les images et tableaux # Process images for SEPARATE gallery
if show_sources and "images" in result and result["images"]: if show_sources and "images" in result and result["images"]:
images = result["images"][:max_images] for img in result["images"][:max_images]:
for img in images:
img_data = img.get("image_data") img_data = img.get("image_data")
if img_data: if img_data:
image = base64_to_image(img_data) image = base64_to_image(img_data)
if image: if image:
caption = re.sub(pattern_texte, '', img.get("caption", ""))
# Only add to gallery
current_images.append({ current_images.append({
"image": image, "image": image,
"caption": img.get("caption", ""), "caption": caption,
"source": img.get("source", ""), "source": img.get("source", ""),
"page": img.get("page", ""), "page": img.get("page", "")
"description": img.get("description", "")
}) })
if show_sources and "tables" in result and result["tables"]: # Final yield with separate displays
tables = result["tables"] yield messages_history, source_info, display_images(), display_tables()
for table in tables:
current_tables.append({
"data": rag_bot.format_table(table.get("table_data", "")),
"caption": table.get("caption", ""),
"source": table.get("source", ""),
"page": table.get("page", ""),
"description": table.get("description", "")
})
yield history, source_info, display_images(), display_tables()
     except Exception as e:
         error_msg = f"Une erreur est survenue: {str(e)}"
         traceback_text = traceback.format_exc()
         print(error_msg)
         print(traceback_text)
-        history = history + [(message, error_msg)]
-        yield history, "Erreur lors du traitement de la requête", None, None
+        # Formater l'erreur au format message
+        error_history = convert_to_messages_format(history)
+        error_history.append({"role": "user", "content": message})
+        error_history.append({"role": "assistant", "content": error_msg})
+        yield error_history, "Erreur lors du traitement de la requête", None, None
 # Fonction pour réinitialiser la conversation
 def reset_conversation():
@@ -382,4 +405,5 @@ def reset_conversation():
     rag_bot.clear_history()
-    return [], "", None, None
+    # Retourner une liste vide au format messages
+    return [], "", None, None # Liste vide = pas de messages

View File

@@ -1,11 +1,58 @@
 import gradio as gr
 from config.settings import DEFAULT_MODEL, QDRANT_COLLECTION_NAME, AVAILABLE_MODELS
-from translations.lang_mappings import UI_TRANSLATIONS, UI_SUPPORTED_LANGUAGES
+from translations.lang_mappings import UI_TRANSLATIONS, UI_SUPPORTED_LANGUAGES, LANGUAGE_MAPPING
 from utils.katex_script import KATEX_CSS_JS
 def update_ui_language_elements(language):
-    """Met à jour les éléments de l'interface utilisateur en fonction de la langue sélectionnée"""
-    pass # Implémentez selon vos besoins
+    """Met à jour tous les éléments de l'interface avec la langue sélectionnée"""
+    # Vérifier si la langue est supportée par l'interface
+    if language not in UI_SUPPORTED_LANGUAGES:
+        language = "Français" # Langue par défaut
+    # Récupérer les traductions pour la langue sélectionnée
+    translations = UI_TRANSLATIONS[language]
+    # Créer un dictionnaire pour stocker tous les éléments modifiés
+    ui_elements = {}
+    # Mettre à jour le titre
+    ui_elements["title"] = translations["title"]
+    # Mettre à jour le placeholder et les boutons
+    ui_elements["placeholder"] = translations["placeholder"]
+    ui_elements["send_btn"] = translations["send_btn"]
+    ui_elements["clear_btn"] = translations["clear_btn"]
+    # Ajouter les traductions pour la langue de l'interface
+    ui_elements["ui_language_label"] = translations["ui_language_label"]
+    ui_elements["ui_language_info"] = translations["ui_language_info"]
+    # Mettre à jour les libellés des options
+    ui_elements["options_label"] = "Options" # Ce texte pourrait aussi être traduit
+    ui_elements["model_label"] = translations["model_selector"]
+    ui_elements["model_info"] = translations["model_info"]
+    ui_elements["model_current_prefix"] = translations["model_current"]
+    ui_elements["language_label"] = translations["language_selector"]
+    ui_elements["language_info"] = translations["language_info"]
+    ui_elements["collection_label"] = translations["collection_input"]
+    ui_elements["collection_info"] = translations["collection_info"]
+    ui_elements["collection_current_prefix"] = translations["collection_current"]
+    ui_elements["apply_btn"] = translations["apply_btn"]
+    ui_elements["streaming_label"] = translations["streaming_label"]
+    ui_elements["streaming_info"] = translations["streaming_info"]
+    ui_elements["sources_label"] = translations["sources_label"]
+    ui_elements["max_images_label"] = translations["max_images_label"]
+    ui_elements["images_title"] = translations["images_title"]
+    ui_elements["tables_title"] = translations["tables_title"]
+    return ui_elements
 def build_interface(
     process_query_fn,
@@ -14,102 +61,129 @@ def build_interface(
change_collection_fn, change_collection_fn,
update_ui_language_fn update_ui_language_fn
): ):
"""Construit l'interface utilisateur avec Gradio.""" """Construit l'interface utilisateur avec Gradio"""
print("Initialisation de l'interface")
print("AVAILABLE_MODELS chargé dans ui.py:", AVAILABLE_MODELS)
# Initialiser avec la langue par défaut (Français)
ui_elements = update_ui_language_elements("Français")
with gr.Blocks(css=KATEX_CSS_JS, theme=gr.themes.Soft(primary_hue="blue")) as interface: with gr.Blocks(css=KATEX_CSS_JS, theme=gr.themes.Soft(primary_hue="blue")) as interface:
gr.Markdown("# 📚 Assistant documentaire intelligent") title_md = gr.Markdown(f"# {ui_elements['title']}")
with gr.Row(): with gr.Row():
with gr.Column(scale=2): with gr.Column(scale=2):
# Chatbot principal
chat_interface = gr.Chatbot( chat_interface = gr.Chatbot(
height=600, height=800,
show_label=False, bubble_full_width=False,
layout="bubble", show_copy_button=True,
elem_id="chatbot" type="messages"
# likeable=False,
) )
with gr.Row(): with gr.Row():
msg = gr.Textbox( msg = gr.Textbox(
show_label=False, show_label=False,
placeholder="Posez votre question...", placeholder=ui_elements['placeholder'],
container=False, container=False,
scale=4 scale=4
) )
submit_btn = gr.Button("Envoyer", variant="primary", scale=1) submit_btn = gr.Button(ui_elements['send_btn'], variant="primary", scale=1)
clear_btn = gr.Button("Effacer la conversation") clear_btn = gr.Button(ui_elements['clear_btn'])
source_info = gr.Markdown("", elem_id="sources_info") source_info = gr.Markdown("", elem_id="sources_info")
with gr.Column(scale=1): with gr.Column(scale=1):
with gr.Accordion("Options", open=True): with gr.Accordion("Options", open=True):
# Sélecteur de modèle # Sélecteur de langue pour l'interface
language_ui_selector = gr.Dropdown(
choices=UI_SUPPORTED_LANGUAGES,
value="Français",
label=ui_elements['ui_language_label'], # Utiliser une clé différente
info=ui_elements['ui_language_info']
)
# Sélecteur de modèle - assurez-vous que cette section est présente
model_selector = gr.Dropdown( model_selector = gr.Dropdown(
choices=AVAILABLE_MODELS, choices=AVAILABLE_MODELS,
value=DEFAULT_MODEL, value=DEFAULT_MODEL,
label="Modèle Ollama", label=ui_elements['model_label'],
info="Choisir le modèle de language à utiliser" info=ui_elements['model_info']
) )
model_status = gr.Markdown(f"Modèle actuel: **{DEFAULT_MODEL}**") model_status = gr.Markdown(f"{ui_elements['model_current_prefix']}: **{DEFAULT_MODEL}**")
# Sélecteur de langue # Sélecteur de langue pour les réponses
language_selector = gr.Dropdown( language_selector = gr.Dropdown(
choices=UI_SUPPORTED_LANGUAGES, choices=list(LANGUAGE_MAPPING.keys()),
value=UI_SUPPORTED_LANGUAGES[0], value="Français",
label="Langue des réponses", label=ui_elements['language_label'],
info="Choisir la langue dans laquelle l'assistant répondra" info=ui_elements['language_info']
) )
# Sélecteur de collection Qdrant # Sélecteur de collection Qdrant
collection_name_input = gr.Textbox( collection_name_input = gr.Textbox(
value=QDRANT_COLLECTION_NAME, value=QDRANT_COLLECTION_NAME,
label="Collection Qdrant", label=ui_elements['collection_label'],
info="Nom de la collection de documents à utiliser" info=ui_elements['collection_info']
) )
collection_status = gr.Markdown(f"Collection actuelle: **{QDRANT_COLLECTION_NAME}**") collection_status = gr.Markdown(f"{ui_elements['collection_current_prefix']}: **{QDRANT_COLLECTION_NAME}**")
# Bouton d'application de la collection # Bouton pour appliquer la collection
apply_collection_btn = gr.Button("Appliquer la collection") apply_collection_btn = gr.Button(ui_elements['apply_btn'])
# Options de streaming et sources
streaming = gr.Checkbox( streaming = gr.Checkbox(
label="Mode streaming", label=ui_elements['streaming_label'],
value=True, value=True,
info="Voir les réponses s'afficher progressivement" info=ui_elements['streaming_info']
) )
show_sources = gr.Checkbox(label="Afficher les sources", value=True) show_sources = gr.Checkbox(label=ui_elements['sources_label'], value=True)
max_images = gr.Slider( max_images = gr.Slider(
minimum=1, minimum=1,
maximum=10, maximum=10,
value=3, value=3,
step=1, step=1,
label="Nombre max d'images" label=ui_elements['max_images_label']
) )
gr.Markdown("---") # Ne pas supprimer ces lignes dans ui.py
images_title = gr.Markdown(f"### {ui_elements['images_title']}")
gr.Markdown("### 🖼️ Images pertinentes") image_gallery = gr.Gallery(label="Images")
image_gallery = gr.Gallery( tables_title = gr.Markdown(f"### {ui_elements['tables_title']}")
label="Images pertinentes",
show_label=False,
columns=2,
height=300,
object_fit="contain"
)
gr.Markdown("### 📊 Tableaux")
tables_display = gr.HTML() tables_display = gr.HTML()
# Connecter le changement de modèle # Ajouter cette fonction juste avant de connecter le changement de langue
model_selector.change( def preserve_models_wrapper(language):
fn=change_model_fn, """Préserve la liste des modèles lors du changement de langue"""
inputs=model_selector, # Obtenir les mises à jour depuis la fonction d'origine
outputs=model_status updates = update_ui_language_fn(language)
)
# Force la liste complète des modèles disponibles (position 5 dans les sorties)
# Connecter le changement de collection # Cela garantit que quelles que soient les mises à jour, la liste des modèles reste intacte
apply_collection_btn.click( if isinstance(updates[5], dict) and "choices" in updates[5]:
fn=change_collection_fn, print("Préservation de la liste des modèles:", AVAILABLE_MODELS)
inputs=collection_name_input, updates[5]["choices"] = AVAILABLE_MODELS
outputs=collection_status
return updates
# Puis modifier la connexion du language_ui_selector.change comme suit :
language_ui_selector.change(
fn=preserve_models_wrapper, # Utiliser notre wrapper au lieu de la fonction directe
inputs=language_ui_selector,
outputs=[
title_md,
msg,
submit_btn,
clear_btn,
language_ui_selector,
model_selector,
model_status,
language_selector,
collection_name_input,
collection_status,
apply_collection_btn,
streaming,
show_sources,
max_images
]
) )
# Fonction pour effacer l'entrée # Fonction pour effacer l'entrée
@@ -131,14 +205,28 @@ def build_interface(
         clear_btn.click(
             reset_conversation_fn,
-            outputs=[chat_interface, source_info, image_gallery, tables_display]
+            outputs=[chat_interface, source_info] # Retirer image_gallery et tables_display
+        )
+        # Connecter le changement de modèle
+        model_selector.change(
+            fn=change_model_fn,
+            inputs=model_selector,
+            outputs=model_status
+        )
+        # Connecter le changement de collection
+        apply_collection_btn.click(
+            fn=change_collection_fn,
+            inputs=collection_name_input,
+            outputs=collection_status
         )
         # Style KaTeX et amélioration du design
         gr.Markdown("""
         <style>
         .gradio-container {max-width: 1200px !important}
-        #chatbot {height: 600px; overflow-y: auto;}
+        #chatbot {height: 800px; overflow-y: auto;}
         #sources_info {margin-top: 10px; color: #666;}
         /* Improved styles for equations */

File diff suppressed because one or more lines are too long

View File

@@ -53,8 +53,7 @@ LANGUAGE_MAPPING = {
"Italiano": "italiano", "Italiano": "italiano",
"中文": "Chinese", "中文": "Chinese",
"日本語": "Japanese", "日本語": "Japanese",
"العربية": "Arabic", "العربية": "Arabic"
"فارسی": "Persian" # Added Persian language
} }
# Initialiser le chatbot RAG avec le modèle par défaut # Initialiser le chatbot RAG avec le modèle par défaut
@@ -389,11 +388,12 @@ def display_tables():
print(f"Error formatting table {idx}: {e}") print(f"Error formatting table {idx}: {e}")
table_html = f'<pre>{table_data}</pre>' table_html = f'<pre>{table_data}</pre>'
# Create the table container with metadata - REMOVED description # Create the table container with metadata
html += f""" html += f"""
<div style="margin-bottom: 20px; border: 1px solid #ddd; padding: 15px; border-radius: 8px;"> <div style="margin-bottom: 20px; border: 1px solid #ddd; padding: 15px; border-radius: 8px;">
<h3>{table['caption']}</h3> <h3>{table['caption']}</h3>
<p style="color:#666; font-size:0.9em;">Source: {table['source']}, Page: {table['page']}</p> <p style="color:#666; font-size:0.9em;">Source: {table['source']}, Page: {table['page']}</p>
<p><strong>Description:</strong> {table['description']}</p>
{table_html} {table_html}
</div> </div>
""" """
@@ -448,7 +448,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
         # Sélecteur de langue
         language_selector = gr.Dropdown(
-            choices=["Français", "English", "Español", "Deutsch", "Italiano", "中文", "日本語", "العربية", "فارسی"],
+            choices=["Français", "English", "Español", "Deutsch", "Italiano", "中文", "日本語", "العربية"],
             value="Français",
             label="Langue des réponses",
             info="Choisir la langue dans laquelle l'assistant répondra"
@@ -535,7 +535,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
     /* Improved styles for equations */
     .katex { font-size: 1.1em !important; }
     .math-inline { background: #f8f9fa; padding: 2px 5px; border-radius: 4px; }
-    .math-display { background: #f8f9fa; margin: 10px 0; padding: 10px; border-radius: 5px; overflow-x: auto; text-align: center; }
+    .math-display { background: #f8f9f9; margin: 10px 0; padding: 10px; border-radius: 5px; overflow-x: auto; text-align: center; }
     /* Table styles */
     table {
@@ -578,15 +578,15 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
         delimiters: [
             {left: '$$', right: '$$', display: true},
             {left: '$', right: '$', display: false},
-            {left: '\\\\(', right: '\\\\)', display: false},
-            {left: '\\\\[', right: '\\\\]', display: true}
+            {left: '\\(', right: '\\)', display: false},
+            {left: '\\[', right: '\\]', display: true}
         ],
         throwOnError: false,
         trust: true,
         strict: false,
         macros: {
-            "\\\\R": "\\\\mathbb{R}",
-            "\\\\N": "\\\\mathbb{N}"
+            "\\R": "\\mathbb{R}",
+            "\\N": "\\mathbb{N}"
         }
     });
 } catch (e) {
@@ -617,12 +617,12 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
 function prepareLatexInText(text) {
     // Make sure dollar signs used for math have proper spacing
     // First, protect existing well-formed math expressions
-    text = text.replace(/(\\$\\$[^\\$]+\\$\\$)/g, '<protect>$1</protect>'); // protect display math
-    text = text.replace(/(\\$[^\\$\\n]+\\$)/g, '<protect>$1</protect>'); // protect inline math
+    text = text.replace(/(\$\$[^\$]+\$\$)/g, '<protect>$1</protect>'); // protect display math
+    text = text.replace(/(\$[^\$\n]+\$)/g, '<protect>$1</protect>'); // protect inline math
     // Fix common LaTeX formatting issues outside protected regions
-    text = text.replace(/([^<]protect[^>]*)(\\$)([^\\s])/g, '$1$2 $3'); // Add space after $ if needed
-    text = text.replace(/([^\\s])(\\$)([^<]protect[^>]*)/g, '$1 $2$3'); // Add space before $ if needed
+    text = text.replace(/([^<]protect[^>]*)(\$)([^\s])/g, '$1$2 $3'); // Add space after $ if needed
+    text = text.replace(/([^\s])(\$)([^<]protect[^>]*)/g, '$1 $2$3'); // Add space before $ if needed
     // Handle subscripts: transform x_1 into x_{1} for better LaTeX compatibility
     text = text.replace(/([a-zA-Z])_([0-9a-zA-Z])/g, '$1_{$2}');

View File

@@ -7,7 +7,9 @@ from langchain.schema import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
+import httpx
+from tqdm import tqdm
+http_client = httpx.Client(verify=False)
 class PdfProcessor:
     """
@@ -80,6 +82,40 @@ class PdfProcessor:
if not self.config["openai_api_key"]:
    raise ValueError("OpenAI API key is required when using OpenAI models")
os.environ["OPENAI_API_KEY"] = self.config["openai_api_key"]
# Initialize Qdrant client
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
self.qdrant_client = QdrantClient(url=self.config["qdrant_url"])
# Check if collection exists and create it if not
collections = self.qdrant_client.get_collections().collections
collection_exists = any(collection.name == self.config["collection_name"] for collection in collections)
if not collection_exists:
# Get vector size based on embedding model
if self.config["embedding_provider"] == "ollama":
# For OllamaEmbeddings, typically 4096 dimensions for newer models
vector_size = 4096
else: # OpenAI
# OpenAI embedding dimensions vary by model
model_dimensions = {
"text-embedding-ada-002": 1536,
"text-embedding-3-small": 1536,
"text-embedding-3-large": 3072
}
vector_size = model_dimensions.get(self.config["openai_embedding_model"], 1536)
# Create the collection
self.qdrant_client.create_collection(
collection_name=self.config["collection_name"],
vectors_config=rest.VectorParams(
size=vector_size,
distance=rest.Distance.COSINE
)
)
print(f"Created new Qdrant collection: {self.config['collection_name']}")
def _setup_models(self):
    """Initialize models based on configuration."""
@@ -106,6 +142,7 @@ class PdfProcessor:
         else: # openai
             from langchain_openai import ChatOpenAI
             self.summary_model = ChatOpenAI(
+                http_client=http_client,
                 model=self.config["openai_summary_model"]
             )
@@ -134,38 +171,45 @@ class PdfProcessor:
         Returns:
             Dictionary with processing statistics
         """
-        # Load and extract content from PDF
-        print("Loading PDF and extracting elements...")
-        documents = self._load_pdf(pdf_path)
-        # Process text chunks
-        print("Processing text chunks...")
-        title_chunks = self._process_text(documents)
-        text_summaries = self._summarize_text(title_chunks)
-        processed_text = self._convert_text_to_documents(title_chunks, text_summaries)
-        # Process images if configured
-        print("Processing images...")
-        processed_images = []
-        if self.config["extract_images"]:
-            images = self._extract_images(documents)
-            image_summaries = self._process_images(images)
-            processed_images = self._convert_images_to_documents(images, image_summaries)
-        # Process tables if configured
-        print("Processing tables...")
-        processed_tables = []
-        if self.config["extract_tables"]:
-            tables = self._extract_tables(documents)
-            table_summaries = self._process_tables(tables)
-            processed_tables = self._convert_tables_to_documents(tables, table_summaries)
-        print("Storing processed elements in Qdrant...")
-        # Combine all processed elements
-        final_documents = processed_text + processed_images + processed_tables
-        # Store in Qdrant
-        self._store_documents(final_documents)
+        # Create a master progress bar
+        with tqdm(total=5, desc="PDF Processing", position=0) as master_bar:
+            # Load and extract content from PDF
+            master_bar.set_description("Loading PDF")
+            documents = self._load_pdf(pdf_path)
+            master_bar.update(1)
+            # Process text chunks
+            master_bar.set_description("Processing text chunks")
+            title_chunks = self._process_text(documents)
+            text_summaries = self._summarize_text(title_chunks)
+            processed_text = self._convert_text_to_documents(title_chunks, text_summaries)
+            master_bar.update(1)
+            # Process images if configured
+            master_bar.set_description("Processing images")
+            processed_images = []
+            if self.config["extract_images"]:
+                images = self._extract_images(documents)
+                image_summaries = self._process_images(images)
+                processed_images = self._convert_images_to_documents(images, image_summaries)
+            master_bar.update(1)
+            # Process tables if configured
+            master_bar.set_description("Processing tables")
+            processed_tables = []
+            if self.config["extract_tables"]:
+                tables = self._extract_tables(documents)
+                table_summaries = self._process_tables(tables)
+                processed_tables = self._convert_tables_to_documents(tables, table_summaries)
+            master_bar.update(1)
+            master_bar.set_description("Storing in Qdrant")
+            # Combine all processed elements
+            final_documents = processed_text + processed_images + processed_tables
+            # Store in Qdrant
+            self._store_documents(final_documents)
+            master_bar.update(1)
         return {
             "text_chunks": len(processed_text),
@@ -199,7 +243,15 @@ class PdfProcessor:
     def _summarize_text(self, chunks: List[Document]) -> List[str]:
         """Generate summaries for text chunks."""
-        return self.summarize_chain.batch([chunk.page_content for chunk in chunks], {"max_concurrency": 3})
+        if not chunks:
+            return []
+        print(f"Summarizing {len(chunks)} text chunks...")
+        results = []
+        for chunk in tqdm(chunks, desc="Text summarization", leave=False):
+            result = self.summarize_chain.invoke(chunk.page_content)
+            results.append(result)
+        return results
     def _extract_images(self, documents: List[Document]) -> List[Dict[str, Any]]:
         """Extract images with captions from documents."""
@@ -225,12 +277,17 @@ class PdfProcessor:
     def _process_images(self, images: List[Dict[str, Any]]) -> List[str]:
         """Generate descriptions for images using configured model."""
+        if not images:
+            return []
+        print(f"Processing {len(images)} images...")
         if self.config["image_provider"] == "ollama":
             from ollama import Client
             client = Client(host=self.config["ollama_image_url"])
             image_summaries = []
-            for img in images:
+            for img in tqdm(images, desc="Image processing", leave=False):
                 prompt = f"Caption of image: {img.get('caption', '')}. Describe this image in detail in {self.config['summary_language']}."
                 response = client.chat(
                     model=self.config["ollama_image_model"],
@@ -261,9 +318,17 @@ class PdfProcessor:
             ]
             prompt = ChatPromptTemplate.from_messages(messages)
-            chain = prompt | ChatOpenAI(model=self.config["openai_image_model"]) | StrOutputParser()
-            return chain.batch([{"image_base64": img["image_base64"], "caption": img.get("caption", "")} for img in images])
+            chain = prompt | ChatOpenAI(model=self.config["openai_image_model"], http_client=http_client) | StrOutputParser()
+            # Process images with progress bar
+            results = []
+            image_data = [{"image_base64": img["image_base64"], "caption": img.get("caption", "")} for img in images]
+            for img_data in tqdm(image_data, desc="Image processing", leave=False):
+                result = chain.invoke(img_data)
+                results.append(result)
+            return results
     def _extract_tables(self, documents: List[Document]) -> List[Dict[str, Any]]:
         """Extract tables with captions from documents."""
@@ -290,9 +355,13 @@ class PdfProcessor:
     def _process_tables(self, tables: List[Dict[str, Any]]) -> List[str]:
         """Generate summaries for tables."""
+        if not tables:
+            return []
+        print(f"Processing {len(tables)} tables...")
         table_summaries = []
-        for table in tables:
+        for table in tqdm(tables, desc="Table processing", leave=False):
             prompt = f"""Caption of table: {table.get('caption', '')}.
             Describe this table in detail in {self.config['summary_language']}.
             Table content: {table.get('table_data', '')}"""
@@ -481,11 +550,86 @@ class PdfProcessor:
final_chunks.extend(sub_chunks)
return final_chunks
def process_directory(self, directory_path: str) -> Dict[str, Any]:
"""
Process all PDF files in the specified directory.
Args:
directory_path: Path to the directory containing PDF files
Returns:
Dictionary with processing statistics for all files
"""
# Check if directory exists
if not os.path.isdir(directory_path):
raise ValueError(f"Directory not found: {directory_path}")
# Find all PDF files in the directory
pdf_files = glob.glob(os.path.join(directory_path, "*.pdf"))
if not pdf_files:
print(f"No PDF files found in {directory_path}")
return {"files_processed": 0}
# Track overall statistics
overall_stats = {
"files_processed": 0,
"total_text_chunks": 0,
"total_image_chunks": 0,
"total_table_chunks": 0,
"total_chunks": 0,
"collection_name": self.config["collection_name"],
"file_details": []
}
# Process each PDF file with a progress bar
print(f"Found {len(pdf_files)} PDF files in {directory_path}")
for pdf_file in tqdm(pdf_files, desc="Processing PDF files", unit="file"):
try:
print(f"\nProcessing: {os.path.basename(pdf_file)}")
result = self.process_pdf(pdf_file)
# Update statistics
overall_stats["files_processed"] += 1
overall_stats["total_text_chunks"] += result.get("text_chunks", 0)
overall_stats["total_image_chunks"] += result.get("image_chunks", 0)
overall_stats["total_table_chunks"] += result.get("table_chunks", 0)
overall_stats["total_chunks"] += result.get("total_chunks", 0)
# Store individual file results
file_detail = {
"filename": os.path.basename(pdf_file),
"text_chunks": result.get("text_chunks", 0),
"image_chunks": result.get("image_chunks", 0),
"table_chunks": result.get("table_chunks", 0),
"total_chunks": result.get("total_chunks", 0)
}
overall_stats["file_details"].append(file_detail)
print(f"Completed: {file_detail['filename']} - {file_detail['total_chunks']} chunks processed")
except Exception as e:
print(f"Error processing {pdf_file}: {str(e)}")
# Continue with next file
print("\nDirectory processing complete!")
print(f"Processed {overall_stats['files_processed']} files")
print(f"Total chunks: {overall_stats['total_chunks']}")
print(f" - Text chunks: {overall_stats['total_text_chunks']}")
print(f" - Image chunks: {overall_stats['total_image_chunks']}")
print(f" - Table chunks: {overall_stats['total_table_chunks']}")
print(f"All content stored in collection: {overall_stats['collection_name']}")
return overall_stats
import glob
import os
 processor = PdfProcessor({
-    "image_provider": "openai",
-    "openai_api_key": "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",
-    "collection_name": "my_custom_collection",
+    # "image_provider": "openai",
+    # "openai_api_key": "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",
+    "collection_name": "my_control_and calibration",
     "summary_language": "English"
 })
-result = processor.process_pdf(r"F:\Dev\Rag\chat_bot_rag\T4 Machines thermiques.pdf")
+results = processor.process_directory(r"C:\Users\serameza\host-data")

services/rag_service.py (new file, 328 lines)

@@ -0,0 +1,328 @@
import base64
from io import BytesIO
from PIL import Image
import traceback
import threading
import queue
import time
from rag_chatbot import MultimodalRAGChatbot
from langchain.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama
from langchain.callbacks.base import BaseCallbackHandler
# Handler personnalisé pour capturer les tokens en streaming
class GradioStreamingHandler(BaseCallbackHandler):
def __init__(self):
self.tokens_queue = queue.Queue()
self.full_text = ""
def on_llm_new_token(self, token, **kwargs):
self.tokens_queue.put(token)
self.full_text += token
# Fonction pour créer un objet Image à partir des données base64
def base64_to_image(base64_data):
"""Convertit une image base64 en objet Image pour l'affichage direct"""
try:
if not base64_data:
return None
image_bytes = base64.b64decode(base64_data)
image = Image.open(BytesIO(image_bytes))
return image
except Exception as e:
print(f"Erreur lors de la conversion d'image: {e}")
return None
# Configuration pour initialiser le chatbot
QDRANT_URL = "http://localhost:6333"
QDRANT_COLLECTION_NAME = "my_custom_collection"
EMBEDDING_MODEL = "mxbai-embed-large"
OLLAMA_URL = "http://127.0.0.1:11434"
DEFAULT_MODEL = "llama3.2"
# Liste des modèles disponibles
AVAILABLE_MODELS = ["llama3.1", "llama3.2", "deepseek-r1:7b", "deepseek-r1:14b"]
# Mapping des langues pour une meilleure compréhension par le LLM
LANGUAGE_MAPPING = {
"Français": "français",
"English": "English",
"Español": "español",
"Deutsch": "Deutsch",
"Italiano": "italiano",
"中文": "Chinese",
"日本語": "Japanese",
"العربية": "Arabic"
}
# Variables globales pour stocker les images et tableaux de la dernière requête
current_images = []
current_tables = []
# Initialiser le chatbot RAG avec le modèle par défaut
def initialize_rag_bot():
global rag_bot
rag_bot = MultimodalRAGChatbot(
qdrant_url=QDRANT_URL,
qdrant_collection_name=QDRANT_COLLECTION_NAME,
ollama_model=DEFAULT_MODEL,
embedding_model=EMBEDDING_MODEL,
ollama_url=OLLAMA_URL
)
print(f"Chatbot initialisé avec modèle: {DEFAULT_MODEL}")
# Fonction pour changer de modèle
def change_model(model_name):
global rag_bot
try:
# Réinitialiser le chatbot avec le nouveau modèle
rag_bot = MultimodalRAGChatbot(
qdrant_url=QDRANT_URL,
qdrant_collection_name=QDRANT_COLLECTION_NAME,
ollama_model=model_name,
embedding_model=EMBEDDING_MODEL,
ollama_url=OLLAMA_URL
)
print(f"Modèle changé pour: {model_name}")
return f"✅ Modèle changé pour: {model_name}"
except Exception as e:
print(f"Erreur lors du changement de modèle: {e}")
return f"❌ Erreur: {str(e)}"
# Fonction pour changer de collection
def change_collection(collection_name):
global rag_bot, QDRANT_COLLECTION_NAME
try:
# Mise à jour de la variable globale
QDRANT_COLLECTION_NAME = collection_name
# Réinitialiser le chatbot avec la nouvelle collection
rag_bot = MultimodalRAGChatbot(
qdrant_url=QDRANT_URL,
qdrant_collection_name=collection_name,
ollama_model=rag_bot.llm.model, # Conserver le modèle actuel
embedding_model=EMBEDDING_MODEL,
ollama_url=OLLAMA_URL
)
print(f"Collection changée pour: {collection_name}")
return f"✅ Collection changée pour: {collection_name}"
except Exception as e:
print(f"Erreur lors du changement de collection: {e}")
return f"❌ Erreur: {str(e)}"
# Fonction de traitement des requêtes avec support du streaming dans Gradio
def process_query(message, history, streaming, show_sources, max_images, language):
global current_images, current_tables
if not message.strip():
return history, "", None, None
current_images = []
current_tables = []
try:
if streaming:
# Version avec streaming dans Gradio
history = history + [(message, "")]
# 1. Récupérer les documents pertinents
docs = rag_bot._retrieve_relevant_documents(message)
# 2. Préparer le contexte et l'historique
context = rag_bot._format_documents(docs)
history_text = rag_bot._format_chat_history()
# 3. Préparer le prompt
prompt_template = ChatPromptTemplate.from_template("""
Tu es un assistant documentaire spécialisé qui utilise toutes les informations disponibles dans le contexte fourni.
TRÈS IMPORTANT: Tu dois répondre EXCLUSIVEMENT en {language}. Ne réponds JAMAIS dans une autre langue.
Instructions spécifiques:
1. Pour chaque image mentionnée dans le contexte, inclue TOUJOURS dans ta réponse:
- La légende/caption exacte de l'image
- La source et le numéro de page
- Une description brève de ce qu'elle montre
2. Pour chaque tableau mentionné dans le contexte, inclue TOUJOURS:
- Le titre/caption exact du tableau
- La source et le numéro de page
- Ce que contient et signifie le tableau
3. Lorsque tu cites des équations mathématiques:
- Utilise la syntaxe LaTeX exacte comme dans le document ($...$ ou $$...$$)
- Reproduis-les fidèlement sans modification
4. IMPORTANT: Ne pas inventer d'informations - si une donnée n'est pas explicitement fournie dans le contexte,
indique clairement que cette information n'est pas disponible dans les documents fournis.
5. Cite précisément les sources pour chaque élément d'information (format: [Source, Page]).
6. CRUCIAL: Ta réponse doit être UNIQUEMENT et INTÉGRALEMENT en {language}, quelle que soit la langue de la question.
Historique de conversation:
{chat_history}
Contexte (à utiliser pour répondre):
{context}
Question: {question}
Réponds de façon structurée et précise en intégrant activement les images, tableaux et équations disponibles dans le contexte.
Ta réponse doit être exclusivement en {language}.
""")
# 4. Formater les messages pour le LLM
messages = prompt_template.format_messages(
chat_history=history_text,
context=context,
question=message,
language=LANGUAGE_MAPPING.get(language, "français") # Use the mapped language value
)
# 5. Créer un handler de streaming personnalisé
handler = GradioStreamingHandler()
# 6. Créer un modèle LLM avec notre handler
streaming_llm = ChatOllama(
model=rag_bot.llm.model,
base_url=rag_bot.llm.base_url,
streaming=True,
callbacks=[handler]
)
# 7. Lancer la génération dans un thread pour ne pas bloquer l'UI
def generate_response():
streaming_llm.invoke(messages)
thread = threading.Thread(target=generate_response)
thread.start()
# 8. Récupérer les tokens et mettre à jour l'interface
partial_response = ""
# Attendre les tokens avec un timeout
while thread.is_alive() or not handler.tokens_queue.empty():
try:
token = handler.tokens_queue.get(timeout=0.05)
partial_response += token
history[-1] = (message, partial_response)
yield history, "", None, None
except queue.Empty:
continue
# 9. Thread terminé, mettre à jour l'historique de conversation du chatbot
rag_bot.chat_history.append({"role": "user", "content": message})
rag_bot.chat_history.append({"role": "assistant", "content": partial_response})
# 10. Récupérer les sources, images, tableaux
texts, images, tables = rag_bot._process_documents(docs)
# Préparer les informations sur les sources
source_info = ""
if texts:
source_info += f"📚 {len(texts)} textes • "
if images:
source_info += f"🖼️ {len(images)} images • "
if tables:
source_info += f"📊 {len(tables)} tableaux"
if source_info:
source_info = "Sources trouvées: " + source_info
# 11. Traiter les images
if show_sources and images:
images = images[:max_images]
for img in images:
img_data = img.get("image_data")
if img_data:
image = base64_to_image(img_data)
if image:
current_images.append({
"image": image,
"caption": img.get("caption", ""),
"source": img.get("source", ""),
"page": img.get("page", ""),
"description": img.get("description", "")
})
# 12. Traiter les tableaux
if show_sources and tables:
for table in tables:
current_tables.append({
"data": rag_bot.format_table(table.get("table_data", "")),
"caption": table.get("caption", ""),
"source": table.get("source", ""),
"page": table.get("page", ""),
"description": table.get("description", "")
})
# 13. Retourner les résultats finaux
yield history, source_info, display_images(current_images), display_tables(current_tables, language)
else:
# Version sans streaming (code existant)
result = rag_bot.chat(message, stream=False)
history = history + [(message, result["response"])]
# Préparer les informations sur les sources
source_info = ""
if "texts" in result:
source_info += f"📚 {len(result['texts'])} textes • "
if "images" in result:
source_info += f"🖼️ {len(result['images'])} images • "
if "tables" in result:
source_info += f"📊 {len(result['tables'])} tableaux"
if source_info:
source_info = "Sources trouvées: " + source_info
# Traiter les images et tableaux
if show_sources and "images" in result and result["images"]:
images = result["images"][:max_images]
for img in images:
img_data = img.get("image_data")
if img_data:
image = base64_to_image(img_data)
if image:
current_images.append({
"image": image,
"caption": img.get("caption", ""),
"source": img.get("source", ""),
"page": img.get("page", ""),
"description": img.get("description", "")
})
if show_sources and "tables" in result and result["tables"]:
tables = result["tables"]
for table in tables:
current_tables.append({
"data": rag_bot.format_table(table.get("table_data", "")),
"caption": table.get("caption", ""),
"source": table.get("source", ""),
"page": table.get("page", ""),
"description": table.get("description", "")
})
return history, source_info, display_images(current_images), display_tables(current_tables, language)
except Exception as e:
error_msg = f"Une erreur est survenue: {str(e)}"
traceback_text = traceback.format_exc()
print(error_msg)
print(traceback_text)
history = history + [(message, error_msg)]
return history, "Erreur lors du traitement de la requête", None, None
# Fonction pour réinitialiser la conversation
def reset_conversation():
global current_images, current_tables
current_images = []
current_tables = []
rag_bot.clear_history()
return [], "", None, None

View File

@@ -7,8 +7,7 @@ LANGUAGE_MAPPING = {
"Italiano": "italiano", "Italiano": "italiano",
"中文": "Chinese", "中文": "Chinese",
"日本語": "Japanese", "日本語": "Japanese",
"العربية": "Arabic" }
}
 # Dictionnaire de traductions pour l'interface
 UI_TRANSLATIONS = {
@@ -39,7 +38,9 @@ UI_TRANSLATIONS = {
"error_msg": "Une erreur est survenue", "error_msg": "Une erreur est survenue",
"processing_error": "Erreur lors du traitement de la requête", "processing_error": "Erreur lors du traitement de la requête",
"table_translation": "Traduction", "table_translation": "Traduction",
"table_description": "Ce tableau présente des données sur" "table_description": "Ce tableau présente des données sur",
"ui_language_label": "Langue de l'interface",
"ui_language_info": "Changer la langue de l'interface uniquement"
}, },
"English": { "English": {
"title": "📚 Intelligent Document Assistant", "title": "📚 Intelligent Document Assistant",
@@ -68,7 +69,9 @@ UI_TRANSLATIONS = {
"error_msg": "An error occurred", "error_msg": "An error occurred",
"processing_error": "Error processing request", "processing_error": "Error processing request",
"table_translation": "Translation", "table_translation": "Translation",
"table_description": "This table presents data on" "table_description": "This table presents data on",
"ui_language_label": "UI Language",
"ui_language_info": "Change only the interface language"
}, },
"Español": { "Español": {
"title": "📚 Asistente documental inteligente", "title": "📚 Asistente documental inteligente",
@@ -97,7 +100,9 @@ UI_TRANSLATIONS = {
"error_msg": "Se ha producido un error", "error_msg": "Se ha producido un error",
"processing_error": "Error al procesar la solicitud", "processing_error": "Error al procesar la solicitud",
"table_translation": "Traducción", "table_translation": "Traducción",
"table_description": "Esta tabla presenta datos sobre" "table_description": "Esta tabla presenta datos sobre",
"ui_language_label": "Idioma de la interfaz",
"ui_language_info": "Cambiar solo el idioma de la interfaz"
} }
} }