Compare commits
5 Commits
cb43b1176f
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 9fd056baaf | |||
| 819d3a0956 | |||
| 0cddd0842f | |||
| 0a9e2d4567 | |||
| 9d142c269d |
8
.gitignore
vendored
8
.gitignore
vendored
@@ -1 +1,7 @@
|
|||||||
apigit.txt
|
apigit.txt
|
||||||
|
*.pyc
|
||||||
|
*.pyo
|
||||||
|
*.pyd
|
||||||
|
|
||||||
|
# Dossier de cache
|
||||||
|
__pycache__/
|
||||||
44
app.py
44
app.py
@@ -1,22 +1,59 @@
|
|||||||
# filepath: f:\Dev\Rag\chat_bot_rag\app.py
|
# filepath: f:\Dev\Rag\chat_bot_rag\app.py
|
||||||
|
|
||||||
import gradio as gr
|
import gradio as gr
|
||||||
|
from config.settings import DEFAULT_MODEL, QDRANT_COLLECTION_NAME, AVAILABLE_MODELS
|
||||||
from services.rag_service import initialize_rag_bot
|
from services.rag_service import initialize_rag_bot
|
||||||
from components.chatbot import process_query, reset_conversation, change_model, change_collection
|
from components.chatbot import process_query, reset_conversation, change_model, change_collection
|
||||||
from components.ui import build_interface, update_ui_language_elements
|
from components.ui import build_interface, update_ui_language_elements
|
||||||
|
from translations.lang_mappings import UI_TRANSLATIONS, UI_SUPPORTED_LANGUAGES, LANGUAGE_MAPPING
|
||||||
|
|
||||||
|
def update_ui_language(language):
    """Return the Gradio updates that switch the interface to ``language``.

    Args:
        language: Name of the UI language. Any value that is not listed in
            ``UI_SUPPORTED_LANGUAGES`` falls back to ``"Français"``.

    Returns:
        A list of 16 values (Markdown strings and ``gr.update`` payloads) in
        this order: title, message box, send button, clear button,
        UI-language selector, model selector, model status,
        response-language selector, collection input, collection status,
        apply button, streaming checkbox, sources checkbox, max-images
        slider, images title, tables title.
    """
    # Unsupported languages fall back to the default UI language.
    if language in UI_SUPPORTED_LANGUAGES:
        lang = language
    else:
        lang = "Français"
    tr = UI_TRANSLATIONS[lang]

    # Debug trace of the selected language and the configured model list.
    print(f"Mise à jour de la langue UI : {lang}")
    print(f"AVAILABLE_MODELS : {AVAILABLE_MODELS}")

    # Plain Markdown strings (titles and status lines). The status lines are
    # rebuilt from the configured defaults, not from the current selection.
    title_md = f"# {tr['title']}"
    model_status_md = f"{tr['model_current']}: **{DEFAULT_MODEL}**"
    collection_status_md = f"{tr['collection_current']}: **{QDRANT_COLLECTION_NAME}**"
    images_title_md = f"### {tr['images_title']}"
    tables_title_md = f"### {tr['tables_title']}"

    # Dropdown updates re-send their choices so that relabelling does not
    # drop the available options.
    model_selector_update = gr.update(
        label=tr["model_selector"],
        info=tr["model_info"],
        choices=AVAILABLE_MODELS,
    )
    response_language_update = gr.update(
        label=tr["language_selector"],
        info=tr["language_info"],
        choices=list(LANGUAGE_MAPPING.keys()),
    )

    # NOTE(review): the order and length of this list must match the
    # `outputs` of the event it is bound to. The wiring seen in
    # components/ui.py appears to list 14 components while 16 values are
    # returned here (the two section titles look unbound) - confirm.
    return [
        title_md,
        gr.update(placeholder=tr["placeholder"]),
        gr.update(value=tr["send_btn"]),
        gr.update(value=tr["clear_btn"]),
        gr.update(label=tr["ui_language_label"], info=tr["ui_language_info"]),
        model_selector_update,
        model_status_md,
        response_language_update,
        gr.update(label=tr["collection_input"], info=tr["collection_info"]),
        collection_status_md,
        gr.update(value=tr["apply_btn"]),
        gr.update(label=tr["streaming_label"], info=tr["streaming_info"]),
        gr.update(label=tr["sources_label"]),
        gr.update(label=tr["max_images_label"]),
        images_title_md,
        tables_title_md,
    ]
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
"""Main entry point for the chatbot application"""
|
"""Main entry point for the chatbot application"""
|
||||||
# Initialize the RAG chatbot
|
# Initialize the RAG chatbot
|
||||||
initialize_rag_bot()
|
initialize_rag_bot()
|
||||||
|
|
||||||
# Construire l'interface
|
# Dans app.py, corriger l'appel à build_interface
|
||||||
interface = build_interface(
|
interface = build_interface(
|
||||||
process_query_fn=process_query,
|
process_query_fn=process_query,
|
||||||
reset_conversation_fn=reset_conversation,
|
reset_conversation_fn=reset_conversation,
|
||||||
change_model_fn=change_model,
|
change_model_fn=change_model,
|
||||||
change_collection_fn=change_collection,
|
change_collection_fn=change_collection,
|
||||||
update_ui_language_fn=update_ui_language_elements # Ajout du paramètre manquant
|
update_ui_language_fn=update_ui_language # Utiliser update_ui_language, pas update_ui_language_elements
|
||||||
)
|
)
|
||||||
|
|
||||||
# Lancer l'appli Gradio
|
# Lancer l'appli Gradio
|
||||||
@@ -28,4 +65,5 @@ def main():
|
|||||||
)
|
)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
||||||
|
|||||||
@@ -9,6 +9,9 @@ from translations.lang_mappings import LANGUAGE_MAPPING
|
|||||||
from utils.image_utils import base64_to_image
|
from utils.image_utils import base64_to_image
|
||||||
from langchain.callbacks.base import BaseCallbackHandler
|
from langchain.callbacks.base import BaseCallbackHandler
|
||||||
import re
|
import re
|
||||||
|
from typing import List, Union, Dict, Any
|
||||||
|
# Pour Gradio 4.x
|
||||||
|
# from gradio.types.message import ImageMessage, HtmlMessage, TextMessage
|
||||||
|
|
||||||
def clean_llm_response(text):
|
def clean_llm_response(text):
|
||||||
"""Nettoie la réponse du LLM en enlevant les balises de pensée et autres éléments non désirés."""
|
"""Nettoie la réponse du LLM en enlevant les balises de pensée et autres éléments non désirés."""
|
||||||
@@ -53,7 +56,9 @@ def display_images(images_list=None):
|
|||||||
for img_data in images_to_use:
|
for img_data in images_to_use:
|
||||||
image = img_data["image"]
|
image = img_data["image"]
|
||||||
if image:
|
if image:
|
||||||
caption = f"{img_data['caption']} (Source: {img_data['source']}, Page: {img_data['page']})"
|
# Supprimer les infos de type "(Texte 5)" dans la caption
|
||||||
|
caption = re.sub(pattern_texte, '', img_data["caption"])
|
||||||
|
caption = f"{caption} (Source: {img_data['source']}, Page: {img_data['page']})"
|
||||||
gallery.append((image, caption))
|
gallery.append((image, caption))
|
||||||
|
|
||||||
return gallery if gallery else None
|
return gallery if gallery else None
|
||||||
@@ -155,81 +160,103 @@ def change_collection(collection_name, language="Français"):
|
|||||||
return f"❌ Erreur: {str(e)}"
|
return f"❌ Erreur: {str(e)}"
|
||||||
|
|
||||||
# Fonction de traitement de requête
|
# Fonction de traitement de requête
|
||||||
|
def convert_to_messages_format(history):
    """Convert a chat history to the ``messages`` format.

    Two input shapes are recognised:

    * a history already in messages format (its first entry is a dict with a
      ``"role"`` key), which is copied as-is;
    * a pair-based history whose entries are two-element
      ``(user_msg, assistant_msg)`` tuples **or lists**.

    Args:
        history: The history to convert. ``None`` or an empty sequence
            yields an empty list.

    Returns:
        A new list of ``{"role": ..., "content": ...}`` dicts. The list is
        never the input object itself, so callers can append to it without
        mutating the history they were given. Entries that are not
        two-element pairs are skipped, and falsy assistant replies are
        omitted. An unrecognised history is logged and yields ``[]``.
    """
    if not history:
        return []

    try:
        # Already in messages format: hand back a shallow copy so that later
        # appends by the caller do not leak into the original history.
        first = history[0]
        if isinstance(first, dict) and "role" in first:
            return list(history)

        messages = []
        for item in history:
            # Pairs may arrive as tuples or as two-element lists.
            if isinstance(item, (tuple, list)) and len(item) == 2:
                user_msg, assistant_msg = item
                messages.append({"role": "user", "content": user_msg})
                if assistant_msg:  # skip empty assistant replies
                    messages.append({"role": "assistant", "content": assistant_msg})
    except (TypeError, KeyError, IndexError) as e:
        # Log the problem for debugging and fall back to an empty history.
        print(f"Format d'historique non reconnu: {history}")
        print(f"Erreur: {str(e)}")
        return []

    return messages
|
||||||
|
|
||||||
|
# Définir le pattern de l'expression régulière en dehors de la f-string
|
||||||
|
pattern_texte = r'\(Texte \d+\)'
|
||||||
|
|
||||||
def process_query(message, history, streaming, show_sources, max_images, language):
|
def process_query(message, history, streaming, show_sources, max_images, language):
|
||||||
global current_images, current_tables
|
global current_images, current_tables
|
||||||
|
|
||||||
|
print(f"Language selected for response: {language} -> {LANGUAGE_MAPPING.get(language, 'français')}")
|
||||||
|
|
||||||
if not message.strip():
|
if not message.strip():
|
||||||
return history, "", None, None
|
return history, "", None, None
|
||||||
|
|
||||||
current_images = []
|
current_images = []
|
||||||
current_tables = []
|
current_tables = []
|
||||||
print(f"Traitement du message: {message}")
|
|
||||||
print(f"Streaming: {streaming}")
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Convert history to messages format
|
||||||
|
messages_history = convert_to_messages_format(history)
|
||||||
|
|
||||||
if streaming:
|
if streaming:
|
||||||
# Version avec streaming dans Gradio
|
# Add user message to history
|
||||||
history = history + [(message, "")]
|
messages_history.append({"role": "user", "content": message})
|
||||||
|
# Add empty message for assistant response
|
||||||
|
messages_history.append({"role": "assistant", "content": ""})
|
||||||
|
|
||||||
# 1. Récupérer les documents pertinents
|
# Get relevant documents
|
||||||
docs = rag_bot._retrieve_relevant_documents(message)
|
docs = rag_bot._retrieve_relevant_documents(message)
|
||||||
|
|
||||||
# 2. Préparer le contexte et l'historique
|
# Process context and history
|
||||||
context = rag_bot._format_documents(docs)
|
context = rag_bot._format_documents(docs)
|
||||||
history_text = rag_bot._format_chat_history()
|
history_text = rag_bot._format_chat_history()
|
||||||
|
|
||||||
# 3. Préparer le prompt
|
# Create prompt
|
||||||
prompt_template = ChatPromptTemplate.from_template("""
|
prompt_template = ChatPromptTemplate.from_template("""
|
||||||
Tu es un assistant documentaire spécialisé qui utilise toutes les informations disponibles dans le contexte fourni.
|
You are a specialized document assistant that uses the provided context.
|
||||||
|
|
||||||
TRÈS IMPORTANT: Tu dois répondre EXCLUSIVEMENT en {language}. Ne réponds JAMAIS dans une autre langue.
|
===== CRITICAL LANGUAGE INSTRUCTION =====
|
||||||
|
RESPOND ONLY IN {language}. This is an ABSOLUTE requirement.
|
||||||
|
NEVER RESPOND in any language other than {language}, regardless of question language.
|
||||||
|
==============================================
|
||||||
|
|
||||||
Instructions spécifiques:
|
Specific instructions:
|
||||||
1. Pour chaque image mentionnée dans le contexte, inclue TOUJOURS dans ta réponse:
|
1. For each image mentioned: include caption, source, page and description
|
||||||
- La légende/caption exacte de l'image
|
2. For each table: include title, source, page and significance
|
||||||
- La source et le numéro de page
|
3. For equations: use exact LaTeX syntax
|
||||||
- Une description brève de ce qu'elle montre
|
4. Don't invent information outside the provided context
|
||||||
|
5. Cite sources precisely
|
||||||
|
|
||||||
2. Pour chaque tableau mentionné dans le contexte, inclue TOUJOURS:
|
Conversation history:
|
||||||
- Le titre/caption exact du tableau
|
|
||||||
- La source et le numéro de page
|
|
||||||
- Ce que contient et signifie le tableau
|
|
||||||
|
|
||||||
3. Lorsque tu cites des équations mathématiques:
|
|
||||||
- Utilise la syntaxe LaTeX exacte comme dans le document ($...$ ou $$...$$)
|
|
||||||
- Reproduis-les fidèlement sans modification
|
|
||||||
|
|
||||||
4. IMPORTANT: Ne pas inventer d'informations - si une donnée n'est pas explicitement fournie dans le contexte,
|
|
||||||
indique clairement que cette information n'est pas disponible dans les documents fournis.
|
|
||||||
|
|
||||||
5. Cite précisément les sources pour chaque élément d'information (format: [Source, Page]).
|
|
||||||
|
|
||||||
6. CRUCIAL: Ta réponse doit être UNIQUEMENT et INTÉGRALEMENT en {language}, quelle que soit la langue de la question.
|
|
||||||
|
|
||||||
Historique de conversation:
|
|
||||||
{chat_history}
|
{chat_history}
|
||||||
|
|
||||||
Contexte (à utiliser pour répondre):
|
Context:
|
||||||
{context}
|
{context}
|
||||||
|
|
||||||
Question: {question}
|
Question: {question}
|
||||||
|
|
||||||
Réponds de façon structurée et précise en intégrant activement les images, tableaux et équations disponibles dans le contexte.
|
Respond in a structured way incorporating available images, tables and equations.
|
||||||
Ta réponse doit être exclusivement en {language}.
|
YOUR RESPONSE MUST BE SOLELY AND ENTIRELY IN {language}. THIS RULE IS ABSOLUTE.
|
||||||
""")
|
""")
|
||||||
|
|
||||||
# 4. Formater les messages pour le LLM
|
# Set language for the response
|
||||||
|
selected_language = LANGUAGE_MAPPING.get(language, "français")
|
||||||
messages = prompt_template.format_messages(
|
messages = prompt_template.format_messages(
|
||||||
chat_history=history_text,
|
chat_history=history_text,
|
||||||
context=context,
|
context=context,
|
||||||
question=message,
|
question=message,
|
||||||
language=LANGUAGE_MAPPING.get(language, "français")
|
language=selected_language
|
||||||
)
|
)
|
||||||
|
|
||||||
# 5. Créer un handler de streaming personnalisé
|
# Create streaming handler
|
||||||
handler = GradioStreamingHandler()
|
handler = GradioStreamingHandler()
|
||||||
|
|
||||||
# 6. Créer un modèle LLM avec notre handler
|
# Create LLM model with our handler
|
||||||
streaming_llm = ChatOllama(
|
streaming_llm = ChatOllama(
|
||||||
model=rag_bot.llm.model,
|
model=rag_bot.llm.model,
|
||||||
base_url=rag_bot.llm.base_url,
|
base_url=rag_bot.llm.base_url,
|
||||||
@@ -237,92 +264,93 @@ def process_query(message, history, streaming, show_sources, max_images, languag
|
|||||||
callbacks=[handler]
|
callbacks=[handler]
|
||||||
)
|
)
|
||||||
|
|
||||||
# 7. Lancer la génération dans un thread pour ne pas bloquer l'UI
|
# Generate response in a separate thread
|
||||||
def generate_response():
|
def generate_response():
|
||||||
streaming_llm.invoke(messages)
|
streaming_llm.invoke(messages)
|
||||||
|
|
||||||
thread = threading.Thread(target=generate_response)
|
thread = threading.Thread(target=generate_response)
|
||||||
thread.start()
|
thread.start()
|
||||||
|
|
||||||
# 8. Récupérer les tokens et mettre à jour l'interface
|
# Process tokens and update interface
|
||||||
partial_response = ""
|
partial_response = ""
|
||||||
|
|
||||||
# Attendre les tokens avec un timeout
|
# Wait for tokens with timeout
|
||||||
while thread.is_alive() or not handler.tokens_queue.empty():
|
while thread.is_alive() or not handler.tokens_queue.empty():
|
||||||
try:
|
try:
|
||||||
token = handler.tokens_queue.get(timeout=0.05)
|
token = handler.tokens_queue.get(timeout=0.05)
|
||||||
partial_response += token
|
partial_response += token
|
||||||
|
|
||||||
# Nettoyer la réponse uniquement pour l'affichage (pas pour l'historique interne)
|
# Clean response for display
|
||||||
clean_response = clean_llm_response(partial_response)
|
clean_response = clean_llm_response(partial_response)
|
||||||
history[-1] = (message, clean_response)
|
# Update assistant message - JUST TEXT, not multimodal
|
||||||
yield history, "", None, None
|
messages_history[-1]["content"] = clean_response
|
||||||
|
yield messages_history, "", None, None
|
||||||
except queue.Empty:
|
except queue.Empty:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Après la boucle, nettoyer la réponse complète pour l'historique interne
|
# After loop, clean the complete response for internal history
|
||||||
partial_response = clean_llm_response(partial_response)
|
partial_response = clean_llm_response(partial_response)
|
||||||
rag_bot.chat_history.append({"role": "user", "content": message})
|
rag_bot.chat_history.append({"role": "user", "content": message})
|
||||||
rag_bot.chat_history.append({"role": "assistant", "content": partial_response})
|
rag_bot.chat_history.append({"role": "assistant", "content": partial_response})
|
||||||
|
|
||||||
# 10. Récupérer les sources, images, tableaux
|
# Get sources, images, tables
|
||||||
texts, images, tables = rag_bot._process_documents(docs)
|
texts, images, tables = rag_bot._process_documents(docs)
|
||||||
|
|
||||||
# Préparer les informations sur les sources
|
# Process sources
|
||||||
source_info = ""
|
source_info = ""
|
||||||
if texts:
|
if texts:
|
||||||
source_info += f"📚 {len(texts)} textes • "
|
clean_texts = [re.sub(pattern_texte, '', t.get("source", "")) for t in texts]
|
||||||
if images:
|
# Remove duplicates and empty items
|
||||||
source_info += f"🖼️ {len(images)} images • "
|
clean_texts = [t for t in clean_texts if t.strip()]
|
||||||
if tables:
|
clean_texts = list(set(clean_texts))
|
||||||
source_info += f"📊 {len(tables)} tableaux"
|
if clean_texts:
|
||||||
|
source_info += f"📚 Sources: {', '.join(clean_texts)} • "
|
||||||
|
|
||||||
if source_info:
|
# Process images and tables for SEPARATE display only
|
||||||
source_info = "Sources trouvées: " + source_info
|
if show_sources and images and max_images > 0:
|
||||||
|
for img in images[:max_images]:
|
||||||
# 11. Traiter les images
|
|
||||||
if show_sources and images:
|
|
||||||
images = images[:max_images]
|
|
||||||
for img in images:
|
|
||||||
img_data = img.get("image_data")
|
img_data = img.get("image_data")
|
||||||
if img_data:
|
if img_data:
|
||||||
image = base64_to_image(img_data)
|
image = base64_to_image(img_data)
|
||||||
if image:
|
if image:
|
||||||
|
caption = re.sub(pattern_texte, '', img.get("caption", ""))
|
||||||
|
# Only add to gallery, not to chat messages
|
||||||
current_images.append({
|
current_images.append({
|
||||||
"image": image,
|
"image": image,
|
||||||
"caption": img.get("caption", ""),
|
"caption": caption,
|
||||||
"source": img.get("source", ""),
|
"source": img.get("source", ""),
|
||||||
"page": img.get("page", ""),
|
"page": img.get("page", "")
|
||||||
"description": img.get("description", "")
|
|
||||||
})
|
})
|
||||||
|
|
||||||
# 12. Traiter les tableaux
|
# Final yield with separate image gallery
|
||||||
if show_sources and tables:
|
yield messages_history, source_info, display_images(), display_tables()
|
||||||
for table in tables:
|
|
||||||
current_tables.append({
|
|
||||||
"data": rag_bot.format_table(table.get("table_data", "")),
|
|
||||||
"caption": table.get("caption", ""),
|
|
||||||
"source": table.get("source", ""),
|
|
||||||
"page": table.get("page", ""),
|
|
||||||
"description": table.get("description", "")
|
|
||||||
})
|
|
||||||
|
|
||||||
# 13. Retourner les résultats finaux
|
|
||||||
images_display = display_images()
|
|
||||||
tables_display = display_tables()
|
|
||||||
yield history, source_info, images_display, tables_display
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# Version sans streaming
|
# Version non-streaming
|
||||||
print("Mode non-streaming activé")
|
print("Mode non-streaming activé")
|
||||||
source_info = ""
|
source_info = ""
|
||||||
|
|
||||||
result = rag_bot.chat(message, stream=False)
|
history_tuples = history if isinstance(history, list) else []
|
||||||
|
|
||||||
|
# Ajouter le message utilisateur à l'historique au format message
|
||||||
|
messages_history.append({"role": "user", "content": message})
|
||||||
|
|
||||||
|
# Initialize multimodal_content first
|
||||||
|
multimodal_content = [result["response"]] # Start with text response
|
||||||
|
|
||||||
|
# Après avoir obtenu le résultat
|
||||||
|
result = rag_bot.chat(
|
||||||
|
message,
|
||||||
|
stream=False,
|
||||||
|
language=LANGUAGE_MAPPING.get(language, "français") # Vérifiez que cette ligne existe
|
||||||
|
)
|
||||||
# Nettoyer la réponse des balises <think>
|
# Nettoyer la réponse des balises <think>
|
||||||
result["response"] = clean_llm_response(result["response"])
|
result["response"] = clean_llm_response(result["response"])
|
||||||
history = history + [(message, result["response"])]
|
|
||||||
|
|
||||||
# Mise à jour de l'historique interne
|
# Ajouter la réponse de l'assistant au format message
|
||||||
|
messages_history.append({"role": "assistant", "content": result["response"]})
|
||||||
|
|
||||||
|
# Mise à jour de l'historique interne du chatbot
|
||||||
rag_bot.chat_history.append({"role": "user", "content": message})
|
rag_bot.chat_history.append({"role": "user", "content": message})
|
||||||
rag_bot.chat_history.append({"role": "assistant", "content": result["response"]})
|
rag_bot.chat_history.append({"role": "assistant", "content": result["response"]})
|
||||||
|
|
||||||
@@ -337,42 +365,37 @@ def process_query(message, history, streaming, show_sources, max_images, languag
|
|||||||
if source_info:
|
if source_info:
|
||||||
source_info = "Sources trouvées: " + source_info
|
source_info = "Sources trouvées: " + source_info
|
||||||
|
|
||||||
# Traiter les images et tableaux
|
# Process images for SEPARATE gallery
|
||||||
if show_sources and "images" in result and result["images"]:
|
if show_sources and "images" in result and result["images"]:
|
||||||
images = result["images"][:max_images]
|
for img in result["images"][:max_images]:
|
||||||
for img in images:
|
|
||||||
img_data = img.get("image_data")
|
img_data = img.get("image_data")
|
||||||
if img_data:
|
if img_data:
|
||||||
image = base64_to_image(img_data)
|
image = base64_to_image(img_data)
|
||||||
if image:
|
if image:
|
||||||
|
caption = re.sub(pattern_texte, '', img.get("caption", ""))
|
||||||
|
# Only add to gallery
|
||||||
current_images.append({
|
current_images.append({
|
||||||
"image": image,
|
"image": image,
|
||||||
"caption": img.get("caption", ""),
|
"caption": caption,
|
||||||
"source": img.get("source", ""),
|
"source": img.get("source", ""),
|
||||||
"page": img.get("page", ""),
|
"page": img.get("page", "")
|
||||||
"description": img.get("description", "")
|
|
||||||
})
|
})
|
||||||
|
|
||||||
if show_sources and "tables" in result and result["tables"]:
|
# Final yield with separate displays
|
||||||
tables = result["tables"]
|
yield messages_history, source_info, display_images(), display_tables()
|
||||||
for table in tables:
|
|
||||||
current_tables.append({
|
|
||||||
"data": rag_bot.format_table(table.get("table_data", "")),
|
|
||||||
"caption": table.get("caption", ""),
|
|
||||||
"source": table.get("source", ""),
|
|
||||||
"page": table.get("page", ""),
|
|
||||||
"description": table.get("description", "")
|
|
||||||
})
|
|
||||||
|
|
||||||
yield history, source_info, display_images(), display_tables()
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_msg = f"Une erreur est survenue: {str(e)}"
|
error_msg = f"Une erreur est survenue: {str(e)}"
|
||||||
traceback_text = traceback.format_exc()
|
traceback_text = traceback.format_exc()
|
||||||
print(error_msg)
|
print(error_msg)
|
||||||
print(traceback_text)
|
print(traceback_text)
|
||||||
history = history + [(message, error_msg)]
|
|
||||||
yield history, "Erreur lors du traitement de la requête", None, None
|
# Formater l'erreur au format message
|
||||||
|
error_history = convert_to_messages_format(history)
|
||||||
|
error_history.append({"role": "user", "content": message})
|
||||||
|
error_history.append({"role": "assistant", "content": error_msg})
|
||||||
|
|
||||||
|
yield error_history, "Erreur lors du traitement de la requête", None, None
|
||||||
|
|
||||||
# Fonction pour réinitialiser la conversation
|
# Fonction pour réinitialiser la conversation
|
||||||
def reset_conversation():
|
def reset_conversation():
|
||||||
@@ -382,4 +405,5 @@ def reset_conversation():
|
|||||||
|
|
||||||
rag_bot.clear_history()
|
rag_bot.clear_history()
|
||||||
|
|
||||||
return [], "", None, None
|
# Retourner une liste vide au format messages
|
||||||
|
return [], "", None, None # Liste vide = pas de messages
|
||||||
202
components/ui.py
202
components/ui.py
@@ -1,11 +1,58 @@
|
|||||||
import gradio as gr
|
import gradio as gr
|
||||||
from config.settings import DEFAULT_MODEL, QDRANT_COLLECTION_NAME, AVAILABLE_MODELS
|
from config.settings import DEFAULT_MODEL, QDRANT_COLLECTION_NAME, AVAILABLE_MODELS
|
||||||
from translations.lang_mappings import UI_TRANSLATIONS, UI_SUPPORTED_LANGUAGES
|
from translations.lang_mappings import UI_TRANSLATIONS, UI_SUPPORTED_LANGUAGES, LANGUAGE_MAPPING
|
||||||
from utils.katex_script import KATEX_CSS_JS
|
from utils.katex_script import KATEX_CSS_JS
|
||||||
|
|
||||||
def update_ui_language_elements(language):
    """Collect the translated UI strings for the selected language.

    Args:
        language: Name of the UI language. Any value that is not listed in
            ``UI_SUPPORTED_LANGUAGES`` falls back to ``"Français"``.

    Returns:
        A dict mapping interface element names (``"title"``,
        ``"placeholder"``, ``"model_label"``, ...) to their text in the
        chosen language. ``"options_label"`` is the fixed string
        ``"Options"`` and is not looked up in the translations.
    """
    # Unsupported languages fall back to the default UI language.
    lang = language if language in UI_SUPPORTED_LANGUAGES else "Français"
    tr = UI_TRANSLATIONS[lang]

    return {
        # Title, message box and main buttons
        "title": tr["title"],
        "placeholder": tr["placeholder"],
        "send_btn": tr["send_btn"],
        "clear_btn": tr["clear_btn"],
        # UI-language selector
        "ui_language_label": tr["ui_language_label"],
        "ui_language_info": tr["ui_language_info"],
        # Options panel: this label is hard-coded, not translated
        "options_label": "Options",
        # Model selector and status prefix
        "model_label": tr["model_selector"],
        "model_info": tr["model_info"],
        "model_current_prefix": tr["model_current"],
        # Response-language selector
        "language_label": tr["language_selector"],
        "language_info": tr["language_info"],
        # Collection input, status prefix and apply button
        "collection_label": tr["collection_input"],
        "collection_info": tr["collection_info"],
        "collection_current_prefix": tr["collection_current"],
        "apply_btn": tr["apply_btn"],
        # Streaming / sources / image-count controls
        "streaming_label": tr["streaming_label"],
        "streaming_info": tr["streaming_info"],
        "sources_label": tr["sources_label"],
        "max_images_label": tr["max_images_label"],
        # Section titles
        "images_title": tr["images_title"],
        "tables_title": tr["tables_title"],
    }
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def build_interface(
|
def build_interface(
|
||||||
process_query_fn,
|
process_query_fn,
|
||||||
@@ -14,102 +61,129 @@ def build_interface(
|
|||||||
change_collection_fn,
|
change_collection_fn,
|
||||||
update_ui_language_fn
|
update_ui_language_fn
|
||||||
):
|
):
|
||||||
"""Construit l'interface utilisateur avec Gradio."""
|
"""Construit l'interface utilisateur avec Gradio"""
|
||||||
|
print("Initialisation de l'interface")
|
||||||
|
print("AVAILABLE_MODELS chargé dans ui.py:", AVAILABLE_MODELS)
|
||||||
|
# Initialiser avec la langue par défaut (Français)
|
||||||
|
ui_elements = update_ui_language_elements("Français")
|
||||||
|
|
||||||
with gr.Blocks(css=KATEX_CSS_JS, theme=gr.themes.Soft(primary_hue="blue")) as interface:
|
with gr.Blocks(css=KATEX_CSS_JS, theme=gr.themes.Soft(primary_hue="blue")) as interface:
|
||||||
gr.Markdown("# 📚 Assistant documentaire intelligent")
|
title_md = gr.Markdown(f"# {ui_elements['title']}")
|
||||||
|
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
with gr.Column(scale=2):
|
with gr.Column(scale=2):
|
||||||
# Chatbot principal
|
|
||||||
chat_interface = gr.Chatbot(
|
chat_interface = gr.Chatbot(
|
||||||
height=600,
|
height=800,
|
||||||
show_label=False,
|
bubble_full_width=False,
|
||||||
layout="bubble",
|
show_copy_button=True,
|
||||||
elem_id="chatbot"
|
type="messages"
|
||||||
|
# likeable=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
msg = gr.Textbox(
|
msg = gr.Textbox(
|
||||||
show_label=False,
|
show_label=False,
|
||||||
placeholder="Posez votre question...",
|
placeholder=ui_elements['placeholder'],
|
||||||
container=False,
|
container=False,
|
||||||
scale=4
|
scale=4
|
||||||
)
|
)
|
||||||
submit_btn = gr.Button("Envoyer", variant="primary", scale=1)
|
submit_btn = gr.Button(ui_elements['send_btn'], variant="primary", scale=1)
|
||||||
|
|
||||||
clear_btn = gr.Button("Effacer la conversation")
|
clear_btn = gr.Button(ui_elements['clear_btn'])
|
||||||
source_info = gr.Markdown("", elem_id="sources_info")
|
source_info = gr.Markdown("", elem_id="sources_info")
|
||||||
|
|
||||||
with gr.Column(scale=1):
|
with gr.Column(scale=1):
|
||||||
with gr.Accordion("Options", open=True):
|
with gr.Accordion("Options", open=True):
|
||||||
# Sélecteur de modèle
|
# Sélecteur de langue pour l'interface
|
||||||
|
language_ui_selector = gr.Dropdown(
|
||||||
|
choices=UI_SUPPORTED_LANGUAGES,
|
||||||
|
value="Français",
|
||||||
|
label=ui_elements['ui_language_label'], # Utiliser une clé différente
|
||||||
|
info=ui_elements['ui_language_info']
|
||||||
|
)
|
||||||
|
# Sélecteur de modèle - assurez-vous que cette section est présente
|
||||||
model_selector = gr.Dropdown(
|
model_selector = gr.Dropdown(
|
||||||
choices=AVAILABLE_MODELS,
|
choices=AVAILABLE_MODELS,
|
||||||
value=DEFAULT_MODEL,
|
value=DEFAULT_MODEL,
|
||||||
label="Modèle Ollama",
|
label=ui_elements['model_label'],
|
||||||
info="Choisir le modèle de language à utiliser"
|
info=ui_elements['model_info']
|
||||||
)
|
)
|
||||||
model_status = gr.Markdown(f"Modèle actuel: **{DEFAULT_MODEL}**")
|
model_status = gr.Markdown(f"{ui_elements['model_current_prefix']}: **{DEFAULT_MODEL}**")
|
||||||
|
|
||||||
# Sélecteur de langue
|
# Sélecteur de langue pour les réponses
|
||||||
language_selector = gr.Dropdown(
|
language_selector = gr.Dropdown(
|
||||||
choices=UI_SUPPORTED_LANGUAGES,
|
choices=list(LANGUAGE_MAPPING.keys()),
|
||||||
value=UI_SUPPORTED_LANGUAGES[0],
|
value="Français",
|
||||||
label="Langue des réponses",
|
label=ui_elements['language_label'],
|
||||||
info="Choisir la langue dans laquelle l'assistant répondra"
|
info=ui_elements['language_info']
|
||||||
)
|
)
|
||||||
|
|
||||||
# Sélecteur de collection Qdrant
|
# Sélecteur de collection Qdrant
|
||||||
collection_name_input = gr.Textbox(
|
collection_name_input = gr.Textbox(
|
||||||
value=QDRANT_COLLECTION_NAME,
|
value=QDRANT_COLLECTION_NAME,
|
||||||
label="Collection Qdrant",
|
label=ui_elements['collection_label'],
|
||||||
info="Nom de la collection de documents à utiliser"
|
info=ui_elements['collection_info']
|
||||||
)
|
)
|
||||||
collection_status = gr.Markdown(f"Collection actuelle: **{QDRANT_COLLECTION_NAME}**")
|
collection_status = gr.Markdown(f"{ui_elements['collection_current_prefix']}: **{QDRANT_COLLECTION_NAME}**")
|
||||||
|
|
||||||
# Bouton d'application de la collection
|
# Bouton pour appliquer la collection
|
||||||
apply_collection_btn = gr.Button("Appliquer la collection")
|
apply_collection_btn = gr.Button(ui_elements['apply_btn'])
|
||||||
|
|
||||||
|
# Options de streaming et sources
|
||||||
streaming = gr.Checkbox(
|
streaming = gr.Checkbox(
|
||||||
label="Mode streaming",
|
label=ui_elements['streaming_label'],
|
||||||
value=True,
|
value=True,
|
||||||
info="Voir les réponses s'afficher progressivement"
|
info=ui_elements['streaming_info']
|
||||||
)
|
)
|
||||||
show_sources = gr.Checkbox(label="Afficher les sources", value=True)
|
show_sources = gr.Checkbox(label=ui_elements['sources_label'], value=True)
|
||||||
max_images = gr.Slider(
|
max_images = gr.Slider(
|
||||||
minimum=1,
|
minimum=1,
|
||||||
maximum=10,
|
maximum=10,
|
||||||
value=3,
|
value=3,
|
||||||
step=1,
|
step=1,
|
||||||
label="Nombre max d'images"
|
label=ui_elements['max_images_label']
|
||||||
)
|
)
|
||||||
|
|
||||||
gr.Markdown("---")
|
# Ne pas supprimer ces lignes dans ui.py
|
||||||
|
images_title = gr.Markdown(f"### {ui_elements['images_title']}")
|
||||||
gr.Markdown("### 🖼️ Images pertinentes")
|
image_gallery = gr.Gallery(label="Images")
|
||||||
image_gallery = gr.Gallery(
|
tables_title = gr.Markdown(f"### {ui_elements['tables_title']}")
|
||||||
label="Images pertinentes",
|
|
||||||
show_label=False,
|
|
||||||
columns=2,
|
|
||||||
height=300,
|
|
||||||
object_fit="contain"
|
|
||||||
)
|
|
||||||
|
|
||||||
gr.Markdown("### 📊 Tableaux")
|
|
||||||
tables_display = gr.HTML()
|
tables_display = gr.HTML()
|
||||||
|
|
||||||
# Connecter le changement de modèle
|
# Ajouter cette fonction juste avant de connecter le changement de langue
|
||||||
model_selector.change(
|
def preserve_models_wrapper(language):
|
||||||
fn=change_model_fn,
|
"""Préserve la liste des modèles lors du changement de langue"""
|
||||||
inputs=model_selector,
|
# Obtenir les mises à jour depuis la fonction d'origine
|
||||||
outputs=model_status
|
updates = update_ui_language_fn(language)
|
||||||
)
|
|
||||||
|
# Force la liste complète des modèles disponibles (position 5 dans les sorties)
|
||||||
# Connecter le changement de collection
|
# Cela garantit que quelles que soient les mises à jour, la liste des modèles reste intacte
|
||||||
apply_collection_btn.click(
|
if isinstance(updates[5], dict) and "choices" in updates[5]:
|
||||||
fn=change_collection_fn,
|
print("Préservation de la liste des modèles:", AVAILABLE_MODELS)
|
||||||
inputs=collection_name_input,
|
updates[5]["choices"] = AVAILABLE_MODELS
|
||||||
outputs=collection_status
|
|
||||||
|
return updates
|
||||||
|
|
||||||
|
# Puis modifier la connexion du language_ui_selector.change comme suit :
|
||||||
|
language_ui_selector.change(
|
||||||
|
fn=preserve_models_wrapper, # Utiliser notre wrapper au lieu de la fonction directe
|
||||||
|
inputs=language_ui_selector,
|
||||||
|
outputs=[
|
||||||
|
title_md,
|
||||||
|
msg,
|
||||||
|
submit_btn,
|
||||||
|
clear_btn,
|
||||||
|
language_ui_selector,
|
||||||
|
model_selector,
|
||||||
|
model_status,
|
||||||
|
language_selector,
|
||||||
|
collection_name_input,
|
||||||
|
collection_status,
|
||||||
|
apply_collection_btn,
|
||||||
|
streaming,
|
||||||
|
show_sources,
|
||||||
|
max_images
|
||||||
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
# Fonction pour effacer l'entrée
|
# Fonction pour effacer l'entrée
|
||||||
@@ -131,14 +205,28 @@ def build_interface(
|
|||||||
|
|
||||||
clear_btn.click(
|
clear_btn.click(
|
||||||
reset_conversation_fn,
|
reset_conversation_fn,
|
||||||
outputs=[chat_interface, source_info, image_gallery, tables_display]
|
outputs=[chat_interface, source_info] # Retirer image_gallery et tables_display
|
||||||
|
)
|
||||||
|
|
||||||
|
# Connecter le changement de modèle
|
||||||
|
model_selector.change(
|
||||||
|
fn=change_model_fn,
|
||||||
|
inputs=model_selector,
|
||||||
|
outputs=model_status
|
||||||
|
)
|
||||||
|
|
||||||
|
# Connecter le changement de collection
|
||||||
|
apply_collection_btn.click(
|
||||||
|
fn=change_collection_fn,
|
||||||
|
inputs=collection_name_input,
|
||||||
|
outputs=collection_status
|
||||||
)
|
)
|
||||||
|
|
||||||
# Style KaTeX et amélioration du design
|
# Style KaTeX et amélioration du design
|
||||||
gr.Markdown("""
|
gr.Markdown("""
|
||||||
<style>
|
<style>
|
||||||
.gradio-container {max-width: 1200px !important}
|
.gradio-container {max-width: 1200px !important}
|
||||||
#chatbot {height: 600px; overflow-y: auto;}
|
#chatbot {height: 800px; overflow-y: auto;}
|
||||||
#sources_info {margin-top: 10px; color: #666;}
|
#sources_info {margin-top: 10px; color: #666;}
|
||||||
|
|
||||||
/* Improved styles for equations */
|
/* Improved styles for equations */
|
||||||
|
|||||||
223
final_pdf.ipynb
223
final_pdf.ipynb
File diff suppressed because one or more lines are too long
@@ -53,8 +53,7 @@ LANGUAGE_MAPPING = {
|
|||||||
"Italiano": "italiano",
|
"Italiano": "italiano",
|
||||||
"中文": "Chinese",
|
"中文": "Chinese",
|
||||||
"日本語": "Japanese",
|
"日本語": "Japanese",
|
||||||
"العربية": "Arabic",
|
"العربية": "Arabic"
|
||||||
"فارسی": "Persian" # Added Persian language
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# Initialiser le chatbot RAG avec le modèle par défaut
|
# Initialiser le chatbot RAG avec le modèle par défaut
|
||||||
@@ -389,11 +388,12 @@ def display_tables():
|
|||||||
print(f"Error formatting table {idx}: {e}")
|
print(f"Error formatting table {idx}: {e}")
|
||||||
table_html = f'<pre>{table_data}</pre>'
|
table_html = f'<pre>{table_data}</pre>'
|
||||||
|
|
||||||
# Create the table container with metadata - REMOVED description
|
# Create the table container with metadata
|
||||||
html += f"""
|
html += f"""
|
||||||
<div style="margin-bottom: 20px; border: 1px solid #ddd; padding: 15px; border-radius: 8px;">
|
<div style="margin-bottom: 20px; border: 1px solid #ddd; padding: 15px; border-radius: 8px;">
|
||||||
<h3>{table['caption']}</h3>
|
<h3>{table['caption']}</h3>
|
||||||
<p style="color:#666; font-size:0.9em;">Source: {table['source']}, Page: {table['page']}</p>
|
<p style="color:#666; font-size:0.9em;">Source: {table['source']}, Page: {table['page']}</p>
|
||||||
|
<p><strong>Description:</strong> {table['description']}</p>
|
||||||
{table_html}
|
{table_html}
|
||||||
</div>
|
</div>
|
||||||
"""
|
"""
|
||||||
@@ -448,7 +448,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
|
|||||||
|
|
||||||
# Sélecteur de langue
|
# Sélecteur de langue
|
||||||
language_selector = gr.Dropdown(
|
language_selector = gr.Dropdown(
|
||||||
choices=["Français", "English", "Español", "Deutsch", "Italiano", "中文", "日本語", "العربية", "فارسی"],
|
choices=["Français", "English", "Español", "Deutsch", "Italiano", "中文", "日本語", "العربية"],
|
||||||
value="Français",
|
value="Français",
|
||||||
label="Langue des réponses",
|
label="Langue des réponses",
|
||||||
info="Choisir la langue dans laquelle l'assistant répondra"
|
info="Choisir la langue dans laquelle l'assistant répondra"
|
||||||
@@ -535,7 +535,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
|
|||||||
/* Improved styles for equations */
|
/* Improved styles for equations */
|
||||||
.katex { font-size: 1.1em !important; }
|
.katex { font-size: 1.1em !important; }
|
||||||
.math-inline { background: #f8f9fa; padding: 2px 5px; border-radius: 4px; }
|
.math-inline { background: #f8f9fa; padding: 2px 5px; border-radius: 4px; }
|
||||||
.math-display { background: #f8f9fa; margin: 10px 0; padding: 10px; border-radius: 5px; overflow-x: auto; text-align: center; }
|
.math-display { background: #f8f9f9; margin: 10px 0; padding: 10px; border-radius: 5px; overflow-x: auto; text-align: center; }
|
||||||
|
|
||||||
/* Table styles */
|
/* Table styles */
|
||||||
table {
|
table {
|
||||||
@@ -578,15 +578,15 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
|
|||||||
delimiters: [
|
delimiters: [
|
||||||
{left: '$$', right: '$$', display: true},
|
{left: '$$', right: '$$', display: true},
|
||||||
{left: '$', right: '$', display: false},
|
{left: '$', right: '$', display: false},
|
||||||
{left: '\\\\(', right: '\\\\)', display: false},
|
{left: '\\(', right: '\\)', display: false},
|
||||||
{left: '\\\\[', right: '\\\\]', display: true}
|
{left: '\\[', right: '\\]', display: true}
|
||||||
],
|
],
|
||||||
throwOnError: false,
|
throwOnError: false,
|
||||||
trust: true,
|
trust: true,
|
||||||
strict: false,
|
strict: false,
|
||||||
macros: {
|
macros: {
|
||||||
"\\\\R": "\\\\mathbb{R}",
|
"\\R": "\\mathbb{R}",
|
||||||
"\\\\N": "\\\\mathbb{N}"
|
"\\N": "\\mathbb{N}"
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
@@ -617,12 +617,12 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
|
|||||||
function prepareLatexInText(text) {
|
function prepareLatexInText(text) {
|
||||||
// Make sure dollar signs used for math have proper spacing
|
// Make sure dollar signs used for math have proper spacing
|
||||||
// First, protect existing well-formed math expressions
|
// First, protect existing well-formed math expressions
|
||||||
text = text.replace(/(\\$\\$[^\\$]+\\$\\$)/g, '<protect>$1</protect>'); // protect display math
|
text = text.replace(/(\$\$[^\$]+\$\$)/g, '<protect>$1</protect>'); // protect display math
|
||||||
text = text.replace(/(\\$[^\\$\\n]+\\$)/g, '<protect>$1</protect>'); // protect inline math
|
text = text.replace(/(\$[^\$\n]+\$)/g, '<protect>$1</protect>'); // protect inline math
|
||||||
|
|
||||||
// Fix common LaTeX formatting issues outside protected regions
|
// Fix common LaTeX formatting issues outside protected regions
|
||||||
text = text.replace(/([^<]protect[^>]*)(\\$)([^\\s])/g, '$1$2 $3'); // Add space after $ if needed
|
text = text.replace(/([^<]protect[^>]*)(\$)([^\s])/g, '$1$2 $3'); // Add space after $ if needed
|
||||||
text = text.replace(/([^\\s])(\\$)([^<]protect[^>]*)/g, '$1 $2$3'); // Add space before $ if needed
|
text = text.replace(/([^\s])(\$)([^<]protect[^>]*)/g, '$1 $2$3'); // Add space before $ if needed
|
||||||
|
|
||||||
// Handle subscripts: transform x_1 into x_{1} for better LaTeX compatibility
|
// Handle subscripts: transform x_1 into x_{1} for better LaTeX compatibility
|
||||||
text = text.replace(/([a-zA-Z])_([0-9a-zA-Z])/g, '$1_{$2}');
|
text = text.replace(/([a-zA-Z])_([0-9a-zA-Z])/g, '$1_{$2}');
|
||||||
|
|||||||
230
pdfProcessing.py
230
pdfProcessing.py
@@ -7,7 +7,9 @@ from langchain.schema import Document
|
|||||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||||
from langchain_core.prompts import ChatPromptTemplate
|
from langchain_core.prompts import ChatPromptTemplate
|
||||||
from langchain_core.output_parsers import StrOutputParser
|
from langchain_core.output_parsers import StrOutputParser
|
||||||
|
import httpx
|
||||||
|
from tqdm import tqdm
|
||||||
|
http_client = httpx.Client(verify=False)
|
||||||
|
|
||||||
class PdfProcessor:
|
class PdfProcessor:
|
||||||
"""
|
"""
|
||||||
@@ -80,6 +82,40 @@ class PdfProcessor:
|
|||||||
if not self.config["openai_api_key"]:
|
if not self.config["openai_api_key"]:
|
||||||
raise ValueError("OpenAI API key is required when using OpenAI models")
|
raise ValueError("OpenAI API key is required when using OpenAI models")
|
||||||
os.environ["OPENAI_API_KEY"] = self.config["openai_api_key"]
|
os.environ["OPENAI_API_KEY"] = self.config["openai_api_key"]
|
||||||
|
|
||||||
|
# Initialize Qdrant client
|
||||||
|
from qdrant_client import QdrantClient
|
||||||
|
from qdrant_client.http import models as rest
|
||||||
|
|
||||||
|
self.qdrant_client = QdrantClient(url=self.config["qdrant_url"])
|
||||||
|
|
||||||
|
# Check if collection exists and create it if not
|
||||||
|
collections = self.qdrant_client.get_collections().collections
|
||||||
|
collection_exists = any(collection.name == self.config["collection_name"] for collection in collections)
|
||||||
|
|
||||||
|
if not collection_exists:
|
||||||
|
# Get vector size based on embedding model
|
||||||
|
if self.config["embedding_provider"] == "ollama":
|
||||||
|
# For OllamaEmbeddings, typically 4096 dimensions for newer models
|
||||||
|
vector_size = 4096
|
||||||
|
else: # OpenAI
|
||||||
|
# OpenAI embedding dimensions vary by model
|
||||||
|
model_dimensions = {
|
||||||
|
"text-embedding-ada-002": 1536,
|
||||||
|
"text-embedding-3-small": 1536,
|
||||||
|
"text-embedding-3-large": 3072
|
||||||
|
}
|
||||||
|
vector_size = model_dimensions.get(self.config["openai_embedding_model"], 1536)
|
||||||
|
|
||||||
|
# Create the collection
|
||||||
|
self.qdrant_client.create_collection(
|
||||||
|
collection_name=self.config["collection_name"],
|
||||||
|
vectors_config=rest.VectorParams(
|
||||||
|
size=vector_size,
|
||||||
|
distance=rest.Distance.COSINE
|
||||||
|
)
|
||||||
|
)
|
||||||
|
print(f"Created new Qdrant collection: {self.config['collection_name']}")
|
||||||
|
|
||||||
def _setup_models(self):
|
def _setup_models(self):
|
||||||
"""Initialize models based on configuration."""
|
"""Initialize models based on configuration."""
|
||||||
@@ -106,6 +142,7 @@ class PdfProcessor:
|
|||||||
else: # openai
|
else: # openai
|
||||||
from langchain_openai import ChatOpenAI
|
from langchain_openai import ChatOpenAI
|
||||||
self.summary_model = ChatOpenAI(
|
self.summary_model = ChatOpenAI(
|
||||||
|
http_client=http_client,
|
||||||
model=self.config["openai_summary_model"]
|
model=self.config["openai_summary_model"]
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -134,38 +171,45 @@ class PdfProcessor:
|
|||||||
Returns:
|
Returns:
|
||||||
Dictionary with processing statistics
|
Dictionary with processing statistics
|
||||||
"""
|
"""
|
||||||
# Load and extract content from PDF
|
# Create a master progress bar
|
||||||
print("Loading PDF and extracting elements...")
|
with tqdm(total=5, desc="PDF Processing", position=0) as master_bar:
|
||||||
documents = self._load_pdf(pdf_path)
|
# Load and extract content from PDF
|
||||||
|
master_bar.set_description("Loading PDF")
|
||||||
# Process text chunks
|
documents = self._load_pdf(pdf_path)
|
||||||
print("Processing text chunks...")
|
master_bar.update(1)
|
||||||
title_chunks = self._process_text(documents)
|
|
||||||
text_summaries = self._summarize_text(title_chunks)
|
# Process text chunks
|
||||||
processed_text = self._convert_text_to_documents(title_chunks, text_summaries)
|
master_bar.set_description("Processing text chunks")
|
||||||
|
title_chunks = self._process_text(documents)
|
||||||
# Process images if configured
|
text_summaries = self._summarize_text(title_chunks)
|
||||||
print("Processing images...")
|
processed_text = self._convert_text_to_documents(title_chunks, text_summaries)
|
||||||
processed_images = []
|
master_bar.update(1)
|
||||||
if self.config["extract_images"]:
|
|
||||||
images = self._extract_images(documents)
|
# Process images if configured
|
||||||
image_summaries = self._process_images(images)
|
master_bar.set_description("Processing images")
|
||||||
processed_images = self._convert_images_to_documents(images, image_summaries)
|
processed_images = []
|
||||||
|
if self.config["extract_images"]:
|
||||||
# Process tables if configured
|
images = self._extract_images(documents)
|
||||||
print("Processing tables...")
|
image_summaries = self._process_images(images)
|
||||||
processed_tables = []
|
processed_images = self._convert_images_to_documents(images, image_summaries)
|
||||||
if self.config["extract_tables"]:
|
master_bar.update(1)
|
||||||
tables = self._extract_tables(documents)
|
|
||||||
table_summaries = self._process_tables(tables)
|
# Process tables if configured
|
||||||
processed_tables = self._convert_tables_to_documents(tables, table_summaries)
|
master_bar.set_description("Processing tables")
|
||||||
|
processed_tables = []
|
||||||
print("Storing processed elements in Qdrant...")
|
if self.config["extract_tables"]:
|
||||||
# Combine all processed elements
|
tables = self._extract_tables(documents)
|
||||||
final_documents = processed_text + processed_images + processed_tables
|
table_summaries = self._process_tables(tables)
|
||||||
|
processed_tables = self._convert_tables_to_documents(tables, table_summaries)
|
||||||
# Store in Qdrant
|
master_bar.update(1)
|
||||||
self._store_documents(final_documents)
|
|
||||||
|
master_bar.set_description("Storing in Qdrant")
|
||||||
|
# Combine all processed elements
|
||||||
|
final_documents = processed_text + processed_images + processed_tables
|
||||||
|
|
||||||
|
# Store in Qdrant
|
||||||
|
self._store_documents(final_documents)
|
||||||
|
master_bar.update(1)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"text_chunks": len(processed_text),
|
"text_chunks": len(processed_text),
|
||||||
@@ -199,7 +243,15 @@ class PdfProcessor:
|
|||||||
|
|
||||||
def _summarize_text(self, chunks: List[Document]) -> List[str]:
    """Summarize each text chunk, one at a time, with a progress bar.

    Args:
        chunks: Documents whose ``page_content`` is sent to the summary chain.

    Returns:
        One summary per chunk, in input order; an empty list when there
        is nothing to summarize.
    """
    if not chunks:
        return []

    print(f"Summarizing {len(chunks)} text chunks...")
    # Chunks are summarized sequentially so the progress bar advances per chunk.
    return [
        self.summarize_chain.invoke(document.page_content)
        for document in tqdm(chunks, desc="Text summarization", leave=False)
    ]
|
||||||
|
|
||||||
def _extract_images(self, documents: List[Document]) -> List[Dict[str, Any]]:
|
def _extract_images(self, documents: List[Document]) -> List[Dict[str, Any]]:
|
||||||
"""Extract images with captions from documents."""
|
"""Extract images with captions from documents."""
|
||||||
@@ -225,12 +277,17 @@ class PdfProcessor:
|
|||||||
|
|
||||||
def _process_images(self, images: List[Dict[str, Any]]) -> List[str]:
|
def _process_images(self, images: List[Dict[str, Any]]) -> List[str]:
|
||||||
"""Generate descriptions for images using configured model."""
|
"""Generate descriptions for images using configured model."""
|
||||||
|
if not images:
|
||||||
|
return []
|
||||||
|
|
||||||
|
print(f"Processing {len(images)} images...")
|
||||||
|
|
||||||
if self.config["image_provider"] == "ollama":
|
if self.config["image_provider"] == "ollama":
|
||||||
from ollama import Client
|
from ollama import Client
|
||||||
client = Client(host=self.config["ollama_image_url"])
|
client = Client(host=self.config["ollama_image_url"])
|
||||||
|
|
||||||
image_summaries = []
|
image_summaries = []
|
||||||
for img in images:
|
for img in tqdm(images, desc="Image processing", leave=False):
|
||||||
prompt = f"Caption of image: {img.get('caption', '')}. Describe this image in detail in {self.config['summary_language']}."
|
prompt = f"Caption of image: {img.get('caption', '')}. Describe this image in detail in {self.config['summary_language']}."
|
||||||
response = client.chat(
|
response = client.chat(
|
||||||
model=self.config["ollama_image_model"],
|
model=self.config["ollama_image_model"],
|
||||||
@@ -261,9 +318,17 @@ class PdfProcessor:
|
|||||||
]
|
]
|
||||||
|
|
||||||
prompt = ChatPromptTemplate.from_messages(messages)
|
prompt = ChatPromptTemplate.from_messages(messages)
|
||||||
chain = prompt | ChatOpenAI(model=self.config["openai_image_model"]) | StrOutputParser()
|
chain = prompt | ChatOpenAI(model=self.config["openai_image_model"], http_client=http_client) | StrOutputParser()
|
||||||
|
|
||||||
return chain.batch([{"image_base64": img["image_base64"], "caption": img.get("caption", "")} for img in images])
|
# Process images with progress bar
|
||||||
|
results = []
|
||||||
|
image_data = [{"image_base64": img["image_base64"], "caption": img.get("caption", "")} for img in images]
|
||||||
|
|
||||||
|
for img_data in tqdm(image_data, desc="Image processing", leave=False):
|
||||||
|
result = chain.invoke(img_data)
|
||||||
|
results.append(result)
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
def _extract_tables(self, documents: List[Document]) -> List[Dict[str, Any]]:
|
def _extract_tables(self, documents: List[Document]) -> List[Dict[str, Any]]:
|
||||||
"""Extract tables with captions from documents."""
|
"""Extract tables with captions from documents."""
|
||||||
@@ -290,9 +355,13 @@ class PdfProcessor:
|
|||||||
|
|
||||||
def _process_tables(self, tables: List[Dict[str, Any]]) -> List[str]:
|
def _process_tables(self, tables: List[Dict[str, Any]]) -> List[str]:
|
||||||
"""Generate summaries for tables."""
|
"""Generate summaries for tables."""
|
||||||
|
if not tables:
|
||||||
|
return []
|
||||||
|
|
||||||
|
print(f"Processing {len(tables)} tables...")
|
||||||
table_summaries = []
|
table_summaries = []
|
||||||
|
|
||||||
for table in tables:
|
for table in tqdm(tables, desc="Table processing", leave=False):
|
||||||
prompt = f"""Caption of table: {table.get('caption', '')}.
|
prompt = f"""Caption of table: {table.get('caption', '')}.
|
||||||
Describe this table in detail in {self.config['summary_language']}.
|
Describe this table in detail in {self.config['summary_language']}.
|
||||||
Table content: {table.get('table_data', '')}"""
|
Table content: {table.get('table_data', '')}"""
|
||||||
@@ -481,11 +550,86 @@ class PdfProcessor:
|
|||||||
final_chunks.extend(sub_chunks)
|
final_chunks.extend(sub_chunks)
|
||||||
|
|
||||||
return final_chunks
|
return final_chunks
|
||||||
|
|
||||||
|
def process_directory(self, directory_path: str) -> Dict[str, Any]:
    """
    Process all PDF files in the specified directory.

    Files are handled in sorted order so repeated runs are reproducible.
    A failure on one file is reported and recorded, then processing
    continues with the next file.

    Args:
        directory_path: Path to the directory containing PDF files

    Returns:
        Dictionary with processing statistics for all files. The same keys
        are present whether or not any PDF was found: ``files_processed``,
        ``total_text_chunks``, ``total_image_chunks``, ``total_table_chunks``,
        ``total_chunks``, ``collection_name``, ``file_details`` (one entry
        per successfully processed file) and ``failed_files`` (one
        ``{"filename", "error"}`` entry per file that raised).

    Raises:
        ValueError: If ``directory_path`` is not an existing directory.
    """
    if not os.path.isdir(directory_path):
        raise ValueError(f"Directory not found: {directory_path}")

    # glob returns matches in arbitrary order; sort for deterministic runs.
    pdf_files = sorted(glob.glob(os.path.join(directory_path, "*.pdf")))

    # Built up front so the "no files" path returns the same shape as a full run.
    overall_stats = {
        "files_processed": 0,
        "total_text_chunks": 0,
        "total_image_chunks": 0,
        "total_table_chunks": 0,
        "total_chunks": 0,
        "collection_name": self.config["collection_name"],
        "file_details": [],
        "failed_files": [],
    }

    if not pdf_files:
        print(f"No PDF files found in {directory_path}")
        return overall_stats

    # Per-file result keys that are accumulated into "total_<key>" counters.
    chunk_keys = ("text_chunks", "image_chunks", "table_chunks", "total_chunks")

    print(f"Found {len(pdf_files)} PDF files in {directory_path}")
    for pdf_file in tqdm(pdf_files, desc="Processing PDF files", unit="file"):
        filename = os.path.basename(pdf_file)
        try:
            print(f"\nProcessing: {filename}")
            result = self.process_pdf(pdf_file)
        except Exception as e:
            # Best effort: one bad PDF must not abort the whole batch,
            # but the failure is kept in the stats instead of only printed.
            print(f"Error processing {pdf_file}: {str(e)}")
            overall_stats["failed_files"].append(
                {"filename": filename, "error": str(e)}
            )
            continue

        file_detail = {"filename": filename}
        for key in chunk_keys:
            file_detail[key] = result.get(key, 0)

        overall_stats["files_processed"] += 1
        overall_stats["total_text_chunks"] += file_detail["text_chunks"]
        overall_stats["total_image_chunks"] += file_detail["image_chunks"]
        overall_stats["total_table_chunks"] += file_detail["table_chunks"]
        overall_stats["total_chunks"] += file_detail["total_chunks"]
        overall_stats["file_details"].append(file_detail)

        print(f"Completed: {file_detail['filename']} - {file_detail['total_chunks']} chunks processed")

    print("\nDirectory processing complete!")
    print(f"Processed {overall_stats['files_processed']} files")
    if overall_stats["failed_files"]:
        print(f"Failed files: {len(overall_stats['failed_files'])}")
    print(f"Total chunks: {overall_stats['total_chunks']}")
    print(f" - Text chunks: {overall_stats['total_text_chunks']}")
    print(f" - Image chunks: {overall_stats['total_image_chunks']}")
    print(f" - Table chunks: {overall_stats['total_table_chunks']}")
    print(f"All content stored in collection: {overall_stats['collection_name']}")

    return overall_stats
|
||||||
|
|
||||||
|
import glob
|
||||||
|
import os
|
||||||
# NOTE(security): an OpenAI API key was previously hard-coded in this config
# (first active, later commented out). It is part of the repository history and
# must be treated as compromised: revoke it, and supply any key through the
# "openai_api_key" config entry from a secret store or environment variable,
# never as a literal in source.
processor = PdfProcessor({
    "collection_name": "my_control_and calibration",
    "summary_language": "English"
})

# Ingest every PDF found in the host data directory into the collection above.
results = processor.process_directory(r"C:\Users\serameza\host-data")
|
||||||
|
|||||||
328
services/rag_service.py
Normal file
328
services/rag_service.py
Normal file
@@ -0,0 +1,328 @@
|
|||||||
|
import base64
|
||||||
|
from io import BytesIO
|
||||||
|
from PIL import Image
|
||||||
|
import traceback
|
||||||
|
import threading
|
||||||
|
import queue
|
||||||
|
import time
|
||||||
|
|
||||||
|
from rag_chatbot import MultimodalRAGChatbot
|
||||||
|
from langchain.prompts import ChatPromptTemplate
|
||||||
|
from langchain_ollama import ChatOllama
|
||||||
|
from langchain.callbacks.base import BaseCallbackHandler
|
||||||
|
|
||||||
|
# Handler personnalisé pour capturer les tokens en streaming
|
||||||
|
class GradioStreamingHandler(BaseCallbackHandler):
    """Callback handler that captures LLM tokens as they are streamed.

    Every token is pushed onto ``tokens_queue`` so another thread can poll
    for it, and is also appended to ``full_text``.
    """

    def __init__(self):
        # Let the callback base class run its own initialisation first.
        super().__init__()
        # Hand-off point between the generating thread and the consumer.
        self.tokens_queue = queue.Queue()
        # Concatenation of every token received so far.
        self.full_text = ""

    def on_llm_new_token(self, token, **kwargs):
        """Record one newly generated token.

        Args:
            token: Text fragment emitted by the model.
            **kwargs: Extra callback metadata; accepted and ignored.
        """
        self.tokens_queue.put(token)
        self.full_text += token
|
||||||
|
|
||||||
|
# Fonction pour créer un objet Image à partir des données base64
|
||||||
|
def base64_to_image(base64_data):
    """Convert base64-encoded image data into an ``Image`` for direct display.

    Args:
        base64_data: Base64 representation of an encoded image file.

    Returns:
        The decoded image, or ``None`` when the input is empty or the data
        cannot be decoded as an image (the error is printed, not raised).
    """
    if not base64_data:
        return None

    try:
        image = Image.open(BytesIO(base64.b64decode(base64_data)))
        # Image.open only reads the header; force the pixel data to be decoded
        # now so a truncated or corrupt payload is reported here instead of
        # failing later, outside this guard, when the image is rendered.
        image.load()
        return image
    except Exception as e:
        # Deliberate best effort: a bad image must not break the response.
        print(f"Erreur lors de la conversion d'image: {e}")
        return None
|
||||||
|
|
||||||
|
# Configuration pour initialiser le chatbot
|
||||||
|
QDRANT_URL = "http://localhost:6333"
|
||||||
|
QDRANT_COLLECTION_NAME = "my_custom_collection"
|
||||||
|
EMBEDDING_MODEL = "mxbai-embed-large"
|
||||||
|
OLLAMA_URL = "http://127.0.0.1:11434"
|
||||||
|
DEFAULT_MODEL = "llama3.2"
|
||||||
|
|
||||||
|
# Liste des modèles disponibles
|
||||||
|
AVAILABLE_MODELS = ["llama3.1", "llama3.2", "deepseek-r1:7b", "deepseek-r1:14b"]
|
||||||
|
|
||||||
|
# Mapping des langues pour une meilleure compréhension par le LLM
|
||||||
|
LANGUAGE_MAPPING = {
|
||||||
|
"Français": "français",
|
||||||
|
"English": "English",
|
||||||
|
"Español": "español",
|
||||||
|
"Deutsch": "Deutsch",
|
||||||
|
"Italiano": "italiano",
|
||||||
|
"中文": "Chinese",
|
||||||
|
"日本語": "Japanese",
|
||||||
|
"العربية": "Arabic"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Variables globales pour stocker les images et tableaux de la dernière requête
|
||||||
|
current_images = []
|
||||||
|
current_tables = []
|
||||||
|
|
||||||
|
# Initialiser le chatbot RAG avec le modèle par défaut
|
||||||
|
def initialize_rag_bot():
    """Build the module-level RAG chatbot from the default settings.

    Rebinds the global ``rag_bot`` to a new ``MultimodalRAGChatbot`` configured
    from the module constants, then prints the model that was selected.
    """
    global rag_bot

    default_settings = {
        "qdrant_url": QDRANT_URL,
        "qdrant_collection_name": QDRANT_COLLECTION_NAME,
        "ollama_model": DEFAULT_MODEL,
        "embedding_model": EMBEDDING_MODEL,
        "ollama_url": OLLAMA_URL,
    }
    rag_bot = MultimodalRAGChatbot(**default_settings)
    print(f"Chatbot initialisé avec modèle: {DEFAULT_MODEL}")
|
||||||
|
|
||||||
|
# Fonction pour changer de modèle
|
||||||
|
def change_model(model_name):
    """Rebuild the global chatbot so that it uses another Ollama model.

    Args:
        model_name: Name of the model to switch to.

    Returns:
        A status message for the UI: a check-mark message on success, or a
        cross-mark message carrying the error text on failure.
    """
    global rag_bot

    try:
        # Same settings as the current bot; only the model differs.
        bot_settings = {
            "qdrant_url": QDRANT_URL,
            "qdrant_collection_name": QDRANT_COLLECTION_NAME,
            "ollama_model": model_name,
            "embedding_model": EMBEDDING_MODEL,
            "ollama_url": OLLAMA_URL,
        }
        rag_bot = MultimodalRAGChatbot(**bot_settings)
        confirmation = f"Modèle changé pour: {model_name}"
        print(confirmation)
        return f"✅ {confirmation}"
    except Exception as exc:
        # Any failure is reported to the UI rather than raised.
        print(f"Erreur lors du changement de modèle: {exc}")
        return f"❌ Erreur: {str(exc)}"
|
||||||
|
|
||||||
|
# Fonction pour changer de collection
|
||||||
|
def change_collection(collection_name):
    """Point the global chatbot at another Qdrant collection.

    The current model (``rag_bot.llm.model``) is kept. The module-level
    ``QDRANT_COLLECTION_NAME`` is only updated once the new chatbot has been
    built, so a failed switch leaves both the bot and the recorded collection
    name untouched.

    Args:
        collection_name: Name of the Qdrant collection to use.

    Returns:
        A status message for the UI: a check-mark message on success, or a
        cross-mark message carrying the error text on failure.
    """
    global rag_bot, QDRANT_COLLECTION_NAME

    try:
        new_bot = MultimodalRAGChatbot(
            qdrant_url=QDRANT_URL,
            qdrant_collection_name=collection_name,
            ollama_model=rag_bot.llm.model,  # keep the current model
            embedding_model=EMBEDDING_MODEL,
            ollama_url=OLLAMA_URL
        )
        # Commit the switch only after construction succeeded; other functions
        # rebuild the bot from QDRANT_COLLECTION_NAME and must not pick up a
        # collection that could not be opened.
        rag_bot = new_bot
        QDRANT_COLLECTION_NAME = collection_name
        print(f"Collection changée pour: {collection_name}")
        return f"✅ Collection changée pour: {collection_name}"
    except Exception as e:
        # Any failure is reported to the UI rather than raised.
        print(f"Erreur lors du changement de collection: {e}")
        return f"❌ Erreur: {str(e)}"
|
||||||
|
|
||||||
|
# Fonction de traitement des requêtes avec support du streaming dans Gradio
|
||||||
|
def process_query(message, history, streaming, show_sources, max_images, language):
|
||||||
|
global current_images, current_tables
|
||||||
|
|
||||||
|
if not message.strip():
|
||||||
|
return history, "", None, None
|
||||||
|
|
||||||
|
current_images = []
|
||||||
|
current_tables = []
|
||||||
|
|
||||||
|
try:
|
||||||
|
if streaming:
|
||||||
|
# Version avec streaming dans Gradio
|
||||||
|
history = history + [(message, "")]
|
||||||
|
|
||||||
|
# 1. Récupérer les documents pertinents
|
||||||
|
docs = rag_bot._retrieve_relevant_documents(message)
|
||||||
|
|
||||||
|
# 2. Préparer le contexte et l'historique
|
||||||
|
context = rag_bot._format_documents(docs)
|
||||||
|
history_text = rag_bot._format_chat_history()
|
||||||
|
|
||||||
|
# 3. Préparer le prompt
|
||||||
|
prompt_template = ChatPromptTemplate.from_template("""
|
||||||
|
Tu es un assistant documentaire spécialisé qui utilise toutes les informations disponibles dans le contexte fourni.
|
||||||
|
|
||||||
|
TRÈS IMPORTANT: Tu dois répondre EXCLUSIVEMENT en {language}. Ne réponds JAMAIS dans une autre langue.
|
||||||
|
|
||||||
|
Instructions spécifiques:
|
||||||
|
1. Pour chaque image mentionnée dans le contexte, inclue TOUJOURS dans ta réponse:
|
||||||
|
- La légende/caption exacte de l'image
|
||||||
|
- La source et le numéro de page
|
||||||
|
- Une description brève de ce qu'elle montre
|
||||||
|
|
||||||
|
2. Pour chaque tableau mentionné dans le contexte, inclue TOUJOURS:
|
||||||
|
- Le titre/caption exact du tableau
|
||||||
|
- La source et le numéro de page
|
||||||
|
- Ce que contient et signifie le tableau
|
||||||
|
|
||||||
|
3. Lorsque tu cites des équations mathématiques:
|
||||||
|
- Utilise la syntaxe LaTeX exacte comme dans le document ($...$ ou $$...$$)
|
||||||
|
- Reproduis-les fidèlement sans modification
|
||||||
|
|
||||||
|
4. IMPORTANT: Ne pas inventer d'informations - si une donnée n'est pas explicitement fournie dans le contexte,
|
||||||
|
indique clairement que cette information n'est pas disponible dans les documents fournis.
|
||||||
|
|
||||||
|
5. Cite précisément les sources pour chaque élément d'information (format: [Source, Page]).
|
||||||
|
|
||||||
|
6. CRUCIAL: Ta réponse doit être UNIQUEMENT et INTÉGRALEMENT en {language}, quelle que soit la langue de la question.
|
||||||
|
|
||||||
|
Historique de conversation:
|
||||||
|
{chat_history}
|
||||||
|
|
||||||
|
Contexte (à utiliser pour répondre):
|
||||||
|
{context}
|
||||||
|
|
||||||
|
Question: {question}
|
||||||
|
|
||||||
|
Réponds de façon structurée et précise en intégrant activement les images, tableaux et équations disponibles dans le contexte.
|
||||||
|
Ta réponse doit être exclusivement en {language}.
|
||||||
|
""")
|
||||||
|
|
||||||
|
# 4. Formater les messages pour le LLM
|
||||||
|
messages = prompt_template.format_messages(
|
||||||
|
chat_history=history_text,
|
||||||
|
context=context,
|
||||||
|
question=message,
|
||||||
|
language=LANGUAGE_MAPPING.get(language, "français") # Use the mapped language value
|
||||||
|
)
|
||||||
|
|
||||||
|
# 5. Créer un handler de streaming personnalisé
|
||||||
|
handler = GradioStreamingHandler()
|
||||||
|
|
||||||
|
# 6. Créer un modèle LLM avec notre handler
|
||||||
|
streaming_llm = ChatOllama(
|
||||||
|
model=rag_bot.llm.model,
|
||||||
|
base_url=rag_bot.llm.base_url,
|
||||||
|
streaming=True,
|
||||||
|
callbacks=[handler]
|
||||||
|
)
|
||||||
|
|
||||||
|
# 7. Lancer la génération dans un thread pour ne pas bloquer l'UI
|
||||||
|
def generate_response():
|
||||||
|
streaming_llm.invoke(messages)
|
||||||
|
|
||||||
|
thread = threading.Thread(target=generate_response)
|
||||||
|
thread.start()
|
||||||
|
|
||||||
|
# 8. Récupérer les tokens et mettre à jour l'interface
|
||||||
|
partial_response = ""
|
||||||
|
|
||||||
|
# Attendre les tokens avec un timeout
|
||||||
|
while thread.is_alive() or not handler.tokens_queue.empty():
|
||||||
|
try:
|
||||||
|
token = handler.tokens_queue.get(timeout=0.05)
|
||||||
|
partial_response += token
|
||||||
|
history[-1] = (message, partial_response)
|
||||||
|
yield history, "", None, None
|
||||||
|
except queue.Empty:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 9. Thread terminé, mettre à jour l'historique de conversation du chatbot
|
||||||
|
rag_bot.chat_history.append({"role": "user", "content": message})
|
||||||
|
rag_bot.chat_history.append({"role": "assistant", "content": partial_response})
|
||||||
|
|
||||||
|
# 10. Récupérer les sources, images, tableaux
|
||||||
|
texts, images, tables = rag_bot._process_documents(docs)
|
||||||
|
|
||||||
|
# Préparer les informations sur les sources
|
||||||
|
source_info = ""
|
||||||
|
if texts:
|
||||||
|
source_info += f"📚 {len(texts)} textes • "
|
||||||
|
if images:
|
||||||
|
source_info += f"🖼️ {len(images)} images • "
|
||||||
|
if tables:
|
||||||
|
source_info += f"📊 {len(tables)} tableaux"
|
||||||
|
|
||||||
|
if source_info:
|
||||||
|
source_info = "Sources trouvées: " + source_info
|
||||||
|
|
||||||
|
# 11. Traiter les images
|
||||||
|
if show_sources and images:
|
||||||
|
images = images[:max_images]
|
||||||
|
for img in images:
|
||||||
|
img_data = img.get("image_data")
|
||||||
|
if img_data:
|
||||||
|
image = base64_to_image(img_data)
|
||||||
|
if image:
|
||||||
|
current_images.append({
|
||||||
|
"image": image,
|
||||||
|
"caption": img.get("caption", ""),
|
||||||
|
"source": img.get("source", ""),
|
||||||
|
"page": img.get("page", ""),
|
||||||
|
"description": img.get("description", "")
|
||||||
|
})
|
||||||
|
|
||||||
|
# 12. Traiter les tableaux
|
||||||
|
if show_sources and tables:
|
||||||
|
for table in tables:
|
||||||
|
current_tables.append({
|
||||||
|
"data": rag_bot.format_table(table.get("table_data", "")),
|
||||||
|
"caption": table.get("caption", ""),
|
||||||
|
"source": table.get("source", ""),
|
||||||
|
"page": table.get("page", ""),
|
||||||
|
"description": table.get("description", "")
|
||||||
|
})
|
||||||
|
|
||||||
|
# 13. Retourner les résultats finaux
|
||||||
|
yield history, source_info, display_images(current_images), display_tables(current_tables, language)
|
||||||
|
|
||||||
|
else:
|
||||||
|
# Version sans streaming (code existant)
|
||||||
|
result = rag_bot.chat(message, stream=False)
|
||||||
|
history = history + [(message, result["response"])]
|
||||||
|
|
||||||
|
# Préparer les informations sur les sources
|
||||||
|
source_info = ""
|
||||||
|
if "texts" in result:
|
||||||
|
source_info += f"📚 {len(result['texts'])} textes • "
|
||||||
|
if "images" in result:
|
||||||
|
source_info += f"🖼️ {len(result['images'])} images • "
|
||||||
|
if "tables" in result:
|
||||||
|
source_info += f"📊 {len(result['tables'])} tableaux"
|
||||||
|
|
||||||
|
if source_info:
|
||||||
|
source_info = "Sources trouvées: " + source_info
|
||||||
|
|
||||||
|
# Traiter les images et tableaux
|
||||||
|
if show_sources and "images" in result and result["images"]:
|
||||||
|
images = result["images"][:max_images]
|
||||||
|
for img in images:
|
||||||
|
img_data = img.get("image_data")
|
||||||
|
if img_data:
|
||||||
|
image = base64_to_image(img_data)
|
||||||
|
if image:
|
||||||
|
current_images.append({
|
||||||
|
"image": image,
|
||||||
|
"caption": img.get("caption", ""),
|
||||||
|
"source": img.get("source", ""),
|
||||||
|
"page": img.get("page", ""),
|
||||||
|
"description": img.get("description", "")
|
||||||
|
})
|
||||||
|
|
||||||
|
if show_sources and "tables" in result and result["tables"]:
|
||||||
|
tables = result["tables"]
|
||||||
|
for table in tables:
|
||||||
|
current_tables.append({
|
||||||
|
"data": rag_bot.format_table(table.get("table_data", "")),
|
||||||
|
"caption": table.get("caption", ""),
|
||||||
|
"source": table.get("source", ""),
|
||||||
|
"page": table.get("page", ""),
|
||||||
|
"description": table.get("description", "")
|
||||||
|
})
|
||||||
|
|
||||||
|
return history, source_info, display_images(current_images), display_tables(current_tables, language)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
error_msg = f"Une erreur est survenue: {str(e)}"
|
||||||
|
traceback_text = traceback.format_exc()
|
||||||
|
print(error_msg)
|
||||||
|
print(traceback_text)
|
||||||
|
history = history + [(message, error_msg)]
|
||||||
|
return history, "Erreur lors du traitement de la requête", None, None
|
||||||
|
|
||||||
|
# Fonction pour réinitialiser la conversation
|
||||||
|
def reset_conversation():
    """Reset the conversation state and return blank UI outputs.

    Rebinds the module-level ``current_images`` and ``current_tables`` to
    fresh empty lists, then asks ``rag_bot`` to clear its history.

    Returns:
        A 4-tuple ``([], "", None, None)``. It has the same arity as the
        values produced by ``process_query`` (history, source info, images,
        tables) -- presumably wired to the same UI outputs; confirm against
        the interface definition.
    """
    global current_images, current_tables

    # Rebind both globals to new lists instead of mutating the old ones.
    current_images, current_tables = [], []

    rag_bot.clear_history()

    cleared_outputs = ([], "", None, None)
    return cleared_outputs
|
||||||
@@ -7,8 +7,7 @@ LANGUAGE_MAPPING = {
|
|||||||
"Italiano": "italiano",
|
"Italiano": "italiano",
|
||||||
"中文": "Chinese",
|
"中文": "Chinese",
|
||||||
"日本語": "Japanese",
|
"日本語": "Japanese",
|
||||||
"العربية": "Arabic"
|
}
|
||||||
}
|
|
||||||
|
|
||||||
# Dictionnaire de traductions pour l'interface
|
# Dictionnaire de traductions pour l'interface
|
||||||
UI_TRANSLATIONS = {
|
UI_TRANSLATIONS = {
|
||||||
@@ -39,7 +38,9 @@ UI_TRANSLATIONS = {
|
|||||||
"error_msg": "Une erreur est survenue",
|
"error_msg": "Une erreur est survenue",
|
||||||
"processing_error": "Erreur lors du traitement de la requête",
|
"processing_error": "Erreur lors du traitement de la requête",
|
||||||
"table_translation": "Traduction",
|
"table_translation": "Traduction",
|
||||||
"table_description": "Ce tableau présente des données sur"
|
"table_description": "Ce tableau présente des données sur",
|
||||||
|
"ui_language_label": "Langue de l'interface",
|
||||||
|
"ui_language_info": "Changer la langue de l'interface uniquement"
|
||||||
},
|
},
|
||||||
"English": {
|
"English": {
|
||||||
"title": "📚 Intelligent Document Assistant",
|
"title": "📚 Intelligent Document Assistant",
|
||||||
@@ -68,7 +69,9 @@ UI_TRANSLATIONS = {
|
|||||||
"error_msg": "An error occurred",
|
"error_msg": "An error occurred",
|
||||||
"processing_error": "Error processing request",
|
"processing_error": "Error processing request",
|
||||||
"table_translation": "Translation",
|
"table_translation": "Translation",
|
||||||
"table_description": "This table presents data on"
|
"table_description": "This table presents data on",
|
||||||
|
"ui_language_label": "UI Language",
|
||||||
|
"ui_language_info": "Change only the interface language"
|
||||||
},
|
},
|
||||||
"Español": {
|
"Español": {
|
||||||
"title": "📚 Asistente documental inteligente",
|
"title": "📚 Asistente documental inteligente",
|
||||||
@@ -97,7 +100,9 @@ UI_TRANSLATIONS = {
|
|||||||
"error_msg": "Se ha producido un error",
|
"error_msg": "Se ha producido un error",
|
||||||
"processing_error": "Error al procesar la solicitud",
|
"processing_error": "Error al procesar la solicitud",
|
||||||
"table_translation": "Traducción",
|
"table_translation": "Traducción",
|
||||||
"table_description": "Esta tabla presenta datos sobre"
|
"table_description": "Esta tabla presenta datos sobre",
|
||||||
|
"ui_language_label": "Idioma de la interfaz",
|
||||||
|
"ui_language_info": "Cambiar solo el idioma de la interfaz"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user