Add utility modules and configuration settings for chatbot application

This commit is contained in:
sepehr 2025-03-08 18:12:18 +01:00
parent d4518a89dd
commit cb43b1176f
16 changed files with 1237 additions and 0 deletions

31
app.py Normal file
View File

@ -0,0 +1,31 @@
# filepath: f:\Dev\Rag\chat_bot_rag\app.py
import gradio as gr
from services.rag_service import initialize_rag_bot
from components.chatbot import process_query, reset_conversation, change_model, change_collection
from components.ui import build_interface, update_ui_language_elements
def main():
    """Application entry point: initialize the RAG backend, assemble the
    Gradio interface, and launch it on localhost:7860."""
    initialize_rag_bot()

    # Wire the UI to the chatbot callbacks.
    demo = build_interface(
        process_query_fn=process_query,
        reset_conversation_fn=reset_conversation,
        change_model_fn=change_model,
        change_collection_fn=change_collection,
        update_ui_language_fn=update_ui_language_elements,
    )

    # Launch the Gradio app (local only, opens a browser tab).
    demo.launch(
        share=False,
        inbrowser=True,
        server_name="localhost",
        server_port=7860,
    )


if __name__ == "__main__":
    main()

2
components/__init__.py Normal file
View File

@ -0,0 +1,2 @@
from .chatbot import process_query, reset_conversation, change_model, change_collection
from .callbacks import GradioStreamingHandler

12
components/callbacks.py Normal file
View File

@ -0,0 +1,12 @@
import queue
from langchain.callbacks.base import BaseCallbackHandler
# Custom callback handler that captures tokens streamed by the LLM.
class GradioStreamingHandler(BaseCallbackHandler):
    """Collects streamed tokens into a queue (drained by the UI) and an
    accumulated full-text transcript."""

    def __init__(self):
        # Transcript of everything generated so far.
        self.full_text = ""
        # Thread-safe hand-off point between the LLM thread and the UI loop.
        self.tokens_queue = queue.Queue()

    def on_llm_new_token(self, token, **kwargs):
        """LangChain streaming hook: record one newly generated token."""
        self.tokens_queue.put(token)
        self.full_text += token

385
components/chatbot.py Normal file
View File

@ -0,0 +1,385 @@
import traceback
import threading
import queue
from langchain.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama
from rag_chatbot import MultimodalRAGChatbot
from config.settings import QDRANT_URL, QDRANT_COLLECTION_NAME, EMBEDDING_MODEL, OLLAMA_URL, DEFAULT_MODEL
from translations.lang_mappings import LANGUAGE_MAPPING
from utils.image_utils import base64_to_image
from langchain.callbacks.base import BaseCallbackHandler
import re
def clean_llm_response(text):
    """Return *text* with ``<think>…</think>`` reasoning blocks removed and
    leading whitespace stripped (for display of reasoning-model output)."""
    # DOTALL so a thought block spanning several lines is matched too.
    without_thoughts = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
    return without_thoughts.lstrip()
# Streaming callback handler.
# NOTE(review): duplicates components.callbacks.GradioStreamingHandler —
# consider importing the shared class instead of redefining it here.
class GradioStreamingHandler(BaseCallbackHandler):
    """Buffers streamed LLM tokens in a queue and accumulates the full reply."""

    def __init__(self):
        # Queue drained by the UI loop; transcript kept alongside.
        self.tokens_queue = queue.Queue()
        self.full_text = ""

    def on_llm_new_token(self, token, **kwargs):
        """LangChain hook invoked once per generated token."""
        self.tokens_queue.put(token)
        self.full_text += token
# Instantiate the shared RAG chatbot once at import time.
# NOTE(review): this is a module-import side effect that presumably connects
# to Qdrant/Ollama immediately — confirm intended; lazy init may be safer.
rag_bot = MultimodalRAGChatbot(
    qdrant_url=QDRANT_URL,
    qdrant_collection_name=QDRANT_COLLECTION_NAME,
    ollama_model=DEFAULT_MODEL,
    embedding_model=EMBEDDING_MODEL,
    ollama_url=OLLAMA_URL
)
print(f"Chatbot initialisé avec modèle: {DEFAULT_MODEL}")

# Module-level caches holding media extracted for the most recent answer.
current_images = []
current_tables = []
# Fonctions utilitaires
def display_images(images_list=None):
    """Build (image, caption) tuples for a Gradio Gallery.

    Falls back to the module-level ``current_images`` cache when
    *images_list* is None. Returns None when there is nothing to show.
    """
    source = current_images if images_list is None else images_list
    if not source:
        return None
    entries = [
        (item["image"],
         f"{item['caption']} (Source: {item['source']}, Page: {item['page']})")
        for item in source
        if item["image"]
    ]
    return entries or None
def display_tables(tables_list=None, language=None):
    """Render tables as HTML cards (caption, source, description, body).

    Args:
        tables_list: list of table dicts with keys ``data``, ``caption``,
            ``source``, ``page``, ``description``; when None, the
            module-level ``current_tables`` cache is used.
        language: unused; kept for interface compatibility with callers.

    Returns:
        An HTML string, or None when there is nothing to show. Markdown
        pipe-tables in ``data`` are converted to real ``<table>`` markup;
        anything else is wrapped in ``<pre>``.
    """
    tables_to_use = tables_list if tables_list is not None else current_tables
    if not tables_to_use:
        return None
    html = ""
    for idx, table in enumerate(tables_to_use):
        table_data = table['data']
        table_html = ""
        try:
            if isinstance(table_data, str):
                if '|' in table_data:
                    # Looks like a Markdown table: convert rows to <table> markup.
                    rows = table_data.strip().split('\n')
                    table_html = '<div class="table-container"><table>'
                    for i, row in enumerate(rows):
                        # BUG FIX: the Markdown separator row (e.g. "| --- | --- |")
                        # usually contains spaces, which the previous check
                        # (all chars in ':-|') rejected — so the separator leaked
                        # into the rendered output as a data row. Allow spaces.
                        if i == 1 and row.strip() and all(c in ':-| ' for c in row):
                            continue
                        cells = row.split('|')
                        # Drop the empty edge cells produced by leading/trailing pipes.
                        if cells and cells[0].strip() == '':
                            cells = cells[1:]
                        if cells and cells[-1].strip() == '':
                            cells = cells[:-1]
                        if cells:
                            is_header = (i == 0)  # first row holds the column names
                            table_html += '<tr>'
                            for cell in cells:
                                cell_content = cell.strip()
                                if is_header:
                                    table_html += f'<th>{cell_content}</th>'
                                else:
                                    table_html += f'<td>{cell_content}</td>'
                            table_html += '</tr>'
                    table_html += '</table></div>'
                else:
                    # No pipes: show the raw text verbatim.
                    table_html = f'<pre>{table_data}</pre>'
            else:
                table_html = f'<pre>{table_data}</pre>'
        except Exception as e:
            # Best-effort: never let one malformed table break the whole panel.
            print(f"Error formatting table {idx}: {e}")
            table_html = f'<pre>{table_data}</pre>'
        html += f"""
<div style="margin-bottom: 20px; border: 1px solid #ddd; padding: 15px; border-radius: 8px;">
<h3>{table.get('caption', 'Tableau')}</h3>
<p style="color:#666; font-size:0.9em;">Source: {table.get('source', 'N/A')}, Page: {table.get('page', 'N/A')}</p>
<p><strong>Description:</strong> {table.get('description', '')}</p>
{table_html}
</div>
"""
    return html if html else None
# Fonction pour changer de modèle
def change_model(model_name, language="Français"):
    """Rebuild the global RAG bot around *model_name*.

    Returns a user-facing status string (success or error)."""
    global rag_bot
    try:
        rag_bot = MultimodalRAGChatbot(
            qdrant_url=QDRANT_URL,
            qdrant_collection_name=QDRANT_COLLECTION_NAME,
            ollama_model=model_name,
            embedding_model=EMBEDDING_MODEL,
            ollama_url=OLLAMA_URL,
        )
    except Exception as e:
        print(f"Erreur lors du changement de modèle: {e}")
        return f"❌ Erreur: {str(e)}"
    print(f"Modèle changé pour: {model_name}")
    return f"✅ Modèle changé pour: {model_name}"
# Fonction pour changer de collection
def change_collection(collection_name, language="Français"):
    """Point the global RAG bot at another Qdrant collection, keeping the
    currently loaded model. Returns a user-facing status string."""
    global rag_bot
    try:
        rag_bot = MultimodalRAGChatbot(
            qdrant_url=QDRANT_URL,
            qdrant_collection_name=collection_name,
            ollama_model=rag_bot.llm.model,  # reuse the model already in use
            embedding_model=EMBEDDING_MODEL,
            ollama_url=OLLAMA_URL,
        )
    except Exception as e:
        print(f"Erreur lors du changement de collection: {e}")
        return f"❌ Erreur: {str(e)}"
    print(f"Collection changée pour: {collection_name}")
    return f"✅ Collection changée pour: {collection_name}"
# Fonction de traitement de requête
def process_query(message, history, streaming, show_sources, max_images, language):
    """Answer a user question with the RAG bot and stream results to the UI.

    Generator consumed by Gradio: yields tuples of
    (chat_history, source_info_markdown, images_gallery, tables_html).

    Args:
        message: the user's question.
        history: current chat history as a list of (user, assistant) tuples.
        streaming: when True, tokens are streamed into the chat as they arrive.
        show_sources: when True, images/tables from retrieved docs are displayed.
        max_images: cap on the number of images shown.
        language: UI language name, mapped through LANGUAGE_MAPPING for the prompt.
    """
    global current_images, current_tables
    if not message.strip():
        # BUG FIX: this function is a generator, so the previous
        # `return history, "", None, None` produced NO yielded value — Gradio
        # received no update at all for empty input. Yield the current state
        # instead, then stop.
        yield history, "", None, None
        return
    current_images = []
    current_tables = []
    print(f"Traitement du message: {message}")
    print(f"Streaming: {streaming}")
    try:
        if streaming:
            # Streaming path: show the answer token by token.
            history = history + [(message, "")]
            # 1. Retrieve the relevant documents
            docs = rag_bot._retrieve_relevant_documents(message)
            # 2. Build the context and chat-history strings
            context = rag_bot._format_documents(docs)
            history_text = rag_bot._format_chat_history()
            # 3. Prompt template (text kept verbatim — it is runtime behavior)
            prompt_template = ChatPromptTemplate.from_template("""
Tu es un assistant documentaire spécialisé qui utilise toutes les informations disponibles dans le contexte fourni.
TRÈS IMPORTANT: Tu dois répondre EXCLUSIVEMENT en {language}. Ne réponds JAMAIS dans une autre langue.
Instructions spécifiques:
1. Pour chaque image mentionnée dans le contexte, inclue TOUJOURS dans ta réponse:
- La légende/caption exacte de l'image
- La source et le numéro de page
- Une description brève de ce qu'elle montre
2. Pour chaque tableau mentionné dans le contexte, inclue TOUJOURS:
- Le titre/caption exact du tableau
- La source et le numéro de page
- Ce que contient et signifie le tableau
3. Lorsque tu cites des équations mathématiques:
- Utilise la syntaxe LaTeX exacte comme dans le document ($...$ ou $$...$$)
- Reproduis-les fidèlement sans modification
4. IMPORTANT: Ne pas inventer d'informations - si une donnée n'est pas explicitement fournie dans le contexte,
indique clairement que cette information n'est pas disponible dans les documents fournis.
5. Cite précisément les sources pour chaque élément d'information (format: [Source, Page]).
6. CRUCIAL: Ta réponse doit être UNIQUEMENT et INTÉGRALEMENT en {language}, quelle que soit la langue de la question.
Historique de conversation:
{chat_history}
Contexte (à utiliser pour répondre):
{context}
Question: {question}
Réponds de façon structurée et précise en intégrant activement les images, tableaux et équations disponibles dans le contexte.
Ta réponse doit être exclusivement en {language}.
""")
            # 4. Format the messages for the LLM
            messages = prompt_template.format_messages(
                chat_history=history_text,
                context=context,
                question=message,
                language=LANGUAGE_MAPPING.get(language, "français")
            )
            # 5. Custom streaming handler collects tokens in a queue
            handler = GradioStreamingHandler()
            # 6. LLM wired to that handler
            streaming_llm = ChatOllama(
                model=rag_bot.llm.model,
                base_url=rag_bot.llm.base_url,
                streaming=True,
                callbacks=[handler]
            )
            # 7. Generate in a background thread so the UI loop stays responsive
            def generate_response():
                streaming_llm.invoke(messages)
            thread = threading.Thread(target=generate_response)
            thread.start()
            # 8. Drain tokens and push partial answers to the interface
            partial_response = ""
            while thread.is_alive() or not handler.tokens_queue.empty():
                try:
                    token = handler.tokens_queue.get(timeout=0.05)
                    partial_response += token
                    # Clean only the displayed text; raw tokens keep accumulating
                    clean_response = clean_llm_response(partial_response)
                    history[-1] = (message, clean_response)
                    yield history, "", None, None
                except queue.Empty:
                    continue
            # 9. Store the cleaned final answer in the bot's internal history
            partial_response = clean_llm_response(partial_response)
            rag_bot.chat_history.append({"role": "user", "content": message})
            rag_bot.chat_history.append({"role": "assistant", "content": partial_response})
            # 10. Extract sources, images and tables from the retrieved docs
            texts, images, tables = rag_bot._process_documents(docs)
            # Summarize what was found
            source_info = ""
            if texts:
                source_info += f"📚 {len(texts)} textes • "
            if images:
                source_info += f"🖼️ {len(images)} images • "
            if tables:
                source_info += f"📊 {len(tables)} tableaux"
            if source_info:
                source_info = "Sources trouvées: " + source_info
            # 11. Decode and cache the images (capped at max_images)
            if show_sources and images:
                images = images[:max_images]
                for img in images:
                    img_data = img.get("image_data")
                    if img_data:
                        image = base64_to_image(img_data)
                        if image:
                            current_images.append({
                                "image": image,
                                "caption": img.get("caption", ""),
                                "source": img.get("source", ""),
                                "page": img.get("page", ""),
                                "description": img.get("description", "")
                            })
            # 12. Cache the tables
            if show_sources and tables:
                for table in tables:
                    current_tables.append({
                        "data": rag_bot.format_table(table.get("table_data", "")),
                        "caption": table.get("caption", ""),
                        "source": table.get("source", ""),
                        "page": table.get("page", ""),
                        "description": table.get("description", "")
                    })
            # 13. Final yield with media panels populated
            images_display = display_images()
            tables_display = display_tables()
            yield history, source_info, images_display, tables_display
        else:
            # Non-streaming path: one blocking call for the full answer.
            print("Mode non-streaming activé")
            source_info = ""
            result = rag_bot.chat(message, stream=False)
            # Strip <think> blocks before displaying or storing the reply
            result["response"] = clean_llm_response(result["response"])
            history = history + [(message, result["response"])]
            # Mirror the exchange into the bot's internal history
            rag_bot.chat_history.append({"role": "user", "content": message})
            rag_bot.chat_history.append({"role": "assistant", "content": result["response"]})
            # Summarize the retrieved sources
            if "texts" in result:
                source_info += f"📚 {len(result['texts'])} textes • "
            if "images" in result:
                source_info += f"🖼️ {len(result['images'])} images • "
            if "tables" in result:
                source_info += f"📊 {len(result['tables'])} tableaux"
            if source_info:
                source_info = "Sources trouvées: " + source_info
            # Decode and cache images / tables for the side panels
            if show_sources and "images" in result and result["images"]:
                images = result["images"][:max_images]
                for img in images:
                    img_data = img.get("image_data")
                    if img_data:
                        image = base64_to_image(img_data)
                        if image:
                            current_images.append({
                                "image": image,
                                "caption": img.get("caption", ""),
                                "source": img.get("source", ""),
                                "page": img.get("page", ""),
                                "description": img.get("description", "")
                            })
            if show_sources and "tables" in result and result["tables"]:
                tables = result["tables"]
                for table in tables:
                    current_tables.append({
                        "data": rag_bot.format_table(table.get("table_data", "")),
                        "caption": table.get("caption", ""),
                        "source": table.get("source", ""),
                        "page": table.get("page", ""),
                        "description": table.get("description", "")
                    })
            yield history, source_info, display_images(), display_tables()
    except Exception as e:
        # Surface the error in the chat instead of crashing the UI loop.
        error_msg = f"Une erreur est survenue: {str(e)}"
        traceback_text = traceback.format_exc()
        print(error_msg)
        print(traceback_text)
        history = history + [(message, error_msg)]
        yield history, "Erreur lors du traitement de la requête", None, None
# Fonction pour réinitialiser la conversation
def reset_conversation():
    """Clear the cached media and the bot's internal history.

    Returns the empty UI state (chat, sources text, gallery, tables)."""
    global current_images, current_tables
    current_images, current_tables = [], []
    rag_bot.clear_history()
    return [], "", None, None

198
components/ui.py Normal file
View File

@ -0,0 +1,198 @@
import gradio as gr
from config.settings import DEFAULT_MODEL, QDRANT_COLLECTION_NAME, AVAILABLE_MODELS
from translations.lang_mappings import UI_TRANSLATIONS, UI_SUPPORTED_LANGUAGES
from utils.katex_script import KATEX_CSS_JS
def update_ui_language_elements(language):
    """Update UI labels for the selected *language*.

    Placeholder — not yet implemented; currently a no-op returning None."""
    return None
def build_interface(
    process_query_fn,
    reset_conversation_fn,
    change_model_fn,
    change_collection_fn,
    update_ui_language_fn
):
    """Build the Gradio Blocks UI for the document assistant.

    Args:
        process_query_fn: generator called on submit with
            (msg, chat, streaming, show_sources, max_images, language).
        reset_conversation_fn: callback clearing chat, sources and media panels.
        change_model_fn: callback invoked when the model dropdown changes.
        change_collection_fn: callback applying a new Qdrant collection name.
        update_ui_language_fn: language-switch callback.
            NOTE(review): accepted but never wired to any event below — confirm
            whether the language selector should trigger it.

    Returns:
        The assembled gr.Blocks interface (caller is expected to launch it).
    """
    with gr.Blocks(css=KATEX_CSS_JS, theme=gr.themes.Soft(primary_hue="blue")) as interface:
        gr.Markdown("# 📚 Assistant documentaire intelligent")
        with gr.Row():
            with gr.Column(scale=2):
                # Main chat panel
                chat_interface = gr.Chatbot(
                    height=600,
                    show_label=False,
                    layout="bubble",
                    elem_id="chatbot"
                )
                with gr.Row():
                    msg = gr.Textbox(
                        show_label=False,
                        placeholder="Posez votre question...",
                        container=False,
                        scale=4
                    )
                    submit_btn = gr.Button("Envoyer", variant="primary", scale=1)
                clear_btn = gr.Button("Effacer la conversation")
                source_info = gr.Markdown("", elem_id="sources_info")
            with gr.Column(scale=1):
                with gr.Accordion("Options", open=True):
                    # Model selector
                    model_selector = gr.Dropdown(
                        choices=AVAILABLE_MODELS,
                        value=DEFAULT_MODEL,
                        label="Modèle Ollama",
                        info="Choisir le modèle de language à utiliser"
                    )
                    model_status = gr.Markdown(f"Modèle actuel: **{DEFAULT_MODEL}**")
                    # Answer-language selector
                    language_selector = gr.Dropdown(
                        choices=UI_SUPPORTED_LANGUAGES,
                        value=UI_SUPPORTED_LANGUAGES[0],
                        label="Langue des réponses",
                        info="Choisir la langue dans laquelle l'assistant répondra"
                    )
                    # Qdrant collection selector
                    collection_name_input = gr.Textbox(
                        value=QDRANT_COLLECTION_NAME,
                        label="Collection Qdrant",
                        info="Nom de la collection de documents à utiliser"
                    )
                    collection_status = gr.Markdown(f"Collection actuelle: **{QDRANT_COLLECTION_NAME}**")
                    # Button applying the collection change
                    apply_collection_btn = gr.Button("Appliquer la collection")
                    streaming = gr.Checkbox(
                        label="Mode streaming",
                        value=True,
                        info="Voir les réponses s'afficher progressivement"
                    )
                    show_sources = gr.Checkbox(label="Afficher les sources", value=True)
                    max_images = gr.Slider(
                        minimum=1,
                        maximum=10,
                        value=3,
                        step=1,
                        label="Nombre max d'images"
                    )
                gr.Markdown("---")
                gr.Markdown("### 🖼️ Images pertinentes")
                image_gallery = gr.Gallery(
                    label="Images pertinentes",
                    show_label=False,
                    columns=2,
                    height=300,
                    object_fit="contain"
                )
                gr.Markdown("### 📊 Tableaux")
                tables_display = gr.HTML()
        # Wire the model change
        model_selector.change(
            fn=change_model_fn,
            inputs=model_selector,
            outputs=model_status
        )
        # Wire the collection change
        apply_collection_btn.click(
            fn=change_collection_fn,
            inputs=collection_name_input,
            outputs=collection_status
        )
        # Helper clearing the input box after a message is sent
        def clear_input():
            return ""
        # Main submit actions (Enter key and button share the same pipeline)
        msg.submit(
            process_query_fn,
            inputs=[msg, chat_interface, streaming, show_sources, max_images, language_selector],
            outputs=[chat_interface, source_info, image_gallery, tables_display]
        ).then(clear_input, None, msg)
        submit_btn.click(
            process_query_fn,
            inputs=[msg, chat_interface, streaming, show_sources, max_images, language_selector],
            outputs=[chat_interface, source_info, image_gallery, tables_display]
        ).then(clear_input, None, msg)
        clear_btn.click(
            reset_conversation_fn,
            outputs=[chat_interface, source_info, image_gallery, tables_display]
        )
        # KaTeX styling + math auto-rendering injected as raw markup
        gr.Markdown("""
<style>
.gradio-container {max-width: 1200px !important}
#chatbot {height: 600px; overflow-y: auto;}
#sources_info {margin-top: 10px; color: #666;}
/* Improved styles for equations */
.katex { font-size: 1.1em !important; }
.math-inline { background: #f8f9fa; padding: 2px 5px; border-radius: 4px; }
.math-display { background: #f8f9f9; margin: 10px 0; padding: 10px; border-radius: 5px; overflow-x: auto; text-align: center; }
/* Table styles */
table {
border-collapse: collapse;
width: 100%;
margin: 15px 0;
font-size: 0.9em;
}
table, th, td {
border: 1px solid #ddd;
}
th, td {
padding: 8px 12px;
text-align: left;
}
th {
background-color: #f2f2f2;
}
tr:nth-child(even) {
background-color: #f9f9f9;
}
.table-container {
overflow-x: auto;
margin-top: 10px;
}
</style>
<!-- Loading KaTeX -->
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.8/dist/katex.min.css">
<script src="https://cdn.jsdelivr.net/npm/katex@0.16.8/dist/katex.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/katex@0.16.8/dist/contrib/auto-render.min.js"></script>
<script>
// Script pour rendre les équations mathématiques avec KaTeX
document.addEventListener('DOMContentLoaded', function() {
setTimeout(function() {
if (window.renderMathInElement) {
renderMathInElement(document.body, {
delimiters: [
{left: '$$', right: '$$', display: true},
{left: '$', right: '$', display: false}
],
throwOnError: false
});
}
}, 1000);
});
</script>
""")
    return interface

18
config/settings.py Normal file
View File

@ -0,0 +1,18 @@
import gradio as gr

# Configuration settings for the application

# URLs and connection settings
QDRANT_URL = "http://localhost:6333"             # Qdrant vector-store endpoint
QDRANT_COLLECTION_NAME = "my_custom_collection"  # default document collection
OLLAMA_URL = "http://127.0.0.1:11434"            # local Ollama server

# Model settings
EMBEDDING_MODEL = "mxbai-embed-large"  # embedding model used for retrieval
DEFAULT_MODEL = "llama3.2"             # chat model selected at startup

# Available models (choices offered in the UI dropdown)
AVAILABLE_MODELS = ["llama3.1", "llama3.2", "deepseek-r1:7b", "deepseek-r1:14b"]

# Default theme
DEFAULT_THEME = gr.themes.Soft(primary_hue="blue")

50
config/translations.py Normal file
View File

@ -0,0 +1,50 @@
# Language-name mapping handed to the LLM prompt so it knows which language
# to answer in (keys are the UI display names).
LANGUAGE_MAPPING = {
    "Français": "français",
    "English": "English",
    "Español": "español",
    "Deutsch": "Deutsch",
    "Italiano": "italiano",
    "中文": "Chinese",
    "日本語": "Japanese",
    "العربية": "Arabic",
    "فارسی": "Persian"
}

# UI label translations, keyed by display language.
UI_TRANSLATIONS = {
    "Français": {
        "title": "📚 Assistant documentaire intelligent",
        "placeholder": "Posez votre question...",
        "send_btn": "Envoyer",
        "clear_btn": "Effacer la conversation",
        "model_selector": "Modèle Ollama",
        # TYPO FIX: "de language" -> "de langage" (correct French)
        "model_info": "Choisir le modèle de langage à utiliser",
        "model_current": "Modèle actuel",
        "language_selector": "Langue des réponses",
        "language_info": "Choisir la langue dans laquelle l'assistant répondra",
        "collection_input": "Collection Qdrant",
        "collection_info": "Nom de la collection de documents à utiliser",
        "collection_current": "Collection actuelle",
        "apply_btn": "Appliquer la collection",
        "streaming_label": "Mode streaming",
        "streaming_info": "Voir les réponses s'afficher progressivement",
        "sources_label": "Afficher les sources",
        "max_images_label": "Nombre max d'images",
        "images_title": "🖼️ Images pertinentes",
        "tables_title": "📊 Tableaux",
        "sources_found": "Sources trouvées",
        "texts": "textes",
        "images": "images",
        "tables": "tableaux",
        "error_msg": "Une erreur est survenue"
    },
    "English": {
        "title": "📚 Intelligent Document Assistant",
        "placeholder": "Ask your question...",
        "send_btn": "Send",
        "clear_btn": "Clear conversation",
        # TODO: add the remaining English translations here
    }
    # TODO: add other languages as needed
}

142
test_mistral.ipynb Normal file
View File

@ -0,0 +1,142 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"ename": "SDKError",
"evalue": "API error occurred: Status 401\n{\n \"message\":\"Unauthorized\",\n \"request_id\":\"11390a73fd79bc1a934c5858569caa3a\"\n}",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mSDKError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[11]\u001b[39m\u001b[32m, line 8\u001b[39m\n\u001b[32m 4\u001b[39m api_key =\u001b[33m\"\u001b[39m\u001b[33mxmM3IG80Y97Hg8kJVUPy1ijyIhmS2H9j\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 6\u001b[39m client = Mistral(api_key=api_key)\n\u001b[32m----> \u001b[39m\u001b[32m8\u001b[39m uploaded_pdf = \u001b[43mclient\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfiles\u001b[49m\u001b[43m.\u001b[49m\u001b[43mupload\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 9\u001b[39m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m=\u001b[49m\u001b[43m{\u001b[49m\n\u001b[32m 10\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfile_name\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m11_chapitre3.pdf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 11\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcontent\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[33;43mr\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mF:\u001b[39;49m\u001b[33;43m\\\u001b[39;49m\u001b[33;43mDev\u001b[39;49m\u001b[33;43m\\\u001b[39;49m\u001b[33;43mRag\u001b[39;49m\u001b[33;43m\\\u001b[39;49m\u001b[33;43mRag_Modeling\u001b[39;49m\u001b[33;43m\\\u001b[39;49m\u001b[33;43mdocument\u001b[39;49m\u001b[33;43m\\\u001b[39;49m\u001b[33;43m11_chapitre3.pdf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrb\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 12\u001b[39m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 13\u001b[39m \u001b[43m 
\u001b[49m\u001b[43mpurpose\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mocr\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\n\u001b[32m 14\u001b[39m \u001b[43m)\u001b[49m \n\u001b[32m 15\u001b[39m client.files.retrieve(file_id=uploaded_pdf.id) \n",
"\u001b[36mFile \u001b[39m\u001b[32mf:\\Dev\\Rag\\chat_bot_rag\\.venv\\Lib\\site-packages\\mistralai\\files.py:101\u001b[39m, in \u001b[36mFiles.upload\u001b[39m\u001b[34m(self, file, purpose, retries, server_url, timeout_ms, http_headers)\u001b[39m\n\u001b[32m 99\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m utils.match_response(http_res, \u001b[33m\"\u001b[39m\u001b[33m4XX\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m*\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m 100\u001b[39m http_res_text = utils.stream_to_text(http_res)\n\u001b[32m--> \u001b[39m\u001b[32m101\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m models.SDKError(\n\u001b[32m 102\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mAPI error occurred\u001b[39m\u001b[33m\"\u001b[39m, http_res.status_code, http_res_text, http_res\n\u001b[32m 103\u001b[39m )\n\u001b[32m 104\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m utils.match_response(http_res, \u001b[33m\"\u001b[39m\u001b[33m5XX\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m*\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m 105\u001b[39m http_res_text = utils.stream_to_text(http_res)\n",
"\u001b[31mSDKError\u001b[39m: API error occurred: Status 401\n{\n \"message\":\"Unauthorized\",\n \"request_id\":\"11390a73fd79bc1a934c5858569caa3a\"\n}"
]
}
],
"source": [
"from mistralai import Mistral\n",
"import os\n",
"\n",
"api_key = os.environ[\"MISTRAL_API_KEY\"]  # never hard-code secrets; the previously committed key must be rotated\n",
"\n",
"client = Mistral(api_key=api_key)\n",
"\n",
"uploaded_pdf = client.files.upload(\n",
" file={\n",
" \"file_name\": \"11_chapitre3.pdf\",\n",
" \"content\": open(r\"F:\\Dev\\Rag\\Rag_Modeling\\document\\11_chapitre3.pdf\", \"rb\"),\n",
" },\n",
" purpose=\"ocr\"\n",
") \n",
"client.files.retrieve(file_id=uploaded_pdf.id) \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"ename": "SDKError",
"evalue": "API error occurred: Status 401\n{\n \"message\":\"Unauthorized\",\n \"request_id\":\"bf40e3105e1f257ec16fc233e4d0396b\"\n}",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mSDKError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[10]\u001b[39m\u001b[32m, line 5\u001b[39m\n\u001b[32m 1\u001b[39m model = \u001b[33m\"\u001b[39m\u001b[33mmistral-large-latest\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 3\u001b[39m client = Mistral(api_key=api_key)\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m chat_response = \u001b[43mclient\u001b[49m\u001b[43m.\u001b[49m\u001b[43mchat\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcomplete\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 6\u001b[39m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 7\u001b[39m \u001b[43m \u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\n\u001b[32m 8\u001b[39m \u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[32m 9\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrole\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43muser\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 10\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcontent\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mWhat is the best French cheese?\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 11\u001b[39m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 12\u001b[39m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\n\u001b[32m 13\u001b[39m \u001b[43m)\u001b[49m\n\u001b[32m 14\u001b[39m \u001b[38;5;28mprint\u001b[39m(chat_response.choices[\u001b[32m0\u001b[39m].message.content)\n",
"\u001b[36mFile \u001b[39m\u001b[32mf:\\Dev\\Rag\\chat_bot_rag\\.venv\\Lib\\site-packages\\mistralai\\chat.py:235\u001b[39m, in \u001b[36mChat.complete\u001b[39m\u001b[34m(self, model, messages, temperature, top_p, max_tokens, stream, stop, random_seed, response_format, tools, tool_choice, presence_penalty, frequency_penalty, n, prediction, safe_prompt, retries, server_url, timeout_ms, http_headers)\u001b[39m\n\u001b[32m 233\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m utils.match_response(http_res, \u001b[33m\"\u001b[39m\u001b[33m4XX\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m*\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m 234\u001b[39m http_res_text = utils.stream_to_text(http_res)\n\u001b[32m--> \u001b[39m\u001b[32m235\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m models.SDKError(\n\u001b[32m 236\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mAPI error occurred\u001b[39m\u001b[33m\"\u001b[39m, http_res.status_code, http_res_text, http_res\n\u001b[32m 237\u001b[39m )\n\u001b[32m 238\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m utils.match_response(http_res, \u001b[33m\"\u001b[39m\u001b[33m5XX\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m*\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m 239\u001b[39m http_res_text = utils.stream_to_text(http_res)\n",
"\u001b[31mSDKError\u001b[39m: API error occurred: Status 401\n{\n \"message\":\"Unauthorized\",\n \"request_id\":\"bf40e3105e1f257ec16fc233e4d0396b\"\n}"
]
}
],
"source": [
"model = \"mistral-large-latest\"\n",
"\n",
"client = Mistral(api_key=api_key)\n",
"\n",
"chat_response = client.chat.complete(\n",
" model= model,\n",
" messages = [\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"What is the best French cheese?\",\n",
" },\n",
" ]\n",
")\n",
"print(chat_response.choices[0].message.content)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<_io.BufferedReader name='F:\\\\Dev\\\\Rag\\\\Rag_Modeling\\\\document\\\\11_chapitre3.pdf'>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"open(r\"F:\\Dev\\Rag\\Rag_Modeling\\document\\11_chapitre3.pdf\", \"rb\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MISTRAL_API_KEY: None\n"
]
}
],
"source": [
"import os\n",
"print(\"MISTRAL_API_KEY:\", repr(os.environ.get(\"MISTRAL_API_KEY\")))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -0,0 +1,105 @@
# Language-name mapping handed to the LLM prompt so it knows which language
# to answer in (keys are the UI display names).
LANGUAGE_MAPPING = {
    "Français": "français",
    "English": "English",
    "Español": "español",
    "Deutsch": "Deutsch",
    "Italiano": "italiano",
    "中文": "Chinese",
    "日本語": "Japanese",
    "العربية": "Arabic"
}

# UI label translations, keyed by display language.
UI_TRANSLATIONS = {
    "Français": {
        "title": "📚 Assistant documentaire intelligent",
        "placeholder": "Posez votre question...",
        "send_btn": "Envoyer",
        "clear_btn": "Effacer la conversation",
        "model_selector": "Modèle Ollama",
        # TYPO FIX: "de language" -> "de langage" (correct French)
        "model_info": "Choisir le modèle de langage à utiliser",
        "model_current": "Modèle actuel",
        "language_selector": "Langue des réponses",
        "language_info": "Choisir la langue dans laquelle l'assistant répondra",
        "collection_input": "Collection Qdrant",
        "collection_info": "Nom de la collection de documents à utiliser",
        "collection_current": "Collection actuelle",
        "apply_btn": "Appliquer la collection",
        "streaming_label": "Mode streaming",
        "streaming_info": "Voir les réponses s'afficher progressivement",
        "sources_label": "Afficher les sources",
        "max_images_label": "Nombre max d'images",
        "images_title": "🖼️ Images pertinentes",
        "tables_title": "📊 Tableaux",
        "sources_found": "Sources trouvées",
        "texts": "textes",
        "images": "images",
        "tables": "tableaux",
        "error_msg": "Une erreur est survenue",
        "processing_error": "Erreur lors du traitement de la requête",
        "table_translation": "Traduction",
        "table_description": "Ce tableau présente des données sur"
    },
    "English": {
        "title": "📚 Intelligent Document Assistant",
        "placeholder": "Ask your question...",
        "send_btn": "Send",
        "clear_btn": "Clear conversation",
        "model_selector": "Ollama Model",
        "model_info": "Choose the language model to use",
        "model_current": "Current model",
        "language_selector": "Response language",
        "language_info": "Choose the language in which the assistant will respond",
        "collection_input": "Qdrant Collection",
        "collection_info": "Name of the document collection to use",
        "collection_current": "Current collection",
        "apply_btn": "Apply collection",
        "streaming_label": "Streaming mode",
        "streaming_info": "See responses appear progressively",
        "sources_label": "Show sources",
        "max_images_label": "Maximum number of images",
        "images_title": "🖼️ Relevant images",
        "tables_title": "📊 Tables",
        "sources_found": "Sources found",
        "texts": "texts",
        "images": "images",
        "tables": "tables",
        "error_msg": "An error occurred",
        "processing_error": "Error processing request",
        "table_translation": "Translation",
        "table_description": "This table presents data on"
    },
    "Español": {
        "title": "📚 Asistente documental inteligente",
        "placeholder": "Haz tu pregunta...",
        "send_btn": "Enviar",
        "clear_btn": "Borrar conversación",
        "model_selector": "Modelo Ollama",
        "model_info": "Elegir el modelo de lenguaje a utilizar",
        "model_current": "Modelo actual",
        "language_selector": "Idioma de respuesta",
        "language_info": "Elegir el idioma en el que responderá el asistente",
        "collection_input": "Colección Qdrant",
        "collection_info": "Nombre de la colección de documentos a utilizar",
        "collection_current": "Colección actual",
        "apply_btn": "Aplicar colección",
        "streaming_label": "Modo streaming",
        "streaming_info": "Ver las respuestas aparecer progresivamente",
        "sources_label": "Mostrar fuentes",
        "max_images_label": "Número máximo de imágenes",
        "images_title": "🖼️ Imágenes relevantes",
        "tables_title": "📊 Tablas",
        "sources_found": "Fuentes encontradas",
        "texts": "textos",
        "images": "imágenes",
        "tables": "tablas",
        "error_msg": "Se ha producido un error",
        "processing_error": "Error al procesar la solicitud",
        "table_translation": "Traducción",
        "table_description": "Esta tabla presenta datos sobre"
    }
}

# Languages the interface itself supports (order defines the dropdown default).
UI_SUPPORTED_LANGUAGES = ["Français", "English", "Español"]

1
utils/__init__.py Normal file
View File

@ -0,0 +1 @@
# Package initialization for display utilities

15
utils/conversion.py Normal file
View File

@ -0,0 +1,15 @@
import base64
from io import BytesIO
from PIL import Image
def base64_to_image(base64_data):
    """Decode a base64-encoded image into a PIL Image for direct display.

    Returns None for empty input or on any decoding/parsing failure
    (the error is printed, never raised to the caller).
    """
    if not base64_data:
        return None
    try:
        raw = base64.b64decode(base64_data)
        return Image.open(BytesIO(raw))
    except Exception as e:
        print(f"Erreur lors de la conversion d'image: {e}")
        return None

40
utils/display.py Normal file
View File

@ -0,0 +1,40 @@
from PIL import Image
from io import BytesIO
import base64
def base64_to_image(base64_data):
    """Convert base64 image data to a PIL Image.

    Empty input or any decoding error yields None instead of raising.
    """
    if not base64_data:
        return None
    try:
        return Image.open(BytesIO(base64.b64decode(base64_data)))
    except Exception as e:
        print(f"Image conversion error: {e}")
        return None
def display_images(current_images):
    """Format images for Gradio gallery display.

    Args:
        current_images: list of dicts holding an "image" (PIL Image) plus
            optional "caption", "source" and "page" metadata.

    Returns:
        A list of (image, caption) tuples for gr.Gallery, or None when
        there is nothing to display.
    """
    if not current_images:
        return None
    gallery = [
        (
            img["image"],
            # .get() for metadata: a partially-populated dict must not
            # crash the gallery with a KeyError.
            f"{img.get('caption', '')} (Source: {img.get('source', 'N/A')}, "
            f"Page: {img.get('page', 'N/A')})",
        )
        for img in current_images
        if img.get("image")
    ]
    # Mirror display_tables: report "nothing to show" as None, not [].
    return gallery if gallery else None
def display_tables(current_tables):
    """Format tables as an HTML string for display.

    Args:
        current_tables: list of dicts with optional "caption", "source",
            "page" and "data" (pre-rendered HTML table) entries.

    Returns:
        One styled <div> fragment per table, concatenated, or None when
        there are no tables.
    """
    if not current_tables:
        return None
    html = ""
    for table in current_tables:
        # .get() everywhere (matching the table_utils sibling): a
        # partially-populated table dict must not crash the UI.
        html += f"""
        <div style="margin-bottom: 20px; border: 1px solid #ddd; padding: 15px; border-radius: 8px;">
            <h3>{table.get('caption', '')}</h3>
            <p style="color:#666; font-size:0.9em;">Source: {table.get('source', 'N/A')}, Page: {table.get('page', 'N/A')}</p>
            <div class="table-container">{table.get('data', '')}</div>
        </div>
        """
    return html if html else None

29
utils/image_utils.py Normal file
View File

@ -0,0 +1,29 @@
from io import BytesIO
from PIL import Image
import base64
def base64_to_image(base64_data):
    """Decode a base64 string into a PIL Image ready for direct display.

    Returns None when the input is empty or cannot be decoded; failures
    are printed, never propagated.
    """
    try:
        if not base64_data:
            return None
        decoded = base64.b64decode(base64_data)
        buffer = BytesIO(decoded)
        return Image.open(buffer)
    except Exception as e:
        print(f"Erreur lors de la conversion d'image: {e}")
        return None
def display_images(current_images):
    """Prepare the retrieved images for the Gradio gallery.

    Returns a list of (image, caption) tuples, or None when there is
    nothing to show.
    """
    if not current_images:
        return None
    gallery = [
        (
            entry["image"],
            f"{entry['caption']} (Source: {entry['source']}, Page: {entry['page']})",
        )
        for entry in current_images
        if entry["image"]
    ]
    return gallery if gallery else None

190
utils/katex_script.py Normal file
View File

@ -0,0 +1,190 @@
# KaTeX assets plus client-side rendering glue injected into the Gradio page.
# The string below is raw CSS/JS delivered to the browser verbatim; it is
# runtime content and must not be edited for style (any byte change alters
# what the page loads and executes).
#
# NOTE(review): .math-display uses background #f8f9f9 while .math-inline uses
# #f8f9fa — likely a typo in one of them; confirm the intended colour.
# NOTE(review): in prepareLatexInText(), the patterns containing
# "[^<]protect[^>]*" do not match the <protect>...</protect> tags created two
# lines earlier — verify the dollar-spacing fixes actually skip protected
# math regions as intended.
# NOTE(review): in prepareTextForLatex() the JS comment says "odd-indexed
# parts (non-code)" but the code processes even-indexed parts
# (i % 2 === 0); for split(/<pre>|<\/pre>/) the outside-<pre> chunks land at
# even indices, so the code looks right and the comment wrong — confirm.
KATEX_CSS_JS: str = """
<style>
.gradio-container {max-width: 1200px !important}
#chatbot {height: 600px; overflow-y: auto;}
#sources_info {margin-top: 10px; color: #666;}
/* Improved styles for equations */
.katex { font-size: 1.1em !important; }
.math-inline { background: #f8f9fa; padding: 2px 5px; border-radius: 4px; }
.math-display { background: #f8f9f9; margin: 10px 0; padding: 10px; border-radius: 5px; overflow-x: auto; text-align: center; }
/* Table styles */
table {
border-collapse: collapse;
width: 100%;
margin: 15px 0;
font-size: 0.9em;
}
table, th, td {
border: 1px solid #ddd;
}
th, td {
padding: 8px 12px;
text-align: left;
}
th {
background-color: #f2f2f2;
}
tr:nth-child(even) {
background-color: #f9f9f9;
}
.table-container {
overflow-x: auto;
margin-top: 10px;
}
</style>
<!-- Loading KaTeX -->
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.8/dist/katex.min.css">
<script src="https://cdn.jsdelivr.net/npm/katex@0.16.8/dist/katex.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/katex@0.16.8/dist/contrib/auto-render.min.js"></script>
<script>
// Function to process math equations with KaTeX
function renderMathInElement(element) {
if (!window.renderMathInElement) return;
try {
window.renderMathInElement(element, {
delimiters: [
{left: '$$', right: '$$', display: true},
{left: '$', right: '$', display: false},
{left: '\\\\(', right: '\\\\)', display: false},
{left: '\\\\[', right: '\\\\]', display: true}
],
throwOnError: false,
trust: true,
strict: false,
macros: {
"\\\\R": "\\\\mathbb{R}",
"\\\\N": "\\\\mathbb{N}"
}
});
} catch (e) {
console.error("KaTeX rendering error:", e);
}
}
// Function to fix and prepare text for LaTeX rendering
function prepareTextForLatex(text) {
if (!text) return text;
// Don't modify code blocks
if (text.indexOf('<pre>') !== -1) {
const parts = text.split(/<pre>|<\/pre>/);
for (let i = 0; i < parts.length; i++) {
// Only process odd-indexed parts (non-code)
if (i % 2 === 0) {
parts[i] = prepareLatexInText(parts[i]);
}
}
return parts.join('');
}
return prepareLatexInText(text);
}
// Helper to process LaTeX in regular text
function prepareLatexInText(text) {
// Make sure dollar signs used for math have proper spacing
// First, protect existing well-formed math expressions
text = text.replace(/(\\$\\$[^\\$]+\\$\\$)/g, '<protect>$1</protect>'); // protect display math
text = text.replace(/(\\$[^\\$\\n]+\\$)/g, '<protect>$1</protect>'); // protect inline math
// Fix common LaTeX formatting issues outside protected regions
text = text.replace(/([^<]protect[^>]*)(\\$)([^\\s])/g, '$1$2 $3'); // Add space after $ if needed
text = text.replace(/([^\\s])(\\$)([^<]protect[^>]*)/g, '$1 $2$3'); // Add space before $ if needed
// Handle subscripts: transform x_1 into x_{1} for better LaTeX compatibility
text = text.replace(/([a-zA-Z])_([0-9a-zA-Z])/g, '$1_{$2}');
// Restore protected content
text = text.replace(/<protect>(.*?)<\/protect>/g, '$1');
return text;
}
// Enhanced message processor for KaTeX rendering
function processMessage(message) {
if (!message) return;
try {
// Get direct textual content when possible
const elements = message.querySelectorAll('p, li, h1, h2, h3, h4, h5, span');
elements.forEach(el => {
const originalText = el.innerHTML;
const preparedText = prepareTextForLatex(originalText);
// Only update if changes were made
if (preparedText !== originalText) {
el.innerHTML = preparedText;
}
// Render equations in this element
renderMathInElement(el);
});
// Also try to render on the entire message as fallback
renderMathInElement(message);
} catch (e) {
console.error("Error processing message for LaTeX:", e);
}
}
// Function to monitor for new messages
function setupMathObserver() {
const chatElement = document.getElementById('chatbot');
if (!chatElement) {
setTimeout(setupMathObserver, 500);
return;
}
// Process any existing messages
chatElement.querySelectorAll('.message').forEach(processMessage);
// Set up observer for new content
const observer = new MutationObserver((mutations) => {
for (const mutation of mutations) {
if (mutation.addedNodes.length > 0 || mutation.type === 'characterData') {
chatElement.querySelectorAll('.message').forEach(processMessage);
break;
}
}
});
observer.observe(chatElement, {
childList: true,
subtree: true,
characterData: true
});
console.log("LaTeX rendering observer set up successfully");
}
// Initialize once the document is fully loaded
function initializeRendering() {
if (window.renderMathInElement) {
setupMathObserver();
} else {
// If KaTeX isn't loaded yet, wait for it
const katexScript = document.querySelector('script[src*="auto-render.min.js"]');
if (katexScript) {
katexScript.onload = setupMathObserver;
} else {
// Last resort: try again later
setTimeout(initializeRendering, 500);
}
}
}
// Set up multiple trigger points to ensure it loads
document.addEventListener('DOMContentLoaded', function() {
setTimeout(initializeRendering, 800);
});
window.addEventListener('load', function() {
setTimeout(initializeRendering, 1200);
});
</script>
"""

19
utils/table_utils.py Normal file
View File

@ -0,0 +1,19 @@
from translations.lang_mappings import UI_TRANSLATIONS
def display_tables(current_tables, language=None):
    """Render the retrieved tables as one HTML string.

    The *language* parameter is accepted for interface compatibility but
    intentionally ignored (simplified version). Returns None when there
    are no tables.
    """
    if not current_tables:
        return None
    fragments = []
    for table in current_tables:
        fragments.append(f"""
        <div style="margin-bottom: 20px; border: 1px solid #ddd; padding: 15px; border-radius: 8px;">
            <h3>{table.get('caption', 'Tableau')}</h3>
            <p style="color:#666; font-size:0.9em;">Source: {table.get('source', 'N/A')}, Page: {table.get('page', 'N/A')}</p>
            <pre>{table.get('data', '')}</pre>
        </div>
        """)
    html = "".join(fragments)
    return html if html else None