From cb43b1176f2504c225e950178e5cd3f561b4aeaa Mon Sep 17 00:00:00 2001
From: sepehr
Date: Sat, 8 Mar 2025 18:12:18 +0100
Subject: [PATCH] Add utility modules and configuration settings for chatbot
 application

---
 __pycache__/rag_chatbot.cpython-311.pyc |  Bin 11944 -> 11944 bytes
 app.py                                  |   31 ++
 components/__init__.py                  |    2 +
 components/callbacks.py                 |   12 +
 components/chatbot.py                   |  385 ++++++++++++++++++++++++
 components/ui.py                        |  198 ++++++++++++
 config/settings.py                      |   18 ++
 config/translations.py                  |   50 +++
 test_mistral.ipynb                      |  142 +++++++++
 translations/lang_mappings.py           |  105 +++++++
 utils/__init__.py                       |    1 +
 utils/conversion.py                     |   15 +
 utils/display.py                        |   40 +++
 utils/image_utils.py                    |   29 ++
 utils/katex_script.py                   |  190 ++++++++++++
 utils/table_utils.py                    |   19 ++
 16 files changed, 1237 insertions(+)
 create mode 100644 app.py
 create mode 100644 components/__init__.py
 create mode 100644 components/callbacks.py
 create mode 100644 components/chatbot.py
 create mode 100644 components/ui.py
 create mode 100644 config/settings.py
 create mode 100644 config/translations.py
 create mode 100644 test_mistral.ipynb
 create mode 100644 translations/lang_mappings.py
 create mode 100644 utils/__init__.py
 create mode 100644 utils/conversion.py
 create mode 100644 utils/display.py
 create mode 100644 utils/image_utils.py
 create mode 100644 utils/katex_script.py
 create mode 100644 utils/table_utils.py

diff --git a/__pycache__/rag_chatbot.cpython-311.pyc b/__pycache__/rag_chatbot.cpython-311.pyc
index d6f59ab4cdd3a6caed6cd3951d3a78a6f8bf54e4..dda769601fa404b9c2010f40798614cc839ae1ed 100644
GIT binary patch
delta 20
acmZ1xyCRl*IWI340}#B...)

+def clean_llm_response(text):
+    # Remove <think>...</think> reasoning blocks (emitted e.g. by the deepseek-r1 models)
+    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
+    # Strip extra whitespace at the start of the response
+    text = text.lstrip()
+    return text
+
+# Custom handler for streaming
+class GradioStreamingHandler(BaseCallbackHandler):
+    def __init__(self):
+        self.tokens_queue = queue.Queue()
+        self.full_text = ""
+
+    def on_llm_new_token(self, token, **kwargs):
+        self.tokens_queue.put(token)
+        self.full_text += token
+
+# Initialize the chatbot
+rag_bot = MultimodalRAGChatbot(
+    qdrant_url=QDRANT_URL,
+    qdrant_collection_name=QDRANT_COLLECTION_NAME,
+    ollama_model=DEFAULT_MODEL,
+    embedding_model=EMBEDDING_MODEL,
+    ollama_url=OLLAMA_URL
+)
+print(f"Chatbot initialized with model: {DEFAULT_MODEL}")
+
+# Global state
+current_images = []
+current_tables = []
+
+# Utility functions
+def display_images(images_list=None):
+    """Build a list of (image, caption) tuples for the Gradio Gallery"""
+    images_to_use = images_list if images_list is not None else current_images
+
+    if not images_to_use:
+        return None
+
+    gallery = []
+    for img_data in images_to_use:
+        image = img_data["image"]
+        if image:
+            caption = f"{img_data['caption']} (Source: {img_data['source']}, Page: {img_data['page']})"
+            gallery.append((image, caption))
+
+    return gallery if gallery else None
+
+def display_tables(tables_list=None, language=None):
+    """Build the HTML used to display the tables"""
+    tables_to_use = tables_list if tables_list is not None else current_tables
+
+    if not tables_to_use:
+        return None
+
+    html = ""
+    for idx, table in enumerate(tables_to_use):
+        table_data = table['data']
+        table_html = ""
+
+        try:
+            if isinstance(table_data, str):
+                if '|' in table_data:
+                    # Markdown-style table: convert it to an HTML table
+                    rows = table_data.strip().split('\n')
+                    table_html = '<table>'
+
+                    for i, row in enumerate(rows):
+                        # Skip the markdown separator row (e.g. |---|---|)
+                        if i == 1 and all(c in ':-| ' for c in row):
+                            continue
+
+                        cells = row.split('|')
+
+                        if cells and cells[0].strip() == '':
+                            cells = cells[1:]
+                        if cells and cells[-1].strip() == '':
+                            cells = cells[:-1]
+
+                        if cells:
+                            is_header = (i == 0)
+                            table_html += '<tr>'
+                            for cell in cells:
+                                cell_content = cell.strip()
+                                if is_header:
+                                    table_html += f'<th>{cell_content}</th>'
+                                else:
+                                    table_html += f'<td>{cell_content}</td>'
+                            table_html += '</tr>'
+
+                    table_html += '</table>'
+                else:
+                    table_html = f'<pre>{table_data}</pre>'
+            else:
+                table_html = f'<pre>{table_data}</pre>'
+        except Exception as e:
+            print(f"Error formatting table {idx}: {e}")
+            table_html = f'<pre>{table_data}</pre>'
+
+        html += f"""
+        <div class="table-container">
+            <div class="table-caption">
+                {table.get('caption', 'Tableau')}
+            </div>
+            <div class="table-source">
+                Source: {table.get('source', 'N/A')}, Page: {table.get('page', 'N/A')}
+            </div>
+            <div class="table-description">
+                Description: {table.get('description', '')}
+            </div>
+            {table_html}
+ """ + + return html if html else None + +# Fonction pour changer de modèle +def change_model(model_name, language="Français"): + global rag_bot + + try: + rag_bot = MultimodalRAGChatbot( + qdrant_url=QDRANT_URL, + qdrant_collection_name=QDRANT_COLLECTION_NAME, + ollama_model=model_name, + embedding_model=EMBEDDING_MODEL, + ollama_url=OLLAMA_URL + ) + print(f"Modèle changé pour: {model_name}") + return f"✅ Modèle changé pour: {model_name}" + except Exception as e: + print(f"Erreur lors du changement de modèle: {e}") + return f"❌ Erreur: {str(e)}" + +# Fonction pour changer de collection +def change_collection(collection_name, language="Français"): + global rag_bot + + try: + rag_bot = MultimodalRAGChatbot( + qdrant_url=QDRANT_URL, + qdrant_collection_name=collection_name, + ollama_model=rag_bot.llm.model, + embedding_model=EMBEDDING_MODEL, + ollama_url=OLLAMA_URL + ) + print(f"Collection changée pour: {collection_name}") + return f"✅ Collection changée pour: {collection_name}" + except Exception as e: + print(f"Erreur lors du changement de collection: {e}") + return f"❌ Erreur: {str(e)}" + +# Fonction de traitement de requête +def process_query(message, history, streaming, show_sources, max_images, language): + global current_images, current_tables + + if not message.strip(): + return history, "", None, None + + current_images = [] + current_tables = [] + print(f"Traitement du message: {message}") + print(f"Streaming: {streaming}") + + try: + if streaming: + # Version avec streaming dans Gradio + history = history + [(message, "")] + + # 1. Récupérer les documents pertinents + docs = rag_bot._retrieve_relevant_documents(message) + + # 2. Préparer le contexte et l'historique + context = rag_bot._format_documents(docs) + history_text = rag_bot._format_chat_history() + + # 3. Préparer le prompt + prompt_template = ChatPromptTemplate.from_template(""" + Tu es un assistant documentaire spécialisé qui utilise toutes les informations disponibles dans le contexte fourni. + + TRÈS IMPORTANT: Tu dois répondre EXCLUSIVEMENT en {language}. Ne réponds JAMAIS dans une autre langue. + + Instructions spécifiques: + 1. Pour chaque image mentionnée dans le contexte, inclue TOUJOURS dans ta réponse: + - La légende/caption exacte de l'image + - La source et le numéro de page + - Une description brève de ce qu'elle montre + + 2. Pour chaque tableau mentionné dans le contexte, inclue TOUJOURS: + - Le titre/caption exact du tableau + - La source et le numéro de page + - Ce que contient et signifie le tableau + + 3. Lorsque tu cites des équations mathématiques: + - Utilise la syntaxe LaTeX exacte comme dans le document ($...$ ou $$...$$) + - Reproduis-les fidèlement sans modification + + 4. IMPORTANT: Ne pas inventer d'informations - si une donnée n'est pas explicitement fournie dans le contexte, + indique clairement que cette information n'est pas disponible dans les documents fournis. + + 5. Cite précisément les sources pour chaque élément d'information (format: [Source, Page]). + + 6. CRUCIAL: Ta réponse doit être UNIQUEMENT et INTÉGRALEMENT en {language}, quelle que soit la langue de la question. + + Historique de conversation: + {chat_history} + + Contexte (à utiliser pour répondre): + {context} + + Question: {question} + + Réponds de façon structurée et précise en intégrant activement les images, tableaux et équations disponibles dans le contexte. + Ta réponse doit être exclusivement en {language}. + """) + + # 4. 
+            # 4. Format the messages for the LLM
+            messages = prompt_template.format_messages(
+                chat_history=history_text,
+                context=context,
+                question=message,
+                language=LANGUAGE_MAPPING.get(language, "français")
+            )
+
+            # 5. Create a custom streaming handler
+            handler = GradioStreamingHandler()
+
+            # 6. Create an LLM instance wired to our handler
+            streaming_llm = ChatOllama(
+                model=rag_bot.llm.model,
+                base_url=rag_bot.llm.base_url,
+                streaming=True,
+                callbacks=[handler]
+            )
+
+            # 7. Run the generation in a thread so the UI does not block
+            def generate_response():
+                streaming_llm.invoke(messages)
+
+            thread = threading.Thread(target=generate_response)
+            thread.start()
+
+            # 8. Collect the tokens and update the interface
+            partial_response = ""
+
+            # Wait for tokens with a timeout
+            while thread.is_alive() or not handler.tokens_queue.empty():
+                try:
+                    token = handler.tokens_queue.get(timeout=0.05)
+                    partial_response += token
+
+                    # Clean the response for display only (not for the internal history)
+                    clean_response = clean_llm_response(partial_response)
+                    history[-1] = (message, clean_response)
+                    yield history, "", None, None
+                except queue.Empty:
+                    continue
+
+            # 9. After the loop, clean the full response for the internal history
+            partial_response = clean_llm_response(partial_response)
+            rag_bot.chat_history.append({"role": "user", "content": message})
+            rag_bot.chat_history.append({"role": "assistant", "content": partial_response})
+
+            # 10. Retrieve the sources, images and tables
+            texts, images, tables = rag_bot._process_documents(docs)
+
+            # Build the source summary
+            source_info = ""
+            if texts:
+                source_info += f"📚 {len(texts)} textes • "
+            if images:
+                source_info += f"🖼️ {len(images)} images • "
+            if tables:
+                source_info += f"📊 {len(tables)} tableaux"
+
+            if source_info:
+                source_info = "Sources trouvées: " + source_info
+
+            # 11. Process the images
+            if show_sources and images:
+                images = images[:max_images]
+                for img in images:
+                    img_data = img.get("image_data")
+                    if img_data:
+                        image = base64_to_image(img_data)
+                        if image:
+                            current_images.append({
+                                "image": image,
+                                "caption": img.get("caption", ""),
+                                "source": img.get("source", ""),
+                                "page": img.get("page", ""),
+                                "description": img.get("description", "")
+                            })
+
+            # 12. Process the tables
+            if show_sources and tables:
+                for table in tables:
+                    current_tables.append({
+                        "data": rag_bot.format_table(table.get("table_data", "")),
+                        "caption": table.get("caption", ""),
+                        "source": table.get("source", ""),
+                        "page": table.get("page", ""),
+                        "description": table.get("description", "")
+                    })
+
+            # 13. Return the final results
+            images_display = display_images()
+            tables_display = display_tables()
+            yield history, source_info, images_display, tables_display
+
+        else:
+            # Non-streaming version
+            print("Non-streaming mode enabled")
+            source_info = ""
+
+            result = rag_bot.chat(message, stream=False)
+            # Strip the <think> tags from the response
+            result["response"] = clean_llm_response(result["response"])
+            history = history + [(message, result["response"])]
+
+            # Update the internal history
+            rag_bot.chat_history.append({"role": "user", "content": message})
+            rag_bot.chat_history.append({"role": "assistant", "content": result["response"]})
+
+            # Summarize the sources
+            if "texts" in result:
+                source_info += f"📚 {len(result['texts'])} textes • "
+            if "images" in result:
+                source_info += f"🖼️ {len(result['images'])} images • "
+            if "tables" in result:
+                source_info += f"📊 {len(result['tables'])} tableaux"
+
+            if source_info:
+                source_info = "Sources trouvées: " + source_info
+
+            # Process the images and tables
+            if show_sources and "images" in result and result["images"]:
+                images = result["images"][:max_images]
+                for img in images:
+                    img_data = img.get("image_data")
+                    if img_data:
+                        image = base64_to_image(img_data)
+                        if image:
+                            current_images.append({
+                                "image": image,
+                                "caption": img.get("caption", ""),
+                                "source": img.get("source", ""),
+                                "page": img.get("page", ""),
+                                "description": img.get("description", "")
+                            })
+
+            if show_sources and "tables" in result and result["tables"]:
+                tables = result["tables"]
+                for table in tables:
+                    current_tables.append({
+                        "data": rag_bot.format_table(table.get("table_data", "")),
+                        "caption": table.get("caption", ""),
+                        "source": table.get("source", ""),
+                        "page": table.get("page", ""),
+                        "description": table.get("description", "")
+                    })
+
+            yield history, source_info, display_images(), display_tables()
+
+    except Exception as e:
+        error_msg = f"Une erreur est survenue: {str(e)}"
+        traceback_text = traceback.format_exc()
+        print(error_msg)
+        print(traceback_text)
+        history = history + [(message, error_msg)]
+        yield history, "Erreur lors du traitement de la requête", None, None
+
+# Reset the conversation
+def reset_conversation():
+    global current_images, current_tables
+    current_images = []
+    current_tables = []
+
+    rag_bot.clear_history()
+
+    return [], "", None, None
\ No newline at end of file
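The streaming branch of process_query above couples a LangChain callback handler to a queue.Queue and a worker thread, so the Gradio generator can yield partial answers while Ollama is still producing tokens. Stripped of the chatbot specifics, the pattern reduces to the following stdlib-only sketch (fake_llm stands in for streaming_llm.invoke):

    import queue
    import threading
    import time

    def fake_llm(tokens_queue):
        # Stand-in producer: pushes tokens into the queue as they are "generated"
        for token in ["Hello", " ", "world", "!"]:
            time.sleep(0.1)
            tokens_queue.put(token)

    tokens_queue = queue.Queue()
    thread = threading.Thread(target=fake_llm, args=(tokens_queue,))
    thread.start()

    partial = ""
    # Same loop shape as step 8 above: drain until the producer is done and the queue is empty
    while thread.is_alive() or not tokens_queue.empty():
        try:
            partial += tokens_queue.get(timeout=0.05)
            print(partial)  # process_query yields the updated history at this point
        except queue.Empty:
            continue

The timeout on get() is what keeps the loop responsive: without it, a consumer blocked on an empty queue could outlive a producer that has already finished.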
placeholder="Posez votre question...", + container=False, + scale=4 + ) + submit_btn = gr.Button("Envoyer", variant="primary", scale=1) + + clear_btn = gr.Button("Effacer la conversation") + source_info = gr.Markdown("", elem_id="sources_info") + + with gr.Column(scale=1): + with gr.Accordion("Options", open=True): + # Sélecteur de modèle + model_selector = gr.Dropdown( + choices=AVAILABLE_MODELS, + value=DEFAULT_MODEL, + label="Modèle Ollama", + info="Choisir le modèle de language à utiliser" + ) + model_status = gr.Markdown(f"Modèle actuel: **{DEFAULT_MODEL}**") + + # Sélecteur de langue + language_selector = gr.Dropdown( + choices=UI_SUPPORTED_LANGUAGES, + value=UI_SUPPORTED_LANGUAGES[0], + label="Langue des réponses", + info="Choisir la langue dans laquelle l'assistant répondra" + ) + + # Sélecteur de collection Qdrant + collection_name_input = gr.Textbox( + value=QDRANT_COLLECTION_NAME, + label="Collection Qdrant", + info="Nom de la collection de documents à utiliser" + ) + collection_status = gr.Markdown(f"Collection actuelle: **{QDRANT_COLLECTION_NAME}**") + + # Bouton d'application de la collection + apply_collection_btn = gr.Button("Appliquer la collection") + + streaming = gr.Checkbox( + label="Mode streaming", + value=True, + info="Voir les réponses s'afficher progressivement" + ) + show_sources = gr.Checkbox(label="Afficher les sources", value=True) + max_images = gr.Slider( + minimum=1, + maximum=10, + value=3, + step=1, + label="Nombre max d'images" + ) + + gr.Markdown("---") + + gr.Markdown("### 🖼️ Images pertinentes") + image_gallery = gr.Gallery( + label="Images pertinentes", + show_label=False, + columns=2, + height=300, + object_fit="contain" + ) + + gr.Markdown("### 📊 Tableaux") + tables_display = gr.HTML() + + # Connecter le changement de modèle + model_selector.change( + fn=change_model_fn, + inputs=model_selector, + outputs=model_status + ) + + # Connecter le changement de collection + apply_collection_btn.click( + fn=change_collection_fn, + inputs=collection_name_input, + outputs=collection_status + ) + + # Fonction pour effacer l'entrée + def clear_input(): + return "" + + # Configuration des actions principales + msg.submit( + process_query_fn, + inputs=[msg, chat_interface, streaming, show_sources, max_images, language_selector], + outputs=[chat_interface, source_info, image_gallery, tables_display] + ).then(clear_input, None, msg) + + submit_btn.click( + process_query_fn, + inputs=[msg, chat_interface, streaming, show_sources, max_images, language_selector], + outputs=[chat_interface, source_info, image_gallery, tables_display] + ).then(clear_input, None, msg) + + clear_btn.click( + reset_conversation_fn, + outputs=[chat_interface, source_info, image_gallery, tables_display] + ) + + # Style KaTeX et amélioration du design + gr.Markdown(""" + + + + + + + + + """) + + return interface \ No newline at end of file diff --git a/config/settings.py b/config/settings.py new file mode 100644 index 0000000..cbefde5 --- /dev/null +++ b/config/settings.py @@ -0,0 +1,18 @@ +import gradio as gr + +# Configuration settings for the application + +# URLs and connection settings +QDRANT_URL = "http://localhost:6333" +QDRANT_COLLECTION_NAME = "my_custom_collection" +OLLAMA_URL = "http://127.0.0.1:11434" + +# Model settings +EMBEDDING_MODEL = "mxbai-embed-large" +DEFAULT_MODEL = "llama3.2" + +# Available models +AVAILABLE_MODELS = ["llama3.1", "llama3.2", "deepseek-r1:7b", "deepseek-r1:14b"] + +# Default theme +DEFAULT_THEME = gr.themes.Soft(primary_hue="blue") diff --git 
diff --git a/config/translations.py b/config/translations.py
new file mode 100644
index 0000000..ffc95c3
--- /dev/null
+++ b/config/translations.py
@@ -0,0 +1,50 @@
+# Language mapping so the LLM reliably understands the target language
+LANGUAGE_MAPPING = {
+    "Français": "français",
+    "English": "English",
+    "Español": "español",
+    "Deutsch": "Deutsch",
+    "Italiano": "italiano",
+    "中文": "Chinese",
+    "日本語": "Japanese",
+    "العربية": "Arabic",
+    "فارسی": "Persian"
+}
+
+# Translation dictionary for the interface
+UI_TRANSLATIONS = {
+    "Français": {
+        "title": "📚 Assistant documentaire intelligent",
+        "placeholder": "Posez votre question...",
+        "send_btn": "Envoyer",
+        "clear_btn": "Effacer la conversation",
+        "model_selector": "Modèle Ollama",
+        "model_info": "Choisir le modèle de langage à utiliser",
+        "model_current": "Modèle actuel",
+        "language_selector": "Langue des réponses",
+        "language_info": "Choisir la langue dans laquelle l'assistant répondra",
+        "collection_input": "Collection Qdrant",
+        "collection_info": "Nom de la collection de documents à utiliser",
+        "collection_current": "Collection actuelle",
+        "apply_btn": "Appliquer la collection",
+        "streaming_label": "Mode streaming",
+        "streaming_info": "Voir les réponses s'afficher progressivement",
+        "sources_label": "Afficher les sources",
+        "max_images_label": "Nombre max d'images",
+        "images_title": "🖼️ Images pertinentes",
+        "tables_title": "📊 Tableaux",
+        "sources_found": "Sources trouvées",
+        "texts": "textes",
+        "images": "images",
+        "tables": "tableaux",
+        "error_msg": "Une erreur est survenue"
+    },
+    "English": {
+        "title": "📚 Intelligent Document Assistant",
+        "placeholder": "Ask your question...",
+        "send_btn": "Send",
+        "clear_btn": "Clear conversation",
+        # Add the remaining English translations here
+    }
+    # Add other languages as needed
+}
\ No newline at end of file
diff --git a/test_mistral.ipynb b/test_mistral.ipynb
new file mode 100644
index 0000000..c56da06
--- /dev/null
+++ b/test_mistral.ipynb
@@ -0,0 +1,142 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "SDKError",
+     "evalue": "API error occurred: Status 401\n{\n \"message\":\"Unauthorized\",\n \"request_id\":\"11390a73fd79bc1a934c5858569caa3a\"\n}",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mSDKError\u001b[39m Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[11]\u001b[39m\u001b[32m, line 8\u001b[39m\n\u001b[32m 4\u001b[39m api_key =\u001b[33m\"\u001b[39m\u001b[33mxmM3IG80Y97Hg8kJVUPy1ijyIhmS2H9j\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 6\u001b[39m client = Mistral(api_key=api_key)\n\u001b[32m----> \u001b[39m\u001b[32m8\u001b[39m uploaded_pdf = \u001b[43mclient\u001b[49m\u001b[43m.\u001b[49m\u001b[43mfiles\u001b[49m\u001b[43m.\u001b[49m\u001b[43mupload\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 9\u001b[39m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m=\u001b[49m\u001b[43m{\u001b[49m\n\u001b[32m 10\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfile_name\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m11_chapitre3.pdf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 11\u001b[39m \u001b[43m 
\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcontent\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[33;43mr\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mF:\u001b[39;49m\u001b[33;43m\\\u001b[39;49m\u001b[33;43mDev\u001b[39;49m\u001b[33;43m\\\u001b[39;49m\u001b[33;43mRag\u001b[39;49m\u001b[33;43m\\\u001b[39;49m\u001b[33;43mRag_Modeling\u001b[39;49m\u001b[33;43m\\\u001b[39;49m\u001b[33;43mdocument\u001b[39;49m\u001b[33;43m\\\u001b[39;49m\u001b[33;43m11_chapitre3.pdf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrb\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 12\u001b[39m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 13\u001b[39m \u001b[43m \u001b[49m\u001b[43mpurpose\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mocr\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\n\u001b[32m 14\u001b[39m \u001b[43m)\u001b[49m \n\u001b[32m 15\u001b[39m client.files.retrieve(file_id=uploaded_pdf.id) \n", + "\u001b[36mFile \u001b[39m\u001b[32mf:\\Dev\\Rag\\chat_bot_rag\\.venv\\Lib\\site-packages\\mistralai\\files.py:101\u001b[39m, in \u001b[36mFiles.upload\u001b[39m\u001b[34m(self, file, purpose, retries, server_url, timeout_ms, http_headers)\u001b[39m\n\u001b[32m 99\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m utils.match_response(http_res, \u001b[33m\"\u001b[39m\u001b[33m4XX\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m*\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m 100\u001b[39m http_res_text = utils.stream_to_text(http_res)\n\u001b[32m--> \u001b[39m\u001b[32m101\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m models.SDKError(\n\u001b[32m 102\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mAPI error occurred\u001b[39m\u001b[33m\"\u001b[39m, http_res.status_code, http_res_text, http_res\n\u001b[32m 103\u001b[39m )\n\u001b[32m 104\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m utils.match_response(http_res, \u001b[33m\"\u001b[39m\u001b[33m5XX\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m*\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m 105\u001b[39m http_res_text = utils.stream_to_text(http_res)\n", + "\u001b[31mSDKError\u001b[39m: API error occurred: Status 401\n{\n \"message\":\"Unauthorized\",\n \"request_id\":\"11390a73fd79bc1a934c5858569caa3a\"\n}" + ] + } + ], + "source": [ + "from mistralai import Mistral\n", + "import os\n", + "\n", + "api_key =\"xmM3IG80Y97Hg8kJVUPy1ijyIhmS2H9j\"\n", + "\n", + "client = Mistral(api_key=api_key)\n", + "\n", + "uploaded_pdf = client.files.upload(\n", + " file={\n", + " \"file_name\": \"11_chapitre3.pdf\",\n", + " \"content\": open(r\"F:\\Dev\\Rag\\Rag_Modeling\\document\\11_chapitre3.pdf\", \"rb\"),\n", + " },\n", + " purpose=\"ocr\"\n", + ") \n", + "client.files.retrieve(file_id=uploaded_pdf.id) \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "ename": "SDKError", + "evalue": "API error occurred: Status 401\n{\n \"message\":\"Unauthorized\",\n \"request_id\":\"bf40e3105e1f257ec16fc233e4d0396b\"\n}", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mSDKError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[10]\u001b[39m\u001b[32m, line 
5\u001b[39m\n\u001b[32m 1\u001b[39m model = \u001b[33m\"\u001b[39m\u001b[33mmistral-large-latest\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 3\u001b[39m client = Mistral(api_key=api_key)\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m chat_response = \u001b[43mclient\u001b[49m\u001b[43m.\u001b[49m\u001b[43mchat\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcomplete\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 6\u001b[39m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 7\u001b[39m \u001b[43m \u001b[49m\u001b[43mmessages\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43m[\u001b[49m\n\u001b[32m 8\u001b[39m \u001b[43m \u001b[49m\u001b[43m{\u001b[49m\n\u001b[32m 9\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mrole\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43muser\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 10\u001b[39m \u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcontent\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mWhat is the best French cheese?\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 11\u001b[39m \u001b[43m \u001b[49m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 12\u001b[39m \u001b[43m \u001b[49m\u001b[43m]\u001b[49m\n\u001b[32m 13\u001b[39m \u001b[43m)\u001b[49m\n\u001b[32m 14\u001b[39m \u001b[38;5;28mprint\u001b[39m(chat_response.choices[\u001b[32m0\u001b[39m].message.content)\n", + "\u001b[36mFile \u001b[39m\u001b[32mf:\\Dev\\Rag\\chat_bot_rag\\.venv\\Lib\\site-packages\\mistralai\\chat.py:235\u001b[39m, in \u001b[36mChat.complete\u001b[39m\u001b[34m(self, model, messages, temperature, top_p, max_tokens, stream, stop, random_seed, response_format, tools, tool_choice, presence_penalty, frequency_penalty, n, prediction, safe_prompt, retries, server_url, timeout_ms, http_headers)\u001b[39m\n\u001b[32m 233\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m utils.match_response(http_res, \u001b[33m\"\u001b[39m\u001b[33m4XX\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m*\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m 234\u001b[39m http_res_text = utils.stream_to_text(http_res)\n\u001b[32m--> \u001b[39m\u001b[32m235\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m models.SDKError(\n\u001b[32m 236\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mAPI error occurred\u001b[39m\u001b[33m\"\u001b[39m, http_res.status_code, http_res_text, http_res\n\u001b[32m 237\u001b[39m )\n\u001b[32m 238\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m utils.match_response(http_res, \u001b[33m\"\u001b[39m\u001b[33m5XX\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33m*\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m 239\u001b[39m http_res_text = utils.stream_to_text(http_res)\n", + "\u001b[31mSDKError\u001b[39m: API error occurred: Status 401\n{\n \"message\":\"Unauthorized\",\n \"request_id\":\"bf40e3105e1f257ec16fc233e4d0396b\"\n}" + ] + } + ], + "source": [ + "model = \"mistral-large-latest\"\n", + "\n", + "client = Mistral(api_key=api_key)\n", + "\n", + "chat_response = client.chat.complete(\n", + " model= model,\n", + " messages = [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"What is the best French cheese?\",\n", + " },\n", + " ]\n", + ")\n", + "print(chat_response.choices[0].message.content)" + 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<_io.BufferedReader name='F:\\\\Dev\\\\Rag\\\\Rag_Modeling\\\\document\\\\11_chapitre3.pdf'>"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "open(r\"F:\\Dev\\Rag\\Rag_Modeling\\document\\11_chapitre3.pdf\", \"rb\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MISTRAL_API_KEY: None\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "print(\"MISTRAL_API_KEY:\", repr(os.environ.get(\"MISTRAL_API_KEY\")))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
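Both 401 Unauthorized errors in test_mistral.ipynb come from the hardcoded API key, and the final executed cell already probes MISTRAL_API_KEY in the environment. A sketch of the environment-based setup the notebook seems headed toward (same Mistral client as above):

    import os

    from mistralai import Mistral

    api_key = os.environ.get("MISTRAL_API_KEY")
    if not api_key:
        raise RuntimeError("Set MISTRAL_API_KEY in the environment before running the notebook")

    client = Mistral(api_key=api_key)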
collection", + "apply_btn": "Apply collection", + "streaming_label": "Streaming mode", + "streaming_info": "See responses appear progressively", + "sources_label": "Show sources", + "max_images_label": "Maximum number of images", + "images_title": "🖼️ Relevant images", + "tables_title": "📊 Tables", + "sources_found": "Sources found", + "texts": "texts", + "images": "images", + "tables": "tables", + "error_msg": "An error occurred", + "processing_error": "Error processing request", + "table_translation": "Translation", + "table_description": "This table presents data on" + }, + "Español": { + "title": "📚 Asistente documental inteligente", + "placeholder": "Haz tu pregunta...", + "send_btn": "Enviar", + "clear_btn": "Borrar conversación", + "model_selector": "Modelo Ollama", + "model_info": "Elegir el modelo de lenguaje a utilizar", + "model_current": "Modelo actual", + "language_selector": "Idioma de respuesta", + "language_info": "Elegir el idioma en el que responderá el asistente", + "collection_input": "Colección Qdrant", + "collection_info": "Nombre de la colección de documentos a utilizar", + "collection_current": "Colección actual", + "apply_btn": "Aplicar colección", + "streaming_label": "Modo streaming", + "streaming_info": "Ver las respuestas aparecer progresivamente", + "sources_label": "Mostrar fuentes", + "max_images_label": "Número máximo de imágenes", + "images_title": "🖼️ Imágenes relevantes", + "tables_title": "📊 Tablas", + "sources_found": "Fuentes encontradas", + "texts": "textos", + "images": "imágenes", + "tables": "tablas", + "error_msg": "Se ha producido un error", + "processing_error": "Error al procesar la solicitud", + "table_translation": "Traducción", + "table_description": "Esta tabla presenta datos sobre" + } +} + +# Définir les langues supportées par l'interface +UI_SUPPORTED_LANGUAGES = ["Français", "English", "Español"] \ No newline at end of file diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..96be681 --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1 @@ +# Package initialization for display utilities diff --git a/utils/conversion.py b/utils/conversion.py new file mode 100644 index 0000000..6635e89 --- /dev/null +++ b/utils/conversion.py @@ -0,0 +1,15 @@ +import base64 +from io import BytesIO +from PIL import Image + +def base64_to_image(base64_data): + """Convertit une image base64 en objet Image pour l'affichage direct""" + try: + if not base64_data: + return None + image_bytes = base64.b64decode(base64_data) + image = Image.open(BytesIO(image_bytes)) + return image + except Exception as e: + print(f"Erreur lors de la conversion d'image: {e}") + return None \ No newline at end of file diff --git a/utils/display.py b/utils/display.py new file mode 100644 index 0000000..515152c --- /dev/null +++ b/utils/display.py @@ -0,0 +1,40 @@ +from PIL import Image +from io import BytesIO +import base64 + +def base64_to_image(base64_data): + """Convert base64 image data to PIL Image""" + try: + if not base64_data: + return None + image_bytes = base64.b64decode(base64_data) + return Image.open(BytesIO(image_bytes)) + except Exception as e: + print(f"Image conversion error: {e}") + return None + +def display_images(current_images): + """Format images for Gradio gallery display""" + if not current_images: + return None + return [ + (img["image"], f"{img['caption']} (Source: {img['source']}, Page: {img['page']})") + for img in current_images + if img.get("image") + ] + +def display_tables(current_tables): + """Format tables for HTML 
display""" + if not current_tables: + return None + + html = "" + for table in current_tables: + html += f""" +
+

{table['caption']}

+

Source: {table['source']}, Page: {table['page']}

+
{table.get('data', '')}
+
+ """ + return html if html else None diff --git a/utils/image_utils.py b/utils/image_utils.py new file mode 100644 index 0000000..ba7937d --- /dev/null +++ b/utils/image_utils.py @@ -0,0 +1,29 @@ +from io import BytesIO +from PIL import Image +import base64 + +def base64_to_image(base64_data): + """Convertit une image base64 en objet Image pour l'affichage direct""" + try: + if not base64_data: + return None + image_bytes = base64.b64decode(base64_data) + image = Image.open(BytesIO(image_bytes)) + return image + except Exception as e: + print(f"Erreur lors de la conversion d'image: {e}") + return None + +def display_images(current_images): + """Prépare les images pour l'affichage dans la galerie Gradio""" + if not current_images: + return None + + gallery = [] + for img_data in current_images: + image = img_data["image"] + if image: + caption = f"{img_data['caption']} (Source: {img_data['source']}, Page: {img_data['page']})" + gallery.append((image, caption)) + + return gallery if gallery else None \ No newline at end of file diff --git a/utils/katex_script.py b/utils/katex_script.py new file mode 100644 index 0000000..7dec972 --- /dev/null +++ b/utils/katex_script.py @@ -0,0 +1,190 @@ +KATEX_CSS_JS = """ + + + + + + + + +""" \ No newline at end of file diff --git a/utils/table_utils.py b/utils/table_utils.py new file mode 100644 index 0000000..3fa5b72 --- /dev/null +++ b/utils/table_utils.py @@ -0,0 +1,19 @@ +from translations.lang_mappings import UI_TRANSLATIONS + +def display_tables(current_tables, language=None): + """Version simplifiée qui ignore le paramètre language""" + if not current_tables: + return None + + html = "" + for table in current_tables: + table_data = table.get('data', '') + html += f""" +
diff --git a/utils/table_utils.py b/utils/table_utils.py
new file mode 100644
index 0000000..3fa5b72
--- /dev/null
+++ b/utils/table_utils.py
@@ -0,0 +1,19 @@
+from translations.lang_mappings import UI_TRANSLATIONS
+
+def display_tables(current_tables, language=None):
+    """Simplified version that ignores the language parameter"""
+    if not current_tables:
+        return None
+
+    html = ""
+    for table in current_tables:
+        table_data = table.get('data', '')
+        html += f"""
+        <div class="table-container">
+            <div class="table-caption">
+                {table.get('caption', 'Tableau')}
+            </div>
+            <div class="table-source">
+                Source: {table.get('source', 'N/A')}, Page: {table.get('page', 'N/A')}
+            </div>
+            {table_data}
+        </div>
+        """
+
+    return html if html else None
\ No newline at end of file