Add system prompt, glossary, presets for Ollama/WebLLM, image translation support

2025-11-30 16:45:41 +01:00
parent 465cab8a61
commit e48ea07e44
6 changed files with 497 additions and 51 deletions
--- a/main.py
+++ b/main.py
@@ -111,7 +111,9 @@ async def translate_document(
    target_language: str = Form(..., description="Target language code (e.g., 'es', 'fr', 'de')"),
    source_language: str = Form(default="auto", description="Source language code (default: auto-detect)"),
    provider: str = Form(default="google", description="Translation provider (google, ollama, deepl, libre)"),
-    translate_images: bool = Form(default=False, description="Translate images with Ollama vision (only for Ollama provider)"),
+    translate_images: bool = Form(default=False, description="Translate images with multimodal Ollama model"),
+    ollama_model: str = Form(default="", description="Ollama model to use (also used for vision if multimodal)"),
+    system_prompt: str = Form(default="", description="Custom system prompt with context, glossary, or instructions for LLM translation"),
    cleanup: bool = Form(default=True, description="Delete input file after translation")
 ):
    """
@@ -154,7 +156,7 @@ async def translate_document(
        logger.info(f"Saved input file to: {input_path}")
        
        # Configure translation provider
-        from services.translation_service import GoogleTranslationProvider, DeepLTranslationProvider, LibreTranslationProvider, OllamaTranslationProvider, WebLLMTranslationProvider, translation_service
+        from services.translation_service import GoogleTranslationProvider, DeepLTranslationProvider, LibreTranslationProvider, OllamaTranslationProvider, translation_service
        
        if provider.lower() == "deepl":
            if not config.DEEPL_API_KEY:
@@ -163,10 +165,13 @@ async def translate_document(
        elif provider.lower() == "libre":
            translation_provider = LibreTranslationProvider()
        elif provider.lower() == "ollama":
-            vision_model = getattr(config, 'OLLAMA_VISION_MODEL', 'llava')
-            translation_provider = OllamaTranslationProvider(config.OLLAMA_BASE_URL, config.OLLAMA_MODEL, vision_model)
-        elif provider.lower() == "webllm":
-            translation_provider = WebLLMTranslationProvider()
+            # Use the same model for text and vision (multimodal models like gemma3, qwen3-vl)
+            model_to_use = ollama_model.strip() if ollama_model else config.OLLAMA_MODEL
+            custom_prompt = system_prompt.strip() if system_prompt else ""
+            logger.info(f"Using Ollama model: {model_to_use} (text + vision)")
+            if custom_prompt:
+                logger.info(f"Custom system prompt provided ({len(custom_prompt)} chars)")
+            translation_provider = OllamaTranslationProvider(config.OLLAMA_BASE_URL, model_to_use, model_to_use, custom_prompt)
        else:
            translation_provider = GoogleTranslationProvider()
        
--- a/services/translation_service.py
+++ b/services/translation_service.py
@@ -70,30 +70,65 @@ class LibreTranslationProvider(TranslationProvider):
 class OllamaTranslationProvider(TranslationProvider):
    """Ollama LLM translation implementation"""
    
-    def __init__(self, base_url: str = "http://localhost:11434", model: str = "llama3", vision_model: str = "llava"):
+    def __init__(self, base_url: str = "http://localhost:11434", model: str = "llama3", vision_model: str = "llava", system_prompt: str = ""):
        self.base_url = base_url.rstrip('/')
-        self.model = model
-        self.vision_model = vision_model
+        self.model = model.strip()  # Remove any leading/trailing whitespace
+        self.vision_model = vision_model.strip()
+        self.custom_system_prompt = system_prompt  # Custom context, glossary, instructions
    
    def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str:
        if not text or not text.strip():
            return text
        
-        try:
-            prompt = f"Translate the following text to {target_language}. Return ONLY the translation, nothing else:\n\n{text}"
+        # Skip very short text or numbers only
+        if len(text.strip()) < 2 or text.strip().isdigit():
+            return text
        
+        try:
+            # Build system prompt with custom context if provided
+            base_prompt = f"You are a translator. Translate the user's text to {target_language}. Return ONLY the translation, nothing else."
+            
+            if self.custom_system_prompt:
+                system_content = f"""{base_prompt}
+
+ADDITIONAL CONTEXT AND INSTRUCTIONS:
+{self.custom_system_prompt}"""
+            else:
+                system_content = base_prompt
+            
+            # Use /api/chat endpoint (more compatible with all models)
            response = requests.post(
-                f"{self.base_url}/api/generate",
+                f"{self.base_url}/api/chat",
                json={
                    "model": self.model,
-                    "prompt": prompt,
-                    "stream": False
+                    "messages": [
+                        {
+                            "role": "system",
+                            "content": system_content
+                        },
+                        {
+                            "role": "user",
+                            "content": text
+                        }
+                    ],
+                    "stream": False,
+                    "options": {
+                        "temperature": 0.3,
+                        "num_predict": 500
+                    }
                },
-                timeout=30
+                timeout=120  # 2 minutes timeout
            )
            response.raise_for_status()
            result = response.json()
-            return result.get("response", text).strip()
+            translated = result.get("message", {}).get("content", "").strip()
+            return translated if translated else text
+        except requests.exceptions.ConnectionError:
+            print(f"Ollama error: Cannot connect to {self.base_url}. Is Ollama running?")
+            return text
+        except requests.exceptions.Timeout:
+            print(f"Ollama error: Request timeout after 120s")
+            return text
        except Exception as e:
            print(f"Ollama translation error: {e}")
            return text
@@ -107,21 +142,25 @@ class OllamaTranslationProvider(TranslationProvider):
            with open(image_path, 'rb') as img_file:
                image_data = base64.b64encode(img_file.read()).decode('utf-8')
            
-            prompt = f"Extract all text from this image and translate it to {target_language}. Return ONLY the translated text, preserving the structure and formatting."
-            
+            # Use /api/chat for vision models too
            response = requests.post(
-                f"{self.base_url}/api/generate",
+                f"{self.base_url}/api/chat",
                json={
                    "model": self.vision_model,
-                    "prompt": prompt,
-                    "images": [image_data],
+                    "messages": [
+                        {
+                            "role": "user",
+                            "content": f"Extract all text from this image and translate it to {target_language}. Return ONLY the translated text, preserving the structure and formatting.",
+                            "images": [image_data]
+                        }
+                    ],
                    "stream": False
                },
                timeout=60
            )
            response.raise_for_status()
            result = response.json()
-            return result.get("response", "").strip()
+            return result.get("message", {}).get("content", "").strip()
        except Exception as e:
            print(f"Ollama vision translation error: {e}")
            return ""
@@ -158,6 +197,7 @@ class TranslationService:
        else:
            # Auto-select provider based on configuration
            self.provider = self._get_default_provider()
+        self.translate_images = False  # Flag to enable image translation
    
    def _get_default_provider(self) -> TranslationProvider:
        """Get the default translation provider from configuration"""
@@ -182,6 +222,26 @@ class TranslationService:
        
        return self.provider.translate(text, target_language, source_language)
    
+    def translate_image(self, image_path: str, target_language: str) -> str:
+        """
+        Extract and translate the text of an image through the active provider.
+
+        Args:
+            image_path: Path to image file
+            target_language: Target language code
+
+        Returns:
+            The provider's translated text, or an empty string when image
+            translation is switched off or the provider is not Ollama
+        """
+        # Vision translation needs both the opt-in flag and an Ollama provider
+        if self.translate_images and isinstance(self.provider, OllamaTranslationProvider):
+            return self.provider.translate_image(image_path, target_language)
+
+        return ""
+    
    def translate_batch(self, texts: list[str], target_language: str, source_language: str = 'auto') -> list[str]:
        """
        Translate multiple text strings
--- a/static/index.html
+++ b/static/index.html
@@ -309,7 +309,7 @@
                </div>
                <div class="form-group">
                    <label for="ollama-model">Modèle Ollama</label>
-                    <input type="text" id="ollama-model" value="llama3" placeholder="llama3, mistral, etc.">
+                    <input type="text" id="ollama-model" value="llama3.2" placeholder="llama3.2, mistral, etc.">
                </div>
            </div>
            <button onclick="listOllamaModels()" class="btn-secondary">List Available Models</button>
@@ -318,6 +318,39 @@
            <div id="models-result"></div>
        </div>

+        <!-- System Prompt for LLM Translation -->
+        <div class="card">
+            <h2>Translation Context (Ollama / WebLLM)</h2>
+            <p style="font-size: 13px; color: #718096; margin-bottom: 15px;">
+                Provide context, technical glossary, or specific instructions to improve translation quality.
+            </p>
+            
+            <div class="form-group">
+                <label for="system-prompt">System Prompt / Instructions</label>
+                <textarea id="system-prompt" rows="4" style="width: 100%; padding: 10px 14px; border: 1px solid #cbd5e0; border-radius: 6px; font-size: 14px; font-family: inherit; resize: vertical;" placeholder="Example: You are translating HVAC technical documents. Use these terms:
+- Batterie (FR) = Coil (EN)
+- Groupe froid (FR) = Chiller (EN)
+- CTA (FR) = AHU (EN)"></textarea>
+            </div>
+            
+            <div class="form-group">
+                <label for="glossary">Technical Glossary (one per line: source=target)</label>
+                <textarea id="glossary" rows="5" style="width: 100%; padding: 10px 14px; border: 1px solid #cbd5e0; border-radius: 6px; font-size: 13px; font-family: monospace; resize: vertical;" placeholder="batterie=coil
+groupe froid=chiller
+CTA=AHU
+échangeur=heat exchanger
+vanne 3 voies=3-way valve"></textarea>
+            </div>
+            
+            <div style="display: flex; gap: 10px; flex-wrap: wrap;">
+                <button onclick="loadPreset('hvac')" class="btn-secondary" style="font-size: 12px;">HVAC Preset</button>
+                <button onclick="loadPreset('it')" class="btn-secondary" style="font-size: 12px;">IT Preset</button>
+                <button onclick="loadPreset('legal')" class="btn-secondary" style="font-size: 12px;">Legal Preset</button>
+                <button onclick="loadPreset('medical')" class="btn-secondary" style="font-size: 12px;">Medical Preset</button>
+                <button onclick="clearPrompt()" class="btn-secondary" style="font-size: 12px; background: #dc2626;">Clear</button>
+            </div>
+        </div>
+
        <!-- Traduction de fichier -->
        <div class="card">
            <h2>Document Translation</h2>
@@ -335,6 +368,8 @@
                <div class="form-group">
                    <label for="target-lang">Target Language</label>
                    <select id="target-lang">
+                        <option value="en">English (en)</option>
+                        <option value="fa">Persian / Farsi (fa)</option>
                        <option value="es">Espagnol (es)</option>
                        <option value="fr">Français (fr)</option>
                        <option value="de">Allemand (de)</option>
@@ -350,10 +385,10 @@

                <div class="form-group">
                    <label for="provider">Translation Service</label>
-                    <select id="provider" onchange="toggleImageTranslation()">
+                    <select id="provider" onchange="toggleProviderOptions()">
                        <option value="google">Google Translate (Default)</option>
                        <option value="ollama">Ollama LLM (Local Server)</option>
-                        <option value="webllm">WebLLM (Browser - No Server)</option>
+                        <option value="webllm">WebLLM (Browser - WebGPU)</option>
                        <option value="deepl">DeepL</option>
                        <option value="libre">LibreTranslate</option>
                    </select>
@@ -363,11 +398,11 @@
            <div class="form-group" id="image-translation-option" style="display: none;">
                <label style="display: flex; align-items: center; cursor: pointer;">
                    <input type="checkbox" id="translate-images" style="width: auto; margin-right: 10px;">
-                    <span>Translate images with Ollama Vision (requires llava model)</span>
+                    <span>Translate images with vision (use multimodal models: gemma3, qwen3-vl, llava, etc.)</span>
                </label>
            </div>

-            <div class="form-group" id="webllm-info" style="display: none; padding: 12px; background: #e0f2ff; border-radius: 6px; border-left: 4px solid #2563eb;">
+            <div class="form-group" id="webllm-options" style="display: none; padding: 12px; background: #e0f2ff; border-radius: 6px; border-left: 4px solid #2563eb;">
                <p style="margin: 0 0 10px 0; font-size: 13px; color: #1e40af;">
                    <strong>WebLLM Mode:</strong> Translation runs entirely in your browser using WebGPU. First use downloads the model.
                </p>
@@ -375,8 +410,8 @@
                    <div>
                        <label for="webllm-model" style="font-size: 12px; color: #4a5568; margin-bottom: 4px;">Select Model:</label>
                        <select id="webllm-model" style="width: 100%; padding: 6px; font-size: 13px; border: 1px solid #cbd5e0; border-radius: 4px;">
+                            <option value="Llama-3.2-3B-Instruct-q4f32_1-MLC">Llama 3.2 3B (~2GB) - Recommended</option>
                            <option value="Llama-3.1-8B-Instruct-q4f32_1-MLC">Llama 3.1 8B (~4.5GB)</option>
-                            <option value="Llama-3.2-3B-Instruct-q4f32_1-MLC">Llama 3.2 3B (~2GB)</option>
                            <option value="Phi-3.5-mini-instruct-q4f16_1-MLC">Phi 3.5 Mini (~2.5GB)</option>
                            <option value="Mistral-7B-Instruct-v0.3-q4f16_1-MLC">Mistral 7B (~4.5GB)</option>
                            <option value="gemma-2-2b-it-q4f16_1-MLC">Gemma 2 2B (~1.5GB)</option>
@@ -386,6 +421,7 @@
                        Clear Cache
                    </button>
                </div>
+                <div id="webllm-status" style="margin-top: 10px; font-size: 12px; color: #4a5568;"></div>
            </div>

            <button onclick="translateFile()">Translate Document</button>
@@ -445,26 +481,193 @@
            }
        }

-        // Toggle image translation option based on provider
-        function toggleImageTranslation() {
-            const provider = document.getElementById('provider').value;
-            const imageOption = document.getElementById('image-translation-option');
-            const webllmInfo = document.getElementById('webllm-info');
+        // Preset templates for different domains, keyed by preset name (see loadPreset)
+        const presets = {
+            hvac: {
+                prompt: `You are translating HVAC (Heating, Ventilation, Air Conditioning) technical documents. 
+Use precise technical terminology. Maintain consistency with industry standards.
+Keep unit measurements (kW, m³/h, Pa) unchanged.
+Translate component names according to the glossary provided.`,
+                glossary: `batterie=coil
+groupe froid=chiller
+CTA=AHU (Air Handling Unit)
+échangeur=heat exchanger
+vanne 3 voies=3-way valve
+détendeur=expansion valve
+compresseur=compressor
+évaporateur=evaporator
+condenseur=condenser
+fluide frigorigène=refrigerant
+débit d'air=airflow
+pression statique=static pressure
+récupérateur=heat recovery unit
+ventilo-convecteur=fan coil unit
+gaine=duct
+diffuseur=diffuser
+registre=damper`
+            },
+            it: {
+                prompt: `You are translating IT and software documentation.
+Keep technical terms, code snippets, and variable names unchanged.
+Translate UI labels and user-facing text appropriately.
+Maintain formatting markers like **bold** and \`code\`.`,
+                glossary: `serveur=server
+base de données=database
+requête=query
+sauvegarde=backup
+mise à jour=update
+télécharger=download
+téléverser=upload
+mot de passe=password
+identifiant=username
+pare-feu=firewall
+réseau=network
+stockage=storage
+conteneur=container
+déploiement=deployment`
+            },
+            legal: {
+                prompt: `You are translating legal documents.
+Use formal legal terminology. Be precise and unambiguous.
+Maintain references to laws, articles, and clauses in their original form.
+Use standard legal phrases for the target language.`,
+                glossary: `contrat=contract
+clause=clause
+partie=party
+signataire=signatory
+résiliation=termination
+préavis=notice period
+dommages et intérêts=damages
+responsabilité=liability
+juridiction=jurisdiction
+arbitrage=arbitration
+avenant=amendment
+ayant droit=beneficiary`
+            },
+            medical: {
+                prompt: `You are translating medical and healthcare documents.
+Use standard medical terminology (Latin/Greek roots when appropriate).
+Keep drug names, dosages, and medical codes unchanged.
+Be precise with anatomical terms and procedures.`,
+                glossary: `patient=patient
+ordonnance=prescription
+posologie=dosage
+effet secondaire=side effect
+contre-indication=contraindication
+diagnostic=diagnosis
+symptôme=symptom
+traitement=treatment
+chirurgie=surgery
+anesthésie=anesthesia
+perfusion=infusion
+prélèvement=sample collection`
+            }
+        };

-            if (provider === 'ollama') {
-                imageOption.style.display = 'block';
-                webllmInfo.style.display = 'none';
-            } else if (provider === 'webllm') {
-                imageOption.style.display = 'none';
-                webllmInfo.style.display = 'block';
-                document.getElementById('translate-images').checked = false;
-            } else {
-                imageOption.style.display = 'none';
-                webllmInfo.style.display = 'none';
-                document.getElementById('translate-images').checked = false;
+        // Populate the system-prompt and glossary textareas from a named preset.
+        // Names that are not keys of `presets` leave both fields untouched.
+        function loadPreset(presetName) {
+            const selected = presets[presetName];
+            if (selected) {
+                const { prompt, glossary } = selected;
+                document.getElementById('system-prompt').value = prompt;
+                document.getElementById('glossary').value = glossary;
            }
        }

+        // Reset both the system-prompt and glossary textareas to empty.
+        function clearPrompt() {
+            for (const fieldId of ['system-prompt', 'glossary']) {
+                document.getElementById(fieldId).value = '';
+            }
+        }
+
+        // Combine the system-prompt textarea with the glossary textarea into one string.
+        // The glossary section is appended only when the glossary has non-whitespace content.
+        function getFullSystemPrompt() {
+            const basePrompt = document.getElementById('system-prompt').value || '';
+            const glossaryText = document.getElementById('glossary').value || '';
+
+            if (!glossaryText.trim()) {
+                return basePrompt;
+            }
+            return basePrompt + '\n\nGLOSSARY (use these exact translations):\n' + glossaryText;
+        }
+
+        // Show only the option panel that belongs to the selected translation provider.
+        function toggleProviderOptions() {
+            const selectedProvider = document.getElementById('provider').value;
+            const imagePanel = document.getElementById('image-translation-option');
+            const webllmPanel = document.getElementById('webllm-options');
+
+            // Every provider change starts from a clean slate: panels hidden, image translation off
+            imagePanel.style.display = 'none';
+            webllmPanel.style.display = 'none';
+            document.getElementById('translate-images').checked = false;
+
+            switch (selectedProvider) {
+                case 'ollama':
+                    imagePanel.style.display = 'block';
+                    break;
+                case 'webllm':
+                    webllmPanel.style.display = 'block';
+                    break;
+            }
+        }
+
+        // WebLLM engine instance
+        let webllmEngine = null;
+        let webllmReady = false;
+
+        // Initialize WebLLM
+        // Imports the WebLLM library from https://esm.run, creates an engine for modelId and
+        // reports progress in #webllm-status. Resolves to true on success, false on any failure.
+        async function initWebLLM(modelId) {
+            const statusDiv = document.getElementById('webllm-status');
+            // textContent (not innerHTML): progress and error strings come from the library,
+            // so they must be shown as plain text and never parsed as markup.
+            statusDiv.textContent = '⏳ Loading WebLLM...';
+
+            try {
+                // Dynamically import WebLLM
+                const webllm = await import('https://esm.run/@mlc-ai/web-llm');
+
+                statusDiv.textContent = '⏳ Downloading model (this may take a while on first use)...';
+
+                webllmEngine = await webllm.CreateMLCEngine(modelId, {
+                    initProgressCallback: (progress) => {
+                        statusDiv.textContent = `⏳ ${progress.text}`;
+                    }
+                });
+
+                webllmReady = true;
+                statusDiv.textContent = '✅ Model loaded and ready!';
+                return true;
+            } catch (error) {
+                // Thrown values are not guaranteed to be Error instances
+                const reason = error instanceof Error ? error.message : String(error);
+                statusDiv.textContent = `❌ Error: ${reason}`;
+                console.error('WebLLM init error:', error);
+                return false;
+            }
+        }
+
+        // Translate text with WebLLM
+        // Resolves to the original text unchanged when no engine is loaded, when the request
+        // fails, or when the model returns an empty completion.
+        async function translateWithWebLLM(text, targetLang) {
+            if (!webllmEngine) return text;
+
+            try {
+                // Build system prompt; custom context and glossary are appended when present
+                let systemPrompt = `You are a translator. Translate the user's text to ${targetLang}. Return ONLY the translation, nothing else.`;
+
+                const customPrompt = getFullSystemPrompt();
+                if (customPrompt.trim()) {
+                    systemPrompt += `\n\nADDITIONAL CONTEXT AND INSTRUCTIONS:\n${customPrompt}`;
+                }
+
+                const response = await webllmEngine.chat.completions.create({
+                    messages: [
+                        { role: "system", content: systemPrompt },
+                        { role: "user", content: text }
+                    ],
+                    temperature: 0.3,
+                    max_tokens: 500
+                });
+
+                // A missing or blank completion must not erase the source text
+                const translated = (response?.choices?.[0]?.message?.content || '').trim();
+                return translated || text;
+            } catch (error) {
+                console.error('WebLLM translation error:', error);
+                return text;
+            }
+        }
        // Liste des modèles Ollama
        async function listOllamaModels() {
            const url = document.getElementById('ollama-url').value;
@@ -553,11 +756,19 @@
                return;
            }
            
+            // Get Ollama model from configuration field (used for both text and vision)
+            const ollamaModel = document.getElementById('ollama-model').value || 'llama3.2';
+            
+            // Get custom system prompt with glossary
+            const systemPrompt = getFullSystemPrompt();
+            
            const formData = new FormData();
            formData.append('file', fileInput.files[0]);
            formData.append('target_language', targetLang);
            formData.append('provider', provider);
            formData.append('translate_images', translateImages);
+            formData.append('ollama_model', ollamaModel);
+            formData.append('system_prompt', systemPrompt);
            
            loadingDiv.classList.add('active');
            progressContainer.classList.add('active');
--- a/translators/excel_translator.py
+++ b/translators/excel_translator.py
@@ -3,6 +3,8 @@ Excel Translation Module
 Translates Excel files while preserving all formatting, formulas, images, and layout
 """
 import re
+import tempfile
+import os
 from pathlib import Path
 from typing import Dict, Set
 from openpyxl import load_workbook
@@ -40,6 +42,10 @@ class ExcelTranslator:
            worksheet = workbook[sheet_name]
            self._translate_worksheet(worksheet, target_language)
            
+            # Translate images if enabled
+            if getattr(self.translation_service, 'translate_images', False):
+                self._translate_images(worksheet, target_language)
+            
            # Prepare translated sheet name (but don't rename yet)
            translated_sheet_name = self.translation_service.translate_text(
                sheet_name, target_language
@@ -156,6 +162,54 @@ class ExcelTranslator:
        
        return True
    
+    def _translate_images(self, worksheet: Worksheet, target_language: str):
+        """
+        Translate text in images using vision model and add as comments
+
+        Each image of the worksheet is written to a temporary file and passed
+        to the provider's translate_image(). A non-empty result is attached as
+        a comment on the cell addressed by the image anchor's `_from` marker.
+        Does nothing when the active provider is not OllamaTranslationProvider.
+        A failure on one image is printed and the remaining images are still
+        processed.
+
+        Args:
+            worksheet: Worksheet whose embedded images are processed
+            target_language: Target language code
+        """
+        from services.translation_service import OllamaTranslationProvider
+
+        provider = self.translation_service.provider
+        if not isinstance(provider, OllamaTranslationProvider):
+            return
+
+        try:
+            from openpyxl.comments import Comment
+
+            # Get images from worksheet
+            images = getattr(worksheet, '_images', [])
+
+            for idx, image in enumerate(images):
+                tmp_path = None
+                try:
+                    # Get image data
+                    image_data = image._data()
+                    ext = image.format or 'png'
+
+                    # Save to temp file; record the path before writing so a
+                    # failed write is still cleaned up below
+                    with tempfile.NamedTemporaryFile(suffix=f'.{ext}', delete=False) as tmp:
+                        tmp_path = tmp.name
+                        tmp.write(image_data)
+
+                    # Translate with vision
+                    translated_text = provider.translate_image(tmp_path, target_language)
+
+                    if translated_text and translated_text.strip():
+                        # Attach the translation to the cell where the image is anchored
+                        anchor = image.anchor
+                        if hasattr(anchor, '_from'):
+                            # col/row of anchor._from are offset by one to build the cell reference
+                            cell_ref = f"{get_column_letter(anchor._from.col + 1)}{anchor._from.row + 1}"
+                            cell = worksheet[cell_ref]
+                            cell.comment = Comment(f"Image translation: {translated_text}", "Translator")
+                            print(f"Added Excel image translation at {cell_ref}: {translated_text[:50]}...")
+
+                except Exception as e:
+                    print(f"Error translating Excel image {idx}: {e}")
+                finally:
+                    # Always remove the temp file, even when translation or
+                    # comment creation failed (delete=False means nobody else will)
+                    if tmp_path is not None:
+                        try:
+                            os.unlink(tmp_path)
+                        except OSError as e:
+                            print(f"Could not remove temp image {tmp_path}: {e}")
+
+        except Exception as e:
+            print(f"Error processing Excel images: {e}")
+

 # Global translator instance
 excel_translator = ExcelTranslator()
--- a/translators/pptx_translator.py
+++ b/translators/pptx_translator.py
@@ -9,6 +9,8 @@ from pptx.shapes.group import GroupShape
 from pptx.util import Inches, Pt
 from pptx.enum.shapes import MSO_SHAPE_TYPE
 from services.translation_service import translation_service
+import tempfile
+import os


 class PowerPointTranslator:
@@ -32,21 +34,23 @@ class PowerPointTranslator:
        presentation = Presentation(input_path)
        
        # Translate each slide
-        for slide in presentation.slides:
-            self._translate_slide(slide, target_language)
+        for slide_idx, slide in enumerate(presentation.slides):
+            self._translate_slide(slide, target_language, slide_idx + 1, input_path)
        
        # Save the translated presentation
        presentation.save(output_path)
        
        return output_path
    
-    def _translate_slide(self, slide, target_language: str):
+    def _translate_slide(self, slide, target_language: str, slide_num: int, input_path: Path):
        """
        Translate all text elements in a slide while preserving layout
        
        Args:
            slide: Slide to translate
            target_language: Target language code
+            slide_num: Slide number for reference
+            input_path: Path to source file for image extraction
        """
        # Translate notes (speaker notes)
        if slide.has_notes_slide:
@@ -56,15 +60,16 @@ class PowerPointTranslator:
        
        # Translate shapes in the slide
        for shape in slide.shapes:
-            self._translate_shape(shape, target_language)
+            self._translate_shape(shape, target_language, slide)
    
-    def _translate_shape(self, shape: BaseShape, target_language: str):
+    def _translate_shape(self, shape: BaseShape, target_language: str, slide=None):
        """
        Translate text in a shape based on its type
        
        Args:
            shape: Shape to translate
            target_language: Target language code
+            slide: Parent slide for adding image translations
        """
        # Handle text-containing shapes
        if shape.has_text_frame:
@@ -74,20 +79,72 @@ class PowerPointTranslator:
        if shape.shape_type == MSO_SHAPE_TYPE.TABLE:
            self._translate_table(shape.table, target_language)
        
+        # Handle pictures/images
+        if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
+            self._translate_image_shape(shape, target_language, slide)
+        
        # Handle group shapes (shapes within shapes)
        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
            for sub_shape in shape.shapes:
-                self._translate_shape(sub_shape, target_language)
+                self._translate_shape(sub_shape, target_language, slide)
        
        # Handle smart art (contains multiple shapes)
        # Smart art is complex, but we can try to translate text within it
        if hasattr(shape, 'shapes'):
            try:
                for sub_shape in shape.shapes:
-                    self._translate_shape(sub_shape, target_language)
+                    self._translate_shape(sub_shape, target_language, slide)
            except:
                pass  # Some shapes may not support iteration
    
+    def _translate_image_shape(self, shape, target_language: str, slide):
+        """
+        Translate text found in a picture and add it as a text box below the picture
+
+        Only runs when image translation is enabled and the provider is Ollama.
+
+        Args:
+            shape: Picture shape whose embedded image is passed to translate_image
+            target_language: Target language code
+            slide: Slide that receives the caption text box (nothing is done if None)
+        """
+        if slide is None or not getattr(self.translation_service, 'translate_images', False):
+            return
+
+        from services.translation_service import OllamaTranslationProvider
+
+        provider = self.translation_service.provider
+        if not isinstance(provider, OllamaTranslationProvider):
+            return
+
+        tmp_path = None
+        try:
+            # translate_image is given a file path, so spill the embedded image to disk
+            image = shape.image
+            with tempfile.NamedTemporaryFile(suffix=f'.{image.ext}', delete=False) as tmp:
+                tmp_path = tmp.name
+                tmp.write(image.blob)
+
+            translated_text = provider.translate_image(tmp_path, target_language)
+
+            if translated_text and translated_text.strip():
+                # Caption sits just below the picture and matches its width
+                top = shape.top + shape.height + Inches(0.1)
+                textbox = slide.shapes.add_textbox(shape.left, top, shape.width, Inches(0.5))
+                p = textbox.text_frame.paragraphs[0]
+                p.text = f"[{translated_text}]"
+                p.font.size = Pt(10)
+                p.font.italic = True
+
+                print(f"Added image translation: {translated_text[:50]}...")
+        except Exception as e:
+            # Best effort: a failed image must not abort the whole presentation
+            print(f"Error translating image: {e}")
+        finally:
+            # delete=False above means the temp file must be removed here, even on failure
+            if tmp_path is not None and os.path.exists(tmp_path):
+                os.unlink(tmp_path)
+    
    def _translate_text_frame(self, text_frame, target_language: str):
        """
        Translate text within a text frame while preserving formatting
--- a/translators/word_translator.py
+++ b/translators/word_translator.py
@@ -9,7 +9,11 @@ from docx.table import Table, _Cell
 from docx.oxml.text.paragraph import CT_P
 from docx.oxml.table import CT_Tbl
 from docx.section import Section
+from docx.shared import Inches, Pt
+from docx.oxml.ns import qn
 from services.translation_service import translation_service
+import tempfile
+import os


 class WordTranslator:
@@ -39,11 +43,66 @@ class WordTranslator:
        for section in document.sections:
            self._translate_section(section, target_language)
        
+        # Translate images if enabled
+        if getattr(self.translation_service, 'translate_images', False):
+            self._translate_images(document, target_language, input_path)
+        
        # Save the translated document
        document.save(output_path)
        
        return output_path
    
+    def _translate_images(self, document: Document, target_language: str, input_path: Path):
+        """
+        Translate text found in embedded images and append it to the document
+
+        Images are read from word/media/ in the source .docx; each non-empty translation
+        is appended as a paragraph at the end of the document (Ollama provider only).
+
+        Args:
+            document: Document that receives the translation paragraphs
+            target_language: Target language code
+            input_path: Path to the source .docx file the images are read from
+        """
+        from services.translation_service import OllamaTranslationProvider
+
+        provider = self.translation_service.provider
+        if not isinstance(provider, OllamaTranslationProvider):
+            return
+
+        try:
+            import zipfile
+            # A .docx is a zip archive; embedded media lives under word/media/
+            with zipfile.ZipFile(input_path, 'r') as zip_ref:
+                image_files = [f for f in zip_ref.namelist() if f.startswith('word/media/')]
+
+                for idx, image_file in enumerate(image_files):
+                    tmp_path = None
+                    try:
+                        image_data = zip_ref.read(image_file)
+                        # translate_image is given a file path, so write the image to disk
+                        ext = os.path.splitext(image_file)[1]
+                        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
+                            tmp_path = tmp.name
+                            tmp.write(image_data)
+                        translated_text = provider.translate_image(tmp_path, target_language)
+
+                        if translated_text and translated_text.strip():
+                            p = document.add_paragraph()
+                            p.add_run(f"[Image {idx + 1} translation: ").bold = True
+                            p.add_run(translated_text)
+                            p.add_run("]").bold = True
+
+                            print(f"Translated image {idx + 1}: {translated_text[:50]}...")
+                    except Exception as e:
+                        print(f"Error translating image {image_file}: {e}")
+                    finally:
+                        # delete=False above, so remove the temp file even on failure
+                        if tmp_path is not None and os.path.exists(tmp_path):
+                            os.unlink(tmp_path)
+        except Exception as e:
+            print(f"Error processing images: {e}")
+    
    def _translate_document_body(self, document: Document, target_language: str):
        """
        Translate all elements in the document body