diff --git a/main.py b/main.py index 071c822..9de5608 100644 --- a/main.py +++ b/main.py @@ -111,7 +111,9 @@ async def translate_document( target_language: str = Form(..., description="Target language code (e.g., 'es', 'fr', 'de')"), source_language: str = Form(default="auto", description="Source language code (default: auto-detect)"), provider: str = Form(default="google", description="Translation provider (google, ollama, deepl, libre)"), - translate_images: bool = Form(default=False, description="Translate images with Ollama vision (only for Ollama provider)"), + translate_images: bool = Form(default=False, description="Translate images with multimodal Ollama model"), + ollama_model: str = Form(default="", description="Ollama model to use (also used for vision if multimodal)"), + system_prompt: str = Form(default="", description="Custom system prompt with context, glossary, or instructions for LLM translation"), cleanup: bool = Form(default=True, description="Delete input file after translation") ): """ @@ -154,7 +156,7 @@ async def translate_document( logger.info(f"Saved input file to: {input_path}") # Configure translation provider - from services.translation_service import GoogleTranslationProvider, DeepLTranslationProvider, LibreTranslationProvider, OllamaTranslationProvider, WebLLMTranslationProvider, translation_service + from services.translation_service import GoogleTranslationProvider, DeepLTranslationProvider, LibreTranslationProvider, OllamaTranslationProvider, translation_service if provider.lower() == "deepl": if not config.DEEPL_API_KEY: @@ -163,10 +165,13 @@ async def translate_document( elif provider.lower() == "libre": translation_provider = LibreTranslationProvider() elif provider.lower() == "ollama": - vision_model = getattr(config, 'OLLAMA_VISION_MODEL', 'llava') - translation_provider = OllamaTranslationProvider(config.OLLAMA_BASE_URL, config.OLLAMA_MODEL, vision_model) - elif provider.lower() == "webllm": - translation_provider = 
def __init__(self, base_url: str = "http://localhost:11434", model: str = "llama3", vision_model: str = "llava", system_prompt: str = ""):
    """
    Configure an Ollama-backed translation provider.

    Args:
        base_url: Root URL of the Ollama server; any trailing slash is removed.
        model: Name of the text model to use (surrounding whitespace stripped).
        vision_model: Name of the model used for image translation
            (whitespace stripped; may be the same multimodal model as ``model``).
        system_prompt: Optional caller-supplied context, glossary, or
            instructions appended to the built-in translation system prompt.
    """
    self.base_url = base_url.rstrip('/')
    self.model = model.strip()  # guard against stray whitespace from form input
    self.vision_model = vision_model.strip()
    self.custom_system_prompt = system_prompt  # custom context, glossary, instructions

def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str:
    """
    Translate ``text`` to ``target_language`` via Ollama's ``/api/chat`` endpoint.

    On empty/trivial input or on any connection, timeout, or HTTP error the
    original text is returned unchanged, so a failed call never loses content.

    Args:
        text: Source text to translate.
        target_language: Target language code or name (interpolated into the
            system prompt).
        source_language: Accepted for interface compatibility; the LLM
            auto-detects the source language, so this value is unused here.

    Returns:
        The translated text, or the original text when translation is
        skipped or fails.
    """
    if not text:
        return text
    stripped = text.strip()  # hoisted: shared by all the guards below
    if not stripped:
        return text
    # Skip trivial fragments: single characters and digit-only tokens
    # (page numbers, list markers) are returned untouched.
    if len(stripped) < 2 or stripped.isdigit():
        return text

    try:
        # Build the system prompt, appending caller-supplied context
        # (glossary, tone, domain instructions) when provided.
        base_prompt = f"You are a translator. Translate the user's text to {target_language}. Return ONLY the translation, nothing else."
        if self.custom_system_prompt:
            system_content = f"""{base_prompt}

ADDITIONAL CONTEXT AND INSTRUCTIONS:
{self.custom_system_prompt}"""
        else:
            system_content = base_prompt

        # /api/chat is used instead of /api/generate: it works with chat-tuned
        # models and separates the system and user roles cleanly.
        response = requests.post(
            f"{self.base_url}/api/chat",
            json={
                "model": self.model,
                "messages": [
                    {
                        "role": "system",
                        "content": system_content
                    },
                    {
                        "role": "user",
                        "content": text
                    }
                ],
                "stream": False,
                "options": {
                    "temperature": 0.3,  # low temperature keeps output literal
                    # NOTE(review): a fixed 500-token cap may truncate long
                    # paragraphs — consider scaling with input length.
                    "num_predict": 500
                }
            },
            timeout=120  # local LLMs can be slow; allow up to 2 minutes
        )
        response.raise_for_status()
        result = response.json()
        translated = result.get("message", {}).get("content", "").strip()
        return translated if translated else text
    except requests.exceptions.ConnectionError:
        print(f"Ollama error: Cannot connect to {self.base_url}. Is Ollama running?")
        return text
    except requests.exceptions.Timeout:
        print("Ollama error: Request timeout after 120s")
        return text
    except Exception as e:
        print(f"Ollama translation error: {e}")
        return text
def translate_image(self, image_path: str, target_language: str) -> str:
    """
    Extract and translate the text embedded in an image.

    Vision support is provider-specific: only the Ollama backend can do
    this, and only when image translation has been enabled on the service.

    Args:
        image_path: Filesystem path of the image to process.
        target_language: Language code to translate the image text into.

    Returns:
        The translated text, or an empty string when image translation is
        disabled or the active provider has no vision capability.
    """
    # Short-circuit: the isinstance check is only evaluated when the
    # feature flag is on, matching the original guard order.
    if self.translate_images and isinstance(self.provider, OllamaTranslationProvider):
        return self.provider.translate_image(image_path, target_language)
    return ""
- +
@@ -318,6 +318,39 @@
+ +
+

Translation Context (Ollama / WebLLM)

+

+ Provide context, technical glossary, or specific instructions to improve translation quality. +

+ +
+ + +
+ +
+ + +
+ +
+ + + + + +
+
+

Document Translation

@@ -335,6 +368,8 @@
+ @@ -363,11 +398,11 @@ -