Files
office_translator/scripts/migrate_glossaries_to_multilingual_by_translation.py
sepehr dde80f6bc3
All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 2m43s
feat(glossaries): update script to translate missing terms in any glossary, even if target_language is already 'multi'
2026-06-28 10:46:48 +02:00

104 lines
4.3 KiB
Python

#!/usr/bin/env python3
"""
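Convert every glossary in the database to a multilingual glossary and fill in missing translations.
Glossaries whose target_language is already 'multi' are processed too, so gaps in their terms get filled.
It uses the free Google Translate provider to fill in the missing translations for the target languages
(de, es, it, pt, nl, ru, ja, ko, zh, ar, fa) in the glossary_terms table; the 'en' entry is copied
from each term's existing target value rather than machine-translated.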
"""
import sys
import logging
from pathlib import Path
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
from database.connection import get_sync_session
from database.models import Glossary, GlossaryTerm
from services.providers.google_provider import get_legacy_google_adapter
# Configure the root logger once: INFO level, timestamped and level-tagged records.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("translate_non_multilingual_glossaries")

# Language codes every glossary term is checked against. 'en' is intentionally
# absent: main() fills it from the term's existing target value instead.
TARGET_LANGUAGES = [
    "de", "es", "it", "pt", "nl",
    "ru", "ja", "ko", "zh", "ar", "fa",
]
def translate_term(adapter, text_to_translate: str, src_lang: str, target_langs: list[str]) -> dict:
    """Translate one glossary term into each requested language.

    Args:
        adapter: Object exposing
            ``translate(text, target_language=..., source_language=...)``.
        text_to_translate: Source text of the term.
        src_lang: Language code of ``text_to_translate``.
        target_langs: Language codes to translate into.

    Returns:
        A dict with one entry per code in ``target_langs``. The value is the
        stripped translation, or ``""`` when the source text is blank or the
        translation attempt failed for that language.
    """
    # A blank (or missing) term has nothing to translate: skip the provider
    # round-trips instead of issuing one doomed request per language.
    if not text_to_translate or not text_to_translate.strip():
        return {lang: "" for lang in target_langs}

    translations = {}
    for lang in target_langs:
        try:
            translated = adapter.translate(text_to_translate, target_language=lang, source_language=src_lang)
            translations[lang] = translated.strip()
        except Exception as e:
            # Best effort: one failing language must not abort the others.
            # The empty string marks the language as still missing.
            logger.error("Failed to translate '%s' to %s: %s", text_to_translate, lang, e)
            translations[lang] = ""
    return translations
def _multilingual_name(name: str) -> str:
    """Return *name* relabelled to show that the glossary is now multilingual."""
    # Only the first marker that is present gets replaced, "Anglais" before "English".
    for marker in ("Anglais", "English"):
        if marker in name:
            return name.replace(marker, "Multilingue")
    return f"{name} → Multilingue"


def main() -> int:
    """Convert every glossary to 'multi' and fill in missing term translations.

    For each glossary: set ``target_language`` to ``'multi'`` (renaming the
    glossary when it was not already), translate each term's source text into
    every language of ``TARGET_LANGUAGES`` that has no non-empty entry yet,
    and copy the term's existing target into the ``'en'`` slot when that slot
    is empty. Changes are committed once per glossary.

    Returns:
        ``0`` in all cases; used as the process exit status.
    """
    logger.info("Connecting to database and fetching all glossaries...")
    adapter = get_legacy_google_adapter()

    with get_sync_session() as session:
        all_glossaries = session.query(Glossary).all()
        if not all_glossaries:
            logger.info("No glossaries found in database.")
            return 0

        logger.info("Found %d glossaries to process.", len(all_glossaries))

        for glossary in all_glossaries:
            logger.info("Processing glossary '%s' (ID: %s)", glossary.name, glossary.id)

            # Tracks whether anything on this glossary or its terms changed, so
            # that metadata-only changes are committed too.
            needs_commit = False

            if glossary.target_language != 'multi':
                glossary.target_language = 'multi'
                glossary.name = _multilingual_name(glossary.name)
                needs_commit = True

            src_lang = glossary.source_language or "fr"
            terms = session.query(GlossaryTerm).filter(GlossaryTerm.glossary_id == glossary.id).all()
            logger.info("Checking translations for %d terms in '%s'...", len(terms), glossary.name)

            updated_terms_count = 0
            for idx, term in enumerate(terms, start=1):
                stored = term.translations or {}
                # Work on a copy: mutating the loaded dict in place and then
                # re-assigning the same object is not seen as a change by a
                # plain SQLAlchemy JSON column, so the update would be dropped.
                # NOTE(review): harmless if the column already uses MutableDict.
                translations = dict(stored)

                # A language counts as missing when absent or stored as empty.
                missing_langs = [lang for lang in TARGET_LANGUAGES if not translations.get(lang)]
                if missing_langs:
                    logger.info(
                        " [%d/%d] Translating '%s' to %s...",
                        idx, len(terms), term.source, missing_langs,
                    )
                    translations.update(translate_term(adapter, term.source, src_lang, missing_langs))
                    updated_terms_count += 1

                # Keep the pre-existing target under 'en'.
                # NOTE(review): assumes the legacy target language was English — confirm.
                if not translations.get('en') and term.target:
                    translations['en'] = term.target

                if translations != stored:
                    term.translations = translations
                    needs_commit = True

            if needs_commit:
                session.commit()

            if updated_terms_count > 0:
                logger.info(
                    "Glossary '%s' updated with %d translated terms.",
                    glossary.name, updated_terms_count,
                )
            else:
                logger.info("No translations were missing for glossary '%s'.", glossary.name)

    logger.info("Migration complete!")
    return 0
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())