Files
office_translator/scripts/migrate_glossaries_to_multilingual_by_translation.py
sepehr 7398cae359
All checks were successful
Deploy to Production / Build and Deploy (push) Successful in 2m25s
feat(glossaries): add script to translate non-multilingual database glossaries using Google Translate adapter
2026-06-28 10:42:41 +02:00

98 lines
4.1 KiB
Python

#!/usr/bin/env python3
"""
Migrate all non-multilingual glossaries in the database to multilingual glossaries.
It uses the free Google Translate provider to fill in the missing translations for the
target languages (de, es, it, pt, nl, ru, ja, ko, zh, ar, fa) in the glossary_terms table.
The 'en' entry is not machine-translated: it is copied from each term's existing target value.
"""
import sys
import logging
from pathlib import Path
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
from database.connection import get_sync_session
from database.models import Glossary, GlossaryTerm
from services.providers.google_provider import get_legacy_google_adapter
# Language codes that main() asks the translation adapter to fill in for each
# term. 'en' is not listed: main() seeds it from the term's existing target.
TARGET_LANGUAGES = [
    "de",
    "es",
    "it",
    "pt",
    "nl",
    "ru",
    "ja",
    "ko",
    "zh",
    "ar",
    "fa",
]

# Root logging configuration for this script: INFO and above, timestamped.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("translate_non_multilingual_glossaries")
def translate_term(adapter, text_to_translate: str, src_lang: str, target_langs: list[str]) -> dict:
    """Translate one glossary term into each requested language.

    Args:
        adapter: Object exposing ``translate(text, target_language=...,
            source_language=...)`` and returning the translated string.
        text_to_translate: Source text of the term.
        src_lang: Language code of ``text_to_translate``.
        target_langs: Language codes to translate into.

    Returns:
        A dict mapping every code in ``target_langs`` to the stripped
        translation, or to ``""`` when the text is empty/blank or the
        translation of that language failed.
    """
    # Nothing to translate: skip the adapter calls entirely and report the
    # same empty value a failed translation would produce.
    if not text_to_translate or not text_to_translate.strip():
        return {lang: "" for lang in target_langs}

    translations = {}
    for lang in target_langs:
        try:
            translated = adapter.translate(
                text_to_translate, target_language=lang, source_language=src_lang
            )
            translations[lang] = translated.strip()
        except Exception as e:
            # Deliberately best-effort: one failing language must not abort
            # the remaining languages, so log it and record an empty value.
            logger.error("Failed to translate '%s' to %s: %s", text_to_translate, lang, e)
            translations[lang] = ""
    return translations
def main():
    """Convert every non-multilingual glossary into a multilingual one.

    For each glossary whose ``target_language`` is not ``'multi'``: set it to
    ``'multi'``, rename it, fill the missing ``TARGET_LANGUAGES`` entries of
    every term through ``translate_term``, seed ``'en'`` from the term's
    existing ``target`` and commit once per glossary.

    Returns:
        0 in all cases, suitable for use as the process exit status.
    """
    logger.info("Connecting to database and fetching non-multilingual glossaries...")
    adapter = get_legacy_google_adapter()

    with get_sync_session() as session:
        # Find glossaries where target_language != 'multi'
        non_multi_glossaries = (
            session.query(Glossary).filter(Glossary.target_language != 'multi').all()
        )
        if not non_multi_glossaries:
            logger.info("All glossaries in database are already multilingual ('multi').")
            return 0

        logger.info(
            "Found %s non-multilingual glossaries to convert.", len(non_multi_glossaries)
        )

        for glossary in non_multi_glossaries:
            logger.info("Processing glossary '%s' (ID: %s)", glossary.name, glossary.id)

            # NOTE(review): the original target language is discarded here,
            # yet term.target is stored under 'en' below. That is only right
            # for glossaries whose original target was English -- confirm.
            glossary.target_language = 'multi'

            # Rename the glossary to indicate it is now multilingual
            # (e.g. replace "Anglais" with "Multilingue").
            if "Anglais" in glossary.name:
                glossary.name = glossary.name.replace("Anglais", "Multilingue")
            elif "English" in glossary.name:
                glossary.name = glossary.name.replace("English", "Multilingue")
            else:
                glossary.name = f"{glossary.name} → Multilingue"

            src_lang = glossary.source_language or "fr"
            terms = (
                session.query(GlossaryTerm)
                .filter(GlossaryTerm.glossary_id == glossary.id)
                .all()
            )
            total = len(terms)
            logger.info("Translating %s terms for '%s'...", total, glossary.name)

            for position, term in enumerate(terms, start=1):
                # Work on a copy: mutating the loaded dict in place and then
                # assigning the very same object back is not seen as a change
                # by the ORM for a plain JSON column, so the update could be
                # silently skipped for terms that already had translations.
                translations = dict(term.translations or {})

                # Only request the languages that are absent or empty.
                missing_langs = [
                    lang for lang in TARGET_LANGUAGES if not translations.get(lang)
                ]
                if missing_langs:
                    logger.info(
                        "  [%d/%d] Translating '%s' to %s...",
                        position,
                        total,
                        term.source,
                        missing_langs,
                    )
                    translations.update(
                        translate_term(adapter, term.source, src_lang, missing_langs)
                    )

                # Ensure the original default target (e.g. English translation)
                # is in the translations dict under 'en'.
                if not translations.get('en'):
                    translations['en'] = term.target

                term.translations = translations

            # One commit per glossary, so an interruption keeps the glossaries
            # that were already fully processed.
            session.commit()
            logger.info(
                "Glossary '%s' successfully converted to multilingual.", glossary.name
            )

    logger.info("Migration complete!")
    return 0
if __name__ == '__main__':
    # Script entry point: main()'s return value becomes the exit status.
    raise SystemExit(main())