commit 793d94c93e7352f5de2be3f8072e474d85c2c656 Author: Sepehr Date: Sun Nov 30 10:48:58 2025 +0100 Initial commit: Document Translation API with Excel, Word, PowerPoint support diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..2975d2f --- /dev/null +++ b/.env.example @@ -0,0 +1,8 @@ +# Translation Service Configuration +TRANSLATION_SERVICE=google # Options: google, deepl, libre +DEEPL_API_KEY=your_deepl_api_key_here + +# API Configuration +MAX_FILE_SIZE_MB=50 +UPLOAD_DIR=./uploads +OUTPUT_DIR=./outputs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..25cd6b6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,53 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# Virtual Environment +venv/ +env/ +ENV/ + +# Environment variables +.env + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# Uploads and outputs +uploads/ +outputs/ +temp/ +translated_files/ +translated_test.* + +# Logs +*.log + +# UV / UV lock +.venv/ +uv.lock + +# Test files +test_*.py +test_*.ipynb diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..e4fba21 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.12 diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 0000000..de1e64f --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,325 @@ +# Document Translation API - Architecture Overview + +## System Architecture + +``` +┌─────────────────────────────────────────────────────────────┐ +│ FastAPI Application │ +│ (main.py) │ +└─────────────────────┬───────────────────────────────────────┘ + │ + ├──> File Upload Endpoint (/translate) + │ ├─> File Validation + │ ├─> File Type Detection + │ └─> Route to Appropriate Translator + │ + ├──> Batch Translation (/translate-batch) + │ + └──> Utility Endpoints + ├─> /health + ├─> /languages + └─> /download/{filename} + 
+┌─────────────────────────────────────────────────────────────┐ +│ Translation Layer │ +└─────────────────────┬───────────────────────────────────────┘ + │ + ┌─────────────┼─────────────┐ + │ │ │ + ▼ ▼ ▼ + Excel Word PowerPoint + Translator Translator Translator + (.xlsx) (.docx) (.pptx) + │ │ │ + └─────────────┼─────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ Translation Service Abstraction │ +│ (Pluggable Backend) │ +└─────────────────────┬───────────────────────────────────────┘ + │ + ┌─────────────┼─────────────┐ + ▼ ▼ ▼ + Google DeepL LibreTranslate + Translate (API Key) (Self-hosted) +``` + +## Component Breakdown + +### 1. API Layer (`main.py`) +- **FastAPI Application**: RESTful API endpoints +- **File Upload Handling**: Multipart form data processing +- **Request Validation**: Pydantic models for type safety +- **Error Handling**: Custom exception handlers +- **CORS Configuration**: Cross-origin resource sharing + +### 2. Translation Coordinators + +#### Excel Translator (`translators/excel_translator.py`) +``` +Input: .xlsx file +Process: + 1. Load workbook with openpyxl (preserve VBA, formulas) + 2. Iterate through all worksheets + 3. For each cell: + - Detect type (text, formula, number) + - If text: translate + - If formula: extract and translate strings + - Preserve: formatting, colors, borders, merges + 4. Translate sheet names + 5. Maintain image positions +Output: Translated .xlsx with identical structure +``` + +#### Word Translator (`translators/word_translator.py`) +``` +Input: .docx file +Process: + 1. Load document with python-docx + 2. Traverse document tree: + - Paragraphs → Runs (preserve formatting per run) + - Tables → Cells → Paragraphs + - Headers/Footers (all section types) + 3. Translate text while preserving: + - Font family, size, color + - Bold, italic, underline + - Lists (numbered/bulleted) + - Styles (Heading 1, Normal, etc.) + 4. 
Images remain embedded via relationships +Output: Translated .docx with preserved layout +``` + +#### PowerPoint Translator (`translators/pptx_translator.py`) +``` +Input: .pptx file +Process: + 1. Load presentation with python-pptx + 2. For each slide: + - Shapes → Text Frames → Paragraphs → Runs + - Tables → Cells → Text Frames + - Groups → Nested Shapes + - Speaker Notes + 3. Preserve: + - Slide layouts + - Animations (timing, effects) + - Transitions + - Image positions and layering + - Shape properties (size, position, rotation) +Output: Translated .pptx with identical design +``` + +### 3. Translation Service Layer + +**Abstract Interface**: `TranslationProvider` +- Allows swapping translation backends without changing translators +- Configurable via environment variables + +**Implementations**: +1. **Google Translator** (Default, Free) + - Uses deep-translator library + - No API key required + - Rate limited + +2. **DeepL** (Premium, API Key Required) + - Higher quality translations + - Better context understanding + - Requires paid API key + +3. **LibreTranslate** (Self-hosted) + - Open-source alternative + - Full control and privacy + - Requires local installation + +### 4. Utility Layer + +#### File Handler (`utils/file_handler.py`) +- File validation (size, type) +- Unique filename generation (UUID-based) +- Safe file operations +- Cleanup management + +#### Exception Handling (`utils/exceptions.py`) +- Custom exception types +- HTTP status code mapping +- User-friendly error messages + +### 5. Configuration (`config.py`) +- Environment variable loading +- Directory management +- Service configuration +- Validation rules + +## Data Flow + +### Single Document Translation +``` +1. Client uploads file via POST /translate + └─> File + target_language + source_language + +2. API validates request + ├─> Check file extension + ├─> Verify file size + └─> Validate language codes + +3. Save to temporary storage + └─> uploads/{unique_id}_{filename} + +4. 
Route to appropriate translator + ├─> .xlsx → ExcelTranslator + ├─> .docx → WordTranslator + └─> .pptx → PowerPointTranslator + +5. Translator processes document + ├─> Parse structure + ├─> Extract text elements + ├─> Call translation service for each text + ├─> Apply translations while preserving formatting + └─> Save to outputs/{unique_id}_translated_{filename} + +6. Return translated file + └─> FileResponse with download headers + +7. Cleanup (optional) + └─> Delete uploaded file +``` + +## Formatting Preservation Strategies + +### Excel +- **Cell Properties**: Copied before translation +- **Merged Cells**: Detected via `cell.merge_cells` +- **Formulas**: Regex parsing to extract strings +- **Images**: Anchored to cells, preserved via relationships +- **Charts**: Remain linked to data ranges + +### Word +- **Run-level Translation**: Preserves inline formatting +- **Style Inheritance**: Paragraph styles maintained +- **Tables**: Structure preserved, cells translated individually +- **Images**: Embedded via relationships, not modified +- **Headers/Footers**: Treated as separate sections + +### PowerPoint +- **Shape Hierarchy**: Recursive traversal +- **Text Frames**: Paragraph and run-level translation +- **Layouts**: Template references preserved +- **Animations**: Stored separately, not affected +- **Media**: File references remain intact + +## Scalability Considerations + +### Horizontal Scaling +- Stateless design (no session storage) +- Files stored on disk (can move to S3/Azure Blob) +- Load balancer compatible + +### Performance Optimization +- **Async I/O**: FastAPI's async capabilities +- **Batch Processing**: Multiple files in parallel +- **Caching**: Translation cache for repeated text +- **Streaming**: Large file chunking (future enhancement) + +### Resource Management +- **File Cleanup**: Automatic deletion after translation +- **Size Limits**: Configurable max file size +- **Rate Limiting**: Prevent API abuse +- **Queue System**: Redis-based job 
queue (future) + +## Future MCP Integration + +### MCP Server Wrapper +The API is designed to be wrapped as an MCP server: + +```python +# MCP Tools +1. translate_document(file_path, target_lang) → translated_file +2. get_supported_languages() → language_list +3. check_api_health() → status + +# Benefits +- AI assistants can translate documents seamlessly +- Integration with Claude, GPT, and other LLMs +- Workflow automation in AI pipelines +``` + +## Security Architecture + +### Input Validation +- File type whitelist +- Size restrictions +- Extension verification +- Content-type checking + +### File Isolation +- Unique filenames (UUID) +- Temporary storage +- Automatic cleanup +- No path traversal + +### API Security (Production) +- Rate limiting (not yet implemented) +- Authentication/Authorization (future) +- HTTPS/TLS encryption (deployment config) +- Input sanitization + +## Deployment Architecture + +### Development +``` +Local Machine +├─> Python 3.11+ +├─> Virtual Environment +├─> SQLite (if needed for tracking) +└─> Local file storage +``` + +### Production (Recommended) +``` +Cloud Platform (AWS/Azure/GCP) +├─> Container (Docker) +├─> Load Balancer +├─> Multiple API Instances +├─> Object Storage (S3/Blob) +├─> Redis (caching/queue) +├─> Monitoring (Prometheus/Grafana) +└─> Logging (ELK Stack) +``` + +## Technology Stack + +| Layer | Technology | Purpose | +|-------|------------|---------| +| API Framework | FastAPI | High-performance async API | +| Excel Processing | openpyxl | Full Excel feature support | +| Word Processing | python-docx | DOCX manipulation | +| PowerPoint Processing | python-pptx | PPTX handling | +| Translation | deep-translator | Multi-provider abstraction | +| Server | Uvicorn | ASGI server | +| Validation | Pydantic | Request/response validation | + +## Extension Points + +1. **Add Translation Provider** + - Implement `TranslationProvider` interface + - Register in `translation_service.py` + +2. 
**Add Document Type** + - Create new translator class + - Register in routing logic + - Add to supported extensions + +3. **Add MCP Server** + - Use provided `mcp_server_example.py` + - Configure in MCP settings + - Deploy alongside API + +4. **Add Caching** + - Implement translation cache + - Use Redis or in-memory cache + - Reduce API calls for repeated text + +5. **Add Queue System** + - Implement Celery/RQ workers + - Handle long-running translations + - Provide job status endpoints diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md new file mode 100644 index 0000000..48fc13a --- /dev/null +++ b/DEPLOYMENT.md @@ -0,0 +1,78 @@ +# Development and Production Setup Scripts + +## Start the API Server + +### Development Mode (with auto-reload) +```powershell +# Activate virtual environment +.\venv\Scripts\Activate.ps1 + +# Start server with hot-reload +python main.py +``` + +### Production Mode +```powershell +# Activate virtual environment +.\venv\Scripts\Activate.ps1 + +# Start with uvicorn (better performance) +uvicorn main:app --host 0.0.0.0 --port 8000 --workers 4 +``` + +## Docker Deployment (Optional) + +### Create Dockerfile +```dockerfile +FROM python:3.11-slim + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +# Create directories +RUN mkdir -p uploads outputs temp + +EXPOSE 8000 + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] +``` + +### Build and Run +```powershell +# Build image +docker build -t document-translator-api . 
+ +# Run container +docker run -d -p 8000:8000 -v ${PWD}/uploads:/app/uploads -v ${PWD}/outputs:/app/outputs document-translator-api +``` + +## Environment Variables for Production + +```env +TRANSLATION_SERVICE=google +DEEPL_API_KEY=your_production_api_key +MAX_FILE_SIZE_MB=100 +UPLOAD_DIR=/app/uploads +OUTPUT_DIR=/app/outputs +``` + +## Monitoring and Logging + +Add to requirements.txt for production: +``` +prometheus-fastapi-instrumentator==6.1.0 +python-json-logger==2.0.7 +``` + +## Security Hardening + +1. Add rate limiting +2. Implement authentication (JWT/API keys) +3. Enable HTTPS/TLS +4. Sanitize file uploads +5. Implement virus scanning for uploads +6. Add request validation middleware diff --git a/QUICKSTART.md b/QUICKSTART.md new file mode 100644 index 0000000..ea78be0 --- /dev/null +++ b/QUICKSTART.md @@ -0,0 +1,230 @@ +# 🚀 Quick Start Guide - Document Translation API + +## Step-by-Step Setup (5 Minutes) + +### 1️⃣ Open PowerShell in Project Directory +```powershell +cd d:\Translate +``` + +### 2️⃣ Run the Startup Script +```powershell +.\start.ps1 +``` + +This will automatically: +- Create a virtual environment +- Install all dependencies +- Create necessary directories +- Start the API server + +### 3️⃣ Test the API + +**Open another PowerShell window** and run: +```powershell +python test_api.py +``` + +Or visit in your browser: +- **API Documentation**: http://localhost:8000/docs +- **API Status**: http://localhost:8000/health + +## 📤 Translate Your First Document + +### Using PowerShell (Invoke-RestMethod) +```powershell +$file = Get-Item "your_document.xlsx" +Invoke-RestMethod -Uri "http://localhost:8000/translate" ` + -Method Post ` + -Form @{ + file = $file + target_language = "es" + } ` + -OutFile "translated_document.xlsx" +``` + +### Using Python +```python +import requests + +with open('document.docx', 'rb') as f: + response = requests.post( + 'http://localhost:8000/translate', + files={'file': f}, + data={'target_language': 'fr'} + ) + +with 
open('translated_document.docx', 'wb') as out: + out.write(response.content) +``` + +### Using the Interactive API Docs + +1. Go to http://localhost:8000/docs +2. Click on **POST /translate** +3. Click **"Try it out"** +4. Upload your file +5. Enter target language (e.g., `es` for Spanish) +6. Click **"Execute"** +7. Download the translated file + +## 🌍 Supported Languages + +Use these language codes in the `target_language` parameter: + +| Code | Language | Code | Language | +|------|----------|------|----------| +| `es` | Spanish | `fr` | French | +| `de` | German | `it` | Italian | +| `pt` | Portuguese | `ru` | Russian | +| `zh` | Chinese | `ja` | Japanese | +| `ko` | Korean | `ar` | Arabic | +| `hi` | Hindi | `nl` | Dutch | + +**Full list**: http://localhost:8000/languages + +## 📋 Supported File Types + +| Format | Extension | What's Preserved | +|--------|-----------|------------------| +| **Excel** | `.xlsx` | Formulas, merged cells, colors, borders, images | +| **Word** | `.docx` | Styles, tables, headers/footers, images | +| **PowerPoint** | `.pptx` | Layouts, animations, transitions, media | + +## 🔧 Configuration + +Edit `.env` file to customize: + +```env +# Translation service: google (free) or deepl (requires API key) +TRANSLATION_SERVICE=google + +# For DeepL (premium translation) +DEEPL_API_KEY=your_api_key_here + +# Maximum file size in MB +MAX_FILE_SIZE_MB=50 +``` + +## ⚠️ Troubleshooting + +### Issue: "Virtual environment activation failed" +**Solution**: +```powershell +Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser +``` + +### Issue: "Module not found" +**Solution**: +```powershell +.\venv\Scripts\Activate.ps1 +pip install -r requirements.txt +``` + +### Issue: "Port 8000 already in use" +**Solution**: +Edit `main.py` line 307: +```python +uvicorn.run(app, host="0.0.0.0", port=8001, reload=True) +``` + +### Issue: "Translation quality is poor" +**Solution**: +1. Get a DeepL API key from https://www.deepl.com/pro-api +2. 
Update `.env`: + ```env + TRANSLATION_SERVICE=deepl + DEEPL_API_KEY=your_key_here + ``` + +## 📦 Project Structure + +``` +Translate/ +├── main.py # FastAPI application (START HERE) +├── config.py # Configuration management +├── start.ps1 # Startup script (RUN THIS FIRST) +├── test_api.py # Testing script +│ +├── services/ # Translation service layer +│ ├── __init__.py +│ └── translation_service.py # Pluggable translation backend +│ +├── translators/ # Document-specific translators +│ ├── __init__.py +│ ├── excel_translator.py # Excel (.xlsx) handler +│ ├── word_translator.py # Word (.docx) handler +│ ├── pptx_translator.py # PowerPoint (.pptx) handler +│ └── excel_advanced.py # Advanced Excel features +│ +├── utils/ # Utility modules +│ ├── __init__.py +│ ├── file_handler.py # File operations +│ └── exceptions.py # Error handling +│ +├── requirements.txt # Python dependencies +├── README.md # Full documentation +├── ARCHITECTURE.md # Technical architecture +└── DEPLOYMENT.md # Production deployment guide +``` + +## 🎯 Next Steps + +### For Development +1. ✅ Run `start.ps1` to start the server +2. ✅ Test with `test_api.py` +3. ✅ Try translating sample documents +4. Read `ARCHITECTURE.md` for technical details + +### For Production +1. Read `DEPLOYMENT.md` for production setup +2. Configure environment variables +3. Set up Docker container +4. Enable authentication and rate limiting + +### For MCP Integration +1. Install MCP requirements: `pip install -r requirements-mcp.txt` +2. Review `mcp_server_example.py` +3. 
Configure MCP server in your AI assistant + +## 📞 API Endpoints Reference + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/` | GET | API information | +| `/health` | GET | Health check | +| `/languages` | GET | Supported languages | +| `/translate` | POST | Translate single document | +| `/translate-batch` | POST | Translate multiple documents | +| `/download/{filename}` | GET | Download translated file | +| `/cleanup/{filename}` | DELETE | Delete translated file | + +## 💡 Tips & Best Practices + +1. **File Size**: Keep files under 50MB for best performance +2. **Format Preservation**: More complex formatting = longer processing time +3. **Language Codes**: Use ISO 639-1 codes (2 letters) +4. **Cleanup**: Enable cleanup to save disk space +5. **Batch Translation**: Use batch endpoint for multiple files + +## 🌟 Features Highlights + +✨ **Zero Data Loss**: All formatting, colors, styles preserved +✨ **Formula Intelligence**: Translates text in formulas, keeps logic +✨ **Image Preservation**: Embedded media stays in exact positions +✨ **Smart Translation**: Auto-detects source language +✨ **MCP Ready**: Designed for AI assistant integration + +## 📄 License + +MIT License - Free to use and modify + +## 🤝 Support + +- **Documentation**: See `README.md` for full details +- **Issues**: Open an issue on the repository +- **Architecture**: Read `ARCHITECTURE.md` for technical depth + +--- + +**Ready to translate? 
Run `.\start.ps1` and visit http://localhost:8000/docs** 🚀 diff --git a/QUICK_REFERENCE.md b/QUICK_REFERENCE.md new file mode 100644 index 0000000..89b3661 --- /dev/null +++ b/QUICK_REFERENCE.md @@ -0,0 +1,214 @@ +# 📋 QUICK REFERENCE CARD + +## 🚀 Start Server +```powershell +.\start.ps1 +``` +Or manually: +```powershell +.\venv\Scripts\Activate.ps1 +python main.py +``` + +## 🌐 API URLs +| Endpoint | URL | +|----------|-----| +| Swagger Docs | http://localhost:8000/docs | +| ReDoc | http://localhost:8000/redoc | +| Health Check | http://localhost:8000/health | +| Languages | http://localhost:8000/languages | + +## 📤 Translate Document + +### PowerShell +```powershell +$file = Get-Item "document.xlsx" +Invoke-RestMethod -Uri "http://localhost:8000/translate" ` + -Method Post ` + -Form @{file=$file; target_language="es"} ` + -OutFile "translated.xlsx" +``` + +### Python +```python +import requests +with open('doc.xlsx', 'rb') as f: + r = requests.post('http://localhost:8000/translate', + files={'file': f}, + data={'target_language': 'es'}) +with open('translated.xlsx', 'wb') as out: + out.write(r.content) +``` + +### cURL +```bash +curl -X POST "http://localhost:8000/translate" \ + -F "file=@document.xlsx" \ + -F "target_language=es" \ + --output translated.xlsx +``` + +## 🌍 Language Codes +| Code | Language | Code | Language | +|------|----------|------|----------| +| `es` | Spanish | `fr` | French | +| `de` | German | `it` | Italian | +| `pt` | Portuguese | `ru` | Russian | +| `zh` | Chinese | `ja` | Japanese | +| `ko` | Korean | `ar` | Arabic | +| `hi` | Hindi | `nl` | Dutch | + +[Full list: http://localhost:8000/languages] + +## 📄 Supported Formats +- `.xlsx` - Excel (formulas, formatting, images) +- `.docx` - Word (styles, tables, images) +- `.pptx` - PowerPoint (layouts, animations, media) + +## ⚙️ Configuration (.env) +```env +TRANSLATION_SERVICE=google # or: deepl, libre +DEEPL_API_KEY=your_key # if using DeepL +MAX_FILE_SIZE_MB=50 # max upload size +``` + +## 
📁 Project Structure +``` +Translate/ +├── main.py # API application +├── config.py # Configuration +├── start.ps1 # Startup script +│ +├── services/ # Translation services +│ └── translation_service.py +│ +├── translators/ # Format handlers +│ ├── excel_translator.py +│ ├── word_translator.py +│ └── pptx_translator.py +│ +├── utils/ # Utilities +│ ├── file_handler.py +│ └── exceptions.py +│ +└── [docs]/ # Documentation + ├── README.md + ├── QUICKSTART.md + ├── ARCHITECTURE.md + └── DEPLOYMENT.md +``` + +## 🧪 Testing +```powershell +# Test API +python test_api.py + +# Run examples +python examples.py +``` + +## 🔧 Troubleshooting + +### Port in use +```python +# Edit main.py line 307: +uvicorn.run(app, host="0.0.0.0", port=8001) +``` + +### Module not found +```powershell +.\venv\Scripts\Activate.ps1 +pip install -r requirements.txt +``` + +### Execution policy (Windows) +```powershell +Set-ExecutionPolicy RemoteSigned -Scope CurrentUser +``` + +## 📊 API Response Headers +``` +X-Original-Filename: document.xlsx +X-File-Size-MB: 2.5 +X-Target-Language: es +``` + +## 🎯 Common Tasks + +### Check API Status +```powershell +curl http://localhost:8000/health +``` + +### List Languages +```powershell +curl http://localhost:8000/languages +``` + +### Download File +```powershell +curl http://localhost:8000/download/filename.xlsx -o local.xlsx +``` + +### Cleanup File +```powershell +curl -X DELETE http://localhost:8000/cleanup/filename.xlsx +``` + +## 💡 Tips +- Use `auto` for source language auto-detection +- Set `cleanup=true` to delete uploads automatically +- Max file size: 50MB (configurable) +- Processing time: ~1-5 seconds per document + +## 📚 Documentation Files +| File | Purpose | +|------|---------| +| `QUICKSTART.md` | 5-minute setup guide | +| `README.md` | Complete documentation | +| `ARCHITECTURE.md` | Technical design | +| `DEPLOYMENT.md` | Production setup | +| `CHECKLIST.md` | Feature checklist | +| `PROJECT_SUMMARY.md` | Project overview | + +## 🔌 MCP 
Integration +```powershell +# Install MCP dependencies +pip install -r requirements-mcp.txt + +# Run MCP server +python mcp_server_example.py +``` + +## 📞 Quick Commands + +| Command | Purpose | +|---------|---------| +| `.\start.ps1` | Start API server | +| `python test_api.py` | Test API | +| `python examples.py` | Run examples | +| `pip install -r requirements.txt` | Install deps | + +## 🎨 Format Preservation + +### Excel +✅ Formulas, merged cells, fonts, colors, borders, images + +### Word +✅ Styles, headings, lists, tables, headers/footers, images + +### PowerPoint +✅ Layouts, animations, transitions, media, positioning + +--- + +## 🚀 QUICK START +```powershell +cd d:\Translate +.\start.ps1 +# Visit: http://localhost:8000/docs +``` + +--- + +**Print this card for quick reference! 📋** diff --git a/README.md b/README.md new file mode 100644 index 0000000..7e25517 --- /dev/null +++ b/README.md @@ -0,0 +1,303 @@ +# Document Translation API + +A powerful Python API for translating complex structured documents (Excel, Word, PowerPoint) while **strictly preserving** the original formatting, layout, and embedded media. 
+ +## 🎯 Features + +### Excel Translation (.xlsx) +- ✅ Translates all cell content and sheet names +- ✅ Preserves cell merging +- ✅ Maintains font styles (size, bold, italic, color) +- ✅ Keeps background colors and borders +- ✅ Translates text within formulas while preserving formula structure +- ✅ Retains embedded images in original positions + +### Word Translation (.docx) +- ✅ Translates body text, headers, footers, and tables +- ✅ Preserves heading styles and paragraph formatting +- ✅ Maintains lists (numbered/bulleted) +- ✅ Keeps embedded images, charts, and SmartArt in place +- ✅ Preserves table structures and cell formatting + +### PowerPoint Translation (.pptx) +- ✅ Translates slide titles, body text, and speaker notes +- ✅ Preserves slide layouts and transitions +- ✅ Maintains animations +- ✅ Keeps images, videos, and shapes in exact positions +- ✅ Preserves layering order + +## 🚀 Quick Start + +### Installation + +1. **Clone the repository:** +```powershell +git clone YOUR_REPOSITORY_URL +cd Translate +``` + +2. **Create a virtual environment:** +```powershell +python -m venv venv +.\venv\Scripts\Activate.ps1 +``` + +3. **Install dependencies:** +```powershell +pip install -r requirements.txt +``` + +4. **Configure environment:** +```powershell +cp .env.example .env +# Edit .env with your preferred settings +``` + +5. 
**Run the API:** +```powershell +python main.py +``` + +The API will start on `http://localhost:8000` + +## 📚 API Documentation + +Once the server is running, visit: +- **Swagger UI**: http://localhost:8000/docs +- **ReDoc**: http://localhost:8000/redoc + +## 🔧 API Endpoints + +### POST /translate +Translate a single document + +**Request:** +```bash +curl -X POST "http://localhost:8000/translate" \ + -F "file=@document.xlsx" \ + -F "target_language=es" \ + -F "source_language=auto" +``` + +**Response:** +Returns the translated document file + +### POST /translate-batch +Translate multiple documents at once + +**Request:** +```bash +curl -X POST "http://localhost:8000/translate-batch" \ + -F "files=@document1.docx" \ + -F "files=@document2.pptx" \ + -F "target_language=fr" +``` + +### GET /languages +Get list of supported language codes + +### GET /health +Health check endpoint + +## 💻 Usage Examples + +### Python Example + +```python +import requests + +# Translate a document +with open('document.xlsx', 'rb') as f: + files = {'file': f} + data = { + 'target_language': 'es', + 'source_language': 'auto' + } + response = requests.post('http://localhost:8000/translate', files=files, data=data) + + # Save translated file + with open('translated_document.xlsx', 'wb') as output: + output.write(response.content) +``` + +### JavaScript/TypeScript Example + +```javascript +const formData = new FormData(); +formData.append('file', fileInput.files[0]); +formData.append('target_language', 'fr'); +formData.append('source_language', 'auto'); + +const response = await fetch('http://localhost:8000/translate', { + method: 'POST', + body: formData +}); + +const blob = await response.blob(); +const url = window.URL.createObjectURL(blob); +const a = document.createElement('a'); +a.href = url; +a.download = 'translated_document.docx'; +a.click(); +``` + +### PowerShell Example + +```powershell +$file = Get-Item "document.pptx" +$uri = "http://localhost:8000/translate" + +$form = @{ + 
file = $file + target_language = "de" + source_language = "auto" +} + +Invoke-RestMethod -Uri $uri -Method Post -Form $form -OutFile "translated_document.pptx" +``` + +## 🌐 Supported Languages + +The API supports 25+ languages including: +- Spanish (es), French (fr), German (de) +- Italian (it), Portuguese (pt), Russian (ru) +- Chinese (zh), Japanese (ja), Korean (ko) +- Arabic (ar), Hindi (hi), Dutch (nl) +- And many more... + +Full list available at: `GET /languages` + +## ⚙️ Configuration + +Edit `.env` file to configure: + +```env +# Translation Service (google, deepl, libre) +TRANSLATION_SERVICE=google + +# DeepL API Key (if using DeepL) +DEEPL_API_KEY=your_api_key_here + +# File Upload Limits +MAX_FILE_SIZE_MB=50 + +# Directory Configuration +UPLOAD_DIR=./uploads +OUTPUT_DIR=./outputs +``` + +## 🔌 Model Context Protocol (MCP) Integration + +This API is designed to be easily wrapped as an MCP server for future integration with AI assistants and tools. + +### MCP Server Structure (Future Implementation) + +```json +{ + "mcpServers": { + "document-translator": { + "command": "python", + "args": ["-m", "mcp_server"], + "env": { + "API_URL": "http://localhost:8000" + } + } + } +} +``` + +### Example MCP Tools + +The MCP wrapper will expose these tools: + +1. **translate_document** - Translate a single document +2. **translate_batch** - Translate multiple documents +3. **get_supported_languages** - List supported languages +4. 
**check_translation_status** - Check status of translation + +## 🏗️ Project Structure + +``` +Translate/ +├── main.py # FastAPI application +├── config.py # Configuration management +├── requirements.txt # Dependencies +├── .env.example # Environment template +├── services/ +│ ├── __init__.py +│ └── translation_service.py # Translation abstraction layer +├── translators/ +│ ├── __init__.py +│ ├── excel_translator.py # Excel translation logic +│ ├── word_translator.py # Word translation logic +│ └── pptx_translator.py # PowerPoint translation logic +├── utils/ +│ ├── __init__.py +│ ├── file_handler.py # File operations +│ └── exceptions.py # Custom exceptions +├── uploads/ # Temporary upload storage +└── outputs/ # Translated files +``` + +## 🧪 Testing + +### Manual Testing + +1. Start the API server +2. Navigate to http://localhost:8000/docs +3. Use the interactive Swagger UI to test endpoints + +### Test Files + +Prepare test files with: +- Complex formatting (multiple fonts, colors, styles) +- Embedded images and media +- Tables and merged cells +- Formulas (for Excel) +- Multiple sections/slides + +## 🛠️ Technical Details + +### Libraries Used + +- **FastAPI**: Modern web framework for building APIs +- **openpyxl**: Excel file manipulation with formatting preservation +- **python-docx**: Word document handling +- **python-pptx**: PowerPoint presentation processing +- **deep-translator**: Multi-provider translation service +- **Uvicorn**: ASGI server for running FastAPI + +### Design Principles + +1. **Modular Architecture**: Each file type has its own translator module +2. **Provider Abstraction**: Easy to swap translation services (Google, DeepL, LibreTranslate) +3. **Format Preservation**: All translators maintain original document structure +4. **Error Handling**: Comprehensive error handling and logging +5. **Scalability**: Ready for MCP integration and microservices architecture + +## 🔐 Security Considerations + +For production deployment: + +1. 
class Config:
    """Environment-driven settings for the Document Translation API.

    Every attribute is evaluated once, when this class body executes, from
    the process environment (``os.getenv``) with the defaults shown below.

    Directory handling:
        * ``BASE_DIR`` is the directory that contains this module, so the
          working directories live inside the project tree.
        * ``UPLOAD_DIR`` and ``OUTPUT_DIR`` honour the environment variables
          of the same name.  A relative value (e.g. ``./uploads``) is
          resolved against ``BASE_DIR``; an absolute value
          (e.g. ``/app/uploads``) is used unchanged.

    Raises:
        ValueError: at import time if ``MAX_FILE_SIZE_MB`` is set to a value
            that ``int()`` cannot parse.
    """

    # Translation Service
    TRANSLATION_SERVICE = os.getenv("TRANSLATION_SERVICE", "google")
    DEEPL_API_KEY = os.getenv("DEEPL_API_KEY", "")

    # File Upload Configuration
    MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "50"))
    MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024

    # Directories
    # This module sits at the project root, so its own directory *is* the
    # project root.  (Going one level further up would place uploads/outputs
    # outside the project, e.g. in "/" instead of "/app" inside a container.)
    BASE_DIR = Path(__file__).resolve().parent
    # pathlib's "/" keeps an absolute right-hand operand as-is and anchors a
    # relative one at BASE_DIR; "or" falls back when the variable is empty.
    UPLOAD_DIR = BASE_DIR / (os.getenv("UPLOAD_DIR") or "uploads")
    OUTPUT_DIR = BASE_DIR / (os.getenv("OUTPUT_DIR") or "outputs")
    TEMP_DIR = BASE_DIR / "temp"

    # Supported file types
    SUPPORTED_EXTENSIONS = {".xlsx", ".docx", ".pptx"}

    # API Configuration
    API_TITLE = "Document Translation API"
    API_VERSION = "1.0.0"
    API_DESCRIPTION = """
    Advanced Document Translation API with strict formatting preservation.
    
    Supports:
    - Excel (.xlsx) - Preserves cell formatting, formulas, merged cells, images
    - Word (.docx) - Preserves styles, tables, images, headers/footers
    - PowerPoint (.pptx) - Preserves layouts, animations, embedded media
    """

    @classmethod
    def ensure_directories(cls) -> None:
        """Create the upload, output and temp directories if they are missing.

        Missing parent directories are created as well, and directories that
        already exist are left untouched.
        """
        for directory in (cls.UPLOAD_DIR, cls.OUTPUT_DIR, cls.TEMP_DIR):
            directory.mkdir(exist_ok=True, parents=True)
Path("sample_files") +SAMPLE_DIR.mkdir(exist_ok=True) + +# ============================================================================ +# 1. EXCEL TRÈS COMPLEXE +# ============================================================================ +print("📊 Création d'Excel super complexe...") + +wb = Workbook() + +# === SHEET 1: RAPPORT FINANCIER COMPLEXE === +ws1 = wb.active +ws1.title = "Rapport Financier 2024" + +# Titre principal avec fusion massive +ws1.merge_cells('A1:H2') +title = ws1['A1'] +title.value = "RAPPORT FINANCIER ANNUEL 2024\nAnalyse Complète et Prévisions" +title.font = Font(name='Calibri', size=20, bold=True, color='FFFFFF') +title.fill = PatternFill(start_color='0066CC', end_color='0066CC', fill_type='solid') +title.alignment = Alignment(horizontal='center', vertical='center', wrap_text=True) +title.border = Border( + left=Side(style='thick', color='000000'), + right=Side(style='thick', color='000000'), + top=Side(style='thick', color='000000'), + bottom=Side(style='thick', color='000000') +) + +# Sous-titre avec fusion +ws1.merge_cells('A3:H3') +subtitle = ws1['A3'] +subtitle.value = "Département des Ventes - Trimestre Q1-Q4" +subtitle.font = Font(size=14, italic=True, color='0066CC') +subtitle.alignment = Alignment(horizontal='center') + +# En-têtes de colonnes avec style élaboré +headers = ['Région', 'Produit', 'Unités Vendues', 'Prix Unitaire', 'Chiffre d\'Affaires', 'Coût', 'Marge Brute', 'Marge %'] +for col, header in enumerate(headers, 1): + cell = ws1.cell(row=4, column=col) + cell.value = header + cell.font = Font(bold=True, color='FFFFFF', size=12) + cell.fill = PatternFill(start_color='0066CC', end_color='0066CC', fill_type='solid') + cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=True) + cell.border = Border( + left=Side(style='thin'), + right=Side(style='thin'), + top=Side(style='thin'), + bottom=Side(style='thin') + ) + +# Données complexes avec formules avancées +regions = ['Europe du Nord', 'Europe du 
Sud', 'Amérique du Nord', 'Amérique du Sud', 'Asie Pacifique', 'Moyen-Orient', 'Afrique'] +products = ['Ordinateur Portable Premium', 'Tablette Professionnelle', 'Smartphone 5G', 'Écran 4K', 'Casque Sans Fil'] + +row = 5 +for region in regions: + for i, product in enumerate(products): + units = 100 + (row * 13) % 500 + price = 299 + (row * 37) % 1200 + cost = price * 0.6 + + ws1.cell(row=row, column=1, value=region) + ws1.cell(row=row, column=2, value=product) + ws1.cell(row=row, column=3, value=units) + ws1.cell(row=row, column=4, value=price).number_format = FORMAT_CURRENCY_USD + + # Formule: Chiffre d'affaires + ws1.cell(row=row, column=5, value=f"=C{row}*D{row}").number_format = FORMAT_CURRENCY_USD + + # Coût + ws1.cell(row=row, column=6, value=cost).number_format = FORMAT_CURRENCY_USD + + # Formule: Marge brute + ws1.cell(row=row, column=7, value=f"=E{row}-F{row}").number_format = FORMAT_CURRENCY_USD + + # Formule: Marge % + ws1.cell(row=row, column=8, value=f"=IF(E{row}>0,G{row}/E{row},0)").number_format = FORMAT_PERCENTAGE + + # Formatage conditionnel par région + for col in range(1, 9): + cell = ws1.cell(row=row, column=col) + if row % 2 == 0: + cell.fill = PatternFill(start_color='F2F2F2', end_color='F2F2F2', fill_type='solid') + cell.border = Border( + left=Side(style='thin', color='CCCCCC'), + right=Side(style='thin', color='CCCCCC'), + top=Side(style='thin', color='CCCCCC'), + bottom=Side(style='thin', color='CCCCCC') + ) + + row += 1 + +# Ligne de total avec formules complexes +total_row = row + 1 +ws1.merge_cells(f'A{total_row}:B{total_row}') +total_cell = ws1[f'A{total_row}'] +total_cell.value = "TOTAL GÉNÉRAL" +total_cell.font = Font(bold=True, size=14, color='FFFFFF') +total_cell.fill = PatternFill(start_color='FF6600', end_color='FF6600', fill_type='solid') +total_cell.alignment = Alignment(horizontal='right', vertical='center') + +for col in [3, 5, 7]: + cell = ws1.cell(row=total_row, column=col) + cell.value = 
f"=SUM({get_column_letter(col)}5:{get_column_letter(col)}{row-1})" + cell.font = Font(bold=True, size=12) + cell.fill = PatternFill(start_color='FF6600', end_color='FF6600', fill_type='solid') + if col in [5, 7]: + cell.number_format = FORMAT_CURRENCY_USD + +# Marge % moyenne +avg_cell = ws1.cell(row=total_row, column=8) +avg_cell.value = f"=AVERAGE(H5:H{row-1})" +avg_cell.number_format = FORMAT_PERCENTAGE +avg_cell.font = Font(bold=True, size=12) +avg_cell.fill = PatternFill(start_color='FF6600', end_color='FF6600', fill_type='solid') + +# Ajuster les largeurs +ws1.column_dimensions['A'].width = 20 +ws1.column_dimensions['B'].width = 30 +ws1.column_dimensions['C'].width = 15 +ws1.column_dimensions['D'].width = 15 +ws1.column_dimensions['E'].width = 18 +ws1.column_dimensions['F'].width = 15 +ws1.column_dimensions['G'].width = 15 +ws1.column_dimensions['H'].width = 12 + +# === SHEET 2: GRAPHIQUES ET ANALYSES === +ws2 = wb.create_sheet("Analyses Graphiques") + +ws2['A1'] = "Analyse des Performances par Région" +ws2['A1'].font = Font(size=16, bold=True, color='0066CC') +ws2.merge_cells('A1:D1') + +# Données pour graphiques +ws2['A3'] = "Région" +ws2['B3'] = "Total Ventes" +ws2['C3'] = "Objectif" +ws2['D3'] = "Écart %" + +region_data = [ + ("Europe", 2500000, 2200000), + ("Amérique", 3200000, 3000000), + ("Asie", 2800000, 2900000), + ("Autres", 1200000, 1100000) +] + +for i, (region, sales, target) in enumerate(region_data, 4): + ws2.cell(row=i, column=1, value=region) + ws2.cell(row=i, column=2, value=sales).number_format = FORMAT_CURRENCY_USD + ws2.cell(row=i, column=3, value=target).number_format = FORMAT_CURRENCY_USD + ws2.cell(row=i, column=4, value=f"=(B{i}-C{i})/C{i}").number_format = FORMAT_PERCENTAGE + +# Graphique en barres +chart1 = BarChart() +chart1.title = "Ventes par Région vs Objectifs" +chart1.y_axis.title = "Montant (USD)" +chart1.x_axis.title = "Régions" +chart1.height = 10 +chart1.width = 20 + +data = Reference(ws2, min_col=2, min_row=3, max_row=7, 
max_col=3) +cats = Reference(ws2, min_col=1, min_row=4, max_row=7) +chart1.add_data(data, titles_from_data=True) +chart1.set_categories(cats) +ws2.add_chart(chart1, "F3") + +# Graphique circulaire +chart2 = PieChart() +chart2.title = "Répartition des Ventes par Région" +chart2.height = 10 +chart2.width = 15 + +data2 = Reference(ws2, min_col=2, min_row=4, max_row=7) +cats2 = Reference(ws2, min_col=1, min_row=4, max_row=7) +chart2.add_data(data2) +chart2.set_categories(cats2) +ws2.add_chart(chart2, "F20") + +# === SHEET 3: DONNÉES MENSUELLES AVEC TENDANCES === +ws3 = wb.create_sheet("Tendances Mensuelles") + +ws3['A1'] = "Évolution Mensuelle des Ventes 2024" +ws3['A1'].font = Font(size=16, bold=True, color='FF6600') +ws3.merge_cells('A1:M1') + +months = ['Janvier', 'Février', 'Mars', 'Avril', 'Mai', 'Juin', 'Juillet', 'Août', 'Septembre', 'Octobre', 'Novembre', 'Décembre'] +ws3['A3'] = "Mois" +for i, month in enumerate(months, 2): + ws3.cell(row=3, column=i, value=month) + ws3.cell(row=3, column=i).font = Font(bold=True) + +# Produits avec ventes mensuelles +products_monthly = ['Laptops', 'Tablettes', 'Smartphones', 'Accessoires'] +for i, product in enumerate(products_monthly, 4): + ws3.cell(row=i, column=1, value=product) + ws3.cell(row=i, column=1).font = Font(bold=True) + + for month_col in range(2, 14): + value = 50000 + (i * month_col * 1234) % 30000 + ws3.cell(row=i, column=month_col, value=value).number_format = FORMAT_CURRENCY_USD + +# Ligne de total avec formule +ws3.cell(row=8, column=1, value="TOTAL") +ws3.cell(row=8, column=1).font = Font(bold=True, size=12) +for col in range(2, 14): + ws3.cell(row=8, column=col, value=f"=SUM({get_column_letter(col)}4:{get_column_letter(col)}7)") + ws3.cell(row=8, column=col).number_format = FORMAT_CURRENCY_USD + ws3.cell(row=8, column=col).font = Font(bold=True) + ws3.cell(row=8, column=col).fill = PatternFill(start_color='FFD700', end_color='FFD700', fill_type='solid') + +# Graphique linéaire +chart3 = LineChart() 
+chart3.title = "Tendance des Ventes sur 12 Mois" +chart3.y_axis.title = "Montant (USD)" +chart3.x_axis.title = "Mois" +chart3.height = 12 +chart3.width = 24 + +data3 = Reference(ws3, min_col=2, min_row=3, max_row=8, max_col=13) +cats3 = Reference(ws3, min_col=2, min_row=3, max_col=13) +chart3.add_data(data3, titles_from_data=True) +chart3.set_categories(cats3) +ws3.add_chart(chart3, "A10") + +# Sauvegarder Excel +excel_file = SAMPLE_DIR / "super_complex.xlsx" +wb.save(excel_file) +print(f"✅ Excel créé: {excel_file}") +print(f" - 3 feuilles avec données complexes") +print(f" - Cellules fusionnées multiples") +print(f" - Formules avancées (SUM, AVERAGE, IF, pourcentages)") +print(f" - 3 graphiques (barres, camembert, lignes)") +print(f" - Formatage conditionnel élaboré") +print(f" - {len(regions) * len(products)} lignes de données\n") + +# ============================================================================ +# 2. WORD TRÈS COMPLEXE +# ============================================================================ +print("📝 Création de Word super complexe...") + +doc = Document() + +# Configurer les marges +sections = doc.sections +for section in sections: + section.top_margin = Cm(2) + section.bottom_margin = Cm(2) + section.left_margin = Cm(2.5) + section.right_margin = Cm(2.5) + +# PAGE DE COUVERTURE +title = doc.add_heading('RAPPORT STRATÉGIQUE ANNUEL', 0) +title.alignment = WD_ALIGN_PARAGRAPH.CENTER +for run in title.runs: + run.font.size = Pt(28) + run.font.color.rgb = RGBColor(0, 102, 204) + run.font.bold = True + +subtitle = doc.add_paragraph('Analyse Complète des Performances et Perspectives 2024-2025') +subtitle.alignment = WD_ALIGN_PARAGRAPH.CENTER +subtitle.runs[0].font.size = Pt(16) +subtitle.runs[0].font.italic = True +subtitle.runs[0].font.color.rgb = RGBColor(102, 102, 102) + +doc.add_paragraph('\n' * 3) + +company_info = doc.add_paragraph() +company_info.alignment = WD_ALIGN_PARAGRAPH.CENTER +company_info.add_run('Société: TechnoVision 
International\n').font.size = Pt(14) +company_info.add_run('Département: Analyse Stratégique\n').font.size = Pt(12) +company_info.add_run('Date: 31 Décembre 2024').font.size = Pt(12) + +doc.add_page_break() + +# TABLE DES MATIÈRES +doc.add_heading('Table des Matières', 1) +toc_items = [ + '1. Résumé Exécutif', + '2. Analyse des Performances Financières', + '3. Objectifs Stratégiques', + '4. Analyse de Marché', + '5. Recommandations', + '6. Conclusion' +] +for item in toc_items: + p = doc.add_paragraph(item, style='List Number') + p.runs[0].font.size = Pt(12) + +doc.add_page_break() + +# SECTION 1: RÉSUMÉ EXÉCUTIF +doc.add_heading('1. Résumé Exécutif', 1) + +para1 = doc.add_paragraph() +para1.add_run('Ce rapport présente une analyse détaillée ').font.size = Pt(11) +para1.add_run('des performances exceptionnelles').bold = True +para1.add_run(' de notre entreprise au cours de l\'année 2024. ').font.size = Pt(11) +para1.add_run('Nous avons atteint et dépassé').italic = True +para1.add_run(' nos objectifs dans tous les domaines clés.').font.size = Pt(11) +para1.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY + +doc.add_heading('Points Clés', 2) +key_points = [ + 'Croissance du chiffre d\'affaires de 28% par rapport à 2023', + 'Expansion réussie dans 5 nouveaux marchés internationaux', + 'Lancement de 8 produits innovants avec taux d\'adoption de 87%', + 'Amélioration de la satisfaction client: score de 4.8/5', + 'Réduction de l\'empreinte carbone de 22%', + 'Augmentation de la part de marché de 3.5 points' +] + +for point in key_points: + p = doc.add_paragraph(point, style='List Bullet') + p.runs[0].font.size = Pt(11) + +# Créer graphique pour Word +img_path = SAMPLE_DIR / "word_performance_chart.png" +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5)) + +# Graphique 1: Croissance trimestrielle +quarters = ['Q1', 'Q2', 'Q3', 'Q4'] +revenue = [45.2, 52.8, 61.5, 68.3] +ax1.plot(quarters, revenue, marker='o', linewidth=2, markersize=8, color='#0066CC') 
+ax1.fill_between(range(len(quarters)), revenue, alpha=0.3, color='#0066CC') +ax1.set_title('Croissance Trimestrielle du CA (M€)', fontsize=14, fontweight='bold') +ax1.set_ylabel('Chiffre d\'Affaires (M€)', fontsize=11) +ax1.grid(True, alpha=0.3) + +# Graphique 2: Répartition des revenus +categories = ['Produits', 'Services', 'Licences', 'Consulting'] +values = [45, 25, 20, 10] +colors = ['#0066CC', '#FF6600', '#00CC66', '#CC00CC'] +ax2.pie(values, labels=categories, autopct='%1.1f%%', colors=colors, startangle=90) +ax2.set_title('Répartition des Revenus 2024', fontsize=14, fontweight='bold') + +plt.tight_layout() +plt.savefig(img_path, dpi=150, bbox_inches='tight') +plt.close() + +doc.add_paragraph() +doc.add_picture(str(img_path), width=Inches(6)) +last_paragraph = doc.paragraphs[-1] +last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER + +# SECTION 2: ANALYSE FINANCIÈRE +doc.add_page_break() +doc.add_heading('2. Analyse des Performances Financières', 1) + +doc.add_heading('2.1 Résultats par Trimestre', 2) + +# Tableau complexe des résultats +table = doc.add_table(rows=6, cols=6) +table.style = 'Medium Grid 3 Accent 1' + +# En-têtes +headers = ['Trimestre', 'Revenus (M€)', 'Coûts (M€)', 'Marge Brute', 'Marge %', 'Croissance'] +for i, header in enumerate(headers): + cell = table.rows[0].cells[i] + cell.text = header + cell.paragraphs[0].runs[0].font.bold = True + cell.paragraphs[0].runs[0].font.size = Pt(10) + +# Données trimestrielles +quarterly_data = [ + ('Q1 2024', 45.2, 28.5, 16.7, '36.9%', '+8.5%'), + ('Q2 2024', 52.8, 32.1, 20.7, '39.2%', '+16.8%'), + ('Q3 2024', 61.5, 36.8, 24.7, '40.2%', '+16.5%'), + ('Q4 2024', 68.3, 40.2, 28.1, '41.1%', '+11.1%'), + ('TOTAL', 227.8, 137.6, 90.2, '39.6%', '28.0%') +] + +for row_idx, row_data in enumerate(quarterly_data, 1): + for col_idx, value in enumerate(row_data): + cell = table.rows[row_idx].cells[col_idx] + cell.text = str(value) + if row_idx == 5: # Ligne total + cell.paragraphs[0].runs[0].font.bold = True + # 
Colorer le fond (workaround via XML) + shading_elm = OxmlElement('w:shd') + shading_elm.set(qn('w:fill'), 'FFD700') + cell._element.get_or_add_tcPr().append(shading_elm) + +doc.add_paragraph() + +doc.add_heading('2.2 Analyse Comparative', 2) + +comparison_text = doc.add_paragraph() +comparison_text.add_run('Comparaison avec les objectifs annuels: ').bold = True +comparison_text.add_run('Notre performance a dépassé les objectifs fixés de ') +comparison_text.add_run('13.5%').bold = True +comparison_text.add_run(', démontrant une excellente exécution stratégique et une adaptation réussie aux conditions du marché.') +comparison_text.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY + +# SECTION 3: OBJECTIFS STRATÉGIQUES +doc.add_page_break() +doc.add_heading('3. Objectifs Stratégiques 2025', 1) + +doc.add_heading('3.1 Vision et Mission', 2) +vision = doc.add_paragraph() +vision.add_run('Vision: ').bold = True +vision.add_run('Devenir le leader mondial dans notre secteur d\'ici 2027.') +vision.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY + +mission = doc.add_paragraph() +mission.add_run('Mission: ').bold = True +mission.add_run('Fournir des solutions innovantes qui transforment les entreprises et améliorent la vie de millions de personnes.') +mission.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY + +doc.add_heading('3.2 Objectifs Prioritaires', 2) + +objectives = [ + ('Expansion Géographique', 'Pénétrer 10 nouveaux marchés en Asie et Europe de l\'Est'), + ('Innovation Produit', 'Lancer 12 nouveaux produits avec IA intégrée'), + ('Excellence Opérationnelle', 'Réduire les coûts de 15% via automatisation'), + ('Satisfaction Client', 'Atteindre un NPS de 75+'), + ('Développement Durable', 'Neutralité carbone d\'ici fin 2025') +] + +for i, (title, desc) in enumerate(objectives, 1): + p = doc.add_paragraph(style='List Number') + p.add_run(f'{title}: ').bold = True + p.add_run(desc) + +# Ajouter une image conceptuelle +concept_img = SAMPLE_DIR / "word_strategy_image.png" +img = Image.new('RGB', (800, 
400), color=(240, 248, 255)) +draw = ImageDraw.Draw(img) +try: + font_large = ImageFont.truetype("arial.ttf", 60) + font_small = ImageFont.truetype("arial.ttf", 30) +except (OSError, ImportError): # arial.ttf missing or FreeType unavailable: use the built-in font + font_large = ImageFont.load_default() + font_small = ImageFont.load_default() + +draw.rectangle([50, 50, 750, 350], outline=(0, 102, 204), width=5, fill=(230, 240, 255)) +draw.text((400, 150), "STRATÉGIE 2025", fill=(0, 102, 204), font=font_large, anchor="mm") +draw.text((400, 250), "Innovation • Excellence • Croissance", fill=(102, 102, 102), font=font_small, anchor="mm") +img.save(concept_img) + +doc.add_paragraph() +doc.add_picture(str(concept_img), width=Inches(5.5)) +doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER + +# CONCLUSION +doc.add_page_break() +doc.add_heading('6. Conclusion', 1) + +conclusion = doc.add_paragraph() +conclusion.add_run('L\'année 2024 a été exceptionnelle ').font.size = Pt(12) +conclusion.add_run('pour notre organisation. ').bold = True +conclusion.add_run('Nous avons non seulement atteint nos objectifs ambitieux, mais nous avons également posé les bases solides pour une croissance continue et durable. 
') +conclusion.add_run('L\'engagement de nos équipes, la confiance de nos clients et l\'innovation de nos produits ').italic = True +conclusion.add_run('sont les piliers de notre succès.') +conclusion.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY +conclusion.paragraph_format.line_spacing_rule = WD_LINE_SPACING.MULTIPLE +conclusion.paragraph_format.line_spacing = 1.5 + +# Footer +section = doc.sections[0] +footer = section.footer +footer_para = footer.paragraphs[0] +footer_para.text = "Document Confidentiel - TechnoVision International 2024 | " +footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER +footer_para.runs[0].font.size = Pt(9) +footer_para.runs[0].font.italic = True + +# Sauvegarder Word +word_file = SAMPLE_DIR / "super_complex.docx" +doc.save(word_file) +print(f"✅ Word créé: {word_file}") +print(f" - 6 sections structurées") +print(f" - Page de couverture professionnelle") +print(f" - Table des matières") +print(f" - Tableaux complexes avec formatage") +print(f" - 3 images intégrées (graphiques et concepts)") +print(f" - Formatage avancé (styles, couleurs, alignements)") +print(f" - Pieds de page\n") + +# ============================================================================ +# 3. 
POWERPOINT TRÈS COMPLEXE +# ============================================================================ +print("🎨 Création de PowerPoint super complexe...") + +prs = Presentation() +prs.slide_width = PptxInches(10) +prs.slide_height = PptxInches(7.5) + +# SLIDE 1: TITRE PRINCIPAL +slide1 = prs.slides.add_slide(prs.slide_layouts[6]) # Blank + +# Fond coloré +background = slide1.shapes.add_shape( + MSO_SHAPE.RECTANGLE, + 0, 0, prs.slide_width, prs.slide_height +) +background.fill.solid() +background.fill.fore_color.rgb = PptxRGBColor(0, 102, 204) +background.line.fill.background() + +# Titre principal +title_box = slide1.shapes.add_textbox( + PptxInches(1), PptxInches(2.5), + PptxInches(8), PptxInches(2) +) +title_frame = title_box.text_frame +title_frame.text = "TRANSFORMATION DIGITALE 2025" +title_para = title_frame.paragraphs[0] +title_para.font.size = PptxPt(54) +title_para.font.bold = True +title_para.font.color.rgb = PptxRGBColor(255, 255, 255) +title_para.alignment = PP_ALIGN.CENTER + +# Sous-titre +subtitle_box = slide1.shapes.add_textbox( + PptxInches(1), PptxInches(4.8), + PptxInches(8), PptxInches(1) +) +subtitle_frame = subtitle_box.text_frame +subtitle_frame.text = "Stratégie, Innovation et Excellence Opérationnelle" +subtitle_para = subtitle_frame.paragraphs[0] +subtitle_para.font.size = PptxPt(24) +subtitle_para.font.italic = True +subtitle_para.font.color.rgb = PptxRGBColor(255, 255, 255) +subtitle_para.alignment = PP_ALIGN.CENTER + +# SLIDE 2: AGENDA +slide2 = prs.slides.add_slide(prs.slide_layouts[1]) +title2 = slide2.shapes.title +title2.text = "Agenda de la Présentation" +title2.text_frame.paragraphs[0].font.size = PptxPt(40) +title2.text_frame.paragraphs[0].font.color.rgb = PptxRGBColor(0, 102, 204) + +body2 = slide2.placeholders[1] +tf2 = body2.text_frame +tf2.clear() + +agenda_items = [ + "Contexte et Enjeux Stratégiques", + "Analyse des Performances 2024", + "Objectifs de Transformation Digitale", + "Initiatives Clés et Roadmap", + "Budget et 
Ressources", + "Indicateurs de Succès et KPIs", + "Plan d'Action et Prochaines Étapes" +] + +for i, item in enumerate(agenda_items): + p = tf2.add_paragraph() + p.text = item + p.level = 0 + p.font.size = PptxPt(20) + p.space_before = PptxPt(8) + + # Numérotation colorée + run = p.runs[0] + run.text = f"{i+1}. {item}" + if i % 2 == 0: + run.font.color.rgb = PptxRGBColor(0, 102, 204) + else: + run.font.color.rgb = PptxRGBColor(255, 102, 0) + +# SLIDE 3: DONNÉES AVEC GRAPHIQUE +slide3 = prs.slides.add_slide(prs.slide_layouts[5]) +title3 = slide3.shapes.title +title3.text = "Croissance et Performance Financière" + +# Créer graphique pour PPT +chart_img = SAMPLE_DIR / "ppt_financial_chart.png" +fig, ax = plt.subplots(figsize=(10, 6)) + +years = ['2020', '2021', '2022', '2023', '2024'] +revenue = [125, 145, 168, 195, 228] +profit = [15, 22, 28, 35, 48] + +x = np.arange(len(years)) +width = 0.35 + +bars1 = ax.bar(x - width/2, revenue, width, label='Revenus (M€)', color='#0066CC') +bars2 = ax.bar(x + width/2, profit, width, label='Bénéfices (M€)', color='#FF6600') + +ax.set_xlabel('Années', fontsize=12, fontweight='bold') +ax.set_ylabel('Montants (M€)', fontsize=12, fontweight='bold') +ax.set_title('Évolution Financière 2020-2024', fontsize=16, fontweight='bold') +ax.set_xticks(x) +ax.set_xticklabels(years) +ax.legend(fontsize=11) +ax.grid(True, alpha=0.3) + +# Ajouter valeurs sur les barres +for bars in [bars1, bars2]: + for bar in bars: + height = bar.get_height() + ax.text(bar.get_x() + bar.get_width()/2., height, + f'{int(height)}', + ha='center', va='bottom', fontsize=10, fontweight='bold') + +plt.tight_layout() +plt.savefig(chart_img, dpi=150, bbox_inches='tight', facecolor='white') +plt.close() + +slide3.shapes.add_picture( + str(chart_img), + PptxInches(1.5), PptxInches(2), + width=PptxInches(7) +) + +# SLIDE 4: TABLEAU COMPLEXE +slide4 = prs.slides.add_slide(prs.slide_layouts[5]) +title4 = slide4.shapes.title +title4.text = "Répartition Budgétaire par 
Département" + +# Tableau +rows, cols = 8, 5 +table = slide4.shapes.add_table( + rows, cols, + PptxInches(0.8), PptxInches(2), + PptxInches(8.4), PptxInches(4) +).table + +# En-têtes +headers = ['Département', 'Budget 2024 (M€)', 'Budget 2025 (M€)', 'Variation', 'Priorité'] +for col, header in enumerate(headers): + cell = table.cell(0, col) + cell.text = header + cell.fill.solid() + cell.fill.fore_color.rgb = PptxRGBColor(0, 102, 204) + for paragraph in cell.text_frame.paragraphs: + for run in paragraph.runs: + run.font.size = PptxPt(14) + run.font.bold = True + run.font.color.rgb = PptxRGBColor(255, 255, 255) + cell.text_frame.paragraphs[0].alignment = PP_ALIGN.CENTER + +# Données +dept_data = [ + ('Recherche & Développement', '45.2', '58.5', '+29.4%', 'Élevée'), + ('Ventes & Marketing', '38.7', '42.3', '+9.3%', 'Élevée'), + ('Opérations', '52.3', '55.1', '+5.4%', 'Moyenne'), + ('IT & Infrastructure', '28.5', '35.2', '+23.5%', 'Élevée'), + ('Ressources Humaines', '15.8', '17.2', '+8.9%', 'Moyenne'), + ('Administration', '12.3', '13.1', '+6.5%', 'Faible'), + ('TOTAL', '192.8', '221.4', '+14.8%', '-') +] + +for row_idx, row_data in enumerate(dept_data, 1): + for col_idx, value in enumerate(row_data): + cell = table.cell(row_idx, col_idx) + cell.text = value + + # Formatage spécial pour la ligne total + if row_idx == 7: + cell.fill.solid() + cell.fill.fore_color.rgb = PptxRGBColor(255, 215, 0) + for paragraph in cell.text_frame.paragraphs: + for run in paragraph.runs: + run.font.bold = True + run.font.size = PptxPt(13) + else: + # Alternance de couleurs + if row_idx % 2 == 0: + cell.fill.solid() + cell.fill.fore_color.rgb = PptxRGBColor(240, 248, 255) + + for paragraph in cell.text_frame.paragraphs: + for run in paragraph.runs: + run.font.size = PptxPt(12) + + # Alignement + cell.text_frame.paragraphs[0].alignment = PP_ALIGN.CENTER if col_idx > 0 else PP_ALIGN.LEFT + +# SLIDE 5: INITIATIVES CLÉS AVEC FORMES +slide5 = prs.slides.add_slide(prs.slide_layouts[5]) +title5 
= slide5.shapes.title +title5.text = "Initiatives Stratégiques 2025" + +initiatives = [ + ("Innovation IA", "Intégration IA dans tous les produits", PptxRGBColor(46, 125, 50)), + ("Cloud First", "Migration complète vers le cloud", PptxRGBColor(33, 150, 243)), + ("Customer 360", "Vue unifiée du parcours client", PptxRGBColor(255, 152, 0)), + ("Green IT", "Neutralité carbone datacenter", PptxRGBColor(76, 175, 80)) +] + +y_position = 2.2 +for i, (title_text, desc, color) in enumerate(initiatives): + # Rectangle coloré + shape = slide5.shapes.add_shape( + MSO_SHAPE.ROUNDED_RECTANGLE, + PptxInches(1), PptxInches(y_position), + PptxInches(8), PptxInches(1) + ) + shape.fill.solid() + shape.fill.fore_color.rgb = color + shape.line.color.rgb = color + shape.shadow.inherit = False + + # Texte dans la forme + text_frame = shape.text_frame + text_frame.clear() + + p1 = text_frame.paragraphs[0] + p1.text = title_text + p1.font.size = PptxPt(22) + p1.font.bold = True + p1.font.color.rgb = PptxRGBColor(255, 255, 255) + p1.alignment = PP_ALIGN.LEFT + + p2 = text_frame.add_paragraph() + p2.text = desc + p2.font.size = PptxPt(16) + p2.font.color.rgb = PptxRGBColor(255, 255, 255) + p2.alignment = PP_ALIGN.LEFT + p2.space_before = PptxPt(5) + + text_frame.vertical_anchor = MSO_ANCHOR.MIDDLE + text_frame.margin_left = PptxInches(0.3) + + y_position += 1.2 + +# SLIDE 6: TIMELINE +slide6 = prs.slides.add_slide(prs.slide_layouts[5]) +title6 = slide6.shapes.title +title6.text = "Roadmap de Déploiement" + +# Timeline horizontale +timeline_data = [ + ("Q1", "Planification", PptxRGBColor(76, 175, 80)), + ("Q2", "Développement", PptxRGBColor(33, 150, 243)), + ("Q3", "Tests & Pilotes", PptxRGBColor(255, 152, 0)), + ("Q4", "Déploiement", PptxRGBColor(156, 39, 176)) +] + +x_start = 1 +y_pos = 3 +width = 1.8 + +for quarter, phase, color in timeline_data: + # Cercle pour le trimestre + circle = slide6.shapes.add_shape( + MSO_SHAPE.OVAL, + PptxInches(x_start), PptxInches(y_pos), + PptxInches(0.8), 
PptxInches(0.8) + ) + circle.fill.solid() + circle.fill.fore_color.rgb = color + circle.line.color.rgb = color + + # Quarter label inside the circle + tf = circle.text_frame + tf.text = quarter + tf.paragraphs[0].font.size = PptxPt(18) + tf.paragraphs[0].font.bold = True + tf.paragraphs[0].font.color.rgb = PptxRGBColor(255, 255, 255) + tf.paragraphs[0].alignment = PP_ALIGN.CENTER + tf.vertical_anchor = MSO_ANCHOR.MIDDLE + + # Phase description below the circle + text_box = slide6.shapes.add_textbox( + PptxInches(x_start - 0.3), PptxInches(y_pos + 1.2), + PptxInches(1.4), PptxInches(0.6) + ) + tf2 = text_box.text_frame + tf2.text = phase + tf2.paragraphs[0].font.size = PptxPt(14) + tf2.paragraphs[0].font.bold = True + tf2.paragraphs[0].font.color.rgb = color + tf2.paragraphs[0].alignment = PP_ALIGN.CENTER + + # Connector to the next milestone; skipped after the last entry (x_start never reaches 7, so a position test cannot detect it) + if quarter != timeline_data[-1][0]: + line = slide6.shapes.add_connector( + 1, # Straight connector + PptxInches(x_start + 0.8), PptxInches(y_pos + 0.4), + PptxInches(x_start + width), PptxInches(y_pos + 0.4) + ) + line.line.color.rgb = PptxRGBColor(100, 100, 100) + line.line.width = PptxPt(3) + + x_start += width + +# SLIDE 7: CONCLUSION +slide7 = prs.slides.add_slide(prs.slide_layouts[1]) +title7 = slide7.shapes.title +title7.text = "Prochaines Étapes et Engagement" + +body7 = slide7.placeholders[1] +tf7 = body7.text_frame +tf7.clear() + +next_steps = [ + "Validation du comité exécutif - Janvier 2025", + "Kick-off des programmes prioritaires - Février 2025", + "Revues mensuelles de progression avec les sponsors", + "Communication régulière à toutes les parties prenantes", + "Ajustements agiles basés sur les retours du terrain" +] + +for step in next_steps: + p = tf7.add_paragraph() + p.text = step + p.level = 0 + p.font.size = PptxPt(20) + p.space_before = PptxPt(10) + p.font.color.rgb = PptxRGBColor(0, 102, 204) + +# Add a closing image +conclusion_img = SAMPLE_DIR / "ppt_conclusion_image.png" +img = Image.new('RGB', (800, 300), color=(255, 
255, 255)) +draw = ImageDraw.Draw(img) + +# Draw a stylised success banner +draw.rectangle([50, 50, 750, 250], outline=(0, 102, 204), width=3) +try: + font = ImageFont.truetype("arial.ttf", 50) +except (OSError, ImportError): # arial.ttf missing or FreeType unavailable: use the built-in font + font = ImageFont.load_default() +draw.text((400, 150), "SUCCÈS 2025", fill=(0, 102, 204), font=font, anchor="mm") +img.save(conclusion_img) + +slide7.shapes.add_picture( + str(conclusion_img), + PptxInches(2.5), PptxInches(5), + width=PptxInches(5) +) + +# Add speaker notes to the title slide +notes_slide = slide1.notes_slide +notes_slide.notes_text_frame.text = "Bienvenue à tous. Cette présentation couvre notre stratégie de transformation digitale pour 2025. Nous allons explorer nos objectifs ambitieux et le plan d'action pour les atteindre." + +# Save the PowerPoint file +ppt_file = SAMPLE_DIR / "super_complex.pptx" +prs.save(ppt_file) +print(f"✅ PowerPoint créé: {ppt_file}") +print(f" - 7 diapositives professionnelles") +print(f" - Slide de titre avec design custom") +print(f" - Agenda structuré avec numérotation") +print(f" - Tableau complexe 8x5 avec formatage") +print(f" - Graphiques intégrés (barres avec valeurs)") +print(f" - Timeline visuelle avec formes connectées") +print(f" - 4 initiatives avec rectangles colorés") +print(f" - Images et formes multiples") +print(f" - Notes de présentation\n") + +print("=" * 70) +print("🎉 TOUS LES FICHIERS COMPLEXES ONT ÉTÉ CRÉÉS AVEC SUCCÈS!") +print("=" * 70) +print(f"\n📁 Fichiers disponibles dans: {SAMPLE_DIR.absolute()}") +print("\nVous pouvez maintenant:") +print("1. Ouvrir les fichiers pour vérifier la complexité") +print("2. Les traduire via l'API") +print("3. 
Vérifier que le formatage est préservé") +print("\n✨ Formatage inclus:") +print(" Excel: Formules, cellules fusionnées, graphiques, formatage conditionnel") +print(" Word: Images, tableaux, styles, couleurs, pieds de page") +print(" PowerPoint: Formes, graphiques, timeline, tableaux, animations visuelles") diff --git a/main.py b/main.py new file mode 100644 index 0000000..72de639 --- /dev/null +++ b/main.py @@ -0,0 +1,307 @@ +""" +Document Translation API +FastAPI application for translating complex documents while preserving formatting +""" +from fastapi import FastAPI, UploadFile, File, Form, HTTPException +from fastapi.responses import FileResponse, JSONResponse +from fastapi.middleware.cors import CORSMiddleware +from pathlib import Path +from typing import Optional +import asyncio +import logging + +from config import config +from translators import excel_translator, word_translator, pptx_translator +from utils import file_handler, handle_translation_error, DocumentProcessingError + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# Ensure necessary directories exist +config.ensure_directories() + +# Create FastAPI app +app = FastAPI( + title=config.API_TITLE, + version=config.API_VERSION, + description=config.API_DESCRIPTION +) + +# Add CORS middleware (credentials stay disabled while origins are a wildcard) +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], # Configure appropriately for production + allow_credentials=False, # enable only together with an explicit origin list + allow_methods=["*"], + allow_headers=["*"], +) + + +@app.get("/") +async def root(): + """Root endpoint with API information""" + return { + "name": config.API_TITLE, + "version": config.API_VERSION, + "status": "operational", + "supported_formats": list(config.SUPPORTED_EXTENSIONS), + "endpoints": { + "translate": "/translate", + "health": "/health", + "supported_languages": "/languages" + } + } + + +@app.get("/health") +async def health_check(): + """Health check endpoint""" + return { + "status": "healthy", +
"translation_service": config.TRANSLATION_SERVICE + } + + +@app.get("/languages") +async def get_supported_languages(): + """Get list of supported language codes""" + return { + "supported_languages": { + "es": "Spanish", + "fr": "French", + "de": "German", + "it": "Italian", + "pt": "Portuguese", + "ru": "Russian", + "zh": "Chinese (Simplified)", + "ja": "Japanese", + "ko": "Korean", + "ar": "Arabic", + "hi": "Hindi", + "nl": "Dutch", + "pl": "Polish", + "tr": "Turkish", + "sv": "Swedish", + "da": "Danish", + "no": "Norwegian", + "fi": "Finnish", + "cs": "Czech", + "el": "Greek", + "th": "Thai", + "vi": "Vietnamese", + "id": "Indonesian", + "uk": "Ukrainian", + "ro": "Romanian", + "hu": "Hungarian" + }, + "note": "Supported languages may vary depending on the translation service configured" + } + + +@app.post("/translate") +async def translate_document( + file: UploadFile = File(..., description="Document file to translate (.xlsx, .docx, or .pptx)"), + target_language: str = Form(..., description="Target language code (e.g., 'es', 'fr', 'de')"), + source_language: str = Form(default="auto", description="Source language code (default: auto-detect)"), + cleanup: bool = Form(default=True, description="Delete input file after translation") +): + """ + Translate a document while preserving all formatting, layout, and embedded media + + **Supported File Types:** + - Excel (.xlsx) - Preserves formulas, merged cells, styling, and images + - Word (.docx) - Preserves headings, tables, images, headers/footers + - PowerPoint (.pptx) - Preserves layouts, animations, and media + + **Parameters:** + - **file**: The document file to translate + - **target_language**: Target language code (e.g., 'es' for Spanish, 'fr' for French) + - **source_language**: Source language code (optional, default: auto-detect) + - **cleanup**: Whether to delete the uploaded file after translation (default: True) + + **Returns:** + - Translated document file with preserved formatting + """ + 
input_path = None + output_path = None + + try: + # Validate file extension + file_extension = file_handler.validate_file_extension(file.filename) + logger.info(f"Processing {file_extension} file: {file.filename}") + + # Validate file size + file_handler.validate_file_size(file) + + # Generate unique filenames + input_filename = file_handler.generate_unique_filename(file.filename, "input") + output_filename = file_handler.generate_unique_filename(file.filename, "translated") + + # Save uploaded file + input_path = config.UPLOAD_DIR / input_filename + output_path = config.OUTPUT_DIR / output_filename + + await file_handler.save_upload_file(file, input_path) + logger.info(f"Saved input file to: {input_path}") + + # Translate based on file type + if file_extension == ".xlsx": + logger.info("Translating Excel file...") + excel_translator.translate_file(input_path, output_path, target_language) + elif file_extension == ".docx": + logger.info("Translating Word document...") + word_translator.translate_file(input_path, output_path, target_language) + elif file_extension == ".pptx": + logger.info("Translating PowerPoint presentation...") + pptx_translator.translate_file(input_path, output_path, target_language) + else: + raise DocumentProcessingError(f"Unsupported file type: {file_extension}") + + logger.info(f"Translation completed: {output_path}") + + # Get file info + output_info = file_handler.get_file_info(output_path) + + # Cleanup input file if requested + if cleanup and input_path: + file_handler.cleanup_file(input_path) + logger.info(f"Cleaned up input file: {input_path}") + + # Return the translated file + return FileResponse( + path=output_path, + filename=f"translated_{file.filename}", + media_type="application/octet-stream", + headers={ + "X-Original-Filename": file.filename, + "X-File-Size-MB": str(output_info.get("size_mb", 0)), + "X-Target-Language": target_language + } + ) + + except HTTPException: + # Re-raise HTTP exceptions + raise + except Exception as 
e: + logger.error(f"Translation error: {str(e)}", exc_info=True) + + # Cleanup files on error + if input_path: + file_handler.cleanup_file(input_path) + if output_path: + file_handler.cleanup_file(output_path) + + raise handle_translation_error(e) + + +@app.delete("/cleanup/{filename}") +async def cleanup_translated_file(filename: str): + """ + Cleanup a translated file after download + + **Parameters:** + - **filename**: Name of the file to delete from the outputs directory + """ + try: + file_path = config.OUTPUT_DIR / filename + + if not file_path.exists(): + raise HTTPException(status_code=404, detail="File not found") + + file_handler.cleanup_file(file_path) + + return {"message": f"File {filename} deleted successfully"} + + except HTTPException: + raise + except Exception as e: + logger.error(f"Cleanup error: {str(e)}") + raise HTTPException(status_code=500, detail="Error cleaning up file") + + +@app.post("/translate-batch") +async def translate_batch_documents( + files: list[UploadFile] = File(..., description="Multiple document files to translate"), + target_language: str = Form(..., description="Target language code"), + source_language: str = Form(default="auto", description="Source language code") +): + """ + Translate multiple documents in batch + + **Note:** This endpoint processes files sequentially. For large batches, consider + calling the single file endpoint multiple times with concurrent requests. 
+ """ + results = [] + + for file in files: + try: + # Process each file using the same logic as single file translation + file_extension = file_handler.validate_file_extension(file.filename) + file_handler.validate_file_size(file) + + input_filename = file_handler.generate_unique_filename(file.filename, "input") + output_filename = file_handler.generate_unique_filename(file.filename, "translated") + + input_path = config.UPLOAD_DIR / input_filename + output_path = config.OUTPUT_DIR / output_filename + + await file_handler.save_upload_file(file, input_path) + + # Translate based on file type + if file_extension == ".xlsx": + excel_translator.translate_file(input_path, output_path, target_language) + elif file_extension == ".docx": + word_translator.translate_file(input_path, output_path, target_language) + elif file_extension == ".pptx": + pptx_translator.translate_file(input_path, output_path, target_language) + + # Cleanup input file + file_handler.cleanup_file(input_path) + + results.append({ + "filename": file.filename, + "status": "success", + "output_file": output_filename, + "download_url": f"/download/{output_filename}" + }) + + except Exception as e: + logger.error(f"Error processing {file.filename}: {str(e)}") + results.append({ + "filename": file.filename, + "status": "error", + "error": str(e) + }) + + return { + "total_files": len(files), + "successful": len([r for r in results if r["status"] == "success"]), + "failed": len([r for r in results if r["status"] == "error"]), + "results": results + } + + +@app.get("/download/{filename}") +async def download_file(filename: str): + """ + Download a translated file by filename + + **Parameters:** + - **filename**: Name of the file to download from the outputs directory + """ + file_path = config.OUTPUT_DIR / filename + + if not file_path.exists(): + raise HTTPException(status_code=404, detail="File not found") + + return FileResponse( + path=file_path, + filename=filename, + 
media_type="application/octet-stream" + ) + + +if __name__ == "__main__": + import uvicorn + uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True) diff --git a/mcp_server_example.py b/mcp_server_example.py new file mode 100644 index 0000000..ffe421d --- /dev/null +++ b/mcp_server_example.py @@ -0,0 +1,239 @@ +""" +Example MCP Server Implementation for Document Translation API +This demonstrates how to wrap the translation API as an MCP server +""" +import asyncio +import httpx +from typing import Any +from mcp.server.models import InitializationOptions +from mcp.server import NotificationOptions, Server +from mcp.server.stdio import stdio_server +from mcp import types + +# API Configuration +API_BASE_URL = "http://localhost:8000" + + +class DocumentTranslatorMCP: + """MCP Server for Document Translation API""" + + def __init__(self): + self.server = Server("document-translator") + self.http_client = None + self._setup_handlers() + + def _setup_handlers(self): + """Set up MCP tool handlers""" + + @self.server.list_tools() + async def handle_list_tools() -> list[types.Tool]: + """List available tools""" + return [ + types.Tool( + name="translate_document", + description="Translate a document (Excel, Word, or PowerPoint) while preserving all formatting", + inputSchema={ + "type": "object", + "properties": { + "file_path": { + "type": "string", + "description": "Path to the document file to translate" + }, + "target_language": { + "type": "string", + "description": "Target language code (e.g., 'es', 'fr', 'de')" + }, + "source_language": { + "type": "string", + "description": "Source language code (default: 'auto' for auto-detection)", + "default": "auto" + }, + "output_path": { + "type": "string", + "description": "Path where the translated document should be saved" + } + }, + "required": ["file_path", "target_language", "output_path"] + } + ), + types.Tool( + name="get_supported_languages", + description="Get list of supported language codes for 
translation", + inputSchema={ + "type": "object", + "properties": {} + } + ), + types.Tool( + name="check_api_health", + description="Check if the translation API is healthy and operational", + inputSchema={ + "type": "object", + "properties": {} + } + ) + ] + + @self.server.call_tool() + async def handle_call_tool( + name: str, + arguments: dict[str, Any] | None + ) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]: + """Handle tool calls""" + + if name == "translate_document": + return await self._translate_document(arguments) + elif name == "get_supported_languages": + return await self._get_supported_languages() + elif name == "check_api_health": + return await self._check_health() + else: + raise ValueError(f"Unknown tool: {name}") + + async def _translate_document(self, args: dict[str, Any]) -> list[types.TextContent]: + """Translate a document via the API""" + file_path = args["file_path"] + target_language = args["target_language"] + source_language = args.get("source_language", "auto") + output_path = args["output_path"] + + try: + async with httpx.AsyncClient(timeout=300.0) as client: + # Upload and translate the document + with open(file_path, "rb") as f: + files = {"file": (file_path, f)} + data = { + "target_language": target_language, + "source_language": source_language + } + + response = await client.post( + f"{API_BASE_URL}/translate", + files=files, + data=data + ) + + if response.status_code == 200: + # Save the translated document + with open(output_path, "wb") as output: + output.write(response.content) + + return [ + types.TextContent( + type="text", + text=f"✅ Document translated successfully!\n\n" + f"Original: {file_path}\n" + f"Translated: {output_path}\n" + f"Language: {source_language} → {target_language}\n" + f"Size: {len(response.content)} bytes" + ) + ] + else: + error_detail = response.json().get("detail", "Unknown error") + return [ + types.TextContent( + type="text", + text=f"❌ Translation failed: 
{error_detail}" + ) + ] + + except Exception as e: + return [ + types.TextContent( + type="text", + text=f"❌ Error during translation: {str(e)}" + ) + ] + + async def _get_supported_languages(self) -> list[types.TextContent]: + """Get supported languages from the API""" + try: + async with httpx.AsyncClient() as client: + response = await client.get(f"{API_BASE_URL}/languages") + + if response.status_code == 200: + data = response.json() + languages = data.get("supported_languages", {}) + + lang_list = "\n".join([f"- {code}: {name}" for code, name in languages.items()]) + + return [ + types.TextContent( + type="text", + text=f"📚 Supported Languages:\n\n{lang_list}\n\n" + f"Note: {data.get('note', '')}" + ) + ] + else: + return [ + types.TextContent( + type="text", + text="❌ Failed to retrieve supported languages" + ) + ] + + except Exception as e: + return [ + types.TextContent( + type="text", + text=f"❌ Error: {str(e)}" + ) + ] + + async def _check_health(self) -> list[types.TextContent]: + """Check API health""" + try: + async with httpx.AsyncClient() as client: + response = await client.get(f"{API_BASE_URL}/health") + + if response.status_code == 200: + data = response.json() + return [ + types.TextContent( + type="text", + text=f"✅ API is healthy!\n\n" + f"Status: {data.get('status')}\n" + f"Translation Service: {data.get('translation_service')}" + ) + ] + else: + return [ + types.TextContent( + type="text", + text="❌ API is not responding correctly" + ) + ] + + except Exception as e: + return [ + types.TextContent( + type="text", + text=f"❌ Cannot connect to API: {str(e)}" + ) + ] + + async def run(self): + """Run the MCP server""" + async with stdio_server() as (read_stream, write_stream): + await self.server.run( + read_stream, + write_stream, + InitializationOptions( + server_name="document-translator", + server_version="1.0.0", + capabilities=self.server.get_capabilities( + notification_options=NotificationOptions(), + experimental_capabilities={} + ) + ) 
+ ) + + +async def main(): + """Main entry point""" + mcp_server = DocumentTranslatorMCP() + await mcp_server.run() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..d3f34e5 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,10 @@ +[project] +name = "translate" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "pip>=25.3", + "requests>=2.32.5", +] diff --git a/requirements-mcp.txt b/requirements-mcp.txt new file mode 100644 index 0000000..0ef6d71 --- /dev/null +++ b/requirements-mcp.txt @@ -0,0 +1,5 @@ +# MCP Server Requirements +# Add these to requirements.txt if you want to implement the MCP server + +mcp>=0.9.0 +httpx>=0.26.0 diff --git a/requirements-test.txt b/requirements-test.txt new file mode 100644 index 0000000..ee40f08 --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1,5 @@ +# Testing requirements +requests==2.31.0 +pytest==7.4.3 +pytest-asyncio==0.23.2 +httpx==0.26.0 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8f9e10c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,15 @@ +fastapi==0.109.0 +uvicorn[standard]==0.27.0 +python-multipart==0.0.9 +openpyxl==3.1.2 +python-docx==1.1.0 +python-pptx==0.6.23 +deep-translator==1.11.4 +python-dotenv==1.0.0 +pydantic==2.5.3 +aiofiles==23.2.1 +Pillow==10.2.0 +matplotlib==3.8.2 +pandas==2.1.4 +requests==2.31.0 +ipykernel==6.27.1 diff --git a/sample_files/.~lock.complex_sample.docx# b/sample_files/.~lock.complex_sample.docx# new file mode 100644 index 0000000..9dd9cce --- /dev/null +++ b/sample_files/.~lock.complex_sample.docx# @@ -0,0 +1 @@ +,ramez,simorgh,30.11.2025 09:24,C:/Users/ramez/AppData/Local/onlyoffice; \ No newline at end of file diff --git a/sample_files/complex_sample.docx b/sample_files/complex_sample.docx new file mode 100644 index 0000000..90dbe3f Binary files /dev/null and 
b/sample_files/complex_sample.docx differ diff --git a/sample_files/complex_sample.pptx b/sample_files/complex_sample.pptx new file mode 100644 index 0000000..5789a72 Binary files /dev/null and b/sample_files/complex_sample.pptx differ diff --git a/sample_files/complex_sample.xlsx b/sample_files/complex_sample.xlsx new file mode 100644 index 0000000..e5e8261 Binary files /dev/null and b/sample_files/complex_sample.xlsx differ diff --git a/sample_files/ppt_conclusion_image.png b/sample_files/ppt_conclusion_image.png new file mode 100644 index 0000000..eb1ebc2 Binary files /dev/null and b/sample_files/ppt_conclusion_image.png differ diff --git a/sample_files/ppt_financial_chart.png b/sample_files/ppt_financial_chart.png new file mode 100644 index 0000000..995691d Binary files /dev/null and b/sample_files/ppt_financial_chart.png differ diff --git a/sample_files/ppt_image1.png b/sample_files/ppt_image1.png new file mode 100644 index 0000000..ef001ff Binary files /dev/null and b/sample_files/ppt_image1.png differ diff --git a/sample_files/ppt_image2.png b/sample_files/ppt_image2.png new file mode 100644 index 0000000..10166ba Binary files /dev/null and b/sample_files/ppt_image2.png differ diff --git a/sample_files/super_complex.docx b/sample_files/super_complex.docx new file mode 100644 index 0000000..1472a80 Binary files /dev/null and b/sample_files/super_complex.docx differ diff --git a/sample_files/super_complex.pptx b/sample_files/super_complex.pptx new file mode 100644 index 0000000..0bcae96 Binary files /dev/null and b/sample_files/super_complex.pptx differ diff --git a/sample_files/super_complex.xlsx b/sample_files/super_complex.xlsx new file mode 100644 index 0000000..a8e5c63 Binary files /dev/null and b/sample_files/super_complex.xlsx differ diff --git a/sample_files/word_chart.png b/sample_files/word_chart.png new file mode 100644 index 0000000..be90a38 Binary files /dev/null and b/sample_files/word_chart.png differ diff --git a/sample_files/word_image.png 
class TranslationProvider(ABC):
    """Abstract base class for translation providers"""

    @abstractmethod
    def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str:
        """Translate text from source to target language"""
        pass

    def _translate_or_passthrough(self, make_translator, text: str) -> str:
        """Translate *text* with a freshly built backend, best effort.

        Args:
            make_translator: Zero-argument callable returning an object with
                a ``translate(text)`` method.
            text: Text to translate.

        Returns:
            The translated text. The original *text* is returned unchanged
            when it is blank, when the backend raises, or when the backend
            yields ``None``, so callers always get a usable string back.
        """
        import logging  # local import: shared helper stays self-contained

        if not text or not text.strip():
            return text

        try:
            translated = make_translator().translate(text)
        except Exception as e:
            # Deliberately best effort: one failing string must not abort the
            # translation of a whole document, but the failure is logged
            # instead of being printed to stdout.
            logging.getLogger(__name__).warning("Translation error: %s", e)
            return text

        return text if translated is None else translated


class GoogleTranslationProvider(TranslationProvider):
    """Google Translate implementation"""

    def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str:
        return self._translate_or_passthrough(
            lambda: GoogleTranslator(source=source_language, target=target_language),
            text,
        )


class DeepLTranslationProvider(TranslationProvider):
    """DeepL Translate implementation"""

    def __init__(self, api_key: str):
        self.api_key = api_key

    def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str:
        return self._translate_or_passthrough(
            lambda: DeeplTranslator(
                api_key=self.api_key, source=source_language, target=target_language
            ),
            text,
        )


class LibreTranslationProvider(TranslationProvider):
    """LibreTranslate implementation"""

    def translate(self, text: str, target_language: str, source_language: str = 'auto') -> str:
        return self._translate_or_passthrough(
            lambda: LibreTranslator(source=source_language, target=target_language),
            text,
        )


class TranslationService:
    """Main translation service that delegates to the configured provider"""

    def __init__(self, provider: Optional[TranslationProvider] = None):
        if provider is not None:
            self.provider = provider
        else:
            # Auto-select provider based on configuration
            self.provider = self._get_default_provider()

    def _get_default_provider(self) -> TranslationProvider:
        """Get the default translation provider from configuration

        Raises:
            ValueError: If DeepL is selected but no API key is configured.
        """
        service_type = config.TRANSLATION_SERVICE.lower()

        if service_type == "deepl":
            if not config.DEEPL_API_KEY:
                raise ValueError("DeepL API key not configured")
            return DeepLTranslationProvider(config.DEEPL_API_KEY)
        if service_type == "libre":
            return LibreTranslationProvider()
        # Anything else falls back to Google
        return GoogleTranslationProvider()

    def translate_text(self, text: str, target_language: str, source_language: str = 'auto') -> str:
        """
        Translate a single text string

        Args:
            text: Text to translate
            target_language: Target language code (e.g., 'es', 'fr', 'de')
            source_language: Source language code (default: 'auto' for auto-detection)

        Returns:
            Translated text; blank input is returned unchanged
        """
        if not text or not text.strip():
            return text

        return self.provider.translate(text, target_language, source_language)

    def translate_batch(self, texts: list[str], target_language: str, source_language: str = 'auto') -> list[str]:
        """
        Translate multiple text strings

        Args:
            texts: List of texts to translate
            target_language: Target language code
            source_language: Source language code (default: 'auto')

        Returns:
            List of translated texts, in the same order as the input
        """
        return [self.translate_text(text, target_language, source_language) for text in texts]
-ForegroundColor Green +New-Item -ItemType Directory -Force -Path uploads | Out-Null +New-Item -ItemType Directory -Force -Path outputs | Out-Null +New-Item -ItemType Directory -Force -Path temp | Out-Null + +# Copy .env.example to .env if .env doesn't exist +if (-Not (Test-Path ".\.env")) { + Write-Host "Creating .env file from template..." -ForegroundColor Yellow + Copy-Item .env.example .env +} + +Write-Host "" +Write-Host "===============================================" -ForegroundColor Green +Write-Host " Starting API Server on http://localhost:8000 " -ForegroundColor Green +Write-Host "===============================================" -ForegroundColor Green +Write-Host "" +Write-Host "API Documentation available at:" -ForegroundColor Cyan +Write-Host " - Swagger UI: http://localhost:8000/docs" -ForegroundColor White +Write-Host " - ReDoc: http://localhost:8000/redoc" -ForegroundColor White +Write-Host "" +Write-Host "Press Ctrl+C to stop the server" -ForegroundColor Yellow +Write-Host "" + +# Start the server +python main.py diff --git a/translators/__init__.py b/translators/__init__.py new file mode 100644 index 0000000..662ddf1 --- /dev/null +++ b/translators/__init__.py @@ -0,0 +1,10 @@ +"""Translators package initialization""" +from .excel_translator import ExcelTranslator, excel_translator +from .word_translator import WordTranslator, word_translator +from .pptx_translator import PowerPointTranslator, pptx_translator + +__all__ = [ + 'ExcelTranslator', 'excel_translator', + 'WordTranslator', 'word_translator', + 'PowerPointTranslator', 'pptx_translator' +] diff --git a/translators/excel_translator.py b/translators/excel_translator.py new file mode 100644 index 0000000..503561a --- /dev/null +++ b/translators/excel_translator.py @@ -0,0 +1,161 @@ +""" +Excel Translation Module +Translates Excel files while preserving all formatting, formulas, images, and layout +""" +import re +from pathlib import Path +from typing import Dict, Set +from openpyxl import 
class ExcelTranslator:
    """Handles translation of Excel files with strict formatting preservation"""

    # Characters that are not allowed in a worksheet title.
    _INVALID_TITLE_CHARS = re.compile(r'[\\*?:/\[\]]')
    # Excel limits worksheet titles to 31 characters.
    _MAX_TITLE_LENGTH = 31
    # A double-quoted string literal in a formula; a doubled quote ("") inside
    # the literal is an escaped quote, not the end of the string.
    _FORMULA_STRING = re.compile(r'"((?:[^"]|"")*)"')

    def __init__(self):
        self.translation_service = translation_service
        self.formula_pattern = re.compile(r'=.*')

    def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
        """
        Translate an Excel file while preserving all formatting and structure

        Args:
            input_path: Path to input Excel file
            output_path: Path to save translated Excel file
            target_language: Target language code

        Returns:
            Path to the translated file
        """
        # Load workbook with data_only=False to preserve formulas
        workbook = load_workbook(input_path, data_only=False)
        try:
            # First translate all worksheet content and only *plan* the new
            # sheet titles; renaming happens after every sheet was processed.
            sheet_name_mapping = {}
            for sheet_name in workbook.sheetnames:
                self._translate_worksheet(workbook[sheet_name], target_language)

                translated_sheet_name = self.translation_service.translate_text(
                    sheet_name, target_language
                )
                if not translated_sheet_name or translated_sheet_name == sheet_name:
                    continue

                taken = (set(workbook.sheetnames) - {sheet_name}) | set(
                    sheet_name_mapping.values()
                )
                new_name = self._unique_sheet_title(translated_sheet_name, taken)
                if new_name and new_name != sheet_name:
                    sheet_name_mapping[sheet_name] = new_name

            # NOTE(review): formulas that reference a sheet by name
            # (e.g. =Data!A1) are not rewritten here when that sheet is
            # renamed - confirm whether cross-sheet references need updating.
            for original_name, new_name in sheet_name_mapping.items():
                workbook[original_name].title = new_name

            # Save the translated workbook
            workbook.save(output_path)
        finally:
            # Release the workbook even when translation or saving fails.
            workbook.close()

        return output_path

    def _unique_sheet_title(self, proposed: str, taken) -> str:
        """Turn a translated name into a valid, unused worksheet title.

        Args:
            proposed: Translated sheet name.
            taken: Collection of titles that must not be reused.

        Returns:
            The title with forbidden characters replaced by ``_``, cut to the
            31-character limit and, if needed, suffixed with ``_<n>`` (still
            within the limit) so it does not collide with *taken*. May be an
            empty string when nothing usable remains.
        """
        cleaned = self._INVALID_TITLE_CHARS.sub("_", proposed).strip()
        title = cleaned[:self._MAX_TITLE_LENGTH]
        counter = 1
        while title in taken:
            suffix = f"_{counter}"
            title = f"{cleaned[:self._MAX_TITLE_LENGTH - len(suffix)]}{suffix}"
            counter += 1
        return title

    def _translate_worksheet(self, worksheet: Worksheet, target_language: str):
        """
        Translate all cells in a worksheet while preserving formatting

        Args:
            worksheet: Worksheet to translate
            target_language: Target language code
        """
        # Iterate through all cells that have values
        for row in worksheet.iter_rows():
            for cell in row:
                if cell.value is not None:
                    self._translate_cell(cell, target_language)

    def _translate_cell(self, cell: Cell, target_language: str):
        """
        Translate a single cell while preserving its formula and formatting

        Args:
            cell: Cell to translate
            target_language: Target language code
        """
        original_value = cell.value

        # Numbers, dates, booleans and empty cells remain unchanged
        if not isinstance(original_value, str):
            return

        if original_value.startswith('='):
            # Formulas: only the quoted string literals are translated
            self._translate_formula(cell, original_value, target_language)
        else:
            cell.value = self.translation_service.translate_text(
                original_value, target_language
            )

    def _translate_formula(self, cell: Cell, formula: str, target_language: str):
        """
        Translate text within a formula while preserving the formula structure

        Every double-quoted literal is translated exactly once, in a single
        pass over the formula, so the translation of one literal is never fed
        back into the translator because it happens to equal another literal.
        Doubled quotes inside a literal are unescaped before translation and
        quotes in the translated text are escaped again, keeping the formula
        syntactically valid.

        Args:
            cell: Cell containing the formula
            formula: Formula string
            target_language: Target language code
        """
        pattern = self._FORMULA_STRING
        if pattern.search(formula) is None:
            # No string literals: leave the cell untouched
            return

        translations = {}  # literal -> translation, avoids repeated lookups

        def _swap(match):
            literal = match.group(1).replace('""', '"')
            if not literal.strip():  # Only translate non-empty strings
                return match.group(0)
            if literal not in translations:
                translations[literal] = self.translation_service.translate_text(
                    literal, target_language
                )
            translated = translations[literal]
            if translated is None:
                return match.group(0)
            return '"' + translated.replace('"', '""') + '"'

        cell.value = pattern.sub(_swap, formula)

    def _should_translate(self, text: str) -> bool:
        """
        Determine if text should be translated

        Args:
            text: Text to check

        Returns:
            True if text should be translated, False otherwise
        """
        if not text or not isinstance(text, str):
            return False

        # Don't translate very short text (fewer than 2 non-space characters)
        if len(text.strip()) < 2:
            return False

        # Formulas are handled separately
        if text.startswith('='):
            return False

        return True
translated presentation + presentation.save(output_path) + + return output_path
+
+    def _translate_slide(self, slide, target_language: str):
+        """
+        Translate all text elements in a slide while preserving layout
+
+        Args:
+            slide: Slide to translate
+            target_language: Target language code
+        """
+        # Translate notes (speaker notes)
+        if slide.has_notes_slide:
+            notes_slide = slide.notes_slide
+            if notes_slide.notes_text_frame:
+                self._translate_text_frame(notes_slide.notes_text_frame, target_language)
+
+        # Translate shapes in the slide
+        for shape in slide.shapes:
+            self._translate_shape(shape, target_language)
+
+    def _translate_shape(self, shape: BaseShape, target_language: str):
+        """
+        Translate text in a shape based on its type
+
+        Args:
+            shape: Shape to translate
+            target_language: Target language code
+        """
+        # Handle text-containing shapes
+        if shape.has_text_frame:
+            self._translate_text_frame(shape.text_frame, target_language)
+
+        # Handle tables
+        if shape.shape_type == MSO_SHAPE_TYPE.TABLE:
+            self._translate_table(shape.table, target_language)
+
+        # Handle group shapes and any other container exposing `shapes`.
+        # Each container is walked exactly once: recursing from both a GROUP
+        # check and a generic `shapes` check would visit grouped shapes twice
+        # and feed already translated text back into the translator.
+        # Only the iteration itself is guarded; translation errors propagate.
+        sub_shapes = getattr(shape, 'shapes', None)
+        if sub_shapes is not None:
+            try:
+                children = list(sub_shapes)
+            except TypeError:
+                children = []  # Some shapes may not support iteration
+            for sub_shape in children:
+                self._translate_shape(sub_shape, target_language)
+
+    def _translate_text_frame(self, text_frame, target_language: str):
+        """
+        Translate text within a text frame while preserving formatting
+
+        Args:
+            text_frame: Text frame to translate
+            target_language: Target language code
+        """
+        if not text_frame.text.strip():
+            return
+
+        # Translate each paragraph in the text frame
+        for paragraph in text_frame.paragraphs:
+            self._translate_paragraph(paragraph, target_language)
+
+    def _translate_paragraph(self, paragraph, target_language: str):
+        """
+        Translate a paragraph while preserving run-level formatting
+
+        Args:
+            paragraph: Paragraph to translate
+            target_language: Target language code
+        """
+        if not paragraph.text.strip():
+            return
+
+        # Translate each run in the paragraph to preserve individual formatting
+        for run in paragraph.runs:
+            if run.text.strip():
+                translated_text = self.translation_service.translate_text(
+                    run.text, target_language
+                )
+                run.text = translated_text
+
+    def _translate_table(self, table, target_language: str):
+        """
+        Translate all cells in a table while preserving structure
+
+        Args:
+            table: Table to translate
+            target_language: Target language code
+        """
+        for row in table.rows:
+            for cell in row.cells:
+                self._translate_text_frame(cell.text_frame, target_language)
+
+    def _is_translatable(self, text: str) -> bool:
+        """
+        Determine if text should be translated
+
+        Args:
+            text: Text to check
+
+        Returns:
+            True if text should be translated, False otherwise
+        """
+        if not text or not isinstance(text, str):
+            return False
+
+        # Skip strings shorter than two characters once surrounding whitespace is stripped
+        if len(text.strip()) < 2:
+            return False
+
+        return True
+
+
+# Global translator instance
+pptx_translator = PowerPointTranslator()
diff --git a/translators/word_translator.py b/translators/word_translator.py
new file mode 100644
index 0000000..d06fd1e
--- /dev/null
+++ b/translators/word_translator.py
@@ -0,0 +1,161 @@
+"""
+Word Document Translation Module
+Translates Word files while preserving all formatting, styles, tables, and images
+"""
+from pathlib import Path
+from docx import Document
+from docx.text.paragraph import Paragraph
+from docx.table import Table, _Cell
+from docx.oxml.text.paragraph import CT_P
+from docx.oxml.table import CT_Tbl
+from docx.section import Section
+from services.translation_service import translation_service
+
+
+class WordTranslator:
+    """Handles translation of Word documents with strict formatting preservation"""
+
+    def __init__(self):
+        self.translation_service = translation_service
+
+    def translate_file(self, input_path: Path, output_path: Path, target_language: str) -> Path:
+        """
+        Translate a Word document while preserving all formatting and structure
+
+        Args:
+            input_path: Path to input Word file
+            output_path: Path to save translated Word file
+            target_language: Target language code
+
+        Returns:
+            Path to the translated file
+        """
+        document = Document(input_path)
+
+        # Translate main document body
+        self._translate_document_body(document, target_language)
+
+        # Translate headers and footers in all sections
+        for section in document.sections:
+            self._translate_section(section, target_language)
+
+        # Save the translated document
+        document.save(output_path)
+
+        return output_path
+
+    def _translate_document_body(self, document: Document, target_language: str):
+        """
+        Translate all elements in the document body
+
+        Args:
+            document: Document to translate
+            target_language: Target language code
+        """
+        for element in document.element.body:
+            if isinstance(element, CT_P):
+                # It's a paragraph
+                paragraph = Paragraph(element, document)
+                self._translate_paragraph(paragraph, target_language)
+            elif isinstance(element, CT_Tbl):
+                # It's a table
+                table = Table(element, document)
+                self._translate_table(table, target_language)
+
+    def _translate_paragraph(self, paragraph: Paragraph, target_language: str):
+        """
+        Translate a paragraph while preserving all formatting
+
+        Args:
+            paragraph: Paragraph to translate
+            target_language: Target language code
+        """
+        if not paragraph.text.strip():
+            return
+
+        # For paragraphs with complex formatting (multiple runs), translate run by run
+        if paragraph.runs:
+            for run in paragraph.runs:
+                if run.text.strip():
+                    translated_text = self.translation_service.translate_text(
+                        run.text, target_language
+                    )
+                    run.text = translated_text
+        else:
+            # Simple paragraph with no runs
+            if paragraph.text.strip():
+                translated_text = self.translation_service.translate_text(
+                    paragraph.text, target_language
+                )
+                paragraph.text = translated_text
+
+    def _translate_table(self, table: Table, target_language: str):
+        """
+        Translate all cells in a table while preserving structure
+
+        Merged cells are translated only once: python-docx reports a merged
+        cell at every grid position it spans, so the same cell can show up
+        several times while walking the rows.
+
+        Args:
+            table: Table to translate
+            target_language: Target language code
+        """
+        # Underlying XML elements of the cells that were already translated
+        seen_cells = set()
+        for row in table.rows:
+            for cell in row.cells:
+                if cell._tc in seen_cells:
+                    continue
+                seen_cells.add(cell._tc)
+                self._translate_cell(cell, target_language)
+
+    def _translate_cell(self, cell: _Cell, target_language: str):
+        """
+        Translate content within a table cell
+
+        Args:
+            cell: Cell to translate
+            target_language: Target language code
+        """
+        for paragraph in cell.paragraphs:
+            self._translate_paragraph(paragraph, target_language)
+
+        # Handle nested tables
+        for table in cell.tables:
+            self._translate_table(table, target_language)
+
+    def _translate_section(self, section: Section, target_language: str):
+        """
+        Translate headers and footers in a section
+
+        Covers the primary, first-page and even-page header and footer. A
+        header or footer that is linked to the previous section has no content
+        of its own: reading it returns the inherited content (translated when
+        the owning section was processed) or, when nothing is inherited, makes
+        python-docx add an empty definition. Linked parts are therefore skipped
+        so text is translated once and no empty headers/footers are created.
+
+        Args:
+            section: Section to translate
+            target_language: Target language code
+        """
+        header_footer_parts = (
+            section.header,
+            section.footer,
+            section.first_page_header,
+            section.first_page_footer,
+            section.even_page_header,
+            section.even_page_footer,
+        )
+        for part in header_footer_parts:
+            if part.is_linked_to_previous:
+                continue
+            for paragraph in part.paragraphs:
+                self._translate_paragraph(paragraph, target_language)
+            for table in part.tables:
+                self._translate_table(table, target_language)
+
+
+# Global translator instance
+word_translator = WordTranslator()
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..a950000
--- /dev/null
+++ b/utils/__init__.py
@@ -0,0 +1,20 @@
+"""Utils package initialization"""
+from .file_handler import FileHandler, file_handler
+from .exceptions import (
+    TranslationError,
+    UnsupportedFileTypeError,
+    FileSizeLimitExceededError,
+    LanguageNotSupportedError,
+    DocumentProcessingError,
+    handle_translation_error
+)
+
+__all__ = [
+    'FileHandler', 'file_handler',
+    'TranslationError',
+    'UnsupportedFileTypeError',
+    'FileSizeLimitExceededError',
+    'LanguageNotSupportedError',
+    'DocumentProcessingError',
+    'handle_translation_error'
+]
diff --git a/utils/exceptions.py b/utils/exceptions.py
new file mode 100644
index 0000000..237340e
--- /dev/null
+++ b/utils/exceptions.py
@@ -0,0 +1,51 @@
+"""
+Custom exceptions for the Document Translation API
+"""
+from fastapi import HTTPException
+
+
+class TranslationError(Exception):
+    """Base exception for translation errors"""
+    pass
+
+
+class UnsupportedFileTypeError(TranslationError):
+    """Raised when an unsupported file type is provided"""
+    pass
+
+
+class FileSizeLimitExceededError(TranslationError):
+    """Raised when a file exceeds the size limit"""
+    pass
+
+
+class LanguageNotSupportedError(TranslationError):
+    """Raised when a language code is not supported"""
+    pass
+
+
+class DocumentProcessingError(TranslationError):
+    """Raised when there's an error processing the document"""
+    pass
+
+
+def handle_translation_error(error: Exception) -> HTTPException:
+    """
+    Convert translation errors to HTTP exceptions
+
+    Args:
+        error: Exception that occurred
+
+    Returns:
+        HTTPException with appropriate status code and message
+    """
+    if isinstance(error, UnsupportedFileTypeError):
+        return HTTPException(status_code=400, detail=str(error))
+    elif isinstance(error, FileSizeLimitExceededError):
+        return HTTPException(status_code=413, detail=str(error))
+    elif isinstance(error, LanguageNotSupportedError):
+        return HTTPException(status_code=400, detail=str(error))
+    elif isinstance(error, DocumentProcessingError):
+        return HTTPException(status_code=500, detail=str(error))
+    else:
+        return HTTPException(status_code=500, detail="An unexpected error occurred during translation")
diff --git a/utils/file_handler.py b/utils/file_handler.py
new file mode 100644
index 0000000..7dae916
--- /dev/null
+++ b/utils/file_handler.py
@@ -0,0 +1,143 @@
+"""
+Utility functions for file handling and validation
+"""
+import os
+import uuid
+from pathlib import Path
+from typing import Optional
+from fastapi import UploadFile, HTTPException
+from config import config
+
+
+class FileHandler:
+    """Handles file operations for the translation API"""
+
+    @staticmethod
+    def validate_file_extension(filename: str) -> str:
+        """
+        Validate that the file extension is supported
+
+        Args:
+            filename: Name of the file
+
+        Returns:
+            File extension (lowercase, with dot)
+
+        Raises:
+            HTTPException: If file extension is not supported
+        """
+        file_extension = Path(filename).suffix.lower()
+
+        if file_extension not in config.SUPPORTED_EXTENSIONS:
+            raise HTTPException(
+                status_code=400,
+                detail=f"Unsupported file type. Supported types: {', '.join(config.SUPPORTED_EXTENSIONS)}"
+            )
+
+        return file_extension
+
+    @staticmethod
+    def validate_file_size(file: UploadFile) -> None:
+        """
+        Validate that the file size is within limits
+
+        Args:
+            file: Uploaded file
+
+        Raises:
+            HTTPException: With status 413 if file is too large
+        """
+        # Get file size
+        file.file.seek(0, 2)  # Move to end of file
+        file_size = file.file.tell()  # Get position (file size)
+        file.file.seek(0)  # Reset to beginning
+
+        if file_size > config.MAX_FILE_SIZE_BYTES:
+            raise HTTPException(
+                status_code=413,
+                detail=f"File too large. Maximum size: {config.MAX_FILE_SIZE_MB}MB"
+            )
+
+    @staticmethod
+    async def save_upload_file(file: UploadFile, destination: Path) -> Path:
+        """
+        Save an uploaded file to disk
+
+        Args:
+            file: Uploaded file
+            destination: Path to save the file
+
+        Returns:
+            Path to the saved file
+        """
+        destination.parent.mkdir(parents=True, exist_ok=True)
+
+        # Copy in 1 MiB chunks so a large upload is never held in memory at once
+        with open(destination, "wb") as buffer:
+            while chunk := await file.read(1024 * 1024):
+                buffer.write(chunk)
+
+        return destination
+
+    @staticmethod
+    def generate_unique_filename(original_filename: str, prefix: str = "") -> str:
+        """
+        Generate a unique filename to avoid collisions
+
+        Args:
+            original_filename: Original filename
+            prefix: Optional prefix for the filename
+
+        Returns:
+            Unique filename
+        """
+        file_path = Path(original_filename)
+        unique_id = str(uuid.uuid4())[:8]
+
+        if prefix:
+            return f"{prefix}_{unique_id}_{file_path.stem}{file_path.suffix}"
+        else:
+            return f"{unique_id}_{file_path.stem}{file_path.suffix}"
+
+    @staticmethod
+    def cleanup_file(file_path: Path) -> None:
+        """
+        Delete a file if it exists
+
+        Args:
+            file_path: Path to the file to delete
+        """
+        try:
+            if file_path.exists():
+                file_path.unlink()
+        except Exception as e:
+            print(f"Error deleting file {file_path}: {e}")
+
+    @staticmethod
+    def get_file_info(file_path: Path) -> dict:
+        """
+        Get information about a file
+
+        Args:
+            file_path: Path to the file
+
+        Returns:
+            Dictionary with file information
+        """
+        if not file_path.exists():
+            return {}
+
+        stat = file_path.stat()
+
+        return {
+            "filename": file_path.name,
+            "size_bytes": stat.st_size,
+            "size_mb": round(stat.st_size / (1024 * 1024), 2),
+            "extension": file_path.suffix,
+            "created": stat.st_ctime,
+            "modified": stat.st_mtime
+        }
+
+
+# Global file handler instance
+file_handler = FileHandler()