OmniRoute/scripts/i18n_autotranslate.py
2026-03-29 11:26:17 -03:00

151 lines
5.8 KiB
Python
Executable file

#!/usr/bin/env python3
"""
OmniRoute i18n Auto-Translator
This script scans the Markdown files in each language directory under
docs/i18n and uses an LLM API (such as OmniRoute itself) to translate any
paragraphs that are still in English into that directory's target language.
Usage:
python3 scripts/i18n_autotranslate.py --api-url http://192.168.0.15:20128/v1 --api-key <YOUR_API_KEY> --model cx/gpt-5.4
"""
import os
import re
import sys
import glob
import json
import urllib.request
import urllib.error
import argparse
from pathlib import Path
# Paths are derived from this file's location rather than the current
# working directory.
# Absolute path of the directory that contains this script.
SCRIPT_DIR = Path(__file__).parent.resolve()
# The project root is the parent of the script's directory.
PROJECT_ROOT = SCRIPT_DIR.parent
# Directory whose sub-directories (one per language code) are scanned by main().
I18N_DIR = PROJECT_ROOT / "docs" / "i18n"
def get_language_name(lang_code):
    """Return the English display name for an i18n directory code.

    Codes that have no entry in the table are handed back unchanged, so the
    caller always receives something printable.
    """
    display_names = {
        "pt-BR": "Portuguese (Brazil)",
        "es": "Spanish",
        "fr": "French",
        "it": "Italian",
        "ru": "Russian",
        "zh-CN": "Simplified Chinese",
        "de": "German",
        "in": "Hindi",
        "th": "Thai",
        "uk-UA": "Ukrainian",
        "ar": "Arabic",
        "ja": "Japanese",
        "vi": "Vietnamese",
        "bg": "Bulgarian",
        "da": "Danish",
        "fi": "Finnish",
        "he": "Hebrew",
        "hu": "Hungarian",
        "id": "Indonesian",
        "ko": "Korean",
        "ms": "Malay",
        "nl": "Dutch",
        "no": "Norwegian",
        "pt": "Portuguese (Portugal)",
        "ro": "Romanian",
        "pl": "Polish",
        "sk": "Slovak",
        "sv": "Swedish",
        "phi": "Filipino",
        "cs": "Czech",
    }
    try:
        return display_names[lang_code]
    except KeyError:
        # Unknown code: fall back to the code itself.
        return lang_code
def translate_block(text, target_language, api_url, api_key, model, timeout=120):
    """Translate one Markdown block via an OpenAI-style chat-completions API.

    The block is embedded in a prompt and POSTed as JSON to
    ``{api_url}/chat/completions``. Translation is best-effort: any failure
    is reported on stdout and the original text is returned, so a caller can
    always substitute the result for the input.

    Args:
        text: Markdown block to translate.
        target_language: Language name inserted into the prompt.
        api_url: Base URL of the API; a trailing slash is tolerated.
        api_key: Sent as a ``Bearer`` token in the Authorization header.
        model: Model identifier placed in the request payload.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The translated text with surrounding whitespace stripped, or ``text``
        unchanged when it is blank, the request fails, or the response
        carries no usable translation.
    """
    if not text.strip():
        return text

    prompt = (
        "You are a professional technical translator working on the OmniRoute proxy project documentation.\n"
        f"Translate the following Markdown text from English to {target_language}.\n"
        "CRITICAL RULES:\n"
        "- Do NOT translate code blocks (```...```).\n"
        "- Do NOT translate markdown formatting elements, links syntax, or image syntax.\n"
        "- Retain formatting perfectly.\n"
        "- Only return the translated text without introductory phrases.\n\n"
        f"{text}"
    )
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a direct translator. Output only the requested translation."},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.3,
        "stream": False,
    }

    try:
        # Request() validates the URL and raises ValueError on a malformed
        # one, so it is built inside the try to keep the best-effort contract.
        req = urllib.request.Request(
            # Strip trailing slashes so ".../v1/" does not become ".../v1//chat/completions".
            f"{api_url.rstrip('/')}/chat/completions",
            data=json.dumps(payload).encode('utf-8'),
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {api_key}",
            },
        )
        # Without a timeout a stalled server would block the whole run forever.
        with urllib.request.urlopen(req, timeout=timeout) as response:
            result = json.loads(response.read().decode('utf-8'))
        choices = result.get("choices") or []
        translated = choices[0]["message"]["content"] if choices else None
    except Exception as e:
        # Deliberately broad: network, HTTP, decoding and response-shape
        # errors must all leave the original paragraph in place.
        print(f" ❌ API Error: {e}")
        return text

    # An empty or non-text reply must never overwrite the source paragraph.
    if not isinstance(translated, str) or not translated.strip():
        return text
    return translated.strip()
def process_file(file_path, target_language, api_url, api_key, model):
    """Translate the English-looking paragraphs of one Markdown file in place.

    The file is split on blank lines. A block is sent to ``translate_block``
    when it contains one of a few common English words and is longer than
    10 characters after stripping. Blocks that start with a code fence, a
    ``<div``, the 🌐 marker or a table pipe are kept verbatim, and so is
    every block that lies inside a fenced code block. The file is rewritten
    only when at least one block actually changed.

    Args:
        file_path: Path (``str`` or ``Path``) of the Markdown file to update.
        target_language: Language name passed on to ``translate_block``.
        api_url: API base URL passed on to ``translate_block``.
        api_key: API key passed on to ``translate_block``.
        model: Model name passed on to ``translate_block``.
    """
    file_path = Path(file_path)
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Simple heuristic: common English words mark a block as untranslated.
    # A more robust implementation would diff against the English source.
    english_words = (" the ", " is ", " are ", " this ", " that ", " a ", " to ")
    skip_prefixes = ('```', '<div', '🌐', '|')

    blocks = content.split('\n\n')
    translated_blocks = []
    needs_update = False
    inside_fence = False
    for block in blocks:
        started_inside_fence = inside_fence
        # Splitting on blank lines cuts a fenced code block that contains
        # blank lines into several pieces; only the first starts with ```.
        # An odd number of fence lines in a block flips the fence state.
        fence_lines = sum(
            1 for line in block.splitlines() if line.lstrip().startswith('```')
        )
        if fence_lines % 2:
            inside_fence = not inside_fence

        # Never translate code, HTML wrappers, the language bar or tables.
        if started_inside_fence or block.startswith(skip_prefixes):
            translated_blocks.append(block)
            continue

        lowered = block.lower()
        is_english = any(w in lowered for w in english_words)
        if is_english and len(block.strip()) > 10:
            print(f" 🔄 Translating paragraph (length {len(block)})...")
            new_block = translate_block(block, target_language, api_url, api_key, model)
            if new_block != block:
                needs_update = True
            translated_blocks.append(new_block)
        else:
            translated_blocks.append(block)

    if needs_update:
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write('\n\n'.join(translated_blocks))
        print(f" ✅ Updated translations in {file_path.name}")
    else:
        print(f"{file_path.name} already fully translated or no English blocks found.")
def main():
    """Parse the command line and translate every language directory.

    Each sub-directory of ``I18N_DIR`` is treated as one language; its name
    is the language code. With ``--lang`` only the directory of that name is
    processed. Every ``*.md`` file directly inside a selected directory is
    passed to ``process_file``.

    Exits through ``parser.error`` when ``I18N_DIR`` does not exist. Prints a
    notice and returns when ``--lang`` matches no directory.
    """
    parser = argparse.ArgumentParser(description="OmniRoute Auto-Translator for i18n Markdown")
    parser.add_argument("--api-url", default="http://localhost:20128/v1", help="Base URL of OmniRoute or target provider")
    parser.add_argument("--api-key", default="sk-test", help="API Key for the provider")
    parser.add_argument("--model", default="gc/gemini-3-flash", help="Model name to use")
    parser.add_argument("--lang", default=None, help="Process only a specific language code (e.g. pt-BR)")
    args = parser.parse_args()

    # Fail with a readable message instead of a traceback from iterdir().
    if not I18N_DIR.is_dir():
        parser.error(f"i18n directory not found: {I18N_DIR}")

    print("🚀 Starting Auto-Translator")
    print(f"🔗 Target API: {args.api_url} | Model: {args.model}\n")

    # Sorted so that runs are reproducible; iterdir() order is arbitrary.
    all_lang_dirs = sorted(d for d in I18N_DIR.iterdir() if d.is_dir())
    if args.lang:
        lang_dirs = [d for d in all_lang_dirs if d.name == args.lang]
        if not lang_dirs:
            # A mistyped code used to be a silent no-op; say so explicitly.
            available = ", ".join(d.name for d in all_lang_dirs) or "none"
            print(f"⚠️ No directory for language '{args.lang}' in {I18N_DIR} (available: {available})")
            return
    else:
        lang_dirs = all_lang_dirs

    for lang_dir in lang_dirs:
        lang_code = lang_dir.name
        lang_name = get_language_name(lang_code)
        print(f"\n🌍 Processing {lang_name} ({lang_code})")
        for md_file in sorted(lang_dir.glob("*.md")):
            process_file(md_file, lang_name, args.api_url, args.api_key, args.model)
# Run the translator only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()