# OmniRoute/scripts/validate_translation.py

#!/usr/bin/env python3
"""
OmniRoute i18n Translation Validator
Script for comparing source (en.json) with any translation
Detects missing translations and source changes needing updates

Usage:
    python validate_translation.py              # Uses TRANSLATION_LANG env or --lang argument
    python validate_translation.py --lang cs    # Validate Czech (cs.json)
    python validate_translation.py -l de       # Validate German (de.json)
    TRANSLATION_LANG=fr python validate_translation.py  # Validate French

Environment variables:
    TRANSLATION_LANG    Target language code (e.g., cs, de, fr)
"""

import json
import sys
import os
from pathlib import Path
from typing import Dict, List, Set, Tuple, Any
import argparse

# ANSI escape sequences used by the print_* helpers to colorize terminal output.
RED = '\033[0;31m'
GREEN = '\033[0;32m'
YELLOW = '\033[1;33m'  # bold yellow
BLUE = '\033[0;34m'
NC = '\033[0m'  # "no color": resets all attributes

# Path configuration: everything is derived from the location of this script.
_script_dir = Path(__file__).parent.resolve()
# A script living in a scripts/ subfolder has the repo root one level up;
# otherwise the script's own directory is used as the root.
SCRIPT_DIR = _script_dir.parent if _script_dir.name == "scripts" else _script_dir

# Directory holding one <lang>.json message catalogue per language.
MESSAGES_DIR = SCRIPT_DIR.joinpath("src", "i18n", "messages")
# The English catalogue is the reference every translation is compared against.
SOURCE_FILE = MESSAGES_DIR.joinpath("en.json")

# Resolve the target language: CLI argument, then environment, then default
def get_target_lang() -> str:
    """Return the language code of the translation to validate.

    Precedence, highest first:
      1. ``get_target_lang.cli_lang`` -- a function attribute that ``main``
         assigns (from ``-l/--lang``, or "cs" when neither the option nor the
         environment variable is given).
      2. The ``TRANSLATION_LANG`` environment variable.
      3. ``"cs"``, kept as the default for backwards compatibility.

    An explicit command-line choice must win over the ambient environment;
    otherwise ``TRANSLATION_LANG=fr ... --lang de`` would silently validate
    French.
    """
    # Explicit CLI choice first (the attribute is absent until main sets it).
    cli_lang = getattr(get_target_lang, 'cli_lang', None)
    if cli_lang:
        return cli_lang

    # Then the environment variable; an empty value counts as unset.
    env_lang = os.environ.get('TRANSLATION_LANG')
    if env_lang:
        return env_lang

    # Default to cs for backwards compatibility
    return "cs"

# Keys that should NOT be translated (technical terms, proper names, etc.).
# Entries are full dotted paths in the form produced by get_all_keys.
# find_untranslated skips these, so a value identical to the English source
# is not reported as "untranslated" for them.
UNTRANSLATABLE_KEYS = {
    # ICU/Plural formats
    "apiManager.modelsCount",
    # Technical/Protocol names
    "a2aDashboard.metadata",
    "a2aDashboard.ok",
    "a2aDashboard.url",
    "cliTools.baseUrlPlaceholder",
    "cliTools.platforms",
    "cliTools.toolDescriptions.claude",
    "cliTools.toolDescriptions.codex",
    "cliTools.toolDescriptions.cursor",
    "combos.roundRobin",
    "common.model",
    "docs.clientCherryStudioTitle",
    "docs.clientClaudeTitle",
    "docs.clientCursorTitle",
    "docs.github",
    "docs.protocolA2aTitle",
    "docs.protocolMcpTitle",
    "endpoint.chat",
    "endpoint.chatCompletions",
    "endpoint.cloudProxy",
    "endpoint.mcpCardTitle",
    "endpoint.rerank",
    "header.a2a",
    "header.mcp",
    "health.cpu",
    "health.latencyP50",
    "health.latencyP95",
    "health.latencyP99",
    "health.millisecondsShort",
    "health.notAvailable",
    "health.ok",
    "home.aliasLabel",
    "home.oauthLabel",
    "landing.brandName",
    "landing.flowProviderAnthropic",
    "landing.flowProviderGemini",
    "landing.flowProviderGithubCopilot",
    "landing.flowProviderOpenAI",
    "landing.flowToolClaudeCode",
    "landing.flowToolCline",
    "landing.flowToolCursor",
    "landing.flowToolOpenAICodex",
    "landing.github",
    "legal.terms",
    "legal.privacy",
    "logs.endpoint",
    "logs.proxy",
    "logs.console",
    "logs.request",
    "logs.audit",
    "media.interpolation",
    "media.upscale",
    "media.samples",
    "search.search",
    "search.searchTools",
    "search.webSearch",
    "search.fileSearch",
    "settings.theme",
    "settings.language",
    "settings.currency",
    "settings.timezone",
    "stats.requests",
    "stats.tokens",
    "stats.latency",
    "stats.errors",
    "themesPage.dark",
    "themesPage.light",
    "themesPage.system",
    "translator.translate",
    "translator.translateFrom",
    "translator.translateTo",
    "translator.detect",
    "translator.detectedLanguage",
    "usage.totalRequests",
    "usage.totalTokens",
    "usage.inputTokens",
    "usage.outputTokens",
    "usage.promptTokens",
    "usage.completionTokens",
    "usage.cacheReadTokens",
    "usage.cacheWriteTokens",
}


def print_header(msg: str) -> None:
    """Print *msg* in blue, framed by two 50-character '=' rules.

    A blank line is emitted before the top rule.
    """
    rule = '=' * 50
    print(f"\n{BLUE}{rule}{NC}")
    print(f"{BLUE}{msg}{NC}")
    print(f"{BLUE}{rule}{NC}")


def print_success(msg: str) -> None:
    """Print *msg* in green, prefixed with a check mark."""
    line = f"{GREEN}✓ {msg}{NC}"
    print(line)


def print_warning(msg: str) -> None:
    """Print *msg* in yellow, prefixed with a warning sign."""
    line = f"{YELLOW}⚠ {msg}{NC}"
    print(line)


def print_error(msg: str) -> None:
    """Print *msg* in red, prefixed with a cross mark."""
    line = f"{RED}✗ {msg}{NC}"
    print(line)


def load_json(path: Path) -> Dict:
    """Load and return the parsed contents of the JSON file at *path*.

    The file is read as UTF-8.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded JSON document.

    Exits:
        Prints an error and terminates the process with status 1 when the
        file cannot be opened or read, is not valid UTF-8, or is not valid
        JSON. Previously only malformed JSON was handled, so a missing source
        file produced a raw traceback.
    """
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except json.JSONDecodeError as e:
        print_error(f"Invalid JSON in {path}: {e}")
    except (OSError, UnicodeDecodeError) as e:
        print_error(f"Cannot read {path}: {e}")
    # Only reached after one of the handlers above has reported the problem.
    sys.exit(1)


def get_all_keys(obj: Any, prefix: str = "") -> Set[str]:
    """Collect the dotted paths of every leaf value under *obj*.

    Nested dicts contribute ``parent.child`` paths. A non-empty list whose
    first element is a dict is descended into with ``name[i]`` prefixes
    (non-dict elements of such a list contribute nothing). Any other value,
    including every other kind of list, is a leaf. A non-dict *obj* yields an
    empty set.
    """
    leaves: Set[str] = set()
    # Explicit work list of (node, path-so-far) pairs instead of recursion.
    pending = [(obj, prefix)]
    while pending:
        node, base = pending.pop()
        if not isinstance(node, dict):
            continue
        for name, child in node.items():
            path = f"{base}.{name}" if base else name
            if isinstance(child, dict):
                pending.append((child, path))
            elif isinstance(child, list) and child and isinstance(child[0], dict):
                # List of objects: visit each element under an indexed prefix.
                pending.extend(
                    (entry, f"{path}[{pos}]") for pos, entry in enumerate(child)
                )
            else:
                leaves.add(path)
    return leaves


def find_missing_keys(source: Dict, trans: Dict) -> Set[str]:
    """Return the leaf paths present in *source* but absent from *trans*."""
    return get_all_keys(source).difference(get_all_keys(trans))


def find_extra_keys(source: Dict, trans: Dict) -> Set[str]:
    """Return the leaf paths present in *trans* but absent from *source*."""
    return get_all_keys(trans).difference(get_all_keys(source))


def get_value_by_path(obj: Dict, path: str) -> Any:
    """Look up a value in nested dicts/lists by a dotted path.

    The path uses the notation produced by ``get_all_keys``: dict keys are
    joined by ``.`` and list positions are written ``name[i]``.

    Args:
        obj: Root container to search.
        path: Path such as ``"common.save"`` or ``"steps[1].title"``.

    Returns:
        The value found at *path*, or ``None`` when any segment cannot be
        resolved (unknown key, index out of range, or a scalar reached before
        the path is exhausted).
    """
    # "a[0].b" -> ["a", "0", "b"]
    segments = path.replace('[', '.').replace(']', '').split('.')
    current: Any = obj
    for segment in segments:
        # Decide by the container's type, not by what the segment looks like:
        # an all-digit segment is a list index only when we are inside a list.
        # Treating every numeric segment as an index made dict keys such as
        # "404" unresolvable.
        if isinstance(current, list):
            if not segment.isdecimal():
                return None
            idx = int(segment)
            if idx >= len(current):
                return None
            current = current[idx]
        elif isinstance(current, dict) and segment in current:
            current = current[segment]
        else:
            return None
    return current


def find_untranslated(source: Dict, trans: Dict) -> Set[str]:
    """Return source paths whose translated value equals the source value.

    Paths listed in ``UNTRANSLATABLE_KEYS`` are never reported, and neither
    are paths whose source value resolves to ``None``.
    """
    # Drop the allow-listed keys up front rather than testing inside the loop.
    candidates = get_all_keys(source) - UNTRANSLATABLE_KEYS
    same_as_source = set()
    for path in candidates:
        original = get_value_by_path(source, path)
        if original is None:
            continue
        if original == get_value_by_path(trans, path):
            same_as_source.add(path)
    return same_as_source


def _top_level_placeholders(text: str) -> Set[str]:
    """Return the argument names of the outermost ``{...}`` groups in *text*.

    ``"{count, plural, one {# item} other {# items}}"`` yields ``{"count"}``:
    the nested braces hold translatable branch text, not placeholders. A name
    is a letter followed by letters, digits or underscores, optionally
    preceded by whitespace. ICU apostrophe quoting is not interpreted.
    """
    # Local import: the file has no top-level `import re` (the original block
    # also imported it locally).
    import re
    match_name = re.compile(r'\s*([a-zA-Z][a-zA-Z0-9_]*)').match

    names: Set[str] = set()
    depth = 0
    for pos, char in enumerate(text):
        if char == '{':
            if depth == 0:
                found = match_name(text, pos + 1)
                if found:
                    names.add(found.group(1))
            depth += 1
        elif char == '}' and depth:
            # Ignore stray closing braces instead of going negative.
            depth -= 1
    return names


def find_placeholder_issues(source: Dict, trans: Dict) -> List[Tuple[str, str, str]]:
    """Find source placeholders that are missing from the translation.

    Only top-level placeholders such as ``{count}`` or ``{day}`` are compared
    (including the argument name of ``{count, plural, ...}``); text inside ICU
    branches is ignored. The previous regex also matched nested groups such as
    ``{item}``, so every translated plural/select branch was reported as a
    mismatch.

    Keys whose source or translated value is not a string are skipped.

    Returns:
        A list of ``(key, source_placeholders, trans_placeholders)`` tuples,
        ordered by key, where the two placeholder fields are the string forms
        of the respective name sets. A key is listed when at least one source
        placeholder is absent from the translation.
    """
    issues = []

    # Sorted so the report (and its truncation in the caller) is deterministic.
    for key in sorted(get_all_keys(source)):
        source_val = get_value_by_path(source, key)
        trans_val = get_value_by_path(trans, key)

        # Also covers missing (None) values on either side.
        if not isinstance(source_val, str) or not isinstance(trans_val, str):
            continue

        source_placeholders = _top_level_placeholders(source_val)
        trans_placeholders = _top_level_placeholders(trans_val)

        if source_placeholders - trans_placeholders:
            issues.append((key, str(source_placeholders), str(trans_placeholders)))

    return issues


def compare_category(source: Dict, trans: Dict, category: str) -> Tuple[bool, List[str]]:
    """Check whether *category* is fully present in the translation.

    Returns ``(complete, missing)``. When the category is absent from either
    side, ``complete`` is False and ``missing`` holds a single explanatory
    message instead of key paths; otherwise ``missing`` lists the leaf paths
    (relative to the category) found in the source but not the translation.
    """
    presence_checks = (
        (source, f"Category '{category}' not in source"),
        (trans, f"Category '{category}' missing in translation"),
    )
    for catalogue, problem in presence_checks:
        if category not in catalogue:
            return False, [problem]

    absent = get_all_keys(source[category]).difference(get_all_keys(trans[category]))
    return not absent, list(absent)


def get_translation_file() -> Path:
    """Return the path of ``<lang>.json`` for the current target language."""
    filename = "{}.json".format(get_target_lang())
    return MESSAGES_DIR / filename


def _print_key_sample(keys: Set[str], limit: int = 50) -> None:
    """Print up to *limit* keys in sorted order, then how many were omitted."""
    for key in sorted(keys)[:limit]:
        print(f"  - {key}")
    if len(keys) > limit:
        print(f"  ... and {len(keys) - limit} more")


def generate_report() -> int:
    """Print the full translation report for the current target language.

    Sections, in order: key statistics, missing keys, extra keys, untranslated
    keys, placeholder mismatches, per-category status and a summary.

    Returns:
        Always 0; the report is informational and does not fail the process.
    """
    translation_file = get_translation_file()
    print_header("OmniRoute Translation Report")
    print(f"Source: {SOURCE_FILE}")
    print(f"Translation: {translation_file}\n")

    source = load_json(SOURCE_FILE)
    trans = load_json(translation_file)

    # Count keys
    source_count = len(get_all_keys(source))
    trans_count = len(get_all_keys(trans))

    print(f"{BLUE}Key Statistics:{NC}")
    print(f"  Source keys: {source_count}")
    print(f"  Translation keys: {trans_count}\n")

    # Missing keys
    print_header("Missing Translations")
    missing = find_missing_keys(source, trans)
    if missing:
        print(f"{RED}Found {len(missing)} missing keys:{NC}")
        _print_key_sample(missing)
    else:
        print_success("No missing translations!")

    # Extra keys
    print_header("Extra Keys")
    extra = find_extra_keys(source, trans)
    if extra:
        print(f"{YELLOW}Found {len(extra)} extra keys:{NC}")
        # Also reports how many keys were cut off; this list used to be
        # truncated silently.
        _print_key_sample(extra)
    else:
        print_success("No extra keys!")

    # Untranslated
    print_header("Untranslated Keys (same as source)")
    untranslated = find_untranslated(source, trans)
    if untranslated:
        print(f"{YELLOW}Found {len(untranslated)} untranslated keys:{NC}")
        _print_key_sample(untranslated)
    else:
        print_success("All keys appear to be translated!")

    # Placeholder issues
    print_header("Placeholder Mismatches")
    placeholder_issues = find_placeholder_issues(source, trans)
    if placeholder_issues:
        print(f"{YELLOW}Found {len(placeholder_issues)} placeholder mismatches:{NC}")
        for key, src_ph, trans_ph in placeholder_issues[:20]:
            print(f"  - {key}")
            print(f"    Source: {src_ph}")
            print(f"    Trans:  {trans_ph}")
        if len(placeholder_issues) > 20:
            print(f"  ... and {len(placeholder_issues) - 20} more")
    else:
        print_success("All placeholders match!")

    # Per-category status
    print_header("Per-Category Status")
    for category in sorted(source.keys()):
        # Use a distinct name here: rebinding `missing` made the summary below
        # report the last category's list instead of the global set.
        complete, category_missing = compare_category(source, trans, category)
        if complete:
            print_success(f"{category} - complete")
        else:
            print_error(f"{category} - missing {len(category_missing)} keys")

    # Summary
    print_header("Summary")
    if not (missing or extra or untranslated or placeholder_issues):
        print(f"{GREEN}🎉 Translation is fully synchronized!{NC}")
    else:
        print(f"{YELLOW}Translation needs attention:{NC}")
        print(f"  - Missing: {len(missing)}")
        print(f"  - Extra: {len(extra)}")
        print(f"  - Untranslated: {len(untranslated)}")
        print(f"  - Placeholder mismatches: {len(placeholder_issues)}")
    return 0


def quick_check() -> int:
    """Print the number of missing and untranslated keys.

    Returns:
        Always 0. Both findings are treated as non-critical: at most one
        warning line is printed (missing keys take priority over untranslated
        ones) and the exit status is never raised.
    """
    translation_file = get_translation_file()
    source = load_json(SOURCE_FILE)
    trans = load_json(translation_file)

    missing_total = len(find_missing_keys(source, trans))
    untranslated_total = len(find_untranslated(source, trans))

    print(f"Missing: {missing_total}")
    print(f"Untranslated: {untranslated_total}")

    # Advisory only: warn about the first non-empty finding, never fail.
    if missing_total:
        print_warning(f"{missing_total} missing keys (non-critical)")
    elif untranslated_total:
        # Translations exist for these keys; they are just not localized.
        print_warning(f"{untranslated_total} untranslated keys (non-critical)")
    return 0


def show_diff(category: str) -> int:
    """Print a key / source / translation table for one top-level category.

    Each row is tagged ``(same)`` when both values are equal, ``(missing)``
    when the translation has no value, and ``(ok)`` otherwise. Values are cut
    to 25 characters; falsy values are shown as ``(null)`` / ``(missing)``.

    Returns:
        0 on success, or 1 when *category* does not exist in the source (the
        available categories are listed in that case).
    """
    translation_file = get_translation_file()
    source = load_json(SOURCE_FILE)
    trans = load_json(translation_file)

    if category not in source:
        print_error(f"Category '{category}' not found in source")
        print("Available categories:")
        for name in sorted(source):
            print(f"  - {name}")
        return 1

    print_header(f"Diff for category: {category}")

    print(f"{BLUE}{'Key':<30} | {'Source':<25} | {'Translation':<25}{NC}")
    print("-" * 85)

    source_section = source[category]
    for key in sorted(get_all_keys(source_section)):
        source_val = get_value_by_path(source_section, key)
        trans_val = get_value_by_path(trans.get(category, {}), key)

        # Truncated display forms, with placeholders for falsy values.
        source_str = "(null)" if not source_val else str(source_val)[:25]
        trans_str = "(missing)" if not trans_val else str(trans_val)[:25]

        # Equality is tested first, so two None values are tagged "same".
        if source_val == trans_val:
            color, label = YELLOW, "same"
        elif trans_val is None:
            color, label = RED, "missing"
        else:
            color, label = GREEN, "ok"
        status = f"{color}({label}){NC}"

        print(f"{key:<30} | {source_str:<25} | {trans_str:<25} {status}")

    return 0


def export_csv(output_file: str) -> int:
    """Export every source key with its source and translated value to CSV.

    Columns: ``key``, ``source_value``, ``translation_value``, ``status``,
    where status is ``MISSING`` (no translated value), ``UNTRANSLATED``
    (identical to the source) or ``OK``. Rows are ordered by key.

    The file is written with the ``csv`` module so commas, quotes and line
    breaks inside values are escaped correctly. The earlier hand-built rows
    replaced commas with semicolons and left embedded quotes and newlines
    unescaped.

    Args:
        output_file: Path of the CSV file to create or overwrite.

    Returns:
        Always 0.
    """
    # Local import: the file has no top-level `import csv`.
    import csv

    translation_file = get_translation_file()
    source = load_json(SOURCE_FILE)
    trans = load_json(translation_file)

    print_header(f"Exporting to CSV: {output_file}")

    source_keys = get_all_keys(source)

    # newline='' lets the csv writer control line endings itself;
    # lineterminator keeps the "\n" endings of the previous output.
    with open(output_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["key", "source_value", "translation_value", "status"])

        for key in sorted(source_keys):
            source_val = get_value_by_path(source, key)
            trans_val = get_value_by_path(trans, key)

            if trans_val is None:
                status = "MISSING"
            elif source_val == trans_val:
                status = "UNTRANSLATED"
            else:
                status = "OK"

            trans_str = "" if trans_val is None else str(trans_val)
            writer.writerow([key, str(source_val), trans_str, status])

    print_success(f"Exported to {output_file}")
    return 0


def _md_cell(value: Any, width: int) -> str:
    """Render *value* as the text of one Markdown table cell.

    Line breaks are flattened to spaces, the text is cut to *width* characters
    and pipes are escaped. Escaping happens after truncation so an escape
    sequence is never cut in half.
    """
    text = str(value).replace('\r', ' ').replace('\n', ' ')[:width]
    return text.replace('|', '\\|')


def export_markdown(output_file: str) -> int:
    """Export untranslated and translated keys to two Markdown tables.

    *output_file* receives the untranslated keys plus a summary. The
    translated keys go to a sibling file whose name is the stem of
    *output_file* with ``_translated`` appended before the extension
    (``report.md`` -> ``report_translated.md``).

    Fixes over the previous version: the header line referenced an undefined
    ``TRANSLATION_FILE`` name, which raised NameError on every run, and the
    sibling file name was built with ``str.replace('.md', ...)``, which left
    the name unchanged for paths without ``.md`` and so overwrote the main
    output.

    Args:
        output_file: Path of the Markdown file for the untranslated keys.

    Returns:
        Always 0.
    """
    translation_file = get_translation_file()
    source = load_json(SOURCE_FILE)
    trans = load_json(translation_file)

    print_header(f"Exporting to Markdown: {output_file}")

    source_keys = get_all_keys(source)
    missing = find_missing_keys(source, trans)
    untranslated = find_untranslated(source, trans)

    # Separate translated and untranslated
    untranslated_sorted = sorted(untranslated)
    translated_keys = [
        key for key in sorted(source_keys)
        if key not in missing and key not in untranslated
    ]

    translated_count = len(translated_keys)
    untranslated_count = len(untranslated_sorted)
    translation_filename = translation_file.name

    # Export untranslated (main output file)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("# Nepřeložené klíče (Untranslated Keys)\n\n")
        f.write(f"Zdroj: `{SOURCE_FILE.name}` | Překlad: `{translation_filename}`\n\n")

        f.write(f"**Celkem: {untranslated_count} nepreložených klíčů**\n\n")

        f.write("| # | Klíč (Key) | Originál | Nepřeloženo |\n")
        f.write("|---|------------|----------|------------|\n")

        for i, key in enumerate(untranslated_sorted, 1):
            source_str = _md_cell(get_value_by_path(source, key), 60)
            trans_str = _md_cell(get_value_by_path(trans, key), 60)

            f.write(f"| {i} | `{key}` | {source_str} | {trans_str} |\n")

        f.write("\n## Shrnutí (Summary)\n\n")
        f.write(f"- Celkem klíčů: {len(source_keys)}\n")
        f.write(f"- Chybějících: {len(missing)}\n")
        f.write(f"- Nepřeložených: {untranslated_count}\n")
        f.write(f"- Přeložených: {translated_count}\n")

    # Export translated to separate file. The name is derived from the path's
    # stem and suffix so it always differs from output_file.
    output_path = Path(output_file)
    translated_file = str(
        output_path.with_name(f"{output_path.stem}_translated{output_path.suffix}")
    )
    with open(translated_file, 'w', encoding='utf-8') as f:
        f.write("# Přeložené klíče (Translated Keys)\n\n")
        f.write(f"Zdroj: `{SOURCE_FILE.name}` | Překlad: `{translation_filename}`\n\n")

        f.write(f"**Celkem: {translated_count} přeložených klíčů**\n\n")

        f.write("| # | Klíč (Key) | Originál | Překlad |\n")
        f.write("|---|------------|----------|---------|\n")

        for i, key in enumerate(translated_keys, 1):
            source_str = _md_cell(get_value_by_path(source, key), 40)
            trans_str = _md_cell(get_value_by_path(trans, key), 40)

            f.write(f"| {i} | `{key}` | {source_str} | {trans_str} |\n")

    print_success(f"Exported: {output_file} ({untranslated_count} keys)")
    print_success(f"Exported: {translated_file} ({translated_count} keys)")
    return 0


def usage():
    """Print the command-line help text (options, commands and examples)."""
    help_text = """
OmniRoute i18n Translation Validator

Usage: validate_translation.py [command] [options]

Options:
  -l, --lang <code>    Target language code (e.g., cs, de, fr)
                      Default: cs or TRANSLATION_LANG env variable

Commands:
  (default)        Generate full report
  quick            Quick check - just show counts
  diff <category>  Show detailed diff for a category
  csv [file]       Export to CSV (default: translation_report.csv)
  md [file]        Export to Markdown (default: translation_report.md)

Examples:
  python validate_translation.py                 # Full report (default: cs)
  python validate_translation.py --lang de        # Validate German
  python validate_translation.py -l fr            # Validate French
  TRANSLATION_LANG=es python validate_translation.py  # Validate Spanish
  python validate_translation.py quick            # Quick status check
  python validate_translation.py diff common      # Diff common category
  python validate_translation.py csv              # Export to CSV
  python validate_translation.py md               # Export to Markdown
"""
    print(help_text)


def main() -> int:
    """Parse the command line, run the requested command and return its exit status.

    Commands: no command (full report), ``quick``, ``diff <category>``,
    ``csv [file]``, ``md [file]`` and ``help``. ``help``, ``-h`` and
    ``--help`` print the usage text and return 0; previously they fell
    through to the full report, and the ``-h``/``--help`` spellings never
    reached the command slot at all.

    Returns:
        The command's return value, or 1 when the source or translation file
        is missing, a required argument is absent, or the command is unknown.
    """
    # add_help=False: help is rendered by usage(), not by argparse.
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument('-l', '--lang', dest='lang', default=None)
    parser.add_argument('-h', '--help', dest='show_help', action='store_true')
    parser.add_argument('command', nargs='?')
    parser.add_argument('arg', nargs='?')

    # Unknown extra arguments are tolerated and ignored.
    args, _ = parser.parse_known_args()

    # Help needs no catalogue files, so answer it before any file checks.
    if args.show_help or args.command == 'help':
        usage()
        return 0

    # Set language from argument or use default
    if args.lang:
        get_target_lang.cli_lang = args.lang
    elif not os.environ.get('TRANSLATION_LANG'):
        # Default to cs for backwards compatibility
        get_target_lang.cli_lang = "cs"

    # Every command reads the source catalogue, so fail early if it is absent.
    if not SOURCE_FILE.exists():
        print_error(f"Source file not found: {SOURCE_FILE}")
        return 1

    # Check if translation file exists
    translation_file = get_translation_file()
    if not translation_file.exists():
        print_error(f"Translation file not found: {translation_file}")
        print("Available languages:")
        for f in sorted(MESSAGES_DIR.glob("*.json")):
            if f.name != "en.json":
                print(f"  - {f.stem}")
        return 1

    # Execute command
    if not args.command:
        return generate_report()

    if args.command == "quick":
        return quick_check()
    elif args.command == "diff":
        if not args.arg:
            print_error("Please specify category")
            usage()
            return 1
        return show_diff(args.arg)
    elif args.command == "csv":
        output = args.arg if args.arg else "translation_report.csv"
        return export_csv(output)
    elif args.command == "md":
        output = args.arg if args.arg else "translation_report.md"
        return export_markdown(output)
    else:
        print_error(f"Unknown command: {args.command}")
        usage()
        return 1


# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())