qwen-code/scripts/check-i18n.ts
MikeWang0316tw 02a65f90c4
fix(i18n): Correct zh-TW translations to match Traditional Chinese conventions (#4129)
* fix(i18n): Correct zh-TW translations to match Traditional Chinese conventions

Fix ~131 lines of Traditional Chinese (zh-TW) translations that used
Simplified Chinese character forms instead of standard Traditional
Chinese usage.

Changes:
- 文件 → 檔案 (47 occurrences)
- 爲 → 為 (45 occurrences)
- 啓 → 啟 (44 occurrences)
- 曆史 → 歷史 (6 occurrences)
- 鏈接 → 連結 (4 occurrences)
- 菜單 → 選單 (3 occurrences)

* fix(i18n): Replace 服務器 with 伺服器 (15 occurrences)

Align with Traditional Chinese convention where 伺服器 is the standard
term for 'server' in computing contexts.

* fix(i18n): Update zh-TW.js header comment to prevent accidental overwrite

Clarify that the file is the authoritative source and should not be
overwritten with auto-generated output, to prevent future maintainers
from regenerating with raw OpenCC and losing manual corrections.

* fix(i18n): Add zh-TW regression check and maintenance docs

Addresses reviewer feedback on PR #4129 (points 2 and 3):

- scripts/check-i18n.ts: Iterate over parsed zh-TW translation values
  (not raw file content) and report the offending key. Replace the
  earlier substring list with ZH_TW_FORBIDDEN_PATTERNS, which targets
  the three real regression categories: variant Traditional characters
  produced by OpenCC s2t (爲, 啓), Mainland-Chinese vocabulary (服務器,
  菜單, 鏈接), and pure Simplified characters. Excludes 禁用 / 配置 /
  文件 / 打開 to avoid false positives on Taiwan-valid usage.
- scripts/tests/check-i18n.test.ts: Cover the new check, including
  negative cases for Taiwan-valid vocabulary.
- docs/users/features/language.md: Document zh-TW maintenance — the
  vocabulary table, why raw OpenCC s2t output is not acceptable, and
  where the CI-enforced list lives.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

* fix(i18n): Address review feedback on zh-TW check (#4129)

- check-i18n.ts: Sort ZH_TW_FORBIDDEN_PATTERNS longest-first and break
  on first match so e.g. `历史` reports the specific bigram instead of
  also firing the bare `历` rule (no duplicate CI errors).
- check-i18n.ts: Add ZH_TW_ALLOWED_EXCEPTIONS escape hatch so a future
  legitimate translation (e.g. 區塊鏈 in a UI string) can opt out by key
  without weakening the global pattern list.
- docs/users/features/language.md: Add a "CI enforced?" column so
  contributors can tell which rows block CI vs. which are review-only
  style guidance. Replace bare `曆` in the table with the `曆史` bigram
  and note that `曆` is correct in calendar terms (日曆, 農曆, 西曆) —
  prevents a future maintainer from globally replacing 曆→歷.
- Tests: Cover the dedup behavior on overlapping patterns.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

* docs(i18n): Note word-boundary limitation of zh-TW substring check

Document the known limitation that `includes()`-based pattern matching
does not respect Chinese word boundaries — a bigram like `鏈接` will
false-positive on `區塊鏈接口` (區塊鏈 + 接口). Direct contributors to
`ZH_TW_ALLOWED_EXCEPTIONS` when this happens instead of weakening the
pattern list.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-15 15:26:12 +08:00

677 lines
19 KiB
JavaScript
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* @license
* Copyright 2025 Qwen
* SPDX-License-Identifier: Apache-2.0
*/
import * as fs from 'node:fs';
import * as path from 'node:path';
import { dirname } from 'node:path';
import { fileURLToPath, pathToFileURL } from 'node:url';
import { glob } from 'glob';
import {
MUST_TRANSLATE_KEYS,
SUPPORTED_LANGUAGES,
} from '../packages/cli/src/i18n/index.js';
import type { LanguageDefinition } from '../packages/cli/src/i18n/languages.js';
import {
getTranslationModuleExport,
isTranslationDict,
type TranslationDict,
} from '../packages/cli/src/i18n/translationDict.js';
/**
* Per-locale comparison against en.js, as collected by checkI18n for every
* non-English locale that loaded successfully.
*/
export interface LocaleStats {
// Locale code; the locale file is looked up as `<code>.js`.
code: string;
// Identifier copied from the language definition; printed in the coverage report.
id: string;
// Number of keys in en.js (the same value for every locale in one run).
totalKeys: number;
// Count of en.js keys that exist in this locale with a value different from English.
translatedKeys: number;
// Sorted en.js keys that are absent from this locale.
missingKeys: string[];
// Sorted keys of this locale that are absent from en.js.
extraKeys: string[];
// Must-translate keys that are either absent here or still equal to the English value.
untranslatedMustKeys: string[];
}
/**
* Aggregate outcome of a checkI18n run.
*/
export interface CheckResult {
// True only when `errors` is empty.
success: boolean;
// Findings that make the check fail.
errors: string[];
// Findings that are reported but do not affect `success`.
warnings: string[];
stats: {
// Number of keys in en.js (0 when en.js could not be loaded).
totalKeys: number;
// Sorted en.js keys for which no t()/ta() call with that string literal was found.
unusedKeys: string[];
// Subset of `unusedKeys` whose text does not occur in any scanned source
// file outside the locales directory. Not set when en.js failed to load.
unusedKeysOnlyInLocales?: string[];
// One entry per non-English locale that loaded successfully.
locales: LocaleStats[];
};
}
/**
* Overrides for checkI18n. Every field is optional; the defaults point at the
* CLI package relative to this script.
*/
export interface CheckI18nOptions {
// Directory holding `<code>.js` locale modules.
// Default: ../packages/cli/src/i18n/locales relative to this script.
localesDir?: string;
// Root that is scanned for t()/ta() calls.
// Default: ../packages/cli/src relative to this script.
sourceDir?: string;
// Languages to check. Default: SUPPORTED_LANGUAGES.
supportedLanguages?: readonly Pick<
LanguageDefinition,
'code' | 'id' | 'strictParity'
>[];
// Keys that every locale must translate. Default: MUST_TRANSLATE_KEYS.
mustTranslateKeys?: readonly string[];
// Locale codes whose key set must match en.js exactly (missing or extra keys
// become errors). Default: the codes of all languages flagged `strictParity`.
strictKeyParityLocales?: ReadonlySet<string>;
}
/**
* Options for printCheckI18nResult.
*/
export interface PrintCheckI18nOptions {
// When true, the list of locale-only keys (if non-empty) is also written to a JSON file.
writeUnusedKeysJson?: boolean;
// Destination of that JSON file.
// Default: unused-keys-only-in-locales.json next to this script.
unusedKeysOutputPath?: string;
}
// Directory of this script, rebuilt from import.meta.url because ES modules
// have no built-in __dirname. Default paths below are resolved against it.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Command-line flag that opts in to writing the locale-only-keys JSON report.
const WRITE_UNUSED_KEYS_FLAG = '--write-unused-locale-keys';
// Environment variable with the same effect when set to exactly '1'.
const WRITE_UNUSED_KEYS_ENV = 'QWEN_CHECK_I18N_WRITE_UNUSED_KEYS';
/**
 * Tells whether the caller opted in to writing the locale-only-keys JSON
 * report.
 *
 * @returns true when the command line contains WRITE_UNUSED_KEYS_FLAG or the
 *   WRITE_UNUSED_KEYS_ENV environment variable is exactly '1'.
 */
export function shouldWriteUnusedKeysJson(): boolean {
  if (process.argv.includes(WRITE_UNUSED_KEYS_FLAG)) {
    return true;
  }
  return process.env[WRITE_UNUSED_KEYS_ENV] === '1';
}
/**
* Substrings that should not appear in zh-TW (Taiwan Traditional Chinese) values.
* Each entry pairs a forbidden `pattern` with the `preferred` replacement that
* is quoted in the resulting CI error message.
*
* Three categories of regressions we want to catch automatically:
* 1. Variant Traditional characters that OpenCC s2t produces by default but
* Taiwan does not use as primary forms (e.g. 爲, 啓).
* 2. Mainland-Chinese vocabulary whose characters are valid Traditional but
* the word itself is not used in Taiwan (e.g. 服務器, 菜單, 鏈接).
* 3. Pure Simplified Chinese characters that would only appear if OpenCC
* was not run at all (e.g. 为, 启, 链).
*
* Deliberately excluded to avoid false positives:
* - 禁用 / 配置 / 設置 — standard in Taiwan.
* - 文件 — contextual (can legitimately mean "document").
* - 打開 — colloquially common in Taiwan even if 開啟 is preferred for UI.
* - Bare 鏈 — valid in 區塊鏈 etc.; only the bigram 鏈接 is flagged.
*
* Known limitation: matching is plain substring (`includes()`) and does not
* respect Chinese word boundaries. Bigram patterns can therefore false-positive
* across compound-word boundaries — e.g. `區塊鏈接口` (= `區塊鏈` + `接口`)
* contains the substring `鏈接` even though neither word is wrong. When this
* happens, add the affected translation key to ZH_TW_ALLOWED_EXCEPTIONS below
* with a brief justification, rather than weakening the pattern list.
*
* Do not reorder entries casually: the derived list is sorted by length only,
* so among patterns of equal length the order written here decides which one
* is reported first.
*/
const ZH_TW_FORBIDDEN_PATTERNS_RAW: ReadonlyArray<{
pattern: string;
preferred: string;
}> = [
// Variant Traditional characters from OpenCC s2t output
{ pattern: '爲', preferred: '為' },
{ pattern: '啓', preferred: '啟' },
// Mainland-Chinese vocabulary (valid Traditional chars, wrong word for Taiwan)
// NOTE(review): 曆史 looks like a character mis-conversion (曆 instead of 歷)
// rather than a vocabulary difference; it is only flagged as a bigram — confirm
// the grouping is intentional.
{ pattern: '曆史', preferred: '歷史' },
{ pattern: '鏈接', preferred: '連結' },
{ pattern: '菜單', preferred: '選單' },
{ pattern: '服務器', preferred: '伺服器' },
// Same Mainland vocabulary written in Simplified form
{ pattern: '菜单', preferred: '選單' },
{ pattern: '服务器', preferred: '伺服器' },
{ pattern: '链接', preferred: '連結' },
{ pattern: '历史', preferred: '歷史' },
// Pure Simplified characters (no ambiguity with valid Traditional usage)
{ pattern: '为', preferred: '為' },
{ pattern: '启', preferred: '啟' },
{ pattern: '历', preferred: '歷' },
{ pattern: '链', preferred: '鏈/連' },
{ pattern: '选', preferred: '選' },
{ pattern: '删', preferred: '刪' },
{ pattern: '扩', preferred: '擴' },
{ pattern: '设', preferred: '設' },
{ pattern: '详', preferred: '詳' },
{ pattern: '认', preferred: '認' },
];
// Sorted longest-first so that more specific patterns (e.g. `历史`) are matched
// before their constituent characters (`历`), avoiding duplicate findings on
// the same translation value. The raw list is copied before sorting, so the
// declaration above is not mutated; Array.prototype.sort is stable, so entries
// of equal length keep their declared order.
const ZH_TW_FORBIDDEN_PATTERNS = [...ZH_TW_FORBIDDEN_PATTERNS_RAW].sort(
(a, b) => b.pattern.length - a.pattern.length,
);
/**
* Translation keys whose zh-TW value is allowed to contain an otherwise
* forbidden substring. Use this as an escape hatch when a legitimate
* translation needs a normally-banned character or word — add the key here
* with a comment explaining why, instead of weakening the global pattern list.
*
* Entries are translation KEYS only (the lookup is `has(key)`); the whole value
* of a listed key is skipped by the forbidden-pattern scan.
*
* Example (hypothetical key):
* // zh-TW value contains 區塊鏈接口 (區塊鏈 + 接口), which trips the 鏈接 rule
* 'Blockchain interface',
*/
const ZH_TW_ALLOWED_EXCEPTIONS: ReadonlySet<string> = new Set<string>([
// (empty — no legitimate exceptions today)
]);
/**
 * Scans parsed zh-TW translations for forbidden substrings.
 *
 * Working on the parsed dictionary instead of the raw file text makes it
 * possible to name the offending key and keeps comments and JS syntax in the
 * locale file out of the scan.
 *
 * Keys listed in ZH_TW_ALLOWED_EXCEPTIONS are skipped entirely. A value may be
 * a single string or an array; every string element is checked on its own and
 * non-string elements are ignored. For each checked string at most one finding
 * is produced: the first entry of ZH_TW_FORBIDDEN_PATTERNS (ordered
 * longest-first) that occurs in it.
 *
 * @param translations Parsed zh-TW translation dictionary.
 * @returns One `{ key, pattern, preferred }` record per offending string, in
 *   dictionary iteration order.
 */
export function findForbiddenZhTwPatterns(
  translations: TranslationDict,
): Array<{ key: string; pattern: string; preferred: string }> {
  const report: Array<{ key: string; pattern: string; preferred: string }> =
    [];

  for (const [key, value] of Object.entries(translations)) {
    if (ZH_TW_ALLOWED_EXCEPTIONS.has(key)) {
      continue;
    }

    const texts = Array.isArray(value) ? value : [value];
    for (const text of texts) {
      if (typeof text !== 'string') {
        continue;
      }
      // The list is longest-first, so the first hit is the most specific rule.
      const rule = ZH_TW_FORBIDDEN_PATTERNS.find((entry) =>
        text.includes(entry.pattern),
      );
      if (rule !== undefined) {
        report.push({ key, pattern: rule.pattern, preferred: rule.preferred });
      }
    }
  }

  return report;
}
/**
 * Dynamically imports a locale module and returns its translation dictionary.
 *
 * @param filePath Path of the locale module on disk; converted to a file URL
 *   for the dynamic import.
 * @returns The dictionary extracted by getTranslationModuleExport.
 * @throws Error `Invalid locale module: <filePath>` when the extracted export
 *   does not satisfy isTranslationDict. Import failures propagate unchanged.
 */
async function loadTranslationsFile(
  filePath: string,
): Promise<TranslationDict> {
  const loadedModule = await import(pathToFileURL(filePath).href);
  const exported = getTranslationModuleExport(loadedModule);
  if (isTranslationDict(exported)) {
    return exported as TranslationDict;
  }
  throw new Error(`Invalid locale module: ${filePath}`);
}
/**
 * Reads a quoted string literal out of source text.
 *
 * Scanning starts at the character after `startPos` (which is expected to be
 * the opening quote) and stops at the first unescaped `quote`. A backslash
 * consumes the next character: `\n`, `\t` and `\r` become the corresponding
 * control characters, every other escaped character (including the quote and
 * the backslash itself) is taken literally.
 *
 * @param content Full source text.
 * @param startPos Index of the opening quote.
 * @param quote The quote character that terminates the literal.
 * @returns The decoded value and the index of the closing quote, or null when
 *   the text ends before a closing quote is found.
 */
function extractStringLiteral(
  content: string,
  startPos: number,
  quote: string,
): { value: string; endPos: number } | null {
  const controlEscapes = new Map<string, string>([
    ['n', '\n'],
    ['t', '\t'],
    ['r', '\r'],
  ]);
  const pieces: string[] = [];

  for (let index = startPos + 1; index < content.length; index += 1) {
    const current = content[index];

    if (current === '\\') {
      // Look ahead to the escaped character; a trailing backslash means the
      // literal never terminates.
      index += 1;
      if (index >= content.length) {
        break;
      }
      const escapedChar = content[index];
      if (escapedChar === quote) {
        pieces.push(quote);
      } else {
        pieces.push(controlEscapes.get(escapedChar) ?? escapedChar);
      }
      continue;
    }

    if (current === quote) {
      return { value: pieces.join(''), endPos: index };
    }

    pieces.push(current);
  }

  return null;
}
/**
 * Collects the string-literal first arguments of `t(...)` / `ta(...)` calls
 * found in the TypeScript sources under `sourceDir`.
 *
 * Only single- or double-quoted literals directly after the opening
 * parenthesis are recognised. Test files, `node_modules` and `dist` are not
 * scanned, and files that cannot be read are skipped silently.
 *
 * @param sourceDir Root directory to scan.
 * @returns The set of decoded literal values.
 */
async function extractUsedKeys(sourceDir: string): Promise<Set<string>> {
  const relativePaths = await glob('**/*.{ts,tsx}', {
    cwd: sourceDir,
    ignore: [
      '**/node_modules/**',
      '**/dist/**',
      '**/*.test.ts',
      '**/*.test.tsx',
    ],
  });
  const collected = new Set<string>();

  for (const relativePath of relativePaths) {
    const absolutePath = path.join(sourceDir, relativePath);

    let text: string;
    try {
      text = fs.readFileSync(absolutePath, 'utf-8');
    } catch {
      // Best effort: an unreadable file contributes no keys.
      continue;
    }

    // Fresh regex per file: a /g regex carries lastIndex state between calls.
    const callOpener = /\bta?\s*\(/g;
    for (
      let hit = callOpener.exec(text);
      hit !== null;
      hit = callOpener.exec(text)
    ) {
      // Move past the "(" and any whitespace before the first argument.
      let cursor = hit.index + hit[0].length;
      while (cursor < text.length && /\s/.test(text[cursor])) {
        cursor += 1;
      }

      const opener = text[cursor];
      if (opener !== "'" && opener !== '"') {
        continue;
      }

      const literal = extractStringLiteral(text, cursor, opener);
      if (literal) {
        collected.add(literal.value);
      }
    }
  }

  return collected;
}
/**
 * Verifies that every non-array entry of the English dictionary maps a key to
 * itself.
 *
 * @param enTranslations Parsed en.js dictionary.
 * @returns One error message per entry whose value is not an array and is not
 *   strictly equal to its key, in dictionary iteration order.
 */
function checkKeyValueConsistency(enTranslations: TranslationDict): string[] {
  return Object.entries(enTranslations)
    .filter(([key, value]) => !Array.isArray(value) && key !== value)
    .map(
      ([key, value]) => `Key-value mismatch in en.js: "${key}" !== "${value}"`,
    );
}
/**
 * Compares two translation values structurally.
 *
 * Values are compared through their JSON serialisation, so arrays match only
 * when they have the same elements in the same order, and two `undefined`
 * values match each other.
 *
 * The parameters are typed as `TranslationDict[string]` (the dictionary's value
 * type) so that this block relies only on type names that are imported in this
 * file.
 *
 * @param left First value, or undefined when the key is absent.
 * @param right Second value, or undefined when the key is absent.
 * @returns true when both serialise to the same JSON text.
 */
function translationValuesMatch(
  left: TranslationDict[string] | undefined,
  right: TranslationDict[string] | undefined,
): boolean {
  return JSON.stringify(left) === JSON.stringify(right);
}
/**
 * Counts how many English keys a locale actually translates.
 *
 * A key counts as translated when it is present in the locale dictionary
 * (checked with the `in` operator) and its value does not match the English
 * value according to translationValuesMatch.
 *
 * @param enTranslations Parsed en.js dictionary.
 * @param localeTranslations Parsed dictionary of the locale being measured.
 * @returns Number of translated keys.
 */
function countTranslatedKeys(
  enTranslations: TranslationDict,
  localeTranslations: TranslationDict,
): number {
  const translatedEntries = Object.entries(enTranslations).filter(
    ([key, enValue]) =>
      key in localeTranslations &&
      !translationValuesMatch(localeTranslations[key], enValue),
  );
  return translatedEntries.length;
}
/**
 * Lists the keys that were never seen in source code.
 *
 * @param allKeys Every known translation key.
 * @param usedKeys Keys found in the scanned sources.
 * @returns Members of `allKeys` that are not in `usedKeys`, sorted with the
 *   default string ordering.
 */
function findUnusedKeys(allKeys: Set<string>, usedKeys: Set<string>): string[] {
  const unused: string[] = [];
  allKeys.forEach((key) => {
    if (!usedKeys.has(key)) {
      unused.push(key);
    }
  });
  unused.sort();
  return unused;
}
/**
 * Writes the locale-only keys to a JSON report of the form
 * `{ "keys": [...], "count": n }`, pretty-printed with two spaces and a
 * trailing newline.
 *
 * Failures are logged to stderr and not rethrown, so a failed report never
 * aborts the check.
 *
 * @param keysOnlyInLocales Keys to record, in the order given.
 * @param outputPath Destination file path.
 */
function saveKeysOnlyInLocalesToJson(
  keysOnlyInLocales: string[],
  outputPath: string,
): void {
  try {
    const report = {
      keys: keysOnlyInLocales,
      count: keysOnlyInLocales.length,
    };
    const serialized = JSON.stringify(report, null, 2);
    fs.writeFileSync(outputPath, `${serialized}\n`);
    console.log(`Keys that exist only in locale files saved to: ${outputPath}`);
  } catch (error) {
    console.error(`Failed to save keys to JSON file: ${error}`);
  }
}
/**
 * Narrows the unused keys down to those whose text does not occur anywhere in
 * the scanned sources.
 *
 * Unlike the t()/ta() call scan, this is a plain substring search over each
 * file, so a key counts as "found" wherever its text appears. Test files,
 * `node_modules`, `dist` and any directory named like the locales directory
 * are excluded; unreadable files are skipped silently.
 *
 * @param unusedKeys Keys with no matching t()/ta() call.
 * @param sourceDir Root directory to scan.
 * @param localesDir Locales directory; only its base name is used, to exclude
 *   matching directories from the scan.
 * @returns The keys from `unusedKeys`, in their original order, that were not
 *   found in any scanned file. Empty when `unusedKeys` is empty.
 */
async function findKeysOnlyInLocales(
  unusedKeys: string[],
  sourceDir: string,
  localesDir: string,
): Promise<string[]> {
  if (unusedKeys.length === 0) {
    return [];
  }

  const localesFolder = path.basename(localesDir);
  const relativePaths = await glob('**/*.{ts,tsx}', {
    cwd: sourceDir,
    ignore: [
      '**/node_modules/**',
      '**/dist/**',
      '**/*.test.ts',
      '**/*.test.tsx',
      `**/${localesFolder}/**`,
    ],
  });

  const referenced = new Set<string>();
  for (const relativePath of relativePaths) {
    const absolutePath = path.join(sourceDir, relativePath);

    let text: string;
    try {
      text = fs.readFileSync(absolutePath, 'utf-8');
    } catch {
      // Best effort: an unreadable file cannot prove a key is referenced.
      continue;
    }

    for (const key of unusedKeys) {
      if (referenced.has(key)) {
        continue;
      }
      if (text.includes(key)) {
        referenced.add(key);
      }
    }
  }

  return unusedKeys.filter((key) => !referenced.has(key));
}
/**
* Runs all i18n consistency checks and aggregates the findings.
*
* Steps, in order:
* 1. Load `<code>.js` for every supported language; a load failure is an error.
* 2. If en.js is unavailable, stop and return the errors collected so far.
* 3. Check that en.js maps each non-array key to itself.
* 4. Compare every other loaded locale with en.js (missing keys, extra keys,
* must-translate keys that are absent or still English).
* 5. Scan zh-TW values for forbidden substrings.
* 6. Scan the sources for unused keys (reported as a warning).
*
* @param options Optional overrides; see CheckI18nOptions for the defaults.
* @returns Errors, warnings and statistics; `success` is true only when no
* error was recorded.
*/
export async function checkI18n(
options: CheckI18nOptions = {},
): Promise<CheckResult> {
const errors: string[] = [];
const warnings: string[] = [];
const localesDir =
options.localesDir ??
path.join(__dirname, '../packages/cli/src/i18n/locales');
const sourceDir =
options.sourceDir ?? path.join(__dirname, '../packages/cli/src');
const supportedLanguages = options.supportedLanguages ?? SUPPORTED_LANGUAGES;
const mustTranslateKeys = options.mustTranslateKeys ?? MUST_TRANSLATE_KEYS;
const mustTranslateKeySet = new Set(mustTranslateKeys);
// Unless overridden, strict parity applies to the languages flagged `strictParity`.
const strictKeyParityLocales =
options.strictKeyParityLocales ??
new Set(
supportedLanguages
.filter((language) => language.strictParity)
.map((language) => language.code),
);
const localeDefinitions = supportedLanguages.map((language) => ({
code: language.code,
id: language.id,
path: path.join(localesDir, `${language.code}.js`),
}));
// Load every locale up front; a locale that fails to load is reported and
// then simply absent from the map.
const localeTranslations = new Map<string, TranslationDict>();
for (const locale of localeDefinitions) {
try {
localeTranslations.set(
locale.code,
await loadTranslationsFile(locale.path),
);
} catch (error) {
errors.push(
`Failed to load ${locale.code}.js: ${error instanceof Error ? error.message : String(error)}`,
);
}
}
// en.js is the reference for every comparison below; without it nothing else
// can be checked. Note that unusedKeysOnlyInLocales is left unset here.
const enTranslations = localeTranslations.get('en');
if (!enTranslations) {
return {
success: false,
errors,
warnings,
stats: {
totalKeys: 0,
unusedKeys: [],
locales: [],
},
};
}
errors.push(...checkKeyValueConsistency(enTranslations));
const enKeys = new Set(Object.keys(enTranslations));
const localeStats: LocaleStats[] = [];
for (const locale of localeDefinitions) {
if (locale.code === 'en') {
continue;
}
const translations = localeTranslations.get(locale.code);
if (!translations) {
// Load failure was already reported above.
continue;
}
const localeKeys = new Set(Object.keys(translations));
const missingKeys = Array.from(enKeys)
.filter((key) => !localeKeys.has(key))
.sort();
const extraKeys = Array.from(localeKeys)
.filter((key) => !enKeys.has(key))
.sort();
// A must-translate key that is absent from the locale also lands here
// (value === undefined), so it is reported both as missing and as falling
// back to English.
const untranslatedMustKeys = mustTranslateKeys.filter((key) => {
const value = translations[key];
return (
value === undefined ||
translationValuesMatch(value, enTranslations[key])
);
});
const translatedKeys = countTranslatedKeys(enTranslations, translations);
localeStats.push({
code: locale.code,
id: locale.id,
totalKeys: enKeys.size,
translatedKeys,
missingKeys,
extraKeys,
untranslatedMustKeys,
});
const requiresStrictKeyParity = strictKeyParityLocales.has(locale.code);
if (missingKeys.length > 0) {
if (requiresStrictKeyParity) {
// Strict locales: every missing key is an error.
for (const key of missingKeys) {
errors.push(`Missing translation in ${locale.code}.js: "${key}"`);
}
} else {
// Other locales: only missing must-translate keys are errors; the rest
// is summarised in a single warning.
const missingRequiredKeys = missingKeys.filter((key) =>
mustTranslateKeySet.has(key),
);
const missingOptionalKeyCount =
missingKeys.length - missingRequiredKeys.length;
for (const key of missingRequiredKeys) {
errors.push(
`Missing required translation in ${locale.code}.js: "${key}"`,
);
}
if (missingOptionalKeyCount > 0) {
warnings.push(
`${locale.code}.js is missing ${missingOptionalKeyCount} non-required translation keys`,
);
}
}
}
if (extraKeys.length > 0) {
if (requiresStrictKeyParity) {
for (const key of extraKeys) {
errors.push(
`Extra key in ${locale.code}.js (not in en.js): "${key}"`,
);
}
} else {
warnings.push(
`${locale.code}.js has ${extraKeys.length} keys not present in en.js`,
);
}
}
for (const key of untranslatedMustKeys) {
errors.push(
`Required translation still falls back to English in ${locale.code}.js: "${key}"`,
);
}
}
// Check zh-TW.js for Taiwan-vocabulary regressions (raw OpenCC output,
// Mainland-Chinese vocabulary, or Simplified characters slipping in).
// Skipped when zh-TW is not among the loaded locales.
const zhTWTranslations = localeTranslations.get('zh-TW');
if (zhTWTranslations) {
for (const { key, pattern, preferred } of findForbiddenZhTwPatterns(
zhTWTranslations,
)) {
errors.push(
`Non-Taiwan vocabulary in zh-TW.js at "${key}": "${pattern}" should be "${preferred}"`,
);
}
}
// Unused keys never fail the check; they only produce a warning.
const usedKeys = await extractUsedKeys(sourceDir);
const unusedKeys = findUnusedKeys(enKeys, usedKeys);
const unusedKeysOnlyInLocales =
unusedKeys.length > 0
? await findKeysOnlyInLocales(unusedKeys, sourceDir, localesDir)
: [];
if (unusedKeys.length > 0) {
warnings.push(`Found ${unusedKeys.length} unused translation keys`);
}
return {
success: errors.length === 0,
errors,
warnings,
stats: {
totalKeys: enKeys.size,
unusedKeys,
unusedKeysOnlyInLocales,
locales: localeStats,
},
};
}
/**
 * Prints a human-readable report of a check result to stdout.
 *
 * The header and per-locale coverage are always printed. The warnings section
 * (warnings, a preview of at most ten unused keys, and the list of locale-only
 * keys) is printed only when there is at least one warning. Errors are not
 * printed here.
 *
 * @param result Result returned by checkI18n.
 * @param options When `writeUnusedKeysJson` is set and locale-only keys exist,
 *   they are also written to `unusedKeysOutputPath` (default:
 *   unused-keys-only-in-locales.json next to this script); otherwise a hint on
 *   how to enable the JSON report is printed.
 */
export function printCheckI18nResult(
  result: CheckResult,
  options: PrintCheckI18nOptions = {},
): void {
  const printQuotedKey = (key: string): void => console.log(` - "${key}"`);

  console.log('\n=== i18n Check Results ===\n');
  console.log(`Total keys: ${result.stats.totalKeys}\n`);

  console.log('Locale coverage:');
  result.stats.locales.forEach((locale) => {
    let coverage = '0.0';
    if (locale.totalKeys > 0) {
      coverage = ((locale.translatedKeys / locale.totalKeys) * 100).toFixed(1);
    }
    console.log(
      ` - ${locale.id} (${locale.code}): ${locale.translatedKeys}/${locale.totalKeys} translated (${coverage}%)`,
    );
  });
  console.log();

  if (result.warnings.length === 0) {
    return;
  }

  console.log('⚠️ Warnings:');
  for (const warning of result.warnings) {
    console.log(` - ${warning}`);
  }

  // Show the full list for short lists, otherwise only the first ten.
  const unusedCount = result.stats.unusedKeys.length;
  if (unusedCount > 10) {
    console.log(`\nUnused keys (showing first 10 of ${unusedCount}):`);
    result.stats.unusedKeys.slice(0, 10).forEach(printQuotedKey);
  } else if (unusedCount > 0) {
    console.log('\nUnused keys:');
    result.stats.unusedKeys.forEach(printQuotedKey);
  }

  const localeOnlyKeys = result.stats.unusedKeysOnlyInLocales;
  if (localeOnlyKeys && localeOnlyKeys.length > 0) {
    console.log(
      '\n⚠ The following keys exist ONLY in locale files and nowhere else in the codebase:',
    );
    console.log(
      ' Please review these keys - they might be safe to remove.',
    );
    localeOnlyKeys.forEach(printQuotedKey);

    if (options.writeUnusedKeysJson) {
      const outputPath =
        options.unusedKeysOutputPath ??
        path.join(__dirname, 'unused-keys-only-in-locales.json');
      saveKeysOnlyInLocalesToJson(localeOnlyKeys, outputPath);
    } else {
      console.log(
        `\nJSON report not written. Re-run with ${WRITE_UNUSED_KEYS_FLAG} or ${WRITE_UNUSED_KEYS_ENV}=1 to update it.`,
      );
    }
  }

  console.log();
}
/**
 * Command-line entry point: runs the check with default options, prints the
 * report, then prints the errors and exits with status 1 if there are any.
 */
async function main() {
  const result = await checkI18n();
  const writeUnusedKeysJson = shouldWriteUnusedKeysJson();
  printCheckI18nResult(result, { writeUnusedKeysJson });

  const { errors } = result;
  if (errors.length > 0) {
    console.log('❌ Errors:');
    for (const message of errors) {
      console.log(` - ${message}`);
    }
    console.log();
    process.exit(1);
  }

  console.log('✅ All checks passed!\n');
}
// Run main() only when this file is the script Node was started with, so that
// importing the module (e.g. from tests) has no side effects.
if (process.argv[1]) {
  const invokedScript = path.resolve(process.argv[1]);
  if (invokedScript === fileURLToPath(import.meta.url)) {
    main().catch((error) => {
      console.error('❌ Fatal error:', error);
      process.exit(1);
    });
  }
}