From 17939baa662a4caae7c6d8e341ab2b66cc997fc3 Mon Sep 17 00:00:00 2001 From: tanzhenxin Date: Mon, 16 Mar 2026 22:44:53 +0800 Subject: [PATCH] feat(core): auto-detect UTF-8 BOM for PowerShell scripts on Windows - Add needsUtf8Bom() to detect when UTF-8 BOM is needed based on file extension and system code page - PowerShell 5.1 on non-UTF-8 Windows systems (e.g. GBK) requires BOM to read scripts correctly - Remove default UTF8 encoding; undefined now triggers auto-detection - Add tests for needsUtf8Bom() covering Windows/non-Windows scenarios This ensures PowerShell scripts are written with UTF-8 BOM on systems that need it, fixing character encoding issues for non-ASCII content. Co-authored-by: Qwen-Coder --- packages/cli/src/config/config.ts | 4 +- packages/core/src/config/config.ts | 7 +- .../src/services/fileSystemService.test.ts | 75 ++++++++++++++++++- .../core/src/services/fileSystemService.ts | 43 +++++++++++ packages/core/src/tools/edit.ts | 12 ++- packages/core/src/tools/write-file.ts | 14 +++- 6 files changed, 142 insertions(+), 13 deletions(-) diff --git a/packages/cli/src/config/config.ts b/packages/cli/src/config/config.ts index eab0470c6..e0aaaa962 100755 --- a/packages/cli/src/config/config.ts +++ b/packages/cli/src/config/config.ts @@ -10,7 +10,6 @@ import { Config, DEFAULT_QWEN_EMBEDDING_MODEL, FileDiscoveryService, - FileEncoding, getAllGeminiMdFilenames, loadServerHierarchicalMemory, setGeminiMdFilename as setServerGeminiMdFilename, @@ -1041,8 +1040,7 @@ export async function loadCliConfig( // always be true and the settings file can never disable recording. chatRecording: argv.chatRecording ?? settings.general?.chatRecording ?? true, - defaultFileEncoding: - settings.general?.defaultFileEncoding ?? FileEncoding.UTF8, + defaultFileEncoding: settings.general?.defaultFileEncoding, lsp: { enabled: lspEnabled, }, diff --git a/packages/core/src/config/config.ts b/packages/core/src/config/config.ts index 3fcd3b9ca..2a941ea48 100644 --- a/packages/core/src/config/config.ts +++ b/packages/core/src/config/config.ts @@ -37,7 +37,6 @@ import { type FileSystemService, StandardFileSystemService, type FileEncodingType, - FileEncoding, } from '../services/fileSystemService.js'; import { GitService } from '../services/gitService.js'; @@ -523,7 +522,7 @@ export class Config { private readonly truncateToolOutputLines: number; private readonly eventEmitter?: EventEmitter; private readonly channel: string | undefined; - private readonly defaultFileEncoding: FileEncodingType; + private readonly defaultFileEncoding: FileEncodingType | undefined; private readonly enableHooks: boolean; private readonly hooks?: Record; private readonly hooksConfig?: Record; @@ -641,7 +640,7 @@ export class Config { this.truncateToolOutputLines = params.truncateToolOutputLines ?? DEFAULT_TRUNCATE_TOOL_OUTPUT_LINES; this.channel = params.channel; - this.defaultFileEncoding = params.defaultFileEncoding ?? FileEncoding.UTF8; + this.defaultFileEncoding = params.defaultFileEncoding; this.storage = new Storage(this.targetDir); this.inputFormat = params.inputFormat ?? InputFormat.TEXT; this.fileExclusions = new FileExclusions(this); @@ -1647,7 +1646,7 @@ export class Config { * Get the default file encoding for new files. * @returns FileEncodingType */ - getDefaultFileEncoding(): FileEncodingType { + getDefaultFileEncoding(): FileEncodingType | undefined { return this.defaultFileEncoding; } diff --git a/packages/core/src/services/fileSystemService.test.ts b/packages/core/src/services/fileSystemService.test.ts index 1b481e928..7811a96ed 100644 --- a/packages/core/src/services/fileSystemService.test.ts +++ b/packages/core/src/services/fileSystemService.test.ts @@ -6,9 +6,16 @@ import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest'; import fs from 'node:fs/promises'; -import { StandardFileSystemService } from './fileSystemService.js'; +import { + StandardFileSystemService, + needsUtf8Bom, + resetUtf8BomCache, +} from './fileSystemService.js'; const mockPlatform = vi.hoisted(() => vi.fn().mockReturnValue('linux')); +const mockGetSystemEncoding = vi.hoisted(() => + vi.fn().mockReturnValue('utf-8'), +); vi.mock('fs/promises'); vi.mock('os', () => ({ @@ -17,6 +24,9 @@ vi.mock('os', () => ({ }, platform: mockPlatform, })); +vi.mock('../utils/systemEncoding.js', () => ({ + getSystemEncoding: mockGetSystemEncoding, +})); vi.mock('../utils/fileUtils.js', async (importOriginal) => { const actual = await importOriginal(); @@ -33,6 +43,9 @@ describe('StandardFileSystemService', () => { beforeEach(() => { vi.resetAllMocks(); + resetUtf8BomCache(); + mockPlatform.mockReturnValue('linux'); + mockGetSystemEncoding.mockReturnValue('utf-8'); fileSystem = new StandardFileSystemService(); }); @@ -375,4 +388,64 @@ describe('StandardFileSystemService', () => { ); }); }); + + describe('needsUtf8Bom', () => { + beforeEach(() => { + resetUtf8BomCache(); + }); + + it('should return true for .ps1 files on Windows with non-UTF-8 code page', () => { + mockPlatform.mockReturnValue('win32'); + mockGetSystemEncoding.mockReturnValue('gbk'); + + expect(needsUtf8Bom('/test/script.ps1')).toBe(true); + }); + + it('should return true for .PS1 files (case-insensitive)', () => { + mockPlatform.mockReturnValue('win32'); + mockGetSystemEncoding.mockReturnValue('gbk'); + + expect(needsUtf8Bom('/test/SCRIPT.PS1')).toBe(true); + }); + + it('should return false for .ps1 files on Windows with UTF-8 code page', () => { + mockPlatform.mockReturnValue('win32'); + mockGetSystemEncoding.mockReturnValue('utf-8'); + + expect(needsUtf8Bom('/test/script.ps1')).toBe(false); + }); + + it('should return false for .ps1 files on non-Windows', () => { + mockPlatform.mockReturnValue('darwin'); + + expect(needsUtf8Bom('/test/script.ps1')).toBe(false); + }); + + it('should return false for non-.ps1 files on Windows with non-UTF-8 code page', () => { + mockPlatform.mockReturnValue('win32'); + mockGetSystemEncoding.mockReturnValue('gbk'); + + expect(needsUtf8Bom('/test/script.sh')).toBe(false); + expect(needsUtf8Bom('/test/file.txt')).toBe(false); + expect(needsUtf8Bom('/test/script.bat')).toBe(false); + }); + + it('should cache the platform/encoding check across calls', () => { + mockPlatform.mockReturnValue('win32'); + mockGetSystemEncoding.mockReturnValue('gbk'); + + needsUtf8Bom('/test/script.ps1'); + needsUtf8Bom('/test/other.ps1'); + + // getSystemEncoding should only be called once due to caching + expect(mockGetSystemEncoding).toHaveBeenCalledTimes(1); + }); + + it('should treat null system encoding as non-UTF-8', () => { + mockPlatform.mockReturnValue('win32'); + mockGetSystemEncoding.mockReturnValue(null); + + expect(needsUtf8Bom('/test/script.ps1')).toBe(true); + }); + }); }); diff --git a/packages/core/src/services/fileSystemService.ts b/packages/core/src/services/fileSystemService.ts index d8667043a..6d2022c75 100644 --- a/packages/core/src/services/fileSystemService.ts +++ b/packages/core/src/services/fileSystemService.ts @@ -14,6 +14,7 @@ import { iconvEncodingExists, isUtf8CompatibleEncoding, } from '../utils/iconvHelper.js'; +import { getSystemEncoding } from '../utils/systemEncoding.js'; import type { ReadTextFileRequest, WriteTextFileRequest, @@ -91,6 +92,48 @@ export interface WriteTextFileOptions { */ const CRLF_EXTENSIONS = new Set(['.bat', '.cmd']); +/** + * File extensions that need UTF-8 BOM on Windows with a non-UTF-8 code page. + * PowerShell 5.1 (the version that ships with Windows) reads BOM-less files + * using the system's ANSI code page. Without a BOM, any non-ASCII characters + * in the script will be misinterpreted (e.g. on a GBK system). PowerShell 7+ + * defaults to UTF-8 and handles BOM fine, so adding BOM is always safe. + */ +const UTF8_BOM_EXTENSIONS = new Set(['.ps1']); + +// Cache so we only call getSystemEncoding() once per process +let cachedIsNonUtf8Windows: boolean | undefined; + +/** + * Returns true if a newly created file at the given path should be written + * with a UTF-8 BOM. Conditions (all must be true): + * 1. Running on Windows + * 2. System code page is not UTF-8 + * 3. File extension is in UTF8_BOM_EXTENSIONS (e.g. .ps1) + */ +export function needsUtf8Bom(filePath: string): boolean { + const ext = path.extname(filePath).toLowerCase(); + if (!UTF8_BOM_EXTENSIONS.has(ext)) { + return false; + } + if (cachedIsNonUtf8Windows === undefined) { + if (os.platform() !== 'win32') { + cachedIsNonUtf8Windows = false; + } else { + const sysEnc = getSystemEncoding(); + cachedIsNonUtf8Windows = sysEnc !== 'utf-8'; + } + } + return cachedIsNonUtf8Windows; +} + +/** + * Reset the UTF-8 BOM cache — useful for testing. + */ +export function resetUtf8BomCache(): void { + cachedIsNonUtf8Windows = undefined; +} + /** * Returns true if the file at the given path requires CRLF line endings. * Only applies on Windows where cmd.exe actually parses these files. diff --git a/packages/core/src/tools/edit.ts b/packages/core/src/tools/edit.ts index 474a6aace..ae4c9480b 100644 --- a/packages/core/src/tools/edit.ts +++ b/packages/core/src/tools/edit.ts @@ -20,7 +20,7 @@ import { makeRelative, shortenPath } from '../utils/paths.js'; import { isNodeError } from '../utils/errors.js'; import type { Config } from '../config/config.js'; import { ApprovalMode } from '../config/config.js'; -import { FileEncoding } from '../services/fileSystemService.js'; +import { FileEncoding, needsUtf8Bom } from '../services/fileSystemService.js'; import { DEFAULT_DIFF_OPTIONS, getDiffStat } from './diffOptions.js'; import { ReadFileTool } from './read-file.js'; import { ToolNames, ToolDisplayNames } from './tool-names.js'; @@ -397,8 +397,14 @@ class EditToolInvocation implements ToolInvocation { // For new files, apply default file encoding setting // For existing files, preserve the original encoding (BOM and charset) if (editData.isNewFile) { - const useBOM = - this.config.getDefaultFileEncoding() === FileEncoding.UTF8_BOM; + const userEncoding = this.config.getDefaultFileEncoding(); + let useBOM = false; + if (userEncoding === FileEncoding.UTF8_BOM) { + useBOM = true; + } else if (userEncoding === undefined) { + // No explicit setting: auto-detect (e.g. .ps1 on non-UTF-8 Windows) + useBOM = needsUtf8Bom(this.params.file_path); + } await this.config.getFileSystemService().writeTextFile({ path: this.params.file_path, content: editData.newContent, diff --git a/packages/core/src/tools/write-file.ts b/packages/core/src/tools/write-file.ts index 9da02e4d4..2fb53a73f 100644 --- a/packages/core/src/tools/write-file.ts +++ b/packages/core/src/tools/write-file.ts @@ -24,7 +24,7 @@ import { ToolConfirmationOutcome, } from './tools.js'; import { ToolErrorType } from './tool-error.js'; -import { FileEncoding } from '../services/fileSystemService.js'; +import { FileEncoding, needsUtf8Bom } from '../services/fileSystemService.js'; import { makeRelative, shortenPath } from '../utils/paths.js'; import { getErrorMessage, isNodeError } from '../utils/errors.js'; import { DEFAULT_DIFF_OPTIONS, getDiffStat } from './diffOptions.js'; @@ -212,7 +212,17 @@ class WriteFileToolInvocation extends BaseToolInvocation< if (!fileExists) { fs.mkdirSync(dirName, { recursive: true }); - useBOM = this.config.getDefaultFileEncoding() === FileEncoding.UTF8_BOM; + const userEncoding = this.config.getDefaultFileEncoding(); + if (userEncoding === FileEncoding.UTF8_BOM) { + // User explicitly configured UTF-8 BOM for all new files + useBOM = true; + } else if (userEncoding === undefined) { + // No explicit setting: auto-detect based on platform/extension. + // e.g. .ps1 on Windows with a non-UTF-8 code page needs BOM so + // PowerShell 5.1 reads the file as UTF-8 instead of the system ANSI page + useBOM = needsUtf8Bom(file_path); + } + // else: user explicitly set 'utf-8' (no BOM) — respect it detectedEncoding = undefined; }