diff --git a/packages/cli/src/acp-integration/service/filesystem.test.ts b/packages/cli/src/acp-integration/service/filesystem.test.ts index 6eb3dfa1b..e8dc34968 100644 --- a/packages/cli/src/acp-integration/service/filesystem.test.ts +++ b/packages/cli/src/acp-integration/service/filesystem.test.ts @@ -11,6 +11,9 @@ import { ACP_ERROR_CODES } from '../errorCodes.js'; const createFallback = (): FileSystemService => ({ readTextFile: vi.fn(), + readTextFileWithInfo: vi + .fn() + .mockResolvedValue({ content: '', encoding: 'utf-8', bom: false }), writeTextFile: vi.fn(), detectFileBOM: vi.fn().mockResolvedValue(false), findFiles: vi.fn().mockReturnValue([]), diff --git a/packages/cli/src/acp-integration/service/filesystem.ts b/packages/cli/src/acp-integration/service/filesystem.ts index 0a03e0f0f..88512558d 100644 --- a/packages/cli/src/acp-integration/service/filesystem.ts +++ b/packages/cli/src/acp-integration/service/filesystem.ts @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -import type { FileSystemService } from '@qwen-code/qwen-code-core'; +import type { FileSystemService , FileReadResult } from '@qwen-code/qwen-code-core'; import type * as acp from '../acp.js'; import { ACP_ERROR_CODES } from '../errorCodes.js'; @@ -54,6 +54,12 @@ export class AcpFileSystemService implements FileSystemService { return response.content; } + async readTextFileWithInfo(filePath: string): Promise { + // ACP protocol does not expose encoding metadata; delegate to the local + // fallback which performs a single-pass read with encoding detection. + return this.fallback.readTextFileWithInfo(filePath); + } + async writeTextFile( filePath: string, content: string, diff --git a/packages/core/src/services/fileSystemService.test.ts b/packages/core/src/services/fileSystemService.test.ts index 3b2b472ad..fe72829e2 100644 --- a/packages/core/src/services/fileSystemService.test.ts +++ b/packages/core/src/services/fileSystemService.test.ts @@ -15,10 +15,14 @@ vi.mock('../utils/fileUtils.js', async (importOriginal) => { return { ...actual, readFileWithEncoding: vi.fn(), + readFileWithEncodingInfo: vi.fn(), }; }); -import { readFileWithEncoding } from '../utils/fileUtils.js'; +import { + readFileWithEncoding, + readFileWithEncodingInfo, +} from '../utils/fileUtils.js'; describe('StandardFileSystemService', () => { let fileSystem: StandardFileSystemService; @@ -53,6 +57,42 @@ describe('StandardFileSystemService', () => { }); }); + describe('readTextFileWithInfo', () => { + it('should return content, encoding, and bom via readFileWithEncodingInfo', async () => { + const mockResult = { content: 'Hello', encoding: 'utf-8', bom: false }; + vi.mocked(readFileWithEncodingInfo).mockResolvedValue(mockResult); + + const result = await fileSystem.readTextFileWithInfo('/test/file.txt'); + + expect(readFileWithEncodingInfo).toHaveBeenCalledWith('/test/file.txt'); + expect(result).toEqual(mockResult); + }); + + it('should return non-UTF-8 encoding info for GBK file', async () => { + const mockResult = { + content: '你好世界', + encoding: 'gb18030', + bom: false, + }; + vi.mocked(readFileWithEncodingInfo).mockResolvedValue(mockResult); + + const result = await fileSystem.readTextFileWithInfo('/test/gbk.txt'); + + expect(result.encoding).toBe('gb18030'); + expect(result.bom).toBe(false); + expect(result.content).toBe('你好世界'); + }); + + it('should propagate readFileWithEncodingInfo errors', async () => { + const error = new Error('ENOENT: File not found'); + vi.mocked(readFileWithEncodingInfo).mockRejectedValue(error); + + await expect( + fileSystem.readTextFileWithInfo('/test/file.txt'), + ).rejects.toThrow('ENOENT: File not found'); + }); + }); + describe('writeTextFile', () => { it('should write file content using fs', async () => { vi.mocked(fs.writeFile).mockResolvedValue(); @@ -156,6 +196,41 @@ describe('StandardFileSystemService', () => { 'utf-8', ); }); + + it('should preserve UTF-16LE BOM when writing back a UTF-16LE file', async () => { + vi.mocked(fs.writeFile).mockResolvedValue(); + + await fileSystem.writeTextFile('/test/file.txt', 'Hello', { + encoding: 'utf-16le', + bom: true, + }); + + // iconv-lite encodes as UTF-16LE; with bom:true the FF FE BOM is prepended + const writeCall = vi.mocked(fs.writeFile).mock.calls[0]; + expect(writeCall[0]).toBe('/test/file.txt'); + expect(writeCall[1]).toBeInstanceOf(Buffer); + const buf = writeCall[1] as Buffer; + // First two bytes must be the UTF-16LE BOM: FF FE + expect(buf[0]).toBe(0xff); + expect(buf[1]).toBe(0xfe); + }); + + it('should not add BOM when writing UTF-16LE file without bom flag', async () => { + vi.mocked(fs.writeFile).mockResolvedValue(); + + await fileSystem.writeTextFile('/test/file.txt', 'Hello', { + encoding: 'utf-16le', + bom: false, + }); + + // No BOM prepended — raw iconv-encoded buffer written directly + const writeCall = vi.mocked(fs.writeFile).mock.calls[0]; + expect(writeCall[0]).toBe('/test/file.txt'); + expect(writeCall[1]).toBeInstanceOf(Buffer); + const buf = writeCall[1] as Buffer; + // First two bytes should NOT be FF FE (the UTF-16LE BOM) + expect(!(buf[0] === 0xff && buf[1] === 0xfe)).toBe(true); + }); }); describe('detectFileBOM', () => { diff --git a/packages/core/src/services/fileSystemService.ts b/packages/core/src/services/fileSystemService.ts index 26d8a6626..787d68929 100644 --- a/packages/core/src/services/fileSystemService.ts +++ b/packages/core/src/services/fileSystemService.ts @@ -7,8 +7,16 @@ import fs from 'node:fs/promises'; import * as path from 'node:path'; import { globSync } from 'glob'; -import { readFileWithEncoding } from '../utils/fileUtils.js'; -import { iconvEncode, iconvEncodingExists } from '../utils/iconvHelper.js'; +import { + readFileWithEncoding, + readFileWithEncodingInfo, +} from '../utils/fileUtils.js'; +import type { FileReadResult } from '../utils/fileUtils.js'; +import { + iconvEncode, + iconvEncodingExists, + isUtf8CompatibleEncoding, +} from '../utils/iconvHelper.js'; /** * Supported file encodings for new files. @@ -35,6 +43,15 @@ export interface FileSystemService { */ readTextFile(filePath: string): Promise; + /** + * Read text content from a file, returning both the content and encoding metadata. + * Combines readTextFile + detectFileBOM + detectFileEncoding into a single I/O pass. + * + * @param filePath - The path to the file to read + * @returns The file content, encoding name, and whether a UTF-8 BOM was present + */ + readTextFileWithInfo(filePath: string): Promise; + /** * Write text content to a file * @@ -103,12 +120,28 @@ function hasUTF8BOM(buffer: Buffer): boolean { } /** - * Check whether an encoding name represents a UTF-8 compatible encoding - * that doesn't require iconv-lite for writing. + * Return the BOM byte sequence for a given encoding name, or null if the + * encoding does not use a standard BOM. Used when writing back a file that + * originally had a BOM so the BOM is preserved. */ -function isUtf8CompatibleEncoding(encoding: string): boolean { +function getBOMBytesForEncoding(encoding: string): Buffer | null { const lower = encoding.toLowerCase().replace(/[^a-z0-9]/g, ''); - return lower === 'utf8' || lower === 'ascii' || lower === 'usascii'; + switch (lower) { + case 'utf8': + return Buffer.from([0xef, 0xbb, 0xbf]); + case 'utf16le': + case 'utf16': + return Buffer.from([0xff, 0xfe]); + case 'utf16be': + return Buffer.from([0xfe, 0xff]); + case 'utf32le': + case 'utf32': + return Buffer.from([0xff, 0xfe, 0x00, 0x00]); + case 'utf32be': + return Buffer.from([0x00, 0x00, 0xfe, 0xff]); + default: + return null; + } } /** @@ -120,6 +153,12 @@ export class StandardFileSystemService implements FileSystemService { return readFileWithEncoding(filePath); } + async readTextFileWithInfo(filePath: string): Promise { + // Single I/O pass: returns content, encoding, and BOM flag together, + // eliminating the need for separate detectFileEncoding / detectFileBOM calls. + return readFileWithEncodingInfo(filePath); + } + async writeTextFile( filePath: string, content: string, @@ -128,19 +167,30 @@ export class StandardFileSystemService implements FileSystemService { const bom = options?.bom ?? false; const encoding = options?.encoding; - // Check if a non-UTF-8 encoding is specified and supported + // Check if a non-UTF-8 encoding is specified and supported by iconv-lite const isNonUtf8Encoding = encoding && !isUtf8CompatibleEncoding(encoding) && iconvEncodingExists(encoding); if (isNonUtf8Encoding) { - // Non-UTF-8 encoding (e.g. GBK, Big5, Shift_JIS) — use iconv-lite to encode + // Non-UTF-8 encoding (e.g. GBK, Big5, Shift_JIS, UTF-16LE, UTF-32BE…) + // Use iconv-lite to encode the content. When the file originally had a BOM + // (bom: true), prepend the correct BOM bytes for this encoding so the + // byte-order mark is preserved on write-back. const encoded = iconvEncode(content, encoding); - await fs.writeFile(filePath, encoded); + if (bom) { + const bomBytes = getBOMBytesForEncoding(encoding); + await fs.writeFile( + filePath, + bomBytes ? Buffer.concat([bomBytes, encoded]) : encoded, + ); + } else { + await fs.writeFile(filePath, encoded); + } } else if (bom) { - // Prepend UTF-8 BOM (EF BB BF) - // If content already starts with BOM character, strip it first to avoid double BOM + // UTF-8 BOM: prepend EF BB BF + // If content already starts with the BOM character, strip it first to avoid double BOM. const normalizedContent = content.charCodeAt(0) === 0xfeff ? content.slice(1) : content; const bomBuffer = Buffer.from([0xef, 0xbb, 0xbf]); diff --git a/packages/core/src/tools/edit.ts b/packages/core/src/tools/edit.ts index a4da3d3f3..61a318190 100644 --- a/packages/core/src/tools/edit.ts +++ b/packages/core/src/tools/edit.ts @@ -27,7 +27,7 @@ import { ToolNames, ToolDisplayNames } from './tool-names.js'; import { logFileOperation } from '../telemetry/loggers.js'; import { FileOperationEvent } from '../telemetry/types.js'; import { FileOperation } from '../telemetry/metrics.js'; -import { getSpecificMimeType, detectFileEncoding } from '../utils/fileUtils.js'; +import { getSpecificMimeType } from '../utils/fileUtils.js'; import { getLanguageFromFilePath } from '../utils/language-detection.js'; import type { ModifiableDeclarativeTool, @@ -145,17 +145,15 @@ class EditToolInvocation implements ToolInvocation { | undefined = undefined; try { - currentContent = await this.config + const fileInfo = await this.config .getFileSystemService() - .readTextFile(params.file_path); + .readTextFileWithInfo(params.file_path); // Normalize line endings to LF for consistent processing. - currentContent = currentContent.replace(/\r\n/g, '\n'); + currentContent = fileInfo.content.replace(/\r\n/g, '\n'); fileExists = true; - // Detect encoding and BOM to preserve original file characteristics on write-back - encoding = await detectFileEncoding(params.file_path); - bom = await this.config - .getFileSystemService() - .detectFileBOM(params.file_path); + // Encoding and BOM are returned from the same I/O pass, avoiding redundant reads. + encoding = fileInfo.encoding; + bom = fileInfo.bom; } catch (err: unknown) { if (!isNodeError(err) || err.code !== 'ENOENT') { // Rethrow unexpected FS errors (permissions, etc.) diff --git a/packages/core/src/tools/write-file.ts b/packages/core/src/tools/write-file.ts index 04254541b..4085e3b69 100644 --- a/packages/core/src/tools/write-file.ts +++ b/packages/core/src/tools/write-file.ts @@ -37,7 +37,7 @@ import { IdeClient } from '../ide/ide-client.js'; import { logFileOperation } from '../telemetry/loggers.js'; import { FileOperationEvent } from '../telemetry/types.js'; import { FileOperation } from '../telemetry/metrics.js'; -import { getSpecificMimeType, detectFileEncoding } from '../utils/fileUtils.js'; +import { getSpecificMimeType } from '../utils/fileUtils.js'; import { getLanguageFromFilePath } from '../utils/language-detection.js'; import { createDebugLogger } from '../utils/debugLogger.js'; @@ -245,11 +245,13 @@ class WriteFileToolInvocation extends BaseToolInvocation< let useBOM = false; let detectedEncoding: string | undefined; if (!isNewFile) { - useBOM = await this.config + // Use readTextFileWithInfo for a single I/O pass that returns encoding + // and BOM metadata together, avoiding separate detectFileBOM / detectFileEncoding calls. + const fileInfo = await this.config .getFileSystemService() - .detectFileBOM(file_path); - // Detect encoding to preserve non-UTF-8 encodings (e.g. GBK, Big5) - detectedEncoding = await detectFileEncoding(file_path); + .readTextFileWithInfo(file_path); + useBOM = fileInfo.bom; + detectedEncoding = fileInfo.encoding; } else { useBOM = this.config.getDefaultFileEncoding() === FileEncoding.UTF8_BOM; } diff --git a/packages/core/src/utils/fileUtils.test.ts b/packages/core/src/utils/fileUtils.test.ts index 76141ac23..6dc38e4d7 100644 --- a/packages/core/src/utils/fileUtils.test.ts +++ b/packages/core/src/utils/fileUtils.test.ts @@ -28,6 +28,7 @@ import { processSingleFileContent, detectBOM, readFileWithEncoding, + readFileWithEncodingInfo, detectFileEncoding, fileExists, } from './fileUtils.js'; @@ -447,6 +448,64 @@ describe('fileUtils', () => { }); }); + describe('readFileWithEncodingInfo', () => { + it('should return bom: false and encoding utf-8 for plain UTF-8 file', async () => { + const filePath = path.join(testDir, 'info-utf8.txt'); + await fsPromises.writeFile(filePath, 'Hello', 'utf8'); + + const result = await readFileWithEncodingInfo(filePath); + expect(result.content).toBe('Hello'); + expect(result.encoding).toBe('utf-8'); + expect(result.bom).toBe(false); + }); + + it('should return bom: true and encoding utf-8 for UTF-8 BOM file', async () => { + const utf8Bom = Buffer.from([0xef, 0xbb, 0xbf]); + const filePath = path.join(testDir, 'info-utf8-bom.txt'); + await fsPromises.writeFile( + filePath, + Buffer.concat([utf8Bom, Buffer.from('Hello', 'utf8')]), + ); + + const result = await readFileWithEncodingInfo(filePath); + expect(result.content).toBe('Hello'); + expect(result.encoding).toBe('utf-8'); + expect(result.bom).toBe(true); + }); + + it('should return bom: true and encoding utf-16le for UTF-16LE BOM file', async () => { + const utf16leBom = Buffer.from([0xff, 0xfe]); + const utf16leContent = Buffer.from('Hi', 'utf16le'); + const filePath = path.join(testDir, 'info-utf16le.txt'); + await fsPromises.writeFile( + filePath, + Buffer.concat([utf16leBom, utf16leContent]), + ); + + const result = await readFileWithEncodingInfo(filePath); + expect(result.content).toBe('Hi'); + expect(result.encoding).toBe('utf-16le'); + // Non-UTF-8 BOM should also be flagged so it is preserved on write-back + expect(result.bom).toBe(true); + }); + + it('should return bom: false for GBK file (no BOM)', async () => { + const gbkBuffer = Buffer.from([ + 0xc4, 0xe3, 0xba, 0xc3, 0xca, 0xc0, 0xbd, 0xe7, 0xd5, 0xe2, 0xca, + 0xc7, 0xd6, 0xd0, 0xce, 0xc4, 0xc4, 0xda, 0xc8, 0xdd, 0xd3, 0xc3, + 0xd3, 0xda, 0xb2, 0xe2, 0xca, 0xd4, 0xb1, 0xe0, 0xc2, 0xeb, 0xbc, + 0xec, 0xb2, 0xe2, + ]); + const filePath = path.join(testDir, 'info-gbk.txt'); + await fsPromises.writeFile(filePath, gbkBuffer); + + const result = await readFileWithEncodingInfo(filePath); + expect(result.bom).toBe(false); + expect(result.encoding).toBe('gb18030'); + expect(result.content).toBe('你好世界这是中文内容用于测试编码检测'); + }); + }); + describe('detectFileEncoding', () => { it('should detect UTF-8 for plain ASCII file', async () => { const filePath = path.join(testDir, 'ascii.txt'); diff --git a/packages/core/src/utils/fileUtils.ts b/packages/core/src/utils/fileUtils.ts index a678c4113..05de408ef 100644 --- a/packages/core/src/utils/fileUtils.ts +++ b/packages/core/src/utils/fileUtils.ts @@ -9,7 +9,11 @@ import fsPromises from 'node:fs/promises'; import path from 'node:path'; import type { PartUnion } from '@google/genai'; import mime from 'mime/lite'; -import { iconvDecode, iconvEncodingExists } from './iconvHelper.js'; +import { + iconvDecode, + iconvEncodingExists, + isUtf8CompatibleEncoding, +} from './iconvHelper.js'; import { ToolErrorType } from '../tools/tool-error.js'; import { BINARY_EXTENSIONS } from './ignorePatterns.js'; import type { Config } from '../config/config.js'; @@ -133,12 +137,62 @@ function isValidUtf8(buffer: Buffer): boolean { } /** - * Check whether an encoding name represents a UTF-8 compatible encoding - * that Node's Buffer can handle natively. + * Result of reading a file with encoding detection. */ -function isUtf8Compatible(encoding: string): boolean { - const lower = encoding.toLowerCase().replace(/[^a-z0-9]/g, ''); - return lower === 'utf8' || lower === 'ascii' || lower === 'usascii'; +export interface FileReadResult { + /** Decoded text content of the file (BOM stripped if present). */ + content: string; + /** Detected encoding name (e.g. 'utf-8', 'gb18030', 'utf-16le'). */ + encoding: string; + /** + * Whether the file had a Unicode BOM (UTF-8, UTF-16 LE/BE, or UTF-32 LE/BE). + * When true, the same BOM should be re-written on save to preserve the file's + * original byte-order mark. + */ + bom: boolean; +} + +/** + * Internal helper: decode a buffer given a BOMInfo. + * Returns the decoded string for each supported BOM encoding. + */ +function decodeBOMBuffer(buf: Buffer, bomInfo: BOMInfo): string { + const content = buf.subarray(bomInfo.bomLength); + switch (bomInfo.encoding) { + case 'utf8': + return content.toString('utf8'); + case 'utf16le': + return content.toString('utf16le'); + case 'utf16be': + return decodeUTF16BE(content); + case 'utf32le': + return decodeUTF32(content, true); + case 'utf32be': + return decodeUTF32(content, false); + default: + // Defensive fallback; should be unreachable + return content.toString('utf8'); + } +} + +/** + * Map a BOMInfo encoding to a canonical encoding name string. + */ +function bomEncodingToName(bomEncoding: UnicodeEncoding): string { + switch (bomEncoding) { + case 'utf8': + return 'utf-8'; + case 'utf16le': + return 'utf-16le'; + case 'utf16be': + return 'utf-16be'; + case 'utf32le': + return 'utf-32le'; + case 'utf32be': + return 'utf-32be'; + default: + return 'utf-8'; + } } /** @@ -146,44 +200,43 @@ function isUtf8Compatible(encoding: string): boolean { * For files without BOM, validates UTF-8 first. If invalid UTF-8, uses chardet * to detect encoding (e.g. GBK, Big5, Shift_JIS) and iconv-lite to decode. * Falls back to utf8 when detection fails. + * + * Returns both the decoded content and the detected encoding/BOM information + * in a single I/O pass, avoiding redundant file reads. */ -export async function readFileWithEncoding(filePath: string): Promise { +export async function readFileWithEncodingInfo( + filePath: string, +): Promise { // Read the file once; detect BOM and decode from the single buffer. const full = await fs.promises.readFile(filePath); - if (full.length === 0) return ''; + if (full.length === 0) return { content: '', encoding: 'utf-8', bom: false }; - const bom = detectBOM(full); - if (bom) { - // Strip BOM and decode per encoding - const content = full.subarray(bom.bomLength); - switch (bom.encoding) { - case 'utf8': - return content.toString('utf8'); - case 'utf16le': - return content.toString('utf16le'); - case 'utf16be': - return decodeUTF16BE(content); - case 'utf32le': - return decodeUTF32(content, true); - case 'utf32be': - return decodeUTF32(content, false); - default: - // Defensive fallback; should be unreachable - return content.toString('utf8'); - } + const bomInfo = detectBOM(full); + if (bomInfo) { + return { + content: decodeBOMBuffer(full, bomInfo), + encoding: bomEncodingToName(bomInfo.encoding), + // Mark bom: true for all Unicode BOM variants (UTF-8/16/32) so that + // the BOM is re-written on save and the file's original format is preserved. + bom: true, + }; } // No BOM — check if it's valid UTF-8 first (fast path for the common case) if (isValidUtf8(full)) { - return full.toString('utf8'); + return { content: full.toString('utf8'), encoding: 'utf-8', bom: false }; } // Not valid UTF-8 — try chardet-based encoding detection const detected = detectEncodingFromBuffer(full); - if (detected && !isUtf8Compatible(detected)) { + if (detected && !isUtf8CompatibleEncoding(detected)) { try { if (iconvEncodingExists(detected)) { - return iconvDecode(full, detected); + return { + content: iconvDecode(full, detected), + encoding: detected, + bom: false, + }; } } catch (e) { debugLogger.warn( @@ -193,7 +246,18 @@ export async function readFileWithEncoding(filePath: string): Promise { } // Final fallback: UTF-8 with replacement characters - return full.toString('utf8'); + return { content: full.toString('utf8'), encoding: 'utf-8', bom: false }; +} + +/** + * Read a file as text, honoring BOM encodings (UTF‑8/16/32) and stripping the BOM. + * For files without BOM, validates UTF-8 first. If invalid UTF-8, uses chardet + * to detect encoding (e.g. GBK, Big5, Shift_JIS) and iconv-lite to decode. + * Falls back to utf8 when detection fails. + */ +export async function readFileWithEncoding(filePath: string): Promise { + const result = await readFileWithEncodingInfo(filePath); + return result.content; } /** @@ -239,7 +303,7 @@ export async function detectFileEncoding(filePath: string): Promise { // 3. Use chardet for detection const detected = detectEncodingFromBuffer(sample); - if (detected && !isUtf8Compatible(detected)) { + if (detected && !isUtf8CompatibleEncoding(detected)) { return detected; } diff --git a/packages/core/src/utils/iconvHelper.ts b/packages/core/src/utils/iconvHelper.ts index d9118abab..12c1a56c8 100644 --- a/packages/core/src/utils/iconvHelper.ts +++ b/packages/core/src/utils/iconvHelper.ts @@ -51,3 +51,15 @@ export function iconvEncode(content: string, encoding: string): Buffer { export function iconvEncodingExists(encoding: string): boolean { return iconvLite.encodingExists(encoding); } + +/** + * Check whether an encoding name represents a UTF-8 compatible encoding + * that Node's Buffer can handle natively without iconv-lite. + * Normalizes encoding names (e.g. 'utf-8', 'UTF8', 'us-ascii' all match). + * @param encoding The encoding name to check + * @returns True if the encoding is UTF-8 or ASCII compatible + */ +export function isUtf8CompatibleEncoding(encoding: string): boolean { + const lower = encoding.toLowerCase().replace(/[^a-z0-9]/g, ''); + return lower === 'utf8' || lower === 'ascii' || lower === 'usascii'; +}