diff --git a/package-lock.json b/package-lock.json index 44590ebd4..5df32acc0 100644 --- a/package-lock.json +++ b/package-lock.json @@ -19482,6 +19482,7 @@ "google-auth-library": "^10.5.0", "html-to-text": "^9.0.5", "https-proxy-agent": "^7.0.6", + "iconv-lite": "^0.6.3", "ignore": "^7.0.0", "jsonrepair": "^3.13.0", "marked": "^15.0.12", diff --git a/packages/cli/src/acp-integration/service/filesystem.test.ts b/packages/cli/src/acp-integration/service/filesystem.test.ts index 6eb3dfa1b..e8dc34968 100644 --- a/packages/cli/src/acp-integration/service/filesystem.test.ts +++ b/packages/cli/src/acp-integration/service/filesystem.test.ts @@ -11,6 +11,9 @@ import { ACP_ERROR_CODES } from '../errorCodes.js'; const createFallback = (): FileSystemService => ({ readTextFile: vi.fn(), + readTextFileWithInfo: vi + .fn() + .mockResolvedValue({ content: '', encoding: 'utf-8', bom: false }), writeTextFile: vi.fn(), detectFileBOM: vi.fn().mockResolvedValue(false), findFiles: vi.fn().mockReturnValue([]), diff --git a/packages/cli/src/acp-integration/service/filesystem.ts b/packages/cli/src/acp-integration/service/filesystem.ts index 9dfbf35b3..88512558d 100644 --- a/packages/cli/src/acp-integration/service/filesystem.ts +++ b/packages/cli/src/acp-integration/service/filesystem.ts @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -import type { FileSystemService } from '@qwen-code/qwen-code-core'; +import type { FileSystemService , FileReadResult } from '@qwen-code/qwen-code-core'; import type * as acp from '../acp.js'; import { ACP_ERROR_CODES } from '../errorCodes.js'; @@ -54,10 +54,16 @@ export class AcpFileSystemService implements FileSystemService { return response.content; } + async readTextFileWithInfo(filePath: string): Promise { + // ACP protocol does not expose encoding metadata; delegate to the local + // fallback which performs a single-pass read with encoding detection. + return this.fallback.readTextFileWithInfo(filePath); + } + async writeTextFile( filePath: string, content: string, - options?: { bom?: boolean }, + options?: { bom?: boolean; encoding?: string }, ): Promise { if (!this.capabilities.writeTextFile) { return this.fallback.writeTextFile(filePath, content, options); diff --git a/packages/core/package.json b/packages/core/package.json index c80f40474..43219cbcc 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -42,6 +42,7 @@ "ajv-formats": "^3.0.0", "async-mutex": "^0.5.0", "chardet": "^2.1.0", + "iconv-lite": "^0.6.3", "chokidar": "^4.0.3", "diff": "^7.0.0", "dotenv": "^17.1.0", diff --git a/packages/core/src/services/fileSystemService.test.ts b/packages/core/src/services/fileSystemService.test.ts index 69898f72d..fe72829e2 100644 --- a/packages/core/src/services/fileSystemService.test.ts +++ b/packages/core/src/services/fileSystemService.test.ts @@ -10,6 +10,20 @@ import { StandardFileSystemService } from './fileSystemService.js'; vi.mock('fs/promises'); +vi.mock('../utils/fileUtils.js', async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + readFileWithEncoding: vi.fn(), + readFileWithEncodingInfo: vi.fn(), + }; +}); + +import { + readFileWithEncoding, + readFileWithEncodingInfo, +} from '../utils/fileUtils.js'; + describe('StandardFileSystemService', () => { let fileSystem: StandardFileSystemService; @@ -23,19 +37,19 @@ describe('StandardFileSystemService', () => { }); describe('readTextFile', () => { - it('should read file content using fs', async () => { + it('should read file content using readFileWithEncoding', async () => { const testContent = 'Hello, World!'; - vi.mocked(fs.readFile).mockResolvedValue(testContent); + vi.mocked(readFileWithEncoding).mockResolvedValue(testContent); const result = await fileSystem.readTextFile('/test/file.txt'); - expect(fs.readFile).toHaveBeenCalledWith('/test/file.txt', 'utf-8'); + expect(readFileWithEncoding).toHaveBeenCalledWith('/test/file.txt'); expect(result).toBe(testContent); }); - it('should propagate fs.readFile errors', async () => { + it('should propagate readFileWithEncoding errors', async () => { const error = new Error('ENOENT: File not found'); - vi.mocked(fs.readFile).mockRejectedValue(error); + vi.mocked(readFileWithEncoding).mockRejectedValue(error); await expect(fileSystem.readTextFile('/test/file.txt')).rejects.toThrow( 'ENOENT: File not found', @@ -43,6 +57,42 @@ describe('StandardFileSystemService', () => { }); }); + describe('readTextFileWithInfo', () => { + it('should return content, encoding, and bom via readFileWithEncodingInfo', async () => { + const mockResult = { content: 'Hello', encoding: 'utf-8', bom: false }; + vi.mocked(readFileWithEncodingInfo).mockResolvedValue(mockResult); + + const result = await fileSystem.readTextFileWithInfo('/test/file.txt'); + + expect(readFileWithEncodingInfo).toHaveBeenCalledWith('/test/file.txt'); + expect(result).toEqual(mockResult); + }); + + it('should return non-UTF-8 encoding info for GBK file', async () => { + const mockResult = { + content: '你好世界', + encoding: 'gb18030', + bom: false, + }; + vi.mocked(readFileWithEncodingInfo).mockResolvedValue(mockResult); + + const result = await fileSystem.readTextFileWithInfo('/test/gbk.txt'); + + expect(result.encoding).toBe('gb18030'); + expect(result.bom).toBe(false); + expect(result.content).toBe('你好世界'); + }); + + it('should propagate readFileWithEncodingInfo errors', async () => { + const error = new Error('ENOENT: File not found'); + vi.mocked(readFileWithEncodingInfo).mockRejectedValue(error); + + await expect( + fileSystem.readTextFileWithInfo('/test/file.txt'), + ).rejects.toThrow('ENOENT: File not found'); + }); + }); + describe('writeTextFile', () => { it('should write file content using fs', async () => { vi.mocked(fs.writeFile).mockResolvedValue(); @@ -120,6 +170,67 @@ describe('StandardFileSystemService', () => { } expect(bomCount).toBe(1); }); + it('should write file with non-UTF-8 encoding using iconv-lite', async () => { + vi.mocked(fs.writeFile).mockResolvedValue(); + + await fileSystem.writeTextFile('/test/file.txt', '你好世界', { + encoding: 'gbk', + }); + + // Verify that fs.writeFile was called with a Buffer (iconv-encoded) + const writeCall = vi.mocked(fs.writeFile).mock.calls[0]; + expect(writeCall[0]).toBe('/test/file.txt'); + expect(writeCall[1]).toBeInstanceOf(Buffer); + }); + + it('should write file as UTF-8 when encoding is utf-8', async () => { + vi.mocked(fs.writeFile).mockResolvedValue(); + + await fileSystem.writeTextFile('/test/file.txt', 'Hello', { + encoding: 'utf-8', + }); + + expect(fs.writeFile).toHaveBeenCalledWith( + '/test/file.txt', + 'Hello', + 'utf-8', + ); + }); + + it('should preserve UTF-16LE BOM when writing back a UTF-16LE file', async () => { + vi.mocked(fs.writeFile).mockResolvedValue(); + + await fileSystem.writeTextFile('/test/file.txt', 'Hello', { + encoding: 'utf-16le', + bom: true, + }); + + // iconv-lite encodes as UTF-16LE; with bom:true the FF FE BOM is prepended + const writeCall = vi.mocked(fs.writeFile).mock.calls[0]; + expect(writeCall[0]).toBe('/test/file.txt'); + expect(writeCall[1]).toBeInstanceOf(Buffer); + const buf = writeCall[1] as Buffer; + // First two bytes must be the UTF-16LE BOM: FF FE + expect(buf[0]).toBe(0xff); + expect(buf[1]).toBe(0xfe); + }); + + it('should not add BOM when writing UTF-16LE file without bom flag', async () => { + vi.mocked(fs.writeFile).mockResolvedValue(); + + await fileSystem.writeTextFile('/test/file.txt', 'Hello', { + encoding: 'utf-16le', + bom: false, + }); + + // No BOM prepended — raw iconv-encoded buffer written directly + const writeCall = vi.mocked(fs.writeFile).mock.calls[0]; + expect(writeCall[0]).toBe('/test/file.txt'); + expect(writeCall[1]).toBeInstanceOf(Buffer); + const buf = writeCall[1] as Buffer; + // First two bytes should NOT be FF FE (the UTF-16LE BOM) + expect(!(buf[0] === 0xff && buf[1] === 0xfe)).toBe(true); + }); }); describe('detectFileBOM', () => { diff --git a/packages/core/src/services/fileSystemService.ts b/packages/core/src/services/fileSystemService.ts index 91f36161c..787d68929 100644 --- a/packages/core/src/services/fileSystemService.ts +++ b/packages/core/src/services/fileSystemService.ts @@ -7,6 +7,16 @@ import fs from 'node:fs/promises'; import * as path from 'node:path'; import { globSync } from 'glob'; +import { + readFileWithEncoding, + readFileWithEncodingInfo, +} from '../utils/fileUtils.js'; +import type { FileReadResult } from '../utils/fileUtils.js'; +import { + iconvEncode, + iconvEncodingExists, + isUtf8CompatibleEncoding, +} from '../utils/iconvHelper.js'; /** * Supported file encodings for new files. @@ -33,6 +43,15 @@ export interface FileSystemService { */ readTextFile(filePath: string): Promise; + /** + * Read text content from a file, returning both the content and encoding metadata. + * Combines readTextFile + detectFileBOM + detectFileEncoding into a single I/O pass. + * + * @param filePath - The path to the file to read + * @returns The file content, encoding name, and whether a UTF-8 BOM was present + */ + readTextFileWithInfo(filePath: string): Promise; + /** * Write text content to a file * @@ -74,6 +93,14 @@ export interface WriteTextFileOptions { * @default false */ bom?: boolean; + + /** + * The encoding to use when writing the file. + * If specified and not UTF-8 compatible, iconv-lite will be used to encode. + * This is used to preserve the original encoding of non-UTF-8 files (e.g. GBK, Big5). + * @default undefined (writes as UTF-8) + */ + encoding?: string; } /** @@ -92,12 +119,44 @@ function hasUTF8BOM(buffer: Buffer): boolean { ); } +/** + * Return the BOM byte sequence for a given encoding name, or null if the + * encoding does not use a standard BOM. Used when writing back a file that + * originally had a BOM so the BOM is preserved. + */ +function getBOMBytesForEncoding(encoding: string): Buffer | null { + const lower = encoding.toLowerCase().replace(/[^a-z0-9]/g, ''); + switch (lower) { + case 'utf8': + return Buffer.from([0xef, 0xbb, 0xbf]); + case 'utf16le': + case 'utf16': + return Buffer.from([0xff, 0xfe]); + case 'utf16be': + return Buffer.from([0xfe, 0xff]); + case 'utf32le': + case 'utf32': + return Buffer.from([0xff, 0xfe, 0x00, 0x00]); + case 'utf32be': + return Buffer.from([0x00, 0x00, 0xfe, 0xff]); + default: + return null; + } +} + /** * Standard file system implementation */ export class StandardFileSystemService implements FileSystemService { async readTextFile(filePath: string): Promise { - return fs.readFile(filePath, FileEncoding.UTF8); + // Use encoding-aware reader that handles BOM and non-UTF-8 encodings (e.g. GBK) + return readFileWithEncoding(filePath); + } + + async readTextFileWithInfo(filePath: string): Promise { + // Single I/O pass: returns content, encoding, and BOM flag together, + // eliminating the need for separate detectFileEncoding / detectFileBOM calls. + return readFileWithEncodingInfo(filePath); } async writeTextFile( @@ -106,10 +165,32 @@ export class StandardFileSystemService implements FileSystemService { options?: WriteTextFileOptions, ): Promise { const bom = options?.bom ?? false; + const encoding = options?.encoding; - if (bom) { - // Prepend UTF-8 BOM (EF BB BF) - // If content already starts with BOM character, strip it first to avoid double BOM + // Check if a non-UTF-8 encoding is specified and supported by iconv-lite + const isNonUtf8Encoding = + encoding && + !isUtf8CompatibleEncoding(encoding) && + iconvEncodingExists(encoding); + + if (isNonUtf8Encoding) { + // Non-UTF-8 encoding (e.g. GBK, Big5, Shift_JIS, UTF-16LE, UTF-32BE…) + // Use iconv-lite to encode the content. When the file originally had a BOM + // (bom: true), prepend the correct BOM bytes for this encoding so the + // byte-order mark is preserved on write-back. + const encoded = iconvEncode(content, encoding); + if (bom) { + const bomBytes = getBOMBytesForEncoding(encoding); + await fs.writeFile( + filePath, + bomBytes ? Buffer.concat([bomBytes, encoded]) : encoded, + ); + } else { + await fs.writeFile(filePath, encoded); + } + } else if (bom) { + // UTF-8 BOM: prepend EF BB BF + // If content already starts with the BOM character, strip it first to avoid double BOM. const normalizedContent = content.charCodeAt(0) === 0xfeff ? content.slice(1) : content; const bomBuffer = Buffer.from([0xef, 0xbb, 0xbf]); diff --git a/packages/core/src/tools/edit.ts b/packages/core/src/tools/edit.ts index 016eb2854..61a318190 100644 --- a/packages/core/src/tools/edit.ts +++ b/packages/core/src/tools/edit.ts @@ -108,6 +108,10 @@ interface CalculatedEdit { occurrences: number; error?: { display: string; raw: string; type: ToolErrorType }; isNewFile: boolean; + /** Detected encoding of the existing file (e.g. 'utf-8', 'gbk') */ + encoding: string; + /** Whether the existing file has a UTF-8 BOM */ + bom: boolean; } class EditToolInvocation implements ToolInvocation { @@ -134,17 +138,22 @@ class EditToolInvocation implements ToolInvocation { let finalNewString = params.new_string; let finalOldString = params.old_string; let occurrences = 0; + let encoding = 'utf-8'; + let bom = false; let error: | { display: string; raw: string; type: ToolErrorType } | undefined = undefined; try { - currentContent = await this.config + const fileInfo = await this.config .getFileSystemService() - .readTextFile(params.file_path); + .readTextFileWithInfo(params.file_path); // Normalize line endings to LF for consistent processing. - currentContent = currentContent.replace(/\r\n/g, '\n'); + currentContent = fileInfo.content.replace(/\r\n/g, '\n'); fileExists = true; + // Encoding and BOM are returned from the same I/O pass, avoiding redundant reads. + encoding = fileInfo.encoding; + bom = fileInfo.bom; } catch (err: unknown) { if (!isNodeError(err) || err.code !== 'ENOENT') { // Rethrow unexpected FS errors (permissions, etc.) @@ -238,6 +247,8 @@ class EditToolInvocation implements ToolInvocation { occurrences, error, isNewFile, + encoding, + bom, }; } @@ -373,7 +384,7 @@ class EditToolInvocation implements ToolInvocation { this.ensureParentDirectoriesExist(this.params.file_path); // For new files, apply default file encoding setting - // For existing files, keep original content as-is (including any BOM character) + // For existing files, preserve the original encoding (BOM and charset) if (editData.isNewFile) { const useBOM = this.config.getDefaultFileEncoding() === FileEncoding.UTF8_BOM; @@ -385,7 +396,10 @@ class EditToolInvocation implements ToolInvocation { } else { await this.config .getFileSystemService() - .writeTextFile(this.params.file_path, editData.newContent); + .writeTextFile(this.params.file_path, editData.newContent, { + bom: editData.bom, + encoding: editData.encoding, + }); } const fileName = path.basename(this.params.file_path); diff --git a/packages/core/src/tools/write-file.test.ts b/packages/core/src/tools/write-file.test.ts index b0d7a2b0d..e096b0a72 100644 --- a/packages/core/src/tools/write-file.test.ts +++ b/packages/core/src/tools/write-file.test.ts @@ -759,6 +759,7 @@ describe('WriteFileTool', () => { // Verify writeTextFile was called with bom: true expect(writeSpy).toHaveBeenCalledWith(filePath, newContent, { bom: true, + encoding: 'utf-8', }); // Cleanup @@ -785,6 +786,7 @@ describe('WriteFileTool', () => { // Verify writeTextFile was called with bom: false expect(writeSpy).toHaveBeenCalledWith(filePath, newContent, { bom: false, + encoding: 'utf-8', }); // Cleanup diff --git a/packages/core/src/tools/write-file.ts b/packages/core/src/tools/write-file.ts index 1ccb7bf0b..4085e3b69 100644 --- a/packages/core/src/tools/write-file.ts +++ b/packages/core/src/tools/write-file.ts @@ -243,17 +243,25 @@ class WriteFileToolInvocation extends BaseToolInvocation< // Check if file exists and has BOM to preserve encoding // For new files, use the configured default encoding let useBOM = false; + let detectedEncoding: string | undefined; if (!isNewFile) { - useBOM = await this.config + // Use readTextFileWithInfo for a single I/O pass that returns encoding + // and BOM metadata together, avoiding separate detectFileBOM / detectFileEncoding calls. + const fileInfo = await this.config .getFileSystemService() - .detectFileBOM(file_path); + .readTextFileWithInfo(file_path); + useBOM = fileInfo.bom; + detectedEncoding = fileInfo.encoding; } else { useBOM = this.config.getDefaultFileEncoding() === FileEncoding.UTF8_BOM; } await this.config .getFileSystemService() - .writeTextFile(file_path, fileContent, { bom: useBOM }); + .writeTextFile(file_path, fileContent, { + bom: useBOM, + encoding: detectedEncoding, + }); // Generate diff for display result const fileName = path.basename(file_path); diff --git a/packages/core/src/utils/fileUtils.test.ts b/packages/core/src/utils/fileUtils.test.ts index b21ee79e2..6dc38e4d7 100644 --- a/packages/core/src/utils/fileUtils.test.ts +++ b/packages/core/src/utils/fileUtils.test.ts @@ -28,6 +28,8 @@ import { processSingleFileContent, detectBOM, readFileWithEncoding, + readFileWithEncodingInfo, + detectFileEncoding, fileExists, } from './fileUtils.js'; import type { Config } from '../config/config.js'; @@ -407,6 +409,153 @@ describe('fileUtils', () => { const result = await readFileWithEncoding(filePath); expect(result).toBe(''); }); + + it('should read GBK-encoded file with Chinese characters correctly', async () => { + // GBK encoding of "你好世界这是中文内容用于测试编码检测" + // Needs enough content for chardet to reliably detect the encoding + const gbkBuffer = Buffer.from([ + 0xc4, 0xe3, 0xba, 0xc3, 0xca, 0xc0, 0xbd, 0xe7, 0xd5, 0xe2, 0xca, + 0xc7, 0xd6, 0xd0, 0xce, 0xc4, 0xc4, 0xda, 0xc8, 0xdd, 0xd3, 0xc3, + 0xd3, 0xda, 0xb2, 0xe2, 0xca, 0xd4, 0xb1, 0xe0, 0xc2, 0xeb, 0xbc, + 0xec, 0xb2, 0xe2, + ]); + const filePath = path.join(testDir, 'gbk-chinese.txt'); + await fsPromises.writeFile(filePath, gbkBuffer); + + const result = await readFileWithEncoding(filePath); + expect(result).toBe('你好世界这是中文内容用于测试编码检测'); + }); + + it('should read GBK-encoded file with mixed ASCII and Chinese correctly', async () => { + // GBK encoding of "// 这是注释内容用于测试\nhello你好世界测试中文编码检测\n函数返回值正确" + // Needs enough Chinese content for chardet to reliably detect as GB18030/GBK + const gbkBuffer = Buffer.from([ + 0x2f, 0x2f, 0x20, 0xd5, 0xe2, 0xca, 0xc7, 0xd7, 0xa2, 0xca, 0xcd, + 0xc4, 0xda, 0xc8, 0xdd, 0xd3, 0xc3, 0xd3, 0xda, 0xb2, 0xe2, 0xca, + 0xd4, 0x0a, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0xc4, 0xe3, 0xba, 0xc3, + 0xca, 0xc0, 0xbd, 0xe7, 0xb2, 0xe2, 0xca, 0xd4, 0xd6, 0xd0, 0xce, + 0xc4, 0xb1, 0xe0, 0xc2, 0xeb, 0xbc, 0xec, 0xb2, 0xe2, 0x0a, 0xba, + 0xaf, 0xca, 0xfd, 0xb7, 0xb5, 0xbb, 0xd8, 0xd6, 0xb5, 0xd5, 0xfd, + 0xc8, 0xb7, + ]); + const filePath = path.join(testDir, 'gbk-mixed.txt'); + await fsPromises.writeFile(filePath, gbkBuffer); + + const result = await readFileWithEncoding(filePath); + expect(result).toContain('hello'); + expect(result).toContain('你好世界'); + expect(result).toContain('函数返回值正确'); + }); + }); + + describe('readFileWithEncodingInfo', () => { + it('should return bom: false and encoding utf-8 for plain UTF-8 file', async () => { + const filePath = path.join(testDir, 'info-utf8.txt'); + await fsPromises.writeFile(filePath, 'Hello', 'utf8'); + + const result = await readFileWithEncodingInfo(filePath); + expect(result.content).toBe('Hello'); + expect(result.encoding).toBe('utf-8'); + expect(result.bom).toBe(false); + }); + + it('should return bom: true and encoding utf-8 for UTF-8 BOM file', async () => { + const utf8Bom = Buffer.from([0xef, 0xbb, 0xbf]); + const filePath = path.join(testDir, 'info-utf8-bom.txt'); + await fsPromises.writeFile( + filePath, + Buffer.concat([utf8Bom, Buffer.from('Hello', 'utf8')]), + ); + + const result = await readFileWithEncodingInfo(filePath); + expect(result.content).toBe('Hello'); + expect(result.encoding).toBe('utf-8'); + expect(result.bom).toBe(true); + }); + + it('should return bom: true and encoding utf-16le for UTF-16LE BOM file', async () => { + const utf16leBom = Buffer.from([0xff, 0xfe]); + const utf16leContent = Buffer.from('Hi', 'utf16le'); + const filePath = path.join(testDir, 'info-utf16le.txt'); + await fsPromises.writeFile( + filePath, + Buffer.concat([utf16leBom, utf16leContent]), + ); + + const result = await readFileWithEncodingInfo(filePath); + expect(result.content).toBe('Hi'); + expect(result.encoding).toBe('utf-16le'); + // Non-UTF-8 BOM should also be flagged so it is preserved on write-back + expect(result.bom).toBe(true); + }); + + it('should return bom: false for GBK file (no BOM)', async () => { + const gbkBuffer = Buffer.from([ + 0xc4, 0xe3, 0xba, 0xc3, 0xca, 0xc0, 0xbd, 0xe7, 0xd5, 0xe2, 0xca, + 0xc7, 0xd6, 0xd0, 0xce, 0xc4, 0xc4, 0xda, 0xc8, 0xdd, 0xd3, 0xc3, + 0xd3, 0xda, 0xb2, 0xe2, 0xca, 0xd4, 0xb1, 0xe0, 0xc2, 0xeb, 0xbc, + 0xec, 0xb2, 0xe2, + ]); + const filePath = path.join(testDir, 'info-gbk.txt'); + await fsPromises.writeFile(filePath, gbkBuffer); + + const result = await readFileWithEncodingInfo(filePath); + expect(result.bom).toBe(false); + expect(result.encoding).toBe('gb18030'); + expect(result.content).toBe('你好世界这是中文内容用于测试编码检测'); + }); + }); + + describe('detectFileEncoding', () => { + it('should detect UTF-8 for plain ASCII file', async () => { + const filePath = path.join(testDir, 'ascii.txt'); + await fsPromises.writeFile(filePath, 'Hello World', 'utf8'); + + const encoding = await detectFileEncoding(filePath); + expect(encoding).toBe('utf-8'); + }); + + it('should detect UTF-8 for file with UTF-8 BOM', async () => { + const utf8Bom = Buffer.from([0xef, 0xbb, 0xbf]); + const content = Buffer.from('Hello', 'utf8'); + const filePath = path.join(testDir, 'utf8-bom-detect.txt'); + await fsPromises.writeFile(filePath, Buffer.concat([utf8Bom, content])); + + const encoding = await detectFileEncoding(filePath); + expect(encoding).toBe('utf-8'); + }); + + it('should detect GBK encoding for Chinese text in GBK', async () => { + // GBK encoding of "你好世界这是中文内容用于测试编码检测" + // Needs enough content for chardet to reliably detect + const gbkBuffer = Buffer.from([ + 0xc4, 0xe3, 0xba, 0xc3, 0xca, 0xc0, 0xbd, 0xe7, 0xd5, 0xe2, 0xca, + 0xc7, 0xd6, 0xd0, 0xce, 0xc4, 0xc4, 0xda, 0xc8, 0xdd, 0xd3, 0xc3, + 0xd3, 0xda, 0xb2, 0xe2, 0xca, 0xd4, 0xb1, 0xe0, 0xc2, 0xeb, 0xbc, + 0xec, 0xb2, 0xe2, + ]); + const filePath = path.join(testDir, 'gbk-detect.txt'); + await fsPromises.writeFile(filePath, gbkBuffer); + + const encoding = await detectFileEncoding(filePath); + // chardet detects GBK as 'gb18030' (its superset) + expect(encoding).toBe('gb18030'); + }); + + it('should return utf-8 for empty file', async () => { + const filePath = path.join(testDir, 'empty-detect.txt'); + await fsPromises.writeFile(filePath, ''); + + const encoding = await detectFileEncoding(filePath); + expect(encoding).toBe('utf-8'); + }); + + it('should return utf-8 for non-existent file', async () => { + const filePath = path.join(testDir, 'nonexistent-detect.txt'); + + const encoding = await detectFileEncoding(filePath); + expect(encoding).toBe('utf-8'); + }); }); describe('isBinaryFile with BOM awareness', () => { diff --git a/packages/core/src/utils/fileUtils.ts b/packages/core/src/utils/fileUtils.ts index aab6935cb..05de408ef 100644 --- a/packages/core/src/utils/fileUtils.ts +++ b/packages/core/src/utils/fileUtils.ts @@ -9,10 +9,16 @@ import fsPromises from 'node:fs/promises'; import path from 'node:path'; import type { PartUnion } from '@google/genai'; import mime from 'mime/lite'; +import { + iconvDecode, + iconvEncodingExists, + isUtf8CompatibleEncoding, +} from './iconvHelper.js'; import { ToolErrorType } from '../tools/tool-error.js'; import { BINARY_EXTENSIONS } from './ignorePatterns.js'; import type { Config } from '../config/config.js'; import { createDebugLogger } from './debugLogger.js'; +import { detectEncodingFromBuffer } from './systemEncoding.js'; const debugLogger = createDebugLogger('FILE_UTILS'); @@ -118,23 +124,41 @@ function decodeUTF32(buf: Buffer, littleEndian: boolean): string { } /** - * Read a file as text, honoring BOM encodings (UTF‑8/16/32) and stripping the BOM. - * Falls back to utf8 when no BOM is present. + * Check whether a buffer is valid UTF-8 by attempting a strict decode. + * If any invalid byte sequence is encountered, TextDecoder with `fatal: true` throws. */ -export async function readFileWithEncoding(filePath: string): Promise { - // Read the file once; detect BOM and decode from the single buffer. - const full = await fs.promises.readFile(filePath); - if (full.length === 0) return ''; - - const bom = detectBOM(full); - if (!bom) { - // No BOM → treat as UTF‑8 - return full.toString('utf8'); +function isValidUtf8(buffer: Buffer): boolean { + try { + new TextDecoder('utf-8', { fatal: true }).decode(buffer); + return true; + } catch { + return false; } +} - // Strip BOM and decode per encoding - const content = full.subarray(bom.bomLength); - switch (bom.encoding) { +/** + * Result of reading a file with encoding detection. + */ +export interface FileReadResult { + /** Decoded text content of the file (BOM stripped if present). */ + content: string; + /** Detected encoding name (e.g. 'utf-8', 'gb18030', 'utf-16le'). */ + encoding: string; + /** + * Whether the file had a Unicode BOM (UTF-8, UTF-16 LE/BE, or UTF-32 LE/BE). + * When true, the same BOM should be re-written on save to preserve the file's + * original byte-order mark. + */ + bom: boolean; +} + +/** + * Internal helper: decode a buffer given a BOMInfo. + * Returns the decoded string for each supported BOM encoding. + */ +function decodeBOMBuffer(buf: Buffer, bomInfo: BOMInfo): string { + const content = buf.subarray(bomInfo.bomLength); + switch (bomInfo.encoding) { case 'utf8': return content.toString('utf8'); case 'utf16le': @@ -151,6 +175,153 @@ export async function readFileWithEncoding(filePath: string): Promise { } } +/** + * Map a BOMInfo encoding to a canonical encoding name string. + */ +function bomEncodingToName(bomEncoding: UnicodeEncoding): string { + switch (bomEncoding) { + case 'utf8': + return 'utf-8'; + case 'utf16le': + return 'utf-16le'; + case 'utf16be': + return 'utf-16be'; + case 'utf32le': + return 'utf-32le'; + case 'utf32be': + return 'utf-32be'; + default: + return 'utf-8'; + } +} + +/** + * Read a file as text, honoring BOM encodings (UTF‑8/16/32) and stripping the BOM. + * For files without BOM, validates UTF-8 first. If invalid UTF-8, uses chardet + * to detect encoding (e.g. GBK, Big5, Shift_JIS) and iconv-lite to decode. + * Falls back to utf8 when detection fails. + * + * Returns both the decoded content and the detected encoding/BOM information + * in a single I/O pass, avoiding redundant file reads. + */ +export async function readFileWithEncodingInfo( + filePath: string, +): Promise { + // Read the file once; detect BOM and decode from the single buffer. + const full = await fs.promises.readFile(filePath); + if (full.length === 0) return { content: '', encoding: 'utf-8', bom: false }; + + const bomInfo = detectBOM(full); + if (bomInfo) { + return { + content: decodeBOMBuffer(full, bomInfo), + encoding: bomEncodingToName(bomInfo.encoding), + // Mark bom: true for all Unicode BOM variants (UTF-8/16/32) so that + // the BOM is re-written on save and the file's original format is preserved. + bom: true, + }; + } + + // No BOM — check if it's valid UTF-8 first (fast path for the common case) + if (isValidUtf8(full)) { + return { content: full.toString('utf8'), encoding: 'utf-8', bom: false }; + } + + // Not valid UTF-8 — try chardet-based encoding detection + const detected = detectEncodingFromBuffer(full); + if (detected && !isUtf8CompatibleEncoding(detected)) { + try { + if (iconvEncodingExists(detected)) { + return { + content: iconvDecode(full, detected), + encoding: detected, + bom: false, + }; + } + } catch (e) { + debugLogger.warn( + `Failed to decode file ${filePath} as ${detected}: ${e instanceof Error ? e.message : String(e)}`, + ); + } + } + + // Final fallback: UTF-8 with replacement characters + return { content: full.toString('utf8'), encoding: 'utf-8', bom: false }; +} + +/** + * Read a file as text, honoring BOM encodings (UTF‑8/16/32) and stripping the BOM. + * For files without BOM, validates UTF-8 first. If invalid UTF-8, uses chardet + * to detect encoding (e.g. GBK, Big5, Shift_JIS) and iconv-lite to decode. + * Falls back to utf8 when detection fails. + */ +export async function readFileWithEncoding(filePath: string): Promise { + const result = await readFileWithEncodingInfo(filePath); + return result.content; +} + +/** + * Detect the encoding of a file by reading a sample from its beginning. + * Returns the encoding name (e.g. 'utf-8', 'gbk', 'shift_jis'). + * Uses BOM detection first, then UTF-8 validation, then chardet as fallback. + */ +export async function detectFileEncoding(filePath: string): Promise { + let fh: fs.promises.FileHandle | null = null; + try { + fh = await fs.promises.open(filePath, 'r'); + const stats = await fh.stat(); + if (stats.size === 0) return 'utf-8'; + + // Read a sample (up to 8KB) for detection + const sampleSize = Math.min(8192, stats.size); + const buf = Buffer.alloc(sampleSize); + const { bytesRead } = await fh.read(buf, 0, sampleSize, 0); + if (bytesRead === 0) return 'utf-8'; + const sample = buf.subarray(0, bytesRead); + + // 1. Check for BOM + const bom = detectBOM(sample); + if (bom) { + switch (bom.encoding) { + case 'utf8': + return 'utf-8'; + case 'utf16le': + return 'utf-16le'; + case 'utf16be': + return 'utf-16be'; + case 'utf32le': + return 'utf-32le'; + case 'utf32be': + return 'utf-32be'; + default: + return 'utf-8'; + } + } + + // 2. Validate UTF-8 + if (isValidUtf8(sample)) return 'utf-8'; + + // 3. Use chardet for detection + const detected = detectEncodingFromBuffer(sample); + if (detected && !isUtf8CompatibleEncoding(detected)) { + return detected; + } + + return 'utf-8'; + } catch { + // If file can't be read, default to UTF-8 + return 'utf-8'; + } finally { + if (fh) { + try { + await fh.close(); + } catch { + // Ignore close errors + } + } + } +} + /** * Looks up the specific MIME type for a file path. * @param filePath Path to the file. diff --git a/packages/core/src/utils/iconvHelper.ts b/packages/core/src/utils/iconvHelper.ts new file mode 100644 index 000000000..12c1a56c8 --- /dev/null +++ b/packages/core/src/utils/iconvHelper.ts @@ -0,0 +1,65 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * Helper module to bridge iconv-lite CJS module with our ESM codebase. + * iconv-lite v0.6.x uses ambient `declare module` type declarations + * that are incompatible with NodeNext module resolution. + * This module provides properly-typed wrappers. + */ + +interface IconvLite { + decode(buffer: Buffer, encoding: string): string; + encode(content: string, encoding: string): Buffer; + encodingExists(encoding: string): boolean; +} + +// iconv-lite is a CJS module. Under NodeNext resolution, its ambient type +// declarations don't map correctly. We import the default export (which is +// the CJS module.exports object) and cast it to a proper interface. +import iconvModule from 'iconv-lite'; +const iconvLite: IconvLite = iconvModule as unknown as IconvLite; + +/** + * Decode a buffer using the specified encoding. + * @param buffer The buffer to decode + * @param encoding The encoding to use (e.g. 'gbk', 'big5', 'shift_jis') + * @returns The decoded string + */ +export function iconvDecode(buffer: Buffer, encoding: string): string { + return iconvLite.decode(buffer, encoding); +} + +/** + * Encode a string to a buffer using the specified encoding. + * @param content The string to encode + * @param encoding The encoding to use (e.g. 'gbk', 'big5', 'shift_jis') + * @returns The encoded buffer + */ +export function iconvEncode(content: string, encoding: string): Buffer { + return iconvLite.encode(content, encoding); +} + +/** + * Check if an encoding is supported by iconv-lite. + * @param encoding The encoding name to check + * @returns True if the encoding is supported + */ +export function iconvEncodingExists(encoding: string): boolean { + return iconvLite.encodingExists(encoding); +} + +/** + * Check whether an encoding name represents a UTF-8 compatible encoding + * that Node's Buffer can handle natively without iconv-lite. + * Normalizes encoding names (e.g. 'utf-8', 'UTF8', 'us-ascii' all match). + * @param encoding The encoding name to check + * @returns True if the encoding is UTF-8 or ASCII compatible + */ +export function isUtf8CompatibleEncoding(encoding: string): boolean { + const lower = encoding.toLowerCase().replace(/[^a-z0-9]/g, ''); + return lower === 'utf8' || lower === 'ascii' || lower === 'usascii'; +}