mirror of
https://github.com/QwenLM/qwen-code.git
synced 2026-04-28 11:41:04 +00:00
fix: preserve original encoding when reading/writing non-UTF-8 files
Fixes #2069

- Add iconv-lite dependency for non-UTF-8 encoding support
- Add iconvHelper.ts as a CJS/ESM compatibility wrapper
- Update readFileWithEncoding() to detect and handle GBK/Big5/Shift_JIS using BOM detection -> UTF-8 validation -> chardet -> iconv-lite fallback
- Add detectFileEncoding() to identify file encoding before writes
- Update writeTextFile() to accept an encoding option and encode with iconv-lite
- Update WriteFileTool and EditTool to detect and preserve the original encoding
- Add tests for GBK read/write, detectFileEncoding, and the encoding option
This commit is contained in:
parent
407a66c959
commit
a5eb1733fa
11 changed files with 375 additions and 32 deletions
1
package-lock.json
generated
1
package-lock.json
generated
|
|
@ -19471,6 +19471,7 @@
|
||||||
"google-auth-library": "^10.5.0",
|
"google-auth-library": "^10.5.0",
|
||||||
"html-to-text": "^9.0.5",
|
"html-to-text": "^9.0.5",
|
||||||
"https-proxy-agent": "^7.0.6",
|
"https-proxy-agent": "^7.0.6",
|
||||||
|
"iconv-lite": "^0.6.3",
|
||||||
"ignore": "^7.0.0",
|
"ignore": "^7.0.0",
|
||||||
"jsonrepair": "^3.13.0",
|
"jsonrepair": "^3.13.0",
|
||||||
"marked": "^15.0.12",
|
"marked": "^15.0.12",
|
||||||
|
|
|
||||||
|
|
@ -57,7 +57,7 @@ export class AcpFileSystemService implements FileSystemService {
|
||||||
async writeTextFile(
|
async writeTextFile(
|
||||||
filePath: string,
|
filePath: string,
|
||||||
content: string,
|
content: string,
|
||||||
options?: { bom?: boolean },
|
options?: { bom?: boolean; encoding?: string },
|
||||||
): Promise<void> {
|
): Promise<void> {
|
||||||
if (!this.capabilities.writeTextFile) {
|
if (!this.capabilities.writeTextFile) {
|
||||||
return this.fallback.writeTextFile(filePath, content, options);
|
return this.fallback.writeTextFile(filePath, content, options);
|
||||||
|
|
|
||||||
|
|
@ -42,6 +42,7 @@
|
||||||
"ajv-formats": "^3.0.0",
|
"ajv-formats": "^3.0.0",
|
||||||
"async-mutex": "^0.5.0",
|
"async-mutex": "^0.5.0",
|
||||||
"chardet": "^2.1.0",
|
"chardet": "^2.1.0",
|
||||||
|
"iconv-lite": "^0.6.3",
|
||||||
"chokidar": "^4.0.3",
|
"chokidar": "^4.0.3",
|
||||||
"diff": "^7.0.0",
|
"diff": "^7.0.0",
|
||||||
"dotenv": "^17.1.0",
|
"dotenv": "^17.1.0",
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,16 @@ import { StandardFileSystemService } from './fileSystemService.js';
|
||||||
|
|
||||||
vi.mock('fs/promises');
|
vi.mock('fs/promises');
|
||||||
|
|
||||||
|
vi.mock('../utils/fileUtils.js', async (importOriginal) => {
|
||||||
|
const actual = await importOriginal<typeof import('../utils/fileUtils.js')>();
|
||||||
|
return {
|
||||||
|
...actual,
|
||||||
|
readFileWithEncoding: vi.fn(),
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
import { readFileWithEncoding } from '../utils/fileUtils.js';
|
||||||
|
|
||||||
describe('StandardFileSystemService', () => {
|
describe('StandardFileSystemService', () => {
|
||||||
let fileSystem: StandardFileSystemService;
|
let fileSystem: StandardFileSystemService;
|
||||||
|
|
||||||
|
|
@ -23,19 +33,19 @@ describe('StandardFileSystemService', () => {
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('readTextFile', () => {
|
describe('readTextFile', () => {
|
||||||
it('should read file content using fs', async () => {
|
it('should read file content using readFileWithEncoding', async () => {
|
||||||
const testContent = 'Hello, World!';
|
const testContent = 'Hello, World!';
|
||||||
vi.mocked(fs.readFile).mockResolvedValue(testContent);
|
vi.mocked(readFileWithEncoding).mockResolvedValue(testContent);
|
||||||
|
|
||||||
const result = await fileSystem.readTextFile('/test/file.txt');
|
const result = await fileSystem.readTextFile('/test/file.txt');
|
||||||
|
|
||||||
expect(fs.readFile).toHaveBeenCalledWith('/test/file.txt', 'utf-8');
|
expect(readFileWithEncoding).toHaveBeenCalledWith('/test/file.txt');
|
||||||
expect(result).toBe(testContent);
|
expect(result).toBe(testContent);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('should propagate fs.readFile errors', async () => {
|
it('should propagate readFileWithEncoding errors', async () => {
|
||||||
const error = new Error('ENOENT: File not found');
|
const error = new Error('ENOENT: File not found');
|
||||||
vi.mocked(fs.readFile).mockRejectedValue(error);
|
vi.mocked(readFileWithEncoding).mockRejectedValue(error);
|
||||||
|
|
||||||
await expect(fileSystem.readTextFile('/test/file.txt')).rejects.toThrow(
|
await expect(fileSystem.readTextFile('/test/file.txt')).rejects.toThrow(
|
||||||
'ENOENT: File not found',
|
'ENOENT: File not found',
|
||||||
|
|
@ -120,6 +130,32 @@ describe('StandardFileSystemService', () => {
|
||||||
}
|
}
|
||||||
expect(bomCount).toBe(1);
|
expect(bomCount).toBe(1);
|
||||||
});
|
});
|
||||||
|
it('should write file with non-UTF-8 encoding using iconv-lite', async () => {
  vi.mocked(fs.writeFile).mockResolvedValue();

  await fileSystem.writeTextFile('/test/file.txt', '你好世界', {
    encoding: 'gbk',
  });

  // fs.writeFile must receive pre-encoded bytes rather than a string.
  const writeCall = vi.mocked(fs.writeFile).mock.calls[0];
  expect(writeCall[0]).toBe('/test/file.txt');
  expect(writeCall[1]).toBeInstanceOf(Buffer);

  // Checking only for "a Buffer" would also pass if the text were encoded as
  // UTF-8, so pin the exact GBK byte sequence for "你好世界".
  expect(writeCall[1]).toEqual(
    Buffer.from([0xc4, 0xe3, 0xba, 0xc3, 0xca, 0xc0, 0xbd, 0xe7]),
  );
});
|
||||||
|
|
||||||
|
it('should write file as UTF-8 when encoding is utf-8', async () => {
|
||||||
|
vi.mocked(fs.writeFile).mockResolvedValue();
|
||||||
|
|
||||||
|
await fileSystem.writeTextFile('/test/file.txt', 'Hello', {
|
||||||
|
encoding: 'utf-8',
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(fs.writeFile).toHaveBeenCalledWith(
|
||||||
|
'/test/file.txt',
|
||||||
|
'Hello',
|
||||||
|
'utf-8',
|
||||||
|
);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('detectFileBOM', () => {
|
describe('detectFileBOM', () => {
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,8 @@
|
||||||
import fs from 'node:fs/promises';
|
import fs from 'node:fs/promises';
|
||||||
import * as path from 'node:path';
|
import * as path from 'node:path';
|
||||||
import { globSync } from 'glob';
|
import { globSync } from 'glob';
|
||||||
|
import { readFileWithEncoding } from '../utils/fileUtils.js';
|
||||||
|
import { iconvEncode, iconvEncodingExists } from '../utils/iconvHelper.js';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Supported file encodings for new files.
|
* Supported file encodings for new files.
|
||||||
|
|
@ -74,6 +76,14 @@ export interface WriteTextFileOptions {
|
||||||
* @default false
|
* @default false
|
||||||
*/
|
*/
|
||||||
bom?: boolean;
|
bom?: boolean;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The encoding to use when writing the file.
|
||||||
|
* If specified and not UTF-8 compatible, iconv-lite will be used to encode.
|
||||||
|
* This is used to preserve the original encoding of non-UTF-8 files (e.g. GBK, Big5).
|
||||||
|
* @default undefined (writes as UTF-8)
|
||||||
|
*/
|
||||||
|
encoding?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -92,12 +102,22 @@ function hasUTF8BOM(buffer: Buffer): boolean {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Tell whether `encoding` names a charset that can be written without
 * iconv-lite.
 *
 * The label is lower-cased and stripped of every non-alphanumeric character
 * before comparison, so 'UTF-8', 'utf_8' and 'utf8' are all treated alike.
 *
 * @param encoding Encoding label to classify.
 * @returns True for UTF-8 and (US-)ASCII labels, false for anything else.
 */
function isUtf8CompatibleEncoding(encoding: string): boolean {
  const canonical = encoding.toLowerCase().replace(/[^a-z0-9]/g, '');
  switch (canonical) {
    case 'utf8':
    case 'ascii':
    case 'usascii':
      return true;
    default:
      return false;
  }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Standard file system implementation
|
* Standard file system implementation
|
||||||
*/
|
*/
|
||||||
export class StandardFileSystemService implements FileSystemService {
|
export class StandardFileSystemService implements FileSystemService {
|
||||||
async readTextFile(filePath: string): Promise<string> {
|
async readTextFile(filePath: string): Promise<string> {
|
||||||
return fs.readFile(filePath, FileEncoding.UTF8);
|
// Use encoding-aware reader that handles BOM and non-UTF-8 encodings (e.g. GBK)
|
||||||
|
return readFileWithEncoding(filePath);
|
||||||
}
|
}
|
||||||
|
|
||||||
async writeTextFile(
|
async writeTextFile(
|
||||||
|
|
@ -106,8 +126,19 @@ export class StandardFileSystemService implements FileSystemService {
|
||||||
options?: WriteTextFileOptions,
|
options?: WriteTextFileOptions,
|
||||||
): Promise<void> {
|
): Promise<void> {
|
||||||
const bom = options?.bom ?? false;
|
const bom = options?.bom ?? false;
|
||||||
|
const encoding = options?.encoding;
|
||||||
|
|
||||||
if (bom) {
|
// Check if a non-UTF-8 encoding is specified and supported
|
||||||
|
const isNonUtf8Encoding =
|
||||||
|
encoding &&
|
||||||
|
!isUtf8CompatibleEncoding(encoding) &&
|
||||||
|
iconvEncodingExists(encoding);
|
||||||
|
|
||||||
|
if (isNonUtf8Encoding) {
|
||||||
|
// Non-UTF-8 encoding (e.g. GBK, Big5, Shift_JIS) — use iconv-lite to encode
|
||||||
|
const encoded = iconvEncode(content, encoding);
|
||||||
|
await fs.writeFile(filePath, encoded);
|
||||||
|
} else if (bom) {
|
||||||
// Prepend UTF-8 BOM (EF BB BF)
|
// Prepend UTF-8 BOM (EF BB BF)
|
||||||
// If content already starts with BOM character, strip it first to avoid double BOM
|
// If content already starts with BOM character, strip it first to avoid double BOM
|
||||||
const normalizedContent =
|
const normalizedContent =
|
||||||
|
|
|
||||||
|
|
@ -27,7 +27,7 @@ import { ToolNames, ToolDisplayNames } from './tool-names.js';
|
||||||
import { logFileOperation } from '../telemetry/loggers.js';
|
import { logFileOperation } from '../telemetry/loggers.js';
|
||||||
import { FileOperationEvent } from '../telemetry/types.js';
|
import { FileOperationEvent } from '../telemetry/types.js';
|
||||||
import { FileOperation } from '../telemetry/metrics.js';
|
import { FileOperation } from '../telemetry/metrics.js';
|
||||||
import { getSpecificMimeType } from '../utils/fileUtils.js';
|
import { getSpecificMimeType, detectFileEncoding } from '../utils/fileUtils.js';
|
||||||
import { getLanguageFromFilePath } from '../utils/language-detection.js';
|
import { getLanguageFromFilePath } from '../utils/language-detection.js';
|
||||||
import type {
|
import type {
|
||||||
ModifiableDeclarativeTool,
|
ModifiableDeclarativeTool,
|
||||||
|
|
@ -108,6 +108,10 @@ interface CalculatedEdit {
|
||||||
occurrences: number;
|
occurrences: number;
|
||||||
error?: { display: string; raw: string; type: ToolErrorType };
|
error?: { display: string; raw: string; type: ToolErrorType };
|
||||||
isNewFile: boolean;
|
isNewFile: boolean;
|
||||||
|
/** Detected encoding of the existing file (e.g. 'utf-8', 'gbk') */
|
||||||
|
encoding: string;
|
||||||
|
/** Whether the existing file has a UTF-8 BOM */
|
||||||
|
bom: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
class EditToolInvocation implements ToolInvocation<EditToolParams, ToolResult> {
|
class EditToolInvocation implements ToolInvocation<EditToolParams, ToolResult> {
|
||||||
|
|
@ -134,6 +138,8 @@ class EditToolInvocation implements ToolInvocation<EditToolParams, ToolResult> {
|
||||||
let finalNewString = params.new_string;
|
let finalNewString = params.new_string;
|
||||||
let finalOldString = params.old_string;
|
let finalOldString = params.old_string;
|
||||||
let occurrences = 0;
|
let occurrences = 0;
|
||||||
|
let encoding = 'utf-8';
|
||||||
|
let bom = false;
|
||||||
let error:
|
let error:
|
||||||
| { display: string; raw: string; type: ToolErrorType }
|
| { display: string; raw: string; type: ToolErrorType }
|
||||||
| undefined = undefined;
|
| undefined = undefined;
|
||||||
|
|
@ -145,6 +151,11 @@ class EditToolInvocation implements ToolInvocation<EditToolParams, ToolResult> {
|
||||||
// Normalize line endings to LF for consistent processing.
|
// Normalize line endings to LF for consistent processing.
|
||||||
currentContent = currentContent.replace(/\r\n/g, '\n');
|
currentContent = currentContent.replace(/\r\n/g, '\n');
|
||||||
fileExists = true;
|
fileExists = true;
|
||||||
|
// Detect encoding and BOM to preserve original file characteristics on write-back
|
||||||
|
encoding = await detectFileEncoding(params.file_path);
|
||||||
|
bom = await this.config
|
||||||
|
.getFileSystemService()
|
||||||
|
.detectFileBOM(params.file_path);
|
||||||
} catch (err: unknown) {
|
} catch (err: unknown) {
|
||||||
if (!isNodeError(err) || err.code !== 'ENOENT') {
|
if (!isNodeError(err) || err.code !== 'ENOENT') {
|
||||||
// Rethrow unexpected FS errors (permissions, etc.)
|
// Rethrow unexpected FS errors (permissions, etc.)
|
||||||
|
|
@ -238,6 +249,8 @@ class EditToolInvocation implements ToolInvocation<EditToolParams, ToolResult> {
|
||||||
occurrences,
|
occurrences,
|
||||||
error,
|
error,
|
||||||
isNewFile,
|
isNewFile,
|
||||||
|
encoding,
|
||||||
|
bom,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -373,7 +386,7 @@ class EditToolInvocation implements ToolInvocation<EditToolParams, ToolResult> {
|
||||||
this.ensureParentDirectoriesExist(this.params.file_path);
|
this.ensureParentDirectoriesExist(this.params.file_path);
|
||||||
|
|
||||||
// For new files, apply default file encoding setting
|
// For new files, apply default file encoding setting
|
||||||
// For existing files, keep original content as-is (including any BOM character)
|
// For existing files, preserve the original encoding (BOM and charset)
|
||||||
if (editData.isNewFile) {
|
if (editData.isNewFile) {
|
||||||
const useBOM =
|
const useBOM =
|
||||||
this.config.getDefaultFileEncoding() === FileEncoding.UTF8_BOM;
|
this.config.getDefaultFileEncoding() === FileEncoding.UTF8_BOM;
|
||||||
|
|
@ -385,7 +398,10 @@ class EditToolInvocation implements ToolInvocation<EditToolParams, ToolResult> {
|
||||||
} else {
|
} else {
|
||||||
await this.config
|
await this.config
|
||||||
.getFileSystemService()
|
.getFileSystemService()
|
||||||
.writeTextFile(this.params.file_path, editData.newContent);
|
.writeTextFile(this.params.file_path, editData.newContent, {
|
||||||
|
bom: editData.bom,
|
||||||
|
encoding: editData.encoding,
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
const fileName = path.basename(this.params.file_path);
|
const fileName = path.basename(this.params.file_path);
|
||||||
|
|
|
||||||
|
|
@ -759,6 +759,7 @@ describe('WriteFileTool', () => {
|
||||||
// Verify writeTextFile was called with bom: true
|
// Verify writeTextFile was called with bom: true
|
||||||
expect(writeSpy).toHaveBeenCalledWith(filePath, newContent, {
|
expect(writeSpy).toHaveBeenCalledWith(filePath, newContent, {
|
||||||
bom: true,
|
bom: true,
|
||||||
|
encoding: 'utf-8',
|
||||||
});
|
});
|
||||||
|
|
||||||
// Cleanup
|
// Cleanup
|
||||||
|
|
@ -785,6 +786,7 @@ describe('WriteFileTool', () => {
|
||||||
// Verify writeTextFile was called with bom: false
|
// Verify writeTextFile was called with bom: false
|
||||||
expect(writeSpy).toHaveBeenCalledWith(filePath, newContent, {
|
expect(writeSpy).toHaveBeenCalledWith(filePath, newContent, {
|
||||||
bom: false,
|
bom: false,
|
||||||
|
encoding: 'utf-8',
|
||||||
});
|
});
|
||||||
|
|
||||||
// Cleanup
|
// Cleanup
|
||||||
|
|
|
||||||
|
|
@ -37,7 +37,7 @@ import { IdeClient } from '../ide/ide-client.js';
|
||||||
import { logFileOperation } from '../telemetry/loggers.js';
|
import { logFileOperation } from '../telemetry/loggers.js';
|
||||||
import { FileOperationEvent } from '../telemetry/types.js';
|
import { FileOperationEvent } from '../telemetry/types.js';
|
||||||
import { FileOperation } from '../telemetry/metrics.js';
|
import { FileOperation } from '../telemetry/metrics.js';
|
||||||
import { getSpecificMimeType } from '../utils/fileUtils.js';
|
import { getSpecificMimeType, detectFileEncoding } from '../utils/fileUtils.js';
|
||||||
import { getLanguageFromFilePath } from '../utils/language-detection.js';
|
import { getLanguageFromFilePath } from '../utils/language-detection.js';
|
||||||
import { createDebugLogger } from '../utils/debugLogger.js';
|
import { createDebugLogger } from '../utils/debugLogger.js';
|
||||||
|
|
||||||
|
|
@ -243,17 +243,23 @@ class WriteFileToolInvocation extends BaseToolInvocation<
|
||||||
// Check if file exists and has BOM to preserve encoding
|
// Check if file exists and has BOM to preserve encoding
|
||||||
// For new files, use the configured default encoding
|
// For new files, use the configured default encoding
|
||||||
let useBOM = false;
|
let useBOM = false;
|
||||||
|
let detectedEncoding: string | undefined;
|
||||||
if (!isNewFile) {
|
if (!isNewFile) {
|
||||||
useBOM = await this.config
|
useBOM = await this.config
|
||||||
.getFileSystemService()
|
.getFileSystemService()
|
||||||
.detectFileBOM(file_path);
|
.detectFileBOM(file_path);
|
||||||
|
// Detect encoding to preserve non-UTF-8 encodings (e.g. GBK, Big5)
|
||||||
|
detectedEncoding = await detectFileEncoding(file_path);
|
||||||
} else {
|
} else {
|
||||||
useBOM = this.config.getDefaultFileEncoding() === FileEncoding.UTF8_BOM;
|
useBOM = this.config.getDefaultFileEncoding() === FileEncoding.UTF8_BOM;
|
||||||
}
|
}
|
||||||
|
|
||||||
await this.config
|
await this.config
|
||||||
.getFileSystemService()
|
.getFileSystemService()
|
||||||
.writeTextFile(file_path, fileContent, { bom: useBOM });
|
.writeTextFile(file_path, fileContent, {
|
||||||
|
bom: useBOM,
|
||||||
|
encoding: detectedEncoding,
|
||||||
|
});
|
||||||
|
|
||||||
// Generate diff for display result
|
// Generate diff for display result
|
||||||
const fileName = path.basename(file_path);
|
const fileName = path.basename(file_path);
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,7 @@ import {
|
||||||
processSingleFileContent,
|
processSingleFileContent,
|
||||||
detectBOM,
|
detectBOM,
|
||||||
readFileWithEncoding,
|
readFileWithEncoding,
|
||||||
|
detectFileEncoding,
|
||||||
fileExists,
|
fileExists,
|
||||||
} from './fileUtils.js';
|
} from './fileUtils.js';
|
||||||
import type { Config } from '../config/config.js';
|
import type { Config } from '../config/config.js';
|
||||||
|
|
@ -407,6 +408,95 @@ describe('fileUtils', () => {
|
||||||
const result = await readFileWithEncoding(filePath);
|
const result = await readFileWithEncoding(filePath);
|
||||||
expect(result).toBe('');
|
expect(result).toBe('');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('should read GBK-encoded file with Chinese characters correctly', async () => {
|
||||||
|
// GBK encoding of "你好世界这是中文内容用于测试编码检测"
|
||||||
|
// Needs enough content for chardet to reliably detect the encoding
|
||||||
|
const gbkBuffer = Buffer.from([
|
||||||
|
0xc4, 0xe3, 0xba, 0xc3, 0xca, 0xc0, 0xbd, 0xe7, 0xd5, 0xe2, 0xca,
|
||||||
|
0xc7, 0xd6, 0xd0, 0xce, 0xc4, 0xc4, 0xda, 0xc8, 0xdd, 0xd3, 0xc3,
|
||||||
|
0xd3, 0xda, 0xb2, 0xe2, 0xca, 0xd4, 0xb1, 0xe0, 0xc2, 0xeb, 0xbc,
|
||||||
|
0xec, 0xb2, 0xe2,
|
||||||
|
]);
|
||||||
|
const filePath = path.join(testDir, 'gbk-chinese.txt');
|
||||||
|
await fsPromises.writeFile(filePath, gbkBuffer);
|
||||||
|
|
||||||
|
const result = await readFileWithEncoding(filePath);
|
||||||
|
expect(result).toBe('你好世界这是中文内容用于测试编码检测');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should read GBK-encoded file with mixed ASCII and Chinese correctly', async () => {
|
||||||
|
// GBK encoding of "// 这是注释内容用于测试\nhello你好世界测试中文编码检测\n函数返回值正确"
|
||||||
|
// Needs enough Chinese content for chardet to reliably detect as GB18030/GBK
|
||||||
|
const gbkBuffer = Buffer.from([
|
||||||
|
0x2f, 0x2f, 0x20, 0xd5, 0xe2, 0xca, 0xc7, 0xd7, 0xa2, 0xca, 0xcd,
|
||||||
|
0xc4, 0xda, 0xc8, 0xdd, 0xd3, 0xc3, 0xd3, 0xda, 0xb2, 0xe2, 0xca,
|
||||||
|
0xd4, 0x0a, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0xc4, 0xe3, 0xba, 0xc3,
|
||||||
|
0xca, 0xc0, 0xbd, 0xe7, 0xb2, 0xe2, 0xca, 0xd4, 0xd6, 0xd0, 0xce,
|
||||||
|
0xc4, 0xb1, 0xe0, 0xc2, 0xeb, 0xbc, 0xec, 0xb2, 0xe2, 0x0a, 0xba,
|
||||||
|
0xaf, 0xca, 0xfd, 0xb7, 0xb5, 0xbb, 0xd8, 0xd6, 0xb5, 0xd5, 0xfd,
|
||||||
|
0xc8, 0xb7,
|
||||||
|
]);
|
||||||
|
const filePath = path.join(testDir, 'gbk-mixed.txt');
|
||||||
|
await fsPromises.writeFile(filePath, gbkBuffer);
|
||||||
|
|
||||||
|
const result = await readFileWithEncoding(filePath);
|
||||||
|
expect(result).toContain('hello');
|
||||||
|
expect(result).toContain('你好世界');
|
||||||
|
expect(result).toContain('函数返回值正确');
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('detectFileEncoding', () => {
|
||||||
|
it('should detect UTF-8 for plain ASCII file', async () => {
|
||||||
|
const filePath = path.join(testDir, 'ascii.txt');
|
||||||
|
await fsPromises.writeFile(filePath, 'Hello World', 'utf8');
|
||||||
|
|
||||||
|
const encoding = await detectFileEncoding(filePath);
|
||||||
|
expect(encoding).toBe('utf-8');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should detect UTF-8 for file with UTF-8 BOM', async () => {
|
||||||
|
const utf8Bom = Buffer.from([0xef, 0xbb, 0xbf]);
|
||||||
|
const content = Buffer.from('Hello', 'utf8');
|
||||||
|
const filePath = path.join(testDir, 'utf8-bom-detect.txt');
|
||||||
|
await fsPromises.writeFile(filePath, Buffer.concat([utf8Bom, content]));
|
||||||
|
|
||||||
|
const encoding = await detectFileEncoding(filePath);
|
||||||
|
expect(encoding).toBe('utf-8');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should detect GBK encoding for Chinese text in GBK', async () => {
|
||||||
|
// GBK encoding of "你好世界这是中文内容用于测试编码检测"
|
||||||
|
// Needs enough content for chardet to reliably detect
|
||||||
|
const gbkBuffer = Buffer.from([
|
||||||
|
0xc4, 0xe3, 0xba, 0xc3, 0xca, 0xc0, 0xbd, 0xe7, 0xd5, 0xe2, 0xca,
|
||||||
|
0xc7, 0xd6, 0xd0, 0xce, 0xc4, 0xc4, 0xda, 0xc8, 0xdd, 0xd3, 0xc3,
|
||||||
|
0xd3, 0xda, 0xb2, 0xe2, 0xca, 0xd4, 0xb1, 0xe0, 0xc2, 0xeb, 0xbc,
|
||||||
|
0xec, 0xb2, 0xe2,
|
||||||
|
]);
|
||||||
|
const filePath = path.join(testDir, 'gbk-detect.txt');
|
||||||
|
await fsPromises.writeFile(filePath, gbkBuffer);
|
||||||
|
|
||||||
|
const encoding = await detectFileEncoding(filePath);
|
||||||
|
// chardet detects GBK as 'gb18030' (its superset)
|
||||||
|
expect(encoding).toBe('gb18030');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should return utf-8 for empty file', async () => {
|
||||||
|
const filePath = path.join(testDir, 'empty-detect.txt');
|
||||||
|
await fsPromises.writeFile(filePath, '');
|
||||||
|
|
||||||
|
const encoding = await detectFileEncoding(filePath);
|
||||||
|
expect(encoding).toBe('utf-8');
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should return utf-8 for non-existent file', async () => {
|
||||||
|
const filePath = path.join(testDir, 'nonexistent-detect.txt');
|
||||||
|
|
||||||
|
const encoding = await detectFileEncoding(filePath);
|
||||||
|
expect(encoding).toBe('utf-8');
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('isBinaryFile with BOM awareness', () => {
|
describe('isBinaryFile with BOM awareness', () => {
|
||||||
|
|
|
||||||
|
|
@ -9,10 +9,12 @@ import fsPromises from 'node:fs/promises';
|
||||||
import path from 'node:path';
|
import path from 'node:path';
|
||||||
import type { PartUnion } from '@google/genai';
|
import type { PartUnion } from '@google/genai';
|
||||||
import mime from 'mime/lite';
|
import mime from 'mime/lite';
|
||||||
|
import { iconvDecode, iconvEncodingExists } from './iconvHelper.js';
|
||||||
import { ToolErrorType } from '../tools/tool-error.js';
|
import { ToolErrorType } from '../tools/tool-error.js';
|
||||||
import { BINARY_EXTENSIONS } from './ignorePatterns.js';
|
import { BINARY_EXTENSIONS } from './ignorePatterns.js';
|
||||||
import type { Config } from '../config/config.js';
|
import type { Config } from '../config/config.js';
|
||||||
import { createDebugLogger } from './debugLogger.js';
|
import { createDebugLogger } from './debugLogger.js';
|
||||||
|
import { detectEncodingFromBuffer } from './systemEncoding.js';
|
||||||
|
|
||||||
const debugLogger = createDebugLogger('FILE_UTILS');
|
const debugLogger = createDebugLogger('FILE_UTILS');
|
||||||
|
|
||||||
|
|
@ -117,9 +119,33 @@ function decodeUTF32(buf: Buffer, littleEndian: boolean): string {
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Report whether `buffer` holds well-formed UTF-8.
 *
 * The bytes are run through a `TextDecoder` created with `fatal: true`, which
 * throws on the first malformed sequence; that exception is turned into a
 * `false` result.
 *
 * @param buffer Raw bytes to validate.
 * @returns True when the whole buffer decodes as UTF-8, false otherwise.
 */
function isValidUtf8(buffer: Buffer): boolean {
  const strictDecoder = new TextDecoder('utf-8', { fatal: true });
  let wellFormed = true;
  try {
    strictDecoder.decode(buffer);
  } catch {
    wellFormed = false;
  }
  return wellFormed;
}
|
||||||
|
|
||||||
|
/**
 * Tell whether `encoding` is a UTF-8 or ASCII label, i.e. one that needs no
 * iconv-lite decoding.
 *
 * Matching ignores case and any non-alphanumeric characters in the label.
 *
 * @param encoding Encoding label, e.g. as reported by charset detection.
 * @returns True for 'utf8', 'ascii' and 'usascii' (after normalization).
 */
function isUtf8Compatible(encoding: string): boolean {
  const normalized = encoding.toLowerCase().replace(/[^a-z0-9]/g, '');
  return ['utf8', 'ascii', 'usascii'].includes(normalized);
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Read a file as text, honoring BOM encodings (UTF‑8/16/32) and stripping the BOM.
|
* Read a file as text, honoring BOM encodings (UTF‑8/16/32) and stripping the BOM.
|
||||||
* Falls back to utf8 when no BOM is present.
|
* For files without BOM, validates UTF-8 first. If invalid UTF-8, uses chardet
|
||||||
|
* to detect encoding (e.g. GBK, Big5, Shift_JIS) and iconv-lite to decode.
|
||||||
|
* Falls back to utf8 when detection fails.
|
||||||
*/
|
*/
|
||||||
export async function readFileWithEncoding(filePath: string): Promise<string> {
|
export async function readFileWithEncoding(filePath: string): Promise<string> {
|
||||||
// Read the file once; detect BOM and decode from the single buffer.
|
// Read the file once; detect BOM and decode from the single buffer.
|
||||||
|
|
@ -127,27 +153,108 @@ export async function readFileWithEncoding(filePath: string): Promise<string> {
|
||||||
if (full.length === 0) return '';
|
if (full.length === 0) return '';
|
||||||
|
|
||||||
const bom = detectBOM(full);
|
const bom = detectBOM(full);
|
||||||
if (!bom) {
|
if (bom) {
|
||||||
// No BOM → treat as UTF‑8
|
// Strip BOM and decode per encoding
|
||||||
|
const content = full.subarray(bom.bomLength);
|
||||||
|
switch (bom.encoding) {
|
||||||
|
case 'utf8':
|
||||||
|
return content.toString('utf8');
|
||||||
|
case 'utf16le':
|
||||||
|
return content.toString('utf16le');
|
||||||
|
case 'utf16be':
|
||||||
|
return decodeUTF16BE(content);
|
||||||
|
case 'utf32le':
|
||||||
|
return decodeUTF32(content, true);
|
||||||
|
case 'utf32be':
|
||||||
|
return decodeUTF32(content, false);
|
||||||
|
default:
|
||||||
|
// Defensive fallback; should be unreachable
|
||||||
|
return content.toString('utf8');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// No BOM — check if it's valid UTF-8 first (fast path for the common case)
|
||||||
|
if (isValidUtf8(full)) {
|
||||||
return full.toString('utf8');
|
return full.toString('utf8');
|
||||||
}
|
}
|
||||||
|
|
||||||
// Strip BOM and decode per encoding
|
// Not valid UTF-8 — try chardet-based encoding detection
|
||||||
const content = full.subarray(bom.bomLength);
|
const detected = detectEncodingFromBuffer(full);
|
||||||
switch (bom.encoding) {
|
if (detected && !isUtf8Compatible(detected)) {
|
||||||
case 'utf8':
|
try {
|
||||||
return content.toString('utf8');
|
if (iconvEncodingExists(detected)) {
|
||||||
case 'utf16le':
|
return iconvDecode(full, detected);
|
||||||
return content.toString('utf16le');
|
}
|
||||||
case 'utf16be':
|
} catch (e) {
|
||||||
return decodeUTF16BE(content);
|
debugLogger.warn(
|
||||||
case 'utf32le':
|
`Failed to decode file ${filePath} as ${detected}: ${e instanceof Error ? e.message : String(e)}`,
|
||||||
return decodeUTF32(content, true);
|
);
|
||||||
case 'utf32be':
|
}
|
||||||
return decodeUTF32(content, false);
|
}
|
||||||
default:
|
|
||||||
// Defensive fallback; should be unreachable
|
// Final fallback: UTF-8 with replacement characters
|
||||||
return content.toString('utf8');
|
return full.toString('utf8');
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Decide whether a detection sample is well-formed UTF-8.
 *
 * @param sample Bytes read from the start of the file.
 * @param truncated True when the file continues past the end of `sample`.
 * @returns True when the sample contains no malformed UTF-8 sequence.
 */
function isUtf8Sample(sample: Buffer, truncated: boolean): boolean {
  try {
    // A truncated sample may end in the middle of a multi-byte character.
    // In streaming mode the decoder holds back such an incomplete tail instead
    // of rejecting it, while malformed bytes elsewhere still throw.
    new TextDecoder('utf-8', { fatal: true }).decode(sample, {
      stream: truncated,
    });
    return true;
  } catch {
    return false;
  }
}

/**
 * Detect the encoding of a file by reading a sample from its beginning.
 *
 * Detection order: BOM, then strict UTF-8 validation of the sample, then
 * chardet-based detection. Only the first 8KB of the file are inspected.
 *
 * @param filePath Path of the file to inspect.
 * @returns The encoding name (e.g. 'utf-8', 'utf-16le', or whatever name the
 *   chardet-based detector reports). Resolves to 'utf-8' for empty files, when
 *   detection is inconclusive, and when the file cannot be opened or read;
 *   this function never rejects.
 */
export async function detectFileEncoding(filePath: string): Promise<string> {
  let fh: fs.promises.FileHandle | null = null;
  try {
    fh = await fs.promises.open(filePath, 'r');
    const stats = await fh.stat();
    if (stats.size === 0) return 'utf-8';

    // Read a sample (up to 8KB) for detection
    const sampleSize = Math.min(8192, stats.size);
    const buf = Buffer.alloc(sampleSize);
    const { bytesRead } = await fh.read(buf, 0, sampleSize, 0);
    if (bytesRead === 0) return 'utf-8';
    const sample = buf.subarray(0, bytesRead);

    // 1. Check for BOM
    const bom = detectBOM(sample);
    if (bom) {
      switch (bom.encoding) {
        case 'utf8':
          return 'utf-8';
        case 'utf16le':
          return 'utf-16le';
        case 'utf16be':
          return 'utf-16be';
        case 'utf32le':
          return 'utf-32le';
        case 'utf32be':
          return 'utf-32be';
        default:
          return 'utf-8';
      }
    }

    // 2. Validate UTF-8. If the file is larger than the sample, the cut may
    // fall inside a multi-byte character; that must not make an otherwise
    // valid UTF-8 file look like a legacy encoding.
    const sampleIsTruncated = bytesRead < stats.size;
    if (isUtf8Sample(sample, sampleIsTruncated)) return 'utf-8';

    // 3. Use chardet for detection
    const detected = detectEncodingFromBuffer(sample);
    if (detected && !isUtf8Compatible(detected)) {
      return detected;
    }

    return 'utf-8';
  } catch {
    // If file can't be read, default to UTF-8
    return 'utf-8';
  } finally {
    if (fh) {
      try {
        await fh.close();
      } catch {
        // Ignore close errors
      }
    }
  }
}
|
||||||
|
|
||||||
|
|
|
||||||
53
packages/core/src/utils/iconvHelper.ts
Normal file
53
packages/core/src/utils/iconvHelper.ts
Normal file
|
|
@ -0,0 +1,53 @@
|
||||||
|
/**
|
||||||
|
* @license
|
||||||
|
* Copyright 2025 Google LLC
|
||||||
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper module to bridge iconv-lite CJS module with our ESM codebase.
|
||||||
|
* iconv-lite v0.6.x uses ambient `declare module` type declarations
|
||||||
|
* that are incompatible with NodeNext module resolution.
|
||||||
|
* This module provides properly-typed wrappers.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Structural description of the iconv-lite functions this module relies on.
 */
interface IconvLite {
  /** Reports whether the given encoding name is supported. */
  encodingExists(encoding: string): boolean;
  /** Converts encoded bytes into a string. */
  decode(buffer: Buffer, encoding: string): string;
  /** Converts a string into encoded bytes. */
  encode(content: string, encoding: string): Buffer;
}
|
||||||
|
|
||||||
|
// iconv-lite is a CJS module. Under NodeNext resolution, its ambient type
|
||||||
|
// declarations don't map correctly. We import the default export (which is
|
||||||
|
// the CJS module.exports object) and cast it to a proper interface.
|
||||||
|
import iconvModule from 'iconv-lite';
|
||||||
|
const iconvLite: IconvLite = iconvModule as unknown as IconvLite;
|
||||||
|
|
||||||
|
/**
 * Turn encoded bytes into a string by delegating to iconv-lite's `decode`.
 *
 * @param buffer The bytes to decode
 * @param encoding Name of the source encoding (e.g. 'gbk', 'big5', 'shift_jis')
 * @returns The decoded text
 */
export function iconvDecode(buffer: Buffer, encoding: string): string {
  const text = iconvLite.decode(buffer, encoding);
  return text;
}
|
||||||
|
|
||||||
|
/**
 * Turn a string into encoded bytes by delegating to iconv-lite's `encode`.
 *
 * NOTE(review): iconv-lite is believed to substitute a placeholder for
 * characters the target charset cannot represent rather than throwing —
 * confirm before relying on lossless round-trips.
 *
 * @param content The text to encode
 * @param encoding Name of the target encoding (e.g. 'gbk', 'big5', 'shift_jis')
 * @returns The encoded bytes
 */
export function iconvEncode(content: string, encoding: string): Buffer {
  const bytes = iconvLite.encode(content, encoding);
  return bytes;
}
|
||||||
|
|
||||||
|
/**
 * Ask iconv-lite whether it recognizes an encoding name.
 *
 * @param encoding The encoding name to look up
 * @returns True when iconv-lite reports the encoding as supported
 */
export function iconvEncodingExists(encoding: string): boolean {
  const supported = iconvLite.encodingExists(encoding);
  return supported;
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue