fix: preserve original encoding when reading/writing non-UTF-8 files

Fixes #2069

- Add iconv-lite dependency for non-UTF-8 encoding support
- Add iconvHelper.ts as a CJS/ESM compatibility wrapper
- Update readFileWithEncoding() to detect and handle GBK/Big5/Shift_JIS
  using BOM detection -> UTF-8 validation -> chardet -> iconv-lite fallback
- Add detectFileEncoding() to identify file encoding before writes
- Update writeTextFile() to accept encoding option and encode with iconv-lite
- Update WriteFileTool and EditTool to detect and preserve original encoding
- Add tests for GBK read/write, detectFileEncoding, and encoding option
This commit is contained in:
LaZzyMan 2026-03-04 14:52:46 +08:00
parent 407a66c959
commit a5eb1733fa
11 changed files with 375 additions and 32 deletions

1
package-lock.json generated
View file

@ -19471,6 +19471,7 @@
"google-auth-library": "^10.5.0", "google-auth-library": "^10.5.0",
"html-to-text": "^9.0.5", "html-to-text": "^9.0.5",
"https-proxy-agent": "^7.0.6", "https-proxy-agent": "^7.0.6",
"iconv-lite": "^0.6.3",
"ignore": "^7.0.0", "ignore": "^7.0.0",
"jsonrepair": "^3.13.0", "jsonrepair": "^3.13.0",
"marked": "^15.0.12", "marked": "^15.0.12",

View file

@ -57,7 +57,7 @@ export class AcpFileSystemService implements FileSystemService {
async writeTextFile( async writeTextFile(
filePath: string, filePath: string,
content: string, content: string,
options?: { bom?: boolean }, options?: { bom?: boolean; encoding?: string },
): Promise<void> { ): Promise<void> {
if (!this.capabilities.writeTextFile) { if (!this.capabilities.writeTextFile) {
return this.fallback.writeTextFile(filePath, content, options); return this.fallback.writeTextFile(filePath, content, options);

View file

@ -42,6 +42,7 @@
"ajv-formats": "^3.0.0", "ajv-formats": "^3.0.0",
"async-mutex": "^0.5.0", "async-mutex": "^0.5.0",
"chardet": "^2.1.0", "chardet": "^2.1.0",
"iconv-lite": "^0.6.3",
"chokidar": "^4.0.3", "chokidar": "^4.0.3",
"diff": "^7.0.0", "diff": "^7.0.0",
"dotenv": "^17.1.0", "dotenv": "^17.1.0",

View file

@ -10,6 +10,16 @@ import { StandardFileSystemService } from './fileSystemService.js';
vi.mock('fs/promises'); vi.mock('fs/promises');
vi.mock('../utils/fileUtils.js', async (importOriginal) => {
const actual = await importOriginal<typeof import('../utils/fileUtils.js')>();
return {
...actual,
readFileWithEncoding: vi.fn(),
};
});
import { readFileWithEncoding } from '../utils/fileUtils.js';
describe('StandardFileSystemService', () => { describe('StandardFileSystemService', () => {
let fileSystem: StandardFileSystemService; let fileSystem: StandardFileSystemService;
@ -23,19 +33,19 @@ describe('StandardFileSystemService', () => {
}); });
describe('readTextFile', () => { describe('readTextFile', () => {
it('should read file content using fs', async () => { it('should read file content using readFileWithEncoding', async () => {
const testContent = 'Hello, World!'; const testContent = 'Hello, World!';
vi.mocked(fs.readFile).mockResolvedValue(testContent); vi.mocked(readFileWithEncoding).mockResolvedValue(testContent);
const result = await fileSystem.readTextFile('/test/file.txt'); const result = await fileSystem.readTextFile('/test/file.txt');
expect(fs.readFile).toHaveBeenCalledWith('/test/file.txt', 'utf-8'); expect(readFileWithEncoding).toHaveBeenCalledWith('/test/file.txt');
expect(result).toBe(testContent); expect(result).toBe(testContent);
}); });
it('should propagate fs.readFile errors', async () => { it('should propagate readFileWithEncoding errors', async () => {
const error = new Error('ENOENT: File not found'); const error = new Error('ENOENT: File not found');
vi.mocked(fs.readFile).mockRejectedValue(error); vi.mocked(readFileWithEncoding).mockRejectedValue(error);
await expect(fileSystem.readTextFile('/test/file.txt')).rejects.toThrow( await expect(fileSystem.readTextFile('/test/file.txt')).rejects.toThrow(
'ENOENT: File not found', 'ENOENT: File not found',
@ -120,6 +130,32 @@ describe('StandardFileSystemService', () => {
} }
expect(bomCount).toBe(1); expect(bomCount).toBe(1);
}); });
it('should write file with non-UTF-8 encoding using iconv-lite', async () => {
vi.mocked(fs.writeFile).mockResolvedValue();
await fileSystem.writeTextFile('/test/file.txt', '你好世界', {
encoding: 'gbk',
});
// Verify that fs.writeFile was called with a Buffer (iconv-encoded)
const writeCall = vi.mocked(fs.writeFile).mock.calls[0];
expect(writeCall[0]).toBe('/test/file.txt');
expect(writeCall[1]).toBeInstanceOf(Buffer);
});
it('should write file as UTF-8 when encoding is utf-8', async () => {
vi.mocked(fs.writeFile).mockResolvedValue();
await fileSystem.writeTextFile('/test/file.txt', 'Hello', {
encoding: 'utf-8',
});
expect(fs.writeFile).toHaveBeenCalledWith(
'/test/file.txt',
'Hello',
'utf-8',
);
});
}); });
describe('detectFileBOM', () => { describe('detectFileBOM', () => {

View file

@ -7,6 +7,8 @@
import fs from 'node:fs/promises'; import fs from 'node:fs/promises';
import * as path from 'node:path'; import * as path from 'node:path';
import { globSync } from 'glob'; import { globSync } from 'glob';
import { readFileWithEncoding } from '../utils/fileUtils.js';
import { iconvEncode, iconvEncodingExists } from '../utils/iconvHelper.js';
/** /**
* Supported file encodings for new files. * Supported file encodings for new files.
@ -74,6 +76,14 @@ export interface WriteTextFileOptions {
* @default false * @default false
*/ */
bom?: boolean; bom?: boolean;
/**
* The encoding to use when writing the file.
* If specified and not UTF-8 compatible, iconv-lite will be used to encode.
* This is used to preserve the original encoding of non-UTF-8 files (e.g. GBK, Big5).
* @default undefined (writes as UTF-8)
*/
encoding?: string;
} }
/** /**
@ -92,12 +102,22 @@ function hasUTF8BOM(buffer: Buffer): boolean {
); );
} }
/**
 * Tell whether `encoding` names UTF-8 or plain ASCII, i.e. an encoding that
 * can be written without going through iconv-lite.
 *
 * The name is compared case-insensitively with every character other than
 * letters and digits removed, so 'UTF-8', 'utf8' and 'US-ASCII' all match.
 */
function isUtf8CompatibleEncoding(encoding: string): boolean {
  const normalized = encoding.toLowerCase().replace(/[^a-z0-9]/g, '');
  switch (normalized) {
    case 'utf8':
    case 'ascii':
    case 'usascii':
      return true;
    default:
      return false;
  }
}
/** /**
* Standard file system implementation * Standard file system implementation
*/ */
export class StandardFileSystemService implements FileSystemService { export class StandardFileSystemService implements FileSystemService {
async readTextFile(filePath: string): Promise<string> { async readTextFile(filePath: string): Promise<string> {
return fs.readFile(filePath, FileEncoding.UTF8); // Use encoding-aware reader that handles BOM and non-UTF-8 encodings (e.g. GBK)
return readFileWithEncoding(filePath);
} }
async writeTextFile( async writeTextFile(
@ -106,8 +126,19 @@ export class StandardFileSystemService implements FileSystemService {
options?: WriteTextFileOptions, options?: WriteTextFileOptions,
): Promise<void> { ): Promise<void> {
const bom = options?.bom ?? false; const bom = options?.bom ?? false;
const encoding = options?.encoding;
if (bom) { // Check if a non-UTF-8 encoding is specified and supported
const isNonUtf8Encoding =
encoding &&
!isUtf8CompatibleEncoding(encoding) &&
iconvEncodingExists(encoding);
if (isNonUtf8Encoding) {
// Non-UTF-8 encoding (e.g. GBK, Big5, Shift_JIS) — use iconv-lite to encode
const encoded = iconvEncode(content, encoding);
await fs.writeFile(filePath, encoded);
} else if (bom) {
// Prepend UTF-8 BOM (EF BB BF) // Prepend UTF-8 BOM (EF BB BF)
// If content already starts with BOM character, strip it first to avoid double BOM // If content already starts with BOM character, strip it first to avoid double BOM
const normalizedContent = const normalizedContent =

View file

@ -27,7 +27,7 @@ import { ToolNames, ToolDisplayNames } from './tool-names.js';
import { logFileOperation } from '../telemetry/loggers.js'; import { logFileOperation } from '../telemetry/loggers.js';
import { FileOperationEvent } from '../telemetry/types.js'; import { FileOperationEvent } from '../telemetry/types.js';
import { FileOperation } from '../telemetry/metrics.js'; import { FileOperation } from '../telemetry/metrics.js';
import { getSpecificMimeType } from '../utils/fileUtils.js'; import { getSpecificMimeType, detectFileEncoding } from '../utils/fileUtils.js';
import { getLanguageFromFilePath } from '../utils/language-detection.js'; import { getLanguageFromFilePath } from '../utils/language-detection.js';
import type { import type {
ModifiableDeclarativeTool, ModifiableDeclarativeTool,
@ -108,6 +108,10 @@ interface CalculatedEdit {
occurrences: number; occurrences: number;
error?: { display: string; raw: string; type: ToolErrorType }; error?: { display: string; raw: string; type: ToolErrorType };
isNewFile: boolean; isNewFile: boolean;
/** Detected encoding of the existing file (e.g. 'utf-8', 'gbk') */
encoding: string;
/** Whether the existing file has a UTF-8 BOM */
bom: boolean;
} }
class EditToolInvocation implements ToolInvocation<EditToolParams, ToolResult> { class EditToolInvocation implements ToolInvocation<EditToolParams, ToolResult> {
@ -134,6 +138,8 @@ class EditToolInvocation implements ToolInvocation<EditToolParams, ToolResult> {
let finalNewString = params.new_string; let finalNewString = params.new_string;
let finalOldString = params.old_string; let finalOldString = params.old_string;
let occurrences = 0; let occurrences = 0;
let encoding = 'utf-8';
let bom = false;
let error: let error:
| { display: string; raw: string; type: ToolErrorType } | { display: string; raw: string; type: ToolErrorType }
| undefined = undefined; | undefined = undefined;
@ -145,6 +151,11 @@ class EditToolInvocation implements ToolInvocation<EditToolParams, ToolResult> {
// Normalize line endings to LF for consistent processing. // Normalize line endings to LF for consistent processing.
currentContent = currentContent.replace(/\r\n/g, '\n'); currentContent = currentContent.replace(/\r\n/g, '\n');
fileExists = true; fileExists = true;
// Detect encoding and BOM to preserve original file characteristics on write-back
encoding = await detectFileEncoding(params.file_path);
bom = await this.config
.getFileSystemService()
.detectFileBOM(params.file_path);
} catch (err: unknown) { } catch (err: unknown) {
if (!isNodeError(err) || err.code !== 'ENOENT') { if (!isNodeError(err) || err.code !== 'ENOENT') {
// Rethrow unexpected FS errors (permissions, etc.) // Rethrow unexpected FS errors (permissions, etc.)
@ -238,6 +249,8 @@ class EditToolInvocation implements ToolInvocation<EditToolParams, ToolResult> {
occurrences, occurrences,
error, error,
isNewFile, isNewFile,
encoding,
bom,
}; };
} }
@ -373,7 +386,7 @@ class EditToolInvocation implements ToolInvocation<EditToolParams, ToolResult> {
this.ensureParentDirectoriesExist(this.params.file_path); this.ensureParentDirectoriesExist(this.params.file_path);
// For new files, apply default file encoding setting // For new files, apply default file encoding setting
// For existing files, keep original content as-is (including any BOM character) // For existing files, preserve the original encoding (BOM and charset)
if (editData.isNewFile) { if (editData.isNewFile) {
const useBOM = const useBOM =
this.config.getDefaultFileEncoding() === FileEncoding.UTF8_BOM; this.config.getDefaultFileEncoding() === FileEncoding.UTF8_BOM;
@ -385,7 +398,10 @@ class EditToolInvocation implements ToolInvocation<EditToolParams, ToolResult> {
} else { } else {
await this.config await this.config
.getFileSystemService() .getFileSystemService()
.writeTextFile(this.params.file_path, editData.newContent); .writeTextFile(this.params.file_path, editData.newContent, {
bom: editData.bom,
encoding: editData.encoding,
});
} }
const fileName = path.basename(this.params.file_path); const fileName = path.basename(this.params.file_path);

View file

@ -759,6 +759,7 @@ describe('WriteFileTool', () => {
// Verify writeTextFile was called with bom: true // Verify writeTextFile was called with bom: true
expect(writeSpy).toHaveBeenCalledWith(filePath, newContent, { expect(writeSpy).toHaveBeenCalledWith(filePath, newContent, {
bom: true, bom: true,
encoding: 'utf-8',
}); });
// Cleanup // Cleanup
@ -785,6 +786,7 @@ describe('WriteFileTool', () => {
// Verify writeTextFile was called with bom: false // Verify writeTextFile was called with bom: false
expect(writeSpy).toHaveBeenCalledWith(filePath, newContent, { expect(writeSpy).toHaveBeenCalledWith(filePath, newContent, {
bom: false, bom: false,
encoding: 'utf-8',
}); });
// Cleanup // Cleanup

View file

@ -37,7 +37,7 @@ import { IdeClient } from '../ide/ide-client.js';
import { logFileOperation } from '../telemetry/loggers.js'; import { logFileOperation } from '../telemetry/loggers.js';
import { FileOperationEvent } from '../telemetry/types.js'; import { FileOperationEvent } from '../telemetry/types.js';
import { FileOperation } from '../telemetry/metrics.js'; import { FileOperation } from '../telemetry/metrics.js';
import { getSpecificMimeType } from '../utils/fileUtils.js'; import { getSpecificMimeType, detectFileEncoding } from '../utils/fileUtils.js';
import { getLanguageFromFilePath } from '../utils/language-detection.js'; import { getLanguageFromFilePath } from '../utils/language-detection.js';
import { createDebugLogger } from '../utils/debugLogger.js'; import { createDebugLogger } from '../utils/debugLogger.js';
@ -243,17 +243,23 @@ class WriteFileToolInvocation extends BaseToolInvocation<
// Check if file exists and has BOM to preserve encoding // Check if file exists and has BOM to preserve encoding
// For new files, use the configured default encoding // For new files, use the configured default encoding
let useBOM = false; let useBOM = false;
let detectedEncoding: string | undefined;
if (!isNewFile) { if (!isNewFile) {
useBOM = await this.config useBOM = await this.config
.getFileSystemService() .getFileSystemService()
.detectFileBOM(file_path); .detectFileBOM(file_path);
// Detect encoding to preserve non-UTF-8 encodings (e.g. GBK, Big5)
detectedEncoding = await detectFileEncoding(file_path);
} else { } else {
useBOM = this.config.getDefaultFileEncoding() === FileEncoding.UTF8_BOM; useBOM = this.config.getDefaultFileEncoding() === FileEncoding.UTF8_BOM;
} }
await this.config await this.config
.getFileSystemService() .getFileSystemService()
.writeTextFile(file_path, fileContent, { bom: useBOM }); .writeTextFile(file_path, fileContent, {
bom: useBOM,
encoding: detectedEncoding,
});
// Generate diff for display result // Generate diff for display result
const fileName = path.basename(file_path); const fileName = path.basename(file_path);

View file

@ -28,6 +28,7 @@ import {
processSingleFileContent, processSingleFileContent,
detectBOM, detectBOM,
readFileWithEncoding, readFileWithEncoding,
detectFileEncoding,
fileExists, fileExists,
} from './fileUtils.js'; } from './fileUtils.js';
import type { Config } from '../config/config.js'; import type { Config } from '../config/config.js';
@ -407,6 +408,95 @@ describe('fileUtils', () => {
const result = await readFileWithEncoding(filePath); const result = await readFileWithEncoding(filePath);
expect(result).toBe(''); expect(result).toBe('');
}); });
it('should read GBK-encoded file with Chinese characters correctly', async () => {
// GBK encoding of "你好世界这是中文内容用于测试编码检测"
// Needs enough content for chardet to reliably detect the encoding
const gbkBuffer = Buffer.from([
0xc4, 0xe3, 0xba, 0xc3, 0xca, 0xc0, 0xbd, 0xe7, 0xd5, 0xe2, 0xca,
0xc7, 0xd6, 0xd0, 0xce, 0xc4, 0xc4, 0xda, 0xc8, 0xdd, 0xd3, 0xc3,
0xd3, 0xda, 0xb2, 0xe2, 0xca, 0xd4, 0xb1, 0xe0, 0xc2, 0xeb, 0xbc,
0xec, 0xb2, 0xe2,
]);
const filePath = path.join(testDir, 'gbk-chinese.txt');
await fsPromises.writeFile(filePath, gbkBuffer);
const result = await readFileWithEncoding(filePath);
expect(result).toBe('你好世界这是中文内容用于测试编码检测');
});
it('should read GBK-encoded file with mixed ASCII and Chinese correctly', async () => {
// GBK encoding of "// 这是注释内容用于测试\nhello你好世界测试中文编码检测\n函数返回值正确"
// Needs enough Chinese content for chardet to reliably detect as GB18030/GBK
const gbkBuffer = Buffer.from([
0x2f, 0x2f, 0x20, 0xd5, 0xe2, 0xca, 0xc7, 0xd7, 0xa2, 0xca, 0xcd,
0xc4, 0xda, 0xc8, 0xdd, 0xd3, 0xc3, 0xd3, 0xda, 0xb2, 0xe2, 0xca,
0xd4, 0x0a, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0xc4, 0xe3, 0xba, 0xc3,
0xca, 0xc0, 0xbd, 0xe7, 0xb2, 0xe2, 0xca, 0xd4, 0xd6, 0xd0, 0xce,
0xc4, 0xb1, 0xe0, 0xc2, 0xeb, 0xbc, 0xec, 0xb2, 0xe2, 0x0a, 0xba,
0xaf, 0xca, 0xfd, 0xb7, 0xb5, 0xbb, 0xd8, 0xd6, 0xb5, 0xd5, 0xfd,
0xc8, 0xb7,
]);
const filePath = path.join(testDir, 'gbk-mixed.txt');
await fsPromises.writeFile(filePath, gbkBuffer);
const result = await readFileWithEncoding(filePath);
expect(result).toContain('hello');
expect(result).toContain('你好世界');
expect(result).toContain('函数返回值正确');
});
});
describe('detectFileEncoding', () => {
it('should detect UTF-8 for plain ASCII file', async () => {
const filePath = path.join(testDir, 'ascii.txt');
await fsPromises.writeFile(filePath, 'Hello World', 'utf8');
const encoding = await detectFileEncoding(filePath);
expect(encoding).toBe('utf-8');
});
it('should detect UTF-8 for file with UTF-8 BOM', async () => {
const utf8Bom = Buffer.from([0xef, 0xbb, 0xbf]);
const content = Buffer.from('Hello', 'utf8');
const filePath = path.join(testDir, 'utf8-bom-detect.txt');
await fsPromises.writeFile(filePath, Buffer.concat([utf8Bom, content]));
const encoding = await detectFileEncoding(filePath);
expect(encoding).toBe('utf-8');
});
it('should detect GBK encoding for Chinese text in GBK', async () => {
// GBK encoding of "你好世界这是中文内容用于测试编码检测"
// Needs enough content for chardet to reliably detect
const gbkBuffer = Buffer.from([
0xc4, 0xe3, 0xba, 0xc3, 0xca, 0xc0, 0xbd, 0xe7, 0xd5, 0xe2, 0xca,
0xc7, 0xd6, 0xd0, 0xce, 0xc4, 0xc4, 0xda, 0xc8, 0xdd, 0xd3, 0xc3,
0xd3, 0xda, 0xb2, 0xe2, 0xca, 0xd4, 0xb1, 0xe0, 0xc2, 0xeb, 0xbc,
0xec, 0xb2, 0xe2,
]);
const filePath = path.join(testDir, 'gbk-detect.txt');
await fsPromises.writeFile(filePath, gbkBuffer);
const encoding = await detectFileEncoding(filePath);
// chardet detects GBK as 'gb18030' (its superset)
expect(encoding).toBe('gb18030');
});
it('should return utf-8 for empty file', async () => {
const filePath = path.join(testDir, 'empty-detect.txt');
await fsPromises.writeFile(filePath, '');
const encoding = await detectFileEncoding(filePath);
expect(encoding).toBe('utf-8');
});
it('should return utf-8 for non-existent file', async () => {
const filePath = path.join(testDir, 'nonexistent-detect.txt');
const encoding = await detectFileEncoding(filePath);
expect(encoding).toBe('utf-8');
});
}); });
describe('isBinaryFile with BOM awareness', () => { describe('isBinaryFile with BOM awareness', () => {

View file

@ -9,10 +9,12 @@ import fsPromises from 'node:fs/promises';
import path from 'node:path'; import path from 'node:path';
import type { PartUnion } from '@google/genai'; import type { PartUnion } from '@google/genai';
import mime from 'mime/lite'; import mime from 'mime/lite';
import { iconvDecode, iconvEncodingExists } from './iconvHelper.js';
import { ToolErrorType } from '../tools/tool-error.js'; import { ToolErrorType } from '../tools/tool-error.js';
import { BINARY_EXTENSIONS } from './ignorePatterns.js'; import { BINARY_EXTENSIONS } from './ignorePatterns.js';
import type { Config } from '../config/config.js'; import type { Config } from '../config/config.js';
import { createDebugLogger } from './debugLogger.js'; import { createDebugLogger } from './debugLogger.js';
import { detectEncodingFromBuffer } from './systemEncoding.js';
const debugLogger = createDebugLogger('FILE_UTILS'); const debugLogger = createDebugLogger('FILE_UTILS');
@ -117,9 +119,33 @@ function decodeUTF32(buf: Buffer, littleEndian: boolean): string {
return out; return out;
} }
/**
 * Report whether `buffer` holds well-formed UTF-8.
 *
 * The bytes are run through a TextDecoder created with `fatal: true`, which
 * throws on the first malformed sequence instead of substituting U+FFFD.
 */
function isValidUtf8(buffer: Buffer): boolean {
  const strictDecoder = new TextDecoder('utf-8', { fatal: true });
  let wellFormed = true;
  try {
    strictDecoder.decode(buffer);
  } catch {
    wellFormed = false;
  }
  return wellFormed;
}
/**
 * Tell whether `encoding` names UTF-8 or plain ASCII, which Node's Buffer
 * decodes natively.
 *
 * Matching ignores case and any character that is not a letter or digit, so
 * 'UTF-8', 'utf8' and 'US-ASCII' are all recognized.
 */
function isUtf8Compatible(encoding: string): boolean {
  const canonical = encoding.toLowerCase().replace(/[^a-z0-9]/g, '');
  return ['utf8', 'ascii', 'usascii'].includes(canonical);
}
/** /**
* Read a file as text, honoring BOM encodings (UTF8/16/32) and stripping the BOM. * Read a file as text, honoring BOM encodings (UTF8/16/32) and stripping the BOM.
* Falls back to utf8 when no BOM is present. * For files without BOM, validates UTF-8 first. If invalid UTF-8, uses chardet
* to detect encoding (e.g. GBK, Big5, Shift_JIS) and iconv-lite to decode.
* Falls back to utf8 when detection fails.
*/ */
export async function readFileWithEncoding(filePath: string): Promise<string> { export async function readFileWithEncoding(filePath: string): Promise<string> {
// Read the file once; detect BOM and decode from the single buffer. // Read the file once; detect BOM and decode from the single buffer.
@ -127,27 +153,108 @@ export async function readFileWithEncoding(filePath: string): Promise<string> {
if (full.length === 0) return ''; if (full.length === 0) return '';
const bom = detectBOM(full); const bom = detectBOM(full);
if (!bom) { if (bom) {
// No BOM → treat as UTF8 // Strip BOM and decode per encoding
const content = full.subarray(bom.bomLength);
switch (bom.encoding) {
case 'utf8':
return content.toString('utf8');
case 'utf16le':
return content.toString('utf16le');
case 'utf16be':
return decodeUTF16BE(content);
case 'utf32le':
return decodeUTF32(content, true);
case 'utf32be':
return decodeUTF32(content, false);
default:
// Defensive fallback; should be unreachable
return content.toString('utf8');
}
}
// No BOM — check if it's valid UTF-8 first (fast path for the common case)
if (isValidUtf8(full)) {
return full.toString('utf8'); return full.toString('utf8');
} }
// Strip BOM and decode per encoding // Not valid UTF-8 — try chardet-based encoding detection
const content = full.subarray(bom.bomLength); const detected = detectEncodingFromBuffer(full);
switch (bom.encoding) { if (detected && !isUtf8Compatible(detected)) {
case 'utf8': try {
return content.toString('utf8'); if (iconvEncodingExists(detected)) {
case 'utf16le': return iconvDecode(full, detected);
return content.toString('utf16le'); }
case 'utf16be': } catch (e) {
return decodeUTF16BE(content); debugLogger.warn(
case 'utf32le': `Failed to decode file ${filePath} as ${detected}: ${e instanceof Error ? e.message : String(e)}`,
return decodeUTF32(content, true); );
case 'utf32be': }
return decodeUTF32(content, false); }
default:
// Defensive fallback; should be unreachable // Final fallback: UTF-8 with replacement characters
return content.toString('utf8'); return full.toString('utf8');
}
/**
 * Detect the text encoding of a file from a sample of its leading bytes.
 *
 * Detection order: BOM, then strict UTF-8 validation, then statistical
 * detection via `detectEncodingFromBuffer`. At most the first 8 KB are read.
 *
 * @param filePath Path of the file to inspect.
 * @returns An encoding name such as 'utf-8' or 'utf-16le', or the non-UTF-8
 *   name reported by the detector (e.g. 'gb18030' for GBK text). Resolves to
 *   'utf-8' for empty files, when detection is inconclusive, and when the
 *   file cannot be opened or read (errors are swallowed, never thrown).
 */
export async function detectFileEncoding(filePath: string): Promise<string> {
  let fh: fs.promises.FileHandle | null = null;
  try {
    fh = await fs.promises.open(filePath, 'r');
    const stats = await fh.stat();
    if (stats.size === 0) return 'utf-8';

    // Read a sample (up to 8KB) for detection
    const sampleSize = Math.min(8192, stats.size);
    const buf = Buffer.alloc(sampleSize);
    const { bytesRead } = await fh.read(buf, 0, sampleSize, 0);
    if (bytesRead === 0) return 'utf-8';
    const sample = buf.subarray(0, bytesRead);

    // 1. Check for BOM
    const bom = detectBOM(sample);
    if (bom) {
      switch (bom.encoding) {
        case 'utf8':
          return 'utf-8';
        case 'utf16le':
          return 'utf-16le';
        case 'utf16be':
          return 'utf-16be';
        case 'utf32le':
          return 'utf-32le';
        case 'utf32be':
          return 'utf-32be';
        default:
          return 'utf-8';
      }
    }

    // 2. Validate UTF-8
    if (bytesRead < stats.size) {
      // The sample stops before the end of the file, so its last bytes may be
      // the beginning of a multi-byte character. Decoding in streaming mode
      // tolerates such an incomplete trailing sequence while still throwing on
      // genuinely malformed bytes; a plain strict decode would wrongly reject
      // a valid UTF-8 file here.
      try {
        new TextDecoder('utf-8', { fatal: true }).decode(sample, {
          stream: true,
        });
        return 'utf-8';
      } catch {
        // Not UTF-8 — fall through to statistical detection
      }
    } else if (isValidUtf8(sample)) {
      return 'utf-8';
    }

    // 3. Use chardet for detection
    const detected = detectEncodingFromBuffer(sample);
    if (detected && !isUtf8Compatible(detected)) {
      return detected;
    }

    return 'utf-8';
  } catch {
    // If file can't be read, default to UTF-8
    return 'utf-8';
  } finally {
    if (fh) {
      try {
        await fh.close();
      } catch {
        // Ignore close errors
      }
    }
  }
}

View file

@ -0,0 +1,53 @@
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
/**
* Helper module to bridge iconv-lite CJS module with our ESM codebase.
* iconv-lite v0.6.x uses ambient `declare module` type declarations
* that are incompatible with NodeNext module resolution.
* This module provides properly-typed wrappers.
*/
// Local description of the parts of the iconv-lite module that this file
// calls; the imported module object is cast to this shape below.
interface IconvLite {
// Converts encoded bytes into a string.
decode(buffer: Buffer, encoding: string): string;
// Converts a string into bytes in the given encoding.
encode(content: string, encoding: string): Buffer;
// Reports whether the encoding name is supported.
encodingExists(encoding: string): boolean;
}
// iconv-lite is a CJS module. Under NodeNext resolution, its ambient type
// declarations don't map correctly. We import the default export (which is
// the CJS module.exports object) and cast it to a proper interface.
import iconvModule from 'iconv-lite';
const iconvLite: IconvLite = iconvModule as unknown as IconvLite;
/**
 * Decode a buffer using the specified encoding.
 *
 * Delegates directly to iconv-lite's `decode`; no validation of the encoding
 * name is done here.
 * NOTE(review): behavior for an unsupported encoding name is whatever
 * iconv-lite does — presumably it throws; confirm, or check
 * `iconvEncodingExists` first.
 *
 * @param buffer The buffer to decode
 * @param encoding The encoding to use (e.g. 'gbk', 'big5', 'shift_jis')
 * @returns The decoded string
 */
export function iconvDecode(buffer: Buffer, encoding: string): string {
return iconvLite.decode(buffer, encoding);
}
/**
 * Encode a string to a buffer using the specified encoding.
 *
 * Delegates directly to iconv-lite's `encode`; no validation of the encoding
 * name is done here.
 * NOTE(review): handling of characters that the target encoding cannot
 * represent is determined by iconv-lite — confirm before relying on it.
 *
 * @param content The string to encode
 * @param encoding The encoding to use (e.g. 'gbk', 'big5', 'shift_jis')
 * @returns The encoded buffer
 */
export function iconvEncode(content: string, encoding: string): Buffer {
return iconvLite.encode(content, encoding);
}
/**
 * Check if an encoding is supported by iconv-lite.
 *
 * Delegates directly to iconv-lite's `encodingExists`. Intended as a guard
 * before calling `iconvDecode` / `iconvEncode` with a detected encoding name.
 *
 * @param encoding The encoding name to check
 * @returns True if the encoding is supported
 */
export function iconvEncodingExists(encoding: string): boolean {
return iconvLite.encodingExists(encoding);
}