fix(serve/fs): close 5 review-round-7 findings (#4250)

glm-5.1 round-7 review surfaced 6 items; 5 are real fixes, 1 was
already addressed in 7f4f30d0b (writeText pre-write symlink
guard — reviewer looked at ebd9e78a1 snapshot ~8h before the
round-6 fix). Stack on round-6.

1. edit() bypassed lowFs.readTextFile (workspaceFileSystem.ts) —
   Critical encoding round-trip corruption. The earlier
   `fsp.readFile(p, 'utf-8')` included the UTF-8 BOM verbatim in
   `current`, breaking `oldText` matching even when the user
   passed the exact source text from a previous read; lost
   iconv-supported codepage handling (GBK / Big5 / Shift_JIS),
   so non-UTF-8 files would mojibake into `current` and
   round-trip-corrupt on write-back; and the subsequent
   `lowFs.writeTextFile` passed no `_meta`, stripping the BOM
   and forcing UTF-8 on write-back even when the original was
   BOM'd or non-UTF-8. Fix: read via `lowFs.readTextFile` AND
   forward `readResult._meta` into the write-back. Test pins
   UTF-8 BOM round-trip end-to-end.

2. hasSuspiciousPathPattern multi-digit and POSIX false positive
   (paths.ts) — `/~\\d/` only matched single-digit; NTFS
   allocates `~10+` on >9 collisions. POSIX false-positives:
   editor swap files, backup tools used `~N` legitimately. Fixed
   to `/~\\d+/` and gated behind `process.platform === 'win32'`,
   matching the ADS-colon check.

3. canonicalizeWorkspace redundant per-request syscall
   (paths.ts) — Performance. Factory canonicalizes once at
   build; every `resolveWithinWorkspace` also ran realpathSync
   on the same path, blocking the event loop. Added
   CANONICAL_BOUND_CACHE Map keyed on the input string;
   steady-state size = 1 per `1 daemon = 1 workspace`.

4. readBytes opts.maxBytes API contract
   (workspaceFileSystem.ts) — Semantic mismatch. Parameter name
   promised window-read; impl only used it as a hard reject
   gate. Now truncates the buffer post-read so `readBytes(p,
   { maxBytes: 1024 })` on a 200 KB file returns 1 KB. Hard
   `MAX_READ_BYTES` cap still throws for files above it.

5. glob walks node_modules and .git unnecessarily
   (workspaceFileSystem.ts) — Performance. Without an `ignore`
   option, `globAsync` traversed every file under those dirs
   before our per-hit `shouldIgnore` filter. Now passes
   `ignore: ['**/node_modules/**', '**/.git/**']` to
   short-circuit traversal. Post-filter via `shouldIgnore`
   remains authoritative.

5 new tests:
- 8.3 short-name regex Windows / POSIX split
- readBytes truncates 2048-byte file to 1024 with maxBytes
- readBytes throws file_too_large only above hard cap
- edit() preserves UTF-8 BOM round-trip
- glob prunes node_modules and .git

442/442 serve tests pass; typecheck + eslint clean.

Stack: 7b0db4c3aa81ada43fefd7a4611b38e82157911cb8e5d546834267ebd9e78a17f4f30d0b → THIS

🤖 Generated with [Qwen Code](https://github.com/QwenLM/qwen-code)
This commit is contained in:
doudouOUC 2026-05-18 08:59:52 +08:00
parent 7f4f30d0b4
commit 1dc9d22902
4 changed files with 202 additions and 16 deletions

View file

@ -98,10 +98,24 @@ describe('canonicalizeWorkspace', () => {
});
describe('hasSuspiciousPathPattern', () => {
it('rejects 8.3 short names regardless of platform', () => {
expect(hasSuspiciousPathPattern('GIT~1')).toBe(true);
expect(hasSuspiciousPathPattern('CLAUDE~2')).toBe(true);
expect(hasSuspiciousPathPattern('SETTIN~1.JSON')).toBe(true);
it('rejects 8.3 short names on Windows including multi-digit suffixes; admits POSIX legit ~N filenames', () => {
if (process.platform === 'win32') {
// Windows: original short names + multi-digit (NTFS allocates
// ~1..~4 then hashes; ~10+ are real)
expect(hasSuspiciousPathPattern('GIT~1')).toBe(true);
expect(hasSuspiciousPathPattern('CLAUDE~2')).toBe(true);
expect(hasSuspiciousPathPattern('SETTIN~1.JSON')).toBe(true);
expect(hasSuspiciousPathPattern('LONGFI~10.TXT')).toBe(true);
expect(hasSuspiciousPathPattern('OTHER~99.dat')).toBe(true);
} else {
// POSIX: ~N is a legitimate filename char (editor swaps,
// backup files, version schemes). Daemon's FS isn't NTFS,
// so 8.3 interpretation doesn't apply.
expect(hasSuspiciousPathPattern('backup~1.txt')).toBe(false);
expect(hasSuspiciousPathPattern('notes~2.md')).toBe(false);
expect(hasSuspiciousPathPattern('file~3.swp')).toBe(false);
expect(hasSuspiciousPathPattern('LONGFI~10.TXT')).toBe(false);
}
});
it('rejects long-path / device prefixes regardless of platform', () => {
@ -253,12 +267,19 @@ describe('resolveWithinWorkspace', () => {
});
it('rejects suspicious patterns before any I/O', async () => {
await expect(
resolveWithinWorkspace('GIT~1', workspace, 'read'),
).rejects.toMatchObject({ kind: 'path_outside_workspace' });
// UNC prefixes are platform-agnostic: a daemon should never
// accept `//server/share` regardless of OS. The 8.3
// short-name `~\d` check is gated on win32 (legitimate POSIX
// filenames can have `~N` in them); we exercise the win32
// branch only when the runner is Windows.
await expect(
resolveWithinWorkspace('//server/share', workspace, 'read'),
).rejects.toMatchObject({ kind: 'path_outside_workspace' });
if (process.platform === 'win32') {
await expect(
resolveWithinWorkspace('GIT~1', workspace, 'read'),
).rejects.toMatchObject({ kind: 'path_outside_workspace' });
}
});
it('rejects empty/non-string input with parse_error', async () => {

View file

@ -170,7 +170,24 @@ export function hasSuspiciousPathPattern(p: string): boolean {
const colonIndex = p.indexOf(':', 2);
if (colonIndex !== -1) return true;
}
if (/~\d/.test(p)) return true;
// NTFS 8.3 short-name suffix: `LONGFILENAME~1.TXT`. Two fixes
// over the original `/~\d/`:
//
// 1. Multi-digit (`~10`, `~99`) — NTFS allocates `~1`..`~4` for
// the first 4 collisions, then switches to a hashed scheme
// where `~10` and above are real, common short names. The
// original regex missed those entirely.
// 2. Gate on Windows — on POSIX, `~\d` is a legitimate filename
// character used by editor swap files (`file~1.swp`),
// backup tools (`notes~2.md`, `backup~1.txt`), and version
// schemes. The daemon's actual filesystem on Linux/macOS
// isn't NTFS, so 8.3 interpretation doesn't apply and
// rejecting these is a false positive that breaks
// legitimate workflows. NTFS-on-Linux mounts (`ntfs-3g`) are
// rare enough that we accept the residual gap; operators
// mounting NTFS into a daemon workspace can disable
// suspicious-pattern rejection at the route layer.
if (process.platform === 'win32' && /~\d+/.test(p)) return true;
if (
p.startsWith('\\\\?\\') ||
p.startsWith('\\\\.\\') ||
@ -219,6 +236,33 @@ export function hasSuspiciousPathPattern(p: string): boolean {
/** Default ancestor-walk depth limit for ENOENT fallback. */
const MAX_ANCESTOR_HOPS = 40;
/**
* Module-level memo cache for `boundWorkspace → canonical`
* mapping. The factory already canonicalizes once at build time,
* so the inner call inside `resolveWithinWorkspace` is paying for
* a redundant `realpathSync.native` per request. The cache turns
* subsequent calls with the same `boundWorkspace` into an O(1)
* Map lookup; first call still pays the syscall (idempotent on
* already-canonical input).
*
* Cache size is bounded only by the number of *distinct*
* `boundWorkspace` values a daemon ever sees `1 daemon = 1
* workspace` per #4175, so the steady-state size is exactly 1.
* Tests that exercise multiple workspaces add an entry per
* scratch dir; entries never have to be evicted because realpath
* is functional (a path's canonical form doesn't change unless
* the path is moved on disk, in which case the bound workspace
* is wrong elsewhere too).
*/
const CANONICAL_BOUND_CACHE = new Map<string, string>();
function canonicalizeBoundWorkspaceCached(boundWorkspace: string): string {
const cached = CANONICAL_BOUND_CACHE.get(boundWorkspace);
if (cached !== undefined) return cached;
const canonical = canonicalizeWorkspace(boundWorkspace);
CANONICAL_BOUND_CACHE.set(boundWorkspace, canonical);
return canonical;
}
/**
* Walk up `absolute` until a component exists on disk. Returns the
* existing ancestor and the trailing components that don't exist
@ -337,7 +381,7 @@ export async function resolveWithinWorkspace(
);
}
const boundCanonical = canonicalizeWorkspace(boundWorkspace);
const boundCanonical = canonicalizeBoundWorkspaceCached(boundWorkspace);
const absolute = path.resolve(boundCanonical, input);
// Cheap pre-filter on the resolved-but-not-realpathed form. Catches

View file

@ -183,11 +183,26 @@ describe('WorkspaceFileSystem - readBytes', () => {
expect(Array.from(buf)).toEqual([1, 2, 3, 0, 4, 5]);
});
it('throws file_too_large above the cap', async () => {
it('truncates returned buffer to opts.maxBytes (window read, matches API name)', async () => {
// Earlier semantics threw `file_too_large` here, but `maxBytes`
// in the parameter name promises window-read behavior. Files
// above the HARD `MAX_READ_BYTES` cap still throw — see
// separate test below — but a caller-supplied tighter cap
// truncates rather than rejects.
const target = path.join(h.workspace, 'small.bin');
await fsp.writeFile(target, Buffer.alloc(2048, 0xab));
const r = await h.fs.resolve('small.bin', 'read');
const buf = await h.fs.readBytes(r, { maxBytes: 1024 });
expect(buf.length).toBe(1024);
expect(buf[0]).toBe(0xab);
});
it('throws file_too_large when file size exceeds the hard MAX_READ_BYTES cap regardless of maxBytes', async () => {
const policy = await import('./policy.js');
const target = path.join(h.workspace, 'huge.bin');
await fsp.writeFile(target, Buffer.alloc(2048));
await fsp.writeFile(target, Buffer.alloc(policy.MAX_READ_BYTES + 1));
const r = await h.fs.resolve('huge.bin', 'read');
const err = await h.fs.readBytes(r, { maxBytes: 1024 }).catch((e) => e);
const err = await h.fs.readBytes(r).catch((e) => e);
expect(isFsError(err)).toBe(true);
expect((err as { kind: string }).kind).toBe('file_too_large');
});
@ -274,6 +289,38 @@ describe('WorkspaceFileSystem - glob', () => {
expect(names).toContain('src.ts');
});
it('prunes node_modules and .git at glob walk time (no traversal cost)', async () => {
// The `ignore` option passed to `globAsync` short-circuits
// traversal at the directory level — files under
// node_modules/.git never reach our per-hit realpath +
// shouldIgnore filter. Simulate a workspace with deps and
// assert the deeply-nested `node_modules` file is absent
// even from a `**/*` glob.
await fsp.mkdir(
path.join(h.workspace, 'node_modules', 'left-pad', 'dist'),
{
recursive: true,
},
);
await fsp.writeFile(
path.join(h.workspace, 'node_modules', 'left-pad', 'dist', 'index.js'),
'module.exports = function leftPad() {};\n',
);
await fsp.mkdir(path.join(h.workspace, '.git', 'objects'), {
recursive: true,
});
await fsp.writeFile(
path.join(h.workspace, '.git', 'objects', 'pack.bin'),
'',
);
await fsp.writeFile(path.join(h.workspace, 'src.ts'), '');
const hits = await h.fs.glob('**/*');
const names = hits.map((p) => path.relative(h.workspace, p));
expect(names.some((n) => n.includes('node_modules'))).toBe(false);
expect(names.some((n) => n.includes('.git'))).toBe(false);
expect(names).toContain('src.ts');
});
it('respects maxResults', async () => {
const hits = await h.fs.glob('**/*', { maxResults: 1 });
expect(hits).toHaveLength(1);
@ -374,6 +421,33 @@ describe('WorkspaceFileSystem - write/edit', () => {
expect(err.hint).toMatch(/this string is not present/);
});
it('edit() preserves UTF-8 BOM round-trip via lowFs (no \\uFEFF leak into oldText match)', async () => {
// The earlier `fsp.readFile(p, 'utf-8')` would include the
// BOM (\\uFEFF codepoint) in the returned string, breaking
// `oldText` matching even when the user passed the exact
// source text; and the write-back without `_meta` would
// strip the BOM, silently changing the file's encoding
// profile. Using `lowFs.readTextFile` strips the BOM for
// matching and `_meta.bom: true` preserves it on write-back.
const target = path.join(h.workspace, 'bom.txt');
const bom = Buffer.from([0xef, 0xbb, 0xbf]);
await fsp.writeFile(
target,
Buffer.concat([bom, Buffer.from('foo=1\nbar=2\n', 'utf-8')]),
);
const r = await h.fs.resolve('bom.txt', 'edit');
// oldText is `'foo=1'` WITHOUT BOM — must still match.
const out = await h.fs.edit(r, 'foo=1', 'foo=42');
expect(out.writtenBytes).toBeGreaterThan(0);
const after = await fsp.readFile(target);
// BOM preserved at start
expect(after[0]).toBe(0xef);
expect(after[1]).toBe(0xbb);
expect(after[2]).toBe(0xbf);
// Edit applied
expect(after.subarray(3).toString('utf-8')).toBe('foo=42\nbar=2\n');
});
it('rejects edit on binary file', async () => {
const bin = path.join(h.workspace, 'bin.dat');
const buf = Buffer.alloc(64);

View file

@ -404,8 +404,23 @@ class WorkspaceFileSystemImpl implements WorkspaceFileSystem {
try {
assertTrustedForIntent(this.deps.trusted, 'read');
const st = await fsp.stat(p as string);
enforceReadBytesSize(st.size, opts.maxBytes);
const buf = await fsp.readFile(p as string);
// Hard cap (file size > MAX_READ_BYTES) always throws; this
// is the OOM defense that's independent of caller-supplied
// `maxBytes`.
enforceReadBytesSize(st.size);
let buf = await fsp.readFile(p as string);
// Soft cap: caller's `opts.maxBytes` is a window cap matching
// the parameter name's promise. Earlier semantics treated
// it as a "reject if file > maxBytes" gate, which violated
// the API contract — a caller asking for `maxBytes: 1024`
// on a 200 KB file expected to receive 1 KB, not an
// exception. We now truncate post-read so the returned
// buffer never exceeds `opts.maxBytes`. The hard
// `MAX_READ_BYTES` cap above ensures the underlying
// `fsp.readFile` allocation is bounded regardless.
if (opts.maxBytes !== undefined && opts.maxBytes < buf.length) {
buf = buf.subarray(0, opts.maxBytes);
}
// Post-read TOCTOU guard — same shape as `readText`.
await assertInodeStableAfterRead(p as string, st.ino);
this.deps.audit.recordAccess(this.deps.ctx, {
@ -527,11 +542,22 @@ class WorkspaceFileSystemImpl implements WorkspaceFileSystem {
{ hint: 'opts.cwd must be a path obtained from fs.resolve()' },
);
}
// Pass an `ignore` option so the glob library prunes
// common-and-huge directories at traversal time. Without
// this, `glob('**/*')` in a typical workspace walks every
// file under `node_modules/` and `.git/` (often hundreds
// of thousands of paths) before our per-hit `realpath` +
// `lstat` filter drops them. The post-filter via
// `shouldIgnore` is still authoritative — this is purely a
// walk-time optimization that aligns with the
// `loadIgnoreRules` defaults (which already include `.git`
// as a default ignore dir).
const matches = await globAsync(pattern, {
cwd: cwdReal,
nodir: false,
absolute: true,
dot: true,
ignore: ['**/node_modules/**', '**/.git/**'],
});
const out: ResolvedPath[] = [];
const max = opts.maxResults ?? Number.POSITIVE_INFINITY;
@ -741,10 +767,24 @@ class WorkspaceFileSystemImpl implements WorkspaceFileSystem {
},
);
}
const current = await fsp.readFile(p as string, 'utf-8');
// Use `lowFs.readTextFile` (not raw `fsp.readFile(p,
// 'utf-8')`) so BOM / encoding / CRLF handling matches what
// `readText` does and what `writeTextFile` will preserve on
// write-back. A direct utf-8 read on a UTF-8-BOM file would
// include the U+FEFF BOM codepoint in `current`,
// breaking `oldText` matching even when the user passed
// the exact string from a previous read; on iconv-supported
// codepages (GBK, Big5, Shift_JIS) it would mojibake the
// content and round-trip-corrupt the file on write-back.
const readResult = await this.deps.lowFs.readTextFile({
path: p as string,
limit: Number.POSITIVE_INFINITY,
line: 0,
});
const current = readResult.content;
// Post-read TOCTOU guard — catches the swap-during-read
// attack where `p` is replaced with a symlink between
// `fsp.stat` above and `fsp.readFile` here. The full
// `fsp.stat` above and the read here. The full
// read-modify-write race window (between this check and
// `lowFs.writeTextFile` below) is the deferred PR 20
// follow-up that adds atomic-via-temp + `expectedHash`.
@ -777,9 +817,16 @@ class WorkspaceFileSystemImpl implements WorkspaceFileSystem {
// atomic-via-temp follow-up; this guard is the
// defense-in-depth layer.
await assertNotSymlinkBeforeWrite(p as string);
// Forward the encoding/BOM/lineEnding metadata captured
// during the read so the write-back preserves the file's
// original encoding profile. Without this, a UTF-8-BOM
// file would be written without BOM, and a non-UTF-8 file
// (GBK/Shift_JIS) would be written as UTF-8 — silent
// round-trip corruption of any file the daemon edits.
await this.deps.lowFs.writeTextFile({
path: p as string,
content: next,
_meta: readResult._meta,
});
// Symmetric with `readText` / `writeText` — operators
// monitoring `fs.access` need to see when an edit landed on