mirror of
https://github.com/QwenLM/qwen-code.git
synced 2026-05-24 22:03:46 +00:00
fix(serve/fs): close 5 review-round-7 findings (#4250)
glm-5.1 round-7 review surfaced 6 items; 5 are real fixes, 1 was already addressed in7f4f30d0b(writeText pre-write symlink guard — reviewer looked atebd9e78a1snapshot ~8h before the round-6 fix). Stack on round-6. 1. edit() bypassed lowFs.readTextFile (workspaceFileSystem.ts) — Critical encoding round-trip corruption. The earlier `fsp.readFile(p, 'utf-8')` included the UTF-8 BOM verbatim in `current`, breaking `oldText` matching even when the user passed the exact source text from a previous read; lost iconv-supported codepage handling (GBK / Big5 / Shift_JIS), so non-UTF-8 files would mojibake into `current` and round-trip-corrupt on write-back; and the subsequent `lowFs.writeTextFile` passed no `_meta`, stripping the BOM and forcing UTF-8 on write-back even when the original was BOM'd or non-UTF-8. Fix: read via `lowFs.readTextFile` AND forward `readResult._meta` into the write-back. Test pins UTF-8 BOM round-trip end-to-end. 2. hasSuspiciousPathPattern multi-digit and POSIX false positive (paths.ts) — `/~\\d/` only matched single-digit; NTFS allocates `~10+` on >9 collisions. POSIX false-positives: editor swap files, backup tools used `~N` legitimately. Fixed to `/~\\d+/` and gated behind `process.platform === 'win32'`, matching the ADS-colon check. 3. canonicalizeWorkspace redundant per-request syscall (paths.ts) — Performance. Factory canonicalizes once at build; every `resolveWithinWorkspace` also ran realpathSync on the same path, blocking the event loop. Added CANONICAL_BOUND_CACHE Map keyed on the input string; steady-state size = 1 per `1 daemon = 1 workspace`. 4. readBytes opts.maxBytes API contract (workspaceFileSystem.ts) — Semantic mismatch. Parameter name promised window-read; impl only used it as a hard reject gate. Now truncates the buffer post-read so `readBytes(p, { maxBytes: 1024 })` on a 200 KB file returns 1 KB. Hard `MAX_READ_BYTES` cap still throws for files above it. 5. glob walks node_modules and .git unnecessarily (workspaceFileSystem.ts) — Performance. Without an `ignore` option, `globAsync` traversed every file under those dirs before our per-hit `shouldIgnore` filter. Now passes `ignore: ['**/node_modules/**', '**/.git/**']` to short-circuit traversal. Post-filter via `shouldIgnore` remains authoritative. 5 new tests: - 8.3 short-name regex Windows / POSIX split - readBytes truncates 2048-byte file to 1024 with maxBytes - readBytes throws file_too_large only above hard cap - edit() preserves UTF-8 BOM round-trip - glob prunes node_modules and .git 442/442 serve tests pass; typecheck + eslint clean. Stack:7b0db4c3a→a81ada43f→efd7a4611→b38e82157→911cb8e5d→546834267→ebd9e78a1→7f4f30d0b→ THIS 🤖 Generated with [Qwen Code](https://github.com/QwenLM/qwen-code)
This commit is contained in:
parent
7f4f30d0b4
commit
1dc9d22902
4 changed files with 202 additions and 16 deletions
|
|
@ -98,10 +98,24 @@ describe('canonicalizeWorkspace', () => {
|
|||
});
|
||||
|
||||
describe('hasSuspiciousPathPattern', () => {
|
||||
it('rejects 8.3 short names regardless of platform', () => {
|
||||
expect(hasSuspiciousPathPattern('GIT~1')).toBe(true);
|
||||
expect(hasSuspiciousPathPattern('CLAUDE~2')).toBe(true);
|
||||
expect(hasSuspiciousPathPattern('SETTIN~1.JSON')).toBe(true);
|
||||
it('rejects 8.3 short names on Windows including multi-digit suffixes; admits POSIX legit ~N filenames', () => {
|
||||
if (process.platform === 'win32') {
|
||||
// Windows: original short names + multi-digit (NTFS allocates
|
||||
// ~1..~4 then hashes; ~10+ are real)
|
||||
expect(hasSuspiciousPathPattern('GIT~1')).toBe(true);
|
||||
expect(hasSuspiciousPathPattern('CLAUDE~2')).toBe(true);
|
||||
expect(hasSuspiciousPathPattern('SETTIN~1.JSON')).toBe(true);
|
||||
expect(hasSuspiciousPathPattern('LONGFI~10.TXT')).toBe(true);
|
||||
expect(hasSuspiciousPathPattern('OTHER~99.dat')).toBe(true);
|
||||
} else {
|
||||
// POSIX: ~N is a legitimate filename char (editor swaps,
|
||||
// backup files, version schemes). Daemon's FS isn't NTFS,
|
||||
// so 8.3 interpretation doesn't apply.
|
||||
expect(hasSuspiciousPathPattern('backup~1.txt')).toBe(false);
|
||||
expect(hasSuspiciousPathPattern('notes~2.md')).toBe(false);
|
||||
expect(hasSuspiciousPathPattern('file~3.swp')).toBe(false);
|
||||
expect(hasSuspiciousPathPattern('LONGFI~10.TXT')).toBe(false);
|
||||
}
|
||||
});
|
||||
|
||||
it('rejects long-path / device prefixes regardless of platform', () => {
|
||||
|
|
@ -253,12 +267,19 @@ describe('resolveWithinWorkspace', () => {
|
|||
});
|
||||
|
||||
it('rejects suspicious patterns before any I/O', async () => {
|
||||
await expect(
|
||||
resolveWithinWorkspace('GIT~1', workspace, 'read'),
|
||||
).rejects.toMatchObject({ kind: 'path_outside_workspace' });
|
||||
// UNC prefixes are platform-agnostic: a daemon should never
|
||||
// accept `//server/share` regardless of OS. The 8.3
|
||||
// short-name `~\d` check is gated on win32 (legitimate POSIX
|
||||
// filenames can have `~N` in them); we exercise the win32
|
||||
// branch only when the runner is Windows.
|
||||
await expect(
|
||||
resolveWithinWorkspace('//server/share', workspace, 'read'),
|
||||
).rejects.toMatchObject({ kind: 'path_outside_workspace' });
|
||||
if (process.platform === 'win32') {
|
||||
await expect(
|
||||
resolveWithinWorkspace('GIT~1', workspace, 'read'),
|
||||
).rejects.toMatchObject({ kind: 'path_outside_workspace' });
|
||||
}
|
||||
});
|
||||
|
||||
it('rejects empty/non-string input with parse_error', async () => {
|
||||
|
|
|
|||
|
|
@ -170,7 +170,24 @@ export function hasSuspiciousPathPattern(p: string): boolean {
|
|||
const colonIndex = p.indexOf(':', 2);
|
||||
if (colonIndex !== -1) return true;
|
||||
}
|
||||
if (/~\d/.test(p)) return true;
|
||||
// NTFS 8.3 short-name suffix: `LONGFILENAME~1.TXT`. Two fixes
|
||||
// over the original `/~\d/`:
|
||||
//
|
||||
// 1. Multi-digit (`~10`, `~99`) — NTFS allocates `~1`..`~4` for
|
||||
// the first 4 collisions, then switches to a hashed scheme
|
||||
// where `~10` and above are real, common short names. The
|
||||
// original regex missed those entirely.
|
||||
// 2. Gate on Windows — on POSIX, `~\d` is a legitimate filename
|
||||
// character used by editor swap files (`file~1.swp`),
|
||||
// backup tools (`notes~2.md`, `backup~1.txt`), and version
|
||||
// schemes. The daemon's actual filesystem on Linux/macOS
|
||||
// isn't NTFS, so 8.3 interpretation doesn't apply and
|
||||
// rejecting these is a false positive that breaks
|
||||
// legitimate workflows. NTFS-on-Linux mounts (`ntfs-3g`) are
|
||||
// rare enough that we accept the residual gap; operators
|
||||
// mounting NTFS into a daemon workspace can disable
|
||||
// suspicious-pattern rejection at the route layer.
|
||||
if (process.platform === 'win32' && /~\d+/.test(p)) return true;
|
||||
if (
|
||||
p.startsWith('\\\\?\\') ||
|
||||
p.startsWith('\\\\.\\') ||
|
||||
|
|
@ -219,6 +236,33 @@ export function hasSuspiciousPathPattern(p: string): boolean {
|
|||
/** Default ancestor-walk depth limit for ENOENT fallback. */
|
||||
const MAX_ANCESTOR_HOPS = 40;
|
||||
|
||||
/**
|
||||
* Module-level memo cache for `boundWorkspace → canonical`
|
||||
* mapping. The factory already canonicalizes once at build time,
|
||||
* so the inner call inside `resolveWithinWorkspace` is paying for
|
||||
* a redundant `realpathSync.native` per request. The cache turns
|
||||
* subsequent calls with the same `boundWorkspace` into an O(1)
|
||||
* Map lookup; first call still pays the syscall (idempotent on
|
||||
* already-canonical input).
|
||||
*
|
||||
* Cache size is bounded only by the number of *distinct*
|
||||
* `boundWorkspace` values a daemon ever sees — `1 daemon = 1
|
||||
* workspace` per #4175, so the steady-state size is exactly 1.
|
||||
* Tests that exercise multiple workspaces add an entry per
|
||||
* scratch dir; entries never have to be evicted because realpath
|
||||
* is functional (a path's canonical form doesn't change unless
|
||||
* the path is moved on disk, in which case the bound workspace
|
||||
* is wrong elsewhere too).
|
||||
*/
|
||||
const CANONICAL_BOUND_CACHE = new Map<string, string>();
|
||||
function canonicalizeBoundWorkspaceCached(boundWorkspace: string): string {
|
||||
const cached = CANONICAL_BOUND_CACHE.get(boundWorkspace);
|
||||
if (cached !== undefined) return cached;
|
||||
const canonical = canonicalizeWorkspace(boundWorkspace);
|
||||
CANONICAL_BOUND_CACHE.set(boundWorkspace, canonical);
|
||||
return canonical;
|
||||
}
|
||||
|
||||
/**
|
||||
* Walk up `absolute` until a component exists on disk. Returns the
|
||||
* existing ancestor and the trailing components that don't exist
|
||||
|
|
@ -337,7 +381,7 @@ export async function resolveWithinWorkspace(
|
|||
);
|
||||
}
|
||||
|
||||
const boundCanonical = canonicalizeWorkspace(boundWorkspace);
|
||||
const boundCanonical = canonicalizeBoundWorkspaceCached(boundWorkspace);
|
||||
const absolute = path.resolve(boundCanonical, input);
|
||||
|
||||
// Cheap pre-filter on the resolved-but-not-realpathed form. Catches
|
||||
|
|
|
|||
|
|
@ -183,11 +183,26 @@ describe('WorkspaceFileSystem - readBytes', () => {
|
|||
expect(Array.from(buf)).toEqual([1, 2, 3, 0, 4, 5]);
|
||||
});
|
||||
|
||||
it('throws file_too_large above the cap', async () => {
|
||||
it('truncates returned buffer to opts.maxBytes (window read, matches API name)', async () => {
|
||||
// Earlier semantics threw `file_too_large` here, but `maxBytes`
|
||||
// in the parameter name promises window-read behavior. Files
|
||||
// above the HARD `MAX_READ_BYTES` cap still throw — see
|
||||
// separate test below — but a caller-supplied tighter cap
|
||||
// truncates rather than rejects.
|
||||
const target = path.join(h.workspace, 'small.bin');
|
||||
await fsp.writeFile(target, Buffer.alloc(2048, 0xab));
|
||||
const r = await h.fs.resolve('small.bin', 'read');
|
||||
const buf = await h.fs.readBytes(r, { maxBytes: 1024 });
|
||||
expect(buf.length).toBe(1024);
|
||||
expect(buf[0]).toBe(0xab);
|
||||
});
|
||||
|
||||
it('throws file_too_large when file size exceeds the hard MAX_READ_BYTES cap regardless of maxBytes', async () => {
|
||||
const policy = await import('./policy.js');
|
||||
const target = path.join(h.workspace, 'huge.bin');
|
||||
await fsp.writeFile(target, Buffer.alloc(2048));
|
||||
await fsp.writeFile(target, Buffer.alloc(policy.MAX_READ_BYTES + 1));
|
||||
const r = await h.fs.resolve('huge.bin', 'read');
|
||||
const err = await h.fs.readBytes(r, { maxBytes: 1024 }).catch((e) => e);
|
||||
const err = await h.fs.readBytes(r).catch((e) => e);
|
||||
expect(isFsError(err)).toBe(true);
|
||||
expect((err as { kind: string }).kind).toBe('file_too_large');
|
||||
});
|
||||
|
|
@ -274,6 +289,38 @@ describe('WorkspaceFileSystem - glob', () => {
|
|||
expect(names).toContain('src.ts');
|
||||
});
|
||||
|
||||
it('prunes node_modules and .git at glob walk time (no traversal cost)', async () => {
|
||||
// The `ignore` option passed to `globAsync` short-circuits
|
||||
// traversal at the directory level — files under
|
||||
// node_modules/.git never reach our per-hit realpath +
|
||||
// shouldIgnore filter. Simulate a workspace with deps and
|
||||
// assert the deeply-nested `node_modules` file is absent
|
||||
// even from a `**/*` glob.
|
||||
await fsp.mkdir(
|
||||
path.join(h.workspace, 'node_modules', 'left-pad', 'dist'),
|
||||
{
|
||||
recursive: true,
|
||||
},
|
||||
);
|
||||
await fsp.writeFile(
|
||||
path.join(h.workspace, 'node_modules', 'left-pad', 'dist', 'index.js'),
|
||||
'module.exports = function leftPad() {};\n',
|
||||
);
|
||||
await fsp.mkdir(path.join(h.workspace, '.git', 'objects'), {
|
||||
recursive: true,
|
||||
});
|
||||
await fsp.writeFile(
|
||||
path.join(h.workspace, '.git', 'objects', 'pack.bin'),
|
||||
'',
|
||||
);
|
||||
await fsp.writeFile(path.join(h.workspace, 'src.ts'), '');
|
||||
const hits = await h.fs.glob('**/*');
|
||||
const names = hits.map((p) => path.relative(h.workspace, p));
|
||||
expect(names.some((n) => n.includes('node_modules'))).toBe(false);
|
||||
expect(names.some((n) => n.includes('.git'))).toBe(false);
|
||||
expect(names).toContain('src.ts');
|
||||
});
|
||||
|
||||
it('respects maxResults', async () => {
|
||||
const hits = await h.fs.glob('**/*', { maxResults: 1 });
|
||||
expect(hits).toHaveLength(1);
|
||||
|
|
@ -374,6 +421,33 @@ describe('WorkspaceFileSystem - write/edit', () => {
|
|||
expect(err.hint).toMatch(/this string is not present/);
|
||||
});
|
||||
|
||||
it('edit() preserves UTF-8 BOM round-trip via lowFs (no \\uFEFF leak into oldText match)', async () => {
|
||||
// The earlier `fsp.readFile(p, 'utf-8')` would include the
|
||||
// BOM (\\uFEFF codepoint) in the returned string, breaking
|
||||
// `oldText` matching even when the user passed the exact
|
||||
// source text; and the write-back without `_meta` would
|
||||
// strip the BOM, silently changing the file's encoding
|
||||
// profile. Using `lowFs.readTextFile` strips the BOM for
|
||||
// matching and `_meta.bom: true` preserves it on write-back.
|
||||
const target = path.join(h.workspace, 'bom.txt');
|
||||
const bom = Buffer.from([0xef, 0xbb, 0xbf]);
|
||||
await fsp.writeFile(
|
||||
target,
|
||||
Buffer.concat([bom, Buffer.from('foo=1\nbar=2\n', 'utf-8')]),
|
||||
);
|
||||
const r = await h.fs.resolve('bom.txt', 'edit');
|
||||
// oldText is `'foo=1'` WITHOUT BOM — must still match.
|
||||
const out = await h.fs.edit(r, 'foo=1', 'foo=42');
|
||||
expect(out.writtenBytes).toBeGreaterThan(0);
|
||||
const after = await fsp.readFile(target);
|
||||
// BOM preserved at start
|
||||
expect(after[0]).toBe(0xef);
|
||||
expect(after[1]).toBe(0xbb);
|
||||
expect(after[2]).toBe(0xbf);
|
||||
// Edit applied
|
||||
expect(after.subarray(3).toString('utf-8')).toBe('foo=42\nbar=2\n');
|
||||
});
|
||||
|
||||
it('rejects edit on binary file', async () => {
|
||||
const bin = path.join(h.workspace, 'bin.dat');
|
||||
const buf = Buffer.alloc(64);
|
||||
|
|
|
|||
|
|
@ -404,8 +404,23 @@ class WorkspaceFileSystemImpl implements WorkspaceFileSystem {
|
|||
try {
|
||||
assertTrustedForIntent(this.deps.trusted, 'read');
|
||||
const st = await fsp.stat(p as string);
|
||||
enforceReadBytesSize(st.size, opts.maxBytes);
|
||||
const buf = await fsp.readFile(p as string);
|
||||
// Hard cap (file size > MAX_READ_BYTES) always throws; this
|
||||
// is the OOM defense that's independent of caller-supplied
|
||||
// `maxBytes`.
|
||||
enforceReadBytesSize(st.size);
|
||||
let buf = await fsp.readFile(p as string);
|
||||
// Soft cap: caller's `opts.maxBytes` is a window cap matching
|
||||
// the parameter name's promise. Earlier semantics treated
|
||||
// it as a "reject if file > maxBytes" gate, which violated
|
||||
// the API contract — a caller asking for `maxBytes: 1024`
|
||||
// on a 200 KB file expected to receive 1 KB, not an
|
||||
// exception. We now truncate post-read so the returned
|
||||
// buffer never exceeds `opts.maxBytes`. The hard
|
||||
// `MAX_READ_BYTES` cap above ensures the underlying
|
||||
// `fsp.readFile` allocation is bounded regardless.
|
||||
if (opts.maxBytes !== undefined && opts.maxBytes < buf.length) {
|
||||
buf = buf.subarray(0, opts.maxBytes);
|
||||
}
|
||||
// Post-read TOCTOU guard — same shape as `readText`.
|
||||
await assertInodeStableAfterRead(p as string, st.ino);
|
||||
this.deps.audit.recordAccess(this.deps.ctx, {
|
||||
|
|
@ -527,11 +542,22 @@ class WorkspaceFileSystemImpl implements WorkspaceFileSystem {
|
|||
{ hint: 'opts.cwd must be a path obtained from fs.resolve()' },
|
||||
);
|
||||
}
|
||||
// Pass an `ignore` option so the glob library prunes
|
||||
// common-and-huge directories at traversal time. Without
|
||||
// this, `glob('**/*')` in a typical workspace walks every
|
||||
// file under `node_modules/` and `.git/` (often hundreds
|
||||
// of thousands of paths) before our per-hit `realpath` +
|
||||
// `lstat` filter drops them. The post-filter via
|
||||
// `shouldIgnore` is still authoritative — this is purely a
|
||||
// walk-time optimization that aligns with the
|
||||
// `loadIgnoreRules` defaults (which already include `.git`
|
||||
// as a default ignore dir).
|
||||
const matches = await globAsync(pattern, {
|
||||
cwd: cwdReal,
|
||||
nodir: false,
|
||||
absolute: true,
|
||||
dot: true,
|
||||
ignore: ['**/node_modules/**', '**/.git/**'],
|
||||
});
|
||||
const out: ResolvedPath[] = [];
|
||||
const max = opts.maxResults ?? Number.POSITIVE_INFINITY;
|
||||
|
|
@ -741,10 +767,24 @@ class WorkspaceFileSystemImpl implements WorkspaceFileSystem {
|
|||
},
|
||||
);
|
||||
}
|
||||
const current = await fsp.readFile(p as string, 'utf-8');
|
||||
// Use `lowFs.readTextFile` (not raw `fsp.readFile(p,
|
||||
// 'utf-8')`) so BOM / encoding / CRLF handling matches what
|
||||
// `readText` does and what `writeTextFile` will preserve on
|
||||
// write-back. A direct utf-8 read on a UTF-8-BOM file would
|
||||
// include the U+FEFF BOM codepoint in `current`,
|
||||
// breaking `oldText` matching even when the user passed
|
||||
// the exact string from a previous read; on iconv-supported
|
||||
// codepages (GBK, Big5, Shift_JIS) it would mojibake the
|
||||
// content and round-trip-corrupt the file on write-back.
|
||||
const readResult = await this.deps.lowFs.readTextFile({
|
||||
path: p as string,
|
||||
limit: Number.POSITIVE_INFINITY,
|
||||
line: 0,
|
||||
});
|
||||
const current = readResult.content;
|
||||
// Post-read TOCTOU guard — catches the swap-during-read
|
||||
// attack where `p` is replaced with a symlink between
|
||||
// `fsp.stat` above and `fsp.readFile` here. The full
|
||||
// `fsp.stat` above and the read here. The full
|
||||
// read-modify-write race window (between this check and
|
||||
// `lowFs.writeTextFile` below) is the deferred PR 20
|
||||
// follow-up that adds atomic-via-temp + `expectedHash`.
|
||||
|
|
@ -777,9 +817,16 @@ class WorkspaceFileSystemImpl implements WorkspaceFileSystem {
|
|||
// atomic-via-temp follow-up; this guard is the
|
||||
// defense-in-depth layer.
|
||||
await assertNotSymlinkBeforeWrite(p as string);
|
||||
// Forward the encoding/BOM/lineEnding metadata captured
|
||||
// during the read so the write-back preserves the file's
|
||||
// original encoding profile. Without this, a UTF-8-BOM
|
||||
// file would be written without BOM, and a non-UTF-8 file
|
||||
// (GBK/Shift_JIS) would be written as UTF-8 — silent
|
||||
// round-trip corruption of any file the daemon edits.
|
||||
await this.deps.lowFs.writeTextFile({
|
||||
path: p as string,
|
||||
content: next,
|
||||
_meta: readResult._meta,
|
||||
});
|
||||
// Symmetric with `readText` / `writeText` — operators
|
||||
// monitoring `fs.access` need to see when an edit landed on
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue