diff --git a/packages/cli/src/serve/httpAcpBridge.test.ts b/packages/cli/src/serve/httpAcpBridge.test.ts index 48b72a0d9..e08fe4bba 100644 --- a/packages/cli/src/serve/httpAcpBridge.test.ts +++ b/packages/cli/src/serve/httpAcpBridge.test.ts @@ -291,13 +291,11 @@ describe('createHttpAcpBridge', () => { expect(a.sessionId).toBe(SESS_A); // Cross-workspace POST throws before touching the channel. - await expect( - bridge.spawnOrAttach({ workspaceCwd: WS_B }), - ).rejects.toBeInstanceOf(WorkspaceMismatchError); - // The error carries both paths for the route's 400 body. + // Single `.catch` capture — assert instance + carried fields off + // the same caught value rather than firing the rejection twice. const err = await bridge .spawnOrAttach({ workspaceCwd: WS_B }) - .catch((e: WorkspaceMismatchError) => e); + .catch((e: unknown) => e); expect(err).toBeInstanceOf(WorkspaceMismatchError); expect((err as WorkspaceMismatchError).bound).toBe(WS_A); expect((err as WorkspaceMismatchError).requested).toBe(WS_B); diff --git a/packages/cli/src/serve/httpAcpBridge.ts b/packages/cli/src/serve/httpAcpBridge.ts index 265182edd..c35e0802a 100644 --- a/packages/cli/src/serve/httpAcpBridge.ts +++ b/packages/cli/src/serve/httpAcpBridge.ts @@ -1151,21 +1151,17 @@ export function createHttpAcpBridge(opts: BridgeOptions): HttpAcpBridge { ); const connection = new ClientSideConnection(() => client, channel.stream); - // tanzhenxin cold-spawn-window finding: the agent child exists - // from the moment `channelFactory(boundWorkspace)` returns, but - // pre-fix `aliveChannels.add(info)` ran only AFTER the - // `initialize` handshake completed (up to `initTimeoutMs`, default - // 10s). A double-Ctrl+C in that handshake window played out as: - // first SIGINT entered `shutdown()` and awaited the in-flight - // spawn; second SIGINT called `killAllSync()` against an empty - // `aliveChannels` (the channel hadn't been added yet) and - // `process.exit(1)` orphaned the child. Add to `aliveChannels` - // BEFORE the handshake await, and register the `channel.exited` - // handler immediately afterward so init-failure / child-crash / - // late-shutdown all clean up through the standard exit path - // instead of needing bespoke catches. `channelInfo` (the attach - // target) stays set only AFTER initialize succeeds — see further - // down — so callers don't attach to a still-handshaking channel. + // Add to `aliveChannels` + register the `channel.exited` handler + // BEFORE the `initialize` handshake (tanzhenxin cold-spawn-window + // finding): the agent child exists from the moment + // `channelFactory(boundWorkspace)` returns, so a `killAllSync()` + // during the handshake window (up to `initTimeoutMs`, default + // 10s) must find it to avoid orphaning on `process.exit(1)`. + // Init-failure / child-crash / late-shutdown all converge on + // the same cleanup path via the handler below. + // `channelInfo` (the attach target) is assigned only AFTER + // initialize succeeds so callers don't attach to a still- + // handshaking channel. const info: ChannelInfo = { channel, connection, @@ -1174,6 +1170,24 @@ export function createHttpAcpBridge(opts: BridgeOptions): HttpAcpBridge { isDying: false, }; aliveChannels.add(info); + // Belt-and-suspenders leak detection. The set is intentionally + // multi-entry to cover the `killSession`-then-`spawnOrAttach` + // overlap window (size 2 is legitimate: one dying + one fresh + // attach-target). Anything higher implies a `channel.exited` + // handler never fired for some prior channel — a real leak we'd + // otherwise notice only as gradually-growing RSS over hours. + // The warning surfaces it the moment it happens. Threshold is + // 2 because that's the design ceiling; bumping it requires + // updating both this guard and the comments around + // `aliveChannels` declaration. + if (aliveChannels.size > 2) { + writeStderrLine( + `qwen serve: WARNING aliveChannels.size=${aliveChannels.size} ` + + `(expected 1, max 2 during killSession-then-spawnOrAttach ` + + `overlap) — possible channel leak; check that prior channels' ` + + `channel.exited fired and the handler ran cleanup.`, + ); + } // One-time channel.exited cleanup. The child dying takes ALL // multiplexed sessions with it — iterate `sessionIds` (snapshot @@ -1203,19 +1217,27 @@ export function createHttpAcpBridge(opts: BridgeOptions): HttpAcpBridge { if (channelInfo === info) channelInfo = undefined; const sessions = Array.from(info.sessionIds); info.sessionIds.clear(); - // Operator breadcrumb on the daemon side. Without it, an - // agent crash (OOM / segfault) is invisible from the daemon - // log: each affected SSE subscriber sees a `session_died` - // frame and disconnects, the daemon's child-stderr forwarder - // emits whatever the child wrote before dying (often - // nothing on a SIGKILL / segfault), and operators can't tell - // from `qwen serve`'s own output that the agent process is - // gone. Log the OS-level signal/exitCode + how many sessions - // were affected so the line is the canonical "agent - // disappeared" indicator. - writeStderrLine( - `qwen serve: channel exited (code=${exitInfo?.exitCode ?? 'none'}, signal=${exitInfo?.signalCode ?? 'none'}, ${sessions.length} session(s) torn down)`, - ); + // Operator breadcrumb for UNEXPECTED channel exits. Without + // this an agent crash (OOM / segfault) is invisible from the + // daemon log: each affected SSE subscriber sees a + // `session_died` frame and disconnects, the daemon's + // child-stderr forwarder emits whatever the child wrote before + // dying (often nothing on a SIGKILL / segfault), and operators + // can't tell from `qwen serve`'s own output that the agent + // process is gone. + // + // Suppressed during `shuttingDown` because the operator + // already saw "received SIGINT, draining..." from + // `runQwenServe`'s signal handler. The standalone + // killSession case (last session leaves, channel torn down + // but daemon stays up) still logs — there's no upstream + // context line in that flow, and the message confirms the + // cleanup actually ran. + if (!shuttingDown) { + writeStderrLine( + `qwen serve: channel exited (code=${exitInfo?.exitCode ?? 'none'}, signal=${exitInfo?.signalCode ?? 'none'}, ${sessions.length} session(s) torn down)`, + ); + } for (const sid of sessions) { const sessEntry = byId.get(sid); if (!sessEntry) continue; @@ -1258,11 +1280,17 @@ export function createHttpAcpBridge(opts: BridgeOptions): HttpAcpBridge { 'initialize', ); } catch (err) { - // Mark dying so concurrent `ensureChannel` callers spawn - // fresh instead of awaiting this in-flight promise's - // `channelInfo` (which we never assigned). The kill below - // triggers `channel.exited` → handler runs `aliveChannels - // .delete(info)` → entry leaves the alive set after OS reap. + // Mark the half-initialized channel as dying/unavailable, then + // kill it. Coalesced callers (`inFlightChannelSpawn` branch in + // `ensureChannel`) observe the same rejection on this promise + // and propagate it to their callers; the `inFlightSpawns` + // tracker is cleared in `spawnOrAttach`'s finally so a follow- + // up call retries cleanly. The `channel.exited` handler + // registered earlier removes `info` from `aliveChannels` once + // the OS reaps the child. `isDying` here is the cross-path + // invariant marker (matches `killSession` / `doSpawn`- + // newSession-failure / `shutdown`): "any channel in + // `aliveChannels` with `isDying === true` is mid-teardown." info.isDying = true; await channel.kill().catch(() => {}); throw err; @@ -2091,11 +2119,10 @@ export function createHttpAcpBridge(opts: BridgeOptions): HttpAcpBridge { // tanzhenxin BkUyD: iterate `aliveChannels` (the OS-level "still // alive" source of truth) — `channelInfo` only points at the // CURRENT attach target, missing any dying channel whose - // `channel.exited` hasn't fired yet. Pre-fix this iterated - // `channelInfo` alone; if a fresh spawn had already overwritten - // it during the prior channel's SIGTERM grace, the dying child - // received SIGTERM but no SIGKILL escalation when `process.exit(1)` - // fires and was orphaned. + // `channel.exited` hasn't fired yet. Without this, a fresh + // spawn overwriting `channelInfo` during the prior channel's + // SIGTERM grace would leave the dying child without SIGKILL + // escalation when `process.exit(1)` fires. shuttingDown = true; const channels = Array.from(aliveChannels); defaultEntry = undefined; diff --git a/packages/cli/src/serve/runQwenServe.ts b/packages/cli/src/serve/runQwenServe.ts index f09ec5e84..250fb28d9 100644 --- a/packages/cli/src/serve/runQwenServe.ts +++ b/packages/cli/src/serve/runQwenServe.ts @@ -172,18 +172,16 @@ export async function runQwenServe( }); let actualPort = opts.port; // Pass the already-canonical `boundWorkspace` into `createServeApp` - // via `opts.workspace`. `canonicalizeWorkspace` is idempotent so - // server.ts's own canonicalize is a no-op, but if some future - // refactor ever makes it non-idempotent (caching, async I/O, - // etc.) the value the route advertises on `/capabilities` and the - // value the bridge enforces would diverge — landing clients in a - // "/capabilities says X, POST /session/X 400s with workspace_mismatch" - // contradiction. Pre-canonicalizing here removes that drift risk. - const app = createServeApp( - { ...opts, workspace: boundWorkspace }, - () => actualPort, - { bridge }, - ); + // via `deps.boundWorkspace`. That field is the pre-canonicalized + // fast-path: createServeApp skips its own `canonicalizeWorkspace` + // call (which would issue a redundant `realpathSync.native` + // syscall — idempotent but unnecessary I/O at boot). Direct + // callers of createServeApp (tests / embeds) omit it and the + // server canonicalizes itself. + const app = createServeApp(opts, () => actualPort, { + bridge, + boundWorkspace, + }); // Node's `app.listen()` wants the unbracketed IPv6 literal (`::1`) but // operators conventionally type `[::1]` (or copy/paste from URLs that diff --git a/packages/cli/src/serve/server.ts b/packages/cli/src/serve/server.ts index 1cc926bc0..9777c8fd4 100644 --- a/packages/cli/src/serve/server.ts +++ b/packages/cli/src/serve/server.ts @@ -30,6 +30,19 @@ import { export interface ServeAppDeps { /** Bridge instance; tests inject a fake. Defaults to a fresh real one. */ bridge?: HttpAcpBridge; + /** + * Pre-canonicalized workspace path. When supplied, `createServeApp` + * skips its own `canonicalizeWorkspace` call (which would issue a + * redundant `realpathSync.native` syscall — idempotent, but a hot + * boot-time stat we can avoid). `runQwenServe` passes this after + * its own boot-time canonicalize so the value used by + * `/capabilities`, the `POST /session` cwd fallback, and the + * bridge are all the SAME canonical form. Callers that haven't + * canonicalized yet (tests, direct embeds) omit this and + * `createServeApp` falls back to canonicalizing `opts.workspace ?? + * process.cwd()` itself. + */ + boundWorkspace?: string; } /** @@ -85,17 +98,22 @@ export function createServeApp( // `process.cwd()`). `POST /session` with a mismatched cwd is // rejected with 400 `workspace_mismatch`. // - // Canonicalize here so the value advertised on `/capabilities`, - // used for the `POST /session` cwd fallback, AND passed into the - // bridge are all the SAME canonical form. Without this step the - // bridge re-canonicalizes via `realpathSync.native` and on symlinks - // / case-insensitive filesystems the bridge's stored form diverges - // from the value `/capabilities` advertises — clients echoing the - // advertised path back end up with a response whose `workspaceCwd` - // differs from what they sent. `canonicalizeWorkspace` is - // idempotent so `runQwenServe`'s pre-canonicalization here is a - // no-op when the caller already did the work. - const boundWorkspace = canonicalizeWorkspace(opts.workspace ?? process.cwd()); + // The value advertised on `/capabilities`, used for the `POST + // /session` cwd fallback, AND passed into the bridge must be the + // SAME canonical form — otherwise the bridge's + // `realpathSync.native` would diverge from what `/capabilities` + // shows on symlinks / case-insensitive filesystems, and clients + // echoing the advertised path back would see a response whose + // `workspaceCwd` differs from what they sent. + // + // `deps.boundWorkspace` is the pre-canonicalized fast-path — + // `runQwenServe` passes it after its own boot-time + // `canonicalizeWorkspace`, so we skip the redundant + // `realpathSync.native` here. When omitted (tests, direct embeds) + // we canonicalize ourselves. + const boundWorkspace = + deps.boundWorkspace ?? + canonicalizeWorkspace(opts.workspace ?? process.cwd()); const bridge = deps.bridge ?? createHttpAcpBridge({ @@ -903,9 +921,23 @@ function sendBridgeError( // every client request silently 400s. Limited to authenticated // requests by the upstream bearer-token gate, so probing-DoS // log noise stays bounded. + // SECURITY: `err.requested` is derived from the request body + // (`req.workspaceCwd` → `canonicalizeWorkspace` → here). `path.resolve` + // + `realpathSync.native` both preserve control characters inside + // path segments — they only normalize separators / `..` / `.` and + // walk symlinks. A body like `{"cwd": "/legit/path\nqwen serve: + // FAKE LOG LINE"}` would otherwise emit two valid-looking daemon + // log lines, weaponizing line-based log shippers (Splunk / Loki / + // journald → SIEM). `JSON.stringify` escapes control chars and + // wraps in quotes so any injection attempt surfaces as + // visible-as-quoted-noise rather than forged-line. `err.bound` is + // safe (canonicalized at boot from operator-controlled + // `--workspace` / `process.cwd()`) but quoted symmetrically for + // readability. writeStderrLine( `qwen serve: workspace_mismatch (POST /session): ` + - `daemon bound to "${err.bound}", rejected "${err.requested}"`, + `daemon bound to ${JSON.stringify(err.bound)}, ` + + `rejected ${JSON.stringify(err.requested)}`, ); res.status(400).json({ error: err.message,