From a3138cf5d5062ce58640c893d9e8e27d06c0790d Mon Sep 17 00:00:00 2001 From: LaZzyMan Date: Mon, 18 May 2026 11:23:40 +0800 Subject: [PATCH] fix(cli,core,docs): address 4 non-blocker findings from PR #4151 review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Top-level review on c5cf60ee8 declared "可以合并" (good to merge) but flagged 5 non-blocker items. Four are mechanical / low-cost; the fifth (thresholds → config) is intentionally deferred — see review reply. 1. docs/users/features/auto-mode.md:223 The "agent classifier sees first 200 chars of prompt" line was a stale leftover from before the truncation was removed (the AgentTool.toAutoClassifierInput regression guard now asserts full- prompt passthrough). Updated to describe the actual behavior plus the safety rationale (same shape as run_shell_command forwarding the full command). Also expanded the projection table with a note that MCP tools default to argument-stripped projection — pairing with the Limitations addendum below. 2. coreToolScheduler.ts:1425 + Session.ts:1945 The unavailable error message was overwriting `failClosed`'s classified reason ('Conversation transcript exceeds classifier context window' / 'Classifier prompt construction failed' / etc.) with a generic "blocked for safety" line. Operators lose the diagnostic distinction. Both sites now append the original reason in parentheses when present: 'Auto mode classifier unavailable; action blocked for safety (Classifier stage 1 unavailable - …)'. 3. permission-manager.ts:771 The session branch of the dangerous-rule stash didn't dedupe by raw string, while the persistent branch did. A user repeatedly clicking "Always allow" on the same fallback prompt would have piled duplicate stash entries that all activate on AUTO exit. Mirror the persistent-branch dedup. 4. docs/users/features/auto-mode.md (Limitations) Added a bullet making MCP-tool conservative-blocking explicit: third-party tools that haven't overridden toAutoClassifierInput show only their name to the classifier, so most calls will be blocked unless the user has written an explicit allow rule. This was a deliberate fail-closed choice from the previous round, but users wouldn't predict it without documentation. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/users/features/auto-mode.md | 23 ++++++++++++++++++- .../src/acp-integration/session/Session.ts | 3 ++- packages/core/src/core/coreToolScheduler.ts | 3 ++- .../src/permissions/permission-manager.ts | 10 +++++++- 4 files changed, 35 insertions(+), 4 deletions(-) diff --git a/docs/users/features/auto-mode.md b/docs/users/features/auto-mode.md index 5b5709a40..3f9f3299d 100644 --- a/docs/users/features/auto-mode.md +++ b/docs/users/features/auto-mode.md @@ -199,6 +199,15 @@ tightened over time. - **Not a substitute for `deny` rules.** The classifier is best-effort. For commands you're sure should never run, put them in `permissions.deny`. +- **MCP tools default to conservative blocking.** Third-party MCP tools + (`mcp__*`) opt-in to argument forwarding via the + `toAutoClassifierInput` override. Tools that have not opted in expose + only their name to the classifier — most such calls are + conservatively blocked unless you've written an explicit `allow` + rule. This is fail-closed by design (credentials and voluminous + content do not leak into the classifier LLM). If you trust a + specific MCP tool, add `permissions.allow: ["mcp__server__tool"]` so + it bypasses the classifier entirely. ## FAQ @@ -220,11 +229,23 @@ projection exposes: - `run_shell_command`: the full command (it has to — that's what the classifier judges). - `web_fetch`: the URL only. The prompt field is not forwarded. -- `agent`: subagent type plus the first 200 characters of the prompt. +- `agent`: subagent type plus the full prompt. The prompt is the + instruction the sub-agent will follow, so the classifier needs it + in full to detect attacks that would steer the sub-agent toward + destructive actions — same reason `run_shell_command` forwards the + full command. Tool results (the actual content returned by tools) are stripped from the classifier transcript entirely. +MCP tools (`mcp__*`) follow a stricter default: their parameters are +not forwarded unless the MCP tool author explicitly opted in via the +`toAutoClassifierInput` override. The classifier sees the tool name +but no arguments, so most MCP calls will be conservatively blocked +unless the user has written an explicit allow rule. This is fail- +closed by design — third-party tools should not leak credentials or +voluminous file content into the classifier LLM without intent. + **Can I disable the first-time information message?** It only shows once per user-settings file. After dismissal, diff --git a/packages/cli/src/acp-integration/session/Session.ts b/packages/cli/src/acp-integration/session/Session.ts index 46267e53c..0b405fcf2 100644 --- a/packages/cli/src/acp-integration/session/Session.ts +++ b/packages/cli/src/acp-integration/session/Session.ts @@ -1942,7 +1942,8 @@ export class Session implements SessionContext { return earlyErrorResponse( new Error( decision.unavailable - ? `Auto mode classifier unavailable; action blocked for safety` + ? `Auto mode classifier unavailable; action blocked for safety` + + (decision.reason ? ` (${decision.reason})` : '') : `Blocked by auto mode policy: ${decision.reason}`, ), fc.name, diff --git a/packages/core/src/core/coreToolScheduler.ts b/packages/core/src/core/coreToolScheduler.ts index feae45809..14176b0bb 100644 --- a/packages/core/src/core/coreToolScheduler.ts +++ b/packages/core/src/core/coreToolScheduler.ts @@ -1422,7 +1422,8 @@ export class CoreToolScheduler { reqInfo, new Error( decision.unavailable - ? `Auto mode classifier unavailable; action blocked for safety` + ? `Auto mode classifier unavailable; action blocked for safety` + + (decision.reason ? ` (${decision.reason})` : '') : `Blocked by auto mode policy: ${decision.reason}`, ), ToolErrorType.EXECUTION_DENIED, diff --git a/packages/core/src/permissions/permission-manager.ts b/packages/core/src/permissions/permission-manager.ts index 1c7fc7678..24f7b09b8 100644 --- a/packages/core/src/permissions/permission-manager.ts +++ b/packages/core/src/permissions/permission-manager.ts @@ -768,7 +768,15 @@ export class PermissionManager { // every subsequent AUTO call would bypass the classifier. See // dangerousRules.ts for the classifier-bypass criteria. if (this.strippedAllowRules && isDangerousAllowRule(rule)) { - this.strippedAllowRules.session.push(rule); + // Deduplicate on raw string — matches the persistent-stash branch + // in addPersistentRule. A repeated "Always allow" choice for the + // same rule must not pile copies into the session stash. + const exists = this.strippedAllowRules.session.some( + (r) => r.raw === rule.raw, + ); + if (!exists) { + this.strippedAllowRules.session.push(rule); + } debugLogger.info( `Stashed newly added dangerous allow rule while in AUTO mode: ${rule.raw}`, );