From a3138cf5d5062ce58640c893d9e8e27d06c0790d Mon Sep 17 00:00:00 2001
From: LaZzyMan <zeusdream7@gmail.com>
Date: Mon, 18 May 2026 11:23:40 +0800
Subject: [PATCH] fix(cli,core,docs): address 4 non-blocker findings from PR
 #4151 review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Top-level review on c5cf60ee8 declared "可以合并" (good to merge) but
flagged 5 non-blocker items. Four are mechanical / low-cost; the fifth
(thresholds → config) is intentionally deferred — see review reply.

1. docs/users/features/auto-mode.md:223
   The "agent classifier sees first 200 chars of prompt" line was a
   stale leftover from before the truncation was removed (the
   AgentTool.toAutoClassifierInput regression guard now asserts full-
   prompt passthrough). Updated to describe the actual behavior plus
   the safety rationale (same shape as run_shell_command forwarding
   the full command). Also expanded the projection table with a note
   that MCP tools default to argument-stripped projection — pairing
   with the Limitations addendum below.

2. coreToolScheduler.ts:1425 + Session.ts:1945
   The unavailable error message was overwriting `failClosed`'s
   classified reason ('Conversation transcript exceeds classifier
   context window' / 'Classifier prompt construction failed' / etc.)
   with a generic "blocked for safety" line. Operators lose the
   diagnostic distinction. Both sites now append the original reason
   in parentheses when present: 'Auto mode classifier unavailable;
   action blocked for safety (Classifier stage 1 unavailable - …)'.

3. permission-manager.ts:771
   The session branch of the dangerous-rule stash didn't dedupe by
   raw string, while the persistent branch did. A user repeatedly
   clicking "Always allow" on the same fallback prompt would have
   piled duplicate stash entries that all activate on AUTO exit.
   Mirror the persistent-branch dedup.

4. docs/users/features/auto-mode.md (Limitations)
   Added a bullet making MCP-tool conservative-blocking explicit:
   third-party tools that haven't overridden toAutoClassifierInput
   show only their name to the classifier, so most calls will be
   blocked unless the user has written an explicit allow rule. This
   was a deliberate fail-closed choice from the previous round, but
   users wouldn't predict it without documentation.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 docs/users/features/auto-mode.md              | 23 ++++++++++++++++++-
 .../src/acp-integration/session/Session.ts    |  3 ++-
 packages/core/src/core/coreToolScheduler.ts   |  3 ++-
 .../src/permissions/permission-manager.ts     | 10 +++++++-
 4 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/docs/users/features/auto-mode.md b/docs/users/features/auto-mode.md
index 5b5709a40..3f9f3299d 100644
--- a/docs/users/features/auto-mode.md
+++ b/docs/users/features/auto-mode.md
@@ -199,6 +199,15 @@ tightened over time.
 - **Not a substitute for `deny` rules.** The classifier is best-effort.
   For commands you're sure should never run, put them in
   `permissions.deny`.
+- **MCP tools default to conservative blocking.** Third-party MCP tools
+  (`mcp__*`) opt-in to argument forwarding via the
+  `toAutoClassifierInput` override. Tools that have not opted in expose
+  only their name to the classifier — most such calls are
+  conservatively blocked unless you've written an explicit `allow`
+  rule. This is fail-closed by design (credentials and voluminous
+  content do not leak into the classifier LLM). If you trust a
+  specific MCP tool, add `permissions.allow: ["mcp__server__tool"]` so
+  it bypasses the classifier entirely.
 
 ## FAQ
 
@@ -220,11 +229,23 @@ projection exposes:
 - `run_shell_command`: the full command (it has to — that's what the
   classifier judges).
 - `web_fetch`: the URL only. The prompt field is not forwarded.
-- `agent`: subagent type plus the first 200 characters of the prompt.
+- `agent`: subagent type plus the full prompt. The prompt is the
+  instruction the sub-agent will follow, so the classifier needs it
+  in full to detect attacks that would steer the sub-agent toward
+  destructive actions — same reason `run_shell_command` forwards the
+  full command.
 
 Tool results (the actual content returned by tools) are stripped from
 the classifier transcript entirely.
 
+MCP tools (`mcp__*`) follow a stricter default: their parameters are
+not forwarded unless the MCP tool author explicitly opted in via the
+`toAutoClassifierInput` override. The classifier sees the tool name
+but no arguments, so most MCP calls will be conservatively blocked
+unless the user has written an explicit allow rule. This is fail-
+closed by design — third-party tools should not leak credentials or
+voluminous file content into the classifier LLM without intent.
+
 **Can I disable the first-time information message?**
 
 It only shows once per user-settings file. After dismissal,
diff --git a/packages/cli/src/acp-integration/session/Session.ts b/packages/cli/src/acp-integration/session/Session.ts
index 46267e53c..0b405fcf2 100644
--- a/packages/cli/src/acp-integration/session/Session.ts
+++ b/packages/cli/src/acp-integration/session/Session.ts
@@ -1942,7 +1942,8 @@ export class Session implements SessionContext {
           return earlyErrorResponse(
             new Error(
               decision.unavailable
-                ? `Auto mode classifier unavailable; action blocked for safety`
+                ? `Auto mode classifier unavailable; action blocked for safety` +
+                  (decision.reason ? ` (${decision.reason})` : '')
                 : `Blocked by auto mode policy: ${decision.reason}`,
             ),
             fc.name,
diff --git a/packages/core/src/core/coreToolScheduler.ts b/packages/core/src/core/coreToolScheduler.ts
index feae45809..14176b0bb 100644
--- a/packages/core/src/core/coreToolScheduler.ts
+++ b/packages/core/src/core/coreToolScheduler.ts
@@ -1422,7 +1422,8 @@ export class CoreToolScheduler {
                     reqInfo,
                     new Error(
                       decision.unavailable
-                        ? `Auto mode classifier unavailable; action blocked for safety`
+                        ? `Auto mode classifier unavailable; action blocked for safety` +
+                          (decision.reason ? ` (${decision.reason})` : '')
                         : `Blocked by auto mode policy: ${decision.reason}`,
                     ),
                     ToolErrorType.EXECUTION_DENIED,
diff --git a/packages/core/src/permissions/permission-manager.ts b/packages/core/src/permissions/permission-manager.ts
index 1c7fc7678..24f7b09b8 100644
--- a/packages/core/src/permissions/permission-manager.ts
+++ b/packages/core/src/permissions/permission-manager.ts
@@ -768,7 +768,15 @@ export class PermissionManager {
       // every subsequent AUTO call would bypass the classifier. See
       // dangerousRules.ts for the classifier-bypass criteria.
       if (this.strippedAllowRules && isDangerousAllowRule(rule)) {
-        this.strippedAllowRules.session.push(rule);
+        // Deduplicate on raw string — matches the persistent-stash branch
+        // in addPersistentRule. A repeated "Always allow" choice for the
+        // same rule must not pile copies into the session stash.
+        const exists = this.strippedAllowRules.session.some(
+          (r) => r.raw === rule.raw,
+        );
+        if (!exists) {
+          this.strippedAllowRules.session.push(rule);
+        }
         debugLogger.info(
           `Stashed newly added dangerous allow rule while in AUTO mode: ${rule.raw}`,
         );