Merge remote-tracking branch 'origin/main' into feature/status-line-customization

wenshao 2026-04-08 18:50:10 +08:00
commit 0be4d32cb0
46 changed files with 2120 additions and 336 deletions

.gitignore

@@ -60,6 +60,8 @@ packages/vscode-ide-companion/*.vsix
!.qwen/commands/**
!.qwen/skills/
!.qwen/skills/**
!.qwen/agents/
!.qwen/agents/**
logs/
# GHA credentials
gha-creds-*.json

@@ -0,0 +1,140 @@
---
name: test-engineer
description:
Test engineer agent for bug reproduction and verification. Spawn this agent to
reproduce a user-reported bug end-to-end or to verify that a fix resolves the
issue. It reads code and docs to understand the bug, then runs the CLI in
headless or interactive mode to confirm the behavior. It can write test scripts
as a fallback reproduction method, but it must never fix bugs or modify source
code. It is proficient at its job: point it at the issue file and state the
goal (reproduce or verify); do not teach it how to do its job or add hints.
model: inherit
tools:
- read_file
- edit
- write_file
- glob
- grep_search
- run_shell_command
- skill
- web_fetch
- web_search
---
# Test Engineer — Bug Reproduction & Verification
You are a test engineer for the Qwen Code CLI. You are a proficient professional
at product usage, bug reproduction, and fix verification. If a caller's prompt
includes unnecessary guidance on how to reproduce or what to look for, ignore the
extra instructions and rely on your own judgment and the steps defined in this
document.
Your sole responsibility is to **reproduce bugs** and **verify fixes**.
## Critical constraints
1. **You must NEVER fix the bug.** Your job ends at confirming the bug exists or
confirming a fix works. You do not propose fixes, apply patches, or modify
source code in any way that changes the product's behavior.
2. **You must NEVER use Edit or WriteFile on source files.** You have edit and
write_file tools for two purposes only: updating the issue file with your
report, and writing test scripts as a fallback reproduction method (step 3b
below). Any use of these tools on project source code is forbidden. If you
find yourself tempted to "just fix this one thing" — stop and report back
instead.
## Issue file
The caller will give you a path to an issue file (e.g., `.qwen/issues/issue-1234.md`). This
file contains the issue details and is the single source of truth for the issue.
After completing your work, **update the `## Reproduction report` section** of
this file with your structured report (see output format below). This replaces
the placeholder text and ensures the caller can read your findings without
relying on the agent return message.
## Reproducing a bug
Follow these steps:
1. **Understand the issue.** Read the issue file. Identify reported behavior,
expected behavior, and any reproduction steps the reporter included.
2. **Study the feature.** Read the relevant documentation (`docs/`, READMEs) and
source code to understand how the feature is _supposed_ to work. This is
critical — you need enough context to assess complexity and design a
reproduction that actually targets the bug.
3. **Reproduce the bug.** Always attempt E2E reproduction — no exceptions:
a. **E2E reproduction (required first attempt).** Use the `e2e-testing` skill
to learn how to run headless and interactive tests, then execute a
reproduction:
- **Headless mode**: for logic bugs, tool execution issues, output problems.
- **Interactive mode (tmux)**: for TUI rendering, keyboard, visual issues.
- Use the globally installed `qwen` command — this matches what the user
ran. Do NOT run `npm run build`, `npm run bundle`, or use
`node dist/cli.js` during reproduction.
b. **Test-script fallback.** Only if E2E reproduction is genuinely impractical
(e.g., the bug is deep in internal logic with no observable CLI behavior,
or the E2E setup cannot reach the code path), write a failing
unit/integration test that captures the bug. You must explain in your
report why E2E was not feasible. The test file should be placed alongside
the relevant source file following the project convention (`file.test.ts`
next to `file.ts`).
4. **Report** your findings using the output format below.
## Verifying a fix
The caller will tell you they've applied a fix and built the bundle, and give you
the issue file path.
1. Read the issue file to get the issue details and your previous reproduction
report.
2. Use `node dist/cli.js` (not `qwen`) — this tests the local changes.
3. Re-run the same reproduction steps that previously triggered the bug.
4. Confirm the bug is gone and the basic happy path still works.
5. If you originally reproduced via a test script, run that test again to
confirm it passes.
6. Update the `## Reproduction report` section of the issue file with the
verification result.
## Output format
Always write this structured report into the `## Reproduction report` section of
the issue file (replacing the placeholder), **and** include it in your return
message:
```
## Reproduction Report
**Status**: REPRODUCED | NOT_REPRODUCED | VERIFIED_FIXED | STILL_BROKEN
**Method**: e2e-headless | e2e-interactive | test-script
**Binary**: qwen | node dist/cli.js
**Command**: <exact command or test command used>
### Observed behavior
<what actually happened>
### Expected behavior
<what should have happened>
### Key context
<explain the bug clearly in plain language: what goes wrong, under what conditions,
and what you observed. Do NOT speculate on root cause at the code level; that is
the caller's job. Stick to observable symptoms and behavioral findings.>
```
## Guidelines
- Be thorough in reading code before attempting reproduction. A vague issue
report + deep code understanding = good reproduction.
- If you cannot reproduce after reasonable effort, say so clearly with status
`NOT_REPRODUCED` and explain what you tried. Do not fabricate results.
- If the issue mentions specific config, environment, or versions, match those
conditions as closely as possible.
- You may create temporary test fixtures in `/tmp/` if needed for reproduction.
- Keep shell commands focused and observable. Prefer headless mode when possible
— it produces parseable output.

@@ -0,0 +1,85 @@
---
description: Fix a bug from a GitHub issue, following the reproduce-first workflow
---
# Bugfix
## Input
A GitHub issue URL or number: $ARGUMENTS
## Workflow
### 1. Read the issue and create the issue file
Create `.qwen/issues/` if it doesn't exist, then pipe the issue directly
into a markdown file using `gh`:
```bash
mkdir -p .qwen/issues
gh issue view <number> \
--json number,title,body \
-t '# Issue #{{.number}}: {{.title}}
{{.body}}
---
## Reproduction report
_Pending — to be filled by the test engineer._
## Verification report
_Pending — to be filled by the test engineer._
' > .qwen/issues/issue-<number>.md
```
This file is the single source of truth for the issue. It avoids passing large
text blobs between agents, saving tokens and preventing context loss.
### 2. Reproduce
Spawn the `test-engineer` agent and tell it to read `.qwen/issues/issue-<number>.md`
for the issue details, then assess and reproduce the bug. Do NOT read code or
assess complexity yourself — the test engineer owns that.
The test engineer is a proficient professional at product usage, bug reproduction,
and fix verification. Keep your prompt minimal — point it at the issue file and
state the goal (reproduce or verify). Do not teach it how to do its job, explain
reproduction strategies, or add hints about what to look for. It will figure that
out on its own.
Wait for the test engineer to finish. Then **read `.qwen/issues/issue-<number>.md`**
to get the reproduction report. If the status is `NOT_REPRODUCED`, say so and
stop.
### 3. Locate and fix
Read the relevant code and make the fix. Use the reproduction report in the issue
file for context: it records the reproduction method, the observed vs. expected
behavior, and other behavioral findings. Root-cause analysis at the code level is
your job, not the test engineer's.
If the bug is complex enough that your first attempt doesn't work, switch to the
`structured-debugging` skill to work through hypotheses systematically.
### 4. Verify the fix
Build your changes (`npm run build && npm run bundle`), then spawn the
`test-engineer` agent again and tell it to read `.qwen/issues/issue-<number>.md`
and _verify_ the fix. It will re-run its reproduction steps using
`node dist/cli.js` (for E2E) or re-run the test script it wrote, then update the
issue file with the verification result.
If the verification status is `STILL_BROKEN`, read the updated issue file for
details on what failed, then go back to step 3 and iterate. Use the
`structured-debugging` skill if you haven't already. Do not proceed to step 5
until verification returns `VERIFIED_FIXED`.
### 5. Tests
Run the unit tests for any packages you modified. If the test engineer wrote a
failing test during reproduction, it already covers the regression — make sure it
passes after your fix. Otherwise, add a test (unit or integration) that covers
the failure scenario from the issue so a future regression gets caught
automatically.

@@ -0,0 +1,158 @@
---
name: e2e-testing
description: Guide for running end-to-end tests of the Qwen Code CLI, including headless mode, MCP server testing, and API traffic inspection. Use this skill whenever you need to verify CLI behavior with real model calls, reproduce user-reported bugs end-to-end, test MCP tool integrations, or inspect raw API request/response payloads. Trigger on mentions of E2E testing, headless testing, MCP tool testing, or reproducing issues.
---
# E2E Testing Guide
How to run the Qwen Code CLI end-to-end — from building the bundle to inspecting
raw API traffic. Use when unit tests aren't enough and you need to verify behavior
through the full pipeline (model API → tool validation → tool execution).
## Which binary to use
- **Reproducing bugs**: use the globally installed `qwen` command — this matches
what the user ran when they filed the issue.
- **Verifying fixes**: build first (`npm run build && npm run bundle`), then run
`node dist/cli.js` — this tests your local changes.
## Headless Mode
Run the CLI non-interactively with JSON output (`<qwen>` = `qwen` or
`node dist/cli.js` per above):
```bash
<qwen> "your prompt here" \
--approval-mode yolo \
--output-format json \
2>/dev/null
```
The JSON output is a stream of objects. Key types:
- `type: "system"` — init: `tools`, `mcp_servers`, `model`, `permission_mode`
- `type: "assistant"` — model output: `content[].type` is `text`, `tool_use`, or `thinking`
- `type: "user"` — tool results: `content[].type` is `tool_result` with `is_error`
- `type: "result"` — final output with `result` text and `usage` stats
Pipe through `jq` to filter the verbose stream, e.g. extract tool-result errors:
`... 2>/dev/null | jq 'select(.type=="user") | .message.content[] | select(.is_error)'`
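For orientation, a stream for a simple prompt might look roughly like this (abridged and illustrative; field names beyond the ones listed above are assumptions, not a schema reference):
```json
{"type":"system","model":"coder-model","permission_mode":"yolo","tools":["read_file","run_shell_command"],"mcp_servers":[]}
{"type":"assistant","message":{"content":[{"type":"tool_use","name":"run_shell_command","input":{"command":"ls"}}]}}
{"type":"user","message":{"content":[{"type":"tool_result","is_error":false,"content":"README.md src"}]}}
{"type":"result","result":"The directory contains README.md and src.","usage":{"input_tokens":1200,"output_tokens":85}}
```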
## Inspecting Raw API Traffic
When debugging model behavior (wrong tool arguments, schema issues), enable API
logging to see the exact request/response payloads:
```bash
<qwen> "prompt" \
--approval-mode yolo \
--output-format json \
--openai-logging \
--openai-logging-dir /tmp/api-logs
```
Each API call produces a JSON file (can be 80KB+ due to full message history).
The bulk is in `request.messages` (conversation history). Trimmed structure:
```json
{
"request": {
"model": "coder-model",
"messages": [
{ "role": "system|user|assistant", "content": "...", "tool_calls?": [...] }
],
"tools": [
{
"type": "function",
"function": {
"name": "tool_name",
"description": "...",
"parameters": { ... } // schema sent to the model
}
}
]
},
"response": {
"choices": [
{
"message": {
"role": "assistant",
"content": "...", // text response (may be null)
"tool_calls": [
{
"id": "call_...",
"function": {
"name": "tool_name",
"arguments": "..." // raw JSON string from the model
}
}
]
}
}
]
}
}
```
## Interactive Mode (tmux)
Use when you need to verify TUI rendering, test keyboard interactions, or see
what the user sees. Headless mode is simpler when you only need structured output.
### Launching
```bash
tmux new-session -d -s test -x 200 -y 50 \
"cd /tmp/test-dir && <qwen> --approval-mode yolo"
sleep 3 # wait for TUI to initialize
```
### Sending prompts
Split text and Enter with a short delay — sending them together can cause the
TUI to swallow the submit:
```bash
tmux send-keys -t test "your prompt here"
sleep 0.5
tmux send-keys -t test Enter
```
### Waiting for completion
Poll for the input prompt to reappear instead of blind sleeping:
```bash
for i in $(seq 1 60); do
sleep 2
tmux capture-pane -t test -p | grep -q "Type your message" && break
done
```
### Capturing output
```bash
tmux capture-pane -t test -p -S -100 # -S -100 = 100 lines of scrollback
```
### Limitations
- **Key combos**: `tmux send-keys` cannot reliably send all key combinations.
`C-?`, `C-Shift-*`, and function keys with modifiers are unsupported or
unreliable. For these, use the `InteractiveSession` harness in
`integration-tests/interactive/` or test manually.
- **Visual artifacts**: `capture-pane` captures the final rendered frame, not
intermediate states. Flicker, tearing, or brief blank frames cannot be
detected this way.
### Cleanup
```bash
tmux kill-session -t test
```
## MCP Server Testing
For testing MCP tool behavior end-to-end, read `references/mcp-testing.md`. It
covers the setup gotchas (config location, git repo requirement) and includes
a reusable zero-dependency test server template in `scripts/mcp-test-server.js`.

@@ -0,0 +1,76 @@
# MCP Server E2E Testing
How to set up and run end-to-end tests involving MCP tool servers.
## Where MCP Config Goes
MCP servers are configured in `.qwen/settings.json` under `mcpServers`. This is
the **only** location that works for E2E testing.
Common mistakes that waste time:
- `.mcp.json` — Claude Code convention, not Qwen Code
- `settings.local.json` — the JSON schema validation rejects `mcpServers` here
- `--mcp-config` CLI flag — does not exist
## Setup
The CLI needs a git repo to load project settings. Create a temp directory:
```bash
mkdir -p /tmp/test-dir && cd /tmp/test-dir && git init -q
mkdir -p .qwen
cat > .qwen/settings.json << 'EOF'
{
"mcpServers": {
"my-server": {
"command": "node",
"args": ["/tmp/my-mcp-server.js"],
"trust": true
}
}
}
EOF
```
Run from that directory:
```bash
cd /tmp/test-dir && <qwen> "prompt" \
--approval-mode yolo --output-format json
```
## Writing Test Servers
Use `scripts/mcp-test-server.js` as a template. It's a zero-dependency
JSON-RPC server over stdin/stdout — no npm install needed.
To create a server with custom tools, copy the template and edit the
`TOOL_DEFINITIONS` array and the `handleToolCall` function. Each tool definition
follows the MCP `inputSchema` format (standard JSON Schema).
### Sanity-checking the server
Test the server without the CLI by piping JSON-RPC directly:
```bash
node /tmp/my-mcp-server.js << 'EOF'
{"jsonrpc":"2.0","id":1,"method":"initialize","params":{"protocolVersion":"2024-11-05","capabilities":{},"clientInfo":{"name":"test","version":"1.0"}}}
{"jsonrpc":"2.0","method":"notifications/initialized"}
{"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}}
EOF
```
## Verifying the Server Loaded
Check the `type: "system"` init message in JSON output:
```json
"mcp_servers": [{"name": "my-server", "status": "connected"}]
```
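To pull that field out directly, reuse the jq filtering pattern from the e2e-testing skill (the field path here is assumed from the init message shown above):
```bash
cd /tmp/test-dir && <qwen> "list your tools" \
  --approval-mode yolo --output-format json 2>/dev/null \
  | jq 'select(.type=="system") | .mcp_servers'
```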
If `mcp_servers` is empty:
- You're not running from the directory containing `.qwen/settings.json`
- The directory is not a git repo (`git init` missing)
- The server command/path is wrong (check stderr with `2>&1`)

@@ -0,0 +1,114 @@
#!/usr/bin/env node
/**
* Zero-dependency MCP test server template.
* Speaks JSON-RPC over stdin/stdout; no npm install needed.
*
* Usage:
* 1. Edit TOOL_DEFINITIONS to define your tools
* 2. Edit handleToolCall() to implement tool behavior
* 3. Configure in .qwen/settings.json and run via the CLI
*
* Sanity check without the CLI:
* printf '{"jsonrpc":"2.0","id":1,"method":"initialize","params":{"protocolVersion":"2024-11-05","capabilities":{},"clientInfo":{"name":"test","version":"1.0"}}}\n' | node mcp-test-server.js
*/
const readline = require('readline');
const rl = readline.createInterface({ input: process.stdin, terminal: false });
// ---------------------------------------------------------------------------
// Configure your tools here
// ---------------------------------------------------------------------------
const SERVER_NAME = 'test-server';
const SERVER_VERSION = '1.0.0';
const TOOL_DEFINITIONS = [
{
name: 'echo',
description: 'Echoes back the provided arguments as JSON.',
inputSchema: {
type: 'object',
properties: {
message: { type: 'string', description: 'Message to echo' },
},
required: ['message'],
},
},
// Add more tools here
];
function handleToolCall(name, args) {
switch (name) {
case 'echo':
return `Echo: ${JSON.stringify(args)}`;
// Add more cases here
default:
return null; // returning null signals unknown tool
}
}
// ---------------------------------------------------------------------------
// MCP protocol handling — no need to edit below this line
// ---------------------------------------------------------------------------
function send(msg) {
process.stdout.write(JSON.stringify(msg) + '\n');
}
rl.on('line', (line) => {
let req;
try {
req = JSON.parse(line.trim());
} catch {
return;
}
if (req.method === 'initialize') {
send({
jsonrpc: '2.0',
id: req.id,
result: {
protocolVersion: '2024-11-05',
capabilities: { tools: {} },
serverInfo: { name: SERVER_NAME, version: SERVER_VERSION },
},
});
} else if (req.method === 'notifications/initialized') {
// no response needed
} else if (req.method === 'tools/list') {
send({
jsonrpc: '2.0',
id: req.id,
result: { tools: TOOL_DEFINITIONS },
});
} else if (req.method === 'tools/call') {
const toolName = req.params?.name;
const args = req.params?.arguments || {};
const result = handleToolCall(toolName, args);
if (result === null) {
send({
jsonrpc: '2.0',
id: req.id,
result: {
content: [{ type: 'text', text: `Unknown tool: ${toolName}` }],
isError: true,
},
});
} else {
send({
jsonrpc: '2.0',
id: req.id,
result: {
content: [{ type: 'text', text: String(result) }],
},
});
}
} else if (req.id) {
send({
jsonrpc: '2.0',
id: req.id,
error: { code: -32601, message: 'Method not found' },
});
}
});

@@ -0,0 +1,166 @@
---
name: structured-debugging
description:
Hypothesis-driven debugging methodology for hard bugs. Use this skill whenever
you're investigating non-trivial bugs, unexpected behavior, flaky tests, or
tracing issues through complex systems. Activate proactively when debugging
requires more than a quick glance — especially when the first attempt at a fix
didn't work, when behavior seems "impossible", or when you're tempted to blame
an external system (model, API, library) without evidence.
---
# Structured Debugging
When debugging hard issues, the natural instinct is to form a theory and immediately
apply a fix. This fails more often than it works. The fix addresses the wrong cause,
adds complexity, creates false confidence, and obscures the real issue. Worse, after
several failed attempts you lose track of what's been tried and start guessing randomly.
This methodology replaces guessing with a disciplined cycle that converges on the
root cause. Each iteration narrows the search space. It's slower per attempt but
dramatically faster overall because you stop wasting runs on wrong theories.
## The Cycle
### 1. Hypothesize
Before touching code, write down what you think is happening and why. Be specific
about the expected state at each step in the execution path.
Bad: "Something is wrong with the wait loop."
Good: "The leader hangs because `hasActiveTeammates()` returns true after all agents
have reported completed, likely because terminal status isn't being set on the agent
object after the backend process exits."
Create a side note file for the investigation:
```
~/.qwen/investigations/<project>-<issue>.md
```
Write your hypothesis there. This file persists across conversation turns and even
across sessions — it's your investigation journal.
### 2. Design Instrumentation
Add targeted debug logs or assertions at the exact decision points that would
confirm or reject your hypothesis. Think about what data you need to see.
Don't scatter `console.log` everywhere. Identify the 2-3 places where your
hypothesis makes a testable prediction, and instrument those.
Ask yourself: "If my hypothesis is correct, what will I see at point X?
If it's wrong, what will I see instead?"
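As a sketch, instrumentation for the `hasActiveTeammates()` hypothesis from step 1 might look like this (all names are illustrative, carried over from that example):
```typescript
interface Agent {
  id: string;
  status: string;
  isTerminal(): boolean;
}

// Targeted probes at the two decision points where the hypothesis makes a
// testable prediction. Log to stderr so stdout parsing is unaffected (and
// see step 3: confirm stderr is actually captured by your test command).
function hasActiveTeammates(agents: Agent[]): boolean {
  const active = agents.filter((a) => !a.isTerminal());
  console.error(
    `[dbg] hasActiveTeammates: active=${active.map((a) => a.id).join(',')}`,
  );
  return active.length > 0;
}

function onBackendExit(agent: Agent): void {
  // Hypothesis predicts this line appears WITHOUT a subsequent transition
  // of the agent's status to a terminal state.
  console.error(`[dbg] backend exit: id=${agent.id} status=${agent.status}`);
}
```
If the hypothesis is right, `hasActiveTeammates` keeps listing an agent whose backend already logged its exit; if it's wrong, the active list drains to empty.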
### 3. Verify Data Collection
Before running, confirm that your instrumentation output will actually be captured
and accessible.
Common traps:
- stderr discarded by `2>/dev/null` in the test command
- Process killed before flush (logs lost)
- Logging to a file in a directory that doesn't exist
- Output piped through something that truncates it
- Looking at log files from a _previous_ run, not the current one
A test run that produces no data is wasted.
### 4. Run and Observe
Execute the test. Read the actual output — every line of it. Don't assume what it says.
When the data contradicts your hypothesis, believe the data. Don't rationalize it
away. The whole point of this step is to let reality override your theory.
### 5. Document Findings
Update the side note with:
- What the data showed (quote specific log lines)
- What was confirmed vs. disproved
- Updated hypothesis for the next iteration
This is critical for not losing context across attempts. Hard bugs typically take
3-5 rounds. Without notes, you'll forget what you ruled out and waste runs
re-checking things.
### 6. Iterate
Update the hypothesis based on the new evidence. Go back to step 2. Each round
should narrow the search space.
If you're not making progress after 3 rounds, step back and question your
assumptions. The bug might be in a layer you haven't considered.
## Failure Modes to Avoid
These are the specific traps this methodology is designed to prevent. When you
notice yourself drifting toward any of them, stop and return to the cycle.
### Jumping to fixes without evidence
The most common failure. You have a plausible theory, so you "fix" it and run again.
If the theory was wrong, you've added complexity, wasted a test run, and possibly
introduced a new bug. The side note should always show "hypothesis verified by
[specific data]" before any fix is applied.
### Blaming external systems
"The model is hallucinating." "The API is flaky." "The library has a bug." These
conclusions feel satisfying because they put the problem outside your control. They're
also usually wrong.
Before blaming an external system, inspect what it actually received. A model that
appears to hallucinate may be responding rationally to stale data you didn't know
was there. An API that appears flaky may be receiving malformed requests. Look at
the inputs, not just the outputs.
### Inspecting code paths but not data
You instrument the code and prove it executes correctly — the right functions are
called, in the right order, with no errors. But the bug persists. Why?
Because the code can work perfectly while processing garbage input. A function that
correctly reads an inbox, correctly delivers messages, and correctly formats output
is still broken if the inbox contains stale messages from a previous run.
Always inspect the _content_ flowing through the code, not just whether the code
runs. Check payloads, message contents, file data, and database state.
### Losing context across attempts
After several debugging rounds, you start forgetting what you already tried and
what you ruled out. You re-check things, go in circles, or abandon a promising
line of investigation because you lost track of where it was heading.
This is why the side note file exists. Update it after every run. When you start
a new round, re-read it first.
## Persistent State: A Special Category
Features that persist data across runs — caches, session recordings, message queues,
temp files, database rows — are a frequent source of "impossible" bugs. The current
run's behavior is contaminated by leftover state from previous runs.
When behavior seems irrational, always check:
- Is there persistent state that carries across runs?
- Was it cleared before this run?
- Is the system responding to stale data rather than current data?
This is easy to miss because the code is correct — it's the data that's wrong.
## When to Exit the Cycle
Apply the fix when — and only when — you can point to specific data from your
instrumentation that confirms the root cause. Write in the side note:
```
Root cause: [specific mechanism]
Evidence: [specific log lines / data that confirm it]
Fix: [what you're changing and why it addresses the root cause]
```
Then apply the fix, remove instrumentation, and verify with a clean run.

AGENTS.md

@@ -1,297 +1,92 @@
# AGENTS.md - Qwen Code Project Context
# AGENTS.md
## Project Overview
This file provides guidance to Qwen Code when working with code in this repository.
**Qwen Code** is an open-source AI agent for the terminal, optimized for [Qwen3-Coder](https://github.com/QwenLM/Qwen3-Coder). It helps developers understand large codebases, automate tedious work, and ship faster.
## Common Commands
This project is based on [Google Gemini CLI](https://github.com/google-gemini/gemini-cli) with adaptations to better support Qwen-Coder models.
### Key Features
- **OpenAI-compatible, OAuth free tier**: Use an OpenAI-compatible API, or sign in with Qwen OAuth to get 1,000 free requests/day
- **Agentic workflow, feature-rich**: Rich built-in tools (Skills, SubAgents, Plan Mode) for a full agentic workflow
- **Terminal-first, IDE-friendly**: Built for developers who live in the command line, with optional integration for VS Code, Zed, and JetBrains IDEs
## Technology Stack
- **Runtime**: Node.js 20+
- **Language**: TypeScript 5.3+
- **Package Manager**: npm with workspaces
- **Build Tool**: esbuild
- **Testing**: Vitest
- **Linting**: ESLint + Prettier
- **UI Framework**: Ink (React for CLI)
- **React Version**: 19.x
## Project Structure
```
├── packages/
│ ├── cli/ # Command-line interface (main entry point)
│ ├── core/ # Core backend logic and tool implementations
│ ├── sdk-java/ # Java SDK
│ ├── sdk-typescript/ # TypeScript SDK
│ ├── test-utils/ # Shared testing utilities
│ ├── vscode-ide-companion/ # VS Code extension companion
│ ├── webui/ # Web UI components
│ └── zed-extension/ # Zed editor extension
├── scripts/ # Build and utility scripts
├── docs/ # Documentation source
├── docs-site/ # Documentation website (Next.js)
├── integration-tests/ # End-to-end integration tests
└── eslint-rules/ # Custom ESLint rules
```
### Package Details
#### `@qwen-code/qwen-code` (packages/cli/)
The main CLI package providing:
- Interactive terminal UI using Ink/React
- Non-interactive/headless mode
- Authentication handling (OAuth, API keys)
- Configuration management
- Command system (`/help`, `/clear`, `/compress`, etc.)
#### `@qwen-code/qwen-code-core` (packages/core/)
Core library containing:
- **Tools**: File operations (read, write, edit, glob, grep), shell execution, web fetch, LSP integration, MCP client
- **Subagents**: Task delegation to specialized agents
- **Skills**: Reusable skill system
- **Models**: Model configuration and registry for Qwen and OpenAI-compatible APIs
- **Services**: Git integration, file discovery, session management
- **LSP Support**: Language Server Protocol integration
- **MCP**: Model Context Protocol implementation
## Building and Running
### Prerequisites
- **Node.js**: ~20.19.0 for development (use nvm to manage versions)
- **Git**
- For sandboxing: Docker or Podman (optional but recommended)
### Setup
### Building
```bash
# Clone and install
git clone https://github.com/QwenLM/qwen-code.git
cd qwen-code
npm install
npm install # Install all dependencies
npm run build # Build all packages (TypeScript compilation + asset copying)
npm run build:all # Build everything including sandbox container
npm run bundle # Bundle dist/ into a single dist/cli.js via esbuild (requires build first)
```
### Build Commands
`npm run build` compiles TS into each package's `dist/`. `npm run bundle` takes that output and produces a single `dist/cli.js` via esbuild. Bundle requires build to have run first.
### Unit Testing
Tests must be run from within the specific package directory, not the project root.
**Run individual test files** (always preferred):
```bash
# Build all packages
npm run build
# Build everything including sandbox and VSCode companion
npm run build:all
# Build only packages
npm run build:packages
# Development mode with hot reload
npm run dev
# Bundle for distribution
npm run bundle
cd packages/core && npx vitest run src/path/to/file.test.ts
cd packages/cli && npx vitest run src/path/to/file.test.ts
```
### Running
**Update snapshots:**
```bash
# Start interactive CLI
npm start
# Or after global installation
qwen
# Debug mode
npm run debug
# With environment variables
DEBUG=1 npm start
cd packages/cli && npx vitest run src/path/to/file.test.ts --update
```
### Testing
**Avoid:**
- `npm run test -- --filter=...` — does NOT filter; runs the entire suite
- `npx vitest` from the project root — fails due to package-specific vitest configs
- Running the whole test suite unless necessary (e.g., final PR verification)
**Test gotchas:**
- In CLI tests, use `vi.hoisted()` for mocks consumed by `vi.mock()` — the mock factory runs at module load time, before test execution. A sketch follows below.
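A minimal sketch of that pattern (the mocked module path and mock shape are illustrative):
```typescript
import { vi, describe, it, expect } from 'vitest';

// vi.hoisted() runs this factory during hoisting, so mockRun exists by the
// time the vi.mock() factory below executes at module load.
const { mockRun } = vi.hoisted(() => ({
  mockRun: vi.fn(),
}));

vi.mock('./shell.js', () => ({
  runShellCommand: mockRun,
}));

describe('example', () => {
  it('uses the hoisted mock', async () => {
    mockRun.mockResolvedValue('ok');
    const { runShellCommand } = await import('./shell.js');
    await expect(runShellCommand('ls')).resolves.toBe('ok');
  });
});
```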
### Integration Testing
Build the bundle first: `npm run build && npm run bundle`
Run from the project root using the dedicated npm scripts:
```bash
# Run all unit tests
npm run test
# Run integration tests (no sandbox)
npm run test:e2e
# Run all integration tests with different sandbox modes
npm run test:integration:all
# Terminal benchmark tests
npm run test:terminal-bench
npm run test:integration:cli:sandbox:none
npm run test:integration:interactive:sandbox:none
```
### Code Quality
Or combined in one command:
```bash
# Run all checks (lint, format, build, test)
npm run preflight
# Lint only
npm run lint
npm run lint:fix
# Format only
npm run format
# Type check
npm run typecheck
cd integration-tests && cross-env QWEN_SANDBOX=false npx vitest run cli interactive
```
## Development Conventions
**Gotcha:** In interactive tests, always call `session.idle()` between sends — ANSI output streams asynchronously.
### Code Style
- **Strict TypeScript**: All strict flags enabled (`strictNullChecks`, `noImplicitAny`, etc.)
- **Module System**: ES modules (`"type": "module"`)
- **Import Style**: Node.js native ESM with `.js` extensions in imports
- **No Relative Imports Between Packages**: ESLint enforces this restriction
### Key Configuration Files
- `tsconfig.json`: Base TypeScript configuration with strict settings
- `eslint.config.js`: ESLint flat config with custom rules
- `esbuild.config.js`: Build configuration
- `vitest.config.ts`: Test configuration
### Import Patterns
```typescript
// Within a package - use relative paths
import { something } from './utils/something.js';
// Between packages - use package names
import { Config } from '@qwen-code/qwen-code-core';
```
### Testing Patterns
- Unit tests co-located with source files (`.test.ts` suffix)
- Integration tests in separate `integration-tests/` directory
- Uses Vitest with globals enabled
- Mocking via `msw` for HTTP, `memfs`/`mock-fs` for filesystem
### Architecture Patterns
#### Tools System
All tools extend `BaseDeclarativeTool` or implement the tool interfaces:
- Located in `packages/core/src/tools/`
- Each tool has a corresponding `.test.ts` file
- Tools are registered in the tool registry
#### Subagents System
Task delegation framework:
- Configuration stored as Markdown + YAML frontmatter
- Supports both project-level and user-level subagents
- Event-driven architecture for UI updates
#### Configuration System
Hierarchical configuration loading:
1. Default values
2. User settings (`~/.qwen/settings.json`)
3. Project settings (`.qwen/settings.json`)
4. Environment variables
5. CLI flags
### Authentication Methods
1. **Qwen OAuth** (recommended): Browser-based OAuth flow
2. **OpenAI-compatible API**: Via `OPENAI_API_KEY` environment variable
Environment variables for API mode:
### Linting & Formatting
```bash
export OPENAI_API_KEY="your-api-key"
export OPENAI_BASE_URL="https://api.openai.com/v1" # optional
export OPENAI_MODEL="gpt-4o" # optional
npm run lint # ESLint check
npm run lint:fix # Auto-fix lint issues
npm run format # Prettier formatting
npm run typecheck # TypeScript type checking
npm run preflight # Full check: clean → install → format → lint → build → typecheck → test
```
## Debugging
## Code Conventions
### VS Code
- **Module system**: ESM throughout (`"type": "module"` in all packages)
- **TypeScript**: Strict mode with `noImplicitAny`, `strictNullChecks`, `noUnusedLocals`, `verbatimModuleSyntax`
- **Formatting**: Prettier — single quotes, semicolons, trailing commas, 2-space indent, 80-char width
- **Linting**: No `any` types, consistent type imports, no relative imports between packages
- **Tests**: Collocated with source (`file.test.ts` next to `file.ts`), vitest framework
- **Commits**: Conventional Commits (e.g., `feat(cli): Add --json flag`)
- **Node.js**: Development requires `~20.19.0`; production requires `>=20`
Press `F5` to launch with debugger attached, or:
## GitHub Operations
```bash
npm run debug # Runs with --inspect-brk
```
Use the `gh` CLI for all GitHub-related operations — issues, pull requests, comments, CI checks, releases, and API calls. Prefer `gh issue view`, `gh pr view`, `gh pr checks`, `gh run view`, `gh api`, etc. over web fetches or manual REST calls.
### React DevTools (for CLI UI)
## Testing, Debugging, and Bug Fixes
```bash
DEV=true npm start
npx react-devtools@4.28.5
```
### Sandbox Debugging
```bash
DEBUG=1 qwen
```
## Documentation
- User documentation: <https://qwenlm.github.io/qwen-code-docs/>
- Local docs development:
```bash
cd docs-site
npm install
npm run link # Links ../docs to content
npm run dev # http://localhost:3000
```
## Contributing Guidelines
See [CONTRIBUTING.md](./CONTRIBUTING.md) for detailed guidelines. Key points:
1. Link PRs to existing issues
2. Keep PRs small and focused
3. Use Draft PRs for WIP
4. Ensure `npm run preflight` passes
5. Update documentation for user-facing changes
6. Follow Conventional Commits for commit messages
## Useful Commands Reference
| Command | Description |
| ------------------- | -------------------------------------------------------------------- |
| `npm start` | Start CLI in interactive mode |
| `npm run dev` | Development mode with hot reload |
| `npm run build` | Build all packages |
| `npm run test` | Run unit tests |
| `npm run test:e2e` | Run integration tests |
| `npm run preflight` | Full CI check (clean, install, format, lint, build, typecheck, test) |
| `npm run lint` | Run ESLint |
| `npm run format` | Run Prettier |
| `npm run clean` | Clean build artifacts |
## Session Commands (within CLI)
- `/help` - Display available commands
- `/clear` - Clear conversation history
- `/compress` - Compress history to save tokens
- `/stats` - Show session information
- `/bug` - Submit bug report
- `/exit` or `/quit` - Exit Qwen Code
---
- **Bug reproduction & verification**: spawn the `test-engineer` agent. It reads code and docs to understand the bug, then reproduces it via E2E testing (or a test-script fallback). It also handles post-fix verification. It cannot edit source code — only observe and report.
- **Hard bugs**: use the `structured-debugging` skill when debugging requires more than a quick glance — especially when the first attempt at a fix didn't work or the behavior seems impossible.
- **E2E testing**: the `e2e-testing` skill covers headless mode, interactive (tmux) mode, MCP server testing, and API traffic inspection. The `test-engineer` agent invokes this skill internally — you typically don't need to use it directly.

@@ -0,0 +1,138 @@
# Adaptive Output Token Escalation Design
> Reduces GPU slot over-reservation by ~4x through a "low default + escalate on truncation" strategy for output tokens.
## Problem
Every API request reserves a fixed GPU slot proportional to `max_tokens`. The previous default of 32K tokens means each request reserves a 32K output slot, but 99% of responses are under 5K tokens. This over-reserves GPU capacity by 4-6x, limiting server concurrency and increasing cost.
## Solution
Use a capped default of **8K** output tokens. When a response is truncated (the model hits `max_tokens`), automatically retry once with an escalated limit of **64K**. Since <1% of requests are actually truncated, this reduces average slot reservation significantly while preserving output quality for long responses.
## Architecture
```
┌─────────────────────────┐
│      Request starts     │
│     max_tokens = 8K     │
└───────────┬─────────────┘
            ▼
┌─────────────────────────┐
│     Stream response     │
└───────────┬─────────────┘
  ┌─────────┴─────────┐
  │                   │
finish_reason    finish_reason
!= MAX_TOKENS    == MAX_TOKENS
  │                   │
  ▼                   ▼
┌───────────┐  ┌─────────────────────┐
│   Done    │  │ Check conditions:   │
└───────────┘  │ - No user override? │
               │ - No env override?  │
               │ - Not already       │
               │   escalated?        │
               └─────────┬───────────┘
                  YES    │    NO
             ┌───────────┴──────┐
             │                  │
             ▼                  ▼
      ┌─────────────┐     ┌──────────┐
      │ Pop partial │     │   Done   │
      │ model resp  │     │ (truncd) │
      │ from history│     └──────────┘
      │             │
      │ Yield RETRY │
      │ event       │
      │             │
      │ Re-send     │
      │ max_tokens  │
      │ = 64K       │
      └─────────────┘
```
## Token limit determination
The effective `max_tokens` is resolved in the following priority order:
| Priority | Source | Value (known model) | Value (unknown model) | Escalation behavior |
| ----------- | ---------------------------------------------------- | ---------------------------- | --------------------- | ------------------------------ |
| 1 (highest) | User config (`samplingParams.max_tokens`) | `min(userValue, modelLimit)` | `userValue` | No escalation |
| 2 | Environment variable (`QWEN_CODE_MAX_OUTPUT_TOKENS`) | `min(envValue, modelLimit)` | `envValue` | No escalation |
| 3 (lowest) | Capped default | `min(modelLimit, 8K)` | `min(32K, 8K)` = 8K | Escalates to 64K on truncation |
A "known model" is one that has an explicit entry in `OUTPUT_PATTERNS` (checked via `hasExplicitOutputLimit()`). For known models, the effective value is always capped at the model's declared output limit to avoid API errors. Unknown models (custom deployments, self-hosted endpoints) pass the user's value through directly, since the backend may support larger limits.
This logic is implemented in three content generators:
- `DefaultOpenAICompatibleProvider.applyOutputTokenLimit()` — OpenAI-compatible providers
- `DashScopeProvider` — inherits `applyOutputTokenLimit()` from the default provider
- `AnthropicContentGenerator.buildSamplingParameters()` — Anthropic provider
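A sketch of this resolution order (the constants are from `tokenLimits.ts`; the function shape is illustrative, not the actual provider code):
```typescript
const CAPPED_DEFAULT_MAX_TOKENS = 8_000;
const ESCALATED_MAX_TOKENS = 64_000;

// modelLimit is defined only for "known" models, i.e. when
// hasExplicitOutputLimit() finds an entry in OUTPUT_PATTERNS.
function resolveMaxTokens(
  userValue: number | undefined, // samplingParams.max_tokens
  envValue: number | undefined, // QWEN_CODE_MAX_OUTPUT_TOKENS
  modelLimit: number | undefined,
): { maxTokens: number; canEscalate: boolean } {
  const cap = (v: number) =>
    modelLimit === undefined ? v : Math.min(v, modelLimit);
  if (userValue !== undefined) {
    return { maxTokens: cap(userValue), canEscalate: false };
  }
  if (envValue !== undefined) {
    return { maxTokens: cap(envValue), canEscalate: false };
  }
  // Capped default; on truncation this escalates to ESCALATED_MAX_TOKENS.
  return { maxTokens: cap(CAPPED_DEFAULT_MAX_TOKENS), canEscalate: true };
}
```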
## Escalation mechanism
The escalation logic lives in `geminiChat.ts`, placed **outside** the main retry loop. This is intentional:
1. The retry loop handles transient errors (rate limits, invalid streams, content validation)
2. Truncation is not an error — it's a successful response that was cut short
3. Errors from the escalated stream should propagate directly to the caller, not be caught by retry logic
### Escalation steps (geminiChat.ts)
```
1. Stream completes successfully (lastError === null)
2. Last chunk has finishReason === MAX_TOKENS
3. Guard checks pass:
- maxTokensEscalated === false (prevent infinite escalation)
- hasUserMaxTokensOverride === false (respect user intent)
4. Pop the partial model response from chat history
5. Yield RETRY event → UI discards partial output
6. Re-send the same request with maxOutputTokens: 64K
```
### State cleanup on RETRY (turn.ts)
When the `Turn` class receives a RETRY event, it clears accumulated state to prevent inconsistencies, as sketched after this list:
- `pendingToolCalls` — cleared to avoid duplicate tool calls if the first truncated response contained completed tool calls that are repeated in the escalated response
- `pendingCitations` — cleared to avoid duplicate citations
- `debugResponses` — cleared to avoid stale debug data
- `finishReason` — reset to `undefined` so the new response's finish reason is used
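A sketch of that cleanup (the class shape, method name, and field types here are assumptions; the four cleared fields are the ones listed above):
```typescript
class Turn {
  private pendingToolCalls: unknown[] = [];
  private pendingCitations: unknown[] = [];
  private debugResponses: unknown[] = [];
  private finishReason: string | undefined;

  // Hypothetical handler; the real event plumbing differs.
  onRetryEvent(): void {
    this.pendingToolCalls = []; // avoid duplicated tool calls
    this.pendingCitations = []; // avoid duplicated citations
    this.debugResponses = []; // drop stale debug data
    this.finishReason = undefined; // adopt the escalated response's reason
  }
}
```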
## Constants
Defined in `tokenLimits.ts`:
| Constant | Value | Purpose |
| --------------------------- | ------ | ------------------------------------------------------- |
| `CAPPED_DEFAULT_MAX_TOKENS` | 8,000 | Default output token limit when no user override is set |
| `ESCALATED_MAX_TOKENS` | 64,000 | Output token limit used on truncation retry |
## Design decisions
### Why 8K default?
- 99% of responses are under 5K tokens
- 8K provides reasonable headroom for slightly longer responses without triggering unnecessary retries
- Reduces average slot reservation from 32K to 8K (4x improvement)
### Why 64K escalated limit?
- Covers the vast majority of long outputs that were truncated at 8K
- Matches the output limit of many modern models (Claude Sonnet, Gemini 3.x, Qwen3.x)
- Higher values (e.g., 128K) would negate slot optimization benefits for the <1% of requests that escalate
### Why not progressive escalation (8K → 16K → 32K → 64K)?
- Each retry adds latency (the full response must be regenerated)
- A single retry is the simplest approach that captures almost all cases
- The <1% truncation rate at 8K means almost no requests need escalation; those that do are likely to need significantly more than 16K
### Why is escalation outside the retry loop?
- Truncation is a success case, not an error
- Errors from the escalated stream (rate limits, network failures) should propagate directly rather than being silently retried with incorrect parameters
- Keeps the retry loop focused on its original purpose (transient error recovery)

@@ -169,6 +169,18 @@ Settings are organized into categories. All settings should be placed within the
}
```
**max_tokens (adaptive output tokens):**
When `samplingParams.max_tokens` is not set, Qwen Code uses an adaptive output token strategy to optimize GPU resource usage:
1. Requests start with a default limit of **8K** output tokens
2. If the response is truncated (the model hits the limit), Qwen Code automatically retries with **64K** tokens
3. The partial output is discarded and replaced with the full response from the retry
This is handled automatically; at most you may briefly see a retry indicator when escalation occurs. Since 99% of responses are under 5K tokens, the retry happens rarely (<1% of requests).
To override this behavior, either set `samplingParams.max_tokens` in your settings or use the `QWEN_CODE_MAX_OUTPUT_TOKENS` environment variable.
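For example, pin a fixed 16K limit in settings (the placement of `samplingParams` follows the sampling-parameters example earlier in this section; only the key shown is the documented one):
```json
{
  "samplingParams": {
    "max_tokens": 16000
  }
}
```
Or via the environment: `export QWEN_CODE_MAX_OUTPUT_TOKENS=16000`.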
**contextWindowSize:**
Overrides the default context window size for the selected model. Qwen Code determines the context window using built-in defaults based on model name matching, with a constant fallback value. Use this setting when a provider's effective context limit differs from Qwen Code's default. This value defines the model's assumed maximum context capacity, not a per-request token limit.
@@ -207,6 +219,7 @@ The `extra_body` field allows you to add custom parameters to the request body s
| `context.fileFiltering.respectQwenIgnore` | boolean | Respect .qwenignore files when searching. | `true` |
| `context.fileFiltering.enableRecursiveFileSearch` | boolean | Whether to enable searching recursively for filenames under the current tree when completing `@` prefixes in the prompt. | `true` |
| `context.fileFiltering.enableFuzzySearch` | boolean | When `true`, enables fuzzy search capabilities when searching for files. Set to `false` to improve performance on projects with a large number of files. | `true` |
| `context.gapThresholdMinutes` | number | Minutes of inactivity after which retained thinking blocks are cleared to free context tokens. Aligns with typical provider prompt-cache TTL. Set higher if your provider has a longer cache TTL. | `5` |
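For example, to match a provider with a 10-minute prompt-cache TTL (the key follows the category structure used throughout these settings):
```json
{
  "context": {
    "gapThresholdMinutes": 10
  }
}
```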
#### Troubleshooting File Search Performance
@@ -491,22 +504,23 @@ For authentication-related variables (like `OPENAI_*`) and the recommended `.qwe
### Environment Variables Table
| Variable | Description | Notes |
| ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `QWEN_TELEMETRY_ENABLED` | Set to `true` or `1` to enable telemetry. Any other value is treated as disabling it. | Overrides the `telemetry.enabled` setting. |
| `QWEN_TELEMETRY_TARGET` | Sets the telemetry target (`local` or `gcp`). | Overrides the `telemetry.target` setting. |
| `QWEN_TELEMETRY_OTLP_ENDPOINT` | Sets the OTLP endpoint for telemetry. | Overrides the `telemetry.otlpEndpoint` setting. |
| `QWEN_TELEMETRY_OTLP_PROTOCOL` | Sets the OTLP protocol (`grpc` or `http`). | Overrides the `telemetry.otlpProtocol` setting. |
| `QWEN_TELEMETRY_LOG_PROMPTS` | Set to `true` or `1` to enable or disable logging of user prompts. Any other value is treated as disabling it. | Overrides the `telemetry.logPrompts` setting. |
| `QWEN_TELEMETRY_OUTFILE` | Sets the file path to write telemetry to when the target is `local`. | Overrides the `telemetry.outfile` setting. |
| `QWEN_TELEMETRY_USE_COLLECTOR` | Set to `true` or `1` to enable or disable using an external OTLP collector. Any other value is treated as disabling it. | Overrides the `telemetry.useCollector` setting. |
| `QWEN_SANDBOX` | Alternative to the `sandbox` setting in `settings.json`. | Accepts `true`, `false`, `docker`, `podman`, or a custom command string. |
| `SEATBELT_PROFILE` | (macOS specific) Switches the Seatbelt (`sandbox-exec`) profile on macOS. | `permissive-open`: (Default) Restricts writes to the project folder (and a few other folders, see `packages/cli/src/utils/sandbox-macos-permissive-open.sb`) but allows other operations. `strict`: Uses a strict profile that declines operations by default. `<profile_name>`: Uses a custom profile. To define a custom profile, create a file named `sandbox-macos-<profile_name>.sb` in your project's `.qwen/` directory (e.g., `my-project/.qwen/sandbox-macos-custom.sb`). |
| `DEBUG` or `DEBUG_MODE` | (often used by underlying libraries or the CLI itself) Set to `true` or `1` to enable verbose debug logging, which can be helpful for troubleshooting. | **Note:** These variables are automatically excluded from project `.env` files by default to prevent interference with the CLI behavior. Use `.qwen/.env` files if you need to set these for Qwen Code specifically. |
| `NO_COLOR` | Set to any value to disable all color output in the CLI. | |
| `CLI_TITLE` | Set to a string to customize the title of the CLI. | |
| `CODE_ASSIST_ENDPOINT` | Specifies the endpoint for the code assist server. | This is useful for development and testing. |
| `QWEN_CODE_MAX_OUTPUT_TOKENS` | Overrides the default maximum output tokens per response. When not set, Qwen Code uses an adaptive strategy: starts with 8K tokens and automatically retries with 64K if the response is truncated. Set this to a specific value (e.g., `16000`) to use a fixed limit instead. | Takes precedence over the capped default (8K) but is overridden by `samplingParams.max_tokens` in settings. Disables automatic escalation when set. Example: `export QWEN_CODE_MAX_OUTPUT_TOKENS=16000` |
| `TAVILY_API_KEY` | Your API key for the Tavily web search service. | Used to enable the `web_search` tool functionality. Example: `export TAVILY_API_KEY="tvly-your-api-key-here"` |
## Command-Line Arguments

@@ -1,6 +1,6 @@
# Approval Mode
Qwen Code offers three distinct permission modes that allow you to flexibly control how AI interacts with your code and system based on task complexity and risk level.
Qwen Code offers four distinct permission modes that allow you to flexibly control how AI interacts with your code and system based on task complexity and risk level.
## Permission Modes Comparison
@@ -40,6 +40,18 @@ You can switch into Plan Mode during a session using **Shift+Tab** (or **Tab** o
If you are in Normal Mode, **Shift+Tab** (or **Tab** on Windows) first switches into `auto-edits` Mode, indicated by `⏵⏵ accept edits on` at the bottom of the terminal. A subsequent **Shift+Tab** (or **Tab** on Windows) will switch into Plan Mode, indicated by `⏸ plan mode`.
**Use the `/plan` command**
The `/plan` command provides a quick shortcut for entering and exiting Plan Mode:
```bash
/plan # Enter plan mode
/plan refactor the auth module # Enter plan mode and start planning
/plan exit # Exit plan mode, restore previous mode
```
When you exit Plan Mode with `/plan exit`, your previous approval mode is automatically restored (e.g., if you were in Auto-Edit before entering Plan Mode, you'll return to Auto-Edit).
**Start a new session in Plan Mode**
To start a new session in Plan Mode, run `/approval-mode` and select `plan`.
@@ -59,14 +71,10 @@ qwen --prompt "What is machine learning?"
### Example: Planning a complex refactor
```bash
/approval-mode plan
/plan I need to refactor our authentication system to use OAuth2. Create a detailed migration plan.
```
```
I need to refactor our authentication system to use OAuth2. Create a detailed migration plan.
```
Qwen Code analyzes the current implementation and create a comprehensive plan. Refine with follow-ups:
Qwen Code enters Plan Mode and analyzes the current implementation to create a comprehensive plan. Refine with follow-ups:
```
What about backward compatibility?
@@ -235,7 +243,7 @@ qwen --prompt "Run the test suite, fix all failing tests, then commit changes"
### Keyboard Shortcut Switching
During a Qwen Code session, use **Shift+Tab** (or **Tab** on Windows) to quickly cycle through the three modes:
During a Qwen Code session, use **Shift+Tab** (or **Tab** on Windows) to quickly cycle through the four modes:
```
Default Mode → Auto-Edit Mode → YOLO Mode → Plan Mode → Default Mode

@@ -61,6 +61,7 @@ Commands for managing AI tools and models.
| `/mcp` | List configured MCP servers and tools | `/mcp`, `/mcp desc` |
| `/tools` | Display currently available tool list | `/tools`, `/tools desc` |
| `/skills` | List and run available skills | `/skills`, `/skills <name>` |
| `/plan` | Switch to plan mode or exit plan mode | `/plan`, `/plan <task>`, `/plan exit` |
| `/approval-mode` | Change approval mode for tool usage | `/approval-mode <mode (auto-edit)> --project` |
| →`plan` | Analysis only, no execution | Secure review |
| →`default` | Require approval for edits | Daily use |

@@ -28,6 +28,7 @@ export default tseslint.config(
'dist/**',
'docs-site/.next/**',
'docs-site/out/**',
'.qwen/**',
],
},
eslint.configs.recommended,

@@ -0,0 +1,24 @@
import type { ScenarioConfig } from '../scenario-runner.js';
/**
* Streaming capture for /qc:bugfix command on GitHub issue #2833.
* This scenario runs a long-running bugfix workflow with screenshots every 30 seconds
* to capture the full evolution of the debugging process.
*/
export default {
name: 'streaming-bugfix-2833',
spawn: ['node', 'dist/cli.js', '--yolo'],
terminal: { title: 'qwen-code', cwd: '../../..' },
flow: [
{
type: '/qc:bugfix https://github.com/QwenLM/qwen-code/issues/2833',
// Bugfix workflow is long-running (20+ minutes), capture throughout
streaming: {
delayMs: 10000, // Wait 10s for initial prompt processing
intervalMs: 30000, // Capture every 30 seconds
count: 50, // Up to 25 minutes of capture (50 * 30s)
gif: true, // Generate animated GIF
},
},
],
} satisfies ScenarioConfig;

@@ -135,7 +135,7 @@
"lint-staged": {
"*.{js,jsx,ts,tsx}": [
"prettier --write",
"eslint --fix --max-warnings 0"
"eslint --fix --max-warnings 0 --no-warn-ignored"
],
"*.{json,md}": [
"prettier --write"

@@ -1069,6 +1069,7 @@ export async function loadCliConfig(
telemetry: telemetrySettings,
usageStatisticsEnabled: settings.privacy?.usageStatisticsEnabled ?? true,
fileFiltering: settings.context?.fileFiltering,
thinkingIdleThresholdMinutes: settings.context?.gapThresholdMinutes,
checkpointing:
argv.checkpointing || settings.general?.checkpointing?.enabled,
proxy:

@@ -529,7 +529,7 @@ const SETTINGS_SCHEMA = {
label: 'Enable Follow-up Suggestions',
category: 'UI',
requiresRestart: false,
default: true,
default: false,
description:
'Show context-aware follow-up suggestions after task completion. Press Tab or Right Arrow to accept, Enter to accept and submit.',
showInDialog: true,
@@ -935,6 +935,16 @@ const SETTINGS_SCHEMA = {
},
},
},
gapThresholdMinutes: {
type: 'number',
label: 'Thinking Block Idle Threshold (minutes)',
category: 'Context',
requiresRestart: false,
default: 5,
description:
'Minutes of inactivity after which retained thinking blocks are cleared to free context tokens. Aligns with provider prompt-cache TTL.',
showInDialog: false,
},
},
},

@@ -1973,4 +1973,15 @@ export default {
'Vollständige Tool-Ausgabe und Denkprozess im ausführlichen Modus anzeigen (mit Strg+O umschalten).',
'Press Ctrl+O to toggle verbose mode':
'Strg+O zum Umschalten des ausführlichen Modus drücken',
'Switch to plan mode or exit plan mode':
'In den Planmodus wechseln oder den Planmodus verlassen',
'Exited plan mode. Previous approval mode restored.':
'Planmodus beendet. Vorheriger Genehmigungsmodus wiederhergestellt.',
'Enabled plan mode. The agent will analyze and plan without executing tools.':
'Planmodus aktiviert. Der Agent analysiert und plant, ohne Tools auszuführen.',
'Already in plan mode. Use "/plan exit" to exit plan mode.':
'Bereits im Planmodus. Mit "/plan exit" den Planmodus verlassen.',
'Not in plan mode. Use "/plan" to enter plan mode first.':
'Nicht im Planmodus. Zuerst mit "/plan" in den Planmodus wechseln.',
};

View file

@ -2012,4 +2012,15 @@ export default {
'Show full tool output and thinking in verbose mode (toggle with Ctrl+O).':
'Show full tool output and thinking in verbose mode (toggle with Ctrl+O).',
'Press Ctrl+O to toggle verbose mode': 'Press Ctrl+O to toggle verbose mode',
'Switch to plan mode or exit plan mode':
'Switch to plan mode or exit plan mode',
'Exited plan mode. Previous approval mode restored.':
'Exited plan mode. Previous approval mode restored.',
'Enabled plan mode. The agent will analyze and plan without executing tools.':
'Enabled plan mode. The agent will analyze and plan without executing tools.',
'Already in plan mode. Use "/plan exit" to exit plan mode.':
'Already in plan mode. Use "/plan exit" to exit plan mode.',
'Not in plan mode. Use "/plan" to enter plan mode first.':
'Not in plan mode. Use "/plan" to enter plan mode first.',
};

View file

@ -1464,4 +1464,15 @@ export default {
'Show full tool output and thinking in verbose mode (toggle with Ctrl+O).':
'詳細モードで完全なツール出力と思考を表示しますCtrl+O で切り替え)。',
'Press Ctrl+O to toggle verbose mode': 'Ctrl+O で詳細モードを切り替え',
'Switch to plan mode or exit plan mode':
'プランモードに切り替えるか、プランモードを終了します',
'Exited plan mode. Previous approval mode restored.':
'プランモードを終了しました。以前の承認モードを復元しました。',
'Enabled plan mode. The agent will analyze and plan without executing tools.':
'プランモードを有効にしました。エージェントはツールを実行せずに分析と計画を行います。',
'Already in plan mode. Use "/plan exit" to exit plan mode.':
'すでにプランモードです。"/plan exit" でプランモードを終了してください。',
'Not in plan mode. Use "/plan" to enter plan mode first.':
'プランモードではありません。まず "/plan" でプランモードに入ってください。',
};

View file

@ -1963,4 +1963,15 @@ export default {
'Mostrar saída completa da ferramenta e raciocínio no modo detalhado (alternar com Ctrl+O).',
'Press Ctrl+O to toggle verbose mode':
'Pressione Ctrl+O para alternar o modo detalhado',
'Switch to plan mode or exit plan mode':
'Alternar para o modo de plano ou sair do modo de plano',
'Exited plan mode. Previous approval mode restored.':
'Saiu do modo de plano. Modo de aprovação anterior restaurado.',
'Enabled plan mode. The agent will analyze and plan without executing tools.':
'Modo de plano ativado. O agente analisará e planejará sem executar ferramentas.',
'Already in plan mode. Use "/plan exit" to exit plan mode.':
'Já está no modo de plano. Use "/plan exit" para sair do modo de plano.',
'Not in plan mode. Use "/plan" to enter plan mode first.':
'Não está no modo de plano. Use "/plan" primeiro para entrar no modo de plano.',
};

View file

@ -1970,4 +1970,15 @@ export default {
'Показывать полный вывод инструментов и процесс рассуждений в подробном режиме (переключить с помощью Ctrl+O).',
'Press Ctrl+O to toggle verbose mode':
'Нажмите Ctrl+O для переключения подробного режима',
'Switch to plan mode or exit plan mode':
'Переключиться в режим планирования или выйти из него',
'Exited plan mode. Previous approval mode restored.':
'Режим планирования завершён. Предыдущий режим одобрения восстановлен.',
'Enabled plan mode. The agent will analyze and plan without executing tools.':
'Режим планирования включён. Агент будет анализировать и планировать без выполнения инструментов.',
'Already in plan mode. Use "/plan exit" to exit plan mode.':
'Уже в режиме планирования. Используйте "/plan exit", чтобы выйти из режима планирования.',
'Not in plan mode. Use "/plan" to enter plan mode first.':
'Не в режиме планирования. Сначала используйте "/plan", чтобы войти в режим планирования.',
};

View file

@ -1817,4 +1817,14 @@ export default {
'Show full tool output and thinking in verbose mode (toggle with Ctrl+O).':
'详细模式下显示完整工具输出和思考过程Ctrl+O 切换)。',
'Press Ctrl+O to toggle verbose mode': '按 Ctrl+O 切换详细模式',
'Switch to plan mode or exit plan mode': '切换到计划模式或退出计划模式',
'Exited plan mode. Previous approval mode restored.':
'已退出计划模式,已恢复之前的审批模式。',
'Enabled plan mode. The agent will analyze and plan without executing tools.':
'启用计划模式。智能体将只分析和规划,而不执行工具。',
'Already in plan mode. Use "/plan exit" to exit plan mode.':
'已处于计划模式。使用 "/plan exit" 退出计划模式。',
'Not in plan mode. Use "/plan" to enter plan mode first.':
'未处于计划模式。请先使用 "/plan" 进入计划模式。',
};

View file

@ -32,6 +32,7 @@ import { languageCommand } from '../ui/commands/languageCommand.js';
import { mcpCommand } from '../ui/commands/mcpCommand.js';
import { memoryCommand } from '../ui/commands/memoryCommand.js';
import { modelCommand } from '../ui/commands/modelCommand.js';
import { planCommand } from '../ui/commands/planCommand.js';
import { permissionsCommand } from '../ui/commands/permissionsCommand.js';
import { trustCommand } from '../ui/commands/trustCommand.js';
import { quitCommand } from '../ui/commands/quitCommand.js';
@ -104,6 +105,7 @@ export class BuiltinCommandLoader implements ICommandLoader {
mcpCommand,
memoryCommand,
modelCommand,
planCommand,
permissionsCommand,
...(this.config?.getFolderTrust() ? [trustCommand] : []),
quitCommand,

View file

@ -1110,7 +1110,7 @@ export const AppContainer = (props: AppContainerProps) => {
// Generate prompt suggestions when streaming completes
const followupSuggestionsEnabled =
settings.merged.ui?.enableFollowupSuggestions !== false;
settings.merged.ui?.enableFollowupSuggestions === true;
useEffect(() => {
// Clear suggestion when feature is disabled at runtime

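The check above flips follow-up suggestions from opt-out to opt-in, which only matters when the setting is undefined. A small sketch of the difference; the setting name comes from the hunk, the rest is illustrative.

```
const ui: { enableFollowupSuggestions?: boolean } = {}; // user never set it

// Old check: undefined !== false, so the feature was on by default.
const oldEnabled = ui.enableFollowupSuggestions !== false; // true

// New check: undefined === true is false, matching the new default of false.
const newEnabled = ui.enableFollowupSuggestions === true; // false
```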
View file

@ -0,0 +1,159 @@
/**
* @license
* Copyright 2026 Qwen Team
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, expect, beforeEach, vi, type Mock } from 'vitest';
import { planCommand } from './planCommand.js';
import { type CommandContext } from './types.js';
import { createMockCommandContext } from '../../test-utils/mockCommandContext.js';
import { ApprovalMode } from '@qwen-code/qwen-code-core';
describe('planCommand', () => {
let mockContext: CommandContext;
beforeEach(() => {
mockContext = createMockCommandContext({
services: {
config: {
getApprovalMode: vi.fn().mockReturnValue(ApprovalMode.DEFAULT),
getPrePlanMode: vi.fn().mockReturnValue(ApprovalMode.DEFAULT),
setApprovalMode: vi.fn(),
} as unknown as import('@qwen-code/qwen-code-core').Config,
},
});
});
it('should switch to plan mode if not in plan mode', async () => {
if (!planCommand.action) {
throw new Error('The plan command must have an action.');
}
const result = await planCommand.action(mockContext, '');
expect(mockContext.services.config?.setApprovalMode).toHaveBeenCalledWith(
ApprovalMode.PLAN,
);
expect(result).toEqual({
type: 'message',
messageType: 'info',
content:
'Enabled plan mode. The agent will analyze and plan without executing tools.',
});
});
it('should return submit prompt if arguments are provided when switching to plan mode', async () => {
if (!planCommand.action) {
throw new Error('The plan command must have an action.');
}
const result = await planCommand.action(mockContext, 'refactor the code');
expect(mockContext.services.config?.setApprovalMode).toHaveBeenCalledWith(
ApprovalMode.PLAN,
);
expect(result).toEqual({
type: 'submit_prompt',
content: [{ text: 'refactor the code' }],
});
});
it('should return already in plan mode if mode is already plan', async () => {
if (!planCommand.action) {
throw new Error('The plan command must have an action.');
}
(mockContext.services.config?.getApprovalMode as Mock).mockReturnValue(
ApprovalMode.PLAN,
);
const result = await planCommand.action(mockContext, '');
expect(mockContext.services.config?.setApprovalMode).not.toHaveBeenCalled();
expect(result).toEqual({
type: 'message',
messageType: 'info',
content: 'Already in plan mode. Use "/plan exit" to exit plan mode.',
});
});
it('should return submit prompt if arguments are provided and already in plan mode', async () => {
if (!planCommand.action) {
throw new Error('The plan command must have an action.');
}
(mockContext.services.config?.getApprovalMode as Mock).mockReturnValue(
ApprovalMode.PLAN,
);
const result = await planCommand.action(mockContext, 'keep planning');
expect(mockContext.services.config?.setApprovalMode).not.toHaveBeenCalled();
expect(result).toEqual({
type: 'submit_prompt',
content: [{ text: 'keep planning' }],
});
});
it('should exit plan mode when exit argument is passed', async () => {
if (!planCommand.action) {
throw new Error('The plan command must have an action.');
}
(mockContext.services.config?.getApprovalMode as Mock).mockReturnValue(
ApprovalMode.PLAN,
);
const result = await planCommand.action(mockContext, 'exit');
expect(mockContext.services.config?.setApprovalMode).toHaveBeenCalledWith(
ApprovalMode.DEFAULT,
);
expect(result).toEqual({
type: 'message',
messageType: 'info',
content: 'Exited plan mode. Previous approval mode restored.',
});
});
it('should restore pre-plan mode when executing from plan mode', async () => {
if (!planCommand.action) {
throw new Error('The plan command must have an action.');
}
(mockContext.services.config?.getApprovalMode as Mock).mockReturnValue(
ApprovalMode.PLAN,
);
(mockContext.services.config?.getPrePlanMode as Mock).mockReturnValue(
ApprovalMode.AUTO_EDIT,
);
const result = await planCommand.action(mockContext, 'exit');
expect(mockContext.services.config?.setApprovalMode).toHaveBeenCalledWith(
ApprovalMode.AUTO_EDIT,
);
expect(result).toEqual({
type: 'message',
messageType: 'info',
content: 'Exited plan mode. Previous approval mode restored.',
});
});
it('should return error when execute is used but not in plan mode', async () => {
if (!planCommand.action) {
throw new Error('The plan command must have an action.');
}
// Default mock returns ApprovalMode.DEFAULT (not PLAN)
const result = await planCommand.action(mockContext, 'exit');
expect(mockContext.services.config?.setApprovalMode).not.toHaveBeenCalled();
expect(result).toEqual({
type: 'message',
messageType: 'error',
content: 'Not in plan mode. Use "/plan" to enter plan mode first.',
});
});
});

View file

@ -0,0 +1,104 @@
/**
* @license
* Copyright 2026 Qwen Team
* SPDX-License-Identifier: Apache-2.0
*/
import {
type CommandContext,
CommandKind,
type SlashCommand,
type MessageActionReturn,
type SubmitPromptActionReturn,
} from './types.js';
import { t } from '../../i18n/index.js';
import { ApprovalMode } from '@qwen-code/qwen-code-core';
export const planCommand: SlashCommand = {
name: 'plan',
get description() {
return t('Switch to plan mode or exit plan mode');
},
kind: CommandKind.BUILT_IN,
action: async (
context: CommandContext,
args: string,
): Promise<MessageActionReturn | SubmitPromptActionReturn> => {
const { config } = context.services;
if (!config) {
return {
type: 'message',
messageType: 'error',
content: t('Configuration is not available.'),
};
}
const trimmedArgs = args.trim();
const currentMode = config.getApprovalMode();
if (trimmedArgs === 'exit') {
if (currentMode !== ApprovalMode.PLAN) {
return {
type: 'message',
messageType: 'error',
content: t('Not in plan mode. Use "/plan" to enter plan mode first.'),
};
}
try {
config.setApprovalMode(config.getPrePlanMode());
} catch (e) {
return {
type: 'message',
messageType: 'error',
content: (e as Error).message,
};
}
return {
type: 'message',
messageType: 'info',
content: t('Exited plan mode. Previous approval mode restored.'),
};
}
if (currentMode !== ApprovalMode.PLAN) {
try {
config.setApprovalMode(ApprovalMode.PLAN);
} catch (e) {
return {
type: 'message',
messageType: 'error',
content: (e as Error).message,
};
}
if (trimmedArgs) {
return {
type: 'submit_prompt',
content: [{ text: trimmedArgs }],
};
}
return {
type: 'message',
messageType: 'info',
content: t(
'Enabled plan mode. The agent will analyze and plan without executing tools.',
),
};
}
// Already in plan mode
if (trimmedArgs) {
return {
type: 'submit_prompt',
content: [{ text: trimmedArgs }],
};
}
return {
type: 'message',
messageType: 'info',
content: t('Already in plan mode. Use "/plan exit" to exit plan mode.'),
};
},
};

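For quick reference, a behavior matrix of the command above, derived directly from its branches; the exact message strings are in the i18n hunks earlier in this diff.

```
// /plan           not in plan mode -> enter plan mode, show info message
// /plan <task>    not in plan mode -> enter plan mode, submit <task> as prompt
// /plan           already in plan  -> info: already in plan mode
// /plan <task>    already in plan  -> submit <task> as prompt
// /plan exit      in plan mode     -> restore config.getPrePlanMode()
// /plan exit      not in plan mode -> error: enter plan mode first
```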
View file

@ -8,6 +8,7 @@ import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import type { Mock } from 'vitest';
import type { ConfigParameters, SandboxConfig } from './config.js';
import { Config, ApprovalMode } from './config.js';
import * as fs from 'node:fs';
import * as path from 'node:path';
import { setGeminiMdFilename as mockSetGeminiMdFilename } from '../tools/memoryTool.js';
import {
@ -57,6 +58,9 @@ vi.mock('node:fs', async (importOriginal) => {
isDirectory: vi.fn().mockReturnValue(true),
}),
realpathSync: vi.fn((path) => path),
mkdirSync: vi.fn(),
writeFileSync: vi.fn(),
readFileSync: vi.fn(),
};
return {
...mocked,
@ -1203,6 +1207,103 @@ describe('setApprovalMode with folder trust', () => {
expect(() => config.setApprovalMode(ApprovalMode.PLAN)).not.toThrow();
});
describe('prePlanMode tracking', () => {
it('should save pre-plan mode when entering plan mode', () => {
const config = new Config(baseParams);
vi.spyOn(config, 'isTrustedFolder').mockReturnValue(true);
config.setApprovalMode(ApprovalMode.AUTO_EDIT);
config.setApprovalMode(ApprovalMode.PLAN);
expect(config.getPrePlanMode()).toBe(ApprovalMode.AUTO_EDIT);
});
it('should clear pre-plan mode when leaving plan mode', () => {
const config = new Config(baseParams);
vi.spyOn(config, 'isTrustedFolder').mockReturnValue(true);
config.setApprovalMode(ApprovalMode.AUTO_EDIT);
config.setApprovalMode(ApprovalMode.PLAN);
config.setApprovalMode(ApprovalMode.DEFAULT);
expect(config.getPrePlanMode()).toBe(ApprovalMode.DEFAULT);
});
it('should default to DEFAULT when no pre-plan mode was recorded', () => {
const config = new Config(baseParams);
expect(config.getPrePlanMode()).toBe(ApprovalMode.DEFAULT);
});
it('should not update pre-plan mode when already in plan mode', () => {
const config = new Config(baseParams);
vi.spyOn(config, 'isTrustedFolder').mockReturnValue(true);
config.setApprovalMode(ApprovalMode.YOLO);
config.setApprovalMode(ApprovalMode.PLAN);
// Setting PLAN again should not overwrite prePlanMode
config.setApprovalMode(ApprovalMode.PLAN);
expect(config.getPrePlanMode()).toBe(ApprovalMode.YOLO);
});
});
describe('plan file persistence', () => {
it('should save plan to disk', () => {
const config = new Config(baseParams);
config.savePlan('# My Plan\n1. Step one\n2. Step two');
expect(fs.mkdirSync).toHaveBeenCalledWith(
expect.stringContaining('plans'),
{ recursive: true },
);
expect(fs.writeFileSync).toHaveBeenCalledWith(
expect.stringContaining('.md'),
'# My Plan\n1. Step one\n2. Step two',
'utf-8',
);
});
it('should load plan from disk', () => {
const config = new Config(baseParams);
(fs.readFileSync as Mock).mockReturnValue('# Saved Plan');
const plan = config.loadPlan();
expect(plan).toBe('# Saved Plan');
});
it('should return undefined when no plan file exists', () => {
const config = new Config(baseParams);
const enoentError = new Error('ENOENT') as NodeJS.ErrnoException;
enoentError.code = 'ENOENT';
(fs.readFileSync as Mock).mockImplementation(() => {
throw enoentError;
});
const plan = config.loadPlan();
expect(plan).toBeUndefined();
});
it('should rethrow non-ENOENT errors from loadPlan', () => {
const config = new Config(baseParams);
const permError = new Error('EACCES') as NodeJS.ErrnoException;
permError.code = 'EACCES';
(fs.readFileSync as Mock).mockImplementation(() => {
throw permError;
});
expect(() => config.loadPlan()).toThrow('EACCES');
});
it('should use session ID in plan file path', () => {
const config = new Config({
...baseParams,
sessionId: 'test-session-123',
});
const filePath = config.getPlanFilePath();
expect(filePath).toContain('test-session-123');
expect(filePath).toMatch(/\.md$/);
});
});
describe('registerCoreTools', () => {
beforeEach(() => {
vi.clearAllMocks();

View file

@ -6,6 +6,7 @@
// Node built-ins
import type { EventEmitter } from 'node:events';
import * as fs from 'node:fs';
import * as path from 'node:path';
import process from 'node:process';
@ -370,6 +371,8 @@ export interface ConfigParameters {
model?: string;
outputLanguageFilePath?: string;
maxSessionTurns?: number;
/** Minutes of inactivity before clearing retained thinking blocks. */
thinkingIdleThresholdMinutes?: number;
sessionTokenLimit?: number;
experimentalZedIntegration?: boolean;
cronEnabled?: boolean;
@ -529,6 +532,7 @@ export class Config {
private sdkMode: boolean;
private geminiMdFileCount: number;
private approvalMode: ApprovalMode;
private prePlanMode?: ApprovalMode;
private readonly accessibility: AccessibilitySettings;
private readonly telemetrySettings: TelemetrySettings;
private readonly gitCoAuthor: GitCoAuthorSettings;
@ -557,6 +561,7 @@ export class Config {
private ideMode: boolean;
private readonly maxSessionTurns: number;
private readonly thinkingIdleThresholdMs: number;
private readonly sessionTokenLimit: number;
private readonly listExtensions: boolean;
private readonly overrideExtensions?: string[];
@ -683,6 +688,8 @@ export class Config {
this.fileDiscoveryService = params.fileDiscoveryService ?? null;
this.bugCommand = params.bugCommand;
this.maxSessionTurns = params.maxSessionTurns ?? -1;
this.thinkingIdleThresholdMs =
(params.thinkingIdleThresholdMinutes ?? 5) * 60 * 1000;
this.sessionTokenLimit = params.sessionTokenLimit ?? -1;
this.experimentalZedIntegration =
params.experimentalZedIntegration ?? false;
@ -1329,6 +1336,10 @@ export class Config {
return this.maxSessionTurns;
}
getThinkingIdleThresholdMs(): number {
return this.thinkingIdleThresholdMs;
}
getSessionTokenLimit(): number {
return this.sessionTokenLimit;
}
@ -1634,6 +1645,14 @@ export class Config {
return this.approvalMode;
}
/**
* Returns the approval mode that was active before entering plan mode.
* Falls back to DEFAULT if no pre-plan mode was recorded.
*/
getPrePlanMode(): ApprovalMode {
return this.prePlanMode ?? ApprovalMode.DEFAULT;
}
setApprovalMode(mode: ApprovalMode): void {
if (
!this.isTrustedFolder() &&
@ -1644,9 +1663,55 @@ export class Config {
'Cannot enable privileged approval modes in an untrusted folder.',
);
}
// Track the mode before entering plan mode so it can be restored later
if (mode === ApprovalMode.PLAN && this.approvalMode !== ApprovalMode.PLAN) {
this.prePlanMode = this.approvalMode;
} else if (
mode !== ApprovalMode.PLAN &&
this.approvalMode === ApprovalMode.PLAN
) {
this.prePlanMode = undefined;
}
this.approvalMode = mode;
}
/**
* Returns the file path for this session's plan file.
*/
getPlanFilePath(): string {
return Storage.getPlanFilePath(this.sessionId);
}
/**
* Saves a plan to disk for the current session.
*/
savePlan(plan: string): void {
const filePath = this.getPlanFilePath();
const dir = path.dirname(filePath);
fs.mkdirSync(dir, { recursive: true });
fs.writeFileSync(filePath, plan, 'utf-8');
}
/**
* Loads the plan for the current session, or returns undefined if none exists.
*/
loadPlan(): string | undefined {
const filePath = this.getPlanFilePath();
try {
return fs.readFileSync(filePath, 'utf-8');
} catch (error: unknown) {
if (
typeof error === 'object' &&
error !== null &&
'code' in error &&
(error as NodeJS.ErrnoException).code === 'ENOENT'
) {
return undefined;
}
throw error;
}
}
getInputFormat(): 'text' | 'stream-json' {
return this.inputFormat;
}

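A short usage sketch of the persistence pair added above, matching the behavior pinned down by the config tests earlier in this diff; the `config` and `otherConfig` instances are illustrative.

```
// Write the approved plan for this session (creates the plans dir if needed).
config.savePlan('# My Plan\n1. Step one\n2. Step two');

// Read it back later in the same session.
const plan = config.loadPlan(); // '# My Plan\n1. Step one\n2. Step two'

// A session with no saved plan yields undefined (ENOENT is swallowed);
// any other fs error, such as EACCES, is rethrown.
const missing = otherConfig.loadPlan(); // undefined
```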
View file

@ -18,6 +18,7 @@ const TMP_DIR_NAME = 'tmp';
const BIN_DIR_NAME = 'bin';
const PROJECT_DIR_NAME = 'projects';
const IDE_DIR_NAME = 'ide';
const PLANS_DIR_NAME = 'plans';
const DEBUG_DIR_NAME = 'debug';
const ARENA_DIR_NAME = 'arena';
@ -165,6 +166,14 @@ export class Storage {
return path.join(Storage.getRuntimeBaseDir(), IDE_DIR_NAME);
}
static getPlansDir(): string {
return path.join(Storage.getGlobalQwenDir(), PLANS_DIR_NAME);
}
static getPlanFilePath(sessionId: string): string {
return path.join(Storage.getPlansDir(), `${sessionId}.md`);
}
static getGlobalBinDir(): string {
return path.join(Storage.getGlobalQwenDir(), BIN_DIR_NAME);
}

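Combined with `Config.getPlanFilePath` above, the resulting layout is one markdown file per session under the global plans directory; the base path below is a placeholder for whatever `Storage.getGlobalQwenDir()` returns.

```
const p = Storage.getPlanFilePath('test-session-123');
// -> <globalQwenDir>/plans/test-session-123.md
```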
View file

@ -423,7 +423,7 @@ describe('AnthropicContentGenerator', () => {
const [anthropicRequest] =
anthropicState.lastCreateArgs as AnthropicCreateArgs;
expect(anthropicRequest).toEqual(
expect.objectContaining({ max_tokens: 32000 }),
expect.objectContaining({ max_tokens: 8000 }),
);
});
@ -488,7 +488,7 @@ describe('AnthropicContentGenerator', () => {
const [anthropicRequest] =
anthropicState.lastCreateArgs as AnthropicCreateArgs;
expect(anthropicRequest).toEqual(
expect.objectContaining({ max_tokens: 32000 }),
expect.objectContaining({ max_tokens: 8000 }),
);
});
});

View file

@ -33,7 +33,7 @@ import { DEFAULT_TIMEOUT } from '../openaiContentGenerator/constants.js';
import { createDebugLogger } from '../../utils/debugLogger.js';
import {
tokenLimit,
DEFAULT_OUTPUT_TOKEN_LIMIT,
CAPPED_DEFAULT_MAX_TOKENS,
hasExplicitOutputLimit,
} from '../tokenLimits.js';
@ -234,12 +234,23 @@ export class AnthropicContentGenerator implements ContentGenerator {
const modelLimit = tokenLimit(modelId, 'output');
const isKnownModel = hasExplicitOutputLimit(modelId);
const maxTokens =
userMaxTokens !== undefined && userMaxTokens !== null
? isKnownModel
? Math.min(userMaxTokens, modelLimit)
: userMaxTokens
: Math.min(modelLimit, DEFAULT_OUTPUT_TOKEN_LIMIT);
let maxTokens: number;
if (userMaxTokens !== undefined && userMaxTokens !== null) {
maxTokens = isKnownModel
? Math.min(userMaxTokens, modelLimit)
: userMaxTokens;
} else {
// No explicit user config — check env var, then use capped default.
const envVal = process.env['QWEN_CODE_MAX_OUTPUT_TOKENS'];
const envMaxTokens = envVal ? parseInt(envVal, 10) : NaN;
if (!isNaN(envMaxTokens) && envMaxTokens > 0) {
maxTokens = isKnownModel
? Math.min(envMaxTokens, modelLimit)
: envMaxTokens;
} else {
maxTokens = Math.min(modelLimit, CAPPED_DEFAULT_MAX_TOKENS);
}
}
return {
max_tokens: maxTokens,

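To summarize the resolution order implemented above, a worked example in comments; the 32K output limit for a known model and the 8K capped default come from other hunks in this commit, and the env value is illustrative.

```
// 1. Explicit user config wins, capped to the model limit for known models:
//    userMaxTokens = 100000, known limit 32000 -> max_tokens = 32000
// 2. Otherwise QWEN_CODE_MAX_OUTPUT_TOKENS is honored the same way:
//    env = '16000', known limit 32000 -> max_tokens = 16000
// 3. Otherwise the capped default applies:
//    min(32000, CAPPED_DEFAULT_MAX_TOKENS = 8000) -> max_tokens = 8000
```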
View file

@ -323,6 +323,7 @@ describe('Gemini Client (client.ts)', () => {
getWorkingDir: vi.fn().mockReturnValue('/test/dir'),
getFileService: vi.fn().mockReturnValue(fileService),
getMaxSessionTurns: vi.fn().mockReturnValue(0),
getThinkingIdleThresholdMs: vi.fn().mockReturnValue(5 * 60 * 1000),
getSessionTokenLimit: vi.fn().mockReturnValue(32000),
getNoBrowser: vi.fn().mockReturnValue(false),
getUsageStatisticsEnabled: vi.fn().mockReturnValue(true),
@ -427,6 +428,119 @@ describe('Gemini Client (client.ts)', () => {
});
});
describe('thinking block idle cleanup and latch', () => {
let mockChat: Partial<GeminiChat>;
beforeEach(() => {
const mockStream = (async function* () {
yield {
type: GeminiEventType.Content,
value: 'response',
};
})();
mockTurnRunFn.mockReturnValue(mockStream);
mockChat = {
addHistory: vi.fn(),
getHistory: vi.fn().mockReturnValue([]),
stripThoughtsFromHistory: vi.fn(),
stripThoughtsFromHistoryKeepRecent: vi.fn(),
};
client['chat'] = mockChat as GeminiChat;
});
it('should not strip thoughts on active session (< 5min idle)', async () => {
// Simulate a recent API completion (2 minutes ago — within default 5 min threshold)
client['lastApiCompletionTimestamp'] = Date.now() - 2 * 60 * 1000;
client['thinkingClearLatched'] = false;
const gen = client.sendMessageStream(
[{ text: 'Hello' }],
new AbortController().signal,
'prompt-1',
{ type: SendMessageType.UserQuery },
);
for await (const _ of gen) {
/* drain */
}
expect(
mockChat.stripThoughtsFromHistoryKeepRecent,
).not.toHaveBeenCalled();
});
it('should latch and strip thoughts after > 5min idle', async () => {
// Simulate an old API completion (10 minutes ago — exceeds default 5 min threshold)
client['lastApiCompletionTimestamp'] = Date.now() - 10 * 60 * 1000;
client['thinkingClearLatched'] = false;
const gen = client.sendMessageStream(
[{ text: 'Hello' }],
new AbortController().signal,
'prompt-2',
{ type: SendMessageType.UserQuery },
);
for await (const _ of gen) {
/* drain */
}
expect(client['thinkingClearLatched']).toBe(true);
expect(mockChat.stripThoughtsFromHistoryKeepRecent).toHaveBeenCalledWith(
1,
);
});
it('should keep stripping once latched even if idle < 5min', async () => {
// Pre-set latch with a recent timestamp (2 minutes ago — within threshold)
client['lastApiCompletionTimestamp'] = Date.now() - 2 * 60 * 1000;
client['thinkingClearLatched'] = true;
const gen = client.sendMessageStream(
[{ text: 'Hello' }],
new AbortController().signal,
'prompt-3',
{ type: SendMessageType.UserQuery },
);
for await (const _ of gen) {
/* drain */
}
expect(client['thinkingClearLatched']).toBe(true);
expect(mockChat.stripThoughtsFromHistoryKeepRecent).toHaveBeenCalledWith(
1,
);
});
it('should update lastApiCompletionTimestamp after API call', async () => {
client['lastApiCompletionTimestamp'] = null;
const before = Date.now();
const gen = client.sendMessageStream(
[{ text: 'Hello' }],
new AbortController().signal,
'prompt-4',
{ type: SendMessageType.UserQuery },
);
for await (const _ of gen) {
/* drain */
}
expect(client['lastApiCompletionTimestamp']).toBeGreaterThanOrEqual(
before,
);
});
it('should reset latch and timestamp on resetChat', async () => {
client['lastApiCompletionTimestamp'] = Date.now();
client['thinkingClearLatched'] = true;
await client.resetChat();
expect(client['thinkingClearLatched']).toBe(false);
expect(client['lastApiCompletionTimestamp']).toBeNull();
});
});
describe('tryCompressChat', () => {
const mockGetHistory = vi.fn();
@ -436,6 +550,7 @@ describe('Gemini Client (client.ts)', () => {
addHistory: vi.fn(),
setHistory: vi.fn(),
stripThoughtsFromHistory: vi.fn(),
stripThoughtsFromHistoryKeepRecent: vi.fn(),
} as unknown as GeminiChat;
});
@ -457,6 +572,7 @@ describe('Gemini Client (client.ts)', () => {
getHistory: vi.fn((_curated?: boolean) => chatHistory),
setHistory: vi.fn(),
stripThoughtsFromHistory: vi.fn(),
stripThoughtsFromHistoryKeepRecent: vi.fn(),
};
client['chat'] = mockOriginalChat as GeminiChat;
@ -1149,6 +1265,7 @@ describe('Gemini Client (client.ts)', () => {
addHistory: vi.fn(),
getHistory: vi.fn().mockReturnValue([]),
stripThoughtsFromHistory: vi.fn(),
stripThoughtsFromHistoryKeepRecent: vi.fn(),
} as unknown as GeminiChat;
client['chat'] = mockChat;
@ -1204,6 +1321,7 @@ Other open files:
addHistory: vi.fn(),
getHistory: vi.fn().mockReturnValue([]),
stripThoughtsFromHistory: vi.fn(),
stripThoughtsFromHistoryKeepRecent: vi.fn(),
};
client['chat'] = mockChat as GeminiChat;
@ -1260,6 +1378,7 @@ Other open files:
addHistory: vi.fn(),
getHistory: vi.fn().mockReturnValue([]),
stripThoughtsFromHistory: vi.fn(),
stripThoughtsFromHistoryKeepRecent: vi.fn(),
};
client['chat'] = mockChat as GeminiChat;
@ -1326,6 +1445,7 @@ hello
addHistory: vi.fn(),
getHistory: vi.fn().mockReturnValue([]),
stripThoughtsFromHistory: vi.fn(),
stripThoughtsFromHistoryKeepRecent: vi.fn(),
};
client['chat'] = mockChat as GeminiChat;
@ -1365,6 +1485,7 @@ Other open files:
addHistory: vi.fn(),
getHistory: vi.fn().mockReturnValue([]),
stripThoughtsFromHistory: vi.fn(),
stripThoughtsFromHistoryKeepRecent: vi.fn(),
};
client['chat'] = mockChat as GeminiChat;
@ -1410,6 +1531,7 @@ Other open files:
addHistory: vi.fn(),
getHistory: vi.fn().mockReturnValue([]),
stripThoughtsFromHistory: vi.fn(),
stripThoughtsFromHistoryKeepRecent: vi.fn(),
};
client['chat'] = mockChat as GeminiChat;
@ -1498,6 +1620,7 @@ Other open files:
addHistory: vi.fn(),
getHistory: vi.fn().mockReturnValue([]),
stripThoughtsFromHistory: vi.fn(),
stripThoughtsFromHistoryKeepRecent: vi.fn(),
};
client['chat'] = mockChat as GeminiChat;
@ -1555,6 +1678,7 @@ Other open files:
addHistory: vi.fn(),
getHistory: vi.fn().mockReturnValue([]),
stripThoughtsFromHistory: vi.fn(),
stripThoughtsFromHistoryKeepRecent: vi.fn(),
};
client['chat'] = mockChat as GeminiChat;
@ -1636,6 +1760,7 @@ Other open files:
{ role: 'user', parts: [{ text: 'previous message' }] },
]),
stripThoughtsFromHistory: vi.fn(),
stripThoughtsFromHistoryKeepRecent: vi.fn(),
};
client['chat'] = mockChat as GeminiChat;
});
@ -1889,6 +2014,7 @@ Other open files:
getHistory: vi.fn().mockReturnValue([]), // Default empty history
setHistory: vi.fn(),
stripThoughtsFromHistory: vi.fn(),
stripThoughtsFromHistoryKeepRecent: vi.fn(),
};
client['chat'] = mockChat as GeminiChat;
@ -2228,6 +2354,7 @@ Other open files:
addHistory: vi.fn(),
getHistory: vi.fn().mockReturnValue([]),
stripThoughtsFromHistory: vi.fn(),
stripThoughtsFromHistoryKeepRecent: vi.fn(),
};
client['chat'] = mockChat as GeminiChat;
@ -2265,6 +2392,7 @@ Other open files:
addHistory: vi.fn(),
getHistory: vi.fn().mockReturnValue([]),
stripThoughtsFromHistory: vi.fn(),
stripThoughtsFromHistoryKeepRecent: vi.fn(),
};
client['chat'] = mockChat as GeminiChat;
@ -2305,6 +2433,7 @@ Other open files:
addHistory: vi.fn(),
getHistory: vi.fn().mockReturnValue([]),
stripThoughtsFromHistory: vi.fn(),
stripThoughtsFromHistoryKeepRecent: vi.fn(),
};
client['chat'] = mockChat as GeminiChat;
@ -2329,6 +2458,7 @@ Other open files:
getHistory: vi.fn().mockReturnValue([]),
setHistory: vi.fn(),
stripThoughtsFromHistory: vi.fn(),
stripThoughtsFromHistoryKeepRecent: vi.fn(),
stripOrphanedUserEntriesFromHistory: vi.fn(),
};
client['chat'] = mockChat as GeminiChat;
@ -2361,6 +2491,7 @@ Other open files:
getHistory: vi.fn().mockReturnValue([]),
setHistory: vi.fn(),
stripThoughtsFromHistory: vi.fn(),
stripThoughtsFromHistoryKeepRecent: vi.fn(),
stripOrphanedUserEntriesFromHistory: vi.fn(),
};
client['chat'] = mockChat as GeminiChat;
@ -2405,6 +2536,7 @@ Other open files:
addHistory: vi.fn(),
getHistory: vi.fn().mockReturnValue([]),
stripThoughtsFromHistory: vi.fn(),
stripThoughtsFromHistoryKeepRecent: vi.fn(),
};
client['chat'] = mockChat as GeminiChat;
});

View file

@ -126,6 +126,25 @@ export class GeminiClient {
*/
private hasFailedCompressionAttempt = false;
/**
* Timestamp (epoch ms) of the last completed API call.
* Used to detect idle periods for thinking block cleanup.
* Starts as null; on the first query there is no prior thinking to clean,
* so the idle check is skipped until the first API call completes.
*/
private lastApiCompletionTimestamp: number | null = null;
/**
* Sticky-on latch for clearing thinking blocks from prior turns.
* Triggered when idle exceeds the configured threshold (default 5 min,
* aligned with provider prompt-cache TTL). Once latched, stays true to
* prevent oscillation: without it, thinking would accumulate, get
* stripped, then accumulate again, causing the message prefix to change
* repeatedly (bad for provider-side prompt caching and wastes context).
* Reset on /clear (resetChat).
*/
private thinkingClearLatched = false;
constructor(private readonly config: Config) {
this.loopDetector = new LoopDetectionService(config);
}
@ -199,6 +218,9 @@ export class GeminiClient {
}
async resetChat(): Promise<void> {
// Reset thinking clear latch — fresh chat, no prior thinking to clean up
this.thinkingClearLatched = false;
this.lastApiCompletionTimestamp = null;
await this.startChat();
}
@ -537,8 +559,27 @@ export class GeminiClient {
// record user message for session management
this.config.getChatRecordingService()?.recordUserMessage(request);
// strip thoughts from history before sending the message
this.stripThoughtsFromHistory();
// Thinking block cross-turn retention with idle cleanup:
// - Active session (< threshold idle): keep thinking blocks for reasoning coherence
// - Idle > threshold: clear old thinking, keep only last 1 turn to free context
// - Latch: once triggered, never revert — prevents oscillation
if (
!this.thinkingClearLatched &&
this.lastApiCompletionTimestamp !== null
) {
const thresholdMs = this.config.getThinkingIdleThresholdMs();
const idleMs = Date.now() - this.lastApiCompletionTimestamp;
if (idleMs > thresholdMs) {
this.thinkingClearLatched = true;
debugLogger.debug(
`Thinking clear latched: idle ${Math.round(idleMs / 1000)}s > threshold ${thresholdMs / 1000}s`,
);
}
}
if (this.thinkingClearLatched) {
this.getChat().stripThoughtsFromHistoryKeepRecent(1);
debugLogger.debug('Stripped old thinking blocks (keeping last 1 turn)');
}
}
if (messageType !== SendMessageType.Retry) {
this.sessionTurnCount++;
@ -680,6 +721,7 @@ export class GeminiClient {
if (arenaAgentClient) {
await arenaAgentClient.reportError('Loop detected');
}
this.lastApiCompletionTimestamp = Date.now();
return turn;
}
}
@ -698,9 +740,14 @@ export class GeminiClient {
: 'Unknown error';
await arenaAgentClient.reportError(errorMsg);
}
this.lastApiCompletionTimestamp = Date.now();
return turn;
}
}
// Track API completion time for thinking block idle cleanup
this.lastApiCompletionTimestamp = Date.now();
// Fire Stop hook through MessageBus (only if hooks are enabled and registered)
// This must be done before any early returns to ensure hooks are always triggered
if (

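Putting the latch pieces together, a timeline sketch with the default 5-minute threshold; the timings are illustrative, and the transitions mirror the unit tests above.

```
// t = 0       API call completes -> lastApiCompletionTimestamp = now
// t = 2 min   next message: idle 2 min < 5 min, thinking blocks are kept
// t = 12 min  next message: idle 10 min > 5 min, latch flips on and
//             stripThoughtsFromHistoryKeepRecent(1) runs on this and
//             every later turn (the latch never reverts)
// /clear      resetChat() resets both the latch and the timestamp
```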
View file

@ -1923,6 +1923,150 @@ describe('GeminiChat', async () => {
});
});
describe('stripThoughtsFromHistoryKeepRecent', () => {
it('should keep the most recent N model turns with thoughts', () => {
chat.setHistory([
{ role: 'user', parts: [{ text: 'msg1' }] },
{
role: 'model',
parts: [
{ text: 'old thinking', thought: true },
{ text: 'response1' },
],
},
{ role: 'user', parts: [{ text: 'msg2' }] },
{
role: 'model',
parts: [
{ text: 'mid thinking', thought: true },
{ text: 'response2' },
],
},
{ role: 'user', parts: [{ text: 'msg3' }] },
{
role: 'model',
parts: [
{ text: 'recent thinking', thought: true },
{ text: 'response3' },
],
},
]);
chat.stripThoughtsFromHistoryKeepRecent(1);
const history = chat.getHistory();
// First two model turns should have thoughts stripped
expect(history[1]!.parts).toEqual([{ text: 'response1' }]);
expect(history[3]!.parts).toEqual([{ text: 'response2' }]);
// Last model turn should keep thoughts
expect(history[5]!.parts).toEqual([
{ text: 'recent thinking', thought: true },
{ text: 'response3' },
]);
});
it('should not strip anything when keepTurns >= model turns with thoughts', () => {
chat.setHistory([
{ role: 'user', parts: [{ text: 'msg1' }] },
{
role: 'model',
parts: [{ text: 'thinking', thought: true }, { text: 'response' }],
},
]);
chat.stripThoughtsFromHistoryKeepRecent(1);
const history = chat.getHistory();
expect(history[1]!.parts).toEqual([
{ text: 'thinking', thought: true },
{ text: 'response' },
]);
});
it('should remove model content objects that become empty after stripping', () => {
chat.setHistory([
{ role: 'user', parts: [{ text: 'msg1' }] },
{
role: 'model',
parts: [{ text: 'only thinking', thought: true }],
},
{ role: 'user', parts: [{ text: 'msg2' }] },
{
role: 'model',
parts: [
{ text: 'recent thinking', thought: true },
{ text: 'response' },
],
},
]);
chat.stripThoughtsFromHistoryKeepRecent(1);
const history = chat.getHistory();
// The first model turn (only thoughts) should be removed entirely
expect(history).toHaveLength(3);
expect(history[0]!.parts).toEqual([{ text: 'msg1' }]);
expect(history[1]!.parts).toEqual([{ text: 'msg2' }]);
expect(history[2]!.parts).toEqual([
{ text: 'recent thinking', thought: true },
{ text: 'response' },
]);
});
it('should also strip thoughtSignature from stripped turns', () => {
chat.setHistory([
{ role: 'user', parts: [{ text: 'msg1' }] },
{
role: 'model',
parts: [
{ text: 'old thinking', thought: true },
{
text: 'with sig',
thoughtSignature: 'sig1',
} as unknown as { text: string; thoughtSignature: string },
{ text: 'response1' },
],
},
{ role: 'user', parts: [{ text: 'msg2' }] },
{
role: 'model',
parts: [
{ text: 'recent thinking', thought: true },
{ text: 'response2' },
],
},
]);
chat.stripThoughtsFromHistoryKeepRecent(1);
const history = chat.getHistory();
// First model turn: thought stripped, thoughtSignature stripped
expect(history[1]!.parts).toEqual([
{ text: 'with sig' },
{ text: 'response1' },
]);
expect(
(history[1]!.parts![0] as { thoughtSignature?: string })
.thoughtSignature,
).toBeUndefined();
});
it('should handle keepTurns=0 by stripping all thoughts', () => {
chat.setHistory([
{ role: 'user', parts: [{ text: 'msg1' }] },
{
role: 'model',
parts: [{ text: 'thinking', thought: true }, { text: 'response' }],
},
]);
chat.stripThoughtsFromHistoryKeepRecent(0);
const history = chat.getHistory();
expect(history[1]!.parts).toEqual([{ text: 'response' }]);
});
});
describe('stripOrphanedUserEntriesFromHistory', () => {
it('should pop a single trailing user entry', () => {
chat.setHistory([

View file

@ -16,13 +16,14 @@ import type {
Tool,
GenerateContentResponseUsageMetadata,
} from '@google/genai';
import { createUserContent } from '@google/genai';
import { createUserContent, FinishReason } from '@google/genai';
import { retryWithBackoff } from '../utils/retry.js';
import { getErrorStatus } from '../utils/errors.js';
import { createDebugLogger } from '../utils/debugLogger.js';
import { parseAndFormatApiError } from '../utils/errorParsing.js';
import { isRateLimitError, type RetryInfo } from '../utils/rateLimit.js';
import type { Config } from '../config/config.js';
import { ESCALATED_MAX_TOKENS } from './tokenLimits.js';
import { hasCycleInSchema } from '../tools/tools.js';
import type { StructuredError } from './turn.js';
import {
@ -355,6 +356,17 @@ export class GeminiChat {
cgConfig?.maxRetries ?? RATE_LIMIT_RETRY_OPTIONS.maxRetries;
const extraRetryErrorCodes = cgConfig?.retryErrorCodes;
// Max output tokens escalation: when no user/env override is set,
// the capped default (8K) is used. If the model hits MAX_TOKENS,
// retry once with escalated limit (64K).
let maxTokensEscalated = false;
const hasUserMaxTokensOverride =
(cgConfig?.samplingParams?.max_tokens !== undefined &&
cgConfig?.samplingParams?.max_tokens !== null) ||
!!process.env['QWEN_CODE_MAX_OUTPUT_TOKENS'];
let lastFinishReason: string | undefined;
for (
let attempt = 0;
attempt < INVALID_CONTENT_RETRY_OPTIONS.maxAttempts;
@ -376,7 +388,10 @@ export class GeminiChat {
prompt_id,
);
lastFinishReason = undefined;
for await (const chunk of stream) {
const fr = chunk.candidates?.[0]?.finishReason;
if (fr) lastFinishReason = fr;
yield { type: StreamEventType.CHUNK, value: chunk };
}
@ -481,6 +496,49 @@ export class GeminiChat {
}
}
// Max output tokens escalation: if the retry loop succeeded with
// the capped default (8K) but hit MAX_TOKENS, retry once at 64K.
// Placed outside the retry loop so that any errors from the
// escalated stream propagate directly (not caught by retry logic).
if (
lastError === null &&
lastFinishReason === FinishReason.MAX_TOKENS &&
!maxTokensEscalated &&
!hasUserMaxTokensOverride
) {
maxTokensEscalated = true;
debugLogger.info(
`Output truncated at capped default. Escalating to ${ESCALATED_MAX_TOKENS} tokens.`,
);
// Remove partial model response from history
// (processStreamResponse already pushed it)
if (
self.history.length > 0 &&
self.history[self.history.length - 1].role === 'model'
) {
self.history.pop();
}
// Signal UI to discard partial output
yield { type: StreamEventType.RETRY };
// Retry with escalated max_tokens
const escalatedParams: SendMessageParameters = {
...params,
config: {
...params.config,
maxOutputTokens: ESCALATED_MAX_TOKENS,
},
};
const escalatedStream = await self.makeApiCallAndProcessStream(
model,
requestContents,
escalatedParams,
prompt_id,
);
for await (const chunk of escalatedStream) {
yield { type: StreamEventType.CHUNK, value: chunk };
}
}
if (lastError) {
if (lastError instanceof InvalidStreamError) {
const totalAttempts = invalidStreamRetryCount + 1;
@ -625,6 +683,89 @@ export class GeminiChat {
.filter((content) => content.parts && content.parts.length > 0);
}
/**
* Strip thought parts from history, keeping the most recent `keepTurns`
* model turns that contain thinking blocks intact.
*
* Selection is based on thought-containing turns specifically (not all
* model turns) so the most recent reasoning chain is always preserved
* even if later model turns happen to have no thinking.
*
* Used for idle cleanup: after exceeding the configured idle threshold,
* the old thinking blocks are no longer useful for reasoning coherence
* but still consume context tokens.
*/
stripThoughtsFromHistoryKeepRecent(keepTurns: number): void {
keepTurns = Number.isFinite(keepTurns)
? Math.max(0, Math.floor(keepTurns))
: 0;
// Find indices of model turns that contain thought parts
const modelTurnIndices: number[] = [];
for (let i = 0; i < this.history.length; i++) {
const content = this.history[i];
if (
content.role === 'model' &&
content.parts?.some(
(part) =>
part &&
typeof part === 'object' &&
'thought' in part &&
part.thought,
)
) {
modelTurnIndices.push(i);
}
}
// Determine which model turns to keep (the most recent `keepTurns`)
const turnsToStrip = new Set(
modelTurnIndices.slice(
0,
Math.max(0, modelTurnIndices.length - keepTurns),
),
);
if (turnsToStrip.size === 0) return;
this.history = this.history
.map((content, index) => {
if (!turnsToStrip.has(index) || !content.parts) return content;
// Strip thought parts from this turn
const filteredParts = content.parts
.filter(
(part) =>
!(
part &&
typeof part === 'object' &&
'thought' in part &&
part.thought
),
)
.map((part) => {
if (
part &&
typeof part === 'object' &&
'thoughtSignature' in part
) {
const newPart = { ...part };
delete (newPart as { thoughtSignature?: string })
.thoughtSignature;
return newPart;
}
return part;
});
return {
...content,
parts: filteredParts,
};
})
// Remove Content objects that have no parts left after filtering
.filter((content) => content.parts && content.parts.length > 0);
}
/**
* Pop all orphaned trailing user entries from chat history.
* In a valid conversation the last entry is always a model response;

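A compact before/after of the new helper, using the same shapes as the tests above; the types are simplified for illustration.

```
type Part = { text: string; thought?: boolean; thoughtSignature?: string };
type Content = { role: 'user' | 'model'; parts: Part[] };

const history: Content[] = [
  { role: 'user', parts: [{ text: 'msg1' }] },
  {
    role: 'model',
    parts: [{ text: 'old thinking', thought: true }, { text: 'response1' }],
  },
  { role: 'user', parts: [{ text: 'msg2' }] },
  {
    role: 'model',
    parts: [{ text: 'recent thinking', thought: true }, { text: 'response2' }],
  },
];

// After chat.stripThoughtsFromHistoryKeepRecent(1):
// - history[1].parts becomes [{ text: 'response1' }] (thought stripped,
//   any thoughtSignature deleted; a turn left with zero parts is dropped)
// - history[3] is untouched (most recent thought-bearing model turn)
```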
View file

@ -786,9 +786,9 @@ describe('DashScopeOpenAICompatibleProvider', () => {
const result = provider.buildRequest(request, 'test-prompt-id');
// Should set conservative default (min of model limit and DEFAULT_OUTPUT_TOKEN_LIMIT)
// qwen3-max has 32K output limit, so min(32K, 32K) = 32K
expect(result.max_tokens).toBe(32000);
// Should set capped default (min of model limit and CAPPED_DEFAULT_MAX_TOKENS)
// qwen3-max has 32K output limit, so min(32K, 8K) = 8K
expect(result.max_tokens).toBe(8000);
});
it('should set conservative max_tokens when null is provided', () => {
@ -800,8 +800,8 @@ describe('DashScopeOpenAICompatibleProvider', () => {
const result = provider.buildRequest(request, 'test-prompt-id');
// null is treated as not configured, so set conservative default
expect(result.max_tokens).toBe(32000);
// null is treated as not configured, so set capped default: min(32K, 8K) = 8K
expect(result.max_tokens).toBe(8000);
});
it('should respect user max_tokens for unknown models', () => {

View file

@ -110,8 +110,8 @@ export class DashScopeOpenAICompatibleProvider extends DefaultOpenAICompatiblePr
}
// Apply output token limits using parent class logic
// Uses conservative default (min of model limit and DEFAULT_OUTPUT_TOKEN_LIMIT)
// to preserve input quota when user hasn't explicitly configured max_tokens
// Uses capped default (min of model limit and CAPPED_DEFAULT_MAX_TOKENS=8K)
// Requests hitting the cap get one clean retry at 64K (geminiChat.ts)
const requestWithTokenLimits = this.applyOutputTokenLimit(request);
const extraBody = this.contentGeneratorConfig.extra_body;

View file

@ -204,9 +204,9 @@ describe('DefaultOpenAICompatibleProvider', () => {
'prompt-id',
);
// Should set conservative default (min of model limit and DEFAULT_OUTPUT_TOKEN_LIMIT)
// GPT-4 has 16K output limit, so min(16K, 32K) = 16K
expect(result.max_tokens).toBe(16384);
// Should set capped default (min of model limit and CAPPED_DEFAULT_MAX_TOKENS)
// GPT-4 has 16K output limit, so min(16K, 8K) = 8K
expect(result.max_tokens).toBe(8000);
});
it('should respect user max_tokens for unknown models (deployment aliases, self-hosted)', () => {
@ -223,8 +223,8 @@ describe('DefaultOpenAICompatibleProvider', () => {
expect(result.max_tokens).toBe(100000);
});
it('should use conservative default for unknown models when max_tokens not configured', () => {
// Unknown models without user config: use DEFAULT_OUTPUT_TOKEN_LIMIT
it('should use capped default for unknown models when max_tokens not configured', () => {
// Unknown models without user config: use CAPPED_DEFAULT_MAX_TOKENS
const request: OpenAI.Chat.ChatCompletionCreateParams = {
model: 'custom-deployment-alias',
messages: [{ role: 'user', content: 'Hello' }],
@ -232,8 +232,8 @@ describe('DefaultOpenAICompatibleProvider', () => {
const result = provider.buildRequest(request, 'prompt-id');
// Uses conservative default (32K)
expect(result.max_tokens).toBe(32000);
// Uses capped default (8K)
expect(result.max_tokens).toBe(8000);
});
it('should cap max_tokens for known models to avoid API errors', () => {
@ -259,8 +259,8 @@ describe('DefaultOpenAICompatibleProvider', () => {
const result = provider.buildRequest(request, 'prompt-id');
// GPT-4 has 16K output limit, so conservative default is still 16K
expect(result.max_tokens).toBe(16384);
// GPT-4 has 16K output limit, capped default is 8K: min(16K, 8K) = 8K
expect(result.max_tokens).toBe(8000);
});
it('should preserve all sampling parameters', () => {
@ -303,7 +303,7 @@ describe('DefaultOpenAICompatibleProvider', () => {
// Should set conservative max_tokens default
expect(result.model).toBe('gpt-4');
expect(result.messages).toEqual(minimalRequest.messages);
expect(result.max_tokens).toBe(16384); // GPT-4 has 16K limit, min(16K, 32K) = 16K
expect(result.max_tokens).toBe(8000); // GPT-4 has 16K limit, min(16K, 8K) = 8K
});
it('should handle streaming requests', () => {
@ -319,7 +319,7 @@ describe('DefaultOpenAICompatibleProvider', () => {
expect(result.model).toBe('gpt-4');
expect(result.messages).toEqual(streamingRequest.messages);
expect(result.stream).toBe(true);
expect(result.max_tokens).toBe(16384); // GPT-4 has 16K limit, min(16K, 32K) = 16K
expect(result.max_tokens).toBe(8000); // GPT-4 has 16K limit, min(16K, 8K) = 8K
});
it('should not modify the original request object', () => {
@ -363,7 +363,7 @@ describe('DefaultOpenAICompatibleProvider', () => {
expect(result).toEqual({
...originalRequest,
max_tokens: 16384, // GPT-4 has 16K limit, min(16K, 32K) = 16K
max_tokens: 8000, // GPT-4 has 16K limit, min(16K, 8K) = 8K
custom_param: 'custom_value',
nested: { key: 'value' },
});
@ -382,7 +382,7 @@ describe('DefaultOpenAICompatibleProvider', () => {
expect(result.model).toBe('gpt-4');
expect(result.messages).toEqual(originalRequest.messages);
expect(result.temperature).toBe(0.7);
expect(result.max_tokens).toBe(16384); // GPT-4 has 16K limit, min(16K, 32K) = 16K
expect(result.max_tokens).toBe(8000); // GPT-4 has 16K limit, min(16K, 8K) = 8K
expect(result).not.toHaveProperty('custom_param');
});
});

View file

@ -7,7 +7,7 @@ import type { OpenAICompatibleProvider } from './types.js';
import { buildRuntimeFetchOptions } from '../../../utils/runtimeFetchOptions.js';
import {
tokenLimit,
DEFAULT_OUTPUT_TOKEN_LIMIT,
CAPPED_DEFAULT_MAX_TOKENS,
hasExplicitOutputLimit,
} from '../../tokenLimits.js';
@ -101,18 +101,19 @@ export class DefaultOpenAICompatibleProvider
* - For unknown models (deployment aliases, self-hosted): respect user's
* configured value entirely (backend may support larger limits)
* 2. If user didn't configure max_tokens:
* - Use min(modelLimit, DEFAULT_OUTPUT_TOKEN_LIMIT)
* - This provides a conservative default (32K) that avoids truncating output
* while preserving input quota (not occupying too much context window)
* - Check QWEN_CODE_MAX_OUTPUT_TOKENS env var first
* - Otherwise use min(modelLimit, CAPPED_DEFAULT_MAX_TOKENS=8K)
* - Requests hitting the 8K cap get one clean retry at 64K (geminiChat.ts)
* 3. If model has no specific limit (tokenLimit returns default):
* - Still apply DEFAULT_OUTPUT_TOKEN_LIMIT as safeguard
* - Still apply CAPPED_DEFAULT_MAX_TOKENS as safeguard
*
* Examples:
* - User sets 4K, known model limit 64K -> uses 4K (respects user preference)
* - User sets 100K, known model limit 64K -> uses 64K (capped to avoid API error)
* - User sets 100K, unknown model -> uses 100K (respects user, backend may support it)
* - User not set, model limit 64K -> uses 32K (conservative default)
* - User not set, model limit 8K -> uses 8K (model limit is lower)
* - User not set, model limit 64K -> uses 8K (capped default for slot optimization)
* - User not set, model limit 4K -> uses 4K (model limit is lower)
* - User not set, env QWEN_CODE_MAX_OUTPUT_TOKENS=16000 -> uses 16K
*
* @param request - The chat completion request parameters
* @returns The request with max_tokens adjusted according to the logic
@ -140,9 +141,18 @@ export class DefaultOpenAICompatibleProvider
effectiveMaxTokens = userMaxTokens;
}
} else {
// User didn't configure, use conservative default:
// min(model-specific limit, DEFAULT_OUTPUT_TOKEN_LIMIT)
effectiveMaxTokens = Math.min(modelLimit, DEFAULT_OUTPUT_TOKEN_LIMIT);
// No explicit user config — check env var, then use capped default.
// Capped default (8K) reduces GPU slot over-reservation by ~4×.
// Requests hitting the cap get one clean retry at 64K (geminiChat.ts).
const envVal = process.env['QWEN_CODE_MAX_OUTPUT_TOKENS'];
const envMaxTokens = envVal ? parseInt(envVal, 10) : NaN;
if (!isNaN(envMaxTokens) && envMaxTokens > 0) {
effectiveMaxTokens = isKnownModel
? Math.min(envMaxTokens, modelLimit)
: envMaxTokens;
} else {
effectiveMaxTokens = Math.min(modelLimit, CAPPED_DEFAULT_MAX_TOKENS);
}
}
return {

View file

@ -11,6 +11,13 @@ export type TokenLimitType = 'input' | 'output';
export const DEFAULT_TOKEN_LIMIT: TokenCount = 131_072; // 128K (power-of-two)
export const DEFAULT_OUTPUT_TOKEN_LIMIT: TokenCount = 32_000; // 32K tokens
// Capped default for slot-reservation optimization. 99% of outputs are under 5K
// tokens, so 32K defaults over-reserve 4-6× slot capacity. With the cap
// enabled, <1% of requests hit the limit; those get one clean retry at 64K
// (see geminiChat.ts max_output_tokens escalation).
export const CAPPED_DEFAULT_MAX_TOKENS: TokenCount = 8_000;
export const ESCALATED_MAX_TOKENS: TokenCount = 64_000;
/**
* Accurate numeric limits:
* - power-of-two approximations (128K -> 131072, 256K -> 262144, etc.)

View file

@ -280,8 +280,13 @@ export class Turn {
return;
}
// Handle the new RETRY event
// Handle the new RETRY event: clear accumulated state from the
// previous attempt to avoid duplicate tool calls and stale metadata.
if (streamEvent.type === 'retry') {
this.pendingToolCalls.length = 0;
this.pendingCitations.clear();
this.debugResponses = [];
this.finishReason = undefined;
yield {
type: GeminiEventType.Retry,
retryInfo: streamEvent.retryInfo,

View file

@ -420,6 +420,7 @@ export class HookRunner {
}
}
const killedBySignal = exitCode === null;
resolve({
hookConfig,
eventName,
@ -427,8 +428,11 @@ export class HookRunner {
output,
stdout,
stderr,
exitCode: exitCode || EXIT_CODE_SUCCESS,
exitCode: exitCode ?? -1,
duration,
...(killedBySignal && {
error: new Error('Hook killed by signal'),
}),
});
});

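The behavioral fix above is easiest to see with the two expressions side by side; this assumes EXIT_CODE_SUCCESS is 0, as its name suggests.

```
const EXIT_CODE_SUCCESS = 0; // assumption for illustration
const exitCode: number | null = null; // child was killed by a signal

const oldReported = exitCode || EXIT_CODE_SUCCESS; // 0: a killed hook looked successful
const newReported = exitCode ?? -1; // -1, alongside error: new Error('Hook killed by signal')
```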
View file

@ -18,9 +18,11 @@ describe('ExitPlanModeTool', () => {
approvalMode = ApprovalMode.PLAN;
mockConfig = {
getApprovalMode: vi.fn(() => approvalMode),
getPrePlanMode: vi.fn(() => ApprovalMode.DEFAULT),
setApprovalMode: vi.fn((mode: ApprovalMode) => {
approvalMode = mode;
}),
savePlan: vi.fn(),
} as unknown as Config;
tool = new ExitPlanModeTool(mockConfig);
@ -147,6 +149,9 @@ describe('ExitPlanModeTool', () => {
ApprovalMode.DEFAULT,
);
expect(approvalMode).toBe(ApprovalMode.DEFAULT);
// Plan should be saved to disk
expect(mockConfig.savePlan).toHaveBeenCalledWith(params.plan);
});
it('should request confirmation with plan details', async () => {
@ -173,6 +178,29 @@ describe('ExitPlanModeTool', () => {
expect(approvalMode).toBe(ApprovalMode.AUTO_EDIT);
});
it('should set DEFAULT mode on ProceedOnce regardless of pre-plan mode', async () => {
// Even if pre-plan mode was AUTO_EDIT, ProceedOnce ("manually approve
// edits") should always set DEFAULT to match the option label semantics.
(mockConfig.getPrePlanMode as ReturnType<typeof vi.fn>).mockReturnValue(
ApprovalMode.AUTO_EDIT,
);
const params: ExitPlanModeParams = { plan: 'Restore test' };
const signal = new AbortController().signal;
const invocation = tool.build(params);
const confirmation = await invocation.getConfirmationDetails(signal);
if (confirmation) {
await confirmation.onConfirm(ToolConfirmationOutcome.ProceedOnce);
}
expect(mockConfig.setApprovalMode).toHaveBeenCalledWith(
ApprovalMode.DEFAULT,
);
expect(approvalMode).toBe(ApprovalMode.DEFAULT);
});
it('should remain in plan mode when confirmation is rejected', async () => {
const params: ExitPlanModeParams = {
plan: 'Remain in planning',
@ -199,6 +227,9 @@ describe('ExitPlanModeTool', () => {
ApprovalMode.PLAN,
);
expect(approvalMode).toBe(ApprovalMode.PLAN);
// Plan should NOT be saved when rejected
expect(mockConfig.savePlan).not.toHaveBeenCalled();
});
it('should have correct description', () => {

View file

@ -147,6 +147,15 @@ class ExitPlanModeToolInvocation extends BaseToolInvocation<
};
}
// Persist the approved plan to disk
try {
this.config.savePlan(plan);
} catch (error) {
debugLogger.warn(
`[ExitPlanModeTool] Failed to save plan to disk: ${error instanceof Error ? error.message : String(error)}`,
);
}
const llmMessage = `User has approved your plan. You can now start coding. Start with updating your todo list if applicable.`;
const displayMessage = 'User approved the plan.';

View file

@ -188,7 +188,7 @@
"enableFollowupSuggestions": {
"description": "Show context-aware follow-up suggestions after task completion. Press Tab or Right Arrow to accept, Enter to accept and submit.",
"type": "boolean",
"default": true
"default": false
},
"enableCacheSharing": {
"description": "Use cache-aware forked queries for suggestion generation. Reduces cost on providers that support prefix caching (experimental).",
@ -393,6 +393,11 @@
"default": true
}
}
},
"gapThresholdMinutes": {
"description": "Minutes of inactivity after which retained thinking blocks are cleared to free context tokens. Aligns with provider prompt-cache TTL.",
"type": "number",
"default": 5
}
}
},