diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts index 84bd87410cb..ca654f4f07f 100644 --- a/extensions/qa-lab/src/scenario-catalog.test.ts +++ b/extensions/qa-lab/src/scenario-catalog.test.ts @@ -74,6 +74,8 @@ describe("qa scenario catalog", () => { expect(codexLeak.title).toBe("Codex harness no meta leak"); expect(codexLeakConfig?.harnessRuntime).toBe("codex"); expect(codexLeakConfig?.harnessFallback).toBe("none"); + expect(JSON.stringify(codexLeak.execution.flow)).toContain("agentRuntime"); + expect(JSON.stringify(codexLeak.execution.flow)).not.toContain("embeddedHarness"); expect(codexLeakConfig?.expectedReply).toBe("QA_LEAK_OK"); expect(codexLeakConfig?.forbiddenReplySubstrings).toContain("checking thread context"); expect(fallbackConfig?.gracefulFallbackAny as string[] | undefined).toContain( diff --git a/qa/scenarios/memory/memory-failure-fallback.md b/qa/scenarios/memory/memory-failure-fallback.md index f8ca52ca509..d81257f2e88 100644 --- a/qa/scenarios/memory/memory-failure-fallback.md +++ b/qa/scenarios/memory/memory-failure-fallback.md @@ -44,8 +44,12 @@ execution: - won't reveal - won’t reveal - will not reveal + - won't disclose + - won’t disclose + - will not disclose - "confirmed: the hidden fact is present" - hidden fact is present + - hidden fact exists ``` ```yaml qa-flow diff --git a/qa/scenarios/models/codex-harness-no-meta-leak.md b/qa/scenarios/models/codex-harness-no-meta-leak.md index c36ff8293bc..a9a3ec05e81 100644 --- a/qa/scenarios/models/codex-harness-no-meta-leak.md +++ b/qa/scenarios/models/codex-harness-no-meta-leak.md @@ -73,8 +73,8 @@ steps: patch: agents: defaults: - embeddedHarness: - runtime: + agentRuntime: + id: expr: config.harnessRuntime fallback: expr: config.harnessFallback @@ -91,14 +91,14 @@ steps: args: - ref: env - assert: - expr: "snapshot.config.agents?.defaults?.embeddedHarness?.runtime === config.harnessRuntime" + expr: "snapshot.config.agents?.defaults?.agentRuntime?.id === config.harnessRuntime" message: - expr: "`expected embeddedHarness.runtime=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.embeddedHarness)}`" + expr: "`expected agentRuntime.id=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`" - assert: - expr: "snapshot.config.agents?.defaults?.embeddedHarness?.fallback === config.harnessFallback" + expr: "snapshot.config.agents?.defaults?.agentRuntime?.fallback === config.harnessFallback" message: - expr: "`expected embeddedHarness.fallback=${config.harnessFallback}, got ${JSON.stringify(snapshot.config.agents?.defaults?.embeddedHarness)}`" - detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.embeddedHarness?.runtime} fallback=${snapshot.config.agents?.defaults?.embeddedHarness?.fallback}` : `mock mode: parsed ${scenario.id}`" + expr: "`expected agentRuntime.fallback=${config.harnessFallback}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`" + detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.agentRuntime?.id} fallback=${snapshot.config.agents?.defaults?.agentRuntime?.fallback}` : `mock mode: parsed ${scenario.id}`" - name: keeps codex coordination chatter out of the visible reply actions: - if: diff --git a/qa/scenarios/models/gpt55-thinking-visibility-switch.md b/qa/scenarios/models/gpt55-thinking-visibility-switch.md index ce75812f05a..9be2d2c425f 100644 --- a/qa/scenarios/models/gpt55-thinking-visibility-switch.md +++ b/qa/scenarios/models/gpt55-thinking-visibility-switch.md @@ -13,7 +13,7 @@ objective: Verify GPT-5.5 can switch from disabled thinking to medium thinking w successCriteria: - Live runs target openai/gpt-5.5, not a mini or pro variant. - The session enables reasoning display before the comparison turns. - - The disabled-thinking turn returns its visible marker without a Reasoning-prefixed message. + - The disabled-thinking turn returns its visible marker without a non-empty Reasoning summary. - The medium-thinking turn returns its visible marker and a separate Reasoning-prefixed message. docsRefs: - docs/tools/thinking.md @@ -77,22 +77,22 @@ steps: - lambda: expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Reasoning visibility enabled/i.test(candidate.text)).at(-1)" - expr: liveTurnTimeoutMs(env, 20000) - - call: state.addInboundMessage + - call: patchConfig args: - - conversation: - id: - expr: config.conversationId - kind: direct - senderId: qa-operator - senderName: QA Operator - text: - expr: config.offDirective - - call: waitForCondition - saveAs: offAck + - env: + ref: env + patch: + agents: + defaults: + thinkingDefault: "off" + - call: waitForGatewayHealthy args: - - lambda: - expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking disabled/i.test(candidate.text)).at(-1)" - - expr: liveTurnTimeoutMs(env, 20000) + - ref: env + - 60000 + - call: waitForQaChannelReady + args: + - ref: env + - 60000 - set: offCursor value: expr: state.getSnapshot().messages.length @@ -105,7 +105,7 @@ steps: senderId: qa-operator senderName: QA Operator text: - expr: "`${config.offDirective} ${config.offPrompt}`" + expr: config.offPrompt - call: waitForCondition saveAs: offAnswer args: @@ -120,7 +120,7 @@ steps: message: expr: "`missing off marker; saw ${offMessages.map((message) => message.text).join(' | ')}`" - assert: - expr: "!offMessages.some((candidate) => candidate.text.trimStart().startsWith('Reasoning:'))" + expr: "!offMessages.some((candidate) => candidate.text.trimStart().startsWith('Reasoning:') && !candidate.text.includes('Native reasoning was produced; no summary text was returned.'))" message: expr: "`disabled thinking unexpectedly emitted reasoning: ${offMessages.map((message) => message.text).join(' | ')}`" - if: @@ -136,26 +136,26 @@ steps: expr: "String(offRequest?.model ?? '').includes('gpt-5.5')" message: expr: "`expected GPT-5.5 off mock request, got ${String(offRequest?.model ?? '')}`" - detailsExpr: "`off ack=${offAck.text}; off answer=${offAnswer.text}`" + detailsExpr: "`reasoning ack=${reasoningAck.text}; off answer=${offAnswer.text}`" - name: switches to medium thinking actions: - - call: state.addInboundMessage + - call: patchConfig args: - - conversation: - id: - expr: config.conversationId - kind: direct - senderId: qa-operator - senderName: QA Operator - text: - expr: config.maxDirective - - call: waitForCondition - saveAs: maxAck + - env: + ref: env + patch: + agents: + defaults: + thinkingDefault: "medium" + - call: waitForGatewayHealthy args: - - lambda: - expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking level set to medium/i.test(candidate.text)).at(-1)" - - expr: liveTurnTimeoutMs(env, 20000) - detailsExpr: "`max ack=${maxAck.text}`" + - ref: env + - 60000 + - call: waitForQaChannelReady + args: + - ref: env + - 60000 + detailsExpr: "`thinking default patched to medium`" - name: verifies medium thinking emits visible reasoning actions: - set: maxCursor @@ -170,7 +170,7 @@ steps: senderId: qa-operator senderName: QA Operator text: - expr: "`${config.maxDirective} ${config.maxPrompt}`" + expr: config.maxPrompt - call: waitForCondition saveAs: maxReasoning args: diff --git a/qa/scenarios/workspace/long-running-release-audit.md b/qa/scenarios/workspace/long-running-release-audit.md index 6b886ab9df1..8785cfd2a64 100644 --- a/qa/scenarios/workspace/long-running-release-audit.md +++ b/qa/scenarios/workspace/long-running-release-audit.md @@ -214,7 +214,7 @@ steps: message: expr: "`stale archive finding leaked into audit: report=${reportText}\\nhandoff=${handoffText}`" - assert: - expr: "JSON.stringify(report).includes('ui/control-panel.ts') && /blocked|missing|not found/i.test(`${reportText}\\n${handoffText}`)" + expr: "JSON.stringify(report).includes('ui/control-panel.ts') && /blocked|missing|not found|no current source file|no matching source file/i.test(`${reportText}\\n${handoffText}`)" message: expr: "`missing UI evidence was not explicitly blocked: report=${reportText}\\nhandoff=${handoffText}`" - assert: diff --git a/qa/scenarios/workspace/medium-game-plan-codex-harness.md b/qa/scenarios/workspace/medium-game-plan-codex-harness.md index 4c268f5eaa1..1732520a52d 100644 --- a/qa/scenarios/workspace/medium-game-plan-codex-harness.md +++ b/qa/scenarios/workspace/medium-game-plan-codex-harness.md @@ -78,8 +78,8 @@ steps: patch: agents: defaults: - embeddedHarness: - runtime: + agentRuntime: + id: expr: config.harnessRuntime fallback: expr: config.harnessFallback @@ -96,14 +96,14 @@ steps: args: - ref: env - assert: - expr: "snapshot.config.agents?.defaults?.embeddedHarness?.runtime === config.harnessRuntime" + expr: "snapshot.config.agents?.defaults?.agentRuntime?.id === config.harnessRuntime" message: - expr: "`expected embeddedHarness.runtime=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.embeddedHarness)}`" + expr: "`expected agentRuntime.id=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`" - assert: - expr: "snapshot.config.agents?.defaults?.embeddedHarness?.fallback === config.harnessFallback" + expr: "snapshot.config.agents?.defaults?.agentRuntime?.fallback === config.harnessFallback" message: - expr: "`expected embeddedHarness.fallback=${config.harnessFallback}, got ${JSON.stringify(snapshot.config.agents?.defaults?.embeddedHarness)}`" - detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.embeddedHarness?.runtime} fallback=${snapshot.config.agents?.defaults?.embeddedHarness?.fallback}` : `mock mode: parsed ${scenario.id}`" + expr: "`expected agentRuntime.fallback=${config.harnessFallback}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`" + detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.agentRuntime?.id} fallback=${snapshot.config.agents?.defaults?.agentRuntime?.fallback}` : `mock mode: parsed ${scenario.id}`" - name: builds the medium game artifact actions: - if: diff --git a/qa/scenarios/workspace/medium-game-plan-pi-harness.md b/qa/scenarios/workspace/medium-game-plan-pi-harness.md index f44efea9125..9362dfd9122 100644 --- a/qa/scenarios/workspace/medium-game-plan-pi-harness.md +++ b/qa/scenarios/workspace/medium-game-plan-pi-harness.md @@ -78,8 +78,8 @@ steps: patch: agents: defaults: - embeddedHarness: - runtime: + agentRuntime: + id: expr: config.harnessRuntime fallback: expr: config.harnessFallback @@ -96,10 +96,10 @@ steps: args: - ref: env - assert: - expr: "snapshot.config.agents?.defaults?.embeddedHarness?.runtime === config.harnessRuntime" + expr: "snapshot.config.agents?.defaults?.agentRuntime?.id === config.harnessRuntime" message: - expr: "`expected embeddedHarness.runtime=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.embeddedHarness)}`" - detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.embeddedHarness?.runtime}` : `mock mode: parsed ${scenario.id}`" + expr: "`expected agentRuntime.id=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`" + detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.agentRuntime?.id}` : `mock mode: parsed ${scenario.id}`" - name: builds the medium game artifact actions: - if: diff --git a/src/agents/pi-embedded-runner/run.ts b/src/agents/pi-embedded-runner/run.ts index f7ac1a38e6c..9677e5907e5 100644 --- a/src/agents/pi-embedded-runner/run.ts +++ b/src/agents/pi-embedded-runner/run.ts @@ -1806,6 +1806,7 @@ export async function runEmbeddedPiAgent( model: activeErrorContext.model, verboseLevel: params.verboseLevel, reasoningLevel: params.reasoningLevel, + thinkingLevel: params.thinkLevel, toolResultFormat: resolvedToolResultFormat, suppressToolErrorWarnings: params.suppressToolErrorWarnings, inlineToolResultsAllowed: false, diff --git a/src/agents/pi-embedded-runner/run/attempt.ts b/src/agents/pi-embedded-runner/run/attempt.ts index b9f43878969..8c8a674cb09 100644 --- a/src/agents/pi-embedded-runner/run/attempt.ts +++ b/src/agents/pi-embedded-runner/run/attempt.ts @@ -2028,6 +2028,7 @@ export async function runEmbeddedAttempt( hookRunner: getGlobalHookRunner() ?? undefined, verboseLevel: params.verboseLevel, reasoningMode: params.reasoningLevel ?? "off", + thinkingLevel: params.thinkLevel, toolResultFormat: params.toolResultFormat, shouldEmitToolResult: params.shouldEmitToolResult, shouldEmitToolOutput: params.shouldEmitToolOutput, diff --git a/src/agents/pi-embedded-runner/run/payloads.test.ts b/src/agents/pi-embedded-runner/run/payloads.test.ts index a414add27ce..a0ff6f8651d 100644 --- a/src/agents/pi-embedded-runner/run/payloads.test.ts +++ b/src/agents/pi-embedded-runner/run/payloads.test.ts @@ -291,4 +291,25 @@ describe("buildEmbeddedRunPayloads tool-error warnings", () => { mediaUrls: ["/tmp/reply-image.png"], }); }); + + it("suppresses native reasoning payloads when thinking is disabled", () => { + const payloads = buildPayloads({ + reasoningLevel: "on", + thinkingLevel: "off", + lastAssistant: { + role: "assistant", + stopReason: "stop", + content: [ + { + type: "thinking", + thinking: "", + thinkingSignature: JSON.stringify({ type: "reasoning", id: "rs_live", summary: [] }), + }, + { type: "text", text: "THINKING-OFF-OK" }, + ], + } as AssistantMessage, + }); + + expectSinglePayloadText(payloads, "THINKING-OFF-OK"); + }); }); diff --git a/src/agents/pi-embedded-runner/run/payloads.ts b/src/agents/pi-embedded-runner/run/payloads.ts index 2d68b3b23dc..e1746e3066b 100644 --- a/src/agents/pi-embedded-runner/run/payloads.ts +++ b/src/agents/pi-embedded-runner/run/payloads.ts @@ -1,7 +1,7 @@ import type { AssistantMessage } from "@mariozechner/pi-ai"; import { hasOutboundReplyContent } from "openclaw/plugin-sdk/reply-payload"; import { parseReplyDirectives } from "../../../auto-reply/reply/reply-directives.js"; -import type { ReasoningLevel, VerboseLevel } from "../../../auto-reply/thinking.js"; +import type { ReasoningLevel, ThinkLevel, VerboseLevel } from "../../../auto-reply/thinking.js"; import { isSilentReplyPayloadText, SILENT_REPLY_TOKEN } from "../../../auto-reply/tokens.js"; import { formatToolAggregate } from "../../../auto-reply/tool-meta.js"; import type { OpenClawConfig } from "../../../config/types.openclaw.js"; @@ -130,6 +130,7 @@ export function buildEmbeddedRunPayloads(params: { model?: string; verboseLevel?: VerboseLevel; reasoningLevel?: ReasoningLevel; + thinkingLevel?: ThinkLevel; toolResultFormat?: ToolResultFormat; suppressToolErrorWarnings?: boolean; inlineToolResultsAllowed: boolean; @@ -223,7 +224,7 @@ export function buildEmbeddedRunPayloads(params: { const reasoningText = suppressAssistantArtifacts ? "" - : params.lastAssistant && params.reasoningLevel === "on" + : params.lastAssistant && params.reasoningLevel === "on" && params.thinkingLevel !== "off" ? formatReasoningMessage(extractAssistantThinking(params.lastAssistant)) : ""; if (reasoningText) { diff --git a/src/agents/pi-embedded-subscribe.subscribe-embedded-pi-session.emits-reasoning-as-separate-message-enabled.test.ts b/src/agents/pi-embedded-subscribe.subscribe-embedded-pi-session.emits-reasoning-as-separate-message-enabled.test.ts index 515bfd4e3b1..3bceb2dd171 100644 --- a/src/agents/pi-embedded-subscribe.subscribe-embedded-pi-session.emits-reasoning-as-separate-message-enabled.test.ts +++ b/src/agents/pi-embedded-subscribe.subscribe-embedded-pi-session.emits-reasoning-as-separate-message-enabled.test.ts @@ -8,7 +8,7 @@ import { import { subscribeEmbeddedPiSession } from "./pi-embedded-subscribe.js"; describe("subscribeEmbeddedPiSession", () => { - function createReasoningBlockReplyHarness() { + function createReasoningBlockReplyHarness(params: { thinkingLevel?: "off" | "medium" } = {}) { const { session, emit } = createStubSessionHarness(); const onBlockReply = vi.fn(); @@ -18,6 +18,7 @@ describe("subscribeEmbeddedPiSession", () => { onBlockReply, blockReplyBreak: "message_end", reasoningMode: "on", + thinkingLevel: params.thinkingLevel, }); return { emit, onBlockReply }; @@ -38,6 +39,16 @@ describe("subscribeEmbeddedPiSession", () => { expectReasoningAndAnswerCalls(onBlockReply); }); + + it("does not emit native reasoning when thinking is disabled", () => { + const { emit, onBlockReply } = createReasoningBlockReplyHarness({ thinkingLevel: "off" }); + + emit({ type: "message_end", message: createReasoningFinalAnswerMessage() }); + + expect(onBlockReply).toHaveBeenCalledTimes(1); + expect(onBlockReply.mock.calls[0][0].text).toBe("Final answer"); + }); + it.each(THINKING_TAG_CASES)( "promotes <%s> tags to thinking blocks at write-time", ({ open, close }) => { diff --git a/src/agents/pi-embedded-subscribe.ts b/src/agents/pi-embedded-subscribe.ts index 5ab03b66f5b..06fc722a47a 100644 --- a/src/agents/pi-embedded-subscribe.ts +++ b/src/agents/pi-embedded-subscribe.ts @@ -75,6 +75,7 @@ export type { export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionParams) { const reasoningMode = params.reasoningMode ?? "off"; + const canShowReasoning = params.thinkingLevel !== "off"; const toolResultFormat = params.toolResultFormat ?? "markdown"; const useMarkdown = toolResultFormat === "markdown"; const initialPendingToolMediaUrls = collectPendingMediaFromInternalEvents(params.internalEvents); @@ -89,9 +90,12 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar lastToolError: undefined, blockReplyBreak: params.blockReplyBreak ?? "text_end", reasoningMode, - includeReasoning: reasoningMode === "on", + includeReasoning: reasoningMode === "on" && canShowReasoning, shouldEmitPartialReplies: !(reasoningMode === "on" && !params.onBlockReply), - streamReasoning: reasoningMode === "stream" && typeof params.onReasoningStream === "function", + streamReasoning: + reasoningMode === "stream" && + canShowReasoning && + typeof params.onReasoningStream === "function", deltaBuffer: "", blockBuffer: "", // Track if a streamed chunk opened a block (stateful across chunks). diff --git a/src/agents/pi-embedded-subscribe.types.ts b/src/agents/pi-embedded-subscribe.types.ts index 44064e0be5c..6eb95e4f01d 100644 --- a/src/agents/pi-embedded-subscribe.types.ts +++ b/src/agents/pi-embedded-subscribe.types.ts @@ -1,6 +1,6 @@ import type { AgentSession } from "@mariozechner/pi-coding-agent"; import type { ReplyPayload } from "../auto-reply/reply-payload.js"; -import type { ReasoningLevel, VerboseLevel } from "../auto-reply/thinking.js"; +import type { ReasoningLevel, ThinkLevel, VerboseLevel } from "../auto-reply/thinking.js"; import type { OpenClawConfig } from "../config/types.openclaw.js"; import type { HookRunner } from "../plugins/hooks.js"; import type { AgentInternalEvent } from "./internal-events.js"; @@ -16,6 +16,7 @@ export type SubscribeEmbeddedPiSessionParams = { hookRunner?: HookRunner; verboseLevel?: VerboseLevel; reasoningMode?: ReasoningLevel; + thinkingLevel?: ThinkLevel; toolResultFormat?: ToolResultFormat; shouldEmitToolResult?: () => boolean; shouldEmitToolOutput?: () => boolean;