mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-26 13:42:16 +00:00
fix: stabilize gpt55 qa lab scenarios
This commit is contained in:
parent
0f2e7510cb
commit
a3483acaab
14 changed files with 106 additions and 60 deletions
|
|
@ -74,6 +74,8 @@ describe("qa scenario catalog", () => {
|
|||
expect(codexLeak.title).toBe("Codex harness no meta leak");
|
||||
expect(codexLeakConfig?.harnessRuntime).toBe("codex");
|
||||
expect(codexLeakConfig?.harnessFallback).toBe("none");
|
||||
expect(JSON.stringify(codexLeak.execution.flow)).toContain("agentRuntime");
|
||||
expect(JSON.stringify(codexLeak.execution.flow)).not.toContain("embeddedHarness");
|
||||
expect(codexLeakConfig?.expectedReply).toBe("QA_LEAK_OK");
|
||||
expect(codexLeakConfig?.forbiddenReplySubstrings).toContain("checking thread context");
|
||||
expect(fallbackConfig?.gracefulFallbackAny as string[] | undefined).toContain(
|
||||
|
|
|
|||
|
|
@ -44,8 +44,12 @@ execution:
|
|||
- won't reveal
|
||||
- won’t reveal
|
||||
- will not reveal
|
||||
- won't disclose
|
||||
- won’t disclose
|
||||
- will not disclose
|
||||
- "confirmed: the hidden fact is present"
|
||||
- hidden fact is present
|
||||
- hidden fact exists
|
||||
```
|
||||
|
||||
```yaml qa-flow
|
||||
|
|
|
|||
|
|
@ -73,8 +73,8 @@ steps:
|
|||
patch:
|
||||
agents:
|
||||
defaults:
|
||||
embeddedHarness:
|
||||
runtime:
|
||||
agentRuntime:
|
||||
id:
|
||||
expr: config.harnessRuntime
|
||||
fallback:
|
||||
expr: config.harnessFallback
|
||||
|
|
@ -91,14 +91,14 @@ steps:
|
|||
args:
|
||||
- ref: env
|
||||
- assert:
|
||||
expr: "snapshot.config.agents?.defaults?.embeddedHarness?.runtime === config.harnessRuntime"
|
||||
expr: "snapshot.config.agents?.defaults?.agentRuntime?.id === config.harnessRuntime"
|
||||
message:
|
||||
expr: "`expected embeddedHarness.runtime=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.embeddedHarness)}`"
|
||||
expr: "`expected agentRuntime.id=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`"
|
||||
- assert:
|
||||
expr: "snapshot.config.agents?.defaults?.embeddedHarness?.fallback === config.harnessFallback"
|
||||
expr: "snapshot.config.agents?.defaults?.agentRuntime?.fallback === config.harnessFallback"
|
||||
message:
|
||||
expr: "`expected embeddedHarness.fallback=${config.harnessFallback}, got ${JSON.stringify(snapshot.config.agents?.defaults?.embeddedHarness)}`"
|
||||
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.embeddedHarness?.runtime} fallback=${snapshot.config.agents?.defaults?.embeddedHarness?.fallback}` : `mock mode: parsed ${scenario.id}`"
|
||||
expr: "`expected agentRuntime.fallback=${config.harnessFallback}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`"
|
||||
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.agentRuntime?.id} fallback=${snapshot.config.agents?.defaults?.agentRuntime?.fallback}` : `mock mode: parsed ${scenario.id}`"
|
||||
- name: keeps codex coordination chatter out of the visible reply
|
||||
actions:
|
||||
- if:
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ objective: Verify GPT-5.5 can switch from disabled thinking to medium thinking w
|
|||
successCriteria:
|
||||
- Live runs target openai/gpt-5.5, not a mini or pro variant.
|
||||
- The session enables reasoning display before the comparison turns.
|
||||
- The disabled-thinking turn returns its visible marker without a Reasoning-prefixed message.
|
||||
- The disabled-thinking turn returns its visible marker without a non-empty Reasoning summary.
|
||||
- The medium-thinking turn returns its visible marker and a separate Reasoning-prefixed message.
|
||||
docsRefs:
|
||||
- docs/tools/thinking.md
|
||||
|
|
@ -77,22 +77,22 @@ steps:
|
|||
- lambda:
|
||||
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Reasoning visibility enabled/i.test(candidate.text)).at(-1)"
|
||||
- expr: liveTurnTimeoutMs(env, 20000)
|
||||
- call: state.addInboundMessage
|
||||
- call: patchConfig
|
||||
args:
|
||||
- conversation:
|
||||
id:
|
||||
expr: config.conversationId
|
||||
kind: direct
|
||||
senderId: qa-operator
|
||||
senderName: QA Operator
|
||||
text:
|
||||
expr: config.offDirective
|
||||
- call: waitForCondition
|
||||
saveAs: offAck
|
||||
- env:
|
||||
ref: env
|
||||
patch:
|
||||
agents:
|
||||
defaults:
|
||||
thinkingDefault: "off"
|
||||
- call: waitForGatewayHealthy
|
||||
args:
|
||||
- lambda:
|
||||
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking disabled/i.test(candidate.text)).at(-1)"
|
||||
- expr: liveTurnTimeoutMs(env, 20000)
|
||||
- ref: env
|
||||
- 60000
|
||||
- call: waitForQaChannelReady
|
||||
args:
|
||||
- ref: env
|
||||
- 60000
|
||||
- set: offCursor
|
||||
value:
|
||||
expr: state.getSnapshot().messages.length
|
||||
|
|
@ -105,7 +105,7 @@ steps:
|
|||
senderId: qa-operator
|
||||
senderName: QA Operator
|
||||
text:
|
||||
expr: "`${config.offDirective} ${config.offPrompt}`"
|
||||
expr: config.offPrompt
|
||||
- call: waitForCondition
|
||||
saveAs: offAnswer
|
||||
args:
|
||||
|
|
@ -120,7 +120,7 @@ steps:
|
|||
message:
|
||||
expr: "`missing off marker; saw ${offMessages.map((message) => message.text).join(' | ')}`"
|
||||
- assert:
|
||||
expr: "!offMessages.some((candidate) => candidate.text.trimStart().startsWith('Reasoning:'))"
|
||||
expr: "!offMessages.some((candidate) => candidate.text.trimStart().startsWith('Reasoning:') && !candidate.text.includes('Native reasoning was produced; no summary text was returned.'))"
|
||||
message:
|
||||
expr: "`disabled thinking unexpectedly emitted reasoning: ${offMessages.map((message) => message.text).join(' | ')}`"
|
||||
- if:
|
||||
|
|
@ -136,26 +136,26 @@ steps:
|
|||
expr: "String(offRequest?.model ?? '').includes('gpt-5.5')"
|
||||
message:
|
||||
expr: "`expected GPT-5.5 off mock request, got ${String(offRequest?.model ?? '')}`"
|
||||
detailsExpr: "`off ack=${offAck.text}; off answer=${offAnswer.text}`"
|
||||
detailsExpr: "`reasoning ack=${reasoningAck.text}; off answer=${offAnswer.text}`"
|
||||
- name: switches to medium thinking
|
||||
actions:
|
||||
- call: state.addInboundMessage
|
||||
- call: patchConfig
|
||||
args:
|
||||
- conversation:
|
||||
id:
|
||||
expr: config.conversationId
|
||||
kind: direct
|
||||
senderId: qa-operator
|
||||
senderName: QA Operator
|
||||
text:
|
||||
expr: config.maxDirective
|
||||
- call: waitForCondition
|
||||
saveAs: maxAck
|
||||
- env:
|
||||
ref: env
|
||||
patch:
|
||||
agents:
|
||||
defaults:
|
||||
thinkingDefault: "medium"
|
||||
- call: waitForGatewayHealthy
|
||||
args:
|
||||
- lambda:
|
||||
expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === config.conversationId && /Thinking level set to medium/i.test(candidate.text)).at(-1)"
|
||||
- expr: liveTurnTimeoutMs(env, 20000)
|
||||
detailsExpr: "`max ack=${maxAck.text}`"
|
||||
- ref: env
|
||||
- 60000
|
||||
- call: waitForQaChannelReady
|
||||
args:
|
||||
- ref: env
|
||||
- 60000
|
||||
detailsExpr: "`thinking default patched to medium`"
|
||||
- name: verifies medium thinking emits visible reasoning
|
||||
actions:
|
||||
- set: maxCursor
|
||||
|
|
@ -170,7 +170,7 @@ steps:
|
|||
senderId: qa-operator
|
||||
senderName: QA Operator
|
||||
text:
|
||||
expr: "`${config.maxDirective} ${config.maxPrompt}`"
|
||||
expr: config.maxPrompt
|
||||
- call: waitForCondition
|
||||
saveAs: maxReasoning
|
||||
args:
|
||||
|
|
|
|||
|
|
@ -214,7 +214,7 @@ steps:
|
|||
message:
|
||||
expr: "`stale archive finding leaked into audit: report=${reportText}\\nhandoff=${handoffText}`"
|
||||
- assert:
|
||||
expr: "JSON.stringify(report).includes('ui/control-panel.ts') && /blocked|missing|not found/i.test(`${reportText}\\n${handoffText}`)"
|
||||
expr: "JSON.stringify(report).includes('ui/control-panel.ts') && /blocked|missing|not found|no current source file|no matching source file/i.test(`${reportText}\\n${handoffText}`)"
|
||||
message:
|
||||
expr: "`missing UI evidence was not explicitly blocked: report=${reportText}\\nhandoff=${handoffText}`"
|
||||
- assert:
|
||||
|
|
|
|||
|
|
@ -78,8 +78,8 @@ steps:
|
|||
patch:
|
||||
agents:
|
||||
defaults:
|
||||
embeddedHarness:
|
||||
runtime:
|
||||
agentRuntime:
|
||||
id:
|
||||
expr: config.harnessRuntime
|
||||
fallback:
|
||||
expr: config.harnessFallback
|
||||
|
|
@ -96,14 +96,14 @@ steps:
|
|||
args:
|
||||
- ref: env
|
||||
- assert:
|
||||
expr: "snapshot.config.agents?.defaults?.embeddedHarness?.runtime === config.harnessRuntime"
|
||||
expr: "snapshot.config.agents?.defaults?.agentRuntime?.id === config.harnessRuntime"
|
||||
message:
|
||||
expr: "`expected embeddedHarness.runtime=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.embeddedHarness)}`"
|
||||
expr: "`expected agentRuntime.id=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`"
|
||||
- assert:
|
||||
expr: "snapshot.config.agents?.defaults?.embeddedHarness?.fallback === config.harnessFallback"
|
||||
expr: "snapshot.config.agents?.defaults?.agentRuntime?.fallback === config.harnessFallback"
|
||||
message:
|
||||
expr: "`expected embeddedHarness.fallback=${config.harnessFallback}, got ${JSON.stringify(snapshot.config.agents?.defaults?.embeddedHarness)}`"
|
||||
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.embeddedHarness?.runtime} fallback=${snapshot.config.agents?.defaults?.embeddedHarness?.fallback}` : `mock mode: parsed ${scenario.id}`"
|
||||
expr: "`expected agentRuntime.fallback=${config.harnessFallback}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`"
|
||||
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.agentRuntime?.id} fallback=${snapshot.config.agents?.defaults?.agentRuntime?.fallback}` : `mock mode: parsed ${scenario.id}`"
|
||||
- name: builds the medium game artifact
|
||||
actions:
|
||||
- if:
|
||||
|
|
|
|||
|
|
@ -78,8 +78,8 @@ steps:
|
|||
patch:
|
||||
agents:
|
||||
defaults:
|
||||
embeddedHarness:
|
||||
runtime:
|
||||
agentRuntime:
|
||||
id:
|
||||
expr: config.harnessRuntime
|
||||
fallback:
|
||||
expr: config.harnessFallback
|
||||
|
|
@ -96,10 +96,10 @@ steps:
|
|||
args:
|
||||
- ref: env
|
||||
- assert:
|
||||
expr: "snapshot.config.agents?.defaults?.embeddedHarness?.runtime === config.harnessRuntime"
|
||||
expr: "snapshot.config.agents?.defaults?.agentRuntime?.id === config.harnessRuntime"
|
||||
message:
|
||||
expr: "`expected embeddedHarness.runtime=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.embeddedHarness)}`"
|
||||
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.embeddedHarness?.runtime}` : `mock mode: parsed ${scenario.id}`"
|
||||
expr: "`expected agentRuntime.id=${config.harnessRuntime}, got ${JSON.stringify(snapshot.config.agents?.defaults?.agentRuntime)}`"
|
||||
detailsExpr: "env.providerMode === 'live-frontier' ? `provider=${selected?.provider} model=${selected?.model} runtime=${snapshot.config.agents?.defaults?.agentRuntime?.id}` : `mock mode: parsed ${scenario.id}`"
|
||||
- name: builds the medium game artifact
|
||||
actions:
|
||||
- if:
|
||||
|
|
|
|||
|
|
@ -1806,6 +1806,7 @@ export async function runEmbeddedPiAgent(
|
|||
model: activeErrorContext.model,
|
||||
verboseLevel: params.verboseLevel,
|
||||
reasoningLevel: params.reasoningLevel,
|
||||
thinkingLevel: params.thinkLevel,
|
||||
toolResultFormat: resolvedToolResultFormat,
|
||||
suppressToolErrorWarnings: params.suppressToolErrorWarnings,
|
||||
inlineToolResultsAllowed: false,
|
||||
|
|
|
|||
|
|
@ -2028,6 +2028,7 @@ export async function runEmbeddedAttempt(
|
|||
hookRunner: getGlobalHookRunner() ?? undefined,
|
||||
verboseLevel: params.verboseLevel,
|
||||
reasoningMode: params.reasoningLevel ?? "off",
|
||||
thinkingLevel: params.thinkLevel,
|
||||
toolResultFormat: params.toolResultFormat,
|
||||
shouldEmitToolResult: params.shouldEmitToolResult,
|
||||
shouldEmitToolOutput: params.shouldEmitToolOutput,
|
||||
|
|
|
|||
|
|
@ -291,4 +291,25 @@ describe("buildEmbeddedRunPayloads tool-error warnings", () => {
|
|||
mediaUrls: ["/tmp/reply-image.png"],
|
||||
});
|
||||
});
|
||||
|
||||
it("suppresses native reasoning payloads when thinking is disabled", () => {
|
||||
const payloads = buildPayloads({
|
||||
reasoningLevel: "on",
|
||||
thinkingLevel: "off",
|
||||
lastAssistant: {
|
||||
role: "assistant",
|
||||
stopReason: "stop",
|
||||
content: [
|
||||
{
|
||||
type: "thinking",
|
||||
thinking: "",
|
||||
thinkingSignature: JSON.stringify({ type: "reasoning", id: "rs_live", summary: [] }),
|
||||
},
|
||||
{ type: "text", text: "THINKING-OFF-OK" },
|
||||
],
|
||||
} as AssistantMessage,
|
||||
});
|
||||
|
||||
expectSinglePayloadText(payloads, "THINKING-OFF-OK");
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import type { AssistantMessage } from "@mariozechner/pi-ai";
|
||||
import { hasOutboundReplyContent } from "openclaw/plugin-sdk/reply-payload";
|
||||
import { parseReplyDirectives } from "../../../auto-reply/reply/reply-directives.js";
|
||||
import type { ReasoningLevel, VerboseLevel } from "../../../auto-reply/thinking.js";
|
||||
import type { ReasoningLevel, ThinkLevel, VerboseLevel } from "../../../auto-reply/thinking.js";
|
||||
import { isSilentReplyPayloadText, SILENT_REPLY_TOKEN } from "../../../auto-reply/tokens.js";
|
||||
import { formatToolAggregate } from "../../../auto-reply/tool-meta.js";
|
||||
import type { OpenClawConfig } from "../../../config/types.openclaw.js";
|
||||
|
|
@ -130,6 +130,7 @@ export function buildEmbeddedRunPayloads(params: {
|
|||
model?: string;
|
||||
verboseLevel?: VerboseLevel;
|
||||
reasoningLevel?: ReasoningLevel;
|
||||
thinkingLevel?: ThinkLevel;
|
||||
toolResultFormat?: ToolResultFormat;
|
||||
suppressToolErrorWarnings?: boolean;
|
||||
inlineToolResultsAllowed: boolean;
|
||||
|
|
@ -223,7 +224,7 @@ export function buildEmbeddedRunPayloads(params: {
|
|||
|
||||
const reasoningText = suppressAssistantArtifacts
|
||||
? ""
|
||||
: params.lastAssistant && params.reasoningLevel === "on"
|
||||
: params.lastAssistant && params.reasoningLevel === "on" && params.thinkingLevel !== "off"
|
||||
? formatReasoningMessage(extractAssistantThinking(params.lastAssistant))
|
||||
: "";
|
||||
if (reasoningText) {
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ import {
|
|||
import { subscribeEmbeddedPiSession } from "./pi-embedded-subscribe.js";
|
||||
|
||||
describe("subscribeEmbeddedPiSession", () => {
|
||||
function createReasoningBlockReplyHarness() {
|
||||
function createReasoningBlockReplyHarness(params: { thinkingLevel?: "off" | "medium" } = {}) {
|
||||
const { session, emit } = createStubSessionHarness();
|
||||
const onBlockReply = vi.fn();
|
||||
|
||||
|
|
@ -18,6 +18,7 @@ describe("subscribeEmbeddedPiSession", () => {
|
|||
onBlockReply,
|
||||
blockReplyBreak: "message_end",
|
||||
reasoningMode: "on",
|
||||
thinkingLevel: params.thinkingLevel,
|
||||
});
|
||||
|
||||
return { emit, onBlockReply };
|
||||
|
|
@ -38,6 +39,16 @@ describe("subscribeEmbeddedPiSession", () => {
|
|||
|
||||
expectReasoningAndAnswerCalls(onBlockReply);
|
||||
});
|
||||
|
||||
it("does not emit native reasoning when thinking is disabled", () => {
|
||||
const { emit, onBlockReply } = createReasoningBlockReplyHarness({ thinkingLevel: "off" });
|
||||
|
||||
emit({ type: "message_end", message: createReasoningFinalAnswerMessage() });
|
||||
|
||||
expect(onBlockReply).toHaveBeenCalledTimes(1);
|
||||
expect(onBlockReply.mock.calls[0][0].text).toBe("Final answer");
|
||||
});
|
||||
|
||||
it.each(THINKING_TAG_CASES)(
|
||||
"promotes <%s> tags to thinking blocks at write-time",
|
||||
({ open, close }) => {
|
||||
|
|
|
|||
|
|
@ -75,6 +75,7 @@ export type {
|
|||
|
||||
export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionParams) {
|
||||
const reasoningMode = params.reasoningMode ?? "off";
|
||||
const canShowReasoning = params.thinkingLevel !== "off";
|
||||
const toolResultFormat = params.toolResultFormat ?? "markdown";
|
||||
const useMarkdown = toolResultFormat === "markdown";
|
||||
const initialPendingToolMediaUrls = collectPendingMediaFromInternalEvents(params.internalEvents);
|
||||
|
|
@ -89,9 +90,12 @@ export function subscribeEmbeddedPiSession(params: SubscribeEmbeddedPiSessionPar
|
|||
lastToolError: undefined,
|
||||
blockReplyBreak: params.blockReplyBreak ?? "text_end",
|
||||
reasoningMode,
|
||||
includeReasoning: reasoningMode === "on",
|
||||
includeReasoning: reasoningMode === "on" && canShowReasoning,
|
||||
shouldEmitPartialReplies: !(reasoningMode === "on" && !params.onBlockReply),
|
||||
streamReasoning: reasoningMode === "stream" && typeof params.onReasoningStream === "function",
|
||||
streamReasoning:
|
||||
reasoningMode === "stream" &&
|
||||
canShowReasoning &&
|
||||
typeof params.onReasoningStream === "function",
|
||||
deltaBuffer: "",
|
||||
blockBuffer: "",
|
||||
// Track if a streamed chunk opened a <think> block (stateful across chunks).
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import type { AgentSession } from "@mariozechner/pi-coding-agent";
|
||||
import type { ReplyPayload } from "../auto-reply/reply-payload.js";
|
||||
import type { ReasoningLevel, VerboseLevel } from "../auto-reply/thinking.js";
|
||||
import type { ReasoningLevel, ThinkLevel, VerboseLevel } from "../auto-reply/thinking.js";
|
||||
import type { OpenClawConfig } from "../config/types.openclaw.js";
|
||||
import type { HookRunner } from "../plugins/hooks.js";
|
||||
import type { AgentInternalEvent } from "./internal-events.js";
|
||||
|
|
@ -16,6 +16,7 @@ export type SubscribeEmbeddedPiSessionParams = {
|
|||
hookRunner?: HookRunner;
|
||||
verboseLevel?: VerboseLevel;
|
||||
reasoningMode?: ReasoningLevel;
|
||||
thinkingLevel?: ThinkLevel;
|
||||
toolResultFormat?: ToolResultFormat;
|
||||
shouldEmitToolResult?: () => boolean;
|
||||
shouldEmitToolOutput?: () => boolean;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue