diff --git a/docs/docs.json b/docs/docs.json index 0df3e09b929..9d6ea82f7ca 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -52,6 +52,14 @@ ] }, "redirects": [ + { + "source": "/help/gpt54-codex-agentic-parity", + "destination": "/help/gpt55-codex-agentic-parity" + }, + { + "source": "/help/gpt54-codex-agentic-parity-maintainers", + "destination": "/help/gpt55-codex-agentic-parity-maintainers" + }, { "source": "/mcp", "destination": "/cli/mcp" @@ -1649,8 +1657,8 @@ "concepts/typing-indicators", "concepts/usage-tracking", "concepts/timezone", - "help/gpt54-codex-agentic-parity", - "help/gpt54-codex-agentic-parity-maintainers" + "help/gpt55-codex-agentic-parity", + "help/gpt55-codex-agentic-parity-maintainers" ] }, { diff --git a/docs/help/gpt54-codex-agentic-parity-maintainers.md b/docs/help/gpt55-codex-agentic-parity-maintainers.md similarity index 91% rename from docs/help/gpt54-codex-agentic-parity-maintainers.md rename to docs/help/gpt55-codex-agentic-parity-maintainers.md index c5e73564266..3d9be951294 100644 --- a/docs/help/gpt54-codex-agentic-parity-maintainers.md +++ b/docs/help/gpt55-codex-agentic-parity-maintainers.md @@ -1,12 +1,12 @@ --- -summary: "How to review the GPT-5.4 / Codex parity program as four merge units" -title: "GPT-5.4 / Codex parity maintainer notes" +summary: "How to review the GPT-5.5 / Codex parity program as four merge units" +title: "GPT-5.5 / Codex parity maintainer notes" read_when: - - Reviewing the GPT-5.4 / Codex parity PR series + - Reviewing the GPT-5.5 / Codex parity PR series - Maintaining the six-contract agentic architecture behind the parity program --- -This note explains how to review the GPT-5.4 / Codex parity program as four merge units without losing the original six-contract architecture. +This note explains how to review the GPT-5.5 / Codex parity program as four merge units without losing the original six-contract architecture. ## Merge units @@ -59,7 +59,7 @@ Does not own: Owns: -- first-wave GPT-5.4 vs Opus 4.6 scenario pack +- first-wave GPT-5.5 vs Opus 4.6 scenario pack - parity documentation - parity report and release-gate mechanics @@ -123,7 +123,7 @@ Expected artifacts from PR D: ## Release gate -Do not claim GPT-5.4 parity or superiority over Opus 4.6 until: +Do not claim GPT-5.5 parity or superiority over Opus 4.6 until: - PR A, PR B, and PR C are merged - PR D runs the first-wave parity pack cleanly @@ -132,7 +132,7 @@ Do not claim GPT-5.4 parity or superiority over Opus 4.6 until: ```mermaid flowchart LR - A["PR A-C merged"] --> B["Run GPT-5.4 parity pack"] + A["PR A-C merged"] --> B["Run GPT-5.5 parity pack"] A --> C["Run Opus 4.6 parity pack"] B --> D["qa-suite-summary.json"] C --> E["qa-suite-summary.json"] @@ -146,7 +146,7 @@ flowchart LR The parity harness is not the only evidence source. Keep this split explicit in review: -- PR D owns the scenario-based GPT-5.4 vs Opus 4.6 comparison +- PR D owns the scenario-based GPT-5.5 vs Opus 4.6 comparison - PR B deterministic suites still own auth/proxy/DNS and full-access truthfulness evidence ## Quick maintainer merge workflow @@ -179,13 +179,13 @@ If any one of the evidence bar items is missing, request changes instead of merg | No fake progress or fake tool completion | PR A + PR D | parity fake-success count plus scenario-level report details | | No false `/elevated full` guidance | PR B | deterministic runtime-truthfulness suites | | Replay/liveness failures remain explicit | PR C + PR D | lifecycle/replay suites plus `compaction-retry-mutating-tool` | -| GPT-5.4 matches or beats Opus 4.6 | PR D | `qa-agentic-parity-report.md` and `qa-agentic-parity-summary.json` | +| GPT-5.5 matches or beats Opus 4.6 | PR D | `qa-agentic-parity-report.md` and `qa-agentic-parity-summary.json` | ## Reviewer shorthand: before vs after | User-visible problem before | Review signal after | | ----------------------------------------------------------- | --------------------------------------------------------------------------------------- | -| GPT-5.4 stopped after planning | PR A shows act-or-block behavior instead of commentary-only completion | +| GPT-5.5 stopped after planning | PR A shows act-or-block behavior instead of commentary-only completion | | Tool use felt brittle with strict OpenAI/Codex schemas | PR C keeps tool registration and parameter-free invocation predictable | | `/elevated full` hints were sometimes misleading | PR B ties guidance to actual runtime capability and blocked reasons | | Long tasks could disappear into replay/compaction ambiguity | PR C emits explicit paused, blocked, abandoned, and replay-invalid state | @@ -193,4 +193,4 @@ If any one of the evidence bar items is missing, request changes instead of merg ## Related -- [GPT-5.4 / Codex agentic parity](/help/gpt54-codex-agentic-parity) +- [GPT-5.5 / Codex agentic parity](/help/gpt55-codex-agentic-parity) diff --git a/docs/help/gpt54-codex-agentic-parity.md b/docs/help/gpt55-codex-agentic-parity.md similarity index 87% rename from docs/help/gpt54-codex-agentic-parity.md rename to docs/help/gpt55-codex-agentic-parity.md index 3457aeadd25..38af2ef3c5b 100644 --- a/docs/help/gpt54-codex-agentic-parity.md +++ b/docs/help/gpt55-codex-agentic-parity.md @@ -1,15 +1,15 @@ --- -summary: "How OpenClaw closes agentic execution gaps for GPT-5.4 and Codex-style models" -title: "GPT-5.4 / Codex agentic parity" +summary: "How OpenClaw closes agentic execution gaps for GPT-5.5 and Codex-style models" +title: "GPT-5.5 / Codex agentic parity" read_when: - - Debugging GPT-5.4 or Codex agent behavior + - Debugging GPT-5.5 or Codex agent behavior - Comparing OpenClaw agentic behavior across frontier models - Reviewing the strict-agentic, tool-schema, elevation, and replay fixes --- -# GPT-5.4 / Codex Agentic Parity in OpenClaw +# GPT-5.5 / Codex Agentic Parity in OpenClaw -OpenClaw already worked well with tool-using frontier models, but GPT-5.4 and Codex-style models were still underperforming in a few practical ways: +OpenClaw already worked well with tool-using frontier models, but GPT-5.5 and Codex-style models were still underperforming in a few practical ways: - they could stop after planning instead of doing the work - they could use strict OpenAI/Codex tool schemas incorrectly @@ -27,7 +27,7 @@ This slice adds an opt-in `strict-agentic` execution contract for embedded Pi GP When enabled, OpenClaw stops accepting plan-only turns as “good enough” completion. If the model only says what it intends to do and does not actually use tools or make progress, OpenClaw retries with an act-now steer and then fails closed with an explicit blocked state instead of silently ending the task. -This improves the GPT-5.4 experience most on: +This improves the GPT-5.5 experience most on: - short “ok do it” follow-ups - code tasks where the first step is obvious @@ -40,7 +40,7 @@ This slice makes OpenClaw tell the truth about two things: - why the provider/runtime call failed - whether `/elevated full` is actually available -That means GPT-5.4 gets better runtime signals for missing scope, auth refresh failures, HTML 403 auth failures, proxy issues, DNS or timeout failures, and blocked full-access modes. The model is less likely to hallucinate the wrong remediation or keep asking for a permission mode the runtime cannot provide. +That means GPT-5.5 gets better runtime signals for missing scope, auth refresh failures, HTML 403 auth failures, proxy issues, DNS or timeout failures, and blocked full-access modes. The model is less likely to hallucinate the wrong remediation or keep asking for a permission mode the runtime cannot provide. ### PR C: execution correctness @@ -53,7 +53,7 @@ The tool-compat work reduces schema friction for strict OpenAI/Codex tool regist ### PR D: parity harness -This slice adds the first-wave QA-lab parity pack so GPT-5.4 and Opus 4.6 can be exercised through the same scenarios and compared using shared evidence. +This slice adds the first-wave QA-lab parity pack so GPT-5.5 and Opus 4.6 can be exercised through the same scenarios and compared using shared evidence. The parity pack is the proof layer. It does not change runtime behavior by itself. @@ -62,7 +62,7 @@ After you have two `qa-suite-summary.json` artifacts, generate the release-gate ```bash pnpm openclaw qa parity-report \ --repo-root . \ - --candidate-summary .artifacts/qa-e2e/gpt54/qa-suite-summary.json \ + --candidate-summary .artifacts/qa-e2e/gpt55/qa-suite-summary.json \ --baseline-summary .artifacts/qa-e2e/opus46/qa-suite-summary.json \ --output-dir .artifacts/qa-e2e/parity ``` @@ -73,16 +73,16 @@ That command writes: - a machine-readable JSON verdict - an explicit `pass` / `fail` gate result -## Why this improves GPT-5.4 in practice +## Why this improves GPT-5.5 in practice -Before this work, GPT-5.4 on OpenClaw could feel less agentic than Opus in real coding sessions because the runtime tolerated behaviors that are especially harmful for GPT-5-style models: +Before this work, GPT-5.5 on OpenClaw could feel less agentic than Opus in real coding sessions because the runtime tolerated behaviors that are especially harmful for GPT-5-style models: - commentary-only turns - schema friction around tools - vague permission feedback - silent replay or compaction breakage -The goal is not to make GPT-5.4 imitate Opus. The goal is to give GPT-5.4 a runtime contract that rewards real progress, supplies cleaner tool and permission semantics, and turns failure modes into explicit machine- and human-readable states. +The goal is not to make GPT-5.5 imitate Opus. The goal is to give GPT-5.5 a runtime contract that rewards real progress, supplies cleaner tool and permission semantics, and turns failure modes into explicit machine- and human-readable states. That changes the user experience from: @@ -92,15 +92,15 @@ to: - “the model either acted, or OpenClaw surfaced the exact reason it could not” -## Before vs after for GPT-5.4 users +## Before vs after for GPT-5.5 users | Before this program | After PR A-D | | ---------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------- | -| GPT-5.4 could stop after a reasonable plan without taking the next tool step | PR A turns “plan only” into “act now or surface a blocked state” | +| GPT-5.5 could stop after a reasonable plan without taking the next tool step | PR A turns “plan only” into “act now or surface a blocked state” | | Strict tool schemas could reject parameter-free or OpenAI/Codex-shaped tools in confusing ways | PR C makes provider-owned tool registration and invocation more predictable | -| `/elevated full` guidance could be vague or wrong in blocked runtimes | PR B gives GPT-5.4 and the user truthful runtime and permission hints | +| `/elevated full` guidance could be vague or wrong in blocked runtimes | PR B gives GPT-5.5 and the user truthful runtime and permission hints | | Replay or compaction failures could feel like the task silently disappeared | PR C surfaces paused, blocked, abandoned, and replay-invalid outcomes explicitly | -| “GPT-5.4 feels worse than Opus” was mostly anecdotal | PR D turns that into the same scenario pack, the same metrics, and a hard pass/fail gate | +| “GPT-5.5 feels worse than Opus” was mostly anecdotal | PR D turns that into the same scenario pack, the same metrics, and a hard pass/fail gate | ## Architecture @@ -123,7 +123,7 @@ flowchart TD ```mermaid flowchart LR - A["Merged runtime slices (PR A-C)"] --> B["Run GPT-5.4 parity pack"] + A["Merged runtime slices (PR A-C)"] --> B["Run GPT-5.5 parity pack"] A --> C["Run Opus 4.6 parity pack"] B --> D["qa-suite-summary.json"] C --> E["qa-suite-summary.json"] @@ -162,7 +162,7 @@ Checks that a task with a real mutating write keeps replay-unsafety explicit ins ## Scenario matrix -| Scenario | What it tests | Good GPT-5.4 behavior | Failure signal | +| Scenario | What it tests | Good GPT-5.5 behavior | Failure signal | | ---------------------------------- | --------------------------------------- | ------------------------------------------------------------------------------ | ------------------------------------------------------------------------------ | | `approval-turn-tool-followthrough` | Short approval turns after a plan | Starts the first concrete tool action immediately instead of restating intent | plan-only follow-up, no tool activity, or blocked turn without a real blocker | | `model-switch-tool-continuity` | Runtime/model switching under tool use | Preserves task context and continues acting coherently | resets into commentary, loses tool context, or stops after switch | @@ -172,7 +172,7 @@ Checks that a task with a real mutating write keeps replay-unsafety explicit ins ## Release gate -GPT-5.4 can only be considered at parity or better when the merged runtime passes the parity pack and the runtime-truthfulness regressions at the same time. +GPT-5.5 can only be considered at parity or better when the merged runtime passes the parity pack and the runtime-truthfulness regressions at the same time. Required outcomes: @@ -191,24 +191,24 @@ For the first-wave harness, the gate compares: Parity evidence is intentionally split across two layers: -- PR D proves same-scenario GPT-5.4 vs Opus 4.6 behavior with QA-lab +- PR D proves same-scenario GPT-5.5 vs Opus 4.6 behavior with QA-lab - PR B deterministic suites prove auth, proxy, DNS, and `/elevated full` truthfulness outside the harness ## Goal-to-evidence matrix | Completion gate item | Owning PR | Evidence source | Pass signal | | -------------------------------------------------------- | ----------- | ------------------------------------------------------------------ | ---------------------------------------------------------------------------------------- | -| GPT-5.4 no longer stalls after planning | PR A | `approval-turn-tool-followthrough` plus PR A runtime suites | approval turns trigger real work or an explicit blocked state | -| GPT-5.4 no longer fakes progress or fake tool completion | PR A + PR D | parity report scenario outcomes and fake-success count | no suspicious pass results and no commentary-only completion | -| GPT-5.4 no longer gives false `/elevated full` guidance | PR B | deterministic truthfulness suites | blocked reasons and full-access hints stay runtime-accurate | +| GPT-5.5 no longer stalls after planning | PR A | `approval-turn-tool-followthrough` plus PR A runtime suites | approval turns trigger real work or an explicit blocked state | +| GPT-5.5 no longer fakes progress or fake tool completion | PR A + PR D | parity report scenario outcomes and fake-success count | no suspicious pass results and no commentary-only completion | +| GPT-5.5 no longer gives false `/elevated full` guidance | PR B | deterministic truthfulness suites | blocked reasons and full-access hints stay runtime-accurate | | Replay/liveness failures stay explicit | PR C + PR D | PR C lifecycle/replay suites plus `compaction-retry-mutating-tool` | mutating work keeps replay-unsafety explicit instead of silently disappearing | -| GPT-5.4 matches or beats Opus 4.6 on the agreed metrics | PR D | `qa-agentic-parity-report.md` and `qa-agentic-parity-summary.json` | same scenario coverage and no regression on completion, stop behavior, or valid tool use | +| GPT-5.5 matches or beats Opus 4.6 on the agreed metrics | PR D | `qa-agentic-parity-report.md` and `qa-agentic-parity-summary.json` | same scenario coverage and no regression on completion, stop behavior, or valid tool use | ## How to read the parity verdict Use the verdict in `qa-agentic-parity-summary.json` as the final machine-readable decision for the first-wave parity pack. -- `pass` means GPT-5.4 covered the same scenarios as Opus 4.6 and did not regress on the agreed aggregate metrics. +- `pass` means GPT-5.5 covered the same scenarios as Opus 4.6 and did not regress on the agreed aggregate metrics. - `fail` means at least one hard gate tripped: weaker completion, worse unintended stops, weaker valid tool use, any fake-success case, or mismatched scenario coverage. - “shared/base CI issue” is not itself a parity result. If CI noise outside PR D blocks a run, the verdict should wait for a clean merged-runtime execution instead of being inferred from branch-era logs. - Auth, proxy, DNS, and `/elevated full` truthfulness still come from PR B’s deterministic suites, so the final release claim needs both: a passing PR D parity verdict and green PR B truthfulness coverage. @@ -218,7 +218,7 @@ Use the verdict in `qa-agentic-parity-summary.json` as the final machine-readabl Use `strict-agentic` when: - the agent is expected to act immediately when a next step is obvious -- GPT-5.4 or Codex-family models are the primary runtime +- GPT-5.5 or Codex-family models are the primary runtime - you prefer explicit blocked states over “helpful” recap-only replies Keep the default contract when: @@ -229,4 +229,4 @@ Keep the default contract when: ## Related -- [GPT-5.4 / Codex parity maintainer notes](/help/gpt54-codex-agentic-parity-maintainers) +- [GPT-5.5 / Codex parity maintainer notes](/help/gpt55-codex-agentic-parity-maintainers) diff --git a/extensions/qa-lab/src/agentic-parity-report.test.ts b/extensions/qa-lab/src/agentic-parity-report.test.ts index b1a836ab86d..042b5281f69 100644 --- a/extensions/qa-lab/src/agentic-parity-report.test.ts +++ b/extensions/qa-lab/src/agentic-parity-report.test.ts @@ -74,7 +74,7 @@ describe("qa agentic parity report", () => { it("fails the parity gate when the candidate regresses against baseline", () => { const comparison = buildQaAgenticParityComparison({ - candidateLabel: "openai/gpt-5.4", + candidateLabel: "openai/gpt-5.5", baselineLabel: "anthropic/claude-opus-4-6", candidateSummary: { scenarios: [ @@ -103,10 +103,10 @@ describe("qa agentic parity report", () => { expect(comparison.pass).toBe(false); expect(comparison.failures).toContain( - "openai/gpt-5.4 completion rate 80.0% is below anthropic/claude-opus-4-6 100.0%.", + "openai/gpt-5.5 completion rate 80.0% is below anthropic/claude-opus-4-6 100.0%.", ); expect(comparison.failures).toContain( - "openai/gpt-5.4 unintended-stop rate 20.0% exceeds anthropic/claude-opus-4-6 0.0%.", + "openai/gpt-5.5 unintended-stop rate 20.0% exceeds anthropic/claude-opus-4-6 0.0%.", ); }); @@ -120,7 +120,7 @@ describe("qa agentic parity report", () => { { name: "Extra non-parity lane", status: "pass" as const }, ]; const comparison = buildQaAgenticParityComparison({ - candidateLabel: "openai/gpt-5.4", + candidateLabel: "openai/gpt-5.5", baselineLabel: "anthropic/claude-opus-4-6", candidateSummary: { scenarios: baselineScenarios.filter( @@ -133,13 +133,13 @@ describe("qa agentic parity report", () => { expect(comparison.pass).toBe(false); expect(comparison.failures).toContain( - "Scenario coverage mismatch for Extra non-parity lane: openai/gpt-5.4=missing, anthropic/claude-opus-4-6=pass.", + "Scenario coverage mismatch for Extra non-parity lane: openai/gpt-5.5=missing, anthropic/claude-opus-4-6=pass.", ); }); it("reports each missing required parity scenario exactly once (no double-counting)", () => { const comparison = buildQaAgenticParityComparison({ - candidateLabel: "openai/gpt-5.4", + candidateLabel: "openai/gpt-5.5", baselineLabel: "anthropic/claude-opus-4-6", candidateSummary: { scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }], @@ -181,7 +181,7 @@ describe("qa agentic parity report", () => { }; const comparison = buildQaAgenticParityComparison({ - candidateLabel: "openai/gpt-5.4", + candidateLabel: "openai/gpt-5.5", baselineLabel: "anthropic/claude-opus-4-6", candidateSummary: summaryWithExtras, baselineSummary: scopedSummary, @@ -203,7 +203,7 @@ describe("qa agentic parity report", () => { it("fails the parity gate when required parity scenarios are missing on both sides", () => { const comparison = buildQaAgenticParityComparison({ - candidateLabel: "openai/gpt-5.4", + candidateLabel: "openai/gpt-5.5", baselineLabel: "anthropic/claude-opus-4-6", candidateSummary: { scenarios: [{ name: "Approval turn tool followthrough", status: "pass" }], @@ -216,13 +216,13 @@ describe("qa agentic parity report", () => { expect(comparison.pass).toBe(false); expect(comparison.failures).toContain( - "Missing required parity scenario coverage for Image understanding from attachment: openai/gpt-5.4=missing, anthropic/claude-opus-4-6=missing.", + "Missing required parity scenario coverage for Image understanding from attachment: openai/gpt-5.5=missing, anthropic/claude-opus-4-6=missing.", ); }); it("fails the parity gate when required parity scenarios are skipped", () => { const comparison = buildQaAgenticParityComparison({ - candidateLabel: "openai/gpt-5.4", + candidateLabel: "openai/gpt-5.5", baselineLabel: "anthropic/claude-opus-4-6", candidateSummary: { scenarios: [ @@ -247,7 +247,7 @@ describe("qa agentic parity report", () => { expect(comparison.pass).toBe(false); expect(comparison.failures).toContain( - "Missing required parity scenario coverage for Compaction retry after mutating tool: openai/gpt-5.4=skip, anthropic/claude-opus-4-6=skip.", + "Missing required parity scenario coverage for Compaction retry after mutating tool: openai/gpt-5.5=skip, anthropic/claude-opus-4-6=skip.", ); }); @@ -263,7 +263,7 @@ describe("qa agentic parity report", () => { status: "fail", }); const comparison = buildQaAgenticParityComparison({ - candidateLabel: "openai/gpt-5.4", + candidateLabel: "openai/gpt-5.5", baselineLabel: "anthropic/claude-opus-4-6", candidateSummary: { scenarios: scenariosWithBothFail }, baselineSummary: { scenarios: scenariosWithBothFail }, @@ -272,7 +272,7 @@ describe("qa agentic parity report", () => { expect(comparison.pass).toBe(false); expect(comparison.failures).toContain( - "Required parity scenario Approval turn tool followthrough failed: openai/gpt-5.4=fail, anthropic/claude-opus-4-6=fail.", + "Required parity scenario Approval turn tool followthrough failed: openai/gpt-5.5=fail, anthropic/claude-opus-4-6=fail.", ); // Metric comparisons are relative, so a same-on-both-sides failure // must not appear as a relative metric failure. The required-scenario @@ -289,7 +289,7 @@ describe("qa agentic parity report", () => { status: "fail", }); const comparison = buildQaAgenticParityComparison({ - candidateLabel: "openai/gpt-5.4", + candidateLabel: "openai/gpt-5.5", baselineLabel: "anthropic/claude-opus-4-6", candidateSummary: { scenarios: candidateWithOneFail }, baselineSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS }, @@ -298,7 +298,7 @@ describe("qa agentic parity report", () => { expect(comparison.pass).toBe(false); expect(comparison.failures).toContain( - "Required parity scenario Approval turn tool followthrough failed: openai/gpt-5.4=fail, anthropic/claude-opus-4-6=pass.", + "Required parity scenario Approval turn tool followthrough failed: openai/gpt-5.5=fail, anthropic/claude-opus-4-6=pass.", ); }); @@ -306,7 +306,7 @@ describe("qa agentic parity report", () => { // Cover the full second-wave pack on both sides so the suspicious-pass assertion // below is the isolated gate failure under test (no coverage-gap noise). const comparison = buildQaAgenticParityComparison({ - candidateLabel: "openai/gpt-5.4", + candidateLabel: "openai/gpt-5.5", baselineLabel: "anthropic/claude-opus-4-6", candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS, @@ -490,7 +490,7 @@ status=done`, expect(() => buildQaAgenticParityComparison({ - candidateLabel: "openai/gpt-5.4", + candidateLabel: "openai/gpt-5.5", baselineLabel: "anthropic/claude-opus-4-6", candidateSummary: { scenarios: parityPassScenarios, @@ -512,7 +512,7 @@ status=done`, expect(() => buildQaAgenticParityComparison({ - candidateLabel: "openai/gpt-5.4", + candidateLabel: "openai/gpt-5.5", baselineLabel: "anthropic/claude-opus-4-6", candidateSummary: { scenarios: parityPassScenarios, @@ -520,25 +520,25 @@ status=done`, }, baselineSummary: { scenarios: parityPassScenarios, - run: { primaryProvider: "openai", primaryModel: "gpt-5.4" }, + run: { primaryProvider: "openai", primaryModel: "gpt-5.5" }, }, comparedAt: "2026-04-11T00:00:00.000Z", }), ).toThrow( - /baseline summary run\.primaryProvider=openai and run\.primaryModel=gpt-5\.4 do not match --baseline-label/, + /baseline summary run\.primaryProvider=openai and run\.primaryModel=gpt-5\.5 do not match --baseline-label/, ); }); it("accepts matching run.primaryProvider labels without throwing", () => { const comparison = buildQaAgenticParityComparison({ - candidateLabel: "openai/gpt-5.4", + candidateLabel: "openai/gpt-5.5", baselineLabel: "anthropic/claude-opus-4-6", candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS, run: { primaryProvider: "openai", - primaryModel: "openai/gpt-5.4", - primaryModelName: "gpt-5.4", + primaryModel: "openai/gpt-5.5", + primaryModelName: "gpt-5.5", }, }, baselineSummary: { @@ -558,7 +558,7 @@ status=done`, // Pre-PR-L summaries don't carry a `run` block. The gate must still // work against those, trusting the caller-supplied label. const comparison = buildQaAgenticParityComparison({ - candidateLabel: "openai/gpt-5.4", + candidateLabel: "openai/gpt-5.5", baselineLabel: "anthropic/claude-opus-4-6", candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS }, baselineSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS }, @@ -569,14 +569,14 @@ status=done`, it("skips provider verification for arbitrary display labels when run metadata is present", () => { const comparison = buildQaAgenticParityComparison({ - candidateLabel: "GPT-5.4 candidate", + candidateLabel: "GPT-5.5 candidate", baselineLabel: "Opus 4.6 baseline", candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS, run: { primaryProvider: "openai", - primaryModel: "openai/gpt-5.4", - primaryModelName: "gpt-5.4", + primaryModel: "openai/gpt-5.5", + primaryModelName: "gpt-5.5", }, }, baselineSummary: { @@ -595,14 +595,14 @@ status=done`, it("skips provider verification for mixed-case or decorated display labels", () => { const comparison = buildQaAgenticParityComparison({ - candidateLabel: "Candidate: GPT-5.4", + candidateLabel: "Candidate: GPT-5.5", baselineLabel: "Opus 4.6 / baseline", candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS, run: { primaryProvider: "openai", - primaryModel: "openai/gpt-5.4", - primaryModelName: "gpt-5.4", + primaryModel: "openai/gpt-5.5", + primaryModelName: "gpt-5.5", }, }, baselineSummary: { @@ -622,14 +622,14 @@ status=done`, it("throws when a structured label mismatches the recorded model even if the provider matches", () => { expect(() => buildQaAgenticParityComparison({ - candidateLabel: "openai/gpt-5.4", + candidateLabel: "openai/gpt-5.5", baselineLabel: "anthropic/claude-opus-4-6", candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS, run: { primaryProvider: "openai", - primaryModel: "openai/gpt-5.4-alt", - primaryModelName: "gpt-5.4-alt", + primaryModel: "openai/gpt-5.5-alt", + primaryModelName: "gpt-5.5-alt", }, }, baselineSummary: { @@ -643,20 +643,20 @@ status=done`, comparedAt: "2026-04-11T00:00:00.000Z", }), ).toThrow( - /candidate summary run\.primaryProvider=openai and run\.primaryModel=openai\/gpt-5\.4-alt do not match --candidate-label=openai\/gpt-5\.4/, + /candidate summary run\.primaryProvider=openai and run\.primaryModel=openai\/gpt-5\.5-alt do not match --candidate-label=openai\/gpt-5\.5/, ); }); it("accepts colon-delimited structured labels when provider and model both match", () => { const comparison = buildQaAgenticParityComparison({ - candidateLabel: "openai:gpt-5.4", + candidateLabel: "openai:gpt-5.5", baselineLabel: "anthropic:claude-opus-4-6", candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS, run: { primaryProvider: "openai", - primaryModel: "openai/gpt-5.4", - primaryModelName: "gpt-5.4", + primaryModel: "openai/gpt-5.5", + primaryModelName: "gpt-5.5", }, }, baselineSummary: { @@ -678,7 +678,7 @@ status=done`, // verdict is not disrupted by required-scenario coverage failures // added by the second-wave expansion. const comparison = buildQaAgenticParityComparison({ - candidateLabel: "openai/gpt-5.4", + candidateLabel: "openai/gpt-5.5", baselineLabel: "anthropic/claude-opus-4-6", candidateSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS }, baselineSummary: { scenarios: FULL_PARITY_PASS_SCENARIOS }, @@ -688,7 +688,7 @@ status=done`, const report = renderQaAgenticParityMarkdownReport(comparison); expect(report).toContain( - "# OpenClaw Agentic Parity Report — openai/gpt-5.4 vs anthropic/claude-opus-4-6", + "# OpenClaw Agentic Parity Report — openai/gpt-5.5 vs anthropic/claude-opus-4-6", ); expect(report).toContain("| Completion rate | 100.0% | 100.0% |"); expect(report).toContain("### Approval turn tool followthrough"); @@ -697,20 +697,20 @@ status=done`, it("parametrizes the markdown header from the comparison labels", () => { // Regression for the loop-7 Copilot finding: callers that configure - // non-gpt-5.4 / non-opus labels (for example an internal candidate vs + // non-gpt-5.5 / non-opus labels (for example an internal candidate vs // another candidate) must see the labels in the rendered H1 instead of - // the hardcoded "GPT-5.4 / Opus 4.6" title that would otherwise confuse + // the hardcoded "GPT-5.5 / Opus 4.6" title that would otherwise confuse // readers of saved reports. const comparison = buildQaAgenticParityComparison({ - candidateLabel: "openai/gpt-5.4-alt", - baselineLabel: "openai/gpt-5.4", + candidateLabel: "openai/gpt-5.5-alt", + baselineLabel: "openai/gpt-5.5", candidateSummary: { scenarios: [] }, baselineSummary: { scenarios: [] }, comparedAt: "2026-04-11T00:00:00.000Z", }); const report = renderQaAgenticParityMarkdownReport(comparison); expect(report).toContain( - "# OpenClaw Agentic Parity Report — openai/gpt-5.4-alt vs openai/gpt-5.4", + "# OpenClaw Agentic Parity Report — openai/gpt-5.5-alt vs openai/gpt-5.5", ); }); }); diff --git a/extensions/qa-lab/src/agentic-parity-report.ts b/extensions/qa-lab/src/agentic-parity-report.ts index b1742631956..7a60aba36e7 100644 --- a/extensions/qa-lab/src/agentic-parity-report.ts +++ b/extensions/qa-lab/src/agentic-parity-report.ts @@ -225,7 +225,7 @@ type StructuredQaParityLabel = { /** * Only treat caller labels as provenance-checked identifiers when they are * exact lower-case provider/model refs. Human-facing display labels like - * "GPT-5.4 candidate" or "Candidate: GPT-5.4" should render in the report + * "GPT-5.5 candidate" or "Candidate: GPT-5.5" should render in the report * without being misread as structured provider ids. */ function parseStructuredLabelRef(label: string): StructuredQaParityLabel | null { @@ -486,9 +486,9 @@ export function buildQaAgenticParityComparison(params: { export function renderQaAgenticParityMarkdownReport(comparison: QaAgenticParityComparison): string { // Title is parametrized from the candidate / baseline labels so reports - // for any candidate/baseline pair (not only gpt-5.4 vs opus 4.6) render + // for any candidate/baseline pair (not only gpt-5.5 vs opus 4.6) render // with an accurate header. The default CLI labels are still - // openai/gpt-5.4 vs anthropic/claude-opus-4-6, but the helper works for + // openai/gpt-5.5 vs anthropic/claude-opus-4-6, but the helper works for // any parity comparison a caller configures. const lines = [ `# OpenClaw Agentic Parity Report — ${comparison.candidateLabel} vs ${comparison.baselineLabel}`, diff --git a/extensions/qa-lab/src/character-eval.test.ts b/extensions/qa-lab/src/character-eval.test.ts index dc9d4956211..8dd8277f7d4 100644 --- a/extensions/qa-lab/src/character-eval.test.ts +++ b/extensions/qa-lab/src/character-eval.test.ts @@ -82,7 +82,7 @@ describe("runQaCharacterEval", () => { }); const runJudge = makeRunJudge([ { - model: "openai/gpt-5.4", + model: "openai/gpt-5.5", rank: 1, score: 9.1, summary: "Most natural.", @@ -102,10 +102,10 @@ describe("runQaCharacterEval", () => { const result = await runQaCharacterEval({ repoRoot: tempRoot, outputDir: path.join(tempRoot, "character"), - models: ["openai/gpt-5.4", "codex-cli/test-model", "openai/gpt-5.4"], + models: ["openai/gpt-5.5", "codex-cli/test-model", "openai/gpt-5.5"], scenarioId: "character-vibes-gollum", candidateFastMode: true, - judgeModels: ["openai/gpt-5.4"], + judgeModels: ["openai/gpt-5.5"], runSuite, runJudge, }); @@ -115,15 +115,15 @@ describe("runQaCharacterEval", () => { 1, expect.objectContaining({ providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5", fastMode: true, scenarioIds: ["character-vibes-gollum"], }), ); expect(runJudge).toHaveBeenCalledWith( expect.objectContaining({ - judgeModel: "openai/gpt-5.4", + judgeModel: "openai/gpt-5.5", judgeThinkingDefault: "xhigh", judgeFastMode: true, timeoutMs: 300_000, @@ -131,17 +131,17 @@ describe("runQaCharacterEval", () => { ); expect(result.judgments).toHaveLength(1); expect(result.judgments[0]?.rankings.map((ranking) => ranking.model)).toEqual([ - "openai/gpt-5.4", + "openai/gpt-5.5", "codex-cli/test-model", ]); const report = await fs.readFile(result.reportPath, "utf8"); expect(report).toContain("Execution: local QA gateway child processes, not Docker"); - expect(report).toContain("Judges: openai/gpt-5.4"); + expect(report).toContain("Judges: openai/gpt-5.5"); expect(report).toContain("Judge model labels: visible"); expect(report).toContain("## Judge Rankings"); - expect(report).toContain("### openai/gpt-5.4"); - expect(report).toContain("reply from openai/gpt-5.4"); + expect(report).toContain("### openai/gpt-5.5"); + expect(report).toContain("reply from openai/gpt-5.5"); expect(report).toContain("reply from codex-cli/test-model"); expect(report).toContain("Judge thinking: xhigh"); expect(report).toContain("- Timeout: 5m"); @@ -162,7 +162,7 @@ describe("runQaCharacterEval", () => { const runJudge = vi.fn(async (params: CharacterRunJudgeParams) => { expect(params.prompt).toContain("## CANDIDATE candidate-01"); expect(params.prompt).toContain("## CANDIDATE candidate-02"); - expect(params.prompt).not.toContain("openai/gpt-5.4"); + expect(params.prompt).not.toContain("openai/gpt-5.5"); expect(params.prompt).not.toContain("codex-cli/test-model"); return makeJudgeReply([ { @@ -183,8 +183,8 @@ describe("runQaCharacterEval", () => { const result = await runQaCharacterEval({ repoRoot: tempRoot, outputDir: path.join(tempRoot, "character"), - models: ["openai/gpt-5.4", "codex-cli/test-model"], - judgeModels: ["openai/gpt-5.4"], + models: ["openai/gpt-5.5", "codex-cli/test-model"], + judgeModels: ["openai/gpt-5.5"], judgeBlindModels: true, runSuite, runJudge, @@ -193,7 +193,7 @@ describe("runQaCharacterEval", () => { expect(result.judgments[0]?.blindModels).toBe(true); expect(result.judgments[0]?.rankings.map((ranking) => ranking.model)).toEqual([ "codex-cli/test-model", - "openai/gpt-5.4", + "openai/gpt-5.5", ]); const report = await fs.readFile(result.reportPath, "utf8"); expect(report).toContain("Judge model labels: blind"); @@ -203,7 +203,7 @@ describe("runQaCharacterEval", () => { it("defaults to the character eval model panel when no models are provided", async () => { const runSuite = makeRunSuite(); const runJudge = makeRunJudge([ - { model: "openai/gpt-5.4", rank: 1, score: 8, summary: "ok" }, + { model: "openai/gpt-5.5", rank: 1, score: 8, summary: "ok" }, { model: "openai/gpt-5.2", rank: 2, score: 7.5, summary: "ok" }, { model: "openai/gpt-5", rank: 3, score: 7.2, summary: "ok" }, { model: "anthropic/claude-opus-4-6", rank: 4, score: 7, summary: "ok" }, @@ -223,7 +223,7 @@ describe("runQaCharacterEval", () => { expect(runSuite).toHaveBeenCalledTimes(8); expect(runSuite.mock.calls.map(([params]) => params.primaryModel)).toEqual([ - "openai/gpt-5.4", + "openai/gpt-5.5", "openai/gpt-5.2", "openai/gpt-5", "anthropic/claude-opus-4-6", @@ -254,7 +254,7 @@ describe("runQaCharacterEval", () => { ]); expect(runJudge).toHaveBeenCalledTimes(2); expect(runJudge.mock.calls.map(([params]) => params.judgeModel)).toEqual([ - "openai/gpt-5.4", + "openai/gpt-5.5", "anthropic/claude-opus-4-6", ]); expect(runJudge.mock.calls.map(([params]) => params.judgeThinkingDefault)).toEqual([ @@ -275,7 +275,7 @@ describe("runQaCharacterEval", () => { return makeReplySuiteResult(params); }); const runJudge = makeRunJudge([ - { model: "openai/gpt-5.4", rank: 1, score: 8, summary: "ok" }, + { model: "openai/gpt-5.5", rank: 1, score: 8, summary: "ok" }, { model: "anthropic/claude-sonnet-4-6", rank: 2, score: 7, summary: "ok" }, { model: "moonshot/kimi-k2.5", rank: 3, score: 6, summary: "ok" }, ]); @@ -283,16 +283,16 @@ describe("runQaCharacterEval", () => { const result = await runQaCharacterEval({ repoRoot: tempRoot, outputDir: path.join(tempRoot, "character"), - models: ["openai/gpt-5.4", "anthropic/claude-sonnet-4-6", "moonshot/kimi-k2.5"], + models: ["openai/gpt-5.5", "anthropic/claude-sonnet-4-6", "moonshot/kimi-k2.5"], candidateConcurrency: 2, - judgeModels: ["openai/gpt-5.4"], + judgeModels: ["openai/gpt-5.5"], runSuite, runJudge, }); expect(maxActiveRuns).toBe(2); expect(result.runs.map((run) => run.model)).toEqual([ - "openai/gpt-5.4", + "openai/gpt-5.5", "anthropic/claude-sonnet-4-6", "moonshot/kimi-k2.5", ]); @@ -355,7 +355,7 @@ describe("runQaCharacterEval", () => { repoRoot: tempRoot, outputDir: path.join(tempRoot, "character"), models: ["qwen/qwen3.6-plus"], - judgeModels: ["openai/gpt-5.4"], + judgeModels: ["openai/gpt-5.5"], runSuite, runJudge, }); @@ -383,7 +383,7 @@ describe("runQaCharacterEval", () => { repoRoot: tempRoot, outputDir: path.join(tempRoot, "character"), models: ["qwen/qwen3.5-plus"], - judgeModels: ["openai/gpt-5.4"], + judgeModels: ["openai/gpt-5.5"], runSuite, runJudge, }); @@ -412,7 +412,7 @@ describe("runQaCharacterEval", () => { repoRoot: tempRoot, outputDir: path.join(tempRoot, "character"), models: ["qa/generic-fallback-model"], - judgeModels: ["openai/gpt-5.4"], + judgeModels: ["openai/gpt-5.5"], runSuite, runJudge, }); @@ -441,7 +441,7 @@ describe("runQaCharacterEval", () => { repoRoot: tempRoot, outputDir: path.join(tempRoot, "character"), models: ["google/gemini-test"], - judgeModels: ["openai/gpt-5.4"], + judgeModels: ["openai/gpt-5.5"], runSuite, runJudge, }); @@ -463,20 +463,20 @@ describe("runQaCharacterEval", () => { }), ); const runJudge = makeRunJudge([ - { model: "codex/gpt-5.4", rank: 1, score: 0.5, summary: "failed" }, + { model: "codex/gpt-5.5", rank: 1, score: 0.5, summary: "failed" }, ]); const result = await runQaCharacterEval({ repoRoot: tempRoot, outputDir: path.join(tempRoot, "character"), - models: ["codex/gpt-5.4"], - judgeModels: ["openai/gpt-5.4"], + models: ["codex/gpt-5.5"], + judgeModels: ["openai/gpt-5.5"], runSuite, runJudge, }); expect(result.runs[0]).toMatchObject({ - model: "codex/gpt-5.4", + model: "codex/gpt-5.5", status: "fail", error: "internal harness/meta text leaked into transcript", }); @@ -485,17 +485,17 @@ describe("runQaCharacterEval", () => { it("lets explicit candidate thinking override the default panel", async () => { const runSuite = makeRunSuite(); const runJudge = makeRunJudge([ - { model: "openai/gpt-5.4", rank: 1, score: 8, summary: "ok" }, + { model: "openai/gpt-5.5", rank: 1, score: 8, summary: "ok" }, { model: "moonshot/kimi-k2.5", rank: 2, score: 7, summary: "ok" }, ]); await runQaCharacterEval({ repoRoot: tempRoot, outputDir: path.join(tempRoot, "character"), - models: ["openai/gpt-5.4", "moonshot/kimi-k2.5"], + models: ["openai/gpt-5.5", "moonshot/kimi-k2.5"], candidateThinkingDefault: "medium", candidateThinkingByModel: { "moonshot/kimi-k2.5": "high" }, - judgeModels: ["openai/gpt-5.4"], + judgeModels: ["openai/gpt-5.5"], runSuite, runJudge, }); @@ -508,21 +508,21 @@ describe("runQaCharacterEval", () => { it("lets model-specific options override candidate and judge defaults", async () => { const runSuite = makeRunSuite(); - const runJudge = makeRunJudge([{ model: "openai/gpt-5.4", rank: 1, score: 8, summary: "ok" }]); + const runJudge = makeRunJudge([{ model: "openai/gpt-5.5", rank: 1, score: 8, summary: "ok" }]); await runQaCharacterEval({ repoRoot: tempRoot, outputDir: path.join(tempRoot, "character"), - models: ["openai/gpt-5.4", "moonshot/kimi-k2.5"], + models: ["openai/gpt-5.5", "moonshot/kimi-k2.5"], candidateFastMode: true, candidateThinkingDefault: "medium", candidateModelOptions: { - "openai/gpt-5.4": { thinkingDefault: "xhigh", fastMode: false }, + "openai/gpt-5.5": { thinkingDefault: "xhigh", fastMode: false }, }, - judgeModels: ["openai/gpt-5.4", "anthropic/claude-opus-4-6"], + judgeModels: ["openai/gpt-5.5", "anthropic/claude-opus-4-6"], judgeThinkingDefault: "medium", judgeModelOptions: { - "openai/gpt-5.4": { thinkingDefault: "xhigh", fastMode: true }, + "openai/gpt-5.5": { thinkingDefault: "xhigh", fastMode: true }, "anthropic/claude-opus-4-6": { thinkingDefault: "high" }, }, runSuite, @@ -554,15 +554,15 @@ describe("runQaCharacterEval", () => { }); const runJudge = vi.fn(async (_params: CharacterRunJudgeParams) => JSON.stringify({ - rankings: [{ model: "openai/gpt-5.4", rank: 1, score: 8, summary: "ok" }], + rankings: [{ model: "openai/gpt-5.5", rank: 1, score: 8, summary: "ok" }], }), ); const result = await runQaCharacterEval({ repoRoot: tempRoot, outputDir: path.join(tempRoot, "character"), - models: ["openai/gpt-5.4", "codex-cli/test-model"], - judgeModels: ["openai/gpt-5.4"], + models: ["openai/gpt-5.5", "codex-cli/test-model"], + judgeModels: ["openai/gpt-5.5"], runSuite, runJudge, }); diff --git a/extensions/qa-lab/src/cli.runtime.test.ts b/extensions/qa-lab/src/cli.runtime.test.ts index a7f96c13b24..cd850c25b37 100644 --- a/extensions/qa-lab/src/cli.runtime.test.ts +++ b/extensions/qa-lab/src/cli.runtime.test.ts @@ -131,7 +131,7 @@ describe("qa cli runtime", () => { summaryPath: "/tmp/character-summary.json", }); runQaManualLane.mockResolvedValue({ - model: "openai/gpt-5.4", + model: "openai/gpt-5.5", waited: { status: "ok" }, reply: "done", watchUrl: "http://127.0.0.1:43124", @@ -186,7 +186,7 @@ describe("qa cli runtime", () => { repoRoot: "/tmp/openclaw-repo", outputDir: ".artifacts/qa/frontier", providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", alternateModel: "anthropic/claude-sonnet-4-6", fastMode: true, thinking: "medium", @@ -198,7 +198,7 @@ describe("qa cli runtime", () => { outputDir: path.resolve("/tmp/openclaw-repo", ".artifacts/qa/frontier"), transportId: "qa-channel", providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", alternateModel: "anthropic/claude-sonnet-4-6", fastMode: true, thinkingDefault: "medium", @@ -211,8 +211,8 @@ describe("qa cli runtime", () => { repoRoot: "/tmp/openclaw-repo", outputDir: ".artifacts/qa/telegram", providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5", fastMode: true, scenarioIds: ["telegram-help-command"], sutAccountId: "sut-live", @@ -222,8 +222,8 @@ describe("qa cli runtime", () => { repoRoot: path.resolve("/tmp/openclaw-repo"), outputDir: path.resolve("/tmp/openclaw-repo", ".artifacts/qa/telegram"), providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5", fastMode: true, allowFailures: undefined, scenarioIds: ["telegram-help-command"], @@ -509,7 +509,7 @@ describe("qa cli runtime", () => { await runQaSuiteCommand({ repoRoot: "/tmp/openclaw-repo", providerMode: "mock-openai", - primaryModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", alternateModel: "anthropic/claude-opus-4-6", preflight: true, }); @@ -521,7 +521,7 @@ describe("qa cli runtime", () => { ), transportId: "qa-channel", providerMode: "mock-openai", - primaryModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", alternateModel: "anthropic/claude-opus-4-6", scenarioIds: ["approval-turn-tool-followthrough"], concurrency: 1, @@ -709,14 +709,14 @@ describe("qa cli runtime", () => { repoRoot: "/tmp/openclaw-repo", outputDir: ".artifacts/qa/character", model: [ - "openai/gpt-5.4,thinking=xhigh,fast=false", + "openai/gpt-5.5,thinking=xhigh,fast=false", "codex-cli/test-model,thinking=high,fast", ], scenario: "character-vibes-gollum", fast: true, thinking: "medium", modelThinking: ["codex-cli/test-model=medium"], - judgeModel: ["openai/gpt-5.4,thinking=xhigh,fast", "anthropic/claude-opus-4-6,thinking=high"], + judgeModel: ["openai/gpt-5.5,thinking=xhigh,fast", "anthropic/claude-opus-4-6,thinking=high"], judgeTimeoutMs: 180_000, blindJudgeModels: true, concurrency: 4, @@ -726,18 +726,18 @@ describe("qa cli runtime", () => { expect(runQaCharacterEval).toHaveBeenCalledWith({ repoRoot: path.resolve("/tmp/openclaw-repo"), outputDir: path.resolve("/tmp/openclaw-repo", ".artifacts/qa/character"), - models: ["openai/gpt-5.4", "codex-cli/test-model"], + models: ["openai/gpt-5.5", "codex-cli/test-model"], scenarioId: "character-vibes-gollum", candidateFastMode: true, candidateThinkingDefault: "medium", candidateThinkingByModel: { "codex-cli/test-model": "medium" }, candidateModelOptions: { - "openai/gpt-5.4": { thinkingDefault: "xhigh", fastMode: false }, + "openai/gpt-5.5": { thinkingDefault: "xhigh", fastMode: false }, "codex-cli/test-model": { thinkingDefault: "high", fastMode: true }, }, - judgeModels: ["openai/gpt-5.4", "anthropic/claude-opus-4-6"], + judgeModels: ["openai/gpt-5.5", "anthropic/claude-opus-4-6"], judgeModelOptions: { - "openai/gpt-5.4": { thinkingDefault: "xhigh", fastMode: true }, + "openai/gpt-5.5": { thinkingDefault: "xhigh", fastMode: true }, "anthropic/claude-opus-4-6": { thinkingDefault: "high" }, }, judgeTimeoutMs: 180_000, @@ -751,13 +751,13 @@ describe("qa cli runtime", () => { it("lets character eval auto-select candidate fast mode when --fast is omitted", async () => { await runQaCharacterEvalCommand({ repoRoot: "/tmp/openclaw-repo", - model: ["openai/gpt-5.4"], + model: ["openai/gpt-5.5"], }); expect(runQaCharacterEval).toHaveBeenCalledWith({ repoRoot: path.resolve("/tmp/openclaw-repo"), outputDir: undefined, - models: ["openai/gpt-5.4"], + models: ["openai/gpt-5.5"], scenarioId: undefined, candidateFastMode: undefined, candidateThinkingDefault: undefined, @@ -777,7 +777,7 @@ describe("qa cli runtime", () => { await expect( runQaCharacterEvalCommand({ repoRoot: "/tmp/openclaw-repo", - model: ["openai/gpt-5.4"], + model: ["openai/gpt-5.5"], thinking: "enormous", }), ).rejects.toThrow("--thinking must be one of"); @@ -785,22 +785,22 @@ describe("qa cli runtime", () => { await expect( runQaCharacterEvalCommand({ repoRoot: "/tmp/openclaw-repo", - model: ["openai/gpt-5.4,thinking=galaxy"], + model: ["openai/gpt-5.5,thinking=galaxy"], }), ).rejects.toThrow("--model thinking must be one of"); await expect( runQaCharacterEvalCommand({ repoRoot: "/tmp/openclaw-repo", - model: ["openai/gpt-5.4,warp"], + model: ["openai/gpt-5.5,warp"], }), ).rejects.toThrow("--model options must be thinking="); await expect( runQaCharacterEvalCommand({ repoRoot: "/tmp/openclaw-repo", - model: ["openai/gpt-5.4"], - modelThinking: ["openai/gpt-5.4"], + model: ["openai/gpt-5.5"], + modelThinking: ["openai/gpt-5.5"], }), ).rejects.toThrow("--model-thinking must use provider/model=level"); }); @@ -809,8 +809,8 @@ describe("qa cli runtime", () => { await runQaManualLaneCommand({ repoRoot: "/tmp/openclaw-repo", providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5", fastMode: true, message: "read qa kickoff and reply short", timeoutMs: 45_000, @@ -820,8 +820,8 @@ describe("qa cli runtime", () => { repoRoot: path.resolve("/tmp/openclaw-repo"), transportId: "qa-channel", providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5", fastMode: true, message: "read qa kickoff and reply short", timeoutMs: 45_000, @@ -867,8 +867,8 @@ describe("qa cli runtime", () => { repoRoot: "/tmp/openclaw-repo", runner: "multipass", providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5", fastMode: true, allowFailures: true, scenarioIds: ["channel-chat-baseline"], @@ -879,8 +879,8 @@ describe("qa cli runtime", () => { repoRoot: path.resolve("/tmp/openclaw-repo"), transportId: "qa-channel", providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5", fastMode: true, allowFailures: true, scenarioIds: ["channel-chat-baseline"], @@ -1052,7 +1052,7 @@ describe("qa cli runtime", () => { repoRoot: "/tmp/openclaw-repo", providerMode: "mock-openai", parityPack: "agentic", - primaryModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", alternateModel: "anthropic/claude-opus-4-6", }); @@ -1061,7 +1061,7 @@ describe("qa cli runtime", () => { outputDir: undefined, transportId: "qa-channel", providerMode: "mock-openai", - primaryModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", alternateModel: "anthropic/claude-opus-4-6", fastMode: undefined, scenarioIds: [ @@ -1102,8 +1102,8 @@ describe("qa cli runtime", () => { repoRoot: path.resolve("/tmp/openclaw-repo"), transportId: "qa-channel", providerMode: "mock-openai", - primaryModel: "mock-openai/gpt-5.4", - alternateModel: "mock-openai/gpt-5.4-alt", + primaryModel: "mock-openai/gpt-5.5", + alternateModel: "mock-openai/gpt-5.5-alt", fastMode: undefined, message: "read qa kickoff and reply short", timeoutMs: undefined, @@ -1121,8 +1121,8 @@ describe("qa cli runtime", () => { repoRoot: path.resolve("/tmp/openclaw-repo"), transportId: "qa-channel", providerMode: "aimock", - primaryModel: "aimock/gpt-5.4", - alternateModel: "aimock/gpt-5.4-alt", + primaryModel: "aimock/gpt-5.5", + alternateModel: "aimock/gpt-5.5-alt", fastMode: undefined, message: "read qa kickoff and reply short", timeoutMs: undefined, @@ -1139,8 +1139,8 @@ describe("qa cli runtime", () => { repoRoot: path.resolve("/tmp/openclaw-repo"), transportId: "qa-channel", providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5", fastMode: undefined, message: "read qa kickoff and reply short", timeoutMs: undefined, @@ -1170,7 +1170,7 @@ describe("qa cli runtime", () => { it("defaults manual frontier runs onto Codex OAuth when the runtime resolver prefers it", async () => { defaultQaRuntimeModelForMode.mockImplementation((mode, options) => mode === "live-frontier" - ? "openai/gpt-5.4" + ? "openai/gpt-5.5" : defaultQaProviderModelForMode(mode as QaProviderModeInput, options), ); @@ -1183,8 +1183,8 @@ describe("qa cli runtime", () => { repoRoot: path.resolve("/tmp/openclaw-repo"), transportId: "qa-channel", providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5", fastMode: undefined, message: "read qa kickoff and reply short", timeoutMs: undefined, diff --git a/extensions/qa-lab/src/lab-server-capture.test.ts b/extensions/qa-lab/src/lab-server-capture.test.ts index eb75418cac9..a612ec16e77 100644 --- a/extensions/qa-lab/src/lab-server-capture.test.ts +++ b/extensions/qa-lab/src/lab-server-capture.test.ts @@ -19,7 +19,7 @@ describe("qa-lab server capture helpers", () => { metaJson: JSON.stringify({ provider: "openai", api: "responses", - model: "gpt-5.4", + model: "gpt-5.5", captureOrigin: "shared-fetch", }), }), @@ -29,7 +29,7 @@ describe("qa-lab server capture helpers", () => { payloadPreview: '{"hello":"world"}', provider: "openai", api: "responses", - model: "gpt-5.4", + model: "gpt-5.5", captureOrigin: "shared-fetch", }), ); diff --git a/extensions/qa-lab/src/lab-server.test.ts b/extensions/qa-lab/src/lab-server.test.ts index 0e4e0b8f716..ca6fa268c13 100644 --- a/extensions/qa-lab/src/lab-server.test.ts +++ b/extensions/qa-lab/src/lab-server.test.ts @@ -508,9 +508,9 @@ describe("qa-lab server", () => { `fs.writeFileSync(${JSON.stringify(markerPath)}, process.argv.slice(2).join(" "), "utf8");`, "process.stdout.write(JSON.stringify({", " models: [{", - ' key: "openai/gpt-5.4",', - ' name: "GPT-5.4",', - ' input: "openai/gpt-5.4",', + ' key: "openai/gpt-5.5",', + ' name: "GPT-5.5",', + ' input: "openai/gpt-5.5",', " available: true,", " missing: false,", " }],", @@ -726,7 +726,7 @@ describe("qa-lab server", () => { metaJson: JSON.stringify({ provider: "openai", api: "responses", - model: "gpt-5.4", + model: "gpt-5.5", captureOrigin: "shared-fetch", }), }); @@ -747,7 +747,7 @@ describe("qa-lab server", () => { metaJson: JSON.stringify({ provider: "openai", api: "responses", - model: "gpt-5.4", + model: "gpt-5.5", captureOrigin: "shared-fetch", }), }); @@ -796,7 +796,7 @@ describe("qa-lab server", () => { expect.objectContaining({ flowId: "flow-1", provider: "openai", - model: "gpt-5.4", + model: "gpt-5.5", captureOrigin: "shared-fetch", }), expect.objectContaining({ @@ -828,7 +828,7 @@ describe("qa-lab server", () => { ); expect(coverage.coverage.models).toEqual( expect.arrayContaining([ - expect.objectContaining({ value: "gpt-5.4", count: 2 }), + expect.objectContaining({ value: "gpt-5.5", count: 2 }), expect.objectContaining({ value: "kimi-k2.5:cloud", count: 1 }), ]), ); diff --git a/extensions/qa-lab/src/live-timeout.test.ts b/extensions/qa-lab/src/live-timeout.test.ts index 67d62e75fcb..c696a9c9e54 100644 --- a/extensions/qa-lab/src/live-timeout.test.ts +++ b/extensions/qa-lab/src/live-timeout.test.ts @@ -20,8 +20,8 @@ describe("qa live timeout policy", () => { resolveQaLiveTurnTimeoutMs( { providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5", }, 30_000, ), diff --git a/extensions/qa-lab/src/live-transports/shared/live-gateway.runtime.test.ts b/extensions/qa-lab/src/live-transports/shared/live-gateway.runtime.test.ts index 64e100220cd..22d87fefa58 100644 --- a/extensions/qa-lab/src/live-transports/shared/live-gateway.runtime.test.ts +++ b/extensions/qa-lab/src/live-transports/shared/live-gateway.runtime.test.ts @@ -75,8 +75,8 @@ describe("startQaLiveLaneGateway", () => { transport: createStubTransport(), transportBaseUrl: "http://127.0.0.1:43123", providerMode: "mock-openai", - primaryModel: "mock-openai/gpt-5.4", - alternateModel: "mock-openai/gpt-5.4-alt", + primaryModel: "mock-openai/gpt-5.5", + alternateModel: "mock-openai/gpt-5.5-alt", controlUiEnabled: false, }); @@ -100,8 +100,8 @@ describe("startQaLiveLaneGateway", () => { transport: createStubTransport(), transportBaseUrl: "http://127.0.0.1:43123", providerMode: "mock-openai", - primaryModel: "mock-openai/gpt-5.4", - alternateModel: "mock-openai/gpt-5.4-alt", + primaryModel: "mock-openai/gpt-5.5", + alternateModel: "mock-openai/gpt-5.5-alt", controlUiEnabled: false, }); @@ -116,8 +116,8 @@ describe("startQaLiveLaneGateway", () => { transport: createStubTransport(), transportBaseUrl: "http://127.0.0.1:43123", providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5", controlUiEnabled: false, }); @@ -141,8 +141,8 @@ describe("startQaLiveLaneGateway", () => { transport: createStubTransport(), transportBaseUrl: "http://127.0.0.1:43123", providerMode: "mock-openai", - primaryModel: "mock-openai/gpt-5.4", - alternateModel: "mock-openai/gpt-5.4-alt", + primaryModel: "mock-openai/gpt-5.5", + alternateModel: "mock-openai/gpt-5.5-alt", controlUiEnabled: false, }); @@ -161,8 +161,8 @@ describe("startQaLiveLaneGateway", () => { transport: createStubTransport(), transportBaseUrl: "http://127.0.0.1:43123", providerMode: "mock-openai", - primaryModel: "mock-openai/gpt-5.4", - alternateModel: "mock-openai/gpt-5.4-alt", + primaryModel: "mock-openai/gpt-5.5", + alternateModel: "mock-openai/gpt-5.5-alt", controlUiEnabled: false, }); diff --git a/extensions/qa-lab/src/manual-lane.runtime.test.ts b/extensions/qa-lab/src/manual-lane.runtime.test.ts index 65d5b33096a..10931602698 100644 --- a/extensions/qa-lab/src/manual-lane.runtime.test.ts +++ b/extensions/qa-lab/src/manual-lane.runtime.test.ts @@ -82,8 +82,8 @@ describe("runQaManualLane", () => { const result = await runQaManualLane({ repoRoot: "/tmp/openclaw-repo", providerMode: "mock-openai", - primaryModel: "mock-openai/gpt-5.4", - alternateModel: "mock-openai/gpt-5.4-alt", + primaryModel: "mock-openai/gpt-5.5", + alternateModel: "mock-openai/gpt-5.5-alt", message: "check the kickoff file", timeoutMs: 5_000, replySettleMs: 0, @@ -111,8 +111,8 @@ describe("runQaManualLane", () => { const result = await runQaManualLane({ repoRoot: "/tmp/openclaw-repo", providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5", message: "check the kickoff file", timeoutMs: 5_000, replySettleMs: 0, diff --git a/extensions/qa-lab/src/model-catalog.runtime.test.ts b/extensions/qa-lab/src/model-catalog.runtime.test.ts index 0095e477ba6..5e8b19b01aa 100644 --- a/extensions/qa-lab/src/model-catalog.runtime.test.ts +++ b/extensions/qa-lab/src/model-catalog.runtime.test.ts @@ -2,7 +2,7 @@ import { describe, expect, it } from "vitest"; import { selectQaRunnerModelOptions } from "./model-catalog.runtime.js"; describe("qa runner model catalog", () => { - it("filters to available rows and prefers gpt-5.4 first", () => { + it("filters to available rows and prefers gpt-5.5 first", () => { expect( selectQaRunnerModelOptions([ { @@ -13,8 +13,8 @@ describe("qa runner model catalog", () => { missing: false, }, { - key: "openai/gpt-5.4", - name: "gpt-5.4", + key: "openai/gpt-5.5", + name: "gpt-5.5", input: "text,image", available: true, missing: false, @@ -27,6 +27,6 @@ describe("qa runner model catalog", () => { missing: false, }, ]).map((entry) => entry.key), - ).toEqual(["openai/gpt-5.4", "anthropic/claude-sonnet-4-6"]); + ).toEqual(["openai/gpt-5.5", "anthropic/claude-sonnet-4-6"]); }); }); diff --git a/extensions/qa-lab/src/model-selection.runtime.test.ts b/extensions/qa-lab/src/model-selection.runtime.test.ts index 409ca3ba4c2..5e781d48dce 100644 --- a/extensions/qa-lab/src/model-selection.runtime.test.ts +++ b/extensions/qa-lab/src/model-selection.runtime.test.ts @@ -34,7 +34,7 @@ describe("qa model selection runtime", () => { resolveEnvApiKey.mockReturnValue({ apiKey: "sk-test" }); expect(resolveQaPreferredLiveModel()).toBeUndefined(); - expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.4"); + expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.5"); expect(loadAuthProfileStoreForRuntime).not.toHaveBeenCalled(); }); @@ -43,8 +43,8 @@ describe("qa model selection runtime", () => { provider === "openai-codex" ? ["openai-codex:user@example.com"] : [], ); - expect(resolveQaPreferredLiveModel()).toBe("openai/gpt-5.4"); - expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.4"); + expect(resolveQaPreferredLiveModel()).toBe("openai/gpt-5.5"); + expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.5"); }); it("keeps the OpenAI live default when stored OpenAI profiles are available", () => { @@ -53,7 +53,7 @@ describe("qa model selection runtime", () => { ); expect(resolveQaPreferredLiveModel()).toBeUndefined(); - expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.4"); + expect(defaultQaRuntimeModelForMode("live-frontier")).toBe("openai/gpt-5.5"); }); it("leaves mock defaults unchanged", () => { @@ -61,11 +61,11 @@ describe("qa model selection runtime", () => { provider === "openai-codex" ? ["openai-codex:user@example.com"] : [], ); - expect(defaultQaRuntimeModelForMode("mock-openai")).toBe("mock-openai/gpt-5.4"); + expect(defaultQaRuntimeModelForMode("mock-openai")).toBe("mock-openai/gpt-5.5"); expect(defaultQaRuntimeModelForMode("mock-openai", { alternate: true })).toBe( - "mock-openai/gpt-5.4-alt", + "mock-openai/gpt-5.5-alt", ); - expect(defaultQaRuntimeModelForMode("aimock")).toBe("aimock/gpt-5.4"); - expect(defaultQaRuntimeModelForMode("aimock", { alternate: true })).toBe("aimock/gpt-5.4-alt"); + expect(defaultQaRuntimeModelForMode("aimock")).toBe("aimock/gpt-5.5"); + expect(defaultQaRuntimeModelForMode("aimock", { alternate: true })).toBe("aimock/gpt-5.5-alt"); }); }); diff --git a/extensions/qa-lab/src/multipass.runtime.test.ts b/extensions/qa-lab/src/multipass.runtime.test.ts index eacf9248f9f..09658de0896 100644 --- a/extensions/qa-lab/src/multipass.runtime.test.ts +++ b/extensions/qa-lab/src/multipass.runtime.test.ts @@ -115,8 +115,8 @@ describe("qa multipass runtime", () => { repoRoot: process.cwd(), outputDir: path.join(process.cwd(), ".artifacts", "qa-e2e", "multipass-live-test"), providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5", fastMode: true, scenarioIds: ["channel-chat-baseline"], }); @@ -128,9 +128,9 @@ describe("qa multipass runtime", () => { "--provider-mode", "live-frontier", "--model", - "openai/gpt-5.4", + "openai/gpt-5.5", "--alt-model", - "openai/gpt-5.4", + "openai/gpt-5.5", "--fast", ]), ); diff --git a/extensions/qa-lab/src/providers/aimock/server.test.ts b/extensions/qa-lab/src/providers/aimock/server.test.ts index 57e181c13f1..b14601586b1 100644 --- a/extensions/qa-lab/src/providers/aimock/server.test.ts +++ b/extensions/qa-lab/src/providers/aimock/server.test.ts @@ -24,7 +24,7 @@ describe("qa aimock server", () => { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ - model: "aimock/gpt-5.4", + model: "aimock/gpt-5.5", stream: false, input: [makeResponsesInput("hello aimock")], }), @@ -32,7 +32,7 @@ describe("qa aimock server", () => { expect(response.status).toBe(200); expect(await response.json()).toMatchObject({ status: "completed", - model: "aimock/gpt-5.4", + model: "aimock/gpt-5.5", }); const debug = await fetch(`${server.baseUrl}/debug/last-request`); @@ -40,7 +40,7 @@ describe("qa aimock server", () => { expect(await debug.json()).toMatchObject({ prompt: "hello aimock", allInputText: "hello aimock", - model: "aimock/gpt-5.4", + model: "aimock/gpt-5.5", providerVariant: "openai", }); } finally { @@ -58,7 +58,7 @@ describe("qa aimock server", () => { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ - model: "aimock/gpt-5.4", + model: "aimock/gpt-5.5", stream: false, input: [makeResponsesInput("@openclaw explain the QA lab")], }), @@ -90,7 +90,7 @@ describe("qa aimock server", () => { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ - model: "openai-codex/gpt-5.4", + model: "openai-codex/gpt-5.5", stream: false, input: [makeResponsesInput("hello codex-compatible aimock")], }), @@ -100,7 +100,7 @@ describe("qa aimock server", () => { const debug = await fetch(`${server.baseUrl}/debug/last-request`); expect(debug.status).toBe(200); expect(await debug.json()).toMatchObject({ - model: "openai-codex/gpt-5.4", + model: "openai-codex/gpt-5.5", providerVariant: "openai", }); } finally { diff --git a/extensions/qa-lab/src/providers/live-frontier/catalog.ts b/extensions/qa-lab/src/providers/live-frontier/catalog.ts index e0b88519c56..f31f6a9e854 100644 --- a/extensions/qa-lab/src/providers/live-frontier/catalog.ts +++ b/extensions/qa-lab/src/providers/live-frontier/catalog.ts @@ -1,5 +1,5 @@ export const QA_FRONTIER_PROVIDER_IDS = ["anthropic", "google", "openai"] as const; -export const QA_FRONTIER_CATALOG_PRIMARY_MODEL = "openai/gpt-5.4"; +export const QA_FRONTIER_CATALOG_PRIMARY_MODEL = "openai/gpt-5.5"; export const QA_FRONTIER_CATALOG_ALTERNATE_MODEL = "anthropic/claude-sonnet-4-6"; export function isPreferredQaLiveFrontierCatalogModel(modelRef: string) { diff --git a/extensions/qa-lab/src/providers/live-frontier/character-eval.ts b/extensions/qa-lab/src/providers/live-frontier/character-eval.ts index b3300ac7e95..5019ff5b8dc 100644 --- a/extensions/qa-lab/src/providers/live-frontier/character-eval.ts +++ b/extensions/qa-lab/src/providers/live-frontier/character-eval.ts @@ -6,7 +6,7 @@ type QaFrontierCharacterModelOptions = { }; export const QA_FRONTIER_CHARACTER_EVAL_MODELS = Object.freeze([ - "openai/gpt-5.4", + "openai/gpt-5.5", "openai/gpt-5.2", "openai/gpt-5", "anthropic/claude-opus-4-6", @@ -18,19 +18,19 @@ export const QA_FRONTIER_CHARACTER_EVAL_MODELS = Object.freeze([ export const QA_FRONTIER_CHARACTER_THINKING_BY_MODEL: Readonly> = Object.freeze({ - "openai/gpt-5.4": "medium", + "openai/gpt-5.5": "medium", "openai/gpt-5.2": "xhigh", "openai/gpt-5": "xhigh", }); export const QA_FRONTIER_CHARACTER_JUDGE_MODELS = Object.freeze([ - "openai/gpt-5.4", + "openai/gpt-5.5", "anthropic/claude-opus-4-6", ]); export const QA_FRONTIER_CHARACTER_JUDGE_MODEL_OPTIONS: Readonly< Record > = Object.freeze({ - "openai/gpt-5.4": { thinkingDefault: "xhigh", fastMode: true }, + "openai/gpt-5.5": { thinkingDefault: "xhigh", fastMode: true }, "anthropic/claude-opus-4-6": { thinkingDefault: "high" }, }); diff --git a/extensions/qa-lab/src/providers/live-frontier/index.ts b/extensions/qa-lab/src/providers/live-frontier/index.ts index ac54f7ea368..0613f7f96c6 100644 --- a/extensions/qa-lab/src/providers/live-frontier/index.ts +++ b/extensions/qa-lab/src/providers/live-frontier/index.ts @@ -23,7 +23,7 @@ function isClaudeOpusModel(modelRef: string) { export const liveFrontierProviderDefinition: QaProviderDefinition = { mode: "live-frontier", kind: "live", - defaultModel: (options) => options?.preferredLiveModel ?? "openai/gpt-5.4", + defaultModel: (options) => options?.preferredLiveModel ?? "openai/gpt-5.5", defaultImageGenerationProviderIds: ["openai"], defaultImageGenerationModel: ({ modelProviderIds }) => modelProviderIds.includes("openai") ? "openai/gpt-image-1" : null, diff --git a/extensions/qa-lab/src/providers/live-frontier/model-selection.runtime.ts b/extensions/qa-lab/src/providers/live-frontier/model-selection.runtime.ts index cae4f1978d6..00c79573d73 100644 --- a/extensions/qa-lab/src/providers/live-frontier/model-selection.runtime.ts +++ b/extensions/qa-lab/src/providers/live-frontier/model-selection.runtime.ts @@ -4,7 +4,7 @@ import { } from "openclaw/plugin-sdk/agent-runtime"; import { resolveEnvApiKey } from "openclaw/plugin-sdk/provider-auth"; -const QA_CODEX_OAUTH_LIVE_MODEL = "openai/gpt-5.4"; +const QA_CODEX_OAUTH_LIVE_MODEL = "openai/gpt-5.5"; export function resolveQaLiveFrontierPreferredModel() { if (resolveEnvApiKey("openai")?.apiKey) { diff --git a/extensions/qa-lab/src/providers/live-frontier/parity.ts b/extensions/qa-lab/src/providers/live-frontier/parity.ts index 887d691f33b..62bcd5556ce 100644 --- a/extensions/qa-lab/src/providers/live-frontier/parity.ts +++ b/extensions/qa-lab/src/providers/live-frontier/parity.ts @@ -1,2 +1,2 @@ -export const QA_FRONTIER_PARITY_CANDIDATE_LABEL = "openai/gpt-5.4"; +export const QA_FRONTIER_PARITY_CANDIDATE_LABEL = "openai/gpt-5.5"; export const QA_FRONTIER_PARITY_BASELINE_LABEL = "anthropic/claude-opus-4-6"; diff --git a/extensions/qa-lab/src/providers/mock-openai/server.test.ts b/extensions/qa-lab/src/providers/mock-openai/server.test.ts index af903846cc6..f5fe5998105 100644 --- a/extensions/qa-lab/src/providers/mock-openai/server.test.ts +++ b/extensions/qa-lab/src/providers/mock-openai/server.test.ts @@ -130,7 +130,7 @@ describe("qa mock openai server", () => { }, body: JSON.stringify({ stream: false, - model: "gpt-5.4", + model: "gpt-5.5", input: [ makeUserInput( "Before acting, tell me the single file you would start with in six words or fewer. Do not use tools yet.", @@ -159,7 +159,7 @@ describe("qa mock openai server", () => { }, body: JSON.stringify({ stream: true, - model: "gpt-5.4", + model: "gpt-5.5", input: [ makeUserInput( "Before acting, tell me the single file you would start with in six words or fewer. Do not use tools yet.", @@ -178,7 +178,7 @@ describe("qa mock openai server", () => { const debugResponse = await fetch(`${server.baseUrl}/debug/last-request`); expect(debugResponse.status).toBe(200); expect(await debugResponse.json()).toMatchObject({ - model: "gpt-5.4", + model: "gpt-5.5", prompt: "ok do it. read `QA_KICKOFF_TASK.md` now and reply with the QA mission in one short sentence.", allInputText: expect.stringContaining("ok do it."), @@ -285,7 +285,7 @@ describe("qa mock openai server", () => { }, body: JSON.stringify({ stream: true, - model: "gpt-5.4", + model: "gpt-5.5", input: [ { role: "user", @@ -312,7 +312,7 @@ describe("qa mock openai server", () => { }, body: JSON.stringify({ stream: false, - model: "gpt-5.4-alt", + model: "gpt-5.5-alt", input: [ { role: "user", @@ -344,8 +344,8 @@ describe("qa mock openai server", () => { const requests = await fetch(`${server.baseUrl}/debug/requests`); expect(requests.status).toBe(200); expect((await requests.json()) as Array<{ model?: string }>).toMatchObject([ - { model: "gpt-5.4" }, - { model: "gpt-5.4-alt" }, + { model: "gpt-5.5" }, + { model: "gpt-5.5-alt" }, ]); }); @@ -365,7 +365,7 @@ describe("qa mock openai server", () => { }, body: JSON.stringify({ stream: true, - model: "gpt-5.4", + model: "gpt-5.5", input: [ { role: "user", @@ -402,7 +402,7 @@ describe("qa mock openai server", () => { headers: { "content-type": "application/json" }, body: JSON.stringify({ stream: true, - model: "gpt-5.4", + model: "gpt-5.5", input: [{ role: "user", content: [{ type: "input_text", text: prompt }] }], }), }); @@ -414,7 +414,7 @@ describe("qa mock openai server", () => { headers: { "content-type": "application/json" }, body: JSON.stringify({ stream: true, - model: "gpt-5.4", + model: "gpt-5.5", input: [ { role: "user", content: [{ type: "input_text", text: prompt }] }, { @@ -433,7 +433,7 @@ describe("qa mock openai server", () => { headers: { "content-type": "application/json" }, body: JSON.stringify({ stream: true, - model: "gpt-5.4", + model: "gpt-5.5", input: [ { role: "user", content: [{ type: "input_text", text: prompt }] }, { @@ -451,7 +451,7 @@ describe("qa mock openai server", () => { headers: { "content-type": "application/json" }, body: JSON.stringify({ stream: true, - model: "gpt-5.4", + model: "gpt-5.5", input: [ { role: "user", content: [{ type: "input_text", text: prompt }] }, { @@ -472,7 +472,7 @@ describe("qa mock openai server", () => { headers: { "content-type": "application/json" }, body: JSON.stringify({ stream: false, - model: "gpt-5.4", + model: "gpt-5.5", input: [ { role: "user", content: [{ type: "input_text", text: prompt }] }, { @@ -508,7 +508,7 @@ describe("qa mock openai server", () => { }, body: JSON.stringify({ stream: true, - model: "gpt-5.4", + model: "gpt-5.5", input: [ { role: "user", @@ -538,7 +538,7 @@ describe("qa mock openai server", () => { }, body: JSON.stringify({ stream: false, - model: "gpt-5.4", + model: "gpt-5.5", input: [ { role: "user", @@ -1407,7 +1407,7 @@ describe("qa mock openai server", () => { headers: { "content-type": "application/json" }, body: JSON.stringify({ stream: false, - model: "mock-openai/gpt-5.4", + model: "mock-openai/gpt-5.5", input: [ { role: "user", @@ -1457,7 +1457,7 @@ describe("qa mock openai server", () => { headers: { "content-type": "application/json" }, body: JSON.stringify({ stream: false, - model: "mock-openai/gpt-5.4", + model: "mock-openai/gpt-5.5", input: [ { role: "user", @@ -1544,7 +1544,7 @@ describe("qa mock openai server", () => { }, body: JSON.stringify({ stream: false, - model: "gpt-5.4-alt", + model: "gpt-5.5-alt", input: [ { role: "user", @@ -1630,7 +1630,7 @@ describe("qa mock openai server", () => { const body = (await response.json()) as { data: Array<{ id: string }> }; const ids = body.data.map((entry) => entry.id); expect(ids).toContain("claude-opus-4-6"); - expect(ids).toContain("gpt-5.4"); + expect(ids).toContain("gpt-5.5"); }); it("dispatches an Anthropic /v1/messages read tool call for source discovery prompts", async () => { @@ -2160,7 +2160,7 @@ describe("qa mock openai server", () => { const toolPlan = await expectResponsesText(server, { stream: true, - model: "gpt-5.4", + model: "gpt-5.5", input: [makeUserInput(QA_REASONING_ONLY_RECOVERY_PROMPT)], }); expect(toolPlan).toContain('"name":"read"'); @@ -2171,7 +2171,7 @@ describe("qa mock openai server", () => { output?: Array<{ type?: string; id?: string; summary?: Array<{ text?: string }> }>; }>(server, { stream: false, - model: "gpt-5.4", + model: "gpt-5.5", input: [ makeUserInput(QA_REASONING_ONLY_RECOVERY_PROMPT), { @@ -2195,7 +2195,7 @@ describe("qa mock openai server", () => { output?: Array<{ content?: Array<{ text?: string }> }>; }>(server, { stream: false, - model: "gpt-5.4", + model: "gpt-5.5", input: [ makeUserInput(QA_REASONING_ONLY_RECOVERY_PROMPT), makeUserInput(QA_REASONING_ONLY_RETRY_INSTRUCTION), @@ -2222,7 +2222,7 @@ describe("qa mock openai server", () => { ]); }); - it("scripts the GPT-5.4 thinking visibility switch prompts", async () => { + it("scripts the GPT-5.5 thinking visibility switch prompts", async () => { const server = await startMockServer(); expect( @@ -2230,7 +2230,7 @@ describe("qa mock openai server", () => { output?: Array<{ type?: string; content?: Array<{ text?: string }> }>; }>(server, { stream: false, - model: "gpt-5.4", + model: "gpt-5.5", input: [makeUserInput(QA_THINKING_VISIBILITY_OFF_PROMPT)], }), ).toMatchObject({ @@ -2252,7 +2252,7 @@ describe("qa mock openai server", () => { }>; }>(server, { stream: false, - model: "gpt-5.4", + model: "gpt-5.5", input: [makeUserInput(QA_THINKING_VISIBILITY_MAX_PROMPT)], }), ).toMatchObject({ @@ -2275,7 +2275,7 @@ describe("qa mock openai server", () => { const toolPlan = await expectResponsesText(server, { stream: true, - model: "gpt-5.4", + model: "gpt-5.5", input: [makeUserInput(QA_REASONING_ONLY_SIDE_EFFECT_PROMPT)], }); expect(toolPlan).toContain('"name":"write"'); @@ -2286,7 +2286,7 @@ describe("qa mock openai server", () => { output?: Array<{ type?: string; id?: string }>; }>(server, { stream: false, - model: "gpt-5.4", + model: "gpt-5.5", input: [ makeUserInput(QA_REASONING_ONLY_SIDE_EFFECT_PROMPT), { @@ -2309,7 +2309,7 @@ describe("qa mock openai server", () => { const toolPlan = await expectResponsesText(server, { stream: true, - model: "gpt-5.4", + model: "gpt-5.5", input: [makeUserInput(QA_EMPTY_RESPONSE_RECOVERY_PROMPT)], }); expect(toolPlan).toContain('"name":"read"'); @@ -2319,7 +2319,7 @@ describe("qa mock openai server", () => { output?: Array<{ content?: Array<{ type?: string; text?: string }> }>; }>(server, { stream: false, - model: "gpt-5.4", + model: "gpt-5.5", input: [ makeUserInput(QA_EMPTY_RESPONSE_RECOVERY_PROMPT), { @@ -2341,7 +2341,7 @@ describe("qa mock openai server", () => { output?: Array<{ content?: Array<{ text?: string }> }>; }>(server, { stream: false, - model: "gpt-5.4", + model: "gpt-5.5", input: [ makeUserInput(QA_EMPTY_RESPONSE_RECOVERY_PROMPT), makeUserInput(QA_EMPTY_RESPONSE_RETRY_INSTRUCTION), @@ -2365,7 +2365,7 @@ describe("qa mock openai server", () => { await expectResponsesText(server, { stream: true, - model: "gpt-5.4", + model: "gpt-5.5", input: [makeUserInput(QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT)], }); @@ -2373,7 +2373,7 @@ describe("qa mock openai server", () => { output?: Array<{ content?: Array<{ text?: string }> }>; }>(server, { stream: false, - model: "gpt-5.4", + model: "gpt-5.5", input: [ makeUserInput(QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT), { @@ -2388,7 +2388,7 @@ describe("qa mock openai server", () => { output?: Array<{ content?: Array<{ text?: string }> }>; }>(server, { stream: false, - model: "gpt-5.4", + model: "gpt-5.5", input: [ makeUserInput(QA_EMPTY_RESPONSE_EXHAUSTION_PROMPT), makeUserInput(QA_EMPTY_RESPONSE_RETRY_INSTRUCTION), @@ -2404,9 +2404,9 @@ describe("qa mock openai server", () => { describe("resolveProviderVariant", () => { it("tags prefix-qualified openai models", () => { - expect(resolveProviderVariant("openai/gpt-5.4")).toBe("openai"); - expect(resolveProviderVariant("openai:gpt-5.4")).toBe("openai"); - expect(resolveProviderVariant("openai-codex/gpt-5.4")).toBe("openai"); + expect(resolveProviderVariant("openai/gpt-5.5")).toBe("openai"); + expect(resolveProviderVariant("openai:gpt-5.5")).toBe("openai"); + expect(resolveProviderVariant("openai-codex/gpt-5.5")).toBe("openai"); }); it("tags prefix-qualified anthropic models", () => { @@ -2416,8 +2416,8 @@ describe("resolveProviderVariant", () => { }); it("tags bare model names by prefix", () => { - expect(resolveProviderVariant("gpt-5.4")).toBe("openai"); - expect(resolveProviderVariant("gpt-5.4-alt")).toBe("openai"); + expect(resolveProviderVariant("gpt-5.5")).toBe("openai"); + expect(resolveProviderVariant("gpt-5.5-alt")).toBe("openai"); expect(resolveProviderVariant("gpt-4.5")).toBe("openai"); expect(resolveProviderVariant("o1-preview")).toBe("openai"); expect(resolveProviderVariant("claude-opus-4-6")).toBe("anthropic"); @@ -2425,7 +2425,7 @@ describe("resolveProviderVariant", () => { }); it("handles case drift and whitespace", () => { - expect(resolveProviderVariant(" OpenAI/GPT-5.4 ")).toBe("openai"); + expect(resolveProviderVariant(" OpenAI/GPT-5.5 ")).toBe("openai"); expect(resolveProviderVariant("ANTHROPIC/CLAUDE-OPUS-4-6")).toBe("anthropic"); }); @@ -2451,7 +2451,7 @@ describe("qa mock openai server provider variant tagging", () => { method: "POST", headers: { "content-type": "application/json" }, body: JSON.stringify({ - model: "openai/gpt-5.4", + model: "openai/gpt-5.5", stream: false, input: [{ role: "user", content: [{ type: "input_text", text: "Heartbeat check" }] }], }), @@ -2461,7 +2461,7 @@ describe("qa mock openai server provider variant tagging", () => { model: string; providerVariant: string; }; - expect(debug.model).toBe("openai/gpt-5.4"); + expect(debug.model).toBe("openai/gpt-5.5"); expect(debug.providerVariant).toBe("openai"); }); diff --git a/extensions/qa-lab/src/providers/mock-openai/server.ts b/extensions/qa-lab/src/providers/mock-openai/server.ts index bbec49c02b2..f7dfa6c69fa 100644 --- a/extensions/qa-lab/src/providers/mock-openai/server.ts +++ b/extensions/qa-lab/src/providers/mock-openai/server.ts @@ -53,7 +53,7 @@ type StreamEvent = * - Everything else (including empty strings) → `"unknown"` * * The `/v1/messages` route always feeds `body.model` straight through, - * so an Anthropic request with an `openai/gpt-5.4` model string is still + * so an Anthropic request with an `openai/gpt-5.5` model string is still * classified as `"openai"`. That matches the parity program's convention * where the provider label is the source of truth, not the HTTP route. */ @@ -78,7 +78,7 @@ export function resolveProviderVariant(model: string | undefined): MockOpenAiPro return "anthropic"; } // Fall back to model-name prefix matching for bare model strings like - // `gpt-5.4` or `claude-opus-4-6`. + // `gpt-5.5` or `claude-opus-4-6`. if (/^(?:gpt-|o1-|openai-)/.test(trimmed)) { return "openai"; } @@ -1537,7 +1537,7 @@ async function buildResponsesPayload( // --------------------------------------------------------------------------- // // The QA parity gate needs two comparable scenario runs: one against the -// "candidate" (openai/gpt-5.4) and one against the "baseline" +// "candidate" (openai/gpt-5.5) and one against the "baseline" // (anthropic/claude-opus-4-6). The OpenAI mock above already dispatches all // the scenario prompt branches we care about. Rather than duplicating that // machinery, the /v1/messages route below translates Anthropic request @@ -1926,8 +1926,8 @@ export async function startQaMockOpenAiServer(params?: { host?: string; port?: n if (req.method === "GET" && url.pathname === "/v1/models") { writeJson(res, 200, { data: [ - { id: "gpt-5.4", object: "model" }, - { id: "gpt-5.4-alt", object: "model" }, + { id: "gpt-5.5", object: "model" }, + { id: "gpt-5.5-alt", object: "model" }, { id: "gpt-image-1", object: "model" }, { id: "text-embedding-3-small", object: "model" }, { id: "claude-opus-4-6", object: "model" }, diff --git a/extensions/qa-lab/src/providers/shared/mock-model-config.ts b/extensions/qa-lab/src/providers/shared/mock-model-config.ts index 8a3521ec09f..5d5d644d9ee 100644 --- a/extensions/qa-lab/src/providers/shared/mock-model-config.ts +++ b/extensions/qa-lab/src/providers/shared/mock-model-config.ts @@ -28,8 +28,8 @@ export function createMockOpenAiResponsesProvider(baseUrl: string): ModelProvide }, models: [ { - id: "gpt-5.4", - name: "gpt-5.4", + id: "gpt-5.5", + name: "gpt-5.5", api: "openai-responses", reasoning: false, input: ["text", "image"], @@ -38,8 +38,8 @@ export function createMockOpenAiResponsesProvider(baseUrl: string): ModelProvide maxTokens: 4096, }, { - id: "gpt-5.4-alt", - name: "gpt-5.4-alt", + id: "gpt-5.5-alt", + name: "gpt-5.5-alt", api: "openai-responses", reasoning: false, input: ["text", "image"], diff --git a/extensions/qa-lab/src/providers/shared/mock-provider-definition.ts b/extensions/qa-lab/src/providers/shared/mock-provider-definition.ts index 54f45deb1dc..0ad069f2505 100644 --- a/extensions/qa-lab/src/providers/shared/mock-provider-definition.ts +++ b/extensions/qa-lab/src/providers/shared/mock-provider-definition.ts @@ -10,7 +10,7 @@ export type MockQaProviderDefinitionParams = { }; function mockModelRef(providerId: string, alternate?: boolean) { - return `${providerId}/${alternate ? "gpt-5.4-alt" : "gpt-5.4"}`; + return `${providerId}/${alternate ? "gpt-5.5-alt" : "gpt-5.5"}`; } export function createMockQaProviderDefinition( diff --git a/extensions/qa-lab/src/qa-gateway-config.test.ts b/extensions/qa-lab/src/qa-gateway-config.test.ts index 0f6e502535f..ff0b2b96872 100644 --- a/extensions/qa-lab/src/qa-gateway-config.test.ts +++ b/extensions/qa-lab/src/qa-gateway-config.test.ts @@ -51,7 +51,7 @@ describe("buildQaGatewayConfig", () => { ...createQaChannelTransportParams(), }); - expect(getPrimaryModel(cfg.agents?.defaults?.model)).toBe("mock-openai/gpt-5.4"); + expect(getPrimaryModel(cfg.agents?.defaults?.model)).toBe("mock-openai/gpt-5.5"); expect(cfg.models?.providers?.["mock-openai"]?.baseUrl).toBe("http://127.0.0.1:44080/v1"); expect(cfg.models?.providers?.["mock-openai"]?.request).toEqual({ allowPrivateNetwork: true }); expect(cfg.models?.providers?.openai?.baseUrl).toBe("http://127.0.0.1:44080/v1"); @@ -88,14 +88,14 @@ describe("buildQaGatewayConfig", () => { providerBaseUrl: "http://127.0.0.1:44080/v1", workspaceDir: "/tmp/qa-workspace", providerMode: "mock-openai", - primaryModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", alternateModel: "anthropic/claude-opus-4-6", }); - expect(getPrimaryModel(cfg.agents?.defaults?.model)).toBe("openai/gpt-5.4"); + expect(getPrimaryModel(cfg.agents?.defaults?.model)).toBe("openai/gpt-5.5"); expect(cfg.models?.providers?.openai?.api).toBe("openai-responses"); expect(cfg.models?.providers?.openai?.request).toEqual({ allowPrivateNetwork: true }); - expect(cfg.models?.providers?.openai?.models.map((model) => model.id)).toContain("gpt-5.4"); + expect(cfg.models?.providers?.openai?.models.map((model) => model.id)).toContain("gpt-5.5"); expect(cfg.models?.providers?.anthropic?.api).toBe("anthropic-messages"); expect(cfg.models?.providers?.anthropic?.baseUrl).toBe("http://127.0.0.1:44080"); expect(cfg.models?.providers?.anthropic?.request).toEqual({ allowPrivateNetwork: true }); @@ -113,11 +113,11 @@ describe("buildQaGatewayConfig", () => { providerBaseUrl: "http://127.0.0.1:45080/v1", workspaceDir: "/tmp/qa-workspace", providerMode: "aimock", - primaryModel: "aimock/gpt-5.4", - alternateModel: "aimock/gpt-5.4-alt", + primaryModel: "aimock/gpt-5.5", + alternateModel: "aimock/gpt-5.5-alt", }); - expect(getPrimaryModel(cfg.agents?.defaults?.model)).toBe("aimock/gpt-5.4"); + expect(getPrimaryModel(cfg.agents?.defaults?.model)).toBe("aimock/gpt-5.5"); expect(cfg.agents?.defaults?.imageGenerationModel).toEqual({ primary: "aimock/gpt-image-1", }); @@ -167,17 +167,17 @@ describe("buildQaGatewayConfig", () => { workspaceDir: "/tmp/qa-workspace", providerMode: "live-frontier", fastMode: true, - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5", ...createQaChannelTransportParams(), }); - expect(getPrimaryModel(cfg.agents?.defaults?.model)).toBe("openai/gpt-5.4"); - expect(getPrimaryModel(cfg.agents?.list?.[0]?.model)).toBe("openai/gpt-5.4"); + expect(getPrimaryModel(cfg.agents?.defaults?.model)).toBe("openai/gpt-5.5"); + expect(getPrimaryModel(cfg.agents?.list?.[0]?.model)).toBe("openai/gpt-5.5"); expect(cfg.models).toBeUndefined(); expect(cfg.plugins?.allow).toEqual(["acpx", "memory-core", "openai", "qa-channel"]); expect(cfg.plugins?.entries?.openai).toEqual({ enabled: true }); - expect(cfg.agents?.defaults?.models?.["openai/gpt-5.4"]).toEqual({ + expect(cfg.agents?.defaults?.models?.["openai/gpt-5.5"]).toEqual({ params: { transport: "sse", openaiWsWarmup: false, fastMode: true }, }); }); @@ -273,14 +273,14 @@ describe("buildQaGatewayConfig", () => { gatewayToken: "token", workspaceDir: "/tmp/qa-workspace", providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5", thinkingDefault: "xhigh", ...createQaChannelTransportParams(), }); expect(cfg.agents?.defaults?.thinkingDefault).toBe("xhigh"); - expect(cfg.agents?.defaults?.models?.["openai/gpt-5.4"]?.params).toMatchObject({ + expect(cfg.agents?.defaults?.models?.["openai/gpt-5.5"]?.params).toMatchObject({ thinking: "xhigh", }); }); diff --git a/extensions/qa-lab/src/run-config.test.ts b/extensions/qa-lab/src/run-config.test.ts index 77d3ead5484..d1414f3caf0 100644 --- a/extensions/qa-lab/src/run-config.test.ts +++ b/extensions/qa-lab/src/run-config.test.ts @@ -45,8 +45,8 @@ describe("qa run config", () => { it("creates a live-by-default selection that arms every scenario", () => { expect(createDefaultQaRunSelection(scenarios)).toEqual({ providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5", fastMode: true, scenarioIds: ["dm-chat-baseline", "thread-lifecycle"], }); @@ -57,7 +57,7 @@ describe("qa run config", () => { normalizeQaRunSelection( { providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", alternateModel: "", fastMode: false, scenarioIds: ["thread-lifecycle", "missing", "thread-lifecycle"], @@ -66,8 +66,8 @@ describe("qa run config", () => { ), ).toEqual({ providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5", fastMode: true, scenarioIds: ["thread-lifecycle"], }); @@ -99,13 +99,13 @@ describe("qa run config", () => { }); it("keeps idle snapshots on static defaults so startup does not inspect auth profiles", () => { - defaultQaRuntimeModelForMode.mockReturnValue("openai/gpt-5.4"); + defaultQaRuntimeModelForMode.mockReturnValue("openai/gpt-5.5"); defaultQaRuntimeModelForMode.mockClear(); expect(createIdleQaRunnerSnapshot(scenarios).selection).toMatchObject({ providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5", }); expect(defaultQaRuntimeModelForMode).not.toHaveBeenCalled(); }); @@ -123,8 +123,8 @@ describe("qa run config", () => { ), ).toEqual({ providerMode: "aimock", - primaryModel: "aimock/gpt-5.4", - alternateModel: "aimock/gpt-5.4-alt", + primaryModel: "aimock/gpt-5.5", + alternateModel: "aimock/gpt-5.5-alt", fastMode: false, scenarioIds: ["dm-chat-baseline"], }); @@ -138,14 +138,14 @@ describe("qa run config", () => { it("prefers the Codex OAuth default when the runtime resolver says it is available", () => { defaultQaRuntimeModelForMode.mockImplementation((mode, options) => mode === "live-frontier" - ? "openai/gpt-5.4" + ? "openai/gpt-5.5" : defaultQaProviderModelForMode(mode as QaProviderModeInput, options), ); expect(createDefaultQaRunSelection(scenarios)).toEqual({ providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5", fastMode: true, scenarioIds: ["dm-chat-baseline", "thread-lifecycle"], }); diff --git a/extensions/qa-lab/src/scenario-catalog.test.ts b/extensions/qa-lab/src/scenario-catalog.test.ts index 846cf57be00..84bd87410cb 100644 --- a/extensions/qa-lab/src/scenario-catalog.test.ts +++ b/extensions/qa-lab/src/scenario-catalog.test.ts @@ -123,9 +123,9 @@ describe("qa scenario catalog", () => { ); }); - it("includes the GPT-5.4 thinking visibility switch scenario", () => { - const scenario = readQaScenarioById("gpt54-thinking-visibility-switch"); - const config = readQaScenarioExecutionConfig("gpt54-thinking-visibility-switch") as + it("includes the GPT-5.5 thinking visibility switch scenario", () => { + const scenario = readQaScenarioById("gpt55-thinking-visibility-switch"); + const config = readQaScenarioExecutionConfig("gpt55-thinking-visibility-switch") as | { requiredLiveProvider?: string; requiredLiveModel?: string; @@ -135,9 +135,9 @@ describe("qa scenario catalog", () => { } | undefined; - expect(scenario.sourcePath).toBe("qa/scenarios/models/gpt54-thinking-visibility-switch.md"); + expect(scenario.sourcePath).toBe("qa/scenarios/models/gpt55-thinking-visibility-switch.md"); expect(config?.requiredLiveProvider).toBe("openai"); - expect(config?.requiredLiveModel).toBe("gpt-5.4"); + expect(config?.requiredLiveModel).toBe("gpt-5.5"); expect(config?.offDirective).toBe("/think off"); expect(config?.maxDirective).toBe("/think medium"); expect(config?.reasoningDirective).toBe("/reasoning on"); @@ -169,10 +169,10 @@ describe("qa scenario catalog", () => { }, }); expect(config?.requiredProvider).toBe("openai"); - expect(config?.requiredModel).toBe("gpt-5.4"); + expect(config?.requiredModel).toBe("gpt-5.5"); expect(config?.expectedMarker).toBe("WEB-SEARCH-OK"); expect(scenario.execution.flow?.steps.map((step) => step.name)).toEqual([ - "confirms live OpenAI GPT-5.4 web search auto mode", + "confirms live OpenAI GPT-5.5 web search auto mode", "searches official OpenAI News through the live model", ]); }); @@ -191,7 +191,7 @@ describe("qa scenario catalog", () => { expect(scenario.sourcePath).toBe("qa/scenarios/models/thinking-slash-model-remap.md"); expect(config?.requiredProviderMode).toBe("live-frontier"); expect(config?.anthropicModelRef).toBe("anthropic/claude-sonnet-4-6"); - expect(config?.openAiXhighModelRef).toBe("openai/gpt-5.4"); + expect(config?.openAiXhighModelRef).toBe("openai/gpt-5.5"); expect(config?.noXhighModelRef).toBe("anthropic/claude-sonnet-4-6"); expect(scenario.execution.flow?.steps.map((step) => step.name)).toEqual([ "selects Anthropic and verifies adaptive options", diff --git a/extensions/qa-lab/src/suite-planning.test.ts b/extensions/qa-lab/src/suite-planning.test.ts index 5e87d99872f..753b23525ce 100644 --- a/extensions/qa-lab/src/suite-planning.test.ts +++ b/extensions/qa-lab/src/suite-planning.test.ts @@ -158,7 +158,7 @@ describe("qa suite planning helpers", () => { scenarios, scenarioIds: ["anthropic-only"], providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", }).map((scenario) => scenario.id), ).toEqual(["anthropic-only"]); }); @@ -274,7 +274,7 @@ describe("qa suite planning helpers", () => { const scenarios = [ makeQaSuiteTestScenario("generic"), makeQaSuiteTestScenario("openai-only", { - config: { requiredProvider: "openai", requiredModel: "gpt-5.4" }, + config: { requiredProvider: "openai", requiredModel: "gpt-5.5" }, }), makeQaSuiteTestScenario("anthropic-only", { config: { requiredProvider: "anthropic", requiredModel: "claude-opus-4-6" }, @@ -288,7 +288,7 @@ describe("qa suite planning helpers", () => { selectQaSuiteScenarios({ scenarios, providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", }).map((scenario) => scenario.id), ).toEqual(["generic", "openai-only"]); @@ -317,7 +317,7 @@ describe("qa suite planning helpers", () => { selectQaSuiteScenarios({ scenarios, providerMode: "mock-openai", - primaryModel: "mock-openai/gpt-5.4", + primaryModel: "mock-openai/gpt-5.5", }).map((scenario) => scenario.id), ).toEqual(["generic", "mock-only"]); @@ -325,7 +325,7 @@ describe("qa suite planning helpers", () => { selectQaSuiteScenarios({ scenarios, providerMode: "live-frontier", - primaryModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", }).map((scenario) => scenario.id), ).toEqual(["generic", "live-only"]); }); diff --git a/extensions/qa-lab/src/suite-runtime-agent-process.integration.test.ts b/extensions/qa-lab/src/suite-runtime-agent-process.integration.test.ts index f1a3d04c579..58a4ee08513 100644 --- a/extensions/qa-lab/src/suite-runtime-agent-process.integration.test.ts +++ b/extensions/qa-lab/src/suite-runtime-agent-process.integration.test.ts @@ -58,8 +58,8 @@ describe("qa suite runtime CLI integration", () => { OPENCLAW_BUNDLED_PLUGINS_DIR: bundledPluginsDir, }, }, - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5", providerMode: "mock-openai", } as never, ["memory", "status", "--json"], diff --git a/extensions/qa-lab/src/suite-runtime-agent-process.test.ts b/extensions/qa-lab/src/suite-runtime-agent-process.test.ts index 04d3cb8a869..133eb73f196 100644 --- a/extensions/qa-lab/src/suite-runtime-agent-process.test.ts +++ b/extensions/qa-lab/src/suite-runtime-agent-process.test.ts @@ -81,8 +81,8 @@ describe("qa suite runtime agent process helpers", () => { tempRoot: "/tmp/runtime", runtimeEnv: { PATH: "/usr/bin" }, }, - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4-mini", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5-mini", providerMode: "mock-openai", } as never, ["qa", "suite"], @@ -114,8 +114,8 @@ describe("qa suite runtime agent process helpers", () => { tempRoot: "/tmp/runtime", runtimeEnv: { PATH: "/usr/bin", OPENCLAW_STATE_DIR: "/tmp/default-state" }, }, - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4-mini", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5-mini", providerMode: "mock-openai", } as never, ["crestodian", "-m", "overview"], @@ -156,8 +156,8 @@ describe("qa suite runtime agent process helpers", () => { tempRoot: "/tmp/runtime", runtimeEnv: {}, }, - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4-mini", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5-mini", providerMode: "mock-openai", } as never, ["memory", "search"], @@ -182,8 +182,8 @@ describe("qa suite runtime agent process helpers", () => { tempRoot: "/tmp/runtime", runtimeEnv: {}, }, - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4-mini", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5-mini", providerMode: "mock-openai", } as never, ["memory", "search", "--json"], @@ -213,8 +213,8 @@ describe("qa suite runtime agent process helpers", () => { tempRoot: "/tmp/runtime", runtimeEnv: {}, }, - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4-mini", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5-mini", providerMode: "mock-openai", } as never, ["memory", "search", "--json"], diff --git a/extensions/qa-lab/src/suite-runtime-agent-session.test.ts b/extensions/qa-lab/src/suite-runtime-agent-session.test.ts index 5bd1af6a0c6..6e9168ba64b 100644 --- a/extensions/qa-lab/src/suite-runtime-agent-session.test.ts +++ b/extensions/qa-lab/src/suite-runtime-agent-session.test.ts @@ -17,8 +17,8 @@ describe("qa suite runtime agent session helpers", () => { const gatewayCall = vi.fn(); const env = { gateway: { call: gatewayCall }, - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4-mini", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5-mini", providerMode: "mock-openai", } as never; diff --git a/extensions/qa-lab/src/suite-runtime-flow.test.ts b/extensions/qa-lab/src/suite-runtime-flow.test.ts index b051d0de455..c4275659789 100644 --- a/extensions/qa-lab/src/suite-runtime-flow.test.ts +++ b/extensions/qa-lab/src/suite-runtime-flow.test.ts @@ -185,8 +185,8 @@ describe("qa suite runtime flow", () => { }, repoRoot: "/repo", providerMode: "mock-openai", - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4-mini", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5-mini", mock: null, cfg: {} as QaSuiteRuntimeEnv["cfg"], } satisfies Parameters[0]["env"]; diff --git a/extensions/qa-lab/src/suite-runtime-gateway.test.ts b/extensions/qa-lab/src/suite-runtime-gateway.test.ts index 985be291d56..0d88e12c764 100644 --- a/extensions/qa-lab/src/suite-runtime-gateway.test.ts +++ b/extensions/qa-lab/src/suite-runtime-gateway.test.ts @@ -31,7 +31,7 @@ describe("qa suite gateway helpers", () => { profile: "coding", }, agents: { - list: [{ id: "qa", model: { primary: "openai/gpt-5.4" } }], + list: [{ id: "qa", model: { primary: "openai/gpt-5.5" } }], }, }; diff --git a/extensions/qa-lab/src/suite.summary-json.test.ts b/extensions/qa-lab/src/suite.summary-json.test.ts index 5db4a6646f4..90256e17dbf 100644 --- a/extensions/qa-lab/src/suite.summary-json.test.ts +++ b/extensions/qa-lab/src/suite.summary-json.test.ts @@ -13,8 +13,8 @@ describe("buildQaSuiteSummaryJson", () => { startedAt: new Date("2026-04-11T00:00:00.000Z"), finishedAt: new Date("2026-04-11T00:05:00.000Z"), providerMode: "mock-openai" as const, - primaryModel: "openai/gpt-5.4", - alternateModel: "openai/gpt-5.4-alt", + primaryModel: "openai/gpt-5.5", + alternateModel: "openai/gpt-5.5-alt", fastMode: true, concurrency: 2, }; @@ -25,12 +25,12 @@ describe("buildQaSuiteSummaryJson", () => { startedAt: "2026-04-11T00:00:00.000Z", finishedAt: "2026-04-11T00:05:00.000Z", providerMode: "mock-openai", - primaryModel: "openai/gpt-5.4", + primaryModel: "openai/gpt-5.5", primaryProvider: "openai", - primaryModelName: "gpt-5.4", - alternateModel: "openai/gpt-5.4-alt", + primaryModelName: "gpt-5.5", + alternateModel: "openai/gpt-5.5-alt", alternateProvider: "openai", - alternateModelName: "gpt-5.4-alt", + alternateModelName: "gpt-5.5-alt", fastMode: true, concurrency: 2, scenarioIds: null, diff --git a/extensions/qa-lab/src/suite.ts b/extensions/qa-lab/src/suite.ts index 32f39e5a9a9..2e3c5919cae 100644 --- a/extensions/qa-lab/src/suite.ts +++ b/extensions/qa-lab/src/suite.ts @@ -286,7 +286,7 @@ export type QaSuiteSummaryJsonParams = { }; /** - * Strongly-typed shape of `qa-suite-summary.json`. The GPT-5.4 parity gate + * Strongly-typed shape of `qa-suite-summary.json`. The GPT-5.5 parity gate * (agentic-parity-report.ts, #64441) and any future parity wrapper can * import this type instead of re-declaring the shape, so changes to the * summary schema propagate through to every consumer at type-check time. @@ -294,7 +294,7 @@ export type QaSuiteSummaryJsonParams = { export type { QaSuiteSummaryJson } from "./suite-summary.js"; /** - * Pure-ish JSON builder for qa-suite-summary.json. Exported so the GPT-5.4 + * Pure-ish JSON builder for qa-suite-summary.json. Exported so the GPT-5.5 * parity gate (agentic-parity-report.ts, #64441) and any future parity * runner can assert-and-trust the provider/model that produced a given * summary instead of blindly accepting the caller's candidateLabel / diff --git a/extensions/qa-lab/web/src/ui-render.ts b/extensions/qa-lab/web/src/ui-render.ts index 76bd3184a45..8356bb82162 100644 --- a/extensions/qa-lab/web/src/ui-render.ts +++ b/extensions/qa-lab/web/src/ui-render.ts @@ -915,15 +915,15 @@ function renderMessageAttachments(message: Message): string { const MOCK_MODELS: RunnerModelOption[] = [ { - key: "mock-openai/gpt-5.4", - name: "GPT-5.4 (mock)", + key: "mock-openai/gpt-5.5", + name: "GPT-5.5 (mock)", provider: "mock-openai", input: "text", preferred: true, }, { - key: "mock-openai/gpt-5.4-alt", - name: "GPT-5.4 Alt (mock)", + key: "mock-openai/gpt-5.5-alt", + name: "GPT-5.5 Alt (mock)", provider: "mock-openai", input: "text", preferred: false, diff --git a/qa/scenarios/models/codex-harness-no-meta-leak.md b/qa/scenarios/models/codex-harness-no-meta-leak.md index bda6addbccd..c36ff8293bc 100644 --- a/qa/scenarios/models/codex-harness-no-meta-leak.md +++ b/qa/scenarios/models/codex-harness-no-meta-leak.md @@ -24,10 +24,10 @@ codeRefs: - extensions/qa-lab/src/suite.ts execution: kind: flow - summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario codex-harness-no-meta-leak`. + summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --fast --thinking medium --scenario codex-harness-no-meta-leak`. config: requiredProvider: codex - requiredModel: gpt-5.4 + requiredModel: gpt-5.5 harnessRuntime: codex harnessFallback: none expectedReply: QA_LEAK_OK @@ -47,7 +47,7 @@ execution: ```yaml qa-flow steps: - - name: confirms GPT-5.4 Codex harness target + - name: confirms GPT-5.5 Codex harness target actions: - set: selected value: diff --git a/qa/scenarios/models/gpt54-thinking-visibility-switch.md b/qa/scenarios/models/gpt55-thinking-visibility-switch.md similarity index 93% rename from qa/scenarios/models/gpt54-thinking-visibility-switch.md rename to qa/scenarios/models/gpt55-thinking-visibility-switch.md index 92e8c0d2a42..ce75812f05a 100644 --- a/qa/scenarios/models/gpt54-thinking-visibility-switch.md +++ b/qa/scenarios/models/gpt55-thinking-visibility-switch.md @@ -1,17 +1,17 @@ -# GPT-5.4 thinking visibility switch +# GPT-5.5 thinking visibility switch ```yaml qa-scenario -id: gpt54-thinking-visibility-switch -title: GPT-5.4 thinking visibility switch +id: gpt55-thinking-visibility-switch +title: GPT-5.5 thinking visibility switch surface: models coverage: primary: - models.thinking secondary: - runtime.reasoning-visibility -objective: Verify GPT-5.4 can switch from disabled thinking to medium thinking while reasoning display stays enabled. +objective: Verify GPT-5.5 can switch from disabled thinking to medium thinking while reasoning display stays enabled. successCriteria: - - Live runs target openai/gpt-5.4, not a mini or pro variant. + - Live runs target openai/gpt-5.5, not a mini or pro variant. - The session enables reasoning display before the comparison turns. - The disabled-thinking turn returns its visible marker without a Reasoning-prefixed message. - The medium-thinking turn returns its visible marker and a separate Reasoning-prefixed message. @@ -27,10 +27,10 @@ codeRefs: - extensions/qa-lab/src/providers/mock-openai/server.ts execution: kind: flow - summary: Toggle reasoning display and GPT-5.4 thinking between off/none and medium, then verify visible reasoning only on the medium turn. + summary: Toggle reasoning display and GPT-5.5 thinking between off/none and medium, then verify visible reasoning only on the medium turn. config: requiredLiveProvider: openai - requiredLiveModel: gpt-5.4 + requiredLiveModel: gpt-5.5 offDirective: /think off maxDirective: /think medium reasoningDirective: /reasoning on @@ -60,7 +60,7 @@ steps: - assert: expr: "env.providerMode !== 'live-frontier' || (selected?.provider === config.requiredLiveProvider && selected?.model === config.requiredLiveModel)" message: - expr: "`expected live GPT-5.4, got ${env.primaryModel}`" + expr: "`expected live GPT-5.5, got ${env.primaryModel}`" - call: state.addInboundMessage args: - conversation: @@ -133,9 +133,9 @@ steps: value: expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.offPrompt))" - assert: - expr: "String(offRequest?.model ?? '').includes('gpt-5.4')" + expr: "String(offRequest?.model ?? '').includes('gpt-5.5')" message: - expr: "`expected GPT-5.4 off mock request, got ${String(offRequest?.model ?? '')}`" + expr: "`expected GPT-5.5 off mock request, got ${String(offRequest?.model ?? '')}`" detailsExpr: "`off ack=${offAck.text}; off answer=${offAnswer.text}`" - name: switches to medium thinking actions: @@ -204,8 +204,8 @@ steps: value: expr: "requests.find((request) => String(request.allInputText ?? '').includes(config.maxPrompt))" - assert: - expr: "String(maxRequest?.model ?? '').includes('gpt-5.4')" + expr: "String(maxRequest?.model ?? '').includes('gpt-5.5')" message: - expr: "`expected GPT-5.4 mock request, got ${String(maxRequest?.model ?? '')}`" + expr: "`expected GPT-5.5 mock request, got ${String(maxRequest?.model ?? '')}`" detailsExpr: "`answer=${maxAnswer.text}`" ``` diff --git a/qa/scenarios/models/model-switch-follow-up.md b/qa/scenarios/models/model-switch-follow-up.md index 733eff5e4fa..b6ebe2e9d8e 100644 --- a/qa/scenarios/models/model-switch-follow-up.md +++ b/qa/scenarios/models/model-switch-follow-up.md @@ -72,8 +72,8 @@ steps: expr: "state.getSnapshot().messages.filter((candidate) => candidate.direction === 'outbound' && candidate.conversation.id === 'qa-operator' && (() => { const lower = normalizeLowercaseStringOrEmpty(candidate.text); return lower.includes('switch') || lower.includes('handoff'); })()).at(-1)" - expr: resolveQaLiveTurnTimeoutMs(env, 20000, env.alternateModel) - assert: - expr: "!env.mock || ((await fetchJson(`${env.mock.baseUrl}/debug/last-request`))?.body?.model === 'gpt-5.4-alt')" + expr: "!env.mock || ((await fetchJson(`${env.mock.baseUrl}/debug/last-request`))?.body?.model === 'gpt-5.5-alt')" message: - expr: "`expected gpt-5.4-alt, got ${String((await fetchJson(`${env.mock.baseUrl}/debug/last-request`))?.body?.model ?? '')}`" + expr: "`expected gpt-5.5-alt, got ${String((await fetchJson(`${env.mock.baseUrl}/debug/last-request`))?.body?.model ?? '')}`" detailsExpr: outbound.text ``` diff --git a/qa/scenarios/models/openai-native-web-search-live.md b/qa/scenarios/models/openai-native-web-search-live.md index 6549eab202a..87afbfa0c9e 100644 --- a/qa/scenarios/models/openai-native-web-search-live.md +++ b/qa/scenarios/models/openai-native-web-search-live.md @@ -12,7 +12,7 @@ coverage: objective: Verify a live OpenAI GPT model can use OpenAI native web_search when OpenClaw web search is enabled in auto mode. successCriteria: - A live-frontier run fails fast unless the selected primary provider is openai. - - The selected primary model is GPT-5.4, not a mini or pro variant. + - The selected primary model is GPT-5.5, not a mini or pro variant. - Web search is enabled without pinning a managed web_search provider. - The live reply includes the required marker plus an official OpenAI News URL and headline found through web search. gatewayConfigPatch: @@ -32,10 +32,10 @@ codeRefs: - extensions/qa-lab/src/suite.ts execution: kind: flow - summary: Run with `OPENCLAW_LIVE_OPENAI_KEY="${OPENAI_API_KEY}" pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario openai-native-web-search-live`. + summary: Run with `OPENCLAW_LIVE_OPENAI_KEY="${OPENAI_API_KEY}" pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --fast --thinking medium --scenario openai-native-web-search-live`. config: requiredProvider: openai - requiredModel: gpt-5.4 + requiredModel: gpt-5.5 expectedMarker: WEB-SEARCH-OK failureMarker: WEB-SEARCH-FAILED searchPrompt: |- @@ -49,7 +49,7 @@ execution: ```yaml qa-flow steps: - - name: confirms live OpenAI GPT-5.4 web search auto mode + - name: confirms live OpenAI GPT-5.5 web search auto mode actions: - call: waitForGatewayHealthy args: diff --git a/qa/scenarios/models/thinking-slash-model-remap.md b/qa/scenarios/models/thinking-slash-model-remap.md index 786386565da..2e61e66666e 100644 --- a/qa/scenarios/models/thinking-slash-model-remap.md +++ b/qa/scenarios/models/thinking-slash-model-remap.md @@ -21,8 +21,8 @@ gatewayConfigPatch: params: {} successCriteria: - Anthropic Claude Sonnet 4.6 advertises adaptive but not OpenAI-only xhigh or Opus max. - - A stored adaptive level remaps to medium when switching to OpenAI GPT-5.4. - - OpenAI GPT-5.4 advertises xhigh but not adaptive or max. + - A stored adaptive level remaps to medium when switching to OpenAI GPT-5.5. + - OpenAI GPT-5.5 advertises xhigh but not adaptive or max. - A stored xhigh level remaps to high when switching to an Anthropic model without xhigh support. docsRefs: - docs/tools/thinking.md @@ -41,7 +41,7 @@ execution: config: requiredProviderMode: live-frontier anthropicModelRef: anthropic/claude-sonnet-4-6 - openAiXhighModelRef: openai/gpt-5.4 + openAiXhighModelRef: openai/gpt-5.5 noXhighModelRef: anthropic/claude-sonnet-4-6 conversationId: thinking-slash-remap sessionKey: agent:qa:main @@ -142,7 +142,7 @@ steps: - assert: expr: "/Options: .*\\bxhigh\\b/i.test(openAiThinkStatus.text) && !/Options: .*\\badaptive\\b/i.test(openAiThinkStatus.text) && !/Options: .*\\bmax\\b/i.test(openAiThinkStatus.text)" message: - expr: "`expected OpenAI GPT-5.4 /think options to include xhigh only, got ${openAiThinkStatus.text}`" + expr: "`expected OpenAI GPT-5.5 /think options to include xhigh only, got ${openAiThinkStatus.text}`" detailsExpr: "`adaptive=${adaptiveAck.text}; switch=${JSON.stringify(openAiModelAck.resolved)}; think=${openAiThinkStatus.text}`" - name: maps xhigh to high on a model without xhigh actions: diff --git a/qa/scenarios/runtime/compaction-retry-mutating-tool.md b/qa/scenarios/runtime/compaction-retry-mutating-tool.md index c67ad6a53d9..31702d4f67e 100644 --- a/qa/scenarios/runtime/compaction-retry-mutating-tool.md +++ b/qa/scenarios/runtime/compaction-retry-mutating-tool.md @@ -17,7 +17,7 @@ successCriteria: - Scenario details preserve the observed compaction count for review context. docsRefs: - docs/help/testing.md - - docs/help/gpt54-codex-agentic-parity.md + - docs/help/gpt55-codex-agentic-parity.md codeRefs: - extensions/qa-lab/src/suite.ts - extensions/qa-lab/src/mock-openai-server.ts diff --git a/qa/scenarios/runtime/reasoning-only-no-auto-retry-after-write.md b/qa/scenarios/runtime/reasoning-only-no-auto-retry-after-write.md index d98edf5491f..026c55bf796 100644 --- a/qa/scenarios/runtime/reasoning-only-no-auto-retry-after-write.md +++ b/qa/scenarios/runtime/reasoning-only-no-auto-retry-after-write.md @@ -17,7 +17,7 @@ successCriteria: - Mock trace stops after the write-side reasoning-only terminal turn instead of attempting a continuation. docsRefs: - docs/help/testing.md - - docs/help/gpt54-codex-agentic-parity.md + - docs/help/gpt55-codex-agentic-parity.md codeRefs: - extensions/qa-lab/src/mock-openai-server.ts - src/agents/pi-embedded-runner/run/incomplete-turn.ts diff --git a/qa/scenarios/workspace/medium-game-plan-codex-harness.md b/qa/scenarios/workspace/medium-game-plan-codex-harness.md index d14b65ca724..4c268f5eaa1 100644 --- a/qa/scenarios/workspace/medium-game-plan-codex-harness.md +++ b/qa/scenarios/workspace/medium-game-plan-codex-harness.md @@ -11,7 +11,7 @@ coverage: - models.codex-cli objective: Verify the Codex app-server harness can plan and build a medium-complex self-contained browser game. successCriteria: - - A live-frontier run fails fast unless the selected primary model is openai/gpt-5.4 with the Codex harness forced. + - A live-frontier run fails fast unless the selected primary model is openai/gpt-5.5 with the Codex harness forced. - The scenario forces the Codex embedded harness and disables PI fallback. - The prompt explicitly asks the agent to enter plan mode before editing. - The agent writes a self-contained HTML game with a canvas loop, controls, scoring, waves, pause, and restart. @@ -25,10 +25,10 @@ codeRefs: - extensions/qa-lab/src/suite.ts execution: kind: flow - summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario medium-game-plan-codex-harness`. + summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --fast --thinking medium --scenario medium-game-plan-codex-harness`. config: requiredProvider: codex - requiredModel: gpt-5.4 + requiredModel: gpt-5.5 harnessRuntime: codex harnessFallback: none artifactFile: star-garden-defenders-codex.html @@ -52,7 +52,7 @@ execution: ```yaml qa-flow steps: - - name: confirms GPT-5.4 Codex harness target + - name: confirms GPT-5.5 Codex harness target actions: - set: selected value: diff --git a/qa/scenarios/workspace/medium-game-plan-pi-harness.md b/qa/scenarios/workspace/medium-game-plan-pi-harness.md index 29e0b4decf7..f44efea9125 100644 --- a/qa/scenarios/workspace/medium-game-plan-pi-harness.md +++ b/qa/scenarios/workspace/medium-game-plan-pi-harness.md @@ -9,9 +9,9 @@ coverage: - workspace.planning secondary: - agents.pi-harness -objective: Verify GPT-5.4 can use the PI harness to plan and build a medium-complex self-contained browser game. +objective: Verify GPT-5.5 can use the PI harness to plan and build a medium-complex self-contained browser game. successCriteria: - - A live-frontier run fails fast unless the selected primary model is openai/gpt-5.4. + - A live-frontier run fails fast unless the selected primary model is openai/gpt-5.5. - The scenario forces the embedded PI harness before the build turn. - The prompt explicitly asks the agent to enter plan mode before editing. - The agent writes a self-contained HTML game with a canvas loop, controls, scoring, waves, pause, and restart. @@ -25,10 +25,10 @@ codeRefs: - extensions/qa-lab/src/suite.ts execution: kind: flow - summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.4 --alt-model openai/gpt-5.4 --fast --thinking medium --scenario medium-game-plan-pi-harness`. + summary: Run with `pnpm openclaw qa suite --provider-mode live-frontier --model openai/gpt-5.5 --alt-model openai/gpt-5.5 --fast --thinking medium --scenario medium-game-plan-pi-harness`. config: requiredProvider: openai - requiredModel: gpt-5.4 + requiredModel: gpt-5.5 harnessRuntime: pi harnessFallback: pi artifactFile: star-garden-defenders-pi.html @@ -52,7 +52,7 @@ execution: ```yaml qa-flow steps: - - name: confirms GPT-5.4 PI harness target + - name: confirms GPT-5.5 PI harness target actions: - set: selected value: diff --git a/scripts/control-ui-i18n.ts b/scripts/control-ui-i18n.ts index df8531d12b9..5d22dd68959 100644 --- a/scripts/control-ui-i18n.ts +++ b/scripts/control-ui-i18n.ts @@ -58,7 +58,7 @@ type TranslationBatchItem = { }; const CONTROL_UI_I18N_WORKFLOW = 1; -const DEFAULT_OPENAI_MODEL = "gpt-5.4"; +const DEFAULT_OPENAI_MODEL = "gpt-5.5"; const DEFAULT_ANTHROPIC_MODEL = "claude-opus-4-6"; const DEFAULT_PROVIDER = "openai"; const DEFAULT_PI_PACKAGE_VERSION = "0.58.3"; diff --git a/scripts/docker/install-sh-e2e/run.sh b/scripts/docker/install-sh-e2e/run.sh index e27f45c8866..383722d6c35 100755 --- a/scripts/docker/install-sh-e2e/run.sh +++ b/scripts/docker/install-sh-e2e/run.sh @@ -501,7 +501,7 @@ run_profile() { local image_model if [[ "$agent_model_provider" == "openai" ]]; then agent_model="$(set_agent_model "$profile" \ - "openai/gpt-5.4" \ + "openai/gpt-5.5" \ "openai/gpt-4o-mini" \ "openai/gpt-4o")" image_model="$(set_image_model "$profile" \ diff --git a/scripts/docs-i18n/translator_test.go b/scripts/docs-i18n/translator_test.go index d0befbc607c..ce63d7d4815 100644 --- a/scripts/docs-i18n/translator_test.go +++ b/scripts/docs-i18n/translator_test.go @@ -183,13 +183,13 @@ func TestResolveDocsPiCommandUsesOverrideEnv(t *testing.T) { func TestDocsPiModelRefUsesProviderPrefixWhenProviderFlagIsOmitted(t *testing.T) { t.Setenv(envDocsI18nProvider, "openai") - t.Setenv(envDocsI18nModel, "gpt-5.4") + t.Setenv(envDocsI18nModel, "gpt-5.5") t.Setenv(envDocsPiOmitProvider, "1") if got := docsPiProviderArg(); got != "" { t.Fatalf("expected empty provider arg when omit-provider is enabled, got %q", got) } - if got := docsPiModelRef(); got != "openai/gpt-5.4" { + if got := docsPiModelRef(); got != "openai/gpt-5.5" { t.Fatalf("expected provider-qualified model ref, got %q", got) } } diff --git a/scripts/docs-i18n/util.go b/scripts/docs-i18n/util.go index 629e4c8a7aa..4e564e9ff17 100644 --- a/scripts/docs-i18n/util.go +++ b/scripts/docs-i18n/util.go @@ -14,7 +14,7 @@ const ( docsI18nEngineName = "pi" envDocsI18nProvider = "OPENCLAW_DOCS_I18N_PROVIDER" envDocsI18nModel = "OPENCLAW_DOCS_I18N_MODEL" - defaultOpenAIModel = "gpt-5.4" + defaultOpenAIModel = "gpt-5.5" defaultAnthropicModel = "claude-opus-4-6" defaultFallbackProvider = "openai" defaultFallbackModelName = defaultOpenAIModel diff --git a/scripts/e2e/docker-openai-seed.ts b/scripts/e2e/docker-openai-seed.ts index 5df68998533..bc2b5ef7e93 100644 --- a/scripts/e2e/docker-openai-seed.ts +++ b/scripts/e2e/docker-openai-seed.ts @@ -6,12 +6,12 @@ import { export type { OpenClawConfig }; -const DOCKER_OPENAI_MODEL_REF = "openai/gpt-5.4"; +const DOCKER_OPENAI_MODEL_REF = "openai/gpt-5.5"; const DOCKER_OPENAI_BASE_URL = process.env.OPENCLAW_DOCKER_OPENAI_BASE_URL?.trim() || "http://127.0.0.1:9/v1"; const DOCKER_OPENAI_MODEL: ModelDefinitionConfig = { - id: "gpt-5.4", - name: "gpt-5.4", + id: "gpt-5.5", + name: "gpt-5.5", api: "openai-responses", reasoning: true, input: ["text", "image"], diff --git a/scripts/e2e/mock-openai-server.mjs b/scripts/e2e/mock-openai-server.mjs index c463817fa4f..814d7b9766c 100644 --- a/scripts/e2e/mock-openai-server.mjs +++ b/scripts/e2e/mock-openai-server.mjs @@ -111,7 +111,7 @@ const server = http.createServer(async (req, res) => { if (req.method === "GET" && url.pathname === "/v1/models") { writeJson(res, 200, { object: "list", - data: [{ id: "gpt-5.4", object: "model", owned_by: "openclaw-e2e" }], + data: [{ id: "gpt-5.5", object: "model", owned_by: "openclaw-e2e" }], }); return; } diff --git a/scripts/e2e/npm-onboard-channel-agent-docker.sh b/scripts/e2e/npm-onboard-channel-agent-docker.sh index 5e2ca831e31..faceb0041fc 100644 --- a/scripts/e2e/npm-onboard-channel-agent-docker.sh +++ b/scripts/e2e/npm-onboard-channel-agent-docker.sh @@ -206,7 +206,7 @@ const path = require("node:path"); const mockPort = Number(process.argv[2]); const configPath = path.join(process.env.HOME, ".openclaw", "openclaw.json"); const cfg = JSON.parse(fs.readFileSync(configPath, "utf8")); -const modelRef = "openai/gpt-5.4"; +const modelRef = "openai/gpt-5.5"; const cost = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }; cfg.models = { @@ -222,8 +222,8 @@ cfg.models = { request: { ...(cfg.models?.providers?.openai?.request || {}), allowPrivateNetwork: true }, models: [ { - id: "gpt-5.4", - name: "gpt-5.4", + id: "gpt-5.5", + name: "gpt-5.5", api: "openai-responses", reasoning: false, input: ["text", "image"], diff --git a/scripts/e2e/parallels-linux-smoke.sh b/scripts/e2e/parallels-linux-smoke.sh index 9460aa3961f..8b6ba7596c5 100644 --- a/scripts/e2e/parallels-linux-smoke.sh +++ b/scripts/e2e/parallels-linux-smoke.sh @@ -200,7 +200,7 @@ case "$PROVIDER" in openai) AUTH_CHOICE="openai-api-key" AUTH_KEY_FLAG="openai-api-key" - MODEL_ID="openai/gpt-5.4" + MODEL_ID="openai/gpt-5.5" [[ -n "$API_KEY_ENV" ]] || API_KEY_ENV="OPENAI_API_KEY" ;; anthropic) diff --git a/scripts/e2e/parallels-macos-smoke.sh b/scripts/e2e/parallels-macos-smoke.sh index c825e53c646..558ca7fecf8 100644 --- a/scripts/e2e/parallels-macos-smoke.sh +++ b/scripts/e2e/parallels-macos-smoke.sh @@ -258,7 +258,7 @@ case "$PROVIDER" in openai) AUTH_CHOICE="openai-api-key" AUTH_KEY_FLAG="openai-api-key" - MODEL_ID="openai/gpt-5.4" + MODEL_ID="openai/gpt-5.5" [[ -n "$API_KEY_ENV" ]] || API_KEY_ENV="OPENAI_API_KEY" ;; anthropic) diff --git a/scripts/e2e/parallels-npm-update-smoke.sh b/scripts/e2e/parallels-npm-update-smoke.sh index d43b5d03f15..61c73eabb56 100755 --- a/scripts/e2e/parallels-npm-update-smoke.sh +++ b/scripts/e2e/parallels-npm-update-smoke.sh @@ -206,7 +206,7 @@ case "$PROVIDER" in openai) AUTH_CHOICE="openai-api-key" AUTH_KEY_FLAG="openai-api-key" - MODEL_ID="openai/gpt-5.4" + MODEL_ID="openai/gpt-5.5" [[ -n "$API_KEY_ENV" ]] || API_KEY_ENV="OPENAI_API_KEY" ;; anthropic) diff --git a/scripts/e2e/parallels-windows-smoke.sh b/scripts/e2e/parallels-windows-smoke.sh index 607445b7bd6..dfc4aa5b428 100644 --- a/scripts/e2e/parallels-windows-smoke.sh +++ b/scripts/e2e/parallels-windows-smoke.sh @@ -249,7 +249,7 @@ case "$PROVIDER" in openai) AUTH_CHOICE="openai-api-key" AUTH_KEY_FLAG="openai-api-key" - MODEL_ID="openai/gpt-5.4" + MODEL_ID="openai/gpt-5.5" [[ -n "$API_KEY_ENV" ]] || API_KEY_ENV="OPENAI_API_KEY" ;; anthropic) diff --git a/scripts/openclaw-cross-os-release-checks.ts b/scripts/openclaw-cross-os-release-checks.ts index 1ead13c0bcc..252bd813d24 100644 --- a/scripts/openclaw-cross-os-release-checks.ts +++ b/scripts/openclaw-cross-os-release-checks.ts @@ -35,7 +35,7 @@ const providerConfig = { extensionId: "openai", secretEnv: "OPENAI_API_KEY", authChoice: "openai-api-key", - model: "openai/gpt-5.4", + model: "openai/gpt-5.5", }, anthropic: { extensionId: "anthropic", diff --git a/scripts/test-live-codex-harness-docker.sh b/scripts/test-live-codex-harness-docker.sh index 3f6d120e85b..1136ca61835 100644 --- a/scripts/test-live-codex-harness-docker.sh +++ b/scripts/test-live-codex-harness-docker.sh @@ -200,7 +200,7 @@ openclaw_live_codex_harness_append_build_extension codex "$ROOT_DIR/scripts/test-live-build-docker.sh" echo "==> Run Codex harness live test in Docker" -echo "==> Model: ${OPENCLAW_LIVE_CODEX_HARNESS_MODEL:-codex/gpt-5.4}" +echo "==> Model: ${OPENCLAW_LIVE_CODEX_HARNESS_MODEL:-codex/gpt-5.5}" echo "==> Image probe: ${OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE:-1}" echo "==> MCP probe: ${OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE:-1}" echo "==> Guardian probe: ${OPENCLAW_LIVE_CODEX_HARNESS_GUARDIAN_PROBE:-1}" @@ -227,7 +227,7 @@ DOCKER_RUN_ARGS=(docker run --rm -t \ -e OPENCLAW_LIVE_CODEX_HARNESS_GUARDIAN_PROBE="${OPENCLAW_LIVE_CODEX_HARNESS_GUARDIAN_PROBE:-1}" \ -e OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE="${OPENCLAW_LIVE_CODEX_HARNESS_IMAGE_PROBE:-1}" \ -e OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE="${OPENCLAW_LIVE_CODEX_HARNESS_MCP_PROBE:-1}" \ - -e OPENCLAW_LIVE_CODEX_HARNESS_MODEL="${OPENCLAW_LIVE_CODEX_HARNESS_MODEL:-codex/gpt-5.4}" \ + -e OPENCLAW_LIVE_CODEX_HARNESS_MODEL="${OPENCLAW_LIVE_CODEX_HARNESS_MODEL:-codex/gpt-5.5}" \ -e OPENCLAW_LIVE_CODEX_HARNESS_REQUIRE_GUARDIAN_EVENTS="${OPENCLAW_LIVE_CODEX_HARNESS_REQUIRE_GUARDIAN_EVENTS:-1}" \ -e OPENCLAW_LIVE_CODEX_HARNESS_REQUEST_TIMEOUT_MS="${OPENCLAW_LIVE_CODEX_HARNESS_REQUEST_TIMEOUT_MS:-}" \ -e OPENCLAW_LIVE_CODEX_HARNESS_USE_CI_SAFE_CODEX_CONFIG="${OPENCLAW_LIVE_CODEX_HARNESS_USE_CI_SAFE_CODEX_CONFIG:-1}" \