From ab0c40d26ae35ac8425633b7ca16d5b33b2662ba Mon Sep 17 00:00:00 2001 From: Bradley Axen Date: Tue, 21 Apr 2026 15:32:24 -0700 Subject: [PATCH] fix: add strict:false to Responses API tools and gpt-5.4 to known models (#8636) Signed-off-by: Bradley Axen --- crates/goose-test-support/src/session.rs | 7 +- crates/goose/src/model.rs | 49 ++++++-- crates/goose/src/providers/databricks.rs | 57 ++++++++- .../goose/src/providers/formats/databricks.rs | 82 +++++++++---- crates/goose/src/providers/formats/openai.rs | 15 ++- .../src/providers/formats/openai_responses.rs | 107 ++++++++++++++++- crates/goose/src/providers/openai.rs | 86 +++++-------- crates/goose/src/providers/utils.rs | 113 +++++++++++++++--- crates/goose/tests/acp_common_tests/mod.rs | 29 +++-- .../goose/tests/acp_custom_requests_test.rs | 3 + .../tests/session_id_propagation_test.rs | 56 ++++++++- 11 files changed, 474 insertions(+), 130 deletions(-) diff --git a/crates/goose-test-support/src/session.rs b/crates/goose-test-support/src/session.rs index c150e82cd8..c3ac411d12 100644 --- a/crates/goose-test-support/src/session.rs +++ b/crates/goose-test-support/src/session.rs @@ -1,7 +1,12 @@ use std::sync::{Arc, Mutex}; pub const TEST_SESSION_ID: &str = "test-session-id"; -pub const TEST_MODEL: &str = "gpt-5-nano"; +// Use a Chat Completions model so the canned SSE fixtures (which return +// Chat Completions format) are parsed correctly. gpt-5-nano now routes to +// the Responses API which needs a different mock format. +// TODO: add a Responses API mock to OpenAiFixture so tests can cover +// responses-routed models like gpt-5-nano end-to-end. +pub const TEST_MODEL: &str = "gpt-4.1"; const NOT_YET_SET: &str = "session-id-not-yet-set"; pub(crate) const SESSION_ID_HEADER: &str = "agent-session-id"; diff --git a/crates/goose/src/model.rs b/crates/goose/src/model.rs index ab1d76bd97..7a2a11b4d5 100644 --- a/crates/goose/src/model.rs +++ b/crates/goose/src/model.rs @@ -138,9 +138,22 @@ impl ModelConfig { } } - if let Some(canonical) = + // Try canonical lookup with the full model name first, then fall back + // to the name with reasoning-effort suffixes stripped (e.g. + // "databricks-gpt-5.4-high" → "databricks-gpt-5.4"). 
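+        // Full-name lookup runs first so a provider that registers the
+        // suffixed alias itself as a canonical entry still wins over the
+        // stripped fallback.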
+ let canonical = crate::providers::canonical::maybe_get_canonical_model(provider_name, &self.model_name) - { + .or_else(|| { + let (base, _effort) = + crate::providers::utils::extract_reasoning_effort(&self.model_name); + if base != self.model_name { + crate::providers::canonical::maybe_get_canonical_model(provider_name, &base) + } else { + None + } + }); + + if let Some(canonical) = canonical { if self.context_limit.is_none() { self.context_limit = Some(canonical.limit.context); } @@ -299,15 +312,7 @@ impl ModelConfig { } pub fn is_openai_reasoning_model(&self) -> bool { - const DATABRICKS_MODEL_NAME_PREFIXES: &[&str] = &["goose-", "databricks-"]; - const REASONING_PREFIXES: &[&str] = &["o1", "o3", "o4", "gpt-5"]; - - let base = DATABRICKS_MODEL_NAME_PREFIXES - .iter() - .find_map(|p| self.model_name.strip_prefix(p)) - .unwrap_or(&self.model_name); - - REASONING_PREFIXES.iter().any(|p| base.starts_with(p)) + crate::providers::utils::is_openai_responses_model(&self.model_name) } pub fn max_output_tokens(&self) -> i32 { @@ -499,6 +504,28 @@ mod tests { assert_eq!(config.max_tokens, None); assert_eq!(config.reasoning, None); } + + #[test] + fn resolves_after_stripping_reasoning_effort_suffix() { + let _guard = env_lock::lock_env([ + ("GOOSE_MAX_TOKENS", None::<&str>), + ("GOOSE_CONTEXT_LIMIT", None::<&str>), + ]); + + // "databricks-gpt-5.4-high" should resolve via "databricks-gpt-5.4" + let config = ModelConfig::new_or_fail("databricks-gpt-5.4-high") + .with_canonical_limits("databricks"); + assert_eq!(config.context_limit, Some(1_050_000)); + + // "gpt-5.4-xhigh" should resolve via "gpt-5.4" + let config = ModelConfig::new_or_fail("gpt-5.4-xhigh").with_canonical_limits("openai"); + assert_eq!(config.context_limit, Some(1_050_000)); + + // "gpt-5.4-nano-low" should resolve via "gpt-5.4-nano" + let config = + ModelConfig::new_or_fail("gpt-5.4-nano-low").with_canonical_limits("openai"); + assert_eq!(config.context_limit, Some(400_000)); + } } mod is_openai_reasoning_model { diff --git a/crates/goose/src/providers/databricks.rs b/crates/goose/src/providers/databricks.rs index 3b1df79ea4..3dd342210e 100644 --- a/crates/goose/src/providers/databricks.rs +++ b/crates/goose/src/providers/databricks.rs @@ -269,17 +269,19 @@ impl DatabricksProvider { } fn is_responses_model(model_name: &str) -> bool { - let normalized = model_name.to_ascii_lowercase(); - normalized.contains("codex") + super::utils::is_openai_responses_model(model_name) } fn get_endpoint_path(&self, model_name: &str, is_embedding: bool) -> String { if is_embedding { "serving-endpoints/text-embedding-3-small/invocations".to_string() - } else if Self::is_responses_model(model_name) { - "serving-endpoints/responses".to_string() } else { - format!("serving-endpoints/{}/invocations", model_name) + let (clean_name, _) = super::utils::extract_reasoning_effort(model_name); + if Self::is_responses_model(&clean_name) { + "serving-endpoints/responses".to_string() + } else { + format!("serving-endpoints/{}/invocations", clean_name) + } } } @@ -594,3 +596,48 @@ impl EmbeddingCapable for DatabricksProvider { Ok(embeddings) } } + +#[cfg(test)] +mod tests { + use super::*; + + fn test_provider() -> DatabricksProvider { + DatabricksProvider { + api_client: super::super::api_client::ApiClient::new( + "https://example.com".to_string(), + super::super::api_client::AuthMethod::NoAuth, + ) + .unwrap(), + auth: DatabricksAuth::Token("fake".into()), + model: ModelConfig::new_or_fail("databricks-gpt-5.4"), + image_format: ImageFormat::OpenAi, + 
retry_config: RetryConfig::default(), + fast_retry_config: RetryConfig::new(0, 0, 1.0, 0), + name: "databricks".into(), + token_cache: std::sync::Arc::new(std::sync::Mutex::new(None)), + instance_id: None, + } + } + + #[test] + fn responses_models_route_to_responses_endpoint() { + let provider = test_provider(); + + for (model_name, expected_path) in [ + ("gpt-5.4", "serving-endpoints/responses"), + ("databricks-gpt-5.4-high", "serving-endpoints/responses"), + ("databricks-gpt-5-4-xhigh", "serving-endpoints/responses"), + ("o3-mini", "serving-endpoints/responses"), + ( + "databricks-claude-sonnet-4", + "serving-endpoints/databricks-claude-sonnet-4/invocations", + ), + ] { + assert_eq!( + provider.get_endpoint_path(model_name, false), + expected_path, + "unexpected endpoint for {model_name}" + ); + } + } +} diff --git a/crates/goose/src/providers/formats/databricks.rs b/crates/goose/src/providers/formats/databricks.rs index 74b1f3de58..584bbdf823 100644 --- a/crates/goose/src/providers/formats/databricks.rs +++ b/crates/goose/src/providers/formats/databricks.rs @@ -2,8 +2,9 @@ use crate::conversation::message::{Message, MessageContent}; use crate::model::ModelConfig; use crate::providers::formats::anthropic::{thinking_effort, thinking_type, ThinkingType}; use crate::providers::utils::{ - convert_image, detect_image_path, is_valid_function_name, load_image_file, safely_parse_json, - sanitize_function_name, ImageFormat, + convert_image, detect_image_path, extract_reasoning_effort, is_openai_responses_model, + is_valid_function_name, load_image_file, safely_parse_json, sanitize_function_name, + ImageFormat, }; use anyhow::{anyhow, Error}; use rmcp::model::{ @@ -581,24 +582,8 @@ pub fn create_request( )); } - let is_openai_reasoning_model = model_config.is_openai_reasoning_model(); - let (model_name, reasoning_effort) = if is_openai_reasoning_model { - let parts: Vec<&str> = model_config.model_name.split('-').collect(); - let last_part = parts.last().unwrap(); - - match *last_part { - "low" | "medium" | "high" => { - let base_name = parts[..parts.len() - 1].join("-"); - (base_name, Some(last_part.to_string())) - } - _ => ( - model_config.model_name.to_string(), - Some("medium".to_string()), - ), - } - } else { - (model_config.model_name.to_string(), None) - }; + let (model_name, reasoning_effort) = extract_reasoning_effort(&model_config.model_name); + let is_openai_reasoning_model = is_openai_responses_model(&model_name); let system_message = DatabricksMessage { role: "system".to_string(), @@ -1073,6 +1058,63 @@ mod tests { Ok(()) } + #[test] + fn test_create_request_reasoning_effort_xhigh() -> anyhow::Result<()> { + let model_config = ModelConfig { + model_name: "o3-xhigh".to_string(), + context_limit: Some(4096), + temperature: None, + max_tokens: Some(1024), + toolshim: false, + toolshim_model: None, + fast_model_config: None, + request_params: None, + reasoning: None, + }; + let request = create_request(&model_config, "system", &[], &[], &ImageFormat::OpenAi)?; + assert_eq!(request["model"], "o3"); + assert_eq!(request["reasoning_effort"], "xhigh"); + Ok(()) + } + + #[test] + fn test_create_request_reasoning_effort_none() -> anyhow::Result<()> { + let model_config = ModelConfig { + model_name: "o3-none".to_string(), + context_limit: Some(4096), + temperature: None, + max_tokens: Some(1024), + toolshim: false, + toolshim_model: None, + fast_model_config: None, + request_params: None, + reasoning: None, + }; + let request = create_request(&model_config, "system", &[], &[], 
&ImageFormat::OpenAi)?; + assert_eq!(request["model"], "o3"); + assert_eq!(request["reasoning_effort"], "none"); + Ok(()) + } + + #[test] + fn test_create_request_reasoning_effort_for_prefixed_gpt5_model() -> anyhow::Result<()> { + let model_config = ModelConfig { + model_name: "databricks-gpt-5.4-high".to_string(), + context_limit: Some(4096), + temperature: None, + max_tokens: Some(1024), + toolshim: false, + toolshim_model: None, + fast_model_config: None, + request_params: None, + reasoning: None, + }; + let request = create_request(&model_config, "system", &[], &[], &ImageFormat::OpenAi)?; + assert_eq!(request["model"], "databricks-gpt-5.4"); + assert_eq!(request["reasoning_effort"], "high"); + Ok(()) + } + #[test] fn test_create_request_adaptive_thinking_for_46_models() -> anyhow::Result<()> { let _guard = env_lock::lock_env([ diff --git a/crates/goose/src/providers/formats/openai.rs b/crates/goose/src/providers/formats/openai.rs index e896ecb164..cb70a91ca1 100644 --- a/crates/goose/src/providers/formats/openai.rs +++ b/crates/goose/src/providers/formats/openai.rs @@ -4,8 +4,9 @@ use crate::model::ModelConfig; use crate::providers::base::{ProviderUsage, Usage}; use crate::providers::errors::ProviderError; use crate::providers::utils::{ - convert_image, detect_image_path, extract_reasoning_effort, is_valid_function_name, - load_image_file, safely_parse_json, sanitize_function_name, ImageFormat, + convert_image, detect_image_path, extract_reasoning_effort, is_openai_responses_model, + is_valid_function_name, load_image_file, safely_parse_json, sanitize_function_name, + ImageFormat, }; use anyhow::{anyhow, Error}; use async_stream::try_stream; @@ -984,7 +985,7 @@ pub fn create_request( } let (model_name, reasoning_effort) = extract_reasoning_effort(&model_config.model_name); - let is_reasoning_model = reasoning_effort.is_some(); + let is_reasoning_model = is_openai_responses_model(&model_name); let system_message = json!({ "role": if is_reasoning_model { "developer" } else { "system" }, @@ -1716,7 +1717,8 @@ mod tests { #[test] fn test_create_request_o1_default() -> anyhow::Result<()> { - // Test default medium reasoning effort for O1 model + // Without an explicit effort suffix the API picks its own default; + // we should omit reasoning_effort entirely but still use "developer" role. 
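+        // (Reasoning models still map max_tokens to "max_completion_tokens",
+        // which the expected payload below also asserts.)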
let model_config = ModelConfig { model_name: "o1".to_string(), context_limit: Some(4096), @@ -1745,13 +1747,16 @@ mod tests { "content": "system" } ], - "reasoning_effort": "medium", "max_completion_tokens": 1024 }); for (key, value) in expected.as_object().unwrap() { assert_eq!(obj.get(key).unwrap(), value); } + assert!( + obj.get("reasoning_effort").is_none(), + "reasoning_effort should be omitted when no explicit suffix is provided" + ); Ok(()) } diff --git a/crates/goose/src/providers/formats/openai_responses.rs b/crates/goose/src/providers/formats/openai_responses.rs index 68e0f785f5..1023805953 100644 --- a/crates/goose/src/providers/formats/openai_responses.rs +++ b/crates/goose/src/providers/formats/openai_responses.rs @@ -2,7 +2,7 @@ use crate::conversation::message::{Message, MessageContent}; use crate::mcp_utils::extract_text_from_resource; use crate::model::ModelConfig; use crate::providers::base::{ProviderUsage, Usage}; -use crate::providers::utils::extract_reasoning_effort; +use crate::providers::utils::{extract_reasoning_effort, is_openai_responses_model}; use anyhow::{anyhow, Error}; use async_stream::try_stream; use chrono; @@ -468,7 +468,10 @@ pub fn create_responses_request( add_message_items(&mut input_items, messages); let (model_name, reasoning_effort) = extract_reasoning_effort(&model_config.model_name); - let is_reasoning_model = reasoning_effort.is_some(); + // All models routed here are responses-capable; temperature is rejected + // by the API for reasoning models regardless of whether an explicit + // effort suffix was provided. + let is_reasoning_model = is_openai_responses_model(&model_name); let mut payload = json!({ "model": model_name, @@ -495,6 +498,7 @@ pub fn create_responses_request( "name": tool.name, "description": tool.description, "parameters": tool.input_schema, + "strict": false, }) }) .collect(); @@ -1083,4 +1087,103 @@ mod tests { assert_eq!(info.effort.as_deref(), Some("high")); assert_eq!(info.summary.as_deref(), Some("Thought deeply")); } + + #[test] + fn test_responses_tools_include_strict_false() { + let model_config = ModelConfig { + model_name: "gpt-5.4".to_string(), + context_limit: None, + temperature: None, + max_tokens: None, + toolshim: false, + toolshim_model: None, + fast_model_config: None, + request_params: None, + reasoning: None, + }; + + let tool = Tool::new( + "shell", + "Execute a shell command", + object!({ + "type": "object", + "properties": { + "command": { + "type": "string", + "description": "The command to run" + } + }, + "required": ["command"] + }), + ); + + let result = + create_responses_request(&model_config, "You are helpful.", &[], &[tool]).unwrap(); + let tools = result["tools"] + .as_array() + .expect("tools should be an array"); + assert_eq!(tools.len(), 1); + assert_eq!(tools[0]["strict"], json!(false), + "Responses API defaults strict to true, but MCP tool schemas are not strict-compatible; must explicitly set strict: false"); + } + + #[test] + fn test_responses_request_with_explicit_effort_suffix() { + for (model_name, expected_model, expected_effort) in [ + ("gpt-5.4-xhigh", "gpt-5.4", "xhigh"), + ("databricks-gpt-5.4-high", "databricks-gpt-5.4", "high"), + ("databricks-o3-none", "databricks-o3", "none"), + ] { + let model_config = ModelConfig { + model_name: model_name.to_string(), + context_limit: None, + temperature: None, + max_tokens: None, + toolshim: false, + toolshim_model: None, + fast_model_config: None, + request_params: None, + reasoning: None, + }; + + let result = + 
create_responses_request(&model_config, "You are helpful.", &[], &[]).unwrap(); + + assert_eq!( + result["model"], expected_model, + "unexpected model for {model_name}" + ); + assert_eq!( + result["reasoning"]["effort"], expected_effort, + "unexpected effort for {model_name}" + ); + assert_eq!(result["reasoning"]["summary"], "auto"); + } + } + + #[test] + fn test_responses_request_without_effort_suffix_omits_reasoning() { + for model_name in ["gpt-5.4", "o3", "gpt-5-nano"] { + let model_config = ModelConfig { + model_name: model_name.to_string(), + context_limit: None, + temperature: None, + max_tokens: None, + toolshim: false, + toolshim_model: None, + fast_model_config: None, + request_params: None, + reasoning: None, + }; + + let result = + create_responses_request(&model_config, "You are helpful.", &[], &[]).unwrap(); + + assert_eq!(result["model"], model_name, "model should be unchanged"); + assert!( + result.get("reasoning").is_none(), + "reasoning should be omitted for {model_name} without explicit effort suffix" + ); + } + } } diff --git a/crates/goose/src/providers/openai.rs b/crates/goose/src/providers/openai.rs index 23736f2314..f3174ae1be 100644 --- a/crates/goose/src/providers/openai.rs +++ b/crates/goose/src/providers/openai.rs @@ -48,9 +48,21 @@ pub const OPEN_AI_KNOWN_MODELS: &[(&str, usize)] = &[ ("gpt-3.5-turbo", 16_385), ("gpt-4-turbo", 128_000), ("o4-mini", 128_000), + ("gpt-5", 400_000), + ("gpt-5-mini", 400_000), ("gpt-5-nano", 400_000), - ("gpt-5.1-codex", 400_000), + ("gpt-5-pro", 400_000), ("gpt-5-codex", 400_000), + ("gpt-5.1", 400_000), + ("gpt-5.1-codex", 400_000), + ("gpt-5.2", 400_000), + ("gpt-5.2-codex", 400_000), + ("gpt-5.2-pro", 400_000), + ("gpt-5.3-codex", 400_000), + ("gpt-5.4", 1_050_000), + ("gpt-5.4-mini", 400_000), + ("gpt-5.4-nano", 400_000), + ("gpt-5.4-pro", 1_050_000), ]; pub const OPEN_AI_DOC_URL: &str = "https://platform.openai.com/docs/models"; @@ -283,10 +295,7 @@ impl OpenAiProvider { } fn is_responses_model(model_name: &str) -> bool { - let normalized_model = model_name.to_ascii_lowercase(); - (normalized_model.starts_with("gpt-5") && normalized_model.contains("codex")) - || normalized_model.starts_with("gpt-5.2-pro") - || normalized_model.starts_with("gpt-5.4") + super::utils::is_openai_responses_model(model_name) } fn should_use_responses_api(model_name: &str, base_path: &str) -> bool { @@ -834,59 +843,20 @@ mod tests { } #[test] - fn gpt_5_2_codex_uses_responses_when_base_path_is_default() { - assert!(OpenAiProvider::should_use_responses_api( - "gpt-5.2-codex", - "v1/chat/completions" - )); - } - - #[test] - fn gpt_5_2_pro_uses_responses_when_base_path_is_default() { - assert!(OpenAiProvider::should_use_responses_api( - "gpt-5.2-pro", - "v1/chat/completions" - )); - } - - #[test] - fn gpt_5_2_pro_with_date_uses_responses() { - assert!(OpenAiProvider::should_use_responses_api( - "gpt-5.2-pro-2025-12-11", - "v1/chat/completions" - )); - } - - #[test] - fn explicit_chat_path_forces_chat_completions() { - assert!(!OpenAiProvider::should_use_responses_api( - "gpt-5.2-codex", - "openai/v1/chat/completions" - )); - } - - #[test] - fn gpt_5_4_uses_responses_when_base_path_is_default() { - assert!(OpenAiProvider::should_use_responses_api( - "gpt-5.4", - "v1/chat/completions" - )); - } - - #[test] - fn gpt_5_4_with_date_uses_responses() { - assert!(OpenAiProvider::should_use_responses_api( - "gpt-5.4-2026-03-01", - "v1/chat/completions" - )); - } - - #[test] - fn gpt_4o_does_not_use_responses() { - 
-        assert!(!OpenAiProvider::should_use_responses_api(
-            "gpt-4o",
-            "v1/chat/completions"
-        ));
+    fn responses_api_routing_uses_model_family_unless_path_forces_chat() {
+        for (model_name, base_path, expected) in [
+            ("gpt-5.4", "v1/chat/completions", true),
+            ("gpt-5.4-xhigh", "v1/chat/completions", true),
+            ("gpt-5.2-pro-2025-12-11", "v1/chat/completions", true),
+            ("gpt-4o", "v1/chat/completions", false),
+            ("gpt-5.2-codex", "openai/v1/chat/completions", false),
+        ] {
+            assert_eq!(
+                OpenAiProvider::should_use_responses_api(model_name, base_path),
+                expected,
+                "unexpected routing for {model_name} via {base_path}"
+            );
+        }
     }
 
     #[test]
diff --git a/crates/goose/src/providers/utils.rs b/crates/goose/src/providers/utils.rs
index 61a2f58fb1..3a99b88e60 100644
--- a/crates/goose/src/providers/utils.rs
+++ b/crates/goose/src/providers/utils.rs
@@ -193,26 +193,48 @@ pub async fn handle_response_google_compat(response: Response) -> Result<Value, ProviderError> {
-pub fn extract_reasoning_effort(model_name: &str) -> (String, Option<String>) {
-    let is_reasoning_model = model_name.starts_with("o1")
-        || model_name.starts_with("o2")
-        || model_name.starts_with("o3")
-        || model_name.starts_with("o4")
-        || model_name.starts_with("gpt-5");
+/// True when the model should use the OpenAI Responses API.
+///
+/// The Responses API is backwards-compatible with all OpenAI reasoning
+/// models, so every `o`-series (`o1`, `o3`, `o4`, …) and `gpt-5` variant
+/// routes here. The matcher intentionally scans the full model identifier so
+/// hosted aliases like `databricks-gpt-5.4`, `goose-o3-mini`, or
+/// `headless-goose-o3-mini` work without provider-specific normalization.
+pub fn is_openai_responses_model(model_name: &str) -> bool {
+    static RE: OnceLock<Regex> = OnceLock::new();
+    let re =
+        RE.get_or_init(|| Regex::new(r"(?i)(?:^|[-/])(?:o\d+(?:$|-)|gpt-5(?:$|[-.]))").unwrap());
+    re.is_match(model_name)
+}
 
-    if !is_reasoning_model {
+/// Extract an explicit reasoning-effort suffix from a model name.
+///
+/// Returns `(base_model_name, Some(effort))` when the user appended a
+/// recognised suffix like `-high` or `-xhigh`, e.g. `gpt-5.4-high` →
+/// `("gpt-5.4", Some("high"))`.
+///
+/// When no suffix is present the effort is `None` — callers should omit
+/// the `reasoning` field entirely so the API applies its own per-model
+/// default. This avoids hard-coding a default that may be invalid for
+/// certain models (e.g. `gpt-5-pro` only accepts `high`; older o-series
+/// models reject `none` and `xhigh`).
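+///
+/// Sketch of the mapping (the first two cases mirror the unit tests below;
+/// the last follows from the early return for non-responses models):
+///
+/// ```text
+/// "gpt-5.4-high" -> ("gpt-5.4", Some("high"))
+/// "gpt-5.4"      -> ("gpt-5.4", None)
+/// "gpt-4o-high"  -> ("gpt-4o-high", None)
+/// ```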
+pub fn extract_reasoning_effort(model_name: &str) -> (String, Option<String>) {
+    if !is_openai_responses_model(model_name) {
         return (model_name.to_string(), None);
     }
 
-    let parts: Vec<&str> = model_name.split('-').collect();
-    let last_part = parts.last().unwrap();
-    match *last_part {
-        "low" | "medium" | "high" => {
-            let base_name = parts[..parts.len() - 1].join("-");
-            (base_name, Some(last_part.to_string()))
-        }
-        _ => (model_name.to_string(), Some("medium".to_string())),
+    static RE: OnceLock<Regex> = OnceLock::new();
+    let re = RE.get_or_init(|| {
+        Regex::new(r"(?i)^(?P<base>.+)-(?P<effort>none|low|medium|high|xhigh)$").unwrap()
+    });
+
+    if let Some(captures) = re.captures(model_name) {
+        let base = captures["base"].to_string();
+        let effort = captures["effort"].to_ascii_lowercase();
+        return (base, Some(effort));
     }
+
+    (model_name.to_string(), None)
 }
 
 pub fn sanitize_function_name(name: &str) -> String {
@@ -870,4 +892,65 @@
             Some(Duration::from_secs(42))
         );
     }
+
+    #[test]
+    fn test_is_openai_responses_model_matches_o_and_gpt5_families() {
+        for model in [
+            "o3",
+            "o3-mini",
+            "o4-mini",
+            "gpt-5",
+            "gpt-5-pro",
+            "gpt-5.4",
+            "gpt-5.4-mini",
+            "gpt-5-4",
+            "gpt-5-2-pro",
+            "databricks-gpt-5.4",
+            "goose-gpt-5.4-high",
+            "headless-goose-o3-mini",
+        ] {
+            assert!(is_openai_responses_model(model), "{model} should match");
+        }
+    }
+
+    #[test]
+    fn test_is_openai_responses_model_rejects_other_families() {
+        for model in [
+            "gpt-4o",
+            "claude-sonnet-4",
+            "databricks-claude-sonnet-4",
+            "llama-3-70b",
+        ] {
+            assert!(
+                !is_openai_responses_model(model),
+                "{model} should not match"
+            );
+        }
+    }
+
+    #[test]
+    fn test_extract_reasoning_effort_for_responses_models() {
+        for (model, expected_name, expected_effort) in [
+            ("o3-none", "o3", Some("none")),
+            ("o3-xhigh", "o3", Some("xhigh")),
+            ("gpt-5-low", "gpt-5", Some("low")),
+            ("gpt-5.4", "gpt-5.4", None),
+            (
+                "databricks-gpt-5.4-high",
+                "databricks-gpt-5.4",
+                Some("high"),
+            ),
+            ("databricks-o3-low", "databricks-o3", Some("low")),
+            ("goose-gpt-5-high", "goose-gpt-5", Some("high")),
+            ("gpt-4o", "gpt-4o", None),
+        ] {
+            let (name, effort) = extract_reasoning_effort(model);
+            assert_eq!(name, expected_name, "unexpected base model for {model}");
+            assert_eq!(
+                effort.as_deref(),
+                expected_effort,
+                "unexpected effort for {model}"
+            );
+        }
+    }
 }
diff --git a/crates/goose/tests/acp_common_tests/mod.rs b/crates/goose/tests/acp_common_tests/mod.rs
index 16d9ca2a14..b2e6bcb15f 100644
--- a/crates/goose/tests/acp_common_tests/mod.rs
+++ b/crates/goose/tests/acp_common_tests/mod.rs
@@ -422,10 +422,12 @@ pub async fn run_load_mode() {
 }
 
 pub async fn run_load_model() {
+    // Use a Chat Completions model so the canned SSE fixtures parse correctly.
+    // TODO: add a Responses API mock to OpenAiFixture for responses-routed models.
     let expected_session_id = C::expected_session_id();
     let openai = OpenAiFixture::new(
         vec![(
-            r#""model":"o4-mini""#.into(),
+            r#""model":"gpt-4.1""#.into(),
             include_str!("../acp_test_data/openai_basic.txt"),
         )],
         expected_session_id.clone(),
     );
@@ -437,7 +439,7 @@
     expected_session_id.set(&session.session_id().0);
     let session_id = session.session_id().0.to_string();
 
-    conn.set_model(&session_id, "o4-mini").await.unwrap();
+    conn.set_model(&session_id, "gpt-4.1").await.unwrap();
 
     let output = session
         .prompt("what is 1+1", PermissionDecision::Cancel)
         .await;
     assert_eq!(output.text, "2");
 
     let SessionData { models, .. } =
        conn.load_session(&session_id, vec![]).await.unwrap();
-    assert_eq!(&*models.unwrap().current_model_id.0, "o4-mini");
+    assert_eq!(&*models.unwrap().current_model_id.0, "gpt-4.1");
 }
 
 pub async fn run_load_session_mcp() {
@@ -773,12 +775,14 @@ enum SetModelVia {
 }
 
 async fn run_model_set_impl(via: SetModelVia) {
+    // Use a Chat Completions model so the canned SSE fixtures parse correctly.
+    // TODO: add a Responses API mock to OpenAiFixture for responses-routed models.
     let expected_session_id = C::expected_session_id();
     let openai = OpenAiFixture::new(
         vec![
             // Session B prompt with switched model
             (
-                r#""model":"o4-mini""#.into(),
+                r#""model":"gpt-4.1""#.into(),
                 include_str!("../acp_test_data/openai_basic.txt"),
             ),
             // Session A prompt with default model
@@ -803,23 +807,23 @@
         ..
     } = conn.new_session().await.unwrap();
 
-    // Session B: switch to o4-mini
+    // Session B: switch to gpt-4.1
     let SessionData {
         session: mut session_b,
         ..
     } = conn.new_session().await.unwrap();
     let session_id = &session_b.session_id().0;
 
     match via {
-        SetModelVia::Dedicated => conn.set_model(session_id, "o4-mini").await.unwrap(),
+        SetModelVia::Dedicated => conn.set_model(session_id, "gpt-4.1").await.unwrap(),
         SetModelVia::ConfigOption => conn
-            .set_config_option(session_id, "model", "o4-mini")
+            .set_config_option(session_id, "model", "gpt-4.1")
             .await
             .unwrap(),
     }
     let set_model_notifs = session_b.notifications();
 
-    // Prompt B — expects o4-mini
+    // Prompt B — expects gpt-4.1
     expected_session_id.set(&session_b.session_id().0);
     let output = session_b
         .prompt("what is 1+1", PermissionDecision::Cancel)
@@ -1152,13 +1156,16 @@ pub async fn run_prompt_mcp() {
 }
 
 pub async fn run_prompt_model_mismatch() {
-    // Start the connection where the current model is not desired.
+    // Start the connection where the current model differs from TEST_MODEL.
+    // Use a Chat Completions model so the canned SSE fixtures parse correctly.
+    // TODO: add a Responses API mock to OpenAiFixture so we can test with
+    // responses-routed models like o4-mini here.
     let config = TestConnectionConfig {
-        current_model: "o4-mini".to_string(),
+        current_model: "gpt-4.1".to_string(),
         ..Default::default()
     };
 
-    // Server starts on o4-mini; client is configured with TEST_MODEL.
+    // Server starts on gpt-4.1; client is configured with TEST_MODEL.
     // If session_model is seeded from the response, stream() detects the
     // mismatch and sends set_model(TEST_MODEL) before prompting.
     let BasicSession { conn: _, .. } = new_basic_session::<C>(config).await;
diff --git a/crates/goose/tests/acp_custom_requests_test.rs b/crates/goose/tests/acp_custom_requests_test.rs
index b31509fc58..a2a5951647 100644
--- a/crates/goose/tests/acp_custom_requests_test.rs
+++ b/crates/goose/tests/acp_custom_requests_test.rs
@@ -250,6 +250,9 @@ fn test_developer_fs_requests_use_acp_session_id() {
     )
     .await;
     let config = TestConnectionConfig {
+        // gpt-5-nano routes to the Responses API; use a Chat Completions
+        // model so the canned SSE fixtures are parsed correctly.
+        current_model: "gpt-4.1".to_string(),
         read_text_file: Some(Arc::new(move |req| {
             *seen_session_id_clone.lock().unwrap() = Some(req.session_id.0.to_string());
             Ok(sacp::schema::ReadTextFileResponse::new(
diff --git a/crates/goose/tests/session_id_propagation_test.rs b/crates/goose/tests/session_id_propagation_test.rs
index fe902dc9d1..be26868475 100644
--- a/crates/goose/tests/session_id_propagation_test.rs
+++ b/crates/goose/tests/session_id_propagation_test.rs
@@ -48,12 +48,13 @@ fn create_test_provider(mock_server_url: &str) -> Box<dyn Provider> {
 async fn setup_mock_server() -> (MockServer, HeaderCapture, Box<dyn Provider>) {
     let mock_server = MockServer::start().await;
     let capture = HeaderCapture::new();
-    let capture_clone = capture.clone();
+    let chat_capture = capture.clone();
+    let responses_capture = capture.clone();
 
     Mock::given(method("POST"))
         .and(path("/v1/chat/completions"))
         .respond_with(move |req: &Request| {
-            capture_clone.capture_session_header(req);
+            chat_capture.capture_session_header(req);
             // Return SSE streaming format
             let sse_response = format!(
                 "data: {}\n\ndata: {}\n\ndata: [DONE]\n\n",
@@ -85,6 +86,57 @@ async fn setup_mock_server() -> (MockServer, HeaderCapture, Box<dyn Provider>) {
         .mount(&mock_server)
         .await;
 
+    Mock::given(method("POST"))
+        .and(path("/v1/responses"))
+        .respond_with(move |req: &Request| {
+            responses_capture.capture_session_header(req);
+            let sse_response = format!(
+                "data: {}\n\ndata: {}\n\ndata: {}\n\ndata: [DONE]\n\n",
+                json!({
+                    "type": "response.created",
+                    "sequence_number": 1,
+                    "response": {
+                        "id": "resp_test",
+                        "object": "response",
+                        "created_at": 1755133833,
+                        "status": "in_progress",
+                        "model": "gpt-5-nano",
+                        "output": []
+                    }
+                }),
+                json!({
+                    "type": "response.output_text.delta",
+                    "sequence_number": 2,
+                    "item_id": "msg_test",
+                    "output_index": 0,
+                    "content_index": 0,
+                    "delta": "Hi there! How can I help you today?"
+                }),
+                json!({
+                    "type": "response.completed",
+                    "sequence_number": 3,
+                    "response": {
+                        "id": "resp_test",
+                        "object": "response",
+                        "created_at": 1755133833,
+                        "status": "completed",
+                        "model": "gpt-5-nano",
+                        "output": [],
+                        "usage": {
+                            "input_tokens": 8,
+                            "output_tokens": 10,
+                            "total_tokens": 18
+                        }
+                    }
+                })
+            );
+            ResponseTemplate::new(200)
+                .set_body_string(sse_response)
+                .insert_header("content-type", "text/event-stream")
+        })
+        .mount(&mock_server)
+        .await;
+
     let provider = create_test_provider(&mock_server.uri());
     (mock_server, capture, provider)
 }
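+
+// Note: both mocks capture into the same shared HeaderCapture (the two
+// clones above), so the session-header assertions in this file hold whether
+// the provider routes a request to /v1/chat/completions or /v1/responses.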