fix: add strict:false to Responses API tools and gpt-5.4 to known models (#8636)

Signed-off-by: Bradley Axen <baxen@squareup.com>
2026-04-26 10:40:45 +00:00 · 2026-04-21 15:32:24 -07:00 · 2026-04-21 15:32:24 -07:00 · ab0c40d26a
commit ab0c40d26a
parent 7325fbdae3
11 changed files with 474 additions and 130 deletions
--- a/crates/goose-test-support/src/session.rs
+++ b/crates/goose-test-support/src/session.rs
@ -1,7 +1,12 @@
 use std::sync::{Arc, Mutex};

 pub const TEST_SESSION_ID: &str = "test-session-id";
-pub const TEST_MODEL: &str = "gpt-5-nano";
+// Use a Chat Completions model so the canned SSE fixtures (which return
+// Chat Completions format) are parsed correctly. gpt-5-nano now routes to
+// the Responses API which needs a different mock format.
+// TODO: add a Responses API mock to OpenAiFixture so tests can cover
+// responses-routed models like gpt-5-nano end-to-end.
+pub const TEST_MODEL: &str = "gpt-4.1";

 const NOT_YET_SET: &str = "session-id-not-yet-set";
 pub(crate) const SESSION_ID_HEADER: &str = "agent-session-id";
--- a/crates/goose/src/model.rs
+++ b/crates/goose/src/model.rs
@ -138,9 +138,22 @@ impl ModelConfig {
            }
        }

-        if let Some(canonical) =
+        // Try canonical lookup with the full model name first, then fall back
+        // to the name with reasoning-effort suffixes stripped (e.g.
+        // "databricks-gpt-5.4-high" → "databricks-gpt-5.4").
+        let canonical =
            crate::providers::canonical::maybe_get_canonical_model(provider_name, &self.model_name)
-        {
+                .or_else(|| {
+                    let (base, _effort) =
+                        crate::providers::utils::extract_reasoning_effort(&self.model_name);
+                    if base != self.model_name {
+                        crate::providers::canonical::maybe_get_canonical_model(provider_name, &base)
+                    } else {
+                        None
+                    }
+                });
+
+        if let Some(canonical) = canonical {
            if self.context_limit.is_none() {
                self.context_limit = Some(canonical.limit.context);
            }
@ -299,15 +312,7 @@ impl ModelConfig {
    }

    pub fn is_openai_reasoning_model(&self) -> bool {
-        const DATABRICKS_MODEL_NAME_PREFIXES: &[&str] = &["goose-", "databricks-"];
-        const REASONING_PREFIXES: &[&str] = &["o1", "o3", "o4", "gpt-5"];
-
-        let base = DATABRICKS_MODEL_NAME_PREFIXES
-            .iter()
-            .find_map(|p| self.model_name.strip_prefix(p))
-            .unwrap_or(&self.model_name);
-
-        REASONING_PREFIXES.iter().any(|p| base.starts_with(p))
+        crate::providers::utils::is_openai_responses_model(&self.model_name)
    }

    pub fn max_output_tokens(&self) -> i32 {
@ -499,6 +504,28 @@ mod tests {
            assert_eq!(config.max_tokens, None);
            assert_eq!(config.reasoning, None);
        }
+
+        #[test]
+        fn resolves_after_stripping_reasoning_effort_suffix() {
+            let _guard = env_lock::lock_env([
+                ("GOOSE_MAX_TOKENS", None::<&str>),
+                ("GOOSE_CONTEXT_LIMIT", None::<&str>),
+            ]);
+
+            // "databricks-gpt-5.4-high" should resolve via "databricks-gpt-5.4"
+            let config = ModelConfig::new_or_fail("databricks-gpt-5.4-high")
+                .with_canonical_limits("databricks");
+            assert_eq!(config.context_limit, Some(1_050_000));
+
+            // "gpt-5.4-xhigh" should resolve via "gpt-5.4"
+            let config = ModelConfig::new_or_fail("gpt-5.4-xhigh").with_canonical_limits("openai");
+            assert_eq!(config.context_limit, Some(1_050_000));
+
+            // "gpt-5.4-nano-low" should resolve via "gpt-5.4-nano"
+            let config =
+                ModelConfig::new_or_fail("gpt-5.4-nano-low").with_canonical_limits("openai");
+            assert_eq!(config.context_limit, Some(400_000));
+        }
    }

    mod is_openai_reasoning_model {
--- a/crates/goose/src/providers/databricks.rs
+++ b/crates/goose/src/providers/databricks.rs
@ -269,17 +269,19 @@ impl DatabricksProvider {
    }

    fn is_responses_model(model_name: &str) -> bool {
-        let normalized = model_name.to_ascii_lowercase();
-        normalized.contains("codex")
+        super::utils::is_openai_responses_model(model_name)
    }

    fn get_endpoint_path(&self, model_name: &str, is_embedding: bool) -> String {
        if is_embedding {
            "serving-endpoints/text-embedding-3-small/invocations".to_string()
-        } else if Self::is_responses_model(model_name) {
-            "serving-endpoints/responses".to_string()
        } else {
-            format!("serving-endpoints/{}/invocations", model_name)
+            let (clean_name, _) = super::utils::extract_reasoning_effort(model_name);
+            if Self::is_responses_model(&clean_name) {
+                "serving-endpoints/responses".to_string()
+            } else {
+                format!("serving-endpoints/{}/invocations", clean_name)
+            }
        }
    }

@ -594,3 +596,48 @@ impl EmbeddingCapable for DatabricksProvider {
        Ok(embeddings)
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn test_provider() -> DatabricksProvider {
+        DatabricksProvider {
+            api_client: super::super::api_client::ApiClient::new(
+                "https://example.com".to_string(),
+                super::super::api_client::AuthMethod::NoAuth,
+            )
+            .unwrap(),
+            auth: DatabricksAuth::Token("fake".into()),
+            model: ModelConfig::new_or_fail("databricks-gpt-5.4"),
+            image_format: ImageFormat::OpenAi,
+            retry_config: RetryConfig::default(),
+            fast_retry_config: RetryConfig::new(0, 0, 1.0, 0),
+            name: "databricks".into(),
+            token_cache: std::sync::Arc::new(std::sync::Mutex::new(None)),
+            instance_id: None,
+        }
+    }
+
+    #[test]
+    fn responses_models_route_to_responses_endpoint() {
+        let provider = test_provider();
+
+        for (model_name, expected_path) in [
+            ("gpt-5.4", "serving-endpoints/responses"),
+            ("databricks-gpt-5.4-high", "serving-endpoints/responses"),
+            ("databricks-gpt-5-4-xhigh", "serving-endpoints/responses"),
+            ("o3-mini", "serving-endpoints/responses"),
+            (
+                "databricks-claude-sonnet-4",
+                "serving-endpoints/databricks-claude-sonnet-4/invocations",
+            ),
+        ] {
+            assert_eq!(
+                provider.get_endpoint_path(model_name, false),
+                expected_path,
+                "unexpected endpoint for {model_name}"
+            );
+        }
+    }
+}
--- a/crates/goose/src/providers/formats/databricks.rs
+++ b/crates/goose/src/providers/formats/databricks.rs
@ -2,8 +2,9 @@ use crate::conversation::message::{Message, MessageContent};
 use crate::model::ModelConfig;
 use crate::providers::formats::anthropic::{thinking_effort, thinking_type, ThinkingType};
 use crate::providers::utils::{
-    convert_image, detect_image_path, is_valid_function_name, load_image_file, safely_parse_json,
-    sanitize_function_name, ImageFormat,
+    convert_image, detect_image_path, extract_reasoning_effort, is_openai_responses_model,
+    is_valid_function_name, load_image_file, safely_parse_json, sanitize_function_name,
+    ImageFormat,
 };
 use anyhow::{anyhow, Error};
 use rmcp::model::{
@ -581,24 +582,8 @@ pub fn create_request(
        ));
    }

-    let is_openai_reasoning_model = model_config.is_openai_reasoning_model();
-    let (model_name, reasoning_effort) = if is_openai_reasoning_model {
-        let parts: Vec<&str> = model_config.model_name.split('-').collect();
-        let last_part = parts.last().unwrap();
-
-        match *last_part {
-            "low" | "medium" | "high" => {
-                let base_name = parts[..parts.len() - 1].join("-");
-                (base_name, Some(last_part.to_string()))
-            }
-            _ => (
-                model_config.model_name.to_string(),
-                Some("medium".to_string()),
-            ),
-        }
-    } else {
-        (model_config.model_name.to_string(), None)
-    };
+    let (model_name, reasoning_effort) = extract_reasoning_effort(&model_config.model_name);
+    let is_openai_reasoning_model = is_openai_responses_model(&model_name);

    let system_message = DatabricksMessage {
        role: "system".to_string(),
@ -1073,6 +1058,63 @@ mod tests {
        Ok(())
    }

+    #[test]
+    fn test_create_request_reasoning_effort_xhigh() -> anyhow::Result<()> {
+        let model_config = ModelConfig {
+            model_name: "o3-xhigh".to_string(),
+            context_limit: Some(4096),
+            temperature: None,
+            max_tokens: Some(1024),
+            toolshim: false,
+            toolshim_model: None,
+            fast_model_config: None,
+            request_params: None,
+            reasoning: None,
+        };
+        let request = create_request(&model_config, "system", &[], &[], &ImageFormat::OpenAi)?;
+        assert_eq!(request["model"], "o3");
+        assert_eq!(request["reasoning_effort"], "xhigh");
+        Ok(())
+    }
+
+    #[test]
+    fn test_create_request_reasoning_effort_none() -> anyhow::Result<()> {
+        let model_config = ModelConfig {
+            model_name: "o3-none".to_string(),
+            context_limit: Some(4096),
+            temperature: None,
+            max_tokens: Some(1024),
+            toolshim: false,
+            toolshim_model: None,
+            fast_model_config: None,
+            request_params: None,
+            reasoning: None,
+        };
+        let request = create_request(&model_config, "system", &[], &[], &ImageFormat::OpenAi)?;
+        assert_eq!(request["model"], "o3");
+        assert_eq!(request["reasoning_effort"], "none");
+        Ok(())
+    }
+
+    #[test]
+    fn test_create_request_reasoning_effort_for_prefixed_gpt5_model() -> anyhow::Result<()> {
+        let model_config = ModelConfig {
+            model_name: "databricks-gpt-5.4-high".to_string(),
+            context_limit: Some(4096),
+            temperature: None,
+            max_tokens: Some(1024),
+            toolshim: false,
+            toolshim_model: None,
+            fast_model_config: None,
+            request_params: None,
+            reasoning: None,
+        };
+        let request = create_request(&model_config, "system", &[], &[], &ImageFormat::OpenAi)?;
+        assert_eq!(request["model"], "databricks-gpt-5.4");
+        assert_eq!(request["reasoning_effort"], "high");
+        Ok(())
+    }
+
    #[test]
    fn test_create_request_adaptive_thinking_for_46_models() -> anyhow::Result<()> {
        let _guard = env_lock::lock_env([
--- a/crates/goose/src/providers/formats/openai.rs
+++ b/crates/goose/src/providers/formats/openai.rs
@ -4,8 +4,9 @@ use crate::model::ModelConfig;
 use crate::providers::base::{ProviderUsage, Usage};
 use crate::providers::errors::ProviderError;
 use crate::providers::utils::{
-    convert_image, detect_image_path, extract_reasoning_effort, is_valid_function_name,
-    load_image_file, safely_parse_json, sanitize_function_name, ImageFormat,
+    convert_image, detect_image_path, extract_reasoning_effort, is_openai_responses_model,
+    is_valid_function_name, load_image_file, safely_parse_json, sanitize_function_name,
+    ImageFormat,
 };
 use anyhow::{anyhow, Error};
 use async_stream::try_stream;
@ -984,7 +985,7 @@ pub fn create_request(
    }

    let (model_name, reasoning_effort) = extract_reasoning_effort(&model_config.model_name);
-    let is_reasoning_model = reasoning_effort.is_some();
+    let is_reasoning_model = is_openai_responses_model(&model_name);

    let system_message = json!({
        "role": if is_reasoning_model { "developer" } else { "system" },
@ -1716,7 +1717,8 @@ mod tests {

    #[test]
    fn test_create_request_o1_default() -> anyhow::Result<()> {
-        // Test default medium reasoning effort for O1 model
+        // Without an explicit effort suffix the API picks its own default;
+        // we should omit reasoning_effort entirely but still use "developer" role.
        let model_config = ModelConfig {
            model_name: "o1".to_string(),
            context_limit: Some(4096),
@ -1745,13 +1747,16 @@ mod tests {
                    "content": "system"
                }
            ],
-            "reasoning_effort": "medium",
            "max_completion_tokens": 1024
        });

        for (key, value) in expected.as_object().unwrap() {
            assert_eq!(obj.get(key).unwrap(), value);
        }
+        assert!(
+            obj.get("reasoning_effort").is_none(),
+            "reasoning_effort should be omitted when no explicit suffix is provided"
+        );

        Ok(())
    }
--- a/crates/goose/src/providers/formats/openai_responses.rs
+++ b/crates/goose/src/providers/formats/openai_responses.rs
@ -2,7 +2,7 @@ use crate::conversation::message::{Message, MessageContent};
 use crate::mcp_utils::extract_text_from_resource;
 use crate::model::ModelConfig;
 use crate::providers::base::{ProviderUsage, Usage};
-use crate::providers::utils::extract_reasoning_effort;
+use crate::providers::utils::{extract_reasoning_effort, is_openai_responses_model};
 use anyhow::{anyhow, Error};
 use async_stream::try_stream;
 use chrono;
@ -468,7 +468,10 @@ pub fn create_responses_request(
    add_message_items(&mut input_items, messages);

    let (model_name, reasoning_effort) = extract_reasoning_effort(&model_config.model_name);
-    let is_reasoning_model = reasoning_effort.is_some();
+    // All models routed here are responses-capable; temperature is rejected
+    // by the API for reasoning models regardless of whether an explicit
+    // effort suffix was provided.
+    let is_reasoning_model = is_openai_responses_model(&model_name);

    let mut payload = json!({
        "model": model_name,
@ -495,6 +498,7 @@ pub fn create_responses_request(
                    "name": tool.name,
                    "description": tool.description,
                    "parameters": tool.input_schema,
+                    "strict": false,
                })
            })
            .collect();
@ -1083,4 +1087,103 @@ mod tests {
        assert_eq!(info.effort.as_deref(), Some("high"));
        assert_eq!(info.summary.as_deref(), Some("Thought deeply"));
    }
+
+    #[test]
+    fn test_responses_tools_include_strict_false() {
+        let model_config = ModelConfig {
+            model_name: "gpt-5.4".to_string(),
+            context_limit: None,
+            temperature: None,
+            max_tokens: None,
+            toolshim: false,
+            toolshim_model: None,
+            fast_model_config: None,
+            request_params: None,
+            reasoning: None,
+        };
+
+        let tool = Tool::new(
+            "shell",
+            "Execute a shell command",
+            object!({
+                "type": "object",
+                "properties": {
+                    "command": {
+                        "type": "string",
+                        "description": "The command to run"
+                    }
+                },
+                "required": ["command"]
+            }),
+        );
+
+        let result =
+            create_responses_request(&model_config, "You are helpful.", &[], &[tool]).unwrap();
+        let tools = result["tools"]
+            .as_array()
+            .expect("tools should be an array");
+        assert_eq!(tools.len(), 1);
+        assert_eq!(tools[0]["strict"], json!(false),
+            "Responses API defaults strict to true, but MCP tool schemas are not strict-compatible; must explicitly set strict: false");
+    }
+
+    #[test]
+    fn test_responses_request_with_explicit_effort_suffix() {
+        for (model_name, expected_model, expected_effort) in [
+            ("gpt-5.4-xhigh", "gpt-5.4", "xhigh"),
+            ("databricks-gpt-5.4-high", "databricks-gpt-5.4", "high"),
+            ("databricks-o3-none", "databricks-o3", "none"),
+        ] {
+            let model_config = ModelConfig {
+                model_name: model_name.to_string(),
+                context_limit: None,
+                temperature: None,
+                max_tokens: None,
+                toolshim: false,
+                toolshim_model: None,
+                fast_model_config: None,
+                request_params: None,
+                reasoning: None,
+            };
+
+            let result =
+                create_responses_request(&model_config, "You are helpful.", &[], &[]).unwrap();
+
+            assert_eq!(
+                result["model"], expected_model,
+                "unexpected model for {model_name}"
+            );
+            assert_eq!(
+                result["reasoning"]["effort"], expected_effort,
+                "unexpected effort for {model_name}"
+            );
+            assert_eq!(result["reasoning"]["summary"], "auto");
+        }
+    }
+
+    #[test]
+    fn test_responses_request_without_effort_suffix_omits_reasoning() {
+        for model_name in ["gpt-5.4", "o3", "gpt-5-nano"] {
+            let model_config = ModelConfig {
+                model_name: model_name.to_string(),
+                context_limit: None,
+                temperature: None,
+                max_tokens: None,
+                toolshim: false,
+                toolshim_model: None,
+                fast_model_config: None,
+                request_params: None,
+                reasoning: None,
+            };
+
+            let result =
+                create_responses_request(&model_config, "You are helpful.", &[], &[]).unwrap();
+
+            assert_eq!(result["model"], model_name, "model should be unchanged");
+            assert!(
+                result.get("reasoning").is_none(),
+                "reasoning should be omitted for {model_name} without explicit effort suffix"
+            );
+        }
+    }
 }
--- a/crates/goose/src/providers/openai.rs
+++ b/crates/goose/src/providers/openai.rs
@ -48,9 +48,21 @@ pub const OPEN_AI_KNOWN_MODELS: &[(&str, usize)] = &[
    ("gpt-3.5-turbo", 16_385),
    ("gpt-4-turbo", 128_000),
    ("o4-mini", 128_000),
+    ("gpt-5", 400_000),
+    ("gpt-5-mini", 400_000),
    ("gpt-5-nano", 400_000),
-    ("gpt-5.1-codex", 400_000),
+    ("gpt-5-pro", 400_000),
    ("gpt-5-codex", 400_000),
+    ("gpt-5.1", 400_000),
+    ("gpt-5.1-codex", 400_000),
+    ("gpt-5.2", 400_000),
+    ("gpt-5.2-codex", 400_000),
+    ("gpt-5.2-pro", 400_000),
+    ("gpt-5.3-codex", 400_000),
+    ("gpt-5.4", 1_050_000),
+    ("gpt-5.4-mini", 400_000),
+    ("gpt-5.4-nano", 400_000),
+    ("gpt-5.4-pro", 1_050_000),
 ];

 pub const OPEN_AI_DOC_URL: &str = "https://platform.openai.com/docs/models";
@ -283,10 +295,7 @@ impl OpenAiProvider {
    }

    fn is_responses_model(model_name: &str) -> bool {
-        let normalized_model = model_name.to_ascii_lowercase();
-        (normalized_model.starts_with("gpt-5") && normalized_model.contains("codex"))
-            || normalized_model.starts_with("gpt-5.2-pro")
-            || normalized_model.starts_with("gpt-5.4")
+        super::utils::is_openai_responses_model(model_name)
    }

    fn should_use_responses_api(model_name: &str, base_path: &str) -> bool {
@ -834,59 +843,20 @@ mod tests {
    }

    #[test]
-    fn gpt_5_2_codex_uses_responses_when_base_path_is_default() {
-        assert!(OpenAiProvider::should_use_responses_api(
-            "gpt-5.2-codex",
-            "v1/chat/completions"
-        ));
-    }
-
-    #[test]
-    fn gpt_5_2_pro_uses_responses_when_base_path_is_default() {
-        assert!(OpenAiProvider::should_use_responses_api(
-            "gpt-5.2-pro",
-            "v1/chat/completions"
-        ));
-    }
-
-    #[test]
-    fn gpt_5_2_pro_with_date_uses_responses() {
-        assert!(OpenAiProvider::should_use_responses_api(
-            "gpt-5.2-pro-2025-12-11",
-            "v1/chat/completions"
-        ));
-    }
-
-    #[test]
-    fn explicit_chat_path_forces_chat_completions() {
-        assert!(!OpenAiProvider::should_use_responses_api(
-            "gpt-5.2-codex",
-            "openai/v1/chat/completions"
-        ));
-    }
-
-    #[test]
-    fn gpt_5_4_uses_responses_when_base_path_is_default() {
-        assert!(OpenAiProvider::should_use_responses_api(
-            "gpt-5.4",
-            "v1/chat/completions"
-        ));
-    }
-
-    #[test]
-    fn gpt_5_4_with_date_uses_responses() {
-        assert!(OpenAiProvider::should_use_responses_api(
-            "gpt-5.4-2026-03-01",
-            "v1/chat/completions"
-        ));
-    }
-
-    #[test]
-    fn gpt_4o_does_not_use_responses() {
-        assert!(!OpenAiProvider::should_use_responses_api(
-            "gpt-4o",
-            "v1/chat/completions"
-        ));
+    fn responses_api_routing_uses_model_family_unless_path_forces_chat() {
+        for (model_name, base_path, expected) in [
+            ("gpt-5.4", "v1/chat/completions", true),
+            ("gpt-5.4-xhigh", "v1/chat/completions", true),
+            ("gpt-5.2-pro-2025-12-11", "v1/chat/completions", true),
+            ("gpt-4o", "v1/chat/completions", false),
+            ("gpt-5.2-codex", "openai/v1/chat/completions", false),
+        ] {
+            assert_eq!(
+                OpenAiProvider::should_use_responses_api(model_name, base_path),
+                expected,
+                "unexpected routing for {model_name} via {base_path}"
+            );
+        }
    }

    #[test]
--- a/crates/goose/src/providers/utils.rs
+++ b/crates/goose/src/providers/utils.rs
@ -193,26 +193,48 @@ pub async fn handle_response_google_compat(response: Response) -> Result<Value,
    }
 }

-pub fn extract_reasoning_effort(model_name: &str) -> (String, Option<String>) {
-    let is_reasoning_model = model_name.starts_with("o1")
-        || model_name.starts_with("o2")
-        || model_name.starts_with("o3")
-        || model_name.starts_with("o4")
-        || model_name.starts_with("gpt-5");
+/// True when the model should use the OpenAI Responses API.
+///
+/// The Responses API is backwards-compatible with all OpenAI reasoning
+/// models, so every `o`-series (`o1`, `o3`, `o4`, …) and `gpt-5` variant
+/// routes here. The matcher intentionally scans the full model identifier so
+/// hosted aliases like `databricks-gpt-5.4`, `goose-o3-mini`, or
+/// `headless-goose-o3-mini` work without provider-specific normalization.
+pub fn is_openai_responses_model(model_name: &str) -> bool {
+    static RE: OnceLock<Regex> = OnceLock::new();
+    let re =
+        RE.get_or_init(|| Regex::new(r"(?i)(?:^|[-/])(?:o\d+(?:$|-)|gpt-5(?:$|[-.]))").unwrap());
+    re.is_match(model_name)
+}

-    if !is_reasoning_model {
+/// Extract an explicit reasoning-effort suffix from a model name.
+///
+/// Returns `(base_model_name, Some(effort))` when the user appended a
+/// recognised suffix like `-high` or `-xhigh`, e.g. `gpt-5.4-high` →
+/// `("gpt-5.4", Some("high"))`.
+///
+/// When no suffix is present the effort is `None` — callers should omit
+/// the `reasoning` field entirely so the API applies its own per-model
+/// default. This avoids hard-coding a default that may be invalid for
+/// certain models (e.g. `gpt-5-pro` only accepts `high`; older o-series
+/// models reject `none` and `xhigh`).
+pub fn extract_reasoning_effort(model_name: &str) -> (String, Option<String>) {
+    if !is_openai_responses_model(model_name) {
        return (model_name.to_string(), None);
    }

-    let parts: Vec<&str> = model_name.split('-').collect();
-    let last_part = parts.last().unwrap();
-    match *last_part {
-        "low" | "medium" | "high" => {
-            let base_name = parts[..parts.len() - 1].join("-");
-            (base_name, Some(last_part.to_string()))
-        }
-        _ => (model_name.to_string(), Some("medium".to_string())),
+    static RE: OnceLock<Regex> = OnceLock::new();
+    let re = RE.get_or_init(|| {
+        Regex::new(r"(?i)^(?P<base>.+)-(?P<effort>none|low|medium|high|xhigh)$").unwrap()
+    });
+
+    if let Some(captures) = re.captures(model_name) {
+        let base = captures["base"].to_string();
+        let effort = captures["effort"].to_ascii_lowercase();
+        return (base, Some(effort));
    }
+
+    (model_name.to_string(), None)
 }

 pub fn sanitize_function_name(name: &str) -> String {
@ -870,4 +892,65 @@ mod tests {
            Some(Duration::from_secs(42))
        );
    }
+
+    #[test]
+    fn test_is_openai_responses_model_matches_o_and_gpt5_families() {
+        for model in [
+            "o3",
+            "o3-mini",
+            "o4-mini",
+            "gpt-5",
+            "gpt-5-pro",
+            "gpt-5.4",
+            "gpt-5.4-mini",
+            "gpt-5-4",
+            "gpt-5-2-pro",
+            "databricks-gpt-5.4",
+            "goose-gpt-5.4-high",
+            "headless-goose-o3-mini",
+        ] {
+            assert!(is_openai_responses_model(model), "{model} should match");
+        }
+    }
+
+    #[test]
+    fn test_is_openai_responses_model_rejects_other_families() {
+        for model in [
+            "gpt-4o",
+            "claude-sonnet-4",
+            "databricks-claude-sonnet-4",
+            "llama-3-70b",
+        ] {
+            assert!(
+                !is_openai_responses_model(model),
+                "{model} should not match"
+            );
+        }
+    }
+
+    #[test]
+    fn test_extract_reasoning_effort_for_responses_models() {
+        for (model, expected_name, expected_effort) in [
+            ("o3-none", "o3", Some("none")),
+            ("o3-xhigh", "o3", Some("xhigh")),
+            ("gpt-5-low", "gpt-5", Some("low")),
+            ("gpt-5.4", "gpt-5.4", None),
+            (
+                "databricks-gpt-5.4-high",
+                "databricks-gpt-5.4",
+                Some("high"),
+            ),
+            ("databricks-o3-low", "databricks-o3", Some("low")),
+            ("goose-gpt-5-high", "goose-gpt-5", Some("high")),
+            ("gpt-4o", "gpt-4o", None),
+        ] {
+            let (name, effort) = extract_reasoning_effort(model);
+            assert_eq!(name, expected_name, "unexpected base model for {model}");
+            assert_eq!(
+                effort.as_deref(),
+                expected_effort,
+                "unexpected effort for {model}"
+            );
+        }
+    }
 }
--- a/crates/goose/tests/acp_common_tests/mod.rs
+++ b/crates/goose/tests/acp_common_tests/mod.rs
@ -422,10 +422,12 @@ pub async fn run_load_mode<C: Connection>() {
 }

 pub async fn run_load_model<C: Connection>() {
+    // Use a Chat Completions model so the canned SSE fixtures parse correctly.
+    // TODO: add a Responses API mock to OpenAiFixture for responses-routed models.
    let expected_session_id = C::expected_session_id();
    let openai = OpenAiFixture::new(
        vec![(
-            r#""model":"o4-mini""#.into(),
+            r#""model":"gpt-4.1""#.into(),
            include_str!("../acp_test_data/openai_basic.txt"),
        )],
        expected_session_id.clone(),
@ -437,7 +439,7 @@ pub async fn run_load_model<C: Connection>() {
    expected_session_id.set(&session.session_id().0);

    let session_id = session.session_id().0.to_string();
-    conn.set_model(&session_id, "o4-mini").await.unwrap();
+    conn.set_model(&session_id, "gpt-4.1").await.unwrap();

    let output = session
        .prompt("what is 1+1", PermissionDecision::Cancel)
@ -446,7 +448,7 @@ pub async fn run_load_model<C: Connection>() {
    assert_eq!(output.text, "2");

    let SessionData { models, .. } = conn.load_session(&session_id, vec![]).await.unwrap();
-    assert_eq!(&*models.unwrap().current_model_id.0, "o4-mini");
+    assert_eq!(&*models.unwrap().current_model_id.0, "gpt-4.1");
 }

 pub async fn run_load_session_mcp<C: Connection>() {
@ -773,12 +775,14 @@ enum SetModelVia {
 }

 async fn run_model_set_impl<C: Connection>(via: SetModelVia) {
+    // Use a Chat Completions model so the canned SSE fixtures parse correctly.
+    // TODO: add a Responses API mock to OpenAiFixture for responses-routed models.
    let expected_session_id = C::expected_session_id();
    let openai = OpenAiFixture::new(
        vec![
            // Session B prompt with switched model
            (
-                r#""model":"o4-mini""#.into(),
+                r#""model":"gpt-4.1""#.into(),
                include_str!("../acp_test_data/openai_basic.txt"),
            ),
            // Session A prompt with default model
@ -803,23 +807,23 @@ async fn run_model_set_impl<C: Connection>(via: SetModelVia) {
        ..
    } = conn.new_session().await.unwrap();

-    // Session B: switch to o4-mini
+    // Session B: switch to gpt-4.1
    let SessionData {
        session: mut session_b,
        ..
    } = conn.new_session().await.unwrap();
    let session_id = &session_b.session_id().0;
    match via {
-        SetModelVia::Dedicated => conn.set_model(session_id, "o4-mini").await.unwrap(),
+        SetModelVia::Dedicated => conn.set_model(session_id, "gpt-4.1").await.unwrap(),
        SetModelVia::ConfigOption => conn
-            .set_config_option(session_id, "model", "o4-mini")
+            .set_config_option(session_id, "model", "gpt-4.1")
            .await
            .unwrap(),
    }

    let set_model_notifs = session_b.notifications();

-    // Prompt B — expects o4-mini
+    // Prompt B — expects gpt-4.1
    expected_session_id.set(&session_b.session_id().0);
    let output = session_b
        .prompt("what is 1+1", PermissionDecision::Cancel)
@ -1152,13 +1156,16 @@ pub async fn run_prompt_mcp<C: Connection>() {
 }

 pub async fn run_prompt_model_mismatch<C: Connection>() {
-    // Start the connection where the current model is not desired.
+    // Start the connection where the current model differs from TEST_MODEL.
+    // Use a Chat Completions model so the canned SSE fixtures parse correctly.
+    // TODO: add a Responses API mock to OpenAiFixture so we can test with
+    // responses-routed models like o4-mini here.
    let config = TestConnectionConfig {
-        current_model: "o4-mini".to_string(),
+        current_model: "gpt-4.1".to_string(),
        ..Default::default()
    };

-    // Server starts on o4-mini; client is configured with TEST_MODEL.
+    // Server starts on gpt-4.1; client is configured with TEST_MODEL.
    // If session_model is seeded from the response, stream() detects the
    // mismatch and sends set_model(TEST_MODEL) before prompting.
    let BasicSession { conn: _, .. } = new_basic_session::<C>(config).await;
--- a/crates/goose/tests/acp_custom_requests_test.rs
+++ b/crates/goose/tests/acp_custom_requests_test.rs
@ -250,6 +250,9 @@ fn test_developer_fs_requests_use_acp_session_id() {
        )
        .await;
        let config = TestConnectionConfig {
+            // gpt-5-nano routes to the Responses API; use a Chat Completions
+            // model so the canned SSE fixtures are parsed correctly.
+            current_model: "gpt-4.1".to_string(),
            read_text_file: Some(Arc::new(move |req| {
                *seen_session_id_clone.lock().unwrap() = Some(req.session_id.0.to_string());
                Ok(sacp::schema::ReadTextFileResponse::new(
--- a/crates/goose/tests/session_id_propagation_test.rs
+++ b/crates/goose/tests/session_id_propagation_test.rs
@ -48,12 +48,13 @@ fn create_test_provider(mock_server_url: &str) -> Box<dyn Provider> {
 async fn setup_mock_server() -> (MockServer, HeaderCapture, Box<dyn Provider>) {
    let mock_server = MockServer::start().await;
    let capture = HeaderCapture::new();
-    let capture_clone = capture.clone();
+    let chat_capture = capture.clone();
+    let responses_capture = capture.clone();

    Mock::given(method("POST"))
        .and(path("/v1/chat/completions"))
        .respond_with(move |req: &Request| {
-            capture_clone.capture_session_header(req);
+            chat_capture.capture_session_header(req);
            // Return SSE streaming format
            let sse_response = format!(
                "data: {}\n\ndata: {}\n\ndata: [DONE]\n\n",
@ -85,6 +86,57 @@ async fn setup_mock_server() -> (MockServer, HeaderCapture, Box<dyn Provider>) {
        .mount(&mock_server)
        .await;

+    Mock::given(method("POST"))
+        .and(path("/v1/responses"))
+        .respond_with(move |req: &Request| {
+            responses_capture.capture_session_header(req);
+            let sse_response = format!(
+                "data: {}\n\ndata: {}\n\ndata: {}\n\ndata: [DONE]\n\n",
+                json!({
+                    "type": "response.created",
+                    "sequence_number": 1,
+                    "response": {
+                        "id": "resp_test",
+                        "object": "response",
+                        "created_at": 1755133833,
+                        "status": "in_progress",
+                        "model": "gpt-5-nano",
+                        "output": []
+                    }
+                }),
+                json!({
+                    "type": "response.output_text.delta",
+                    "sequence_number": 2,
+                    "item_id": "msg_test",
+                    "output_index": 0,
+                    "content_index": 0,
+                    "delta": "Hi there! How can I help you today?"
+                }),
+                json!({
+                    "type": "response.completed",
+                    "sequence_number": 3,
+                    "response": {
+                        "id": "resp_test",
+                        "object": "response",
+                        "created_at": 1755133833,
+                        "status": "completed",
+                        "model": "gpt-5-nano",
+                        "output": [],
+                        "usage": {
+                            "input_tokens": 8,
+                            "output_tokens": 10,
+                            "total_tokens": 18
+                        }
+                    }
+                })
+            );
+            ResponseTemplate::new(200)
+                .set_body_string(sse_response)
+                .insert_header("content-type", "text/event-stream")
+        })
+        .mount(&mock_server)
+        .await;
+
    let provider = create_test_provider(&mock_server.uri());
    (mock_server, capture, provider)
 }