Add per-model thinking toggles

Alishahryar1 2026-04-25 20:51:07 -07:00
parent 180c942af7
commit f29e693dc5
21 changed files with 220 additions and 54 deletions

View file

@@ -28,9 +28,12 @@ MODEL="nvidia_nim/z-ai/glm4.7"
 # Thinking output
-# Global switch for provider reasoning requests and Claude thinking blocks.
-# Set false to suppress thinking across NIM, OpenRouter, LM Studio, and llama.cpp.
-ENABLE_THINKING=true
+# Per-Claude-model switches for provider reasoning requests and Claude thinking blocks.
+# Blank per-model switches inherit ENABLE_MODEL_THINKING.
+ENABLE_OPUS_THINKING=
+ENABLE_SONNET_THINKING=
+ENABLE_HAIKU_THINKING=
+ENABLE_MODEL_THINKING=true
 # Provider config

View file

@@ -102,8 +102,12 @@ MODEL_SONNET=
 MODEL_HAIKU=
 MODEL="nvidia_nim/z-ai/glm4.7" # fallback
-# Global switch for provider reasoning requests and Claude thinking blocks.
-ENABLE_THINKING=true
+# Per-Claude-model switches for provider reasoning requests and Claude thinking blocks.
+# Blank per-model switches inherit ENABLE_MODEL_THINKING.
+ENABLE_OPUS_THINKING=
+ENABLE_SONNET_THINKING=
+ENABLE_HAIKU_THINKING=
+ENABLE_MODEL_THINKING=true
 ```
 </details>
@@ -179,7 +183,7 @@ MODEL="nvidia_nim/z-ai/glm4.7" # fallback
 </details>
-> Migration: `NIM_ENABLE_THINKING` was removed in this release. Rename it to `ENABLE_THINKING`.
+> Migration: `NIM_ENABLE_THINKING` and `ENABLE_THINKING` were removed in this release. Use `ENABLE_MODEL_THINKING` as the fallback switch, with optional `ENABLE_OPUS_THINKING`, `ENABLE_SONNET_THINKING`, and `ENABLE_HAIKU_THINKING` overrides.
 <details>
 <summary><b>Optional Authentication</b> (restrict access to your proxy)</summary>
@@ -342,7 +346,7 @@ free-claude-code # starts the server
 - **Per-model routing**: Opus / Sonnet / Haiku requests resolve to their model-specific backend, with `MODEL` as fallback
 - **Request optimization**: 5 categories of trivial requests (quota probes, title generation, prefix detection, suggestions, filepath extraction) are intercepted and responded to locally without using API quota
 - **Format handling**: OpenRouter, LM Studio, and llama.cpp use native Anthropic Messages endpoints; NIM and DeepSeek use shared OpenAI chat translation
-- **Thinking tokens**: `<think>` tags and `reasoning_content` fields are converted into native Claude thinking blocks when `ENABLE_THINKING=true`
+- **Thinking tokens**: `<think>` tags and `reasoning_content` fields are converted into native Claude thinking blocks when the resolved model's thinking switch is enabled
 The proxy also exposes Claude-compatible probe routes: `GET /v1/models`, `POST /v1/messages`, `POST /v1/messages/count_tokens`, plus `HEAD`/`OPTIONS` support for the common probe endpoints.
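For reference, those probe routes can be exercised with any HTTP client. A minimal sketch using `httpx`; the base URL is an assumption here, so substitute the host and port your proxy actually listens on:

```python
# Probe the proxy's Claude-compatible routes.
# BASE_URL is illustrative -- point it at your running proxy.
import httpx

BASE_URL = "http://localhost:8000"  # hypothetical address

# List the models the proxy advertises.
resp = httpx.get(f"{BASE_URL}/v1/models")
print(resp.status_code, resp.json())

# count_tokens accepts the same Anthropic-style payload as /v1/messages.
resp = httpx.post(
    f"{BASE_URL}/v1/messages/count_tokens",
    json={
        "model": "claude-sonnet-4-20250514",
        "messages": [{"role": "user", "content": "hello"}],
    },
)
print(resp.status_code, resp.json())
```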
@@ -528,7 +532,10 @@ Configure via `WHISPER_DEVICE` (`cpu` | `cuda` | `nvidia_nim`) and `WHISPER_MODE
 | `MODEL_SONNET` | Model for Claude Sonnet requests; empty falls back to `MODEL` | empty |
 | `MODEL_HAIKU` | Model for Claude Haiku requests; empty falls back to `MODEL` | empty |
 | `NVIDIA_NIM_API_KEY` | NVIDIA API key | required for NIM |
-| `ENABLE_THINKING` | Global switch for provider reasoning requests and Claude thinking blocks. Set `false` to hide thinking across all providers. | `true` |
+| `ENABLE_MODEL_THINKING` | Fallback switch for provider reasoning requests and Claude thinking blocks. Set `false` to hide thinking unless a model tier overrides it. | `true` |
+| `ENABLE_OPUS_THINKING` | Optional thinking switch for Claude Opus requests; empty inherits `ENABLE_MODEL_THINKING`. | empty |
+| `ENABLE_SONNET_THINKING` | Optional thinking switch for Claude Sonnet requests; empty inherits `ENABLE_MODEL_THINKING`. | empty |
+| `ENABLE_HAIKU_THINKING` | Optional thinking switch for Claude Haiku requests; empty inherits `ENABLE_MODEL_THINKING`. | empty |
 | `OPENROUTER_API_KEY` | OpenRouter API key | required for OpenRouter |
 | `DEEPSEEK_API_KEY` | DeepSeek API key | required for DeepSeek |
 | `LM_STUDIO_BASE_URL` | LM Studio server URL | `http://localhost:1234/v1` |
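Reading the migration note together with this table, a plausible post-migration `.env` for "thinking off by default, on for Opus" looks like this (values illustrative):

```
# Removed in this release:
# ENABLE_THINKING=false

# New scheme: fallback switch plus optional per-tier overrides.
ENABLE_MODEL_THINKING=false   # hide thinking by default
ENABLE_OPUS_THINKING=true     # ...but keep it for Opus requests
ENABLE_SONNET_THINKING=       # blank inherits ENABLE_MODEL_THINKING (false)
ENABLE_HAIKU_THINKING=        # blank inherits ENABLE_MODEL_THINKING (false)
```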

View file

@@ -17,6 +17,7 @@ class ResolvedModel:
     provider_id: str
     provider_model: str
     provider_model_ref: str
+    thinking_enabled: bool


 @dataclass(frozen=True, slots=True)
@@ -39,6 +40,7 @@ class ModelRouter:
     def resolve(self, claude_model_name: str) -> ResolvedModel:
         provider_model_ref = self._settings.resolve_model(claude_model_name)
+        thinking_enabled = self._settings.resolve_thinking(claude_model_name)
         provider_id = Settings.parse_provider_type(provider_model_ref)
         provider_model = Settings.parse_model_name(provider_model_ref)
         if provider_model != claude_model_name:
@@ -50,6 +52,7 @@
             provider_id=provider_id,
             provider_model=provider_model,
             provider_model_ref=provider_model_ref,
+            thinking_enabled=thinking_enabled,
         )

     def resolve_messages_request(

View file

@@ -95,6 +95,7 @@ class ClaudeProxyService:
                 routed.request,
                 input_tokens=input_tokens,
                 request_id=request_id,
+                thinking_enabled=routed.resolved.thinking_enabled,
             ),
             media_type="text/event-stream",
             headers={

View file

@@ -28,9 +28,12 @@ MODEL="nvidia_nim/z-ai/glm4.7"
 # Thinking output
-# Global switch for provider reasoning requests and Claude thinking blocks.
-# Set false to suppress thinking across NIM, OpenRouter, LM Studio, and llama.cpp.
-ENABLE_THINKING=true
+# Per-Claude-model switches for provider reasoning requests and Claude thinking blocks.
+# Blank per-model switches inherit ENABLE_MODEL_THINKING.
+ENABLE_OPUS_THINKING=
+ENABLE_SONNET_THINKING=
+ENABLE_HAIKU_THINKING=
+ENABLE_MODEL_THINKING=true
 # Provider config

View file

@@ -68,22 +68,26 @@ def _env_file_override(model_config: Mapping[str, Any], key: str) -> str | None:
 def _removed_env_var_message(model_config: Mapping[str, Any]) -> str | None:
     """Return a migration error for removed env vars, if present."""
-    removed_key = "NIM_ENABLE_THINKING"
-    replacement = "ENABLE_THINKING"
+    removed_keys = ("NIM_ENABLE_THINKING", "ENABLE_THINKING")
+    replacement = (
+        "ENABLE_MODEL_THINKING, ENABLE_OPUS_THINKING, "
+        "ENABLE_SONNET_THINKING, or ENABLE_HAIKU_THINKING"
+    )
-    if removed_key in os.environ:
-        return (
-            f"{removed_key} has been removed in this release. "
-            f"Rename it to {replacement}."
-        )
-    for env_file in _configured_env_files(model_config):
-        if _env_file_contains_key(env_file, removed_key):
+    for removed_key in removed_keys:
+        if removed_key in os.environ:
             return (
                 f"{removed_key} has been removed in this release. "
-                f"Rename it to {replacement}. Found in {env_file}."
+                f"Rename it to {replacement}."
             )
+        for env_file in _configured_env_files(model_config):
+            if _env_file_contains_key(env_file, removed_key):
+                return (
+                    f"{removed_key} has been removed in this release. "
+                    f"Rename it to {replacement}. Found in {env_file}."
+                )
     return None
@@ -142,7 +146,18 @@ class Settings(BaseSettings):
     provider_max_concurrency: int = Field(
         default=5, validation_alias="PROVIDER_MAX_CONCURRENCY"
     )
-    enable_thinking: bool = Field(default=True, validation_alias="ENABLE_THINKING")
+    enable_model_thinking: bool = Field(
+        default=True, validation_alias="ENABLE_MODEL_THINKING"
+    )
+    enable_opus_thinking: bool | None = Field(
+        default=None, validation_alias="ENABLE_OPUS_THINKING"
+    )
+    enable_sonnet_thinking: bool | None = Field(
+        default=None, validation_alias="ENABLE_SONNET_THINKING"
+    )
+    enable_haiku_thinking: bool | None = Field(
+        default=None, validation_alias="ENABLE_HAIKU_THINKING"
+    )

     # ==================== HTTP Client Timeouts ====================
     http_read_timeout: float = Field(
@@ -222,6 +237,9 @@ class Settings(BaseSettings):
         "model_opus",
         "model_sonnet",
         "model_haiku",
+        "enable_opus_thinking",
+        "enable_sonnet_thinking",
+        "enable_haiku_thinking",
         mode="before",
     )
     @classmethod
@@ -317,6 +335,17 @@ class Settings(BaseSettings):
             return self.model_sonnet
         return self.model

+    def resolve_thinking(self, claude_model_name: str) -> bool:
+        """Resolve whether thinking is enabled for an incoming Claude model name."""
+        name_lower = claude_model_name.lower()
+        if "opus" in name_lower and self.enable_opus_thinking is not None:
+            return self.enable_opus_thinking
+        if "haiku" in name_lower and self.enable_haiku_thinking is not None:
+            return self.enable_haiku_thinking
+        if "sonnet" in name_lower and self.enable_sonnet_thinking is not None:
+            return self.enable_sonnet_thinking
+        return self.enable_model_thinking
+
     @staticmethod
     def parse_provider_type(model_string: str) -> str:
         """Extract provider type from any 'provider/model' string."""

View file

@@ -59,9 +59,11 @@ class AnthropicMessagesTransport(BaseProvider):
         """Return headers for the native messages request."""
         return {"Content-Type": "application/json"}

-    def _build_request_body(self, request: Any) -> dict:
+    def _build_request_body(
+        self, request: Any, thinking_enabled: bool | None = None
+    ) -> dict:
         """Build a native Anthropic request body."""
-        thinking_enabled = self._is_thinking_enabled(request)
+        thinking_enabled = self._is_thinking_enabled(request, thinking_enabled)
         body = request.model_dump(exclude_none=True)
         body.pop("extra_body", None)
@@ -218,12 +220,13 @@
         input_tokens: int = 0,
         *,
         request_id: str | None = None,
+        thinking_enabled: bool | None = None,
     ) -> AsyncIterator[str]:
         """Stream response via a native Anthropic-compatible messages endpoint."""
         tag = self._provider_name
         req_tag = f" request_id={request_id}" if request_id else ""
-        thinking_enabled = self._is_thinking_enabled(request)
-        body = self._build_request_body(request)
+        body = self._build_request_body(request, thinking_enabled=thinking_enabled)
+        thinking_enabled = self._is_thinking_enabled(request, thinking_enabled)
         logger.info(
             "{}_STREAM:{} natively passing Anthropic request model={} msgs={} tools={}",

View file

@@ -32,9 +32,16 @@ class BaseProvider(ABC):
     def __init__(self, config: ProviderConfig):
         self._config = config

-    def _is_thinking_enabled(self, request: Any) -> bool:
+    def _is_thinking_enabled(
+        self, request: Any, thinking_enabled: bool | None = None
+    ) -> bool:
         """Return whether thinking should be enabled for this request."""
         thinking = getattr(request, "thinking", None)
+        config_enabled = (
+            self._config.enable_thinking
+            if thinking_enabled is None
+            else thinking_enabled
+        )
         request_enabled = True
         if thinking is not None:
             thinking_type = (
@@ -52,7 +59,7 @@
         )
         if enabled is not None:
             request_enabled = bool(enabled)
-        return self._config.enable_thinking and request_enabled
+        return config_enabled and request_enabled

     @abstractmethod
     async def cleanup(self) -> None:
@@ -65,6 +72,7 @@
         input_tokens: int = 0,
         *,
         request_id: str | None = None,
+        thinking_enabled: bool | None = None,
     ) -> AsyncIterator[str]:
         """Stream response in Anthropic SSE format."""
         if False:
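`_is_thinking_enabled` now combines two gates: the request's own Anthropic `thinking` field and the resolved per-model switch, which falls back to the provider-config value when the router passes `None`. A minimal sketch of the combined decision, with hypothetical names and a simplified request field:

```python
# Sketch of the two-gate decision: the per-model (or config) switch must be
# on AND the request must not explicitly disable thinking. The
# request_thinking_type values mirror Anthropic's thinking param types.
def thinking_allowed(
    request_thinking_type: str | None,  # "enabled", "disabled", or absent
    resolved_switch: bool | None,       # per-model value; None = use config
    config_default: bool,
) -> bool:
    config_enabled = config_default if resolved_switch is None else resolved_switch
    request_enabled = request_thinking_type != "disabled"
    return config_enabled and request_enabled

# An explicit request-level "disabled" wins even if the tier switch is on.
assert thinking_allowed("disabled", True, True) is False
# A tier override of True beats a False config default.
assert thinking_allowed(None, True, False) is True
```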

View file

@@ -20,9 +20,11 @@ class DeepSeekProvider(OpenAIChatTransport):
             api_key=config.api_key,
         )

-    def _build_request_body(self, request: Any) -> dict:
+    def _build_request_body(
+        self, request: Any, thinking_enabled: bool | None = None
+    ) -> dict:
         """Internal helper for tests and shared building."""
         return build_request_body(
             request,
-            thinking_enabled=self._is_thinking_enabled(request),
+            thinking_enabled=self._is_thinking_enabled(request, thinking_enabled),
         )

View file

@@ -30,12 +30,14 @@ class NvidiaNimProvider(OpenAIChatTransport):
         )
         self._nim_settings = nim_settings

-    def _build_request_body(self, request: Any) -> dict:
+    def _build_request_body(
+        self, request: Any, thinking_enabled: bool | None = None
+    ) -> dict:
         """Internal helper for tests and shared building."""
         return build_request_body(
             request,
             self._nim_settings,
-            thinking_enabled=self._is_thinking_enabled(request),
+            thinking_enabled=self._is_thinking_enabled(request, thinking_enabled),
         )

     def _get_retry_request_body(self, error: Exception, body: dict) -> dict | None:

View file

@@ -42,11 +42,13 @@ class OpenRouterProvider(AnthropicMessagesTransport):
             default_base_url=OPENROUTER_BASE_URL,
         )

-    def _build_request_body(self, request: Any) -> dict:
+    def _build_request_body(
+        self, request: Any, thinking_enabled: bool | None = None
+    ) -> dict:
         """Internal helper for tests and direct request dispatch."""
         return build_request_body(
             request,
-            thinking_enabled=self._is_thinking_enabled(request),
+            thinking_enabled=self._is_thinking_enabled(request, thinking_enabled),
         )

     def _request_headers(self) -> dict[str, str]:

View file

@@ -81,7 +81,9 @@ class OpenAIChatTransport(BaseProvider):
             await client.aclose()

     @abstractmethod
-    def _build_request_body(self, request: Any) -> dict:
+    def _build_request_body(
+        self, request: Any, thinking_enabled: bool | None = None
+    ) -> dict:
         """Build request body. Must be implemented by subclasses."""

     def _handle_extra_reasoning(
@@ -159,11 +161,12 @@ class OpenAIChatTransport(BaseProvider):
         input_tokens: int = 0,
         *,
         request_id: str | None = None,
+        thinking_enabled: bool | None = None,
     ) -> AsyncIterator[str]:
         """Stream response in Anthropic SSE format."""
         with logger.contextualize(request_id=request_id):
             async for event in self._stream_response_impl(
-                request, input_tokens, request_id
+                request, input_tokens, request_id, thinking_enabled=thinking_enabled
             ):
                 yield event
@@ -172,13 +175,16 @@
         request: Any,
         input_tokens: int,
         request_id: str | None,
+        *,
+        thinking_enabled: bool | None,
     ) -> AsyncIterator[str]:
         """Shared streaming implementation."""
         tag = self._provider_name
         message_id = f"msg_{uuid.uuid4()}"
         sse = SSEBuilder(message_id, request.model, input_tokens)
-        body = self._build_request_body(request)
+        body = self._build_request_body(request, thinking_enabled=thinking_enabled)
+        thinking_enabled = self._is_thinking_enabled(request, thinking_enabled)
         req_tag = f" request_id={request_id}" if request_id else ""
         logger.info(
             "{}_STREAM:{} model={} msgs={} tools={}",
@@ -193,8 +199,6 @@
         think_parser = ThinkTagParser()
         heuristic_parser = HeuristicToolParser()

-        thinking_enabled = self._is_thinking_enabled(request)
-
         finish_reason = None
         usage_info = None
         error_occurred = False

View file

@@ -183,7 +183,7 @@ def build_provider_config(
         http_read_timeout=settings.http_read_timeout,
         http_write_timeout=settings.http_write_timeout,
         http_connect_timeout=settings.http_connect_timeout,
-        enable_thinking=settings.enable_thinking,
+        enable_thinking=settings.enable_model_thinking,
         proxy=proxy,
     )

View file

@@ -170,6 +170,7 @@ CAPABILITY_CONTRACTS: tuple[CapabilityContract, ...] = (
             "tests/contracts/test_stream_contracts.py",
             "tests/providers/test_open_router.py",
         ),
+        ("test_per_model_thinking_config_e2e",),
     ),
     CapabilityContract(
CapabilityContract(
"streaming_conversion",
@@ -232,7 +233,7 @@ CAPABILITY_CONTRACTS: tuple[CapabilityContract, ...] = (
         "removed_env_migration",
         "removed_env_migration",
         "config.settings.Settings",
-        "NIM_ENABLE_THINKING in env or dotenv",
+        "NIM_ENABLE_THINKING or ENABLE_THINKING in env or dotenv",
         "startup validation error with rename guidance",
         "application fails fast",
         ("tests/config/test_config.py",),

View file

@@ -124,8 +124,9 @@ FEATURE_INVENTORY: tuple[FeatureCoverage, ...] = (
         (
             "test_provider_adaptive_thinking_history_e2e",
             "test_claude_cli_adaptive_thinking_e2e",
+            "test_per_model_thinking_config_e2e",
         ),
-        ("providers", "cli"),
+        ("providers", "cli", "config"),
         ("configured provider",),
         "configured providers must not reject adaptive thinking payloads",
     ),

View file

@@ -64,6 +64,39 @@ def test_removed_env_migration_e2e(smoke_config: SmokeConfig, tmp_path) -> None:
     assert "NIM_ENABLE_THINKING has been removed" in (result.stderr + result.stdout)


+@pytest.mark.smoke_target("config")
+def test_per_model_thinking_config_e2e(smoke_config: SmokeConfig, tmp_path) -> None:
+    env_file = tmp_path / "thinking.env"
+    env_file.write_text(
+        'ENABLE_MODEL_THINKING="false"\n'
+        'ENABLE_OPUS_THINKING="true"\n'
+        "ENABLE_SONNET_THINKING=\n"
+        'ENABLE_HAIKU_THINKING="false"\n',
+        encoding="utf-8",
+    )
+    env = os.environ.copy()
+    env["FCC_ENV_FILE"] = str(env_file)
+    script = (
+        "from config.settings import Settings; "
+        "s=Settings(); "
+        "print(s.resolve_thinking('claude-opus-4-20250514')); "
+        "print(s.resolve_thinking('claude-sonnet-4-20250514')); "
+        "print(s.resolve_thinking('claude-haiku-4-20250514')); "
+        "print(s.resolve_thinking('unknown-model'))"
+    )
+    result = subprocess.run(
+        cmd_python_c(script),
+        cwd=smoke_config.root,
+        env=env,
+        capture_output=True,
+        text=True,
+        timeout=smoke_config.timeout_s,
+        check=False,
+    )
+    assert result.returncode == 0, result.stderr
+    assert result.stdout.splitlines() == ["True", "False", "False", "False"]
+
+
 @pytest.mark.smoke_target("config")
 def test_proxy_timeout_config_e2e(smoke_config: SmokeConfig, tmp_path) -> None:
     env_file = tmp_path / "timeouts.env"

View file

@@ -100,7 +100,9 @@ def test_model_mapping(client: TestClient):
     client.post("/v1/messages", json=payload_haiku)
     assert len(_stream_response_calls) == 1
     args = _stream_response_calls[0][0]
+    kwargs = _stream_response_calls[0][1]
     assert args[0].model != "claude-3-haiku-20240307"
+    assert kwargs["thinking_enabled"] is True


 def test_error_fallbacks(client: TestClient):

View file

@@ -39,7 +39,7 @@ def _make_mock_settings(**overrides):
     mock.http_read_timeout = 300.0
     mock.http_write_timeout = 10.0
     mock.http_connect_timeout = 2.0
-    mock.enable_thinking = True
+    mock.enable_model_thinking = True
     for key, value in overrides.items():
         setattr(mock, key, value)
     return mock
@@ -159,12 +159,12 @@ async def test_get_provider_deepseek_uses_fixed_base_url():
 @pytest.mark.asyncio
-async def test_get_provider_deepseek_passes_enable_thinking():
-    """DeepSeek provider receives the global thinking toggle."""
+async def test_get_provider_deepseek_passes_enable_model_thinking():
+    """DeepSeek provider receives the fallback thinking toggle."""
     with patch("api.dependencies.get_settings") as mock_settings:
         mock_settings.return_value = _make_mock_settings(
             provider_type="deepseek",
-            enable_thinking=False,
+            enable_model_thinking=False,
         )
         provider = get_provider()

View file

@@ -14,6 +14,10 @@ def settings():
     settings.model_opus = None
     settings.model_sonnet = None
     settings.model_haiku = None
+    settings.enable_model_thinking = True
+    settings.enable_opus_thinking = None
+    settings.enable_sonnet_thinking = None
+    settings.enable_haiku_thinking = None
     return settings
@@ -24,6 +28,7 @@ def test_model_router_resolves_default_model(settings):
     assert resolved.provider_id == "nvidia_nim"
     assert resolved.provider_model == "fallback-model"
     assert resolved.provider_model_ref == "nvidia_nim/fallback-model"
+    assert resolved.thinking_enabled is True


 def test_model_router_applies_opus_override(settings):
@@ -39,9 +44,23 @@ def test_model_router_applies_opus_override(settings):
     assert routed.request.model == "deepseek/deepseek-r1"
     assert routed.resolved.provider_model_ref == "open_router/deepseek/deepseek-r1"
     assert routed.resolved.original_model == "claude-opus-4-20250514"
+    assert routed.resolved.thinking_enabled is True
     assert request.model == "claude-opus-4-20250514"


+def test_model_router_resolves_per_model_thinking(settings):
+    settings.enable_model_thinking = False
+    settings.enable_opus_thinking = True
+    settings.enable_haiku_thinking = False
+    router = ModelRouter(settings)
+
+    assert router.resolve("claude-opus-4-20250514").thinking_enabled is True
+    assert router.resolve("claude-sonnet-4-20250514").thinking_enabled is False
+    assert router.resolve("claude-3-haiku-20240307").thinking_enabled is False
+    assert router.resolve("claude-2.1").thinking_enabled is False
+
+
 def test_model_router_applies_haiku_override(settings):
     settings.model_haiku = "lmstudio/qwen2.5-7b"

View file

@@ -29,7 +29,7 @@ class TestSettings:
         assert isinstance(settings.provider_rate_window, int)
         assert isinstance(settings.nim.temperature, float)
         assert isinstance(settings.fast_prefix_detection, bool)
-        assert isinstance(settings.enable_thinking, bool)
+        assert isinstance(settings.enable_model_thinking, bool)
         assert settings.http_read_timeout == 120.0

     def test_get_settings_cached(self):
@@ -110,13 +110,48 @@ class TestSettings:
         settings = Settings()
         assert settings.http_connect_timeout == 5.0

-    def test_enable_thinking_from_env(self, monkeypatch):
-        """ENABLE_THINKING env var is loaded into settings."""
+    def test_enable_model_thinking_from_env(self, monkeypatch):
+        """ENABLE_MODEL_THINKING env var is loaded into settings."""
         from config.settings import Settings

-        monkeypatch.setenv("ENABLE_THINKING", "false")
+        monkeypatch.setenv("ENABLE_MODEL_THINKING", "false")
         settings = Settings()
-        assert settings.enable_thinking is False
+        assert settings.enable_model_thinking is False
+
+    def test_per_model_thinking_from_env(self, monkeypatch):
+        """Per-model thinking env vars are loaded into settings."""
+        from config.settings import Settings
+
+        monkeypatch.setenv("ENABLE_OPUS_THINKING", "true")
+        monkeypatch.setenv("ENABLE_SONNET_THINKING", "false")
+        monkeypatch.setenv("ENABLE_HAIKU_THINKING", "false")
+        settings = Settings()
+        assert settings.enable_opus_thinking is True
+        assert settings.enable_sonnet_thinking is False
+        assert settings.enable_haiku_thinking is False
+
+    def test_empty_per_model_thinking_inherits_model_default(self, monkeypatch):
+        """Blank per-model thinking env vars are treated as unset."""
+        from config.settings import Settings
+
+        monkeypatch.setenv("ENABLE_MODEL_THINKING", "false")
+        monkeypatch.setenv("ENABLE_OPUS_THINKING", "")
+        settings = Settings()
+        assert settings.enable_opus_thinking is None
+        assert settings.resolve_thinking("claude-opus-4-20250514") is False
+
+    def test_resolve_thinking_uses_model_tiers(self, monkeypatch):
+        """resolve_thinking applies tier override then fallback."""
+        from config.settings import Settings
+
+        monkeypatch.setenv("ENABLE_MODEL_THINKING", "false")
+        monkeypatch.setenv("ENABLE_OPUS_THINKING", "true")
+        monkeypatch.setenv("ENABLE_HAIKU_THINKING", "false")
+        settings = Settings()
+        assert settings.resolve_thinking("claude-opus-4-20250514") is True
+        assert settings.resolve_thinking("claude-sonnet-4-20250514") is False
+        assert settings.resolve_thinking("claude-haiku-4-20250514") is False
+        assert settings.resolve_thinking("unknown-model") is False

     def test_anthropic_auth_token_from_env_without_dotenv_key(self, monkeypatch):
         """ANTHROPIC_AUTH_TOKEN env var is loaded when dotenv does not define it."""
@@ -166,7 +201,15 @@ class TestSettings:
         from config.settings import Settings

         monkeypatch.setenv("NIM_ENABLE_THINKING", "false")
-        with pytest.raises(ValidationError, match="Rename it to ENABLE_THINKING"):
+        with pytest.raises(ValidationError, match="ENABLE_MODEL_THINKING"):
             Settings()
+
+    def test_removed_enable_thinking_raises(self, monkeypatch):
+        """ENABLE_THINKING now fails fast with a migration message."""
+        from config.settings import Settings
+
+        monkeypatch.setenv("ENABLE_THINKING", "false")
+        with pytest.raises(ValidationError, match="ENABLE_MODEL_THINKING"):
+            Settings()

View file

@@ -38,7 +38,7 @@ def _make_settings(**overrides):
     mock.http_read_timeout = 300.0
     mock.http_write_timeout = 10.0
     mock.http_connect_timeout = 2.0
-    mock.enable_thinking = True
+    mock.enable_model_thinking = True
     mock.nim = NimSettings()
     for key, value in overrides.items():
         setattr(mock, key, value)