diff --git a/.env.example b/.env.example index c01d959..e1900d9 100644 --- a/.env.example +++ b/.env.example @@ -17,13 +17,18 @@ LLAMACPP_BASE_URL="http://localhost:8080/v1" # All Claude model requests are mapped to these models, plain model is fallback # Format: provider_type/model/name # Valid providers: "nvidia_nim" | "open_router" | "lmstudio" | "llamacpp" -# model that happens to be loaded. MODEL_OPUS="nvidia_nim/z-ai/glm4.7" MODEL_SONNET="open_router/arcee-ai/trinity-large-preview:free" MODEL_HAIKU="open_router/stepfun/step-3.5-flash:free" MODEL="nvidia_nim/z-ai/glm4.7" +# NIM Settings +# Enable chat_template_kwargs + reasoning_budget for thinking models (kimi, nemotron). +# Leave false for models that don't support it (e.g. Mistral). +NIM_ENABLE_THINKING=false + + # Provider config PROVIDER_RATE_LIMIT=40 PROVIDER_RATE_WINDOW=60 diff --git a/README.md b/README.md index 334de6d..00dbb6a 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,9 @@ MODEL_OPUS="nvidia_nim/z-ai/glm4.7" MODEL_SONNET="nvidia_nim/moonshotai/kimi-k2-thinking" MODEL_HAIKU="nvidia_nim/stepfun-ai/step-3.5-flash" MODEL="nvidia_nim/z-ai/glm4.7" # fallback + +# Enable for thinking models (kimi, nemotron). Leave false for others (e.g. Mistral).
+NIM_ENABLE_THINKING=true ``` @@ -437,7 +440,8 @@ Configure via `WHISPER_DEVICE` (`cpu` | `cuda` | `nvidia_nim`) and `WHISPER_MODE | `MODEL_OPUS` | Model for Claude Opus requests (falls back to `MODEL`) | `nvidia_nim/z-ai/glm4.7` | | `MODEL_SONNET` | Model for Claude Sonnet requests (falls back to `MODEL`) | `open_router/arcee-ai/trinity-large-preview:free` | | `MODEL_HAIKU` | Model for Claude Haiku requests (falls back to `MODEL`) | `open_router/stepfun/step-3.5-flash:free` | -| `NVIDIA_NIM_API_KEY` | NVIDIA API key | required for NIM | +| `NVIDIA_NIM_API_KEY` | NVIDIA API key | required for NIM | +| `NIM_ENABLE_THINKING` | Send `chat_template_kwargs` + `reasoning_budget` on NIM requests. Enable for thinking models (kimi, nemotron); leave `false` for others (e.g. Mistral) | `false` | | `OPENROUTER_API_KEY` | OpenRouter API key | required for OpenRouter | | `LM_STUDIO_BASE_URL` | LM Studio server URL | `http://localhost:1234/v1` | | `LLAMACPP_BASE_URL` | llama.cpp server URL | `http://localhost:8080/v1` | diff --git a/config/nim.py b/config/nim.py index 8888a17..ea2f9f9 100644 --- a/config/nim.py +++ b/config/nim.py @@ -21,6 +21,7 @@ class NimSettings(BaseModel): parallel_tool_calls: bool = True ignore_eos: bool = False + enable_thinking: bool = False min_tokens: int = Field(0, ge=0) chat_template: str | None = None diff --git a/config/settings.py b/config/settings.py index b773ce9..b271f6a 100644 --- a/config/settings.py +++ b/config/settings.py @@ -90,6 +90,9 @@ class Settings(BaseSettings): # ==================== NIM Settings ==================== nim: NimSettings = Field(default_factory=NimSettings) + nim_enable_thinking: bool = Field( + default=False, validation_alias="NIM_ENABLE_THINKING" + ) # ==================== Voice Note Transcription ==================== voice_note_enabled: bool = Field( @@ -171,6 +174,13 @@ class Settings(BaseSettings): ) return v + @model_validator(mode="after") + def _inject_nim_thinking(self) -> Settings: + self.nim = 
self.nim.model_copy( + update={"enable_thinking": self.nim_enable_thinking} + ) + return self + @model_validator(mode="after") def check_nvidia_nim_api_key(self) -> Settings: if ( diff --git a/providers/nvidia_nim/request.py b/providers/nvidia_nim/request.py index 7459c9c..067dfd9 100644 --- a/providers/nvidia_nim/request.py +++ b/providers/nvidia_nim/request.py @@ -63,10 +63,11 @@ def build_request_body(request_data: Any, nim: NimSettings) -> dict: if request_extra: extra_body.update(request_extra) - extra_body.setdefault( - "chat_template_kwargs", {"thinking": True, "enable_thinking": True} - ) - _set_extra(extra_body, "reasoning_budget", max_tokens) + if nim.enable_thinking: + extra_body.setdefault( + "chat_template_kwargs", {"thinking": True, "enable_thinking": True} + ) + _set_extra(extra_body, "reasoning_budget", max_tokens) req_top_k = getattr(request_data, "top_k", None) top_k = req_top_k if req_top_k is not None else nim.top_k diff --git a/tests/providers/test_nvidia_nim_request.py b/tests/providers/test_nvidia_nim_request.py index caf71b9..8545423 100644 --- a/tests/providers/test_nvidia_nim_request.py +++ b/tests/providers/test_nvidia_nim_request.py @@ -98,7 +98,7 @@ class TestBuildRequestBody: req.extra_body = None req.top_k = None - nim = NimSettings() + nim = NimSettings(enable_thinking=True) body = build_request_body(req, nim) extra = body["extra_body"] assert extra["chat_template_kwargs"] == { @@ -107,6 +107,26 @@ class TestBuildRequestBody: } assert extra["reasoning_budget"] == body["max_tokens"] + def test_no_chat_template_kwargs_when_thinking_disabled(self): + req = MagicMock() + req.model = "test" + req.messages = [MagicMock(role="user", content="hi")] + req.max_tokens = 100 + req.system = None + req.temperature = None + req.top_p = None + req.stop_sequences = None + req.tools = None + req.tool_choice = None + req.extra_body = None + req.top_k = None + + nim = NimSettings(enable_thinking=False) + body = build_request_body(req, nim) + extra = 
body.get("extra_body", {}) + assert "chat_template_kwargs" not in extra + assert "reasoning_budget" not in extra + def test_no_reasoning_params_in_extra_body(self): req = MagicMock() req.model = "test"