common: Refactoring sampler parameters (#20429) (#22233)

This change moves the reasoning_budget_message parameter out of the
common params and into the sampling parameters. It also removes the
reasoning_budget common parameter and standardizes on the existing
reasoning_budget_tokens parameter in the sampling configuration.

Issue: https://github.com/ggml-org/llama.cpp/issues/20429
Original PR: https://github.com/ggml-org/llama.cpp/pull/20297
This commit is contained in:
Ethan Turner 2026-04-22 01:40:19 -07:00 committed by GitHub
parent 134d6e54d4
commit 750579ff14
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 7 additions and 8 deletions

View file

@ -3122,14 +3122,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)",
[](common_params & params, int value) {
if (value < -1) { throw std::invalid_argument("invalid value"); }
params.reasoning_budget = value;
params.sampling.reasoning_budget_tokens = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
add_opt(common_arg(
{"--reasoning-budget-message"}, "MESSAGE",
"message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)",
[](common_params & params, const std::string & value) {
params.reasoning_budget_message = value;
params.sampling.reasoning_budget_message = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
add_opt(common_arg(

View file

@ -274,6 +274,7 @@ struct common_params_sampling {
std::vector<llama_token> reasoning_budget_start; // start tag token sequence
std::vector<llama_token> reasoning_budget_end; // end tag token sequence
std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
bool backend_sampling = false;
@ -581,8 +582,6 @@ struct common_params {
bool force_pure_content_parser = false;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
int reasoning_budget = -1;
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time

View file

@ -77,8 +77,8 @@ struct cli_context {
// defaults.return_progress = true; // TODO: show progress
verbose_prompt = params.verbose_prompt;
reasoning_budget = params.reasoning_budget;
reasoning_budget_message = params.reasoning_budget_message;
reasoning_budget = params.sampling.reasoning_budget_tokens;
reasoning_budget_message = params.sampling.reasoning_budget_message;
}
std::string generate_completion(result_timings & out_timings) {

View file

@ -1045,8 +1045,8 @@ private:
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
/* enable_thinking */ enable_thinking,
/* reasoning_budget */ params_base.reasoning_budget,
/* reasoning_budget_msg */ params_base.reasoning_budget_message,
/* reasoning_budget */ params_base.sampling.reasoning_budget_tokens,
/* reasoning_budget_msg */ params_base.sampling.reasoning_budget_message,
/* media_path */ params_base.media_path,
/* force_pure_content */ params_base.force_pure_content_parser
};