common: Refactoring sampler parameters (#20429) (#22233)

This change moves the reasoning_budget_message parameter out of the
common params and into the sampling parameters. It also removes the
reasoning_budget common parameter and standardizes on the existing
reasoning_budget_tokens parameter in the sampling configuration.

Issue: https://github.com/ggml-org/llama.cpp/issues/20429
Original PR: https://github.com/ggml-org/llama.cpp/pull/20297
This commit is contained in:
Ethan Turner 2026-04-22 01:40:19 -07:00 committed by GitHub
parent 134d6e54d4
commit 750579ff14
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 7 additions and 8 deletions

View file

@ -3122,14 +3122,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"token budget for thinking: -1 for unrestricted, 0 for immediate end, N>0 for token budget (default: -1)",
[](common_params & params, int value) {
if (value < -1) { throw std::invalid_argument("invalid value"); }
params.reasoning_budget = value;
params.sampling.reasoning_budget_tokens = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
add_opt(common_arg(
{"--reasoning-budget-message"}, "MESSAGE",
"message injected before the end-of-thinking tag when reasoning budget is exhausted (default: none)",
[](common_params & params, const std::string & value) {
params.reasoning_budget_message = value;
params.sampling.reasoning_budget_message = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET_MESSAGE"));
add_opt(common_arg(

View file

@ -274,6 +274,7 @@ struct common_params_sampling {
std::vector<llama_token> reasoning_budget_start; // start tag token sequence
std::vector<llama_token> reasoning_budget_end; // end tag token sequence
std::vector<llama_token> reasoning_budget_forced; // forced sequence (message + end tag)
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
bool backend_sampling = false;
@ -581,8 +582,6 @@ struct common_params {
bool force_pure_content_parser = false;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
int reasoning_budget = -1;
std::string reasoning_budget_message; // message injected before end tag when budget exhausted
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time

View file

@ -77,8 +77,8 @@ struct cli_context {
// defaults.return_progress = true; // TODO: show progress
verbose_prompt = params.verbose_prompt;
reasoning_budget = params.reasoning_budget;
reasoning_budget_message = params.reasoning_budget_message;
reasoning_budget = params.sampling.reasoning_budget_tokens;
reasoning_budget_message = params.sampling.reasoning_budget_message;
}
std::string generate_completion(result_timings & out_timings) {

View file

@ -1045,8 +1045,8 @@ private:
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
/* enable_thinking */ enable_thinking,
/* reasoning_budget */ params_base.reasoning_budget,
/* reasoning_budget_msg */ params_base.reasoning_budget_message,
/* reasoning_budget */ params_base.sampling.reasoning_budget_tokens,
/* reasoning_budget_msg */ params_base.sampling.reasoning_budget_message,
/* media_path */ params_base.media_path,
/* force_pure_content */ params_base.force_pure_content_parser
};