Add speculative decoding support to the server and command-line interfaces

This commit is contained in:
DeEMO 2025-06-23 20:36:32 +08:00 committed by DeEMO
parent 1ea2d61a97
commit 2e8e42a5ad
11 changed files with 591 additions and 31 deletions

View file

@@ -1704,9 +1704,9 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
     {"-md", "--model-draft"}, "FNAME",
     "draft model for speculative decoding (default: unused)",
     [](gpt_params & params, const std::string & value) {
-        params.model_draft = value;
+        params.speculative.model = value;
     }
-).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
 add_opt(llama_arg(
     {"-mu", "--model-url"}, "MODEL_URL",
     "model download url (default: unused)",