Add speculative decoding support to the server and command-line interfaces

This commit is contained in:
DeEMO 2025-06-23 20:36:32 +08:00 committed by DeEMO
parent 1ea2d61a97
commit 2e8e42a5ad
11 changed files with 591 additions and 31 deletions

View file

@@ -1704,9 +1704,9 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
     {"-md", "--model-draft"}, "FNAME",
     "draft model for speculative decoding (default: unused)",
     [](gpt_params & params, const std::string & value) {
-        params.model_draft = value;
+        params.speculative.model = value;
     }
-).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
 add_opt(llama_arg(
     {"-mu", "--model-url"}, "MODEL_URL",
     "model download url (default: unused)",