From 2e8e42a5ad2949fa8b7f4171fddb0d88a684750b Mon Sep 17 00:00:00 2001 From: DeEMO Date: Mon, 23 Jun 2025 20:36:32 +0800 Subject: [PATCH 1/8] Add speculative decoding support to the server and command-line interfaces --- Makefile | 8 + common/arg.cpp | 4 +- common/common.cpp | 2 +- common/common.h | 18 +- common/sampling.cpp | 39 ++++ common/sampling.h | 21 +++ common/speculative.cpp | 271 +++++++++++++++++++++++++++ common/speculative.h | 28 +++ examples/server/server.cpp | 221 +++++++++++++++++++--- examples/speculative/speculative.cpp | 4 +- src/llama.cpp | 6 + 11 files changed, 591 insertions(+), 31 deletions(-) create mode 100644 common/speculative.cpp create mode 100644 common/speculative.h diff --git a/Makefile b/Makefile index 39ae0b9f..b1bbc0ed 100644 --- a/Makefile +++ b/Makefile @@ -963,6 +963,7 @@ OBJ_COMMON = \ common/console.o \ common/ngram-cache.o \ common/sampling.o \ + common/speculative.o \ common/train.o \ common/build-info.o \ common/json-schema-to-grammar.o @@ -1239,6 +1240,13 @@ common/json-schema-to-grammar.o: \ common/json-schema-to-grammar.h $(CXX) $(CXXFLAGS) -c $< -o $@ +# speculative +common/speculative.o: \ + common/speculative.cpp \ + common/speculative.h \ + include/llama.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + common/train.o: \ common/train.cpp \ common/train.h diff --git a/common/arg.cpp b/common/arg.cpp index c971bb49..813f87e8 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1704,9 +1704,9 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", [](gpt_params & params, const std::string & value) { - params.model_draft = value; + params.speculative.model = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"-mu", "--model-url"}, "MODEL_URL", "model download url (default: unused)", diff --git a/common/common.cpp 
b/common/common.cpp index 92c95b3b..b6182b22 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -3111,7 +3111,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false"); fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH); - fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str()); + fprintf(stream, "model_draft: %s # default:\n", params.speculative.model.c_str()); fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false"); fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers); fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict); diff --git a/common/common.h b/common/common.h index 54eb7e8b..044dfdf5 100644 --- a/common/common.h +++ b/common/common.h @@ -33,6 +33,8 @@ struct llama_lora_adapter_container : llama_lora_adapter_info { struct llama_lora_adapter * adapter; }; +using llama_tokens = std::vector<llama_token>; + // build info extern int LLAMA_BUILD_NUMBER; extern char const * LLAMA_COMMIT; @@ -141,6 +143,20 @@ struct gpt_sampler_params { std::string print() const; }; +struct common_params_speculative { + int32_t n_ctx = 0; // draft context size + int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding + int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding + int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default) + float p_split = 0.1f; // speculative decoding split probability + float p_min = 0.9f; // minimum speculative decoding probability (greedy) + + struct cpu_params cpuparams; + struct cpu_params cpuparams_batch; + + std::string model = ""; // draft model for speculative decoding // NOLINT +}; + struct gpt_params { int32_t
n_world = 1; // number of devices to use int32_t rank = 0; // my rank for distributed inference @@ -198,9 +214,9 @@ struct gpt_params { enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings struct gpt_sampler_params sparams; + struct common_params_speculative speculative; std::string model = ""; // model path // NOLINT - std::string model_draft = ""; // draft model for speculative decoding // NOLINT std::string model_alias = "unknown"; // model alias // NOLINT std::string model_url = ""; // model url to download // NOLINT std::string hf_token = ""; // HF token // NOLINT diff --git a/common/sampling.cpp b/common/sampling.cpp index 3dc7f112..b8db920a 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -318,6 +318,45 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context return cur_p.data[cur_p.selected].id; } +std::vector<llama_token> gpt_sampler_sample_and_accept_n(struct gpt_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) { + GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1"); + + std::vector<llama_token> result; + result.reserve(idxs.size()); + + size_t i = 0; + for (; i < draft.size(); i++) { + const llama_token id = gpt_sampler_sample(gsmpl, ctx, idxs[i], grammar_first); + + gpt_sampler_accept(gsmpl, id, true); + + result.push_back(id); + + if (draft[i] != id) { + break; + } + } + + if (i == draft.size()) { + const llama_token id = gpt_sampler_sample(gsmpl, ctx, idxs[i], grammar_first); + + gpt_sampler_accept(gsmpl, id, true); + + result.push_back(id); + } + + return result; +} + +std::vector<llama_token> gpt_sampler_sample_and_accept_n(struct gpt_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) { + std::vector<int> idxs(draft.size() + 1); + for (size_t i = 0; i < idxs.size(); ++i) { + idxs[i] = i; + } + + return gpt_sampler_sample_and_accept_n(gsmpl, ctx, idxs,
draft, grammar_first); +} + uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) { return llama_sampler_get_seed(gsmpl->chain); } diff --git a/common/sampling.h b/common/sampling.h index d0e1a920..82d6b023 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -60,6 +60,27 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * // llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); +// generalized version of gpt_sampler_sample +// +// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match +// if the sampler disagrees at some point, we stop and return the accepted tokens up to now +// +// gpt_sampler_sample_and_accept_n(gsmpl, ctx, { idx }, {}); +// +// is equivalent to +// +// gpt_sampler_sample(gsmpl, ctx, idx); +// gpt_sampler_accept(gsmpl, token, true); +// +// requires: idxs.size() == draft.size() + 1 +// +// returns at least 1 token, up to idxs.size() +// +std::vector<llama_token> gpt_sampler_sample_and_accept_n(struct gpt_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false); + +// assume idxs == [ 0, 1, 2, ..., draft.size() ] +std::vector<llama_token> gpt_sampler_sample_and_accept_n(struct gpt_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false); + uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl); // helpers diff --git a/common/speculative.cpp b/common/speculative.cpp new file mode 100644 index 00000000..8a03d08c --- /dev/null +++ b/common/speculative.cpp @@ -0,0 +1,271 @@ +#include "speculative.h" + +#include "log.h" +#include "common.h" +#include "sampling.h" + +#include <cstring> + +#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128 +#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 + +struct common_speculative { + struct llama_context * ctx; + struct gpt_sampler * smpl; + + llama_batch batch; + llama_tokens prompt; +}; + +struct
common_speculative * common_speculative_init( + struct llama_context * ctx_dft) { + auto * result = new common_speculative { + /* .ctx = */ ctx_dft, + /* .smpl = */ nullptr, + /* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1), + /* .prompt = */ {}, + }; + + // TODO: optimize or pass from outside? +#if 0 + { + common_params_sampling params; + params.no_perf = false; + + params.top_k = 40; + params.top_p = 0.9; + + params.samplers = { + COMMON_SAMPLER_TYPE_TOP_K, + COMMON_SAMPLER_TYPE_TOP_P, + COMMON_SAMPLER_TYPE_INFILL, + }; + + result->smpl = gpt_sampler_init(llama_get_model(ctx_dft), params); + } +#else + { + gpt_sampler_params params; + params.no_perf = false; + + params.top_k = 10; + + params.samplers = { + GPT_SAMPLER_TYPE_TOP_K, + }; + + result->smpl = gpt_sampler_init(llama_get_model(ctx_dft), params); + } +#endif + + llama_update_context_with_rankworld(result->ctx, 0, 1, 0, 1); + + return result; +} + +void common_speculative_free(struct common_speculative * spec) { + gpt_sampler_free(spec->smpl); + + llama_batch_free(spec->batch); + + delete spec; +} + +bool common_speculative_are_compatible( + const struct llama_context * ctx_tgt, + const struct llama_context * ctx_dft) { + const struct llama_model * model_tgt = llama_get_model(ctx_tgt); + const struct llama_model * model_dft = llama_get_model(ctx_dft); + + const auto vocab_type_tgt = llama_vocab_type(model_tgt); + LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt); + + const auto vocab_type_dft = llama_vocab_type(model_dft); + LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft); + + if (vocab_type_tgt != vocab_type_dft) { + LOG_ERR("%s: draft model vocab type must match target model to use speculation but " + "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt); + return false; + } + + if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) || + llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) || +
llama_token_bos(model_tgt) != llama_token_bos(model_dft) || + llama_token_eos(model_tgt) != llama_token_eos(model_dft) + ) { + LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__); + return false; + } + + { + const int n_vocab_tgt = llama_n_vocab(model_tgt); + const int n_vocab_dft = llama_n_vocab(model_dft); + + const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft); + + if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { + LOG_ERR("%s: draft model vocab must closely match target model to use speculation but " + "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", + __func__, n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); + return false; + } + + for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) { + const char * token_text_tgt = llama_token_get_text(model_tgt, i); + const char * token_text_dft = llama_token_get_text(model_dft, i); + if (std::strcmp(token_text_tgt, token_text_dft) != 0) { + LOG_ERR("%s: draft model vocab must match target model to use speculation but " + "token %d content differs - target '%s', draft '%s'\n", __func__, i, + llama_token_to_piece(ctx_tgt, i).c_str(), + llama_token_to_piece(ctx_dft, i).c_str()); + return false; + } + } + } + + return true; +} + +llama_tokens common_speculative_gen_draft( + struct common_speculative * spec, + struct common_speculative_params params, + const llama_tokens & prompt_tgt, + llama_token id_last) { + auto & batch = spec->batch; + auto & ctx = spec->ctx; + auto & smpl = spec->smpl; + auto & prompt = spec->prompt; + + int reuse_i = 0; + int reuse_n = 0; + + const int n_ctx = llama_n_ctx(ctx) - params.n_draft; + + const int i_start = std::max(0, (int) prompt_tgt.size() - n_ctx); + + // reuse as much as possible from the old draft context + // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt +
for (int i = 0; i < (int) prompt.size(); ++i) { + int cur = 0; + while (i_start + cur < (int) prompt_tgt.size() && + i + cur < (int) prompt.size() && + prompt_tgt[i_start + cur] == prompt[i + cur]) { + cur++; + } + + if ((cur >= params.n_reuse || n_ctx >= (int) prompt_tgt.size()) && cur > reuse_n) { + reuse_i = i; + reuse_n = cur; + } + } + + LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size()); + + llama_tokens result; + result.reserve(params.n_draft); + + if (reuse_n == 0) { + llama_kv_cache_clear(ctx); + + prompt.clear(); + } else { + // this happens when a previous draft has been discarded (for example, due to being too small), but the + // target model agreed with it. in this case, we simply pass back the previous results to save compute + if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) { + for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) { + result.push_back(prompt[i]); + + if (params.n_draft <= (int) result.size()) { + break; + } + } + + return result; + } + + if (reuse_i > 0) { + llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i); + llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i); + + prompt.erase(prompt.begin(), prompt.begin() + reuse_i); + } + + if (reuse_n < (int) prompt.size()) { + llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1); + + prompt.erase(prompt.begin() + reuse_n, prompt.end()); + } + } + + // prepare a batch to evaluate any new tokens in the prompt + llama_batch_clear(batch); + + for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) { + //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]); + llama_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false); + + prompt.push_back(prompt_tgt[i]); + } + + // we should rarely end-up here during normal decoding + if (batch.n_tokens > 0) { + //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, 
batch).c_str()); + + llama_decode(ctx, batch); + } + + const llama_pos n_past = prompt.size(); + + LOG_DBG("%s: n_past = %d\n", __func__, n_past); + + llama_batch_clear(batch); + llama_batch_add (batch, id_last, n_past, { 0 }, true); + + prompt.push_back(id_last); + + //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str()); + + llama_decode(ctx, batch); + + gpt_sampler_reset(smpl); + + // sample n_draft tokens from the draft model + for (int i = 0; i < params.n_draft; ++i) { + llama_batch_clear(batch); + + gpt_sampler_sample(smpl, ctx, 0, true); + + const auto * cur_p = gpt_sampler_get_candidates(smpl); + + for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) { + LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n", + k, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx, cur_p->data[k].id).c_str()); + } + + // add drafted token for each sequence + const llama_token id = cur_p->data[0].id; + + // only collect very high-confidence draft tokens + if (cur_p->data[0].p < params.p_min) { + break; + } + + gpt_sampler_accept(smpl, id, true); + + result.push_back(id); + + if (params.n_draft <= (int) result.size()) { + break; + } + + llama_batch_add(batch, id, n_past + i + 1, { 0 }, true); + + // evaluate the drafted tokens on the draft model + llama_decode(ctx, batch); + + prompt.push_back(id); + } + + return result; +} \ No newline at end of file diff --git a/common/speculative.h b/common/speculative.h new file mode 100644 index 00000000..0af10ea4 --- /dev/null +++ b/common/speculative.h @@ -0,0 +1,28 @@ +#pragma once + +#include "llama.h" +#include "common.h" + +struct common_speculative; + +struct common_speculative_params { + int n_draft = 16; // max drafted tokens + int n_reuse = 256; + + float p_min = 0.9f; // min probabiliy required to accept a token in the draft +}; + +struct common_speculative * common_speculative_init(struct llama_context * ctx_dft); + +void common_speculative_free(struct common_speculative * 
spec); + +bool common_speculative_are_compatible( + const struct llama_context * ctx_tgt, + const struct llama_context * ctx_dft); + +// sample up to n_draft tokens and add them to the batch using the draft model +llama_tokens common_speculative_gen_draft( + struct common_speculative * spec, + struct common_speculative_params params, + const llama_tokens & prompt, + llama_token id_last); \ No newline at end of file diff --git a/examples/server/server.cpp b/examples/server/server.cpp index af39f1ac..d086eaf3 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -6,6 +6,7 @@ #include "sampling.h" #include "json-schema-to-grammar.h" #include "llama.h" +#include "speculative.h" // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT @@ -133,6 +134,9 @@ struct slot_params { int32_t n_predict = -1; // new tokens to predict std::vector<std::string> antiprompt; + + struct gpt_sampler_params sampling; + struct common_params_speculative speculative; json input_prefix; json input_suffix; @@ -142,6 +146,12 @@ struct server_slot { int id; int id_task = -1; + llama_batch batch_spec; + + llama_context * ctx_dft = nullptr; + + common_speculative * spec = nullptr; + // the index relative to completion multi-task request size_t index = 0; @@ -231,7 +241,7 @@ struct server_slot { generated_token_probs.clear(); } - bool has_budget(gpt_params &global_params) { + bool has_budget(const gpt_params &global_params) { if (params.n_predict == -1 && global_params.n_predict == -1) { return true; // limitless } @@ -251,6 +261,10 @@ struct server_slot { return state != SLOT_STATE_IDLE; } + bool can_speculate() const { + return ctx_dft && params.speculative.n_max > 0 && params.cache_prompt; + } + void add_token(const completion_token_output & token) { if (!is_processing()) { SLT_WRN(*this, "%s", "slot is not processing\n"); @@ -615,6 +629,9 @@ struct server_context { gpt_params params; + llama_model * model_dft = nullptr; + llama_context_params cparams_dft; +
llama_batch batch = {}; bool clean_kv_cache = true; @@ -652,17 +669,33 @@ struct server_context { model = nullptr; } + if (model_dft) { + llama_free_model(model_dft); + model_dft = nullptr; + } + // Clear any sampling context for (server_slot & slot : slots) { if (slot.smpl != nullptr) { gpt_sampler_free(slot.smpl); } + slot.smpl = nullptr; + + llama_free(slot.ctx_dft); + slot.ctx_dft = nullptr; + + common_speculative_free(slot.spec); + slot.spec = nullptr; + + llama_batch_free(slot.batch_spec); } llama_batch_free(batch); } bool load_model(const gpt_params & params_) { + SRV_INF("loading model '%s'\n", params.model.c_str()); + params = params_; // dedicate one sequence to the system prompt @@ -685,6 +718,44 @@ struct server_context { add_bos_token = llama_add_bos_token(model); has_eos_token = !llama_add_eos_token(model); + + if (!params.speculative.model.empty()) { + SRV_INF("loading draft model '%s'\n", params.speculative.model.c_str()); + + auto params_dft = params; + + params_dft.model = params.speculative.model; + params_dft.n_ctx = params.speculative.n_ctx; + params_dft.n_gpu_layers = params.speculative.n_gpu_layers; + params_dft.n_world = 1; // do not split the draft model across devicesAdd commentMore actions + params_dft.rank = 0; // always load the draft model on the head device + + std::fill_n(params_dft.n_layer_window, params.n_world, 0); + + llama_init_result llama_init_dft = llama_init_from_gpt_params(params_dft); + + model_dft = llama_init_dft.model; + + if (model_dft == nullptr) { + SRV_ERR("failed to load draft model, '%s'\n", params.speculative.model.c_str()); + return false; + } + + if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) { + SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params.speculative.model.c_str(), params.model.c_str()); + + llama_free (llama_init_dft.context); + llama_free_model(llama_init_dft.model); + + return false; + } + + cparams_dft = 
llama_context_params_from_gpt_params(params); + cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context); + + // the context is not needed - we will create one for each slot + llama_free(llama_init_dft.context); + } return true; } @@ -708,6 +779,30 @@ struct server_context { slot.id = i; slot.n_ctx = n_ctx_slot; slot.n_predict = params.n_predict; + + if (model_dft) { + slot.batch_spec = llama_batch_init(params.speculative.n_max + 1, 0, 1); + + slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft); + + if (llama_context_setup_backend(model, cparams_dft, slot.ctx_dft) == nullptr) { + SRV_ERR("%s: failed to setup context with model '%s'\n", __func__, params.model.c_str()); + llama_free(slot.ctx_dft); + llama_free_model(model); + return; + } + + if (slot.ctx_dft == nullptr) { + SRV_ERR("%s", "failed to create draft context\n"); + return; + } + + slot.spec = common_speculative_init(slot.ctx_dft); + if (slot.spec == nullptr) { + SRV_ERR("%s", "failed to create speculator\n"); + return; + } + } SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); @@ -875,6 +970,8 @@ struct server_context { slot_params default_params; // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them) auto default_sparams = params.sparams; + default_params.speculative = params.speculative; + const auto & data = task.data; if (data.count("__oaicompat") != 0) { @@ -909,6 +1006,12 @@ struct server_context { slot.sparams.seed = json_value(data, "seed", default_sparams.seed); slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs); slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep); + + slot.params.speculative.n_min = json_value(data, "speculative.n_min", default_params.speculative.n_min); + slot.params.speculative.n_max = json_value(data, "speculative.n_max", default_params.speculative.n_max); + slot.params.speculative.p_min = json_value(data, "speculative.p_min", 
default_params.speculative.p_min); + + slot.params.speculative.n_min = std::min(slot.params.speculative.n_max, slot.params.speculative.n_min); // process "json_schema" and "grammar" if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) { @@ -1049,6 +1152,12 @@ struct server_context { return false; } } + + if (slot.ctx_dft) { + llama_batch_free(slot.batch_spec); + + slot.batch_spec = llama_batch_init(slot.params.speculative.n_max + 1, 0, 1); + } slot.state = SLOT_STATE_PROCESSING_PROMPT; slot.prompt_tokens.clear(); @@ -2357,38 +2466,100 @@ struct server_context { continue; // continue loop of slots } - completion_token_output result; - const llama_token id = gpt_sampler_sample(slot.smpl, ctx, slot.i_batch - i); + llama_token id; - gpt_sampler_accept(slot.smpl, id, true); + { + completion_token_output result; - slot.n_decoded += 1; - if (slot.n_decoded == 1) { - slot.t_start_generation = ggml_time_us(); - slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3; - metrics.on_prompt_eval(slot); + id = gpt_sampler_sample(slot.smpl, ctx, slot.i_batch - i); + + slot.i_batch = -1; + + gpt_sampler_accept(slot.smpl, id, true); + + slot.n_decoded += 1; + if (slot.n_decoded == 1) { + slot.t_start_generation = ggml_time_us(); + slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3; + metrics.on_prompt_eval(slot); + } + + result.tok = id; + + const auto * cur_p = gpt_sampler_get_candidates(slot.smpl); + + for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) { + result.probs.push_back({ + cur_p->data[i].id, + i >= cur_p->size ? 
0.0f : cur_p->data[i].p, + }); + } + + if (!process_token(result, slot)) { + // release slot because of stop condition + slot.release(); + slot.print_timings(); + send_final_response(slot); + metrics.on_prediction(slot); + continue; + } } - result.tok = id; - - const auto * cur_p = gpt_sampler_get_candidates(slot.smpl); - - for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) { - result.probs.push_back({ - cur_p->data[i].id, - i >= cur_p->size ? 0.0f : cur_p->data[i].p, - }); + // check if the slot supports speculative decoding + if (!slot.can_speculate()) { + continue; } - if (!process_token(result, slot)) { - // release slot because of stop condition - slot.release(); - slot.print_timings(); - send_final_response(slot); - metrics.on_prediction(slot); + struct common_speculative_params params_spec; + params_spec.n_draft = slot.params.speculative.n_max; + params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max; + params_spec.p_min = slot.params.speculative.p_min; + + llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id); + + // ignore small drafts + if (slot.params.speculative.n_min > (int) draft.size()) { + continue; } - slot.i_batch = -1; + // construct the speculation batch + llama_batch_clear(slot.batch_spec); + llama_batch_add (slot.batch_spec, id, slot.n_past, { slot.id }, true); + + for (size_t i = 0; i < draft.size(); ++i) { + llama_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true); + } + + llama_decode(ctx, slot.batch_spec); + + // the accepted tokens from the speculation + const auto ids = gpt_sampler_sample_and_accept_n(slot.smpl, ctx, draft); + + slot.n_past += ids.size(); + slot.n_decoded += ids.size(); + + slot.cache_tokens.push_back(id); + slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1); + + llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1); + + for (size_t i = 0; i < ids.size(); ++i) { + completion_token_output result; 
+ + result.tok = ids[i]; + + if (!process_token(result, slot)) { + // release slot because of stop condition + slot.release(); + slot.print_timings(); + send_final_response(slot); + metrics.on_prediction(slot); + break; + } + } + + SRV_DBG("accepted %d/%d draft tokens\n", (int) ids.size() - 1, (int) draft.size()); + } } diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 8527f8b6..0a6c4701 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -41,7 +41,7 @@ int main(int argc, char ** argv) { gpt_init(); - if (params.model_draft.empty()) { + if (params.speculative.model.empty()) { LOG_ERR("%s: --model-draft is required\n", __func__); return 1; } @@ -68,7 +68,7 @@ int main(int argc, char ** argv) { // load the draft model // make a hard copy of params to use for the draft model gpt_params params_draft = params; - params_draft.model = params_draft.model_draft; + params_draft.model = params_draft.speculative.model; params_draft.n_gpu_layers = params_draft.n_gpu_layers_draft; params_draft.n_world = 1; // do not split the draft model across devices params_draft.rank = 0; // always load the draft model on the head device diff --git a/src/llama.cpp b/src/llama.cpp index e5965fd8..a57715e3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -20944,6 +20944,12 @@ struct llama_context * llama_new_context_with_model( ctx->cparams.rank = params.rank; ctx->cparams.force = params.force; ctx->cparams.original_next_rank = (params.rank + 1) % params.n_world; + + auto &hparams = model->hparams; + auto &cparams = ctx->cparams; + cparams.n_ctx = params.n_ctx == 0 ? 
hparams.n_ctx_train : params.n_ctx; + ctx->logits_all = params.logits_all; + return ctx; } From d248f3c40e127897eb8a981ed38b90dcc2c51037 Mon Sep 17 00:00:00 2001 From: DeEMO Date: Fri, 27 Jun 2025 06:07:47 +0000 Subject: [PATCH 2/8] fix: some fields in cparams_draft --- common/arg.cpp | 5 +++-- examples/server/server.cpp | 14 ++++++++++---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 813f87e8..2f999dcc 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1555,13 +1555,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", "number of layers to store in VRAM for the draft model", [](gpt_params & params, int value) { - params.n_gpu_layers_draft = value; + params.n_gpu_layers_draft = value; // TODO: remove + params.speculative.n_gpu_layers = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"-sm", "--split-mode"}, "{none,layer,row}", "how to split the model across multiple GPUs, one of:\n" diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d086eaf3..93a9cf33 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -727,6 +727,7 @@ struct server_context { params_dft.model = params.speculative.model; params_dft.n_ctx = params.speculative.n_ctx; params_dft.n_gpu_layers = params.speculative.n_gpu_layers; + params_dft.use_mlock = true; params_dft.n_world = 1; // do not split the draft model across devicesAdd commentMore actions params_dft.rank = 0; // always load the draft model on the head device @@ -749,9 +750,14 @@ struct server_context { 
return false; } - + cparams_dft = llama_context_params_from_gpt_params(params); cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context); + cparams_dft.n_world = 1; + cparams_dft.rank = 0; + std::fill_n(cparams_dft.n_layer_window, 32, 0); + cparams_dft.n_layer_window[0] = llama_n_layer(model_dft); + cparams_dft.n_gpu_layers = params.speculative.n_gpu_layers; // the context is not needed - we will create one for each slot llama_free(llama_init_dft.context); @@ -785,10 +791,10 @@ struct server_context { slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft); - if (llama_context_setup_backend(model, cparams_dft, slot.ctx_dft) == nullptr) { - SRV_ERR("%s: failed to setup context with model '%s'\n", __func__, params.model.c_str()); + if (llama_context_setup_backend(model_dft, cparams_dft, slot.ctx_dft) == nullptr) { + SRV_ERR("%s: failed to setup context with model '%s'\n", __func__, params.speculative.model.c_str()); llama_free(slot.ctx_dft); - llama_free_model(model); + llama_free_model(model_dft); return; } From 9bf6565df4b94a43414a1821220be55247abee15 Mon Sep 17 00:00:00 2001 From: DeEMO Date: Fri, 27 Jun 2025 06:30:57 +0000 Subject: [PATCH 3/8] fix: load draft model first --- examples/server/server.cpp | 66 ++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 93a9cf33..5b970e4f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -700,6 +700,40 @@ struct server_context { // dedicate one sequence to the system prompt params.n_parallel += 1; + + // load draft model first + llama_init_result llama_init_dft; + if (!params.speculative.model.empty()) { + SRV_INF("loading draft model '%s'\n", params.speculative.model.c_str()); + + auto params_dft = params; + + params_dft.model = params.speculative.model; + params_dft.n_ctx = params.speculative.n_ctx; + params_dft.n_gpu_layers = params.speculative.n_gpu_layers; + 
params_dft.use_mlock = true; + params_dft.n_world = 1; // do not split the draft model across devicesAdd commentMore actions + params_dft.rank = 0; // always load the draft model on the head device + + std::fill_n(params_dft.n_layer_window, params.n_world, 0); + + llama_init_dft = llama_init_from_gpt_params(params_dft); + + model_dft = llama_init_dft.model; + + if (model_dft == nullptr) { + SRV_ERR("failed to load draft model, '%s'\n", params.speculative.model.c_str()); + return false; + } + + cparams_dft = llama_context_params_from_gpt_params(params); + cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context); + cparams_dft.n_world = 1; + cparams_dft.rank = 0; + std::fill_n(cparams_dft.n_layer_window, 32, 0); + cparams_dft.n_layer_window[0] = llama_n_layer(model_dft); + cparams_dft.n_gpu_layers = params.speculative.n_gpu_layers; + } llama_init_result llama_init = llama_init_from_gpt_params(params); @@ -719,29 +753,7 @@ struct server_context { add_bos_token = llama_add_bos_token(model); has_eos_token = !llama_add_eos_token(model); - if (!params.speculative.model.empty()) { - SRV_INF("loading draft model '%s'\n", params.speculative.model.c_str()); - - auto params_dft = params; - - params_dft.model = params.speculative.model; - params_dft.n_ctx = params.speculative.n_ctx; - params_dft.n_gpu_layers = params.speculative.n_gpu_layers; - params_dft.use_mlock = true; - params_dft.n_world = 1; // do not split the draft model across devicesAdd commentMore actions - params_dft.rank = 0; // always load the draft model on the head device - - std::fill_n(params_dft.n_layer_window, params.n_world, 0); - - llama_init_result llama_init_dft = llama_init_from_gpt_params(params_dft); - - model_dft = llama_init_dft.model; - - if (model_dft == nullptr) { - SRV_ERR("failed to load draft model, '%s'\n", params.speculative.model.c_str()); - return false; - } - + if (!params.speculative.model.empty()){ if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) { SRV_ERR("the 
draft model '%s' is not compatible with the target model '%s'\n", params.speculative.model.c_str(), params.model.c_str()); @@ -750,14 +762,6 @@ struct server_context { return false; } - - cparams_dft = llama_context_params_from_gpt_params(params); - cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context); - cparams_dft.n_world = 1; - cparams_dft.rank = 0; - std::fill_n(cparams_dft.n_layer_window, 32, 0); - cparams_dft.n_layer_window[0] = llama_n_layer(model_dft); - cparams_dft.n_gpu_layers = params.speculative.n_gpu_layers; // the context is not needed - we will create one for each slot llama_free(llama_init_dft.context); From b4929d510aee558f4cd5015daee9dfa6de0d0b50 Mon Sep 17 00:00:00 2001 From: DeEMO Date: Mon, 30 Jun 2025 04:35:59 +0000 Subject: [PATCH 4/8] fix: args in speculative --- common/arg.cpp | 22 ++++++++++++++++++---- common/common.h | 1 - examples/speculative/speculative.cpp | 2 +- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 2f999dcc..a338f613 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -627,12 +627,19 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(llama_arg( - {"--draft"}, "N", - format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), + {"--draft-max", "--draft", "--draft-n"}, "N", + format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max), [](gpt_params & params, int value) { - params.n_draft = value; + params.speculative.n_max = value; } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--draft-min", "--draft-n-min"}, "N", + format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min), + [](gpt_params & params, int value) { + 
params.speculative.n_min = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"-ps", "--p-split"}, "N", format("speculative decoding split probability (default: %.1f)", (double)params.p_split), @@ -640,6 +647,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.p_split = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"--draft-p-min"}, "P", + format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min), + [](gpt_params & params, const std::string & value) { + params.speculative.p_min = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"-lcs", "--lookup-cache-static"}, "FNAME", "path to static lookup cache to use for lookup decoding (not updated by generation)", diff --git a/common/common.h b/common/common.h index 044dfdf5..b454f799 100644 --- a/common/common.h +++ b/common/common.h @@ -177,7 +177,6 @@ struct gpt_params { int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_draft = 5; // number of tokens to draft during speculative decoding int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_parallel = 1; // number of parallel sequences to decode int32_t n_sequences = 1; // number of sequences to decode diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 0a6c4701..ad610ac8 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -169,7 +169,7 @@ int main(int argc, char ** argv) { const auto t_enc_end = ggml_time_us(); // how many tokens to draft each time - int n_draft = 
params.n_draft; + int n_draft = params.speculative.n_max; int n_predict = 0; int n_drafted = 0; From ca5996e7a6fbf900fa881f9d08ced2749abb8702 Mon Sep 17 00:00:00 2001 From: DeEMO Date: Mon, 30 Jun 2025 07:31:05 +0000 Subject: [PATCH 5/8] fix: slot id --- examples/server/server.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 5b970e4f..1acf1421 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2534,10 +2534,10 @@ struct server_context { // construct the speculation batch llama_batch_clear(slot.batch_spec); - llama_batch_add (slot.batch_spec, id, slot.n_past, { slot.id }, true); + llama_batch_add (slot.batch_spec, id, slot.n_past, { slot.id + 1 }, true); for (size_t i = 0; i < draft.size(); ++i) { - llama_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true); + llama_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id + 1 }, true); } llama_decode(ctx, slot.batch_spec); @@ -2551,7 +2551,8 @@ struct server_context { slot.cache_tokens.push_back(id); slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1); - llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1); + llama_kv_cache_seq_rm (ctx, slot.id + 1, slot.n_past, -1); + llama_send_kv_cache_seq_rm(ctx, slot.id , slot.n_past, -1); for (size_t i = 0; i < ids.size(); ++i) { completion_token_output result; From 0cf87c88377b8e2bdcd853f201d6669ca7681ffa Mon Sep 17 00:00:00 2001 From: DeEMO Date: Sun, 6 Jul 2025 10:05:01 +0800 Subject: [PATCH 6/8] fix: set cache_prompt default to true --- examples/server/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1acf1421..5cbd7f38 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -127,7 +127,7 @@ struct server_task_result { struct slot_params { bool stream = true; - bool cache_prompt = false; 
// remember the prompt to avoid reprocessing all prompt + bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half @@ -993,7 +993,7 @@ struct server_context { } slot.params.stream = json_value(data, "stream", false); - slot.params.cache_prompt = json_value(data, "cache_prompt", false); + slot.params.cache_prompt = json_value(data, "cache_prompt", true); slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict)); slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k); slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p); From b019a707b8c4480f9e4d7607fc15bcb62d76ccc0 Mon Sep 17 00:00:00 2001 From: "Li, Zonghang" <870644199@qq.com> Date: Sun, 13 Jul 2025 13:42:24 +0800 Subject: [PATCH 7/8] server: fix bugs --- common/arg.cpp | 1 + examples/server/server.cpp | 4 ++++ src/llama.cpp | 13 ++++++++----- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index a338f613..45954b52 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -673,6 +673,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), [](gpt_params & params, int value) { params.n_ctx = value; + params.speculative.n_ctx = value; } ).set_env("LLAMA_ARG_CTX_SIZE")); add_opt(llama_arg( diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 5cbd7f38..3844c886 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -760,6 +760,8 @@ struct server_context { llama_free (llama_init_dft.context); llama_free_model(llama_init_dft.model); + model_dft = nullptr; + return false; } @@ -3566,6 +3568,8 @@ int main(int argc, char ** argv) { 
LOG_INF("%s: loading model\n", __func__); if (!ctx_server.load_model(params)) { + char * stop_signal = nullptr; + llama_free_sockets(ctx_server.ctx, &stop_signal); clean_up(); t.join(); LOG_ERR("%s: exiting due to model loading error\n", __func__); diff --git a/src/llama.cpp b/src/llama.cpp index a57715e3..9aa9cd82 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -17878,7 +17878,7 @@ static void llama_send_meta(zmq::socket_t & socket, struct sync_meta * meta, boo if (meta->pos != nullptr) { send_msgs.emplace_back("pos", strlen("pos")); - send_msgs.emplace_back(meta->pos, meta->n_ctx * sizeof(llama_pos)); + send_msgs.emplace_back(meta->pos, meta->n_tokens * sizeof(llama_pos)); } if (meta->n_seq_id != nullptr) { @@ -17986,8 +17986,8 @@ static int llama_recv_meta(zmq::socket_t & socket, struct sync_meta * meta) { } if (key == "pos") { - meta->pos = (llama_pos *) malloc(meta->n_ctx * sizeof(llama_pos)); - std::memcpy(meta->pos, data_msg.data(), meta->n_ctx * sizeof(llama_pos)); + meta->pos = (llama_pos *) malloc(meta->n_tokens * sizeof(llama_pos)); + std::memcpy(meta->pos, data_msg.data(), meta->n_tokens * sizeof(llama_pos)); } if (key == "n_seq_id") { @@ -18304,8 +18304,8 @@ static int llama_decode_internal( if (meta.n_tokens > 0) { batch_all.n_tokens = meta.n_tokens; if (meta.pos != nullptr) { - batch_all.pos = (llama_pos *) malloc(meta.n_ctx * sizeof(llama_pos)); - std::memcpy(batch_all.pos, meta.pos, meta.n_ctx * sizeof(llama_pos)); + batch_all.pos = (llama_pos *) malloc(meta.n_tokens * sizeof(llama_pos)); + std::memcpy(batch_all.pos, meta.pos, meta.n_tokens * sizeof(llama_pos)); } if (meta.n_seq_id != nullptr) { batch_all.n_seq_id = (int32_t *) malloc(meta.n_tokens * sizeof(int32_t)); @@ -22089,6 +22089,9 @@ void llama_model_compute_buf_size( // this value may vary by GPU and CUDA version, but it's lower than 400 MiB in most cases, // another 300 MiB is used to prevent accidental OOM. 
*gpu_buf += 700 * 1024 * 1024; + } else if (backend == BACKEND_METAL) { + // 300 MiB is used to prevent accidental OOM, e.g., automatic quantization conversion. + *gpu_buf += 300 * 1024 * 1024; } } From 86ca21e49c1ad24fc41c513e4e13315e6726387a Mon Sep 17 00:00:00 2001 From: "Li, Zonghang" <870644199@qq.com> Date: Sun, 13 Jul 2025 21:52:59 +0800 Subject: [PATCH 8/8] server: fix bugs when running speculative decoding --- README.md | 1 + examples/server/server.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a1a18538..d8e5d91c 100644 --- a/README.md +++ b/README.md @@ -381,6 +381,7 @@ curl -X POST http://localhost:8080/v1/cancel \ ``` **9. How to use speculative decoding?** + Please see "[Power prima.cpp with speculative decoding: Further speeds up by up to 80%](https://github.com/Lizonghang/prima.cpp/discussions/29)". ## ❤️ Acknowledgment diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3844c886..a1cfa90c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2542,7 +2542,7 @@ struct server_context { llama_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id + 1 }, true); } - llama_decode(ctx, slot.batch_spec); + llama_decode(ctx, slot.batch_spec, true); // the accepted tokens from the speculation const auto ids = gpt_sampler_sample_and_accept_n(slot.smpl, ctx, draft);