From 86ca21e49c1ad24fc41c513e4e13315e6726387a Mon Sep 17 00:00:00 2001 From: "Li, Zonghang" <870644199@qq.com> Date: Sun, 13 Jul 2025 21:52:59 +0800 Subject: [PATCH] server: fix bugs when running speculative decoding --- README.md | 1 + examples/server/server.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a1a18538..d8e5d91c 100644 --- a/README.md +++ b/README.md @@ -381,6 +381,7 @@ curl -X POST http://localhost:8080/v1/cancel \ ``` **9. How to use speculative decoding?** + Please see "[Power prima.cpp with speculative decoding: Further speeds up by up to 80%](https://github.com/Lizonghang/prima.cpp/discussions/29)". ## ❤️ Acknowledgment diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3844c886..a1cfa90c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2542,7 +2542,7 @@ struct server_context { llama_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id + 1 }, true); } - llama_decode(ctx, slot.batch_spec); + llama_decode(ctx, slot.batch_spec, true); // the accepted tokens from the speculation const auto ids = gpt_sampler_sample_and_accept_n(slot.smpl, ctx, draft);