From 86ca21e49c1ad24fc41c513e4e13315e6726387a Mon Sep 17 00:00:00 2001
From: "Li, Zonghang" <870644199@qq.com>
Date: Sun, 13 Jul 2025 21:52:59 +0800
Subject: [PATCH] server: fix bugs when running speculative decoding

---
 README.md                  | 1 +
 examples/server/server.cpp | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a1a18538..d8e5d91c 100644
--- a/README.md
+++ b/README.md
@@ -381,6 +381,7 @@ curl -X POST http://localhost:8080/v1/cancel \
 ```
 
 **9. How to use speculative decoding?**
+
 Please see "[Power prima.cpp with speculative decoding: Further speeds up by up to 80%](https://github.com/Lizonghang/prima.cpp/discussions/29)".
 
 ## ❤️ Acknowledgment
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 3844c886..a1cfa90c 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2542,7 +2542,7 @@ struct server_context {
                     llama_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id + 1 }, true);
                 }
 
-                llama_decode(ctx, slot.batch_spec);
+                llama_decode(ctx, slot.batch_spec, true);
 
                 // the accepted tokens from the speculation
                 const auto ids = gpt_sampler_sample_and_accept_n(slot.smpl, ctx, draft);