server: fix bugs when running speculative decoding

This commit is contained in:
Li, Zonghang 2025-07-13 21:52:59 +08:00
parent b019a707b8
commit 86ca21e49c
2 changed files with 2 additions and 1 deletion

View file

@ -2542,7 +2542,7 @@ struct server_context {
llama_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id + 1 }, true);
}
-llama_decode(ctx, slot.batch_spec);
+llama_decode(ctx, slot.batch_spec, true);
// the accepted tokens from the speculation
const auto ids = gpt_sampler_sample_and_accept_n(slot.smpl, ctx, draft);