mirror of
https://github.com/Lizonghang/prima.cpp.git
synced 2025-09-04 00:49:08 +00:00
server: fix bugs when running speculative decoding
This commit is contained in:
parent
b019a707b8
commit
86ca21e49c
2 changed files with 2 additions and 1 deletion
|
@@ -381,6 +381,7 @@ curl -X POST http://localhost:8080/v1/cancel \
|
|||
```
|
||||
|
||||
**9. How to use speculative decoding?**
|
||||
|
||||
Please see "[Power prima.cpp with speculative decoding: Further speeds up by up to 80%](https://github.com/Lizonghang/prima.cpp/discussions/29)".
|
||||
|
||||
## ❤️ Acknowledgment
|
||||
|
|
|
@@ -2542,7 +2542,7 @@ struct server_context {
|
|||
llama_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id + 1 }, true);
|
||||
}
|
||||
|
||||
-llama_decode(ctx, slot.batch_spec);
|
||||
+llama_decode(ctx, slot.batch_spec, true);
|
||||
|
||||
// the accepted tokens from the speculation
|
||||
const auto ids = gpt_sampler_sample_and_accept_n(slot.smpl, ctx, draft);
|
||||
|
|
Loading…
Add table
Reference in a new issue