mirror of
https://github.com/Lizonghang/prima.cpp.git
synced 2025-09-05 19:29:02 +00:00
server: fix bugs when running speculative decoding
This commit is contained in:
parent
b019a707b8
commit
86ca21e49c
2 changed files with 2 additions and 1 deletions
|
@ -381,6 +381,7 @@ curl -X POST http://localhost:8080/v1/cancel \
|
||||||
```
|
```
|
||||||
|
|
||||||
**9. How to use speculative decoding?**
|
**9. How to use speculative decoding?**
|
||||||
|
|
||||||
Please see "[Power prima.cpp with speculative decoding: Further speeds up by up to 80%](https://github.com/Lizonghang/prima.cpp/discussions/29)".
|
Please see "[Power prima.cpp with speculative decoding: Further speeds up by up to 80%](https://github.com/Lizonghang/prima.cpp/discussions/29)".
|
||||||
|
|
||||||
## ❤️ Acknowledgment
|
## ❤️ Acknowledgment
|
||||||
|
|
|
@ -2542,7 +2542,7 @@ struct server_context {
|
||||||
llama_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id + 1 }, true);
|
llama_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id + 1 }, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_decode(ctx, slot.batch_spec);
|
llama_decode(ctx, slot.batch_spec, true);
|
||||||
|
|
||||||
// the accepted tokens from the speculation
|
// the accepted tokens from the speculation
|
||||||
const auto ids = gpt_sampler_sample_and_accept_n(slot.smpl, ctx, draft);
|
const auto ids = gpt_sampler_sample_and_accept_n(slot.smpl, ctx, draft);
|
||||||
|
|
Loading…
Add table
Reference in a new issue