mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-22 19:47:49 +00:00
llama: avoid copying logits during prompt decode in MTP (#23198)
* llama: avoid copying logits during prompt decode in MTP
* review: update comment
* llama-graph: call set_output for t_h_pre_norm
This commit is contained in:
parent
39cf5d6191
commit
3e12fbdea5
10 changed files with 91 additions and 27 deletions
|
|
@@ -243,6 +243,11 @@ struct server_slot {
|
|||
return task->need_embd() || (spec && common_speculative_need_embd(spec));
|
||||
}
|
||||
|
||||
bool need_embd_pre_norm() const {
|
||||
GGML_ASSERT(task);
|
||||
return spec && common_speculative_need_embd_pre_norm(spec);
|
||||
}
|
||||
|
||||
// if the context does not have a memory module then all embeddings have to be computed within a single ubatch
|
||||
// also we cannot split if the pooling would require any past tokens
|
||||
// (MTP supports splitting — uses task->need_embd() not need_embd())
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue