mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-17 04:19:40 +00:00)
Merge commit '9230dbe2c7' into concedo_experimental

# Conflicts:
#	ggml/src/ggml-cpu/CMakeLists.txt
#	src/llama-graph.cpp
#	tools/server/README.md
commit c16d672ce4
29 changed files with 2666 additions and 2457 deletions
@@ -1969,10 +1969,8 @@ struct server_context {
             params_dft.n_ctx        = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel   = 1;
 
-            // force F16 KV cache for the draft model for extra performance
-            params_dft.cache_type_k = GGML_TYPE_F16;
-            params_dft.cache_type_v = GGML_TYPE_F16;
+            params_dft.cache_type_k = params_base.speculative.cache_type_k;
+            params_dft.cache_type_v = params_base.speculative.cache_type_v;
 
             llama_init_dft = common_init_from_params(params_dft);
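The hunk above stops forcing an F16 KV cache for the draft (speculative) model and instead inherits the cache types from the speculative settings; the surrounding context also shows that when speculative.n_ctx is 0 the draft context defaults to n_ctx / n_parallel (for example 8192 / 4 = 2048). Letting the draft cache follow the configured type matters because a quantized KV cache uses less memory than F16, and the forced value ignored that choice. Below is a minimal standalone sketch of the behavior change, using hypothetical stand-in types rather than the real common_params and ggml_type definitions:

    // Hypothetical stand-in types: the real code uses common_params and ggml_type.
    #include <cstdio>

    enum cache_type { CACHE_F16, CACHE_Q8_0 };

    struct speculative_cfg { cache_type cache_type_k = CACHE_F16; cache_type cache_type_v = CACHE_F16; };
    struct base_cfg        { speculative_cfg speculative; };
    struct draft_cfg       { cache_type cache_type_k = CACHE_F16; cache_type cache_type_v = CACHE_F16; };

    int main() {
        base_cfg params_base;
        params_base.speculative.cache_type_k = CACHE_Q8_0; // user asks for a quantized draft KV cache

        draft_cfg params_dft;
        // before this change: the draft cache was always forced to F16, ignoring the setting
        // after this change:  it inherits whatever the speculative settings say
        params_dft.cache_type_k = params_base.speculative.cache_type_k;
        params_dft.cache_type_v = params_base.speculative.cache_type_v;

        std::printf("draft cache_type_k is %s\n",
                    params_dft.cache_type_k == CACHE_Q8_0 ? "Q8_0" : "F16");
        return 0;
    }

Running the sketch prints "draft cache_type_k is Q8_0", i.e. the user's speculative cache setting now reaches the draft model.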
@@ -3387,38 +3385,6 @@ struct server_context {
             llama_set_embeddings(ctx, slot_batched->need_embd());
         }
 
-        // pad the batch so that batch.n_tokens >= n_slots
-        // TODO: temporary workaround for https://github.com/ggml-org/llama.cpp/issues/13689
-        if (slot_batched->need_embd()) {
-            const int n_slots = slots.size();
-
-            if (batch.n_tokens < n_slots) {
-                std::set<llama_seq_id> seq_ids;
-                for (int j = 0; j < batch.n_tokens; ++j) {
-                    seq_ids.insert(batch.seq_id[j][0]);
-                }
-
-                // find unused sequence id
-                llama_seq_id seq_id = -1;
-                for (int i = 0; i < n_slots; ++i) {
-                    if (seq_ids.find(i) == seq_ids.end()) {
-                        seq_id = i;
-                    }
-                }
-
-                const int n_add = n_slots - batch.n_tokens;
-
-                SRV_WRN("adding %d dummy tokens to the batch, seq_id = %d\n", n_add, seq_id);
-
-                for (int j = 0; j < n_add; ++j) {
-                    common_batch_add(batch, 0, j, { seq_id }, true);
-                }
-
-                slots[seq_id].cache_tokens.clear();
-                llama_memory_seq_rm(llama_get_memory(ctx), seq_id, -1, -1);
-            }
-        }
-
         int32_t i_next = 0;
 
         // process the created batch of tokens
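This second hunk deletes the temporary padding workaround for issue 13689 outright: whenever embeddings were requested, it picked a sequence id not used by any batched token and appended dummy tokens so that batch.n_tokens reached at least n_slots. For reference, a self-contained sketch of the padding logic being removed, with plain ints standing in for llama_seq_id and for the server's batch and slot structures:

    // Simplified stand-ins: plain int for llama_seq_id, a vector for the batched tokens' seq ids.
    #include <cstdio>
    #include <set>
    #include <vector>

    int main() {
        const int n_slots = 4;                     // number of server slots
        std::vector<int> batch_seq_ids = {0, 2};   // seq ids of the tokens already in the batch
        const int n_tokens = (int) batch_seq_ids.size();

        if (n_tokens < n_slots) {
            std::set<int> seq_ids(batch_seq_ids.begin(), batch_seq_ids.end());

            // find a sequence id that no batched token is using
            int seq_id = -1;
            for (int i = 0; i < n_slots; ++i) {
                if (seq_ids.find(i) == seq_ids.end()) {
                    seq_id = i;
                }
            }

            // the workaround appended this many dummy tokens on the unused sequence
            // so the batch covered at least n_slots tokens, then cleared that
            // sequence's cache so the dummy tokens left no state behind
            const int n_add = n_slots - n_tokens;
            std::printf("would pad with %d dummy tokens on seq_id %d\n", n_add, seq_id);
        }
        return 0;
    }

For the example values the sketch prints "would pad with 2 dummy tokens on seq_id 3", mirroring the dummy-token count and unused sequence id the removed code passed to common_batch_add.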