Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 17:14:36 +00:00)
Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	common/common.cpp
#	examples/batched-bench/batched-bench.cpp
#	examples/batched/batched.cpp
#	examples/export-lora/export-lora.cpp
#	examples/gritlm/gritlm.cpp
#	examples/parallel/parallel.cpp
#	examples/passkey/passkey.cpp
#	examples/speculative-simple/speculative-simple.cpp
#	examples/speculative/speculative.cpp
#	ggml/src/ggml-cann/CMakeLists.txt
#	ggml/src/ggml-cann/acl_tensor.cpp
#	ggml/src/ggml-cann/acl_tensor.h
#	ggml/src/ggml-cann/aclnn_ops.cpp
#	ggml/src/ggml-cann/aclnn_ops.h
#	ggml/src/ggml-vulkan/CMakeLists.txt
#	tests/test-arg-parser.cpp
#	tests/test-backend-ops.cpp
Commit 103d60ed2c
43 changed files with 1509 additions and 1129 deletions
@@ -255,7 +255,8 @@ llama_context::llama_context(
             model.n_devices() > 1 &&
             model.params.n_gpu_layers > (int) model.hparams.n_layer &&
             model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
-            cparams.offload_kqv;
+            cparams.offload_kqv &&
+            !model.has_tensor_overrides();
 
         // pipeline parallelism requires support for async compute and events in all devices
         if (pipeline_parallel) {
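This first hunk tightens the gate on pipeline parallelism: after the change it is additionally disabled when the model has tensor buffer overrides. Reassembled as a single expression (the assignment target is inferred from the `if (pipeline_parallel)` context line, so that name and any lines above the hunk are assumptions, not part of the diff):

```cpp
// Reassembled predicate; "pipeline_parallel" is inferred from the context
// line below the hunk. Pipeline parallelism is used only when the model
// spans several devices, all layers are offloaded, the split mode is
// per-layer, KQV is offloaded, and no tensor overrides are in effect.
const bool pipeline_parallel =
    model.n_devices() > 1 &&
    model.params.n_gpu_layers > (int) model.hparams.n_layer &&
    model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
    cparams.offload_kqv &&
    !model.has_tensor_overrides();
```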
@@ -1202,33 +1203,7 @@ int llama_context::decode(llama_batch & inp_batch) {
     const int64_t n_tokens_all = batch.n_tokens;
     const int64_t n_embd       = hparams.n_embd;
 
-    // TODO: remove this stuff
-    class batch_guard {
-    public:
-        batch_guard(llama_kv_cache_unified & kv_self) : kv_slot_restorer(kv_self) {
-        }
-
-        ~batch_guard() {
-            if (!is_done) {
-                kv_slot_restorer.restore();
-            }
-        }
-
-        void done() {
-            is_done = true;
-        }
-
-        void save(const llama_kv_cache_slot_info & slot_info) {
-            kv_slot_restorer.save(slot_info);
-        }
-
-    private:
-        bool is_done = false;
-
-        llama_kv_slot_restorer kv_slot_restorer;
-    };
-
-    batch_guard bg(*kv_self);
+    llama_kv_cache_guard kv_guard(kv_self.get());
 
     GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
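The removed batch_guard is a hand-rolled scope guard: unless done() is called before the function returns, its destructor restores the KV-cache slots saved via save(). The replacement, llama_kv_cache_guard, appears to follow the same commit/rollback idiom; only its commit() call is visible in this diff (final hunk), so the sketch below uses hypothetical names rather than the real interface:

```cpp
// Minimal sketch of the commit/rollback scope-guard idiom; all names here
// are hypothetical stand-ins, not the actual llama_kv_cache_guard interface.
#include <functional>
#include <utility>

class scoped_rollback {
public:
    explicit scoped_rollback(std::function<void()> restore)
        : restore_(std::move(restore)) {}

    // non-copyable: exactly one owner decides between commit and rollback
    scoped_rollback(const scoped_rollback &) = delete;
    scoped_rollback & operator=(const scoped_rollback &) = delete;

    ~scoped_rollback() {
        if (!committed_) {
            restore_(); // error path: undo partial KV-cache modifications
        }
    }

    void commit() { committed_ = true; } // success path: keep the changes

private:
    bool committed_ = false;
    std::function<void()> restore_;
};
```

The payoff is that every early `return` in decode() rolls the cache back automatically, while the single kv_guard.commit() near the end of the function (last hunk below) marks the batch as successfully processed.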
@@ -1281,6 +1256,9 @@ int llama_context::decode(llama_batch & inp_batch) {
         return -2;
     };
 
+    // handle any pending defrags/shifts
+    kv_self_update();
+
     int64_t n_outputs_prev = 0;
 
     while (sbatch.n_tokens > 0) {
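This hunk hoists the defrag/shift handling out of the per-ubatch loop: kv_self_update() now runs once before any ubatch is processed, instead of at the top of every "find KV slot" step (that call is removed in the next hunk). A toy skeleton of the control-flow change, using stand-in functions rather than the real llama.cpp code:

```cpp
// Toy skeleton of the hoist; stand-in functions, not the llama.cpp API.
#include <cstdio>

static void kv_self_update() { std::puts("apply pending defrags/shifts"); }

int main() {
    const int n_ubatches = 3;

    kv_self_update(); // after this change: once, before the ubatch loop
    for (int i = 0; i < n_ubatches; ++i) {
        // before this change, kv_self_update() ran here on every iteration,
        // inside the "find KV slot" block (see the removal in the next hunk)
        std::printf("process ubatch %d\n", i);
    }
    return 0;
}
```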
@@ -1320,22 +1298,12 @@ int llama_context::decode(llama_batch & inp_batch) {
 
         // find KV slot
         {
-            kv_self_update();
+            if (!kv_self->find_slot(ubatch)) {
+                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
 
-            // if we have enough unused cells before the current head ->
-            //   better to start searching from the beginning of the cache, hoping to fill it
-            if (kv_self->head > kv_self->used + 2*ubatch.n_tokens) {
-                kv_self->head = 0;
+                return 1;
             }
 
-            const auto slot_info = kv_self->find_slot(ubatch);
-            if (!slot_info) {
-                LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__);
-                return -3;
-            }
-
-            bg.save(slot_info);
-
             if (!kv_self->recurrent) {
                 // a heuristic, to avoid attending the full cache if it is not yet utilized
                 // after enough generations, the benefit from this heuristic disappears
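Beyond the mechanics, this hunk changes the failure semantics of a full cache: the removed path treated a failed slot search as a fatal error (return -3 via the slot_info check), while the new path logs a warning and returns 1, which llama.h documents as a recoverable condition (could not find a KV slot for the batch). A caller-side sketch of how the two classes of return value might be handled; the retry policy here is illustrative, not from this commit:

```cpp
// Illustrative handling of llama_decode()-style return codes: 0 is success,
// positive values are recoverable warnings, negative values are fatal.
#include <cstdio>

static int handle_decode_result(int ret) {
    if (ret == 0) {
        return 0;  // success: continue generation
    }
    if (ret > 0) {
        // e.g. ret == 1: no KV slot found; try a smaller batch or free space
        std::fprintf(stderr, "decode warning %d: retry with a smaller batch\n", ret);
        return 1;
    }
    std::fprintf(stderr, "decode failed fatally with %d\n", ret);
    return -1;     // abort this generation
}
```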
@@ -1372,16 +1340,6 @@ int llama_context::decode(llama_batch & inp_batch) {
             }
         }
 
-        // update the kv ring buffer
-        {
-            kv_self->head += ubatch.n_tokens;
-
-            // Ensure kv cache head points to a valid index.
-            if (kv_self->head >= kv_self->size) {
-                kv_self->head = 0;
-            }
-        }
-
         // plot the computation graph in dot format (for debugging purposes)
         //if (n_past%100 == 0) {
         //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
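The removed block was the last piece of ring-buffer bookkeeping done directly in decode(); presumably the head advance now happens behind the cache interface touched elsewhere in this commit. Isolated as a free function, the deleted logic amounts to:

```cpp
// The deleted bookkeeping in isolation: advance the cache head by the ubatch
// size and wrap to 0 once it runs past the end of the cache.
#include <cstdint>

static void kv_head_advance(uint32_t & head, uint32_t n_tokens, uint32_t size) {
    head += n_tokens;
    if (head >= size) {
        head = 0; // ensure the head points to a valid index
    }
}
```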
@@ -1468,7 +1426,7 @@ int llama_context::decode(llama_batch & inp_batch) {
     }
 
     // finalize the batch processing
-    bg.done();
+    kv_guard.commit();
 
     // set output mappings
     {