Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	common/common.cpp
#	examples/batched-bench/batched-bench.cpp
#	examples/batched/batched.cpp
#	examples/export-lora/export-lora.cpp
#	examples/gritlm/gritlm.cpp
#	examples/parallel/parallel.cpp
#	examples/passkey/passkey.cpp
#	examples/speculative-simple/speculative-simple.cpp
#	examples/speculative/speculative.cpp
#	ggml/src/ggml-cann/CMakeLists.txt
#	ggml/src/ggml-cann/acl_tensor.cpp
#	ggml/src/ggml-cann/acl_tensor.h
#	ggml/src/ggml-cann/aclnn_ops.cpp
#	ggml/src/ggml-cann/aclnn_ops.h
#	ggml/src/ggml-vulkan/CMakeLists.txt
#	tests/test-arg-parser.cpp
#	tests/test-backend-ops.cpp
Concedo 2025-04-03 18:57:49 +08:00
commit 103d60ed2c
43 changed files with 1509 additions and 1129 deletions

src/llama-context.cpp

@@ -255,7 +255,8 @@ llama_context::llama_context(
         model.n_devices() > 1 &&
         model.params.n_gpu_layers > (int) model.hparams.n_layer &&
         model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
-        cparams.offload_kqv;
+        cparams.offload_kqv &&
+        !model.has_tensor_overrides();

     // pipeline parallelism requires support for async compute and events in all devices
     if (pipeline_parallel) {

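Note: the hunk only shows the tail of the expression it modifies, so for readability here is how the full gating condition is assumed to read after the merge (the `const bool pipeline_parallel =` declaration sits just above the hunk and is taken to be unchanged):

    // Pipeline parallelism is enabled only when every term holds; the newly
    // added !model.has_tensor_overrides() term disables it when per-tensor
    // buffer-type overrides are in use.
    const bool pipeline_parallel =
        model.n_devices() > 1 &&
        model.params.n_gpu_layers > (int) model.hparams.n_layer &&
        model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
        cparams.offload_kqv &&
        !model.has_tensor_overrides();
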
@@ -1202,33 +1203,7 @@ int llama_context::decode(llama_batch & inp_batch) {
     const int64_t n_tokens_all = batch.n_tokens;
     const int64_t n_embd       = hparams.n_embd;

-    // TODO: remove this stuff
-    class batch_guard {
-    public:
-        batch_guard(llama_kv_cache_unified & kv_self) : kv_slot_restorer(kv_self) {
-        }
-
-        ~batch_guard() {
-            if (!is_done) {
-                kv_slot_restorer.restore();
-            }
-        }
-
-        void done() {
-            is_done = true;
-        }
-
-        void save(const llama_kv_cache_slot_info & slot_info) {
-            kv_slot_restorer.save(slot_info);
-        }
-
-    private:
-        bool is_done = false;
-
-        llama_kv_slot_restorer kv_slot_restorer;
-    };
-
-    batch_guard bg(*kv_self);
+    llama_kv_cache_guard kv_guard(kv_self.get());

     GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

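Note: both the removed local batch_guard and the llama_kv_cache_guard that replaces it are instances of the same commit-or-rollback RAII idiom. The sketch below illustrates that idiom in isolation; the class name scope_rollback and its callback-based design are invented for this example and are not the library's API.

    #include <functional>
    #include <utility>

    // Generic commit-or-rollback guard: if the enclosing scope exits before
    // commit() is called (e.g. via an early return on an error path), the
    // rollback action runs and undoes any partially applied changes.
    class scope_rollback {
    public:
        explicit scope_rollback(std::function<void()> rollback)
            : rollback_(std::move(rollback)) {}

        ~scope_rollback() {
            if (!committed_) {
                rollback_();
            }
        }

        void commit() { committed_ = true; }  // success path: keep the changes

    private:
        bool committed_ = false;
        std::function<void()> rollback_;
    };

Moving from an ad-hoc class defined inside decode() to a named guard owned by the KV-cache module keeps the rollback logic next to the state it protects, which appears to be the point of this part of the merge.
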
@@ -1281,6 +1256,9 @@ int llama_context::decode(llama_batch & inp_batch) {
         return -2;
     };

+    // handle any pending defrags/shifts
+    kv_self_update();
+
     int64_t n_outputs_prev = 0;

     while (sbatch.n_tokens > 0) {

@@ -1320,22 +1298,12 @@ int llama_context::decode(llama_batch & inp_batch) {
         // find KV slot
         {
-            kv_self_update();
-
-            // if we have enough unused cells before the current head ->
-            //   better to start searching from the beginning of the cache, hoping to fill it
-            if (kv_self->head > kv_self->used + 2*ubatch.n_tokens) {
-                kv_self->head = 0;
-            }
-
-            const auto slot_info = kv_self->find_slot(ubatch);
-            if (!slot_info) {
-                LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__);
-                return -3;
-            }
-
-            bg.save(slot_info);
+            if (!kv_self->find_slot(ubatch)) {
+                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+
+                return 1;
+            }

             if (!kv_self->recurrent) {
                 // a heuristic, to avoid attending the full cache if it is not yet utilized
                 // after enough generations, the benefit from this heuristic disappears

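Note: with this change a failed slot search surfaces as the non-fatal return code 1 (previously -3, a hard error). The sketch below shows one way a caller might handle the resulting llama_decode() codes, following the 0 / positive-warning / negative-error convention documented in llama.h; the helper name decode_with_retry and the logging are illustrative only.

    #include "llama.h"

    #include <cstdio>

    // Sketch only: 0 means the batch was decoded, a positive value is a
    // warning (e.g. no KV-cache slot was found, so the caller may retry with
    // a smaller batch), and a negative value is a fatal error.
    static int decode_with_retry(llama_context * ctx, llama_batch batch) {
        const int ret = llama_decode(ctx, batch);

        if (ret == 0) {
            return 0;   // decoded, KV cache updated
        }

        if (ret > 0) {
            // recoverable: shrink the batch or free up context, then retry
            fprintf(stderr, "no KV slot for %d tokens, retrying with a smaller batch may help\n",
                    (int) batch.n_tokens);
            return ret;
        }

        // unrecoverable: abort processing of this batch
        fprintf(stderr, "llama_decode failed with error %d\n", ret);
        return ret;
    }
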
@@ -1372,16 +1340,6 @@ int llama_context::decode(llama_batch & inp_batch) {
             }
         }

-        // update the kv ring buffer
-        {
-            kv_self->head += ubatch.n_tokens;
-
-            // Ensure kv cache head points to a valid index.
-            if (kv_self->head >= kv_self->size) {
-                kv_self->head = 0;
-            }
-        }
-
         // plot the computation graph in dot format (for debugging purposes)
         //if (n_past%100 == 0) {
         //    ggml_graph_dump_dot(gf, NULL, "llama.dot");

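Note: the block deleted here was the last place decode() adjusted the cache head directly; judging by the rest of this diff, the same wraparound bookkeeping is now handled inside the KV-cache implementation. The struct below is a simplified stand-in written for illustration, not the library's type.

    #include <cstdint>

    // Minimal ring-buffer head advance mirroring the removed lines: after
    // writing n_tokens cells, move the head forward and wrap it back to 0
    // once it passes the end so it always indexes a valid cell.
    struct kv_ring {
        uint32_t head = 0;   // next write position
        uint32_t size = 0;   // total number of cells

        void advance(uint32_t n_tokens) {
            head += n_tokens;
            if (head >= size) {
                head = 0;
            }
        }
    };
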
@@ -1468,7 +1426,7 @@ int llama_context::decode(llama_batch & inp_batch) {
     }

     // finalize the batch processing
-    bg.done();
+    kv_guard.commit();

     // set output mappings
     {