graph : fix nkvo offload with FA (#19105)

This commit is contained in:
Georgi Gerganov 2026-01-26 20:18:34 +02:00 committed by GitHub
parent 142cbe2ac6
commit 8f80d1b254
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 5 additions and 7 deletions

View file

@@ -2173,13 +2173,6 @@ llm_graph_cb llama_context::graph_get_cb() const {
ggml_set_name(cur, name);
}
if (!cparams.offload_kqv) {
if (strcmp(name, "kqv_merged_cont") == 0) {
// all nodes between the KV store and the attention output are run on the CPU
ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu);
}
}
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
// FIXME: fix in ggml_backend_sched
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;