fix compute buffer estimate: don't reserve CUDA VRAM for output layer

Zonghang Li 2025-06-27 12:42:16 +00:00
parent 3a03549fed
commit 11ce0d58f7


@@ -21359,6 +21359,16 @@ void * llama_context_setup_backend(
bool ok = true;
GGML_ASSERT(ctx->sched.size() == gf.size());
for (size_t i = 0; i < gf.size(); ++i) {
+#if defined(GGML_USE_CUDA)
+if ((cparams.rank == 0 && (i == 0 || i == gf.size() - 1))
+    || model->n_gpu_layers == 0) {
+    continue;
+}
+#elif defined(GGML_USE_METAL)
+#endif
ok = ok & ggml_backend_sched_reserve(ctx->sched[i], gf[i]);
}
if (!ok) {
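
The guard added above can be read as a predicate over the subgraph index. Below is a minimal standalone sketch of that rule, using a hypothetical helper name that is not part of this codebase, assuming that on rank 0 the first subgraph holds the input layer and the last subgraph holds the output layer, both of which run on the CPU by default:

```cpp
// Sketch only: mirrors the #if defined(GGML_USE_CUDA) guard above with a hypothetical helper.
// On rank 0, subgraph 0 (input layer) and the last subgraph (output layer) stay on the CPU,
// so no CUDA compute buffer is reserved for them; the same holds when n_gpu_layers == 0.
#include <cassert>
#include <cstddef>

static bool should_reserve_cuda(int rank, std::size_t i, std::size_t n_subgraphs, int n_gpu_layers) {
    if (rank == 0 && (i == 0 || i + 1 == n_subgraphs)) return false; // input/output subgraphs run on CPU
    if (n_gpu_layers == 0) return false;                             // nothing is offloaded at all
    return true;
}

int main() {
    assert(!should_reserve_cuda(0, 0, 4, 32)); // rank 0, first subgraph -> skip reservation
    assert(!should_reserve_cuda(0, 3, 4, 32)); // rank 0, last subgraph  -> skip reservation
    assert( should_reserve_cuda(0, 1, 4, 32)); // middle subgraph        -> reserve VRAM
    assert(!should_reserve_cuda(1, 0, 4, 0));  // no GPU layers          -> skip reservation
    return 0;
}
```
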
@@ -21963,7 +21973,8 @@ void llama_model_compute_buf_size(
(n_inp_pos + n_kq_mask) * type_size_f32 + n_v * type_size_f16 + nb_attn_norm_w,
});
}
-*gpu_buf += (n_out_embd + n_result) * type_size_f32 + nb_output_w;
+// we run the output layer on CPU by default
+// *gpu_buf += (n_out_embd + n_result) * type_size_f32 + nb_output_w;
gpu_host_buf = (n_inp_toks + n_inp_embd + n_bak_embd + n_inp_pos + n_kq_mask + n_inp_out_ids + n_out_embd) * type_size_f32;
} else {
if (has_gpu_layers) {
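
For scale, here is a back-of-the-envelope sketch of the term dropped from *gpu_buf above. The values and the meanings assigned to n_out_embd, n_result, and nb_output_w are illustrative assumptions (a 7B-class model with n_embd = 4096, n_vocab = 32000, a 512-token batch, fp16 output weight), not values taken from this code:

```cpp
// Rough estimate of the VRAM no longer counted toward the CUDA compute buffer.
// Assumed meanings: n_out_embd ~ n_embd * n_tokens (output-layer input activations),
// n_result ~ n_vocab * n_tokens (logits), nb_output_w ~ output weight matrix in bytes.
#include <cstdio>

int main() {
    const double n_embd   = 4096;
    const double n_vocab  = 32000;
    const double n_tokens = 512;

    const double type_size_f32 = 4.0;                    // bytes per fp32 element
    const double n_out_embd    = n_embd  * n_tokens;
    const double n_result      = n_vocab * n_tokens;
    const double nb_output_w   = n_embd * n_vocab * 2.0; // fp16 weight, in bytes

    const double bytes = (n_out_embd + n_result) * type_size_f32 + nb_output_w;
    std::printf("skipped GPU term ~ %.2f MiB\n", bytes / (1024.0 * 1024.0));
    return 0;
}
```

Under these assumptions the skipped term is on the order of a few hundred MiB, which is why counting it would noticeably overestimate the required CUDA VRAM when the output layer actually runs on the CPU.
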
@@ -22036,7 +22047,7 @@ void llama_model_compute_buf_size(
LLAMA_LOG_INFO("\n");
LLAMA_LOG_INFO("%s: here the compute buffer size is a predicted upper bound, not an exact value\n", __func__);
LLAMA_LOG_INFO("%s: (rank %d) compute buffer size = %7.2f MiB (GPU) + %7.2f MiB (CPU / GPU-host buffer)\n", __func__,
LLAMA_LOG_INFO("%s: (rank %d) compute buffer size = %7.2f MiB (GPU) + %7.2f MiB (CPU & GPU-host buffer)\n", __func__,
my_rank, *gpu_buf / (1024.0 * 1024.0), *cpu_buf / (1024.0 * 1024.0));
if (backend == BACKEND_CUDA) {