From 11ce0d58f7dbe1f3fa328fec9a7c68a810464e2d Mon Sep 17 00:00:00 2001
From: Zonghang Li
Date: Fri, 27 Jun 2025 12:42:16 +0000
Subject: [PATCH] fix compute buffer estimate: don't reserve CUDA VRAM for
 output layer

---
 src/llama.cpp | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index abda2327..01cf82a2 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -20976,7 +20976,7 @@ void * llama_context_setup_backend(
     auto & cparams = ctx->cparams;
 
     std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
-    cparams.prefetch = params.prefetch;
+    cparams.prefetch         = params.prefetch;
     cparams.n_seq_max        = std::max(1u, params.n_seq_max);
     cparams.n_threads        = params.n_threads;
     cparams.n_threads_batch  = params.n_threads_batch;
@@ -21359,6 +21359,16 @@ void * llama_context_setup_backend(
         bool ok = true;
         GGML_ASSERT(ctx->sched.size() == gf.size());
         for (size_t i = 0; i < gf.size(); ++i) {
+
+#if defined(GGML_USE_CUDA)
+            if ((cparams.rank == 0 && (i == 0 || i == gf.size() - 1))
+                || model->n_gpu_layers == 0) {
+                continue;
+            }
+#elif defined(GGML_USE_METAL)
+
+#endif
+
             ok = ok & ggml_backend_sched_reserve(ctx->sched[i], gf[i]);
         }
         if (!ok) {
@@ -21963,7 +21973,8 @@ void llama_model_compute_buf_size(
                 (n_inp_pos + n_kq_mask) * type_size_f32 + n_v * type_size_f16 + nb_attn_norm_w,
             });
         }
-        *gpu_buf += (n_out_embd + n_result) * type_size_f32 + nb_output_w;
+        // we run the output layer on CPU by default
+        // *gpu_buf += (n_out_embd + n_result) * type_size_f32 + nb_output_w;
         gpu_host_buf = (n_inp_toks + n_inp_embd + n_bak_embd + n_inp_pos + n_kq_mask + n_inp_out_ids + n_out_embd) * type_size_f32;
     } else {
         if (has_gpu_layers) {
@@ -22036,7 +22047,7 @@
 
     LLAMA_LOG_INFO("\n");
     LLAMA_LOG_INFO("%s: here the compute buffer size is a predicted upper bound, not an exact value\n", __func__);
-    LLAMA_LOG_INFO("%s: (rank %d) compute buffer size = %7.2f MiB (GPU) + %7.2f MiB (CPU / GPU-host buffer)\n", __func__,
+    LLAMA_LOG_INFO("%s: (rank %d) compute buffer size = %7.2f MiB (GPU) + %7.2f MiB (CPU & GPU-host buffer)\n", __func__,
         my_rank, *gpu_buf / (1024.0 * 1024.0), *cpu_buf / (1024.0 * 1024.0));
 
     if (backend == BACKEND_CUDA) {
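
The core of the change is the compute-buffer estimate: since the output layer runs on the CPU by default, its term is no longer added to *gpu_buf, and the scheduler-reserve loop under CUDA skips rank 0's first and last graph so no VRAM is reserved for them. Below is a minimal standalone sketch of the dropped term; the dimensions and the f16 output-weight size are hypothetical values for illustration only, not taken from the patch.

// sketch only: recompute the (n_out_embd + n_result) * type_size_f32 + nb_output_w
// term from the hunk above with made-up sizes, to show roughly how much VRAM is
// no longer reserved for the output layer
#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t n_out_embd    = 4096;                   // hypothetical hidden size
    const uint64_t n_result      = 32000;                  // hypothetical vocab size (logits)
    const uint64_t nb_output_w   = 4096ull * 32000ull * 2; // hypothetical f16 output weight bytes
    const uint64_t type_size_f32 = 4;

    // with this patch, the amount below is counted toward the CPU & GPU-host
    // buffer estimate instead of *gpu_buf
    const uint64_t output_layer_bytes =
        (n_out_embd + n_result) * type_size_f32 + nb_output_w;

    std::printf("output layer compute buffer ~= %.2f MiB\n",
                output_layer_bytes / (1024.0 * 1024.0));
    return 0;
}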