From dd589561b42a8a2ab49ef7526bcd942335c7807c Mon Sep 17 00:00:00 2001
From: Zonghang Li
Date: Thu, 19 Jun 2025 08:02:43 +0000
Subject: [PATCH] improve the computing buffer estimate

---
 README.md           |  1 +
 common/arg.cpp      |  7 +++++
 common/common.cpp   | 11 +++----
 common/common.h     |  1 +
 common/profiler.cpp |  4 +--
 common/profiler.h   | 16 ++++++++--
 include/llama.h     |  5 +--
 src/llama.cpp       | 76 ++++++++++++++++++++++++++++++++-------------
 8 files changed, 87 insertions(+), 34 deletions(-)

diff --git a/README.md b/README.md
index c69194f0..300ffa6a 100644
--- a/README.md
+++ b/README.md
@@ -135,6 +135,7 @@ mkdir build && cd build
 cmake ..
 make -j$(nproc)
 sudo make install
+sudo ldconfig
 ```
 
 **macOS:**
diff --git a/common/arg.cpp b/common/arg.cpp
index e282c80d..f1a33372 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -765,6 +765,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.force = true;
         }
     ).set_env("LLAMA_ARG_FORCE"));
+    add_opt(llama_arg(
+        {"--master-priority"}, "N",
+        format("priority to assign workload to the master (default: %f, set 1.01 to use master first, and 0.99 to offload to other devices)", params.master_priority),
+        [](gpt_params & params, const std::string & value) {
+            params.master_priority = std::stof(value);
+        }
+    ).set_env("LLAMA_ARG_MASTER_PRIORITY"));
 // #ifdef GGML_USE_METAL
 //     // warn: if the output layer weights are not kept in metal shared memory, its mmap-ed weight data
 //     // could be released by the OS and reloaded repeatedly, which causes additional disk I/O latency.
diff --git a/common/common.cpp b/common/common.cpp
index 39b95d32..cc4536b6 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1053,7 +1053,7 @@ static bool assign_layers_to_device(
         GGML_ASSERT(!is_windows && "Windows is not tested yet\n");
 
         bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
-        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, w[m] * k, n[m] * k);
+        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, dev_info_set[0].model_bytes, w[m] > n[m]);
         int l_m = w[m] * k; // total number of layers assigned to device m
         int l_m_gpu = n[m] * k; // number of layers assigned to device m that run on GPU
 
@@ -1248,10 +1248,8 @@ static bool assign_layers_to_device(
             return cost * k;
         }
     );
-    // apply higher priority to the head device, here 0.99 is a heuristic value
-    // to ensure that small models in homogeneous clusters result in 32:0 partitioning,
-    // rather than 1:31.
-    model.lp_.col_cost_[0] *= 0.99;
+    // apply priority to the head device
+    model.lp_.col_cost_[0] *= 1.0 / cparams.master_priority;
 
     // define the variable bounds
     model.lp_.col_lower_ = std::vector<double>(n_world * 2, 0.0);
@@ -1524,7 +1522,7 @@ static bool assign_layers_to_device(
     for (uint32_t m = 0; m < n_world; ++m) {
         const device_info & dev = dev_info_set[m];
         bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
-        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, w[m], n[m]);
+        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, dev_info_set[0].model_bytes, w[m] > n[m]);
 
         if (dev.gpu_support.cuda || dev.gpu_support.metal) {
             int64_t required_mem = w[m] * b_prime;
@@ -2024,6 +2022,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.rank = params.rank;
     cparams.prefetch = params.prefetch;
     cparams.force = params.force;
+    cparams.master_priority = params.master_priority;
     cparams.keep_out_in_metal = params.keep_out_in_metal;
     cparams.n_gpu_layers = params.n_gpu_layers;
     cparams.n_cycles = params.n_cycles;
diff --git a/common/common.h b/common/common.h
index c6ffe136..cd78c173 100644
--- a/common/common.h
+++ b/common/common.h
@@ -152,6 +152,7 @@ struct gpt_params {
     bool prefetch = false; // prefetch layer weights
     bool keep_out_in_metal = true; // whether to keep output weights in metal memory, true by default
     bool force = false; // force to start prefetching after computation
+    float master_priority = 1.01; // priority to assign workload to the master (set 1.01 to use master first, and 0.99 to offload to other devices)
     int32_t gpu_mem = 999.0; // gpu memory to use, in GiB
     int32_t n_cycles = 0; // number of cycles to output one token
     int32_t n_predict = -1; // new tokens to predict
diff --git a/common/profiler.cpp b/common/profiler.cpp
index 18fe795d..292dc026 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -1603,10 +1603,10 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
 
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
     llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
-    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true, true, n_layers, n_gpu_layers);
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true, true, n_bytes, n_layers > n_gpu_layers);
 #else
     llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
-    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false, true, n_layers, n_gpu_layers);
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false, true, n_bytes, n_layers > n_gpu_layers);
 #endif
 
     double cpu_kv_size_gib = static_cast<double>(cpu_kv_size) / 1024.0 / 1024.0 / 1024.0; // convert to GiB
diff --git a/common/profiler.h b/common/profiler.h
index ff69a454..c904ef98 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -293,10 +293,20 @@ struct model_bytes {
     int64_t nb_layer;
     int64_t nb_output;
 
+    // used to estimate the compute buffer size
+    int64_t nb_output_w;
+    int64_t nb_attn_norm_w;
+    int64_t nb_ffn_gate_w;
+    int64_t nb_ffn_down_w;
+
     model_bytes() :
-        nb_input (0),
-        nb_layer (0),
-        nb_output(0) {}
+        nb_input      (0),
+        nb_layer      (0),
+        nb_output     (0),
+        nb_output_w   (0),
+        nb_attn_norm_w(0),
+        nb_ffn_gate_w (0),
+        nb_ffn_down_w (0) {}
 };
 
 struct disk_props {
diff --git a/include/llama.h b/include/llama.h
index 3c220562..c61dd851 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -327,6 +327,7 @@ extern "C" {
         uint32_t n_cycles; // number of cycles to output one token
         bool prefetch; // whether to prefetch layer weights
         bool force; // force to start prefetching after computation
+        float master_priority; // priority to assign workload to the master (set 1.01 to use master first, and 0.99 to offload to other devices)
         bool keep_out_in_metal; // whether to keep output weights in metal memory
         char * master_ip; // ip address of the master node
         char * next_node_ip; // ip address of the next node
@@ -575,8 +576,8 @@ extern "C" {
             const struct llama_context_params cparams,
             bool use_gpu,
             bool is_master,
-            int n_layers,
-            int n_gpu_layers);
+            struct model_bytes n_bytes,
+            bool offload);
 
     // Return the size of KV cache in the model
     LLAMA_API void llama_total_kv_size(
diff --git a/src/llama.cpp b/src/llama.cpp
index 8b5af567..15255ea0 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3679,6 +3679,8 @@ void llama_profile_device(
     dev_info->gpu_props.cuda_read_vram_bw = device_cuda_read_vram_bw();
     dev_info->gpu_props.metal_mem_cpy_delay = device_metal_mem_copy(model);
     dev_info->gpu_props.cuda_mem_cpy_delay = device_cuda_mem_copy(model);
+#else
+    (void)gpu_mem;
 #endif
 
     if (is_dtype_exist(n_params, GGML_TYPE_F32)) {
@@ -20263,6 +20265,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_cycles =*/ 0,
         /*.prefetch =*/ false,
         /*.force =*/ false,
+        /*.master_priority =*/ 1.01,
         /*.keep_out_in_metal =*/ true,
         /*.master_ip =*/ nullptr,
         /*.next_node_ip =*/ nullptr,
@@ -21860,8 +21863,8 @@ void llama_model_compute_buf_size(
         const struct llama_context_params cparams,
         bool use_gpu,
         bool is_master,
-        int n_layers,
-        int n_gpu_layers) {
+        struct model_bytes n_bytes,
+        bool offload) {
     const llama_hparams hparams = model->hparams;
 
     // input tensors
@@ -21872,34 +21875,61 @@ void llama_model_compute_buf_size(
     const int64_t n_bak_embd = hparams.n_embd * cparams.n_ubatch;
     const int64_t n_inp_pos = cparams.n_ubatch;
     const int64_t n_kq_mask = cparams.n_ctx * cparams.n_ubatch;
-    const int64_t n_inp_out_ids = cparams.n_ubatch;
     const int64_t n_norm = hparams.n_embd * cparams.n_ubatch;
-    const int64_t n_qcur = hparams.n_embd * cparams.n_ubatch * 2;
-    const int64_t n_kq = cparams.n_ctx * cparams.n_ubatch * hparams.n_head();
+    const int64_t n_qcur = hparams.n_embd * cparams.n_ubatch;
+    const int64_t n_ffn_gate = hparams.n_ff() * cparams.n_ubatch;
+    const int64_t n_ffn_up = hparams.n_ff() * cparams.n_ubatch;
+    const int64_t n_inp_out_ids = cparams.n_ubatch;
 
     // outputs
     const int64_t n_out_embd = hparams.n_embd * cparams.n_ubatch;
-    const int64_t n_output = hparams.n_vocab * cparams.n_ubatch;
+    const int64_t n_result = hparams.n_vocab * cparams.n_ubatch;
 
-    // compute buffer size for input, each layer, and output
-    const int64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32);
-    const int64_t n_buf_act = (n_bak_embd + n_inp_pos + n_kq_mask +
-                               n_inp_out_ids + n_norm + n_qcur + n_kq
-                              ) * ggml_type_size(GGML_TYPE_F32);
-    const int64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32);
-
-    *cpu_buf = 0;
-    *gpu_buf = 0;
-    if (is_master) *cpu_buf = n_buf_inp + n_buf_out;
+    // weights
+    const int64_t nb_output_w = n_bytes.nb_output_w;
+    const int64_t nb_attn_norm_w = n_bytes.nb_attn_norm_w;
+    const int64_t nb_ffn_gate_w = n_bytes.nb_ffn_gate_w;
+    const int64_t nb_ffn_down_w = n_bytes.nb_ffn_down_w;
+
+    const int64_t nb_act_buf_base = (n_bak_embd + n_norm + n_inp_pos + n_ffn_gate) * ggml_type_size(GGML_TYPE_F32);
+    *gpu_buf = use_gpu ? nb_act_buf_base : 0;
+    *cpu_buf = nb_act_buf_base;
+    int64_t gpu_host_buf = 0;
+
+    // estimate GPU computing buffer and GPU-host computing buffer
     if (use_gpu) {
-        *gpu_buf += n_buf_act;
-        if (n_layers > n_gpu_layers) {
-            *cpu_buf += n_buf_act;
+        if (is_master) {
+            if (offload) {
+                *gpu_buf += (n_ffn_up + n_qcur) * ggml_type_size(GGML_TYPE_F32) + nb_attn_norm_w + nb_ffn_gate_w;
+            } else {
+                *gpu_buf += (n_ffn_up + n_qcur + n_kq_mask + n_inp_out_ids) * ggml_type_size(GGML_TYPE_F32);
+            }
+            *gpu_buf += (n_out_embd + n_result) * ggml_type_size(GGML_TYPE_F32) + nb_output_w;
+            gpu_host_buf = (n_inp_toks + n_inp_embd + n_bak_embd + n_inp_pos + n_kq_mask + n_inp_out_ids + n_out_embd) * ggml_type_size(GGML_TYPE_F32);
+        } else {
+            if (offload) {
+                *gpu_buf += n_qcur * ggml_type_size(GGML_TYPE_F32) + nb_attn_norm_w + nb_ffn_gate_w + nb_ffn_down_w;
+            } else {
+                *gpu_buf += (n_ffn_up + n_kq_mask) * ggml_type_size(GGML_TYPE_F32);
+            }
+            gpu_host_buf = (n_bak_embd + n_inp_pos + n_kq_mask) * ggml_type_size(GGML_TYPE_F32);
         }
-    } else {
-        *cpu_buf += n_buf_act;
+    }
+
+    // estimate CPU computing buffer
+    {
+        if (is_master) {
+            *cpu_buf += (n_ffn_up + n_kq_mask + n_inp_out_ids + n_qcur + n_inp_toks + n_inp_embd + n_out_embd + n_result) * ggml_type_size(GGML_TYPE_F32);
+        } else {
+            *cpu_buf += (n_ffn_up + n_kq_mask) * ggml_type_size(GGML_TYPE_F32);
+        }
+        *cpu_buf += gpu_host_buf;
     }
+
+    LLAMA_LOG_INFO("%s: compute buffer size = %7.2f MiB (GPU) + %7.2f MiB (GPU-Host)\n", __func__,
+        *gpu_buf / (1024.0 * 1024.0), gpu_host_buf / (1024.0 * 1024.0));
+    LLAMA_LOG_INFO("%s: compute buffer size = %7.2f MiB (CPU)\n", __func__,
+        *cpu_buf / (1024.0 * 1024.0));
 }
 
 void llama_total_kv_size(
@@ -22045,6 +22075,7 @@ void llama_model_n_flops(
         if (blk_suffix == "attn_norm.weight" || blk_suffix == "ffn_norm.weight") {
             count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 4 * n_embd + 1); // rms norm
             count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_embd); // norm weights
+            n_bytes->nb_attn_norm_w = std::max(n_bytes->nb_attn_norm_w, (int64_t)ggml_nbytes(cur));
         } else if (blk_suffix == "attn_q.weight") {
             count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_embd);
             count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd); // rope
@@ -22062,9 +22093,11 @@ void llama_model_n_flops(
         } else if (blk_suffix == "ffn_gate.weight") {
             count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
             count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 8 * n_ff); // SiLU
+            n_bytes->nb_ffn_gate_w = std::max(n_bytes->nb_ffn_gate_w, (int64_t)ggml_nbytes(cur));
         } else if (blk_suffix == "ffn_down.weight") {
             count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
             count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_embd); // shortcut
+            n_bytes->nb_ffn_down_w = std::max(n_bytes->nb_ffn_down_w, (int64_t)ggml_nbytes(cur));
         } else if (blk_suffix == "ffn_up.weight") {
             count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
             count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_ff); // silu(gate(x)) * up(x)
@@ -22097,6 +22130,7 @@ void llama_model_n_flops(
             count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_vocab); // softmax
             count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
             count_n_bytes (n_bytes, PROFILER_LAYER_OUTPUT, ggml_nbytes(cur));
+            n_bytes->nb_output_w = std::max(n_bytes->nb_output_w, (int64_t)ggml_nbytes(cur));
         } else if (tensor_name == "rope_freqs.weight") {
             if (!rope_used) {
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));