diff --git a/common/common.cpp b/common/common.cpp
index 4f1ddff3..c241b54f 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1081,7 +1081,7 @@ static bool assign_layers_to_device(
         bool is_windows = strcmp(dev.device_os, "Windows") == 0;
         GGML_ASSERT(!is_windows && "Windows is not tested yet\n");
 
-        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, get_backend_type(dev.gpu_support), m == 0, dev_info_set[0].model_bytes, w[m] > n[m], n[m] > 0);
+        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, get_backend_type(dev.gpu_support), m, dev_info_set[0].model_bytes, w[m] > n[m], n[m] > 0);
 
         int l_m     = w[m] * k; // total number of layers assigned to device m
         int l_m_gpu = n[m] * k; // number of layers assigned to device m that run on GPU
@@ -1242,6 +1242,7 @@ static bool assign_layers_to_device(
             if (dev.gpu_support.metal && m == 0 && cparams.keep_out_in_metal) {
                 vec_z_gpu[m] -= (double)bo / (double)(n_layer * b_prime);
             }
+            vec_z_gpu[m] = std::max(vec_z_gpu[m], 0.0f);
         }
     }
 
@@ -1554,7 +1555,7 @@ static bool assign_layers_to_device(
     for (uint32_t m = 0; m < n_world; ++m) {
         const device_info & dev = dev_info_set[m];
         bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
-        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, get_backend_type(dev.gpu_support), m == 0, dev_info_set[0].model_bytes, true);
+        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, get_backend_type(dev.gpu_support), m, dev_info_set[0].model_bytes, true);
 
         if (dev.gpu_support.cuda || dev.gpu_support.metal) {
             int64_t required_mem = w[m] * b_prime;
diff --git a/common/profiler.cpp b/common/profiler.cpp
index c788a3e6..afc427f1 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -924,7 +924,7 @@ static void check_env_path() {
     setenv("PATH", update_env_path.c_str(), 1);
 }
 
-static void external_fio_impl(float * read_bw, float * write_bw, bool op_rand, int n_threads) {
+static void external_fio_impl(float * read_bw, float * write_bw, bool op_rand, int n_threads) {
     pid_t pid = getpid(); // avoid conflict with other processes
     std::string test_file = "fio_test_" + std::to_string(pid);
 
@@ -1610,13 +1610,13 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
 #elif GGML_USE_CUDA
     backend = BACKEND_CUDA;
 #endif
-    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, backend, true, n_bytes, n_layers > n_gpu_layers, n_gpu_layers > 0);
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, backend, 0, n_bytes, n_layers > n_gpu_layers, n_gpu_layers > 0);
 #else
     llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
     enum backend_type backend = BACKEND_CPU;
-    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, backend, true, n_bytes, n_layers > n_gpu_layers, n_gpu_layers > 0);
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, backend, 0, n_bytes, n_layers > n_gpu_layers, n_gpu_layers > 0);
 #endif
 
     double cpu_kv_size_gib = static_cast<double>(cpu_kv_size) / 1024.0 / 1024.0 / 1024.0; // convert to GiB
diff --git a/common/profiler.h b/common/profiler.h
index c904ef98..00407c96 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -295,18 +295,22 @@ struct model_bytes {
 
     // used to estimate the compute buffer size
     int64_t nb_output_w;
+    int64_t nb_output_norm_w;
     int64_t nb_attn_norm_w;
+    int64_t nb_attn_q_w;
     int64_t nb_ffn_gate_w;
     int64_t nb_ffn_down_w;
 
     model_bytes() :
-        nb_input      (0),
-        nb_layer      (0),
-        nb_output     (0),
-        nb_output_w   (0),
-        nb_attn_norm_w(0),
-        nb_ffn_gate_w (0),
-        nb_ffn_down_w (0) {}
+        nb_input        (0),
+        nb_layer        (0),
+        nb_output       (0),
+        nb_output_w     (0),
+        nb_output_norm_w(0),
+        nb_attn_norm_w  (0),
+        nb_attn_q_w     (0),
+        nb_ffn_gate_w   (0),
+        nb_ffn_down_w   (0) {}
 };
 
 struct disk_props {
diff --git a/include/llama.h b/include/llama.h
index fc42856c..2faecdfe 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -585,7 +585,7 @@ extern "C" {
             const struct llama_model * model,
             const struct llama_context_params cparams,
             enum backend_type backend,
-            bool is_master,
+            int my_rank,
             struct model_bytes n_bytes,
             bool offload,
             bool has_gpu_layers);
diff --git a/src/llama.cpp b/src/llama.cpp
index eb551280..de49cdce 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -21863,11 +21863,12 @@ void llama_model_compute_buf_size(
         const struct llama_model * model,
         const struct llama_context_params cparams,
         enum backend_type backend,
-        bool is_master,
+        int my_rank,
         struct model_bytes n_bytes,
         bool offload,
         bool has_gpu_layers) {
     const llama_hparams hparams = model->hparams;
+    bool is_master = my_rank == 0;
 
     // input tensors
     const int64_t n_inp_toks = cparams.n_ubatch;
@@ -21884,6 +21885,8 @@ void llama_model_compute_buf_size(
     const int64_t n_ffn_out = hparams.n_embd * cparams.n_ubatch;
     const int64_t n_ffn_inp = hparams.n_embd * cparams.n_ubatch;
     const int64_t n_kq = cparams.n_ctx * cparams.n_ubatch * hparams.n_head();
+    const int64_t n_k = cparams.n_ctx * hparams.n_embd_head_k * hparams.n_head_kv();
+    const int64_t n_v = cparams.n_ctx * hparams.n_embd_head_v * hparams.n_head_kv();
     const int64_t n_inp_out_ids = cparams.n_ubatch;
 
     // outputs
@@ -21893,34 +21896,60 @@ void llama_model_compute_buf_size(
     // weights
     const int64_t nb_output_w    = n_bytes.nb_output_w;
     const int64_t nb_attn_norm_w = n_bytes.nb_attn_norm_w;
-    const int64_t nb_ffn_gate_w  = n_bytes.nb_ffn_gate_w;
-    const int64_t nb_ffn_down_w  = n_bytes.nb_ffn_down_w;
+    const int64_t nb_attn_q_w    = n_bytes.nb_attn_q_w;
 
+    // format bytes
     const int64_t type_size_f32 = ggml_type_size(GGML_TYPE_F32);
+    const int64_t type_size_f16 = ggml_type_size(GGML_TYPE_F16);
 
     bool use_gpu = backend != BACKEND_CPU && has_gpu_layers;
     *gpu_buf = 0;
     *cpu_buf = 0;
     int64_t gpu_host_buf = 0;
 
+    // GPU compute buffer
+    // estimate the GPU compute buffer, here we can only estimate the upper bound of various models,
+    // but cannot estimate the exact value.
     if (backend == BACKEND_CUDA) {
-        const int64_t nb_act_buf_base = (n_bak_embd + n_norm + n_inp_pos + n_ffn_gate) * type_size_f32;
-        *gpu_buf = use_gpu ? nb_act_buf_base : 0;
+        *gpu_buf = (n_bak_embd + n_norm) * type_size_f32;
 
-        // CUDA computing buffer and CUDA-host buffer
         if (is_master) {
-            if (offload) {
-                *gpu_buf += (n_ffn_up + n_qcur) * type_size_f32 + nb_attn_norm_w + nb_ffn_gate_w;
+            if (has_gpu_layers) {
+                if (offload) {
+                    *gpu_buf += std::max({
+                        (n_qcur + n_inp_pos + n_kq_mask + n_inp_out_ids) * type_size_f32 + nb_attn_norm_w,
+                        (n_qcur + n_inp_pos + n_norm) * type_size_f32 + nb_attn_norm_w,
+                        (n_qcur + n_qcur + n_kq_mask + n_inp_pos) * type_size_f32,
+                        (n_qcur + n_qcur + n_inp_pos) * type_size_f32 + nb_attn_q_w,
+                        n_inp_pos * type_size_f32 + (n_k + n_v) * type_size_f16 + nb_attn_q_w
+                    });
+                } else {
+                    *gpu_buf += (n_qcur + n_inp_pos + n_kq_mask + n_inp_out_ids) * type_size_f32;
+                }
+                *gpu_buf += (n_qcur + n_kq) * type_size_f32;
             } else {
-                *gpu_buf += (n_ffn_up + n_qcur + n_kq_mask + n_inp_out_ids) * type_size_f32;
+                *gpu_buf += (n_qcur + n_kq) * type_size_f32;
+                *gpu_buf += std::max({
+                    (n_kq_mask + n_qcur + n_inp_pos) * type_size_f32 + nb_attn_norm_w,
+                    (n_inp_pos + n_kq_mask) * type_size_f32 + n_v * type_size_f16 + nb_attn_norm_w,
+                });
             }
             *gpu_buf += (n_out_embd + n_result) * type_size_f32 + nb_output_w;
             gpu_host_buf = (n_inp_toks + n_inp_embd + n_bak_embd + n_inp_pos + n_kq_mask + n_inp_out_ids + n_out_embd) * type_size_f32;
         } else {
-            if (offload) {
-                *gpu_buf += n_qcur * type_size_f32 + nb_attn_norm_w + nb_ffn_gate_w + nb_ffn_down_w;
+            if (has_gpu_layers) {
+                if (offload) {
+                    *gpu_buf += (n_kq + n_qcur) * type_size_f32;
+                    *gpu_buf += std::max({
+                        (n_inp_pos + n_norm + n_kq_mask) * type_size_f32 + nb_attn_norm_w,
+                        (n_inp_pos + n_norm + n_qcur) * type_size_f32 + nb_attn_norm_w,
+                        n_inp_pos * type_size_f32 + (n_k + n_v) * type_size_f16 + nb_attn_q_w,
+                    });
+                } else {
+                    *gpu_buf += (n_inp_pos + n_kq_mask + n_qcur + n_qcur + n_kq) * type_size_f32;
+                }
             } else {
-                *gpu_buf += (n_ffn_up + n_kq_mask) * type_size_f32;
+                *gpu_buf += (n_qcur + n_kq + n_kq_mask + n_qcur + n_inp_pos) * type_size_f32 + nb_attn_norm_w;
            }
             gpu_host_buf = (n_bak_embd + n_inp_pos + n_kq_mask) * type_size_f32;
         }
@@ -21964,19 +21993,22 @@ void llama_model_compute_buf_size(
         GGML_ASSERT(false && "Unsupported backend type for compute buffer estimation.\n");
     }
 
-    // CPU computing buffer
+    // CPU compute buffer
     if (*cpu_buf == 0) {
         *cpu_buf = (n_inp_pos + n_kq_mask + n_bak_embd + n_norm) * type_size_f32;
         if (is_master) {
             *cpu_buf += (n_inp_toks + n_inp_embd + n_inp_out_ids + n_out_embd + n_result) * type_size_f32;
         }
         *cpu_buf += std::max(n_ffn_gate + n_ffn_up, n_qcur + n_qcur + n_kq) * type_size_f32;
+        *cpu_buf += gpu_host_buf;
     }
 
-    LLAMA_LOG_INFO("%s: compute buffer size = %7.2f MiB (GPU) + %7.2f MiB (GPU-Host)\n", __func__,
-        *gpu_buf / (1024.0 * 1024.0), gpu_host_buf / (1024.0 * 1024.0));
-    LLAMA_LOG_INFO("%s: compute buffer size = %7.2f MiB (CPU)\n", __func__,
-        *cpu_buf / (1024.0 * 1024.0));
+    LLAMA_LOG_INFO("\n");
+    LLAMA_LOG_INFO("%s: here the compute buffer size is a predicted upper bound, not an exact value\n", __func__);
+    LLAMA_LOG_INFO("%s: (rank %d) compute buffer size = %7.2f MiB (GPU) + %7.2f MiB (GPU-Host)\n", __func__,
+        my_rank, *gpu_buf / (1024.0 * 1024.0), gpu_host_buf / (1024.0 * 1024.0));
+    LLAMA_LOG_INFO("%s: (rank %d) compute buffer size = %7.2f MiB (CPU and GPU-Host buffer)\n", __func__,
+        my_rank, *cpu_buf / (1024.0 * 1024.0));
 }
 
 void llama_total_kv_size(
@@ -22126,6 +22158,7 @@ void llama_model_n_flops(
                 } else if (blk_suffix == "attn_q.weight") {
                     count_n_flops (n_flops, cur->type,     PROFILER_LAYER_BACKEND, 2 * n_embd * n_embd);
                     count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd); // rope
+                    n_bytes->nb_attn_q_w = std::max(n_bytes->nb_attn_q_w, (int64_t)ggml_nbytes(cur));
                 } else if (blk_suffix == "attn_k.weight") {
                     count_n_flops (n_flops, cur->type,     PROFILER_LAYER_BACKEND, 2 * n_embd * n_head_kv * n_embd_head_k);
                     count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd_head_k * n_head_kv); // rope
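
Editor's note: the llama.cpp hunks above replace the old single closed-form estimate with the maximum over several candidate peak activation sets, so the logged compute buffer size is a predicted upper bound rather than an exact value. The standalone sketch below is not part of the patch; it uses hypothetical element counts and only illustrates that max-of-candidates pattern.

// illustrative_upper_bound.cpp -- illustrative sketch only, not from the patch.
// Hypothetical element counts stand in for the real tensor sizes; the point is
// the std::max({...}) pattern used by llama_model_compute_buf_size above.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_ubatch  = 512;                 // hypothetical micro-batch size
    const int64_t n_embd    = 4096;                // hypothetical embedding width
    const int64_t n_ctx     = 8192;                // hypothetical context length
    const int64_t f32       = 4;                   // bytes per fp32 element

    const int64_t n_qcur    = n_embd * n_ubatch;   // Q activations
    const int64_t n_kq      = n_ctx  * n_ubatch;   // attention scores (simplified, one head)
    const int64_t n_kq_mask = n_ctx  * n_ubatch;   // attention mask

    // Different graph stages peak with different sets of tensors alive at once;
    // the compute buffer only has to fit the largest of those sets.
    const int64_t upper_bound =
        std::max({ (n_qcur + n_kq_mask) * f32,     // stage A: Q + mask resident
                   (n_qcur + n_qcur)    * f32,     // stage B: Q before/after RoPE
                   (n_kq   + n_qcur)    * f32 });  // stage C: scores + Q resident

    std::printf("estimated compute buffer upper bound: %.2f MiB\n",
                upper_bound / (1024.0 * 1024.0));
    return 0;
}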