diff --git a/common/arg.cpp b/common/arg.cpp
index f1a33372..4039b19c 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -749,6 +749,9 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         format("maximum GPU memory to use (default: %d)", params.gpu_mem),
         [](gpt_params & params, int value) {
             params.gpu_mem = value; // in GiB
+            if (value == 0) {
+                LOG_WRN("WARN: Setting --gpu-mem to 0 may lead to errors during workload distribution.\n");
+            }
         }
     ).set_env("LLAMA_ARG_CUDA_MEM"));
     add_opt(llama_arg(
diff --git a/common/common.cpp b/common/common.cpp
index cc4536b6..4f1ddff3 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -847,6 +847,16 @@ static std::string vec_to_str(const std::vector & vec) {
     return oss.str();
 }
 
+static backend_type get_backend_type(const gpu_support & support) {
+    if (support.cuda)    return BACKEND_CUDA;
+    if (support.metal)   return BACKEND_METAL;
+    if (support.vulkan)  return BACKEND_VULKAN;
+    if (support.kompute) return BACKEND_KOMPUTE;
+    if (support.gpublas) return BACKEND_GPUBLAS;
+    if (support.sycl)    return BACKEND_SYCL;
+    return BACKEND_CPU;
+}
+
 static bool assign_layers_to_device(
     uint32_t n_world,
     const device_info * dev_info_set,
@@ -972,7 +982,7 @@ static bool assign_layers_to_device(
         bool is_android = strcmp(dev.device_os, "Android") == 0;
         bool is_windows = strcmp(dev.device_os, "Windows") == 0;
         GGML_ASSERT(!is_windows && "Windows is not tested yet\n");
-        
+
         if ((is_macos && !dev.gpu_support.metal) || is_linux) {
             mem_budget[m] = dev.memory.available_physical;
         } else if (is_macos && dev.gpu_support.metal) {
@@ -985,11 +995,21 @@ static bool assign_layers_to_device(
         }
     }
 
-    // initialize w_m proportionally to memory budget and n_m to 0
+    // initialize w_m proportionally to memory budget
    float total_mem_budget = std::accumulate(mem_budget.begin(), mem_budget.end(), 0.0f);
    for (uint32_t m = 0; m < n_world; ++m) {
        w[m] = std::round(mem_budget[m] / total_mem_budget * n_layer);
-        n[m] = 0;
+    }
+    // no 0 is allowed in w, it must be at least 1
+    for (uint32_t m = 0; m < n_world; ++m) {
+        if (w[m] == 0) {
+            w[m] = 1;
+            // find the maximum and decrease it by 1
+            auto max_it = std::max_element(w.begin(), w.end());
+            if (max_it != w.end() && *max_it > 1) {
+                *max_it -= 1;
+            }
+        }
    }
    // adjust w[m] to ensure L mod W = 0
    int diff = n_layer - std::accumulate(w.begin(), w.end(), 0);
@@ -997,6 +1017,15 @@ static bool assign_layers_to_device(
                                 : std::min_element(mem_budget.begin(), mem_budget.end());
    w[std::distance(mem_budget.begin(), device)] += diff;
 
+    // initialize n_m to w_m (if there is GPU), assume all layers can run on GPU
+    for (uint32_t m = 0; m < n_world; ++m) {
+        if (dev_info_set[m].gpu_support.metal || dev_info_set[m].gpu_support.cuda) {
+            n[m] = w[m];
+        } else {
+            n[m] = 0;
+        }
+    }
+
    // stores the actual read bandwidth (GB/s) for each device
    std::vector<float> disk_speed(n_world, 0.0f);
    for (uint32_t m = 0; m < n_world; ++m) {
@@ -1052,8 +1081,7 @@ static bool assign_layers_to_device(
         bool is_windows = strcmp(dev.device_os, "Windows") == 0;
         GGML_ASSERT(!is_windows && "Windows is not tested yet\n");
 
-        bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
-        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, dev_info_set[0].model_bytes, w[m] > n[m]);
+        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, get_backend_type(dev.gpu_support), m == 0, dev_info_set[0].model_bytes, w[m] > n[m], n[m] > 0);
 
         int l_m     = w[m] * k; // total number of layers assigned to device m
         int l_m_gpu = n[m] * k; // number of layers assigned to device m that run on GPU
@@ -1424,14 +1452,18 @@ static bool assign_layers_to_device(
             if (n_m < static_cast<int>(std::floor(W * vec_z_gpu[m]))) {
                 // if there is still free GPU memory
                 has_free_gpu_memory = true;
+                LOG_INF("Device %d still has free GPU memory: w_m = %d, n_m = %d, W * vec_z_gpu[m] = %d\n",
+                    m, w_m, n_m, static_cast<int>(std::floor(W * vec_z_gpu[m])));
             }
             if (w_m > n_m) {
                 // if layers are offloaded to CPU
                 has_gpu_overload = true;
+                LOG_INF("Device %d has GPU overload: w_m = %d, n_m = %d\n", m, w_m, n_m);
             }
         } else if (!in_set(m, M4)) {
             // if the CPU is overloaded
             has_cpu_overload = true;
+            LOG_INF("Device %d has CPU overload.\n", m);
         }
     }
 
@@ -1522,7 +1554,7 @@ static bool assign_layers_to_device(
    for (uint32_t m = 0; m < n_world; ++m) {
        const device_info & dev = dev_info_set[m];
        bool use_gpu = dev.gpu_support.metal || dev.gpu_support.cuda;
-        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, use_gpu, m == 0, dev_info_set[0].model_bytes);
+        llama_model_compute_buf_size(&c_cpu[m], &c_gpu[m], model, cparams, get_backend_type(dev.gpu_support), m == 0, dev_info_set[0].model_bytes, true);
 
        if (dev.gpu_support.cuda || dev.gpu_support.metal) {
            int64_t required_mem = w[m] * b_prime;
diff --git a/common/profiler.cpp b/common/profiler.cpp
index 292dc026..c788a3e6 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -1603,10 +1603,20 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
 
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
    llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
-    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true, true, n_bytes, n_layers > n_gpu_layers);
+
+    enum backend_type backend;
+#if defined(GGML_USE_METAL)
+    backend = BACKEND_METAL;
+#elif defined(GGML_USE_CUDA)
+    backend = BACKEND_CUDA;
+#endif
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, backend, true, n_bytes, n_layers > n_gpu_layers, n_gpu_layers > 0);
+
 #else
    llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
-    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false, true, n_bytes, n_layers > n_gpu_layers);
+
+    enum backend_type backend = BACKEND_CPU;
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, backend, true, n_bytes, n_layers > n_gpu_layers, n_gpu_layers > 0);
 #endif
 
    double cpu_kv_size_gib = static_cast<double>(cpu_kv_size) / 1024.0 / 1024.0 / 1024.0; // convert to GiB
diff --git a/include/llama.h b/include/llama.h
index c61dd851..fc42856c 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -67,6 +67,16 @@ extern "C" {
    typedef int32_t llama_token;
    typedef int32_t llama_seq_id;
 
+    enum backend_type {
+        BACKEND_CPU     = 0,
+        BACKEND_CUDA    = 1,
+        BACKEND_METAL   = 2,
+        BACKEND_VULKAN  = 3,
+        BACKEND_KOMPUTE = 4,
+        BACKEND_GPUBLAS = 5,
+        BACKEND_SYCL    = 6
+    };
+
    enum llama_vocab_type {
        LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
        LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
@@ -574,10 +584,11 @@ extern "C" {
             int64_t * gpu_buf,
             const struct llama_model * model,
             const struct llama_context_params cparams,
-             bool use_gpu,
+             enum backend_type backend,
             bool is_master,
             struct model_bytes n_bytes,
-             bool offload);
+             bool offload,
+             bool has_gpu_layers);
 
    // Return the size of KV cache in the model
    LLAMA_API void llama_total_kv_size(
diff --git a/src/llama.cpp b/src/llama.cpp
index 15255ea0..eb551280 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3673,6 +3673,7 @@ void llama_profile_device(
    // reserved/limit memory to avoid potential OOM, default to 300 MiB
    dev_info->gpu_props.memory_free  = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100;
    dev_info->gpu_props.memory_free  = std::min((float)gpu_mem, dev_info->gpu_props.memory_free) - 0.3;
+    dev_info->gpu_props.memory_free  = std::max(dev_info->gpu_props.memory_free, 0.0f);
    dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
 
    dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw();
@@ -21861,10 +21862,11 @@ void llama_model_compute_buf_size(
        int64_t * gpu_buf,
        const struct llama_model * model,
        const struct llama_context_params cparams,
-        bool use_gpu,
+        enum backend_type backend,
        bool is_master,
        struct model_bytes n_bytes,
-        bool offload) {
+        bool offload,
+        bool has_gpu_layers) {
    const llama_hparams hparams = model->hparams;
 
    // input tensors
@@ -21879,6 +21881,9 @@ void llama_model_compute_buf_size(
    const int64_t n_qcur        = hparams.n_embd * cparams.n_ubatch;
    const int64_t n_ffn_gate    = hparams.n_ff() * cparams.n_ubatch;
    const int64_t n_ffn_up      = hparams.n_ff() * cparams.n_ubatch;
+    const int64_t n_ffn_out     = hparams.n_embd * cparams.n_ubatch;
+    const int64_t n_ffn_inp     = hparams.n_embd * cparams.n_ubatch;
+    const int64_t n_kq          = cparams.n_ctx * cparams.n_ubatch * hparams.n_head();
    const int64_t n_inp_out_ids = cparams.n_ubatch;
 
    // outputs
@@ -21890,40 +21895,82 @@ void llama_model_compute_buf_size(
    const int64_t nb_attn_norm_w = n_bytes.nb_attn_norm_w;
    const int64_t nb_ffn_gate_w  = n_bytes.nb_ffn_gate_w;
    const int64_t nb_ffn_down_w  = n_bytes.nb_ffn_down_w;
-    
-    const int64_t nb_act_buf_base = (n_bak_embd + n_norm + n_inp_pos + n_ffn_gate) * ggml_type_size(GGML_TYPE_F32);
-    *gpu_buf = use_gpu ? nb_act_buf_base : 0;
-    *cpu_buf = nb_act_buf_base;
+
+    const int64_t type_size_f32 = ggml_type_size(GGML_TYPE_F32);
+
+    bool use_gpu = backend != BACKEND_CPU && has_gpu_layers;
+    *gpu_buf = 0;
+    *cpu_buf = 0;
    int64_t gpu_host_buf = 0;
 
-    // estimate GPU computing buffer and GPU-host computing buffer
-    if (use_gpu) {
+    if (backend == BACKEND_CUDA) {
+        const int64_t nb_act_buf_base = (n_bak_embd + n_norm + n_inp_pos + n_ffn_gate) * type_size_f32;
+        *gpu_buf = use_gpu ? nb_act_buf_base : 0;
+
+        // CUDA computing buffer and CUDA-host buffer
        if (is_master) {
            if (offload) {
-                *gpu_buf += (n_ffn_up + n_qcur) * ggml_type_size(GGML_TYPE_F32) + nb_attn_norm_w + nb_ffn_gate_w;
+                *gpu_buf += (n_ffn_up + n_qcur) * type_size_f32 + nb_attn_norm_w + nb_ffn_gate_w;
            } else {
-                *gpu_buf += (n_ffn_up + n_qcur + n_kq_mask + n_inp_out_ids) * ggml_type_size(GGML_TYPE_F32);
+                *gpu_buf += (n_ffn_up + n_qcur + n_kq_mask + n_inp_out_ids) * type_size_f32;
            }
-            *gpu_buf += (n_out_embd + n_result) * ggml_type_size(GGML_TYPE_F32) + nb_output_w;
-            gpu_host_buf = (n_inp_toks + n_inp_embd + n_bak_embd + n_inp_pos + n_kq_mask + n_inp_out_ids + n_out_embd) * ggml_type_size(GGML_TYPE_F32);
+            *gpu_buf += (n_out_embd + n_result) * type_size_f32 + nb_output_w;
+            gpu_host_buf = (n_inp_toks + n_inp_embd + n_bak_embd + n_inp_pos + n_kq_mask + n_inp_out_ids + n_out_embd) * type_size_f32;
        } else {
            if (offload) {
-                *gpu_buf += n_qcur * ggml_type_size(GGML_TYPE_F32) + nb_attn_norm_w + nb_ffn_gate_w + nb_ffn_down_w;
+                *gpu_buf += n_qcur * type_size_f32 + nb_attn_norm_w + nb_ffn_gate_w + nb_ffn_down_w;
            } else {
-                *gpu_buf += (n_ffn_up + n_kq_mask) * ggml_type_size(GGML_TYPE_F32);
+                *gpu_buf += (n_ffn_up + n_kq_mask) * type_size_f32;
            }
-            gpu_host_buf = (n_bak_embd + n_inp_pos + n_kq_mask) * ggml_type_size(GGML_TYPE_F32);
+            gpu_host_buf = (n_bak_embd + n_inp_pos + n_kq_mask) * type_size_f32;
        }
    }
 
-    // estimate CPU computing buffer
-    {
-        if (is_master) {
-            *cpu_buf += (n_ffn_up + n_kq_mask + n_inp_out_ids + n_qcur + n_inp_toks + n_inp_embd + n_out_embd + n_result) * ggml_type_size(GGML_TYPE_F32);
+    else if (backend == BACKEND_METAL) {
+        const int64_t nb_act_buf_base = (n_inp_pos + n_kq_mask) * type_size_f32;
+        *gpu_buf = nb_act_buf_base;
+        *cpu_buf = nb_act_buf_base;
+
+        if (use_gpu) {
+            if (is_master) {
+                *cpu_buf += (n_inp_toks + n_inp_embd + n_bak_embd + n_inp_out_ids + n_out_embd + n_result) * type_size_f32;
+
+                if (offload) {
+                    *gpu_buf += (n_ffn_out + n_ffn_inp + n_inp_out_ids) * type_size_f32;
+                    *gpu_buf += std::max(n_ffn_up + n_ffn_gate, n_qcur + n_qcur + n_kq) * type_size_f32;
+                    *cpu_buf += n_norm * type_size_f32;
+                    *cpu_buf += std::max(n_ffn_up + n_ffn_gate, n_qcur + n_qcur + n_kq) * type_size_f32;
+                } else {
+                    *gpu_buf += (n_bak_embd + n_inp_out_ids + n_norm) * type_size_f32;
+                    *gpu_buf += std::max(n_ffn_up + n_ffn_gate, n_qcur + n_qcur + n_kq) * type_size_f32;
+                }
+            } else {
+                *gpu_buf += (n_ffn_out + n_ffn_inp) * type_size_f32;
+                *gpu_buf += std::max(n_ffn_up + n_ffn_gate, n_qcur + n_qcur + n_kq) * type_size_f32;
+
+                *cpu_buf += n_bak_embd * type_size_f32;
+                if (offload) {
+                    *cpu_buf += n_norm * type_size_f32;
+                    *cpu_buf += std::max(n_ffn_up + n_ffn_gate, n_qcur + n_qcur + n_kq) * type_size_f32;
+                }
+            }
        } else {
-            *cpu_buf += (n_ffn_up + n_kq_mask) * ggml_type_size(GGML_TYPE_F32);
+            *gpu_buf = 0;
+            *cpu_buf = 0;
        }
-        *cpu_buf += gpu_host_buf;
+    }
+
+    else if (backend != BACKEND_CPU) {
+        GGML_ASSERT(false && "Unsupported backend type for compute buffer estimation.\n");
+    }
+
+    // CPU computing buffer
+    if (*cpu_buf == 0) {
+        *cpu_buf = (n_inp_pos + n_kq_mask + n_bak_embd + n_norm) * type_size_f32;
+        if (is_master) {
+            *cpu_buf += (n_inp_toks + n_inp_embd + n_inp_out_ids + n_out_embd + n_result) * type_size_f32;
+        }
+        *cpu_buf += std::max(n_ffn_gate + n_ffn_up, n_qcur + n_qcur + n_kq) * type_size_f32;
    }
 
    LLAMA_LOG_INFO("%s: compute buffer size = %7.2f MiB (GPU) + %7.2f MiB (GPU-Host)\n", __func__,