diff --git a/common/profiler.cpp b/common/profiler.cpp
index 9020b5e9..261c5fb2 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -880,7 +880,7 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
     total_latency += gpu_latency_per_layer * n_gpu_layers;
     total_latency += cpu_latency_per_layer * (n_layers - n_gpu_layers);
 #else
-    total_latency *= cpu_latency_per_layer * n_layers;
+    total_latency += cpu_latency_per_layer * n_layers;
 #endif
 
     total_latency += (double)n_flops.output_f32_f32 / (double)cpu.flops_f32_f32 / 1e9;
@@ -900,8 +900,8 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
 static float device_memory_access_delay(struct device_info & dev_info, int n_layers) {
     struct model_params n_params = dev_info.model_params;
 
-    int64_t total_bytes = 0;
-    total_bytes += n_params.layer_f32 * 4 +
+    int64_t total_bytes =
+        n_params.layer_f32 * 4 +
         n_params.layer_f16 * 2 +
         n_params.layer_q4k / 2 +
         n_params.layer_q6k * 3 / 8 +
@@ -929,14 +929,22 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
     int n_layers = llama_model_n_layers(model);
     int n_gpu_layers = cparams.n_gpu_layers;
 
+    uint64_t cpu_kv_size;
+    uint64_t gpu_kv_size;
+    uint64_t cpu_compute_buf;
+    uint64_t gpu_compute_buf;
+
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
-    double cpu_kv_size_gb = 0.0f;
-    double cpu_compute_buf_gb = 0.0f;
+    llama_model_kvcache_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true);
 #else
-    double cpu_kv_size_gb = static_cast<double>(llama_model_kvcache_size(model, cparams)) / 1e9; // convert to GB
-    double cpu_compute_buf_gb = static_cast<double>(llama_model_compute_buf_size(model, cparams, false)) / 1e9; // convert to GB
+    llama_model_kvcache_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
+    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false);
 #endif
 
+    double cpu_kv_size_gb = static_cast<double>(cpu_kv_size) / 1e9; // convert to GB
+    double cpu_compute_buf_gb = static_cast<double>(cpu_compute_buf) / 1e9; // convert to GB
+
     int64_t cpu_total_bytes =
         n_params.layer_f32 * 4 +
         n_params.layer_f16 * 2 +
@@ -947,6 +955,7 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
     cpu_total_bytes *= (n_layers - n_gpu_layers);
 #else
+    (void)n_gpu_layers;
     cpu_total_bytes *= n_layers;
 #endif
 
@@ -960,13 +969,7 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
     float cpu_total_gbytes = (double)cpu_total_bytes / 1e9; // convert to GB
     float cpu_mem_avail = dev_info.memory.available_physical * 1024.0f * 1024.0f * 1024.0f / 1e9; // convert to GB
     cpu_mem_avail -= static_cast<float>(cpu_kv_size_gb);
-
-    if (cpu_mem_avail - static_cast<float>(cpu_compute_buf_gb) < cpu_total_gbytes) {
-        double compressed_compute_buf_gb = static_cast<double>(llama_model_compute_buf_size(model, cparams, true)) / 1e9; // convert to GB
-        cpu_mem_avail -= static_cast<float>(compressed_compute_buf_gb);
-    } else {
-        cpu_mem_avail -= static_cast<float>(cpu_compute_buf_gb);
-    }
+    cpu_mem_avail -= static_cast<float>(cpu_compute_buf_gb);
 
 #ifdef __linux__
     float disk_read_bw = dev_info.disk.read_seq_bw; // GB/s
diff --git a/include/llama.h b/include/llama.h
index c8eb58cd..e21b056a 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -525,10 +525,20 @@ extern "C" {
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
     // Return the size of compute buffer size, including input tensors and activations
-    LLAMA_API uint64_t llama_model_compute_buf_size(const struct llama_model * model, const struct llama_context_params cparams, bool compress_memory);
+    LLAMA_API void llama_model_compute_buf_size(
+                            uint64_t * cpu_buf,
+                            uint64_t * gpu_buf,
+                            const struct llama_model * model,
+                            const struct llama_context_params cparams,
+                            bool use_gpu);
 
     // Return the size of KV cache in the model
-    LLAMA_API uint64_t llama_model_kvcache_size(const struct llama_model * model, const struct llama_context_params cparams);
+    LLAMA_API void llama_model_kvcache_size(
+                            uint64_t * cpu_cache,
+                            uint64_t * gpu_cache,
+                            const struct llama_model * model,
+                            const struct llama_context_params cparams,
+                            bool use_gpu);
 
     // Return the total number of float operations in the model
     LLAMA_API void llama_model_n_flops(
diff --git a/src/llama.cpp b/src/llama.cpp
index 80aa07fe..489ff29f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -20810,7 +20810,12 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
     }
 }
 
-uint64_t llama_model_compute_buf_size(const struct llama_model * model, const struct llama_context_params cparams, bool compress_memory) {
+void llama_model_compute_buf_size(
+    uint64_t * cpu_buf,
+    uint64_t * gpu_buf,
+    const struct llama_model * model,
+    const struct llama_context_params cparams,
+    bool use_gpu) {
     const llama_hparams hparams = model->hparams;
 
     // input tensors
@@ -20831,30 +20836,42 @@ uint64_t llama_model_compute_buf_size(const struct llama_model * model, const st
     const uint64_t n_output = hparams.n_vocab * cparams.n_ubatch;
 
     // compute buffer size for input, each layer, and output
-    const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
+    const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32);
     const uint64_t n_buf_act = (n_bak_embd + n_inp_pos + n_kq_mask + n_inp_out_ids + n_norm + n_qcur + n_kq
                                ) * ggml_type_size(GGML_TYPE_F32);
-    const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
+    const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32);
 
-    uint64_t n_buf_total = 0;
-    if (cparams.rank == 0) {
-        if (compress_memory) {
-            n_buf_total = n_buf_inp / 2 + n_buf_act + n_buf_out / 2; // consider compressed memory with ratio 2:1
+    if (use_gpu) {
+        *gpu_buf = n_buf_act;
+        if (llama_model_n_layers(model) > cparams.n_gpu_layers) {
+            *cpu_buf = n_buf_inp + n_buf_act + n_buf_out;
         } else {
-            n_buf_total = n_buf_inp + n_buf_act + n_buf_out;
+            *cpu_buf = n_buf_inp + n_buf_out;
         }
     } else {
-        n_buf_total = n_buf_act;
+        *gpu_buf = 0;
+        *cpu_buf = n_buf_inp + n_buf_act + n_buf_out;
     }
-    return n_buf_total;
 }
 
-uint64_t llama_model_kvcache_size(const struct llama_model * model, const struct llama_context_params cparams) {
+void llama_model_kvcache_size(
+    uint64_t * cpu_cache,
+    uint64_t * gpu_cache,
+    const struct llama_model * model,
+    const struct llama_context_params cparams,
+    bool use_gpu) {
     const llama_hparams hparams = model->hparams;
 
     uint64_t ne_k = static_cast<uint64_t>(hparams.n_embd_k_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_k);
     uint64_t ne_v = static_cast<uint64_t>(hparams.n_embd_v_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_v);
-    return (ne_k + ne_v) * llama_model_n_layers(model);
+    if (use_gpu) {
+        int n_gpu_layers = cparams.n_gpu_layers;
+        *gpu_cache = (ne_k + ne_v) * n_gpu_layers;
+        *cpu_cache = (ne_k + ne_v) * (llama_model_n_layers(model) - n_gpu_layers);
+    } else {
+        *gpu_cache = 0;
+        *cpu_cache = (ne_k + ne_v) * llama_model_n_layers(model);
+    }
 }
 
 void llama_model_n_flops(
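
Caller-side sketch (not part of the patch): the reworked API reports CPU and GPU byte counts through out-parameters instead of returning a single total, mirroring the updated call sites in common/profiler.cpp above. The helper name print_split_sizes and the printf reporting below are illustrative only; the sketch assumes llama.h and <cstdio> are included and that model and cparams were set up elsewhere.

// Minimal sketch of using the new split-size API; only the two calls come from this change.
static void print_split_sizes(const struct llama_model * model, const struct llama_context_params cparams) {
    uint64_t cpu_kv_size = 0, gpu_kv_size = 0;
    uint64_t cpu_compute_buf = 0, gpu_compute_buf = 0;

#if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
    const bool use_gpu = true;   // attribute cparams.n_gpu_layers layers to the GPU, the rest to the CPU
#else
    const bool use_gpu = false;  // attribute everything to the CPU
#endif

    // KV-cache bytes, split between host and device according to cparams.n_gpu_layers
    llama_model_kvcache_size(&cpu_kv_size, &gpu_kv_size, model, cparams, use_gpu);

    // compute-buffer bytes (input tensors, activations, output), split the same way
    llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, use_gpu);

    printf("CPU: %.3f GB KV cache, %.3f GB compute buffer\n", cpu_kv_size / 1e9, cpu_compute_buf / 1e9);
    printf("GPU: %.3f GB KV cache, %.3f GB compute buffer\n", gpu_kv_size / 1e9, gpu_compute_buf / 1e9);
}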