diff --git a/common/common.cpp b/common/common.cpp
index 80d0c16d..ee33f351 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -914,7 +914,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
         dev_info_set[0] = dev_info;
         llama_gather_device_info(lctx, dev_info_set);
-        device_print_props(dev_info_set, n_world, model);
+        device_print_props(dev_info_set, n_world, model, cparams);
     } else {
         llama_send_device_info(lctx, &dev_info);
     }
diff --git a/common/profiler.cpp b/common/profiler.cpp
index 59dd6624..642f793d 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -876,8 +876,10 @@ static float device_memory_access_delay(struct device_info & dev_info, int n_lay
 #endif
 }
 
-static float device_disk_access_delay(struct device_info & dev_info, int n_layers) {
-    struct model_params n_params = dev_info.model_params;
+static float device_disk_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams) {
+    auto n_params = dev_info.model_params;
+    int n_layers = llama_model_n_layers(model);
+    double kv_size_gb = static_cast<double>(llama_model_kvcache_size(model, cparams)) / 1e9; // convert to GB
 
     int64_t total_bytes = 0;
     total_bytes += n_params.layer_f32 * 4 +
@@ -895,13 +897,18 @@ static float device_disk_access_delay(struct device_info & dev_info, int n_layer
                    n_params.output_q80;
 
     float total_gbytes = (double)total_bytes / 1e9; // convert to GB
-    float mem_avail = dev_info.memory.available_physical * 1024.0f * 1024.0f * 1024.0f / 1e9; // convert to GB
+    float mem_avail   = dev_info.memory.available_physical * 1024.0f * 1024.0f * 1024.0f / 1e9; // convert to GB
+    mem_avail        -= static_cast<float>(kv_size_gb); // todo: consider activations, which also consume available memory
 
+#ifdef __linux__
+    float disk_read_bw = dev_info.disk.read_seq_bw; // GB/s
+#else
     float disk_read_bw = dev_info.disk.read_rnd_bw; // GB/s
+#endif
     return std::max(0.0, static_cast<double>(total_gbytes - mem_avail) / disk_read_bw * 1000); // convert to ms
 }
 
-void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model) {
+void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams) {
     LOG_INF("\n-------------------------------------------------------------------------------------------\n");
     LOG_INF("| Property              ");
     for (int i = 0; i < n; ++i) {
@@ -1255,7 +1262,7 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     int n_layers = llama_model_n_layers(model);
     latency += device_compute_delay(dev_info_set[0], n_layers);
     latency += device_memory_access_delay(dev_info_set[0], n_layers);
-    latency += device_disk_access_delay(dev_info_set[0], n_layers); // if physical memory is not enough, some tensor weights will be released from memory and reloaded by mmap later
+    latency += device_disk_access_delay(dev_info_set[0], model, cparams); // if physical memory is not enough, some tensor weights will be released from memory and reloaded by mmap later
 
     LOG_INF("| Token latency (ms)      ");
     LOG_INF("| %-10.2f   ", latency);
diff --git a/common/profiler.h b/common/profiler.h
index 43a5fc81..286fccd1 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -8,6 +8,7 @@
 #define DISK_TEST_SEQ_BLOCK 100L * 1024 * 1024
 #define DISK_TEST_RND_BLOCK 4096
 
+
 struct cpu_props {
     const char * name;
     const char * description;
@@ -222,7 +223,7 @@ void  device_disk_rnd_bw      (float * read_rnd_bw, float * write_rnd_bw, int
 float device_memory_bw        (int n_thread);
 float device_cuda_memory_bw   (struct llama_model * model);
 void  device_get_props        (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
-void  device_print_props      (struct device_info * dev_info_set, int n, struct llama_model * model);
+void  device_print_props      (struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams);
 
 int   device_has_metal        (void);
 int   device_has_cuda         (void);
diff --git a/include/llama.h b/include/llama.h
index 7ae1d702..27c322f3 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -523,6 +523,9 @@ extern "C" {
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
+    // Return the size of the KV cache in the model
+    LLAMA_API uint64_t llama_model_kvcache_size(const struct llama_model * model, const struct llama_context_params cparams);
+
     // Return the total number of float operations in the model
     LLAMA_API void llama_model_n_flops(
             struct llama_model * model,
diff --git a/src/llama.cpp b/src/llama.cpp
index f4bb64d7..851d1d69 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -20808,6 +20808,13 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
     }
 }
 
+uint64_t llama_model_kvcache_size(const struct llama_model * model, const struct llama_context_params cparams) {
+    const llama_hparams & hparams = model->hparams;
+    uint64_t ne_k = static_cast<uint64_t>(hparams.n_embd_k_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_k);
+    uint64_t ne_v = static_cast<uint64_t>(hparams.n_embd_v_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_v);
+    return (ne_k + ne_v) * llama_model_n_layers(model);
+}
+
 void llama_model_n_flops(
         struct llama_model * model,
         struct llama_model_loader * ml,
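
Note for reviewers: below is a minimal standalone sketch of the estimate these changes compute (KV cache reserved out of available memory, and the spilled weight bytes charged against disk bandwidth). All numeric values and the names weights_gb, mem_avail_gb, and disk_bw_gbps are illustrative stand-ins, not taken from the patch.

// Illustrative sketch of the KV-cache-aware disk access delay estimate.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical stand-ins for the hparams/cparams values queried in the patch.
    const uint64_t n_embd_k_gqa = 1024;   // per-layer K width
    const uint64_t n_embd_v_gqa = 1024;   // per-layer V width
    const uint64_t n_ctx        = 4096;   // context length
    const uint64_t type_size    = 2;      // bytes per element, e.g. f16 K/V
    const uint64_t n_layers     = 32;

    // Mirrors llama_model_kvcache_size: (K + V) elements per token, times context, times layers.
    const uint64_t kv_bytes = (n_embd_k_gqa + n_embd_v_gqa) * n_ctx * type_size * n_layers;
    const double   kv_gb    = kv_bytes / 1e9;

    // Mirrors device_disk_access_delay: weights that no longer fit in RAM after
    // reserving the KV cache are assumed to be re-read from disk.
    const double weights_gb   = 8.0;          // total weight size in GB (illustrative)
    const double mem_avail_gb = 6.0 - kv_gb;  // available RAM minus KV cache (illustrative)
    const double disk_bw_gbps = 2.0;          // measured read bandwidth in GB/s (illustrative)

    const double delay_ms = std::max(0.0, (weights_gb - mem_avail_gb) / disk_bw_gbps * 1000.0);
    printf("kv cache: %.2f GB, extra disk delay: %.2f ms per token\n", kv_gb, delay_ms);
    return 0;
}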