From 68ecabc8c3036dd9f73474bd25e9609a4b66747d Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Fri, 29 Nov 2024 19:03:01 +0400
Subject: [PATCH] add cpu_read_ram_bw, metal_read_vram_bw, cuda_read_vram_bw

---
 common/common.cpp   |   1 +
 common/profiler.cpp | 120 +++++++++++++++++++++++++++++++++++++-------
 common/profiler.h   |  37 +++++++-------
 include/llama.h     |   1 +
 src/llama.cpp       |  28 ++++++-----
 5 files changed, 139 insertions(+), 48 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index ee33f351..86de0c06 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1117,6 +1117,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_world      = params.n_world;
     cparams.rank         = params.rank;
     cparams.unload       = params.unload;
+    cparams.n_gpu_layers = params.n_gpu_layers;
     std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
 
     if (cparams.master_ip != nullptr) {
diff --git a/common/profiler.cpp b/common/profiler.cpp
index c874add2..e634c0b5 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -43,6 +43,26 @@
 #include 
 #include 
 #include 
+#include <fcntl.h>
+
+// redirect stdout to /dev/null, returning the saved stdout fd (-1 on failure)
+static int disable_log() {
+    int stdout_fd = dup(STDOUT_FILENO);
+    int null_fd = open("/dev/null", O_WRONLY);
+    if (null_fd == -1) {
+        LOG_INF("Failed to open /dev/null\n");
+        return -1;
+    }
+    dup2(null_fd, STDOUT_FILENO);
+    close(null_fd);
+    return stdout_fd;
+}
+
+// restore stdout from the fd saved by disable_log()
+static void enable_log(int stdout_fd) {
+    if (stdout_fd != -1) {
+        dup2(stdout_fd, STDOUT_FILENO);
+        close(stdout_fd);
+    }
+}
 
 const char * device_name() {
     static char device_name[256];
@@ -94,7 +114,7 @@ uint32_t device_cpu_cores() {
     return core_count;
 }
 
-static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, profiler_backend_type btype, int n_threads) {
+static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, enum profiler_backend_type btype, int n_threads) {
     const int n_repeat = 1;
     const int n_embd = llama_n_embd(model);
     std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
@@ -188,7 +208,9 @@ float device_cpu_flops(struct llama_model * model, enum ggml_type src0t, enum gg
 
 float device_metal_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t) {
 #ifdef GGML_USE_METAL
-    return device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_METAL, 4);
+    int fd = disable_log();
+    float ret = device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_METAL, 4);
+    enable_log(fd);
+    return ret;
 #endif
 
     (void)model;
@@ -199,7 +221,10 @@
 
 float device_cuda_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t) {
 #ifdef GGML_USE_CUDA
-    return device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_CUDA, 4);
+    int fd = disable_log();
+    float ret = device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_CUDA, 4);
+    enable_log(fd);
+    return ret;
 #endif
 
     (void)model;
@@ -712,12 +737,26 @@ float device_memory_bw(int n_thread) {
     return static_cast<float>(bandwidth);
 }
 
-float device_cuda_memory_bw(struct llama_model * model) {
-#ifdef GGML_USE_CUDA
+static float device_read_vram_bw(struct llama_model * model, enum profiler_backend_type btype) {
     const int n_embd = llama_n_embd(model) * 2;
     std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
 
-    ggml_backend_t backend = ggml_backend_cuda_init(0);
+    ggml_backend_t backend = NULL;
+    switch (btype) {
+        case PROFILER_BACKEND_TYPE_METAL:
+#ifdef GGML_USE_METAL
+            backend = ggml_backend_metal_init();
+#endif
+            break;
+        case PROFILER_BACKEND_TYPE_CUDA:
+#ifdef GGML_USE_CUDA
+            backend = ggml_backend_cuda_init(0);
+#endif
+            break;
+        case PROFILER_BACKEND_TYPE_CPU:
+            break;
+    }
+
     if (!backend) {
         LOG_INF("%s: ggml backend init failed\n", __func__);
         return 0.0f;
@@ -769,10 +808,28 @@ float device_cuda_memory_bw(struct llama_model * model) {
 
     ggml_backend_free(backend);
     return bandwidth;
-#else
+}
+
+float device_metal_read_vram_bw(struct llama_model * model) {
+#ifdef GGML_USE_METAL
+    int fd = disable_log();
+    float ret = device_read_vram_bw(model, PROFILER_BACKEND_TYPE_METAL);
+    enable_log(fd);
+    return ret;
+#endif
+
     (void)model;
     return 0.0f;
+}
+
+float device_cuda_read_vram_bw(struct llama_model * model) {
+#ifdef GGML_USE_CUDA
+    int fd = disable_log();
+    float ret = device_read_vram_bw(model, PROFILER_BACKEND_TYPE_CUDA);
+    enable_log(fd);
+    return ret;
 #endif
+
+    (void)model;
+    return 0.0f;
 }
 
 int device_has_metal(void) {
@@ -827,6 +884,14 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers) {
     total_latency += (double)n_flops.layer_q4k_f32 / (double)gpu.cuda_flops_q4k_f32 / 1e9;
     total_latency += (double)n_flops.layer_q6k_f32 / (double)gpu.cuda_flops_q6k_f32 / 1e9;
     total_latency += (double)n_flops.layer_q80_f32 / (double)gpu.cuda_flops_q80_f32 / 1e9;
+#elif GGML_USE_METAL
+    struct gpu_props gpu = dev_info.gpu_props;
+
+    total_latency += (double)n_flops.layer_f32_f32 / (double)gpu.metal_flops_f32_f32 / 1e9;
+    total_latency += (double)n_flops.layer_f16_f32 / (double)gpu.metal_flops_f16_f32 / 1e9;
+    total_latency += (double)n_flops.layer_q4k_f32 / (double)gpu.metal_flops_q4k_f32 / 1e9;
+    total_latency += (double)n_flops.layer_q6k_f32 / (double)gpu.metal_flops_q6k_f32 / 1e9;
+    total_latency += (double)n_flops.layer_q80_f32 / (double)gpu.metal_flops_q80_f32 / 1e9;
 #else
     total_latency += (double)n_flops.layer_f32_f32 / (double)cpu.flops_f32_f32 / 1e9;
     total_latency += (double)n_flops.layer_f16_f32 / (double)cpu.flops_f16_f32 / 1e9;
@@ -870,15 +935,18 @@ static float device_memory_access_delay(struct device_info & dev_info, int n_lay
                           n_params.output_q80;
 
 #ifdef GGML_USE_CUDA
-    return (double)total_bytes / 1e6 / dev_info.gpu_props.read_bandwidth; // ms
+    return (double)total_bytes / 1e6 / dev_info.gpu_props.cuda_read_vram_bw; // ms
+#elif GGML_USE_METAL
+    return (double)total_bytes / 1e6 / dev_info.gpu_props.metal_read_vram_bw; // ms
 #else
-    return (double)total_bytes / 1e6 / dev_info.memory.read_bandwidth; // ms
+    return (double)total_bytes / 1e6 / dev_info.memory.cpu_read_ram_bw; // ms
 #endif
 }
 
 static float device_disk_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams) {
     auto n_params = dev_info.model_params;
     int n_layers = llama_model_n_layers(model);
+    int n_gpu_layers = cparams.n_gpu_layers;
 
     double kv_size_gb     = static_cast<double>(llama_model_kvcache_size(model, cparams)) / 1e9;            // convert to GB
     double compute_buf_gb = static_cast<double>(llama_model_compute_buf_size(model, cparams, false)) / 1e9; // convert to GB
@@ -1005,7 +1073,7 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
 
     LOG_INF("| Mem Read Bandwidth (GB/s) ");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.2f ", dev_info_set[i].memory.read_bandwidth);
+        LOG_INF("| %-10.2f ", dev_info_set[i].memory.cpu_read_ram_bw);
     }
     LOG_INF("\n");
 
@@ -1099,9 +1167,9 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");
 
-    LOG_INF("| VRAM Read Bandwidth (GB/s) ");
+    LOG_INF("| Metal VRAM Read BW (GB/s) ");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.read_bandwidth);
LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.metal_read_vram_bw); } LOG_INF("\n"); @@ -1135,31 +1203,37 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); - LOG_INF("| CUDA flops (F32xF32, GFLOPS)"); + LOG_INF("| CUDA VRAM Read BW (GB/s) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.cuda_read_vram_bw); + } + LOG_INF("\n"); + + LOG_INF("| CUDA flops (F32xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (F16xF32, GFLOPS)"); + LOG_INF("| CUDA flops (F16xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f16_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q4KxF32, GFLOPS)"); + LOG_INF("| CUDA flops (Q4KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q6KxF32, GFLOPS)"); + LOG_INF("| CUDA flops (Q6KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q6k_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q80xF32, GFLOPS)"); + LOG_INF("| CUDA flops (Q80xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q80_f32); } @@ -1269,7 +1343,9 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m float latency = 0.0f; int n_layers = llama_model_n_layers(model); latency += device_compute_delay(dev_info_set[0], n_layers); + LOG_INF("latency: %.2f\n", latency); latency += device_memory_access_delay(dev_info_set[0], n_layers); + LOG_INF("latency: %.2f\n", latency); latency += device_disk_access_delay(dev_info_set[0], model, cparams); // if physical memory is not enough, some tensor weights will be released from memory and reloaded by mmap later LOG_INF("| Token latency (ms) "); @@ -1300,7 +1376,7 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { + sizeof(float) * 5 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32 + sizeof(struct memory_info) + sizeof(struct gpu_support) - + sizeof(float) * 13; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.read_bandwidth, + + sizeof(float) * 14; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_read_vram_bw, gpu_props.cuda_read_vram_bw, // gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32, // gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32 @@ -1371,7 +1447,7 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.memory_total, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.read_bandwidth, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.metal_read_vram_bw, sizeof(float)); ptr += sizeof(float); memcpy(ptr, &dev_info->gpu_props.metal_flops_f32_f32, sizeof(float)); @@ -1389,6 +1465,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.metal_flops_q80_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.cuda_read_vram_bw, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, 
&dev_info->gpu_props.cuda_flops_f32_f32, sizeof(float)); ptr += sizeof(float); @@ -1488,7 +1567,7 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.memory_total, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.read_bandwidth, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.metal_read_vram_bw, ptr, sizeof(float)); ptr += sizeof(float); memcpy(&dev_info->gpu_props.metal_flops_f32_f32, ptr, sizeof(float)); @@ -1506,6 +1585,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.metal_flops_q80_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_read_vram_bw, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_flops_f32_f32, ptr, sizeof(float)); ptr += sizeof(float); diff --git a/common/profiler.h b/common/profiler.h index 286fccd1..2e182380 100644 --- a/common/profiler.h +++ b/common/profiler.h @@ -35,14 +35,14 @@ struct memory_info { float available_physical; // in GiB float total_swap; // in GiB float available_swap; // in GiB - float read_bandwidth; // in GB/s + float cpu_read_ram_bw; // in GB/s memory_info() : total_physical (0.0f), available_physical(0.0f), total_swap (0.0f), available_swap (0.0f), - read_bandwidth (0.0f) {} + cpu_read_ram_bw (0.0f) {} }; struct gpu_support { @@ -69,12 +69,13 @@ struct gpu_props { const char * description; float memory_free; // in GiB float memory_total; // in GiB - float read_bandwidth; // in GB/s + float metal_read_vram_bw; // in GB/s float metal_flops_f32_f32; // in GFLOPS float metal_flops_f16_f32; // in GFLOPS float metal_flops_q4k_f32; // in GFLOPS float metal_flops_q6k_f32; // in GFLOPS float metal_flops_q80_f32; // in GFLOPS + float cuda_read_vram_bw; // in GB/s float cuda_flops_f32_f32; // in GFLOPS float cuda_flops_f16_f32; // in GFLOPS float cuda_flops_q4k_f32; // in GFLOPS @@ -86,12 +87,13 @@ struct gpu_props { description(""), memory_free (0.0f), memory_total (0.0f), - read_bandwidth (0.0f), + metal_read_vram_bw (0.0f), metal_flops_f32_f32(0.0f), metal_flops_f16_f32(0.0f), metal_flops_q4k_f32(0.0f), metal_flops_q6k_f32(0.0f), metal_flops_q80_f32(0.0f), + cuda_read_vram_bw (0.0f), cuda_flops_f32_f32 (0.0f), cuda_flops_f16_f32 (0.0f), cuda_flops_q4k_f32 (0.0f), @@ -211,19 +213,20 @@ enum profiler_layer_type { const char * device_name(void); -uint32_t device_cpu_cores (void); -float device_cpu_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, int n_threads); -float device_metal_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t); -float device_cuda_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t); -float device_inp_embd_delay (struct llama_model * model, enum ggml_type src0t, int n_tokens, int n_threads); -uint64_t device_physical_memory (bool available); -uint64_t device_swap_memory (bool available); -void device_disk_seq_bw (float * read_seq_bw, float * write_seq_bw, int n_threads); -void device_disk_rnd_bw (float * read_rnd_bw, float * write_rnd_bw, int n_threads); -float device_memory_bw (int n_thread); -float device_cuda_memory_bw (struct llama_model * model); -void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props); -void device_print_props (struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams); +uint32_t device_cpu_cores (void); +float device_cpu_flops (struct 
+float    device_cpu_flops         (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, int n_threads);
+float    device_metal_flops       (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t);
+float    device_cuda_flops        (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t);
+float    device_inp_embd_delay    (struct llama_model * model, enum ggml_type src0t, int n_tokens, int n_threads);
+uint64_t device_physical_memory   (bool available);
+uint64_t device_swap_memory       (bool available);
+void     device_disk_seq_bw       (float * read_seq_bw, float * write_seq_bw, int n_threads);
+void     device_disk_rnd_bw       (float * read_rnd_bw, float * write_rnd_bw, int n_threads);
+float    device_memory_bw         (int n_thread);
+float    device_metal_read_vram_bw(struct llama_model * model);
+float    device_cuda_read_vram_bw (struct llama_model * model);
+void     device_get_props         (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
+void     device_print_props       (struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams);
 
 int device_has_metal (void);
 int device_has_cuda  (void);
 
diff --git a/include/llama.h b/include/llama.h
index 3c3191b0..c8eb58cd 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -320,6 +320,7 @@ extern "C" {
         uint32_t n_world;           // world size
         uint32_t rank;              // my rank
         uint32_t n_layer_window[32];// number of layers to process in each compute
+        uint32_t n_gpu_layers;      // number of layers to process on GPU
         bool     unload;            // whether to unload layer weights after use
         char *   master_ip;         // ip address of the master node
         char *   next_node_ip;      // ip address of the next node
diff --git a/src/llama.cpp b/src/llama.cpp
index 690e65fd..78034b80 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3555,17 +3555,17 @@ void llama_perf_context_sync(struct llama_context * ctx, const struct llama_mode
 void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, int n_threads) {
     dev_info->device_name     = device_name();
     dev_info->cpu_props.cores = device_cpu_cores();
-    dev_info->cpu_props.flops_f32_f32 = device_cpu_flops(model, GGML_TYPE_F32,  GGML_TYPE_F32, n_threads);
-    dev_info->cpu_props.flops_f16_f32 = device_cpu_flops(model, GGML_TYPE_F16,  GGML_TYPE_F32, n_threads);
-    dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
-    dev_info->cpu_props.flops_q6k_f32 = device_cpu_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads);
-    dev_info->cpu_props.flops_q80_f32 = device_cpu_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads);
+    // dev_info->cpu_props.flops_f32_f32 = device_cpu_flops(model, GGML_TYPE_F32,  GGML_TYPE_F32, n_threads);
+    // dev_info->cpu_props.flops_f16_f32 = device_cpu_flops(model, GGML_TYPE_F16,  GGML_TYPE_F32, n_threads);
+    // dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
+    // dev_info->cpu_props.flops_q6k_f32 = device_cpu_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads);
+    // dev_info->cpu_props.flops_q80_f32 = device_cpu_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads);
 
     dev_info->memory.total_physical     = round(device_physical_memory(false) / (double)(1 << 30) * 100) / 100;
     dev_info->memory.available_physical = round(device_physical_memory(true)  / (double)(1 << 30) * 100) / 100;
     dev_info->memory.total_swap         = round(device_swap_memory(false)     / (double)(1 << 30) * 100) / 100;
     dev_info->memory.available_swap     = round(device_swap_memory(true)      / (double)(1 << 30) * 100) / 100;
-    dev_info->memory.read_bandwidth     = device_memory_bw(n_threads);
+    dev_info->memory.cpu_read_ram_bw    = device_memory_bw(n_threads);
 
     device_disk_seq_bw(&dev_info->disk.read_seq_bw, &dev_info->disk.write_seq_bw, n_threads);
     device_disk_rnd_bw(&dev_info->disk.read_rnd_bw, &dev_info->disk.write_rnd_bw, n_threads);
@@ -3590,12 +3590,13 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     dev_info->gpu_props.description         = gpu_props.description;
     dev_info->gpu_props.memory_free         = round(gpu_props.memory_free  / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.memory_total        = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
-    dev_info->gpu_props.read_bandwidth      = device_cuda_memory_bw(model);
+    dev_info->gpu_props.metal_read_vram_bw  = device_metal_read_vram_bw(model);
     dev_info->gpu_props.metal_flops_f32_f32 = device_metal_flops(model, GGML_TYPE_F32,  GGML_TYPE_F32);
     dev_info->gpu_props.metal_flops_f16_f32 = device_metal_flops(model, GGML_TYPE_F16,  GGML_TYPE_F32);
     dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
     dev_info->gpu_props.metal_flops_q6k_f32 = device_metal_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
     dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
+    dev_info->gpu_props.cuda_read_vram_bw   = device_cuda_read_vram_bw(model);
     dev_info->gpu_props.cuda_flops_f32_f32  = device_cuda_flops (model, GGML_TYPE_F32,  GGML_TYPE_F32);
     dev_info->gpu_props.cuda_flops_f16_f32  = device_cuda_flops (model, GGML_TYPE_F16,  GGML_TYPE_F32);
     dev_info->gpu_props.cuda_flops_q4k_f32  = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
@@ -19623,6 +19624,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_world        =*/ 1,
         /*.rank           =*/ 0,
         /*.n_layer_window =*/ {32},
+        /*.n_gpu_layers   =*/ 0,
         /*.unload         =*/ false,
         /*.master_ip      =*/ nullptr,
         /*.next_node_ip   =*/ nullptr,
@@ -20829,17 +20831,19 @@ uint64_t llama_model_compute_buf_size(const struct llama_model * model, const st
     const uint64_t n_output = hparams.n_vocab * cparams.n_ubatch;
 
     // compute buffer size for input, each layer, and output
-    // const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
-    const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32) / 2; // consider compressed memory with ratio 2:1
+    const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
     const uint64_t n_buf_act = (n_bak_embd + n_inp_pos + n_kq_mask + n_inp_out_ids + n_norm + n_qcur + n_kq ) * ggml_type_size(GGML_TYPE_F32);
-    // const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
-    const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32) / 2; // consider compressed memory with ratio 2:1
+    const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
 
     uint64_t n_buf_total = 0;
     if (cparams.rank == 0) {
-        n_buf_total = n_buf_inp + n_buf_act + n_buf_out;
+        if (compress_memory) {
+            n_buf_total = n_buf_inp / 2 + n_buf_act + n_buf_out / 2; // consider compressed memory with ratio 2:1
+        } else {
+            n_buf_total = n_buf_inp + n_buf_act + n_buf_out;
+        }
     } else {
         n_buf_total = n_buf_act;
     }