From 68ecabc8c3036dd9f73474bd25e9609a4b66747d Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Fri, 29 Nov 2024 19:03:01 +0400
Subject: [PATCH] add cpu_read_ram_bw, metal_read_vram_bw, cuda_read_vram_bw

---
 common/common.cpp   |   1 +
 common/profiler.cpp | 120 +++++++++++++++++++++++++++++++++++++-------
 common/profiler.h   |  37 +++++++-------
 include/llama.h     |   1 +
 src/llama.cpp       |  28 ++++++-----
 5 files changed, 139 insertions(+), 48 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index ee33f351..86de0c06 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1117,6 +1117,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_world      = params.n_world;
     cparams.rank         = params.rank;
     cparams.unload       = params.unload;
+    cparams.n_gpu_layers = params.n_gpu_layers;
     std::copy(std::begin(params.n_layer_window), std::end(params.n_layer_window), cparams.n_layer_window);
 
     if (cparams.master_ip != nullptr) {
diff --git a/common/profiler.cpp b/common/profiler.cpp
index c874add2..e634c0b5 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -43,6 +43,26 @@
 #include 
 #include 
 #include 
+#include <fcntl.h>
+
+// redirect stdout to /dev/null, returning the saved stdout fd (-1 on failure)
+static int disable_log() {
+    int stdout_fd = dup(STDOUT_FILENO);
+    int null_fd = open("/dev/null", O_WRONLY);
+    if (null_fd == -1) {
+        LOG_INF("Failed to open /dev/null\n");
+        return -1;
+    }
+    dup2(null_fd, STDOUT_FILENO);
+    close(null_fd);
+    return stdout_fd;
+}
+
+// restore stdout from the fd saved by disable_log()
+static void enable_log(int stdout_fd) {
+    if (stdout_fd != -1) {
+        dup2(stdout_fd, STDOUT_FILENO);
+        close(stdout_fd);
+    }
+}
 
 const char * device_name() {
     static char device_name[256];
@@ -94,7 +114,7 @@ uint32_t device_cpu_cores() {
     return core_count;
 }
 
-static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, profiler_backend_type btype, int n_threads) {
+static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, enum profiler_backend_type btype, int n_threads) {
     const int n_repeat = 1;
     const int n_embd = llama_n_embd(model);
     std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
@@ -188,7 +208,9 @@ float device_cpu_flops(struct llama_model * model, enum ggml_type src0t, enum gg
 
 float device_metal_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t) {
 #ifdef GGML_USE_METAL
-    return device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_METAL, 4);
+    int fd = disable_log();
+    float ret = device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_METAL, 4);
+    enable_log(fd);
+    return ret;
 #endif
 
     (void)model;
@@ -199,7 +221,10 @@
 
 float device_cuda_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t) {
 #ifdef GGML_USE_CUDA
-    return device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_CUDA, 4);
+    int fd = disable_log();
+    float ret = device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_CUDA, 4);
+    enable_log(fd);
+    return ret;
 #endif
 
     (void)model;
@@ -712,12 +737,26 @@ float device_memory_bw(int n_thread) {
     return static_cast<float>(bandwidth);
 }
 
-float device_cuda_memory_bw(struct llama_model * model) {
-#ifdef GGML_USE_CUDA
+static float device_read_vram_bw(struct llama_model * model, enum profiler_backend_type btype) {
     const int n_embd = llama_n_embd(model) * 2;
     std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
 
-    ggml_backend_t backend = ggml_backend_cuda_init(0);
+    ggml_backend_t backend = NULL;
+    switch (btype) {
+        case PROFILER_BACKEND_TYPE_METAL:
+#ifdef GGML_USE_METAL
+            backend = ggml_backend_metal_init();
+#endif
+            break;
+        case PROFILER_BACKEND_TYPE_CUDA:
+#ifdef GGML_USE_CUDA
+            backend = ggml_backend_cuda_init(0);
+#endif
+            break;
+        case PROFILER_BACKEND_TYPE_CPU:
+            break;
+    }
+
     if (!backend) {
         LOG_INF("%s: ggml backend init failed\n", __func__);
         return 0.0f;
@@ -769,10 +808,28 @@ float device_cuda_memory_bw(struct llama_model * model) {
 
     ggml_backend_free(backend);
     return bandwidth;
-#else
+}
+
+float device_metal_read_vram_bw(struct llama_model * model) {
+#ifdef GGML_USE_METAL
+    int fd = disable_log();
+    float ret = device_read_vram_bw(model, PROFILER_BACKEND_TYPE_METAL);
+    enable_log(fd);
+    return ret;
+#endif
+
     (void)model;
     return 0.0f;
+}
+
+float device_cuda_read_vram_bw(struct llama_model * model) {
+#ifdef GGML_USE_CUDA
+    int fd = disable_log();
+    float ret = device_read_vram_bw(model, PROFILER_BACKEND_TYPE_CUDA);
+    enable_log(fd);
+    return ret;
 #endif
+
+    (void)model;
+    return 0.0f;
 }
 
 int device_has_metal(void) {
@@ -827,6 +884,14 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers) {
     total_latency += (double)n_flops.layer_q4k_f32 / (double)gpu.cuda_flops_q4k_f32 / 1e9;
     total_latency += (double)n_flops.layer_q6k_f32 / (double)gpu.cuda_flops_q6k_f32 / 1e9;
     total_latency += (double)n_flops.layer_q80_f32 / (double)gpu.cuda_flops_q80_f32 / 1e9;
+#elif GGML_USE_METAL
+    struct gpu_props gpu = dev_info.gpu_props;
+
+    total_latency += (double)n_flops.layer_f32_f32 / (double)gpu.metal_flops_f32_f32 / 1e9;
+    total_latency += (double)n_flops.layer_f16_f32 / (double)gpu.metal_flops_f16_f32 / 1e9;
+    total_latency += (double)n_flops.layer_q4k_f32 / (double)gpu.metal_flops_q4k_f32 / 1e9;
+    total_latency += (double)n_flops.layer_q6k_f32 / (double)gpu.metal_flops_q6k_f32 / 1e9;
+    total_latency += (double)n_flops.layer_q80_f32 / (double)gpu.metal_flops_q80_f32 / 1e9;
 #else
     total_latency += (double)n_flops.layer_f32_f32 / (double)cpu.flops_f32_f32 / 1e9;
     total_latency += (double)n_flops.layer_f16_f32 / (double)cpu.flops_f16_f32 / 1e9;
@@ -870,15 +935,18 @@ static float device_memory_access_delay(struct device_info & dev_info, int n_lay
                           n_params.output_q80;
 
 #ifdef GGML_USE_CUDA
-    return (double)total_bytes / 1e6 / dev_info.gpu_props.read_bandwidth; // ms
+    return (double)total_bytes / 1e6 / dev_info.gpu_props.cuda_read_vram_bw; // ms
+#elif GGML_USE_METAL
+    return (double)total_bytes / 1e6 / dev_info.gpu_props.metal_read_vram_bw; // ms
 #else
-    return (double)total_bytes / 1e6 / dev_info.memory.read_bandwidth; // ms
+    return (double)total_bytes / 1e6 / dev_info.memory.cpu_read_ram_bw; // ms
 #endif
 }
 
 static float device_disk_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams) {
     auto n_params = dev_info.model_params;
     int n_layers = llama_model_n_layers(model);
+    int n_gpu_layers = cparams.n_gpu_layers;
 
     double kv_size_gb     = static_cast<double>(llama_model_kvcache_size(model, cparams)) / 1e9;            // convert to GB
     double compute_buf_gb = static_cast<double>(llama_model_compute_buf_size(model, cparams, false)) / 1e9; // convert to GB
@@ -1005,7 +1073,7 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
 
     LOG_INF("| Mem Read Bandwidth (GB/s) ");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.2f ", dev_info_set[i].memory.read_bandwidth);
+        LOG_INF("| %-10.2f ", dev_info_set[i].memory.cpu_read_ram_bw);
     }
     LOG_INF("\n");
 
@@ -1099,9 +1167,9 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");
 
-    LOG_INF("| VRAM Read Bandwidth (GB/s) ");
+    LOG_INF("| Metal VRAM Read BW (GB/s) ");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.read_bandwidth);
LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.metal_read_vram_bw); } LOG_INF("\n"); @@ -1135,31 +1203,37 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); - LOG_INF("| CUDA flops (F32xF32, GFLOPS)"); + LOG_INF("| CUDA VRAM Read BW (GB/s) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.cuda_read_vram_bw); + } + LOG_INF("\n"); + + LOG_INF("| CUDA flops (F32xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (F16xF32, GFLOPS)"); + LOG_INF("| CUDA flops (F16xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f16_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q4KxF32, GFLOPS)"); + LOG_INF("| CUDA flops (Q4KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q6KxF32, GFLOPS)"); + LOG_INF("| CUDA flops (Q6KxF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q6k_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q80xF32, GFLOPS)"); + LOG_INF("| CUDA flops (Q80xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q80_f32); } @@ -1269,7 +1343,9 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m float latency = 0.0f; int n_layers = llama_model_n_layers(model); latency += device_compute_delay(dev_info_set[0], n_layers); + LOG_INF("latency: %.2f\n", latency); latency += device_memory_access_delay(dev_info_set[0], n_layers); + LOG_INF("latency: %.2f\n", latency); latency += device_disk_access_delay(dev_info_set[0], model, cparams); // if physical memory is not enough, some tensor weights will be released from memory and reloaded by mmap later LOG_INF("| Token latency (ms) "); @@ -1300,7 +1376,7 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { + sizeof(float) * 5 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32 + sizeof(struct memory_info) + sizeof(struct gpu_support) - + sizeof(float) * 13; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.read_bandwidth, + + sizeof(float) * 14; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_read_vram_bw, gpu_props.cuda_read_vram_bw, // gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32, // gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32 @@ -1371,7 +1447,7 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.memory_total, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.read_bandwidth, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.metal_read_vram_bw, sizeof(float)); ptr += sizeof(float); memcpy(ptr, &dev_info->gpu_props.metal_flops_f32_f32, sizeof(float)); @@ -1389,6 +1465,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.metal_flops_q80_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.cuda_read_vram_bw, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, 
&dev_info->gpu_props.cuda_flops_f32_f32, sizeof(float)); ptr += sizeof(float); @@ -1488,7 +1567,7 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.memory_total, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.read_bandwidth, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.metal_read_vram_bw, ptr, sizeof(float)); ptr += sizeof(float); memcpy(&dev_info->gpu_props.metal_flops_f32_f32, ptr, sizeof(float)); @@ -1506,6 +1585,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.metal_flops_q80_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_read_vram_bw, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_flops_f32_f32, ptr, sizeof(float)); ptr += sizeof(float); diff --git a/common/profiler.h b/common/profiler.h index 286fccd1..2e182380 100644 --- a/common/profiler.h +++ b/common/profiler.h @@ -35,14 +35,14 @@ struct memory_info { float available_physical; // in GiB float total_swap; // in GiB float available_swap; // in GiB - float read_bandwidth; // in GB/s + float cpu_read_ram_bw; // in GB/s memory_info() : total_physical (0.0f), available_physical(0.0f), total_swap (0.0f), available_swap (0.0f), - read_bandwidth (0.0f) {} + cpu_read_ram_bw (0.0f) {} }; struct gpu_support { @@ -69,12 +69,13 @@ struct gpu_props { const char * description; float memory_free; // in GiB float memory_total; // in GiB - float read_bandwidth; // in GB/s + float metal_read_vram_bw; // in GB/s float metal_flops_f32_f32; // in GFLOPS float metal_flops_f16_f32; // in GFLOPS float metal_flops_q4k_f32; // in GFLOPS float metal_flops_q6k_f32; // in GFLOPS float metal_flops_q80_f32; // in GFLOPS + float cuda_read_vram_bw; // in GB/s float cuda_flops_f32_f32; // in GFLOPS float cuda_flops_f16_f32; // in GFLOPS float cuda_flops_q4k_f32; // in GFLOPS @@ -86,12 +87,13 @@ struct gpu_props { description(""), memory_free (0.0f), memory_total (0.0f), - read_bandwidth (0.0f), + metal_read_vram_bw (0.0f), metal_flops_f32_f32(0.0f), metal_flops_f16_f32(0.0f), metal_flops_q4k_f32(0.0f), metal_flops_q6k_f32(0.0f), metal_flops_q80_f32(0.0f), + cuda_read_vram_bw (0.0f), cuda_flops_f32_f32 (0.0f), cuda_flops_f16_f32 (0.0f), cuda_flops_q4k_f32 (0.0f), @@ -211,19 +213,20 @@ enum profiler_layer_type { const char * device_name(void); -uint32_t device_cpu_cores (void); -float device_cpu_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, int n_threads); -float device_metal_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t); -float device_cuda_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t); -float device_inp_embd_delay (struct llama_model * model, enum ggml_type src0t, int n_tokens, int n_threads); -uint64_t device_physical_memory (bool available); -uint64_t device_swap_memory (bool available); -void device_disk_seq_bw (float * read_seq_bw, float * write_seq_bw, int n_threads); -void device_disk_rnd_bw (float * read_rnd_bw, float * write_rnd_bw, int n_threads); -float device_memory_bw (int n_thread); -float device_cuda_memory_bw (struct llama_model * model); -void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props); -void device_print_props (struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams); +uint32_t device_cpu_cores (void); +float device_cpu_flops (struct 
+float    device_cpu_flops         (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, int n_threads);
+float    device_metal_flops       (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t);
+float    device_cuda_flops        (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t);
+float    device_inp_embd_delay    (struct llama_model * model, enum ggml_type src0t, int n_tokens, int n_threads);
+uint64_t device_physical_memory   (bool available);
+uint64_t device_swap_memory       (bool available);
+void     device_disk_seq_bw       (float * read_seq_bw, float * write_seq_bw, int n_threads);
+void     device_disk_rnd_bw       (float * read_rnd_bw, float * write_rnd_bw, int n_threads);
+float    device_memory_bw         (int n_thread);
+float    device_metal_read_vram_bw(struct llama_model * model);
+float    device_cuda_read_vram_bw (struct llama_model * model);
+void     device_get_props         (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
+void     device_print_props       (struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams);
 
 int device_has_metal (void);
 int device_has_cuda  (void);
 
diff --git a/include/llama.h b/include/llama.h
index 3c3191b0..c8eb58cd 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -320,6 +320,7 @@ extern "C" {
         uint32_t n_world;           // world size
         uint32_t rank;              // my rank
         uint32_t n_layer_window[32];// number of layers to process in each compute
+        uint32_t n_gpu_layers;      // number of layers to process on GPU
         bool     unload;            // whether to unload layer weights after use
         char *   master_ip;         // ip address of the master node
         char *   next_node_ip;      // ip address of the next node
diff --git a/src/llama.cpp b/src/llama.cpp
index 690e65fd..78034b80 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3555,17 +3555,17 @@ void llama_perf_context_sync(struct llama_context * ctx, const struct llama_mode
 void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, int n_threads) {
     dev_info->device_name     = device_name();
     dev_info->cpu_props.cores = device_cpu_cores();
-    dev_info->cpu_props.flops_f32_f32 = device_cpu_flops(model, GGML_TYPE_F32,  GGML_TYPE_F32, n_threads);
-    dev_info->cpu_props.flops_f16_f32 = device_cpu_flops(model, GGML_TYPE_F16,  GGML_TYPE_F32, n_threads);
-    dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
-    dev_info->cpu_props.flops_q6k_f32 = device_cpu_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads);
-    dev_info->cpu_props.flops_q80_f32 = device_cpu_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads);
+    // dev_info->cpu_props.flops_f32_f32 = device_cpu_flops(model, GGML_TYPE_F32,  GGML_TYPE_F32, n_threads);
+    // dev_info->cpu_props.flops_f16_f32 = device_cpu_flops(model, GGML_TYPE_F16,  GGML_TYPE_F32, n_threads);
+    // dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
+    // dev_info->cpu_props.flops_q6k_f32 = device_cpu_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads);
+    // dev_info->cpu_props.flops_q80_f32 = device_cpu_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads);
 
     dev_info->memory.total_physical     = round(device_physical_memory(false) / (double)(1 << 30) * 100) / 100;
     dev_info->memory.available_physical = round(device_physical_memory(true)  / (double)(1 << 30) * 100) / 100;
     dev_info->memory.total_swap         = round(device_swap_memory(false)     / (double)(1 << 30) * 100) / 100;
     dev_info->memory.available_swap     = round(device_swap_memory(true)      / (double)(1 << 30) * 100) / 100;
-    dev_info->memory.read_bandwidth     = device_memory_bw(n_threads);
+    dev_info->memory.cpu_read_ram_bw    = device_memory_bw(n_threads);
 
     device_disk_seq_bw(&dev_info->disk.read_seq_bw, &dev_info->disk.write_seq_bw, n_threads);
     device_disk_rnd_bw(&dev_info->disk.read_rnd_bw, &dev_info->disk.write_rnd_bw, n_threads);
@@ -3590,12 +3590,13 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     dev_info->gpu_props.description         = gpu_props.description;
     dev_info->gpu_props.memory_free         = round(gpu_props.memory_free  / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.memory_total        = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
-    dev_info->gpu_props.read_bandwidth      = device_cuda_memory_bw(model);
+    dev_info->gpu_props.metal_read_vram_bw  = device_metal_read_vram_bw(model);
     dev_info->gpu_props.metal_flops_f32_f32 = device_metal_flops(model, GGML_TYPE_F32,  GGML_TYPE_F32);
     dev_info->gpu_props.metal_flops_f16_f32 = device_metal_flops(model, GGML_TYPE_F16,  GGML_TYPE_F32);
     dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
     dev_info->gpu_props.metal_flops_q6k_f32 = device_metal_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
     dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
+    dev_info->gpu_props.cuda_read_vram_bw   = device_cuda_read_vram_bw(model);
     dev_info->gpu_props.cuda_flops_f32_f32  = device_cuda_flops (model, GGML_TYPE_F32,  GGML_TYPE_F32);
     dev_info->gpu_props.cuda_flops_f16_f32  = device_cuda_flops (model, GGML_TYPE_F16,  GGML_TYPE_F32);
     dev_info->gpu_props.cuda_flops_q4k_f32  = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
@@ -19623,6 +19624,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_world        =*/ 1,
         /*.rank           =*/ 0,
         /*.n_layer_window =*/ {32},
+        /*.n_gpu_layers   =*/ 0,
         /*.unload         =*/ false,
         /*.master_ip      =*/ nullptr,
         /*.next_node_ip   =*/ nullptr,
@@ -20829,17 +20831,19 @@ uint64_t llama_model_compute_buf_size(const struct llama_model * model, const st
     const uint64_t n_output = hparams.n_vocab * cparams.n_ubatch;
 
     // compute buffer size for input, each layer, and output
-    // const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
-    const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32) / 2; // consider compressed memory with ratio 2:1
+    const uint64_t n_buf_inp = (n_inp_toks + n_inp_embd) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
     const uint64_t n_buf_act = (n_bak_embd + n_inp_pos + n_kq_mask + n_inp_out_ids + n_norm + n_qcur + n_kq ) * ggml_type_size(GGML_TYPE_F32);
-    // const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
-    const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32) / 2; // consider compressed memory with ratio 2:1
+    const uint64_t n_buf_out = (n_out_embd + n_output) * ggml_type_size(GGML_TYPE_F32); // do not consider memory compression
 
     uint64_t n_buf_total = 0;
     if (cparams.rank == 0) {
-        n_buf_total = n_buf_inp + n_buf_act + n_buf_out;
+        if (compress_memory) {
+            n_buf_total = n_buf_inp / 2 + n_buf_act + n_buf_out / 2; // consider compressed memory with ratio 2:1
+        } else {
+            n_buf_total = n_buf_inp + n_buf_act + n_buf_out;
+        }
     } else {
         n_buf_total = n_buf_act;
     }