From a7ec685eda667ba27bd00f50dfd7e20eb297b48e Mon Sep 17 00:00:00 2001 From: Lizonghang <870644199@qq.com> Date: Sun, 29 Dec 2024 16:19:08 +0400 Subject: [PATCH] add memcpy speed test --- common/profiler.cpp | 39 +++++++++++++++++++++++++++++++++++---- common/profiler.h | 10 ++++++++-- src/llama.cpp | 3 +++ 3 files changed, 46 insertions(+), 6 deletions(-) diff --git a/common/profiler.cpp b/common/profiler.cpp index 48b90626..fd75186f 100644 --- a/common/profiler.cpp +++ b/common/profiler.cpp @@ -1581,7 +1581,7 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); - LOG_INF("| Physical Mem Total (GiB) "); + LOG_INF("| Physical Mem Total (GiB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical); } @@ -1617,6 +1617,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); + LOG_INF("| CPU KVCache Copy Time (ms/l) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].memory.mem_cpy_delay); + } + LOG_INF("\n"); + LOG_INF("| Disk Read Seq Speed (GB/s) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].disk.read_seq_bw); @@ -1713,6 +1719,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); + LOG_INF("| Metal KVCache Copy Time(ms/l)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.metal_mem_cpy_delay); + } + LOG_INF("\n"); + LOG_INF("| Metal flops (F32xF32, GFLOPS)"); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f32_f32); @@ -1755,6 +1767,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); + LOG_INF("| CUDA KVCache Copy Time (ms/l)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.cuda_mem_cpy_delay); + } + LOG_INF("\n"); + LOG_INF("| CUDA flops (F32xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32_f32); @@ -1956,12 +1974,13 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { + gpu_description_len + sizeof(struct disk_props) + sizeof(uint32_t) // cpu_props.cores - + sizeof(float) * 6 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32 + + sizeof(float) * 6 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32 + sizeof(struct memory_info) + sizeof(struct gpu_support) - + sizeof(float) * 16; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_read_vram_bw, gpu_props.cuda_read_vram_bw, + + sizeof(float) * 18; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_read_vram_bw, gpu_props.cuda_read_vram_bw, // gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q5k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32, - // gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q5k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32 + // gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q5k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32, + // gpu_props.metal_mem_cpy_delay, gpu_props.cuda_mem_cpy_delay *buffer = (char *)malloc(total_size); char * ptr = *buffer; @@ -2054,6 +2073,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.metal_flops_q80_f32, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.metal_mem_cpy_delay, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->gpu_props.cuda_read_vram_bw, sizeof(float)); ptr += sizeof(float); @@ -2073,6 +2095,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { ptr += sizeof(float); memcpy(ptr, &dev_info->gpu_props.cuda_flops_q80_f32, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->gpu_props.cuda_mem_cpy_delay, sizeof(float)); // no need to synchronize model flops and model params return total_size; @@ -2183,6 +2208,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.metal_flops_q80_f32, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->gpu_props.metal_mem_cpy_delay, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->gpu_props.cuda_read_vram_bw, ptr, sizeof(float)); ptr += sizeof(float); @@ -2202,6 +2230,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) { ptr += sizeof(float); memcpy(&dev_info->gpu_props.cuda_flops_q80_f32, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->gpu_props.cuda_mem_cpy_delay, ptr, sizeof(float)); // no need to synchronize model flops and model params } \ No newline at end of file diff --git a/common/profiler.h b/common/profiler.h index acd6f5ac..e32fb895 100644 --- a/common/profiler.h +++ b/common/profiler.h @@ -41,6 +41,7 @@ struct memory_info { float total_swap; // in GiB float available_swap; // in GiB float cpu_read_ram_bw; // in GB/s + float mem_cpy_delay; // in ms memory_info() : total_physical (0.0f), @@ -48,7 +49,8 @@ struct memory_info { used_can_swap (0.0f), total_swap (0.0f), available_swap (0.0f), - cpu_read_ram_bw (0.0f) {} + cpu_read_ram_bw (0.0f), + mem_cpy_delay (0.0f) {} }; struct gpu_support { @@ -82,6 +84,7 @@ struct gpu_props { float metal_flops_q5k_f32; // in GFLOPS float metal_flops_q6k_f32; // in GFLOPS float metal_flops_q80_f32; // in GFLOPS + float metal_mem_cpy_delay; // in ms float cuda_read_vram_bw; // in GB/s float cuda_flops_f32_f32; // in GFLOPS float cuda_flops_f16_f32; // in GFLOPS @@ -89,6 +92,7 @@ struct gpu_props { float cuda_flops_q5k_f32; // in GFLOPS float cuda_flops_q6k_f32; // in GFLOPS float cuda_flops_q80_f32; // in GFLOPS + float cuda_mem_cpy_delay; // in ms gpu_props() : name(""), @@ -102,13 +106,15 @@ struct gpu_props { metal_flops_q5k_f32(EPS), metal_flops_q6k_f32(EPS), metal_flops_q80_f32(EPS), + metal_mem_cpy_delay(0.0f), cuda_read_vram_bw (0.0f), cuda_flops_f32_f32 (EPS), cuda_flops_f16_f32 (EPS), cuda_flops_q4k_f32 (EPS), cuda_flops_q5k_f32 (EPS), cuda_flops_q6k_f32 (EPS), - cuda_flops_q80_f32 (EPS) {} + cuda_flops_q80_f32 (EPS), + cuda_mem_cpy_delay (0.0f) {} }; struct model_flops { diff --git a/src/llama.cpp b/src/llama.cpp index 901ec2f9..4f056fde 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3587,6 +3587,7 @@ void llama_profile_device( dev_info->memory.total_swap = round(device_swap_memory(false) / (double)(1 << 30) * 100) / 100; dev_info->memory.available_swap = round(device_swap_memory(true) / (double)(1 << 30) * 100) / 100; dev_info->memory.cpu_read_ram_bw = device_memory_bw(n_threads); + dev_info->memory.mem_cpy_delay = device_cpu_mem_copy(model, n_threads); struct model_flops * n_flops = &dev_info->model_flops; struct model_params * n_params = &dev_info->model_params; @@ -3622,6 +3623,8 @@ void llama_profile_device( dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100; dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw(); dev_info->gpu_props.cuda_read_vram_bw = device_cuda_read_vram_bw(); + dev_info->gpu_props.metal_mem_cpy_delay = device_metal_mem_copy(model); + dev_info->gpu_props.cuda_mem_cpy_delay = device_cuda_mem_copy(model); if (is_dtype_exist(n_params, GGML_TYPE_F32)) { dev_info->cpu_props.flops_f32_f32 = device_cpu_flops (model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads);