mirror of
https://github.com/Lizonghang/prima.cpp.git
synced 2025-09-06 12:59:03 +00:00
add memcpy speed test
This commit is contained in:
parent
5b46c4e848
commit
a7ec685eda
3 changed files with 46 additions and 6 deletions
|
@ -1581,7 +1581,7 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
|||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Physical Mem Total (GiB) ");
|
||||
LOG_INF("| Physical Mem Total (GiB) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical);
|
||||
}
|
||||
|
@ -1617,6 +1617,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
|||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CPU KVCache Copy Time (ms/l) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.2f ", dev_info_set[i].memory.mem_cpy_delay);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Disk Read Seq Speed (GB/s) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.2f ", dev_info_set[i].disk.read_seq_bw);
|
||||
|
@ -1713,6 +1719,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
|||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Metal KVCache Copy Time(ms/l)");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.metal_mem_cpy_delay);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| Metal flops (F32xF32, GFLOPS)");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f32_f32);
|
||||
|
@ -1755,6 +1767,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
|
|||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CUDA KVCache Copy Time (ms/l)");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.cuda_mem_cpy_delay);
|
||||
}
|
||||
LOG_INF("\n");
|
||||
|
||||
LOG_INF("| CUDA flops (F32xF32, GFLOPS) ");
|
||||
for (int i = 0; i < n; ++i) {
|
||||
LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32_f32);
|
||||
|
@ -1956,12 +1974,13 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
|
|||
+ gpu_description_len
|
||||
+ sizeof(struct disk_props)
|
||||
+ sizeof(uint32_t) // cpu_props.cores
|
||||
+ sizeof(float) * 6 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32
|
||||
+ sizeof(float) * 6 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32
|
||||
+ sizeof(struct memory_info)
|
||||
+ sizeof(struct gpu_support)
|
||||
+ sizeof(float) * 16; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_read_vram_bw, gpu_props.cuda_read_vram_bw,
|
||||
+ sizeof(float) * 18; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_read_vram_bw, gpu_props.cuda_read_vram_bw,
|
||||
// gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q5k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32,
|
||||
// gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q5k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32
|
||||
// gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q5k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32,
|
||||
// gpu_props.metal_mem_cpy_delay, gpu_props.cuda_mem_cpy_delay
|
||||
|
||||
*buffer = (char *)malloc(total_size);
|
||||
char * ptr = *buffer;
|
||||
|
@ -2054,6 +2073,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
|
|||
memcpy(ptr, &dev_info->gpu_props.metal_flops_q80_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.metal_mem_cpy_delay, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.cuda_read_vram_bw, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
|
@ -2073,6 +2095,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
|
|||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.cuda_flops_q80_f32, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(ptr, &dev_info->gpu_props.cuda_mem_cpy_delay, sizeof(float));
|
||||
|
||||
// no need to synchronize model flops and model params
|
||||
return total_size;
|
||||
|
@ -2183,6 +2208,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
|
|||
memcpy(&dev_info->gpu_props.metal_flops_q80_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.metal_mem_cpy_delay, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.cuda_read_vram_bw, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
|
@ -2202,6 +2230,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
|
|||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.cuda_flops_q80_f32, ptr, sizeof(float));
|
||||
ptr += sizeof(float);
|
||||
|
||||
memcpy(&dev_info->gpu_props.cuda_mem_cpy_delay, ptr, sizeof(float));
|
||||
|
||||
// no need to synchronize model flops and model params
|
||||
}
|
|
@ -41,6 +41,7 @@ struct memory_info {
|
|||
float total_swap; // in GiB
|
||||
float available_swap; // in GiB
|
||||
float cpu_read_ram_bw; // in GB/s
|
||||
float mem_cpy_delay; // in ms
|
||||
|
||||
memory_info() :
|
||||
total_physical (0.0f),
|
||||
|
@ -48,7 +49,8 @@ struct memory_info {
|
|||
used_can_swap (0.0f),
|
||||
total_swap (0.0f),
|
||||
available_swap (0.0f),
|
||||
cpu_read_ram_bw (0.0f) {}
|
||||
cpu_read_ram_bw (0.0f),
|
||||
mem_cpy_delay (0.0f) {}
|
||||
};
|
||||
|
||||
struct gpu_support {
|
||||
|
@ -82,6 +84,7 @@ struct gpu_props {
|
|||
float metal_flops_q5k_f32; // in GFLOPS
|
||||
float metal_flops_q6k_f32; // in GFLOPS
|
||||
float metal_flops_q80_f32; // in GFLOPS
|
||||
float metal_mem_cpy_delay; // in ms
|
||||
float cuda_read_vram_bw; // in GB/s
|
||||
float cuda_flops_f32_f32; // in GFLOPS
|
||||
float cuda_flops_f16_f32; // in GFLOPS
|
||||
|
@ -89,6 +92,7 @@ struct gpu_props {
|
|||
float cuda_flops_q5k_f32; // in GFLOPS
|
||||
float cuda_flops_q6k_f32; // in GFLOPS
|
||||
float cuda_flops_q80_f32; // in GFLOPS
|
||||
float cuda_mem_cpy_delay; // in ms
|
||||
|
||||
gpu_props() :
|
||||
name(""),
|
||||
|
@ -102,13 +106,15 @@ struct gpu_props {
|
|||
metal_flops_q5k_f32(EPS),
|
||||
metal_flops_q6k_f32(EPS),
|
||||
metal_flops_q80_f32(EPS),
|
||||
metal_mem_cpy_delay(0.0f),
|
||||
cuda_read_vram_bw (0.0f),
|
||||
cuda_flops_f32_f32 (EPS),
|
||||
cuda_flops_f16_f32 (EPS),
|
||||
cuda_flops_q4k_f32 (EPS),
|
||||
cuda_flops_q5k_f32 (EPS),
|
||||
cuda_flops_q6k_f32 (EPS),
|
||||
cuda_flops_q80_f32 (EPS) {}
|
||||
cuda_flops_q80_f32 (EPS),
|
||||
cuda_mem_cpy_delay (0.0f) {}
|
||||
};
|
||||
|
||||
struct model_flops {
|
||||
|
|
|
@ -3587,6 +3587,7 @@ void llama_profile_device(
|
|||
dev_info->memory.total_swap = round(device_swap_memory(false) / (double)(1 << 30) * 100) / 100;
|
||||
dev_info->memory.available_swap = round(device_swap_memory(true) / (double)(1 << 30) * 100) / 100;
|
||||
dev_info->memory.cpu_read_ram_bw = device_memory_bw(n_threads);
|
||||
dev_info->memory.mem_cpy_delay = device_cpu_mem_copy(model, n_threads);
|
||||
|
||||
struct model_flops * n_flops = &dev_info->model_flops;
|
||||
struct model_params * n_params = &dev_info->model_params;
|
||||
|
@ -3622,6 +3623,8 @@ void llama_profile_device(
|
|||
dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
|
||||
dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw();
|
||||
dev_info->gpu_props.cuda_read_vram_bw = device_cuda_read_vram_bw();
|
||||
dev_info->gpu_props.metal_mem_cpy_delay = device_metal_mem_copy(model);
|
||||
dev_info->gpu_props.cuda_mem_cpy_delay = device_cuda_mem_copy(model);
|
||||
|
||||
if (is_dtype_exist(n_params, GGML_TYPE_F32)) {
|
||||
dev_info->cpu_props.flops_f32_f32 = device_cpu_flops (model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads);
|
||||
|
|
Loading…
Add table
Reference in a new issue