add f32, f16, q8, q4k speed test for cuda

Lizonghang 2024-11-10 23:41:13 +04:00
parent f4260bb346
commit 10f6f92c7e
3 changed files with 59 additions and 12 deletions

View file

@@ -90,6 +90,8 @@ float device_metal_flops(struct llama_model * model, enum ggml_type dtype) {
     return device_flops(model, dtype, PROFILER_BACKEND_TYPE_METAL, 4);
 #endif
+    (void)model;
+    (void)dtype;
     return 0.0f;
 }
@@ -98,6 +100,8 @@ float device_cuda_flops(struct llama_model * model, enum ggml_type dtype) {
     return device_flops(model, dtype, PROFILER_BACKEND_TYPE_CUDA, 4);
 #endif
+    (void)model;
+    (void)dtype;
     return 0.0f;
 }
@@ -578,15 +582,33 @@ void device_print_props(struct device_info * dev_info_set, int n) {
     }
     LOG_INF("\n");
-    LOG_INF("| GPU Metal flops (GFLOPS) ");
+    LOG_INF("| Metal flops (F32, GFLOPS) ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops);
     }
     LOG_INF("\n");
-    LOG_INF("| GPU CUDA flops (GFLOPS) ");
+    LOG_INF("| CUDA flops (F32, GFLOPS) ");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops);
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CUDA flops (F16, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f16);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CUDA flops (Q8_0, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q8);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CUDA flops (Q4_K, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k);
     }
     LOG_INF("\n");
@@ -614,7 +636,8 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
         + sizeof(float) * 2           // cpu_props.flops_f32 and cpu_props.flops_f16
         + sizeof(struct memory_info)
         + sizeof(struct gpu_support)
-        + sizeof(float) * 4;          // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_flops, and gpu_props.cuda_flops
+        + sizeof(float) * 7;          // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_flops,
+                                      // gpu_props.cuda_flops_f32, gpu_props.cuda_flops_f16, gpu_props.cuda_flops_q8, and gpu_props.cuda_flops_q4k

     *buffer = (char *)malloc(total_size);
     char * ptr = *buffer;
@@ -677,7 +700,16 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.metal_flops, sizeof(float));
     ptr += sizeof(float);

-    memcpy(ptr, &dev_info->gpu_props.cuda_flops, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_f16, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q8, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k, sizeof(float));

     return total_size;
 }
@@ -757,5 +789,14 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.metal_flops, ptr, sizeof(float));
     ptr += sizeof(float);

-    memcpy(&dev_info->gpu_props.cuda_flops, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.cuda_flops_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.cuda_flops_f16, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.cuda_flops_q8, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.cuda_flops_q4k, ptr, sizeof(float));
 }
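
Note on the wire format: serialize() and deserialize() now move seven floats for gpu_props instead of four, and both sides must keep the field order above in lockstep. A minimal round-trip check, not part of this commit and assuming device_info is default-constructible, could look like this:

#include <cassert>
#include <cstdlib>

// Hypothetical check: serialize a device_info, read it back, and confirm the
// four new CUDA throughput fields survive the round trip unchanged.
static void check_cuda_flops_roundtrip(const struct device_info & in) {
    char * buf = nullptr;
    serialize(&in, &buf);      // serialize() allocates buf with malloc()
    struct device_info out;
    deserialize(buf, &out);
    assert(out.gpu_props.cuda_flops_f32 == in.gpu_props.cuda_flops_f32);
    assert(out.gpu_props.cuda_flops_f16 == in.gpu_props.cuda_flops_f16);
    assert(out.gpu_props.cuda_flops_q8  == in.gpu_props.cuda_flops_q8);
    assert(out.gpu_props.cuda_flops_q4k == in.gpu_props.cuda_flops_q4k);
    free(buf);
}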

View file

@@ -45,10 +45,13 @@ struct gpu_props {
     float memory_free;    // in GB
     float memory_total;   // in GB
     float metal_flops;    // in GFLOPS
-    float cuda_flops;     // in GFLOPS
+    float cuda_flops_f32; // in GFLOPS
+    float cuda_flops_f16; // in GFLOPS
+    float cuda_flops_q8;  // in GFLOPS
+    float cuda_flops_q4k; // in GFLOPS

     gpu_props()
-        : name(""), description(""), memory_free(0.0f), memory_total(0.0f), metal_flops(0.0f), cuda_flops(0.0f) {}
+        : name(""), description(""), memory_free(0.0f), memory_total(0.0f), metal_flops(0.0f), cuda_flops_f32(0.0f), cuda_flops_f16(0.0f), cuda_flops_q8(0.0f), cuda_flops_q4k(0.0f) {}
 };

 struct device_info {

View file

@@ -3582,7 +3582,10 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, co
     dev_info->gpu_props.memory_free    = round(gpu_props.memory_free  / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.memory_total   = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.metal_flops    = device_metal_flops(model, GGML_TYPE_F32);
-    dev_info->gpu_props.cuda_flops     = device_cuda_flops(model, GGML_TYPE_F32);
+    dev_info->gpu_props.cuda_flops_f32 = device_cuda_flops(model, GGML_TYPE_F32);
+    dev_info->gpu_props.cuda_flops_f16 = device_cuda_flops(model, GGML_TYPE_F16);
+    dev_info->gpu_props.cuda_flops_q8  = device_cuda_flops(model, GGML_TYPE_Q8_0);
+    dev_info->gpu_props.cuda_flops_q4k = device_cuda_flops(model, GGML_TYPE_Q4_K);
 }

 ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
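
With the per-dtype fields in place, code that reasons about a layer's quantization type can look up the matching CUDA throughput directly. A hypothetical helper, not part of this commit (field names follow this diff, the ggml_type values are the standard ggml enum), might look like:

// Hypothetical mapping from a ggml dtype to the profiled CUDA throughput.
static float gpu_cuda_flops_for_dtype(const struct gpu_props & props, enum ggml_type dtype) {
    switch (dtype) {
        case GGML_TYPE_F32:  return props.cuda_flops_f32;
        case GGML_TYPE_F16:  return props.cuda_flops_f16;
        case GGML_TYPE_Q8_0: return props.cuda_flops_q8;
        case GGML_TYPE_Q4_K: return props.cuda_flops_q4k;
        default:             return 0.0f; // dtype not profiled by this commit
    }
}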