diff --git a/common/profiler.cpp b/common/profiler.cpp index 8369ec99..94a37bda 100644 --- a/common/profiler.cpp +++ b/common/profiler.cpp @@ -90,6 +90,8 @@ float device_metal_flops(struct llama_model * model, enum ggml_type dtype) { return device_flops(model, dtype, PROFILER_BACKEND_TYPE_METAL, 4); #endif + (void)model; + (void)dtype; return 0.0f; } @@ -98,6 +100,8 @@ float device_cuda_flops(struct llama_model * model, enum ggml_type dtype) { return device_flops(model, dtype, PROFILER_BACKEND_TYPE_CUDA, 4); #endif + (void)model; + (void)dtype; return 0.0f; } @@ -578,15 +582,33 @@ void device_print_props(struct device_info * dev_info_set, int n) { } LOG_INF("\n"); - LOG_INF("| GPU Metal flops (GFLOPS) "); + LOG_INF("| Metal flops (F32, GFLOPS) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops); } LOG_INF("\n"); - LOG_INF("| GPU CUDA flops (GFLOPS) "); + LOG_INF("| CUDA flops (F32, GFLOPS) "); for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops); + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32); + } + LOG_INF("\n"); + + LOG_INF("| CUDA flops (F16, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f16); + } + LOG_INF("\n"); + + LOG_INF("| CUDA flops (Q8_0, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q8); + } + LOG_INF("\n"); + + LOG_INF("| CUDA flops (Q4_K, GFLOPS) "); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k); } LOG_INF("\n"); @@ -614,7 +636,8 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { + sizeof(float) * 2 // cpu_props.flops_f32 and cpu_props.flops_f16 + sizeof(struct memory_info) + sizeof(struct gpu_support) - + sizeof(float) * 4; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_flops, and gpu_props.cuda_flops + + sizeof(float) * 7; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_flops, + // gpu_props.cuda_flops_f32, gpu_props.cuda_flops_f16, gpu_props.cuda_flops_q8, and gpu_props.cuda_flops_q4k *buffer = (char *)malloc(total_size); char * ptr = *buffer; @@ -677,7 +700,16 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.metal_flops, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.cuda_flops, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_f32, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->gpu_props.cuda_flops_f16, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->gpu_props.cuda_flops_q8, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k, sizeof(float)); return total_size; } @@ -757,5 +789,14 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.metal_flops, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.cuda_flops, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.cuda_flops_f32, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->gpu_props.cuda_flops_f16, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->gpu_props.cuda_flops_q8, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->gpu_props.cuda_flops_q4k, ptr, sizeof(float)); } diff --git a/common/profiler.h b/common/profiler.h index 74a5864c..3d6a1f1e 100644 --- a/common/profiler.h +++ b/common/profiler.h @@ -42,13 +42,16 @@ struct gpu_support { struct gpu_props { const char * name; const char * description; - float memory_free; // in GB - float memory_total; // in GB - float metal_flops; // in GFLOPS - float cuda_flops; // in GFLOPS + float memory_free; // in GB + float memory_total; // in GB + float metal_flops; // in GFLOPS + float cuda_flops_f32; // in GFLOPS + float cuda_flops_f16; // in GFLOPS + float cuda_flops_q8; // in GFLOPS + float cuda_flops_q4k; // in GFLOPS gpu_props() - : name(""), description(""), memory_free(0.0f), memory_total(0.0f), metal_flops(0.0f), cuda_flops(0.0f) {} + : name(""), description(""), memory_free(0.0f), memory_total(0.0f), metal_flops(0.0f), cuda_flops_f32(0.0f), cuda_flops_f16(0.0f), cuda_flops_q8(0.0f), cuda_flops_q4k(0.0f) {} }; struct device_info { diff --git a/src/llama.cpp b/src/llama.cpp index 3b09105b..42238996 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3582,7 +3582,10 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, co dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100; dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100; dev_info->gpu_props.metal_flops = device_metal_flops(model, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops = device_cuda_flops(model, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_f32 = device_cuda_flops(model, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_f16 = device_cuda_flops(model, GGML_TYPE_F16); + dev_info->gpu_props.cuda_flops_q8 = device_cuda_flops(model, GGML_TYPE_Q8_0); + dev_info->gpu_props.cuda_flops_q4k = device_cuda_flops(model, GGML_TYPE_Q4_K); } ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {