From a5ba34169ad96158513792632ea00efcd0cec80e Mon Sep 17 00:00:00 2001 From: Lizonghang <870644199@qq.com> Date: Sat, 23 Nov 2024 21:36:34 +0400 Subject: [PATCH] add f32, f16, q4k_f32, q6k_f32 flops test and fix duplicate inp_embd in subgraphs --- common/profiler.cpp | 123 +++++++++++++++++++++++++++++++++----------- common/profiler.h | 92 ++++++++++++++++++++++++--------- src/llama.cpp | 39 ++++++++------ 3 files changed, 184 insertions(+), 70 deletions(-) diff --git a/common/profiler.cpp b/common/profiler.cpp index 05fb4ba0..8dcd05e7 100644 --- a/common/profiler.cpp +++ b/common/profiler.cpp @@ -82,7 +82,7 @@ uint32_t device_cpu_cores() { return core_count; } -static float device_flops(struct llama_model * model, enum ggml_type dtype, profiler_backend_type btype, int n_threads) { +static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, profiler_backend_type btype, int n_threads) { const int n_embd = llama_n_embd(model); std::vector matrix_A(n_embd * n_embd, 1.0f); std::vector matrix_B(n_embd * n_embd, 1.0f / n_embd); @@ -119,8 +119,8 @@ static float device_flops(struct llama_model * model, enum ggml_type dtype, prof }; struct ggml_context * ctx = ggml_init(params); - struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, dtype, n_embd, n_embd); - struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, dtype, n_embd, n_embd); + struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, src0t, n_embd, n_embd); + struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, src1t, n_embd, n_embd); ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); @@ -168,27 +168,29 @@ static float device_flops(struct llama_model * model, enum ggml_type dtype, prof return (float)flops; } -float device_cpu_flops(struct llama_model * model, enum ggml_type dtype, int n_threads) { - return device_flops(model, dtype, PROFILER_BACKEND_TYPE_CPU, n_threads); +float device_cpu_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, int n_threads) { + return device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_CPU, n_threads); } -float device_metal_flops(struct llama_model * model, enum ggml_type dtype) { +float device_metal_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t) { #ifdef GGML_USE_METAL - return device_flops(model, dtype, PROFILER_BACKEND_TYPE_METAL, 4); + return device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_METAL, 4); #endif (void)model; - (void)dtype; + (void)src0t; + (void)src1t; return 0.0f; } -float device_cuda_flops(struct llama_model * model, enum ggml_type dtype) { +float device_cuda_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t) { #ifdef GGML_USE_CUDA - return device_flops(model, dtype, PROFILER_BACKEND_TYPE_CUDA, 4); + return device_flops(model, src0t, src1t, PROFILER_BACKEND_TYPE_CUDA, 4); #endif (void)model; - (void)dtype; + (void)src0t; + (void)src1t; return 0.0f; } @@ -463,18 +465,30 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); - LOG_INF("| CPU flops (F32, GFLOPS) "); + LOG_INF("| CPU flops (F32 x F32, GFLOPS)"); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (F16, GFLOPS) "); + LOG_INF("| CPU flops (F16 x F16, GFLOPS)"); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f16); } LOG_INF("\n"); + LOG_INF("| CPU flops (Q4K x F32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + 
LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q4k_f32); + } + LOG_INF("\n"); + + LOG_INF("| CPU flops (Q6K x F32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q6k_f32); + } + LOG_INF("\n"); + LOG_INF("| Physical Mem Total (GB) "); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical); @@ -577,33 +591,51 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); - LOG_INF("| Metal flops (F32, GFLOPS) "); + LOG_INF("| Metal flops (F32xF32, GFLOPS)"); for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops); + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (F32, GFLOPS) "); + LOG_INF("| Metal flops (F16xF16, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f16); + } + LOG_INF("\n"); + + LOG_INF("| Metal flops (Q4KxF32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q4k_f32); + } + LOG_INF("\n"); + + LOG_INF("| Metal flops (Q6KxF32, GFLOPS)"); + for (int i = 0; i < n; ++i) { + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q6k_f32); + } + LOG_INF("\n"); + + LOG_INF("| CUDA flops (F32xF32, GFLOPS)"); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (F16, GFLOPS) "); + LOG_INF("| CUDA flops (F16xF16, GFLOPS)"); for (int i = 0; i < n; ++i) { LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f16); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q8_0, GFLOPS) "); + LOG_INF("| CUDA flops (Q4KxF32, GFLOPS)"); for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q8); + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (Q4_K, GFLOPS) "); + LOG_INF("| CUDA flops (Q6KxF32, GFLOPS)"); for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k); + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q6k_f32); } LOG_INF("\n"); @@ -660,10 +692,11 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { + gpu_description_len + sizeof(float) // disk_read_bandwidth + sizeof(uint32_t) // cpu_props.cores - + sizeof(float) * 2 // cpu_props.flops_f32 and cpu_props.flops_f16 + + sizeof(float) * 4 // cpu_props.flops_f32, cpu_props.flops_f16, cpu_props.flops_q4k_f32, cpu_props.flops_q6k_f32 + sizeof(struct memory_info) + sizeof(struct gpu_support) - + sizeof(float) * 7; // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_flops, + + sizeof(float) * 10; // gpu_props.memory_free, gpu_props.memory_total, + // gpu_props.metal_flops_f32, gpu_props.metal_flops_f16, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q6k_f32, // gpu_props.cuda_flops_f32, gpu_props.cuda_flops_f16, gpu_props.cuda_flops_q8, and gpu_props.cuda_flops_q4k *buffer = (char *)malloc(total_size); @@ -712,6 +745,12 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->cpu_props.flops_f16, sizeof(float)); ptr += sizeof(float); + memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->cpu_props.flops_q6k_f32, sizeof(float)); + ptr += sizeof(float); + memcpy(ptr, &dev_info->memory, sizeof(struct memory_info)); ptr += sizeof(struct 
memory_info); @@ -724,7 +763,16 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.memory_total, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.metal_flops, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.metal_flops_f32, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->gpu_props.metal_flops_f16, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float)); + ptr += sizeof(float); + + memcpy(ptr, &dev_info->gpu_props.metal_flops_q6k_f32, sizeof(float)); ptr += sizeof(float); memcpy(ptr, &dev_info->gpu_props.cuda_flops_f32, sizeof(float)); @@ -733,10 +781,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.cuda_flops_f16, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.cuda_flops_q8, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_q6k_f32, sizeof(float)); // no need to synchronize model flops return total_size; @@ -802,6 +850,12 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->cpu_props.flops_f16, ptr, sizeof(float)); ptr += sizeof(float); + memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->cpu_props.flops_q6k_f32, ptr, sizeof(float)); + ptr += sizeof(float); + memcpy(&dev_info->memory, ptr, sizeof(struct memory_info)); ptr += sizeof(struct memory_info); @@ -814,7 +868,16 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.memory_total, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.metal_flops, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.metal_flops_f32, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->gpu_props.metal_flops_f16, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float)); + ptr += sizeof(float); + + memcpy(&dev_info->gpu_props.metal_flops_q6k_f32, ptr, sizeof(float)); ptr += sizeof(float); memcpy(&dev_info->gpu_props.cuda_flops_f32, ptr, sizeof(float)); @@ -823,10 +886,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.cuda_flops_f16, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.cuda_flops_q8, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.cuda_flops_q4k, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.cuda_flops_q6k_f32, ptr, sizeof(float)); // no need to synchronize model flops -} +} \ No newline at end of file diff --git a/common/profiler.h b/common/profiler.h index c9f046a1..f1c79d8d 100644 --- a/common/profiler.h +++ b/common/profiler.h @@ -8,11 +8,19 @@ struct cpu_props { const char * name; const char * description; uint32_t cores; - float flops_f32; // in GFLOPS - float flops_f16; // in GFLOPS + float flops_f32; // in GFLOPS + float flops_f16; // in GFLOPS + float flops_q4k_f32; // in GFLOPS + float flops_q6k_f32; // in GFLOPS - cpu_props() - : name(""), description(""), cores(0), flops_f32(0.0f), flops_f16(0.0f) {} + cpu_props() : + name(""), + description(""), + cores(0), + flops_f32 (0.0f), + flops_f16 (0.0f), + 
flops_q4k_f32(0.0f), + flops_q6k_f32(0.0f) {} }; struct memory_info { @@ -22,8 +30,12 @@ struct memory_info { float available_swap; // in GB float bandwidth; // in GB/s - memory_info() - : total_physical(0.0f), available_physical(0.0f), total_swap(0.0f), available_swap(0.0f), bandwidth(0.0f) {} + memory_info() : + total_physical (0.0f), + available_physical(0.0f), + total_swap (0.0f), + available_swap (0.0f), + bandwidth (0.0f) {} }; struct gpu_support { @@ -35,23 +47,43 @@ struct gpu_support { bool blas; bool sycl; - gpu_support() - : metal(false), cuda(false), vulkan(false), kompute(false), gpublas(false), blas(false), sycl(false) {} + gpu_support() : + metal (false), + cuda (false), + vulkan (false), + kompute(false), + gpublas(false), + blas (false), + sycl (false) {} }; struct gpu_props { const char * name; const char * description; - float memory_free; // in GB - float memory_total; // in GB - float metal_flops; // in GFLOPS - float cuda_flops_f32; // in GFLOPS - float cuda_flops_f16; // in GFLOPS - float cuda_flops_q8; // in GFLOPS - float cuda_flops_q4k; // in GFLOPS + float memory_free; // in GB + float memory_total; // in GB + float metal_flops_f32; // in GFLOPS + float metal_flops_f16; // in GFLOPS + float metal_flops_q4k_f32; // in GFLOPS + float metal_flops_q6k_f32; // in GFLOPS + float cuda_flops_f32; // in GFLOPS + float cuda_flops_f16; // in GFLOPS + float cuda_flops_q4k_f32; // in GFLOPS + float cuda_flops_q6k_f32; // in GFLOPS - gpu_props() - : name(""), description(""), memory_free(0.0f), memory_total(0.0f), metal_flops(0.0f), cuda_flops_f32(0.0f), cuda_flops_f16(0.0f), cuda_flops_q8(0.0f), cuda_flops_q4k(0.0f) {} + gpu_props() : + name(""), + description(""), + memory_free (0.0f), + memory_total (0.0f), + metal_flops_f32 (0.0f), + metal_flops_f16 (0.0f), + metal_flops_q4k_f32(0.0f), + metal_flops_q6k_f32(0.0f), + cuda_flops_f32 (0.0f), + cuda_flops_f16 (0.0f), + cuda_flops_q4k_f32 (0.0f), + cuda_flops_q6k_f32 (0.0f) {} }; struct model_flops { @@ -65,8 +97,13 @@ struct model_flops { int64_t output_params; int64_t layer_params; - model_flops() - : input_flops(0), output_flops(0), layer_flops(0), input_params(0), output_params(0), layer_params(0) {} + model_flops() : + input_flops (0), + output_flops (0), + layer_flops (0), + input_params (0), + output_params(0), + layer_params (0) {} }; struct device_info { @@ -79,8 +116,15 @@ struct device_info { struct gpu_props gpu_props; struct model_flops model_flops; - device_info() - : rank(0), device_name(""), disk_read_bandwidth(0.0f), cpu_props(), memory(), gpu_support(), gpu_props(), model_flops() {} + device_info() : + rank(0), + device_name(""), + disk_read_bandwidth(0.0f), + cpu_props(), + memory(), + gpu_support(), + gpu_props(), + model_flops() {} }; enum profiler_backend_type { @@ -92,9 +136,9 @@ enum profiler_backend_type { const char * device_name(void); uint32_t device_cpu_cores (void); -float device_cpu_flops (struct llama_model * model, enum ggml_type dtype, int n_threads); -float device_metal_flops (struct llama_model * model, enum ggml_type dtype); -float device_cuda_flops (struct llama_model * model, enum ggml_type dtype); +float device_cpu_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, int n_threads); +float device_metal_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t); +float device_cuda_flops (struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t); uint64_t device_physical_memory(bool available); uint64_t device_swap_memory (bool 
available); uint64_t device_disk_read_bw (const char * test_file, size_t buffer_size_mb); diff --git a/src/llama.cpp b/src/llama.cpp index 303f451d..1a3eb1d6 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3549,14 +3549,16 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_ void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, const char * test_file, int n_threads) { dev_info->device_name = device_name(); dev_info->cpu_props.cores = device_cpu_cores(); - dev_info->cpu_props.flops_f32 = device_cpu_flops(model, GGML_TYPE_F32, n_threads); - dev_info->cpu_props.flops_f16 = device_cpu_flops(model, GGML_TYPE_F16, n_threads); + dev_info->cpu_props.flops_f32 = device_cpu_flops(model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads); + dev_info->cpu_props.flops_f16 = device_cpu_flops(model, GGML_TYPE_F16, GGML_TYPE_F16, n_threads); + dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads); + dev_info->cpu_props.flops_q6k_f32 = device_cpu_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads); dev_info->memory.total_physical = round(device_physical_memory(false) / (double)(1 << 30) * 100) / 100; dev_info->memory.available_physical = round(device_physical_memory(true) / (double)(1 << 30) * 100) / 100; - dev_info->memory.total_swap = round(device_swap_memory(false) / (double)(1 << 30) * 100) / 100; - dev_info->memory.available_swap = round(device_swap_memory(true) / (double)(1 << 30) * 100) / 100; - dev_info->memory.bandwidth = round(device_memory_bw(500) / (double)(1 << 30) * 100) / 100; + dev_info->memory.total_swap = round(device_swap_memory (false) / (double)(1 << 30) * 100) / 100; + dev_info->memory.available_swap = round(device_swap_memory (true) / (double)(1 << 30) * 100) / 100; + dev_info->memory.bandwidth = round(device_memory_bw (500) / (double)(1 << 30) * 100) / 100; dev_info->disk_read_bandwidth = round(device_disk_read_bw(test_file, 500) / (double)(1 << 30) * 100) / 100; @@ -3573,18 +3575,21 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll device_get_props(model, -1, &cpu_props); // -1 for cpu device_get_props(model, 0, &gpu_props); // 0 for gpu0 - dev_info->cpu_props.name = cpu_props.name; - dev_info->cpu_props.description = cpu_props.description; + dev_info->cpu_props.name = cpu_props.name; + dev_info->cpu_props.description = cpu_props.description; - dev_info->gpu_props.name = gpu_props.name; - dev_info->gpu_props.description = gpu_props.description; - dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100; - dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100; - dev_info->gpu_props.metal_flops = device_metal_flops(model, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_f32 = device_cuda_flops(model, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_f16 = device_cuda_flops(model, GGML_TYPE_F16); - dev_info->gpu_props.cuda_flops_q8 = device_cuda_flops(model, GGML_TYPE_Q8_0); - dev_info->gpu_props.cuda_flops_q4k = device_cuda_flops(model, GGML_TYPE_Q4_K); + dev_info->gpu_props.name = gpu_props.name; + dev_info->gpu_props.description = gpu_props.description; + dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100; + dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100; + dev_info->gpu_props.metal_flops_f32 = device_metal_flops(model, GGML_TYPE_F32, GGML_TYPE_F32); + 
dev_info->gpu_props.metal_flops_f16 = device_metal_flops(model, GGML_TYPE_F16, GGML_TYPE_F16); + dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32); + dev_info->gpu_props.metal_flops_q6k_f32 = device_metal_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_f32 = device_cuda_flops (model, GGML_TYPE_F32, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_f16 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F16); + dev_info->gpu_props.cuda_flops_q4k_f32 = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_q6k_f32 = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32); if (dev_info->rank == 0) { struct model_flops * ffo = &dev_info->model_flops; @@ -10687,7 +10692,9 @@ struct llm_build_context { // build the input layer as a seperate subgraph ggml_build_forward_expand(sub_gf, inpL); sub_gfs.push_back(sub_gf); + sub_gf = nullptr; + inpL = nullptr; } // inpB - contains the output embedding from other nodes
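
For reference, a minimal usage sketch (illustrative only, not part of the patch) of the reworked probes, mirroring the calls added in llama_profile_device() above. It assumes model is a loaded llama_model pointer and n_threads is supplied by the caller; per the #ifdef guards in profiler.cpp, the Metal/CUDA variants return 0.0f when the corresponding backend is not compiled in.

    // Illustrative only: pair a (possibly quantized) src0 type with an F32 src1,
    // as the new device_*_flops(model, src0t, src1t, ...) signatures expect.
    float cpu_q4k_f32  = device_cpu_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
    float cpu_q6k_f32  = device_cpu_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads);
    float cuda_f16_f16 = device_cuda_flops(model, GGML_TYPE_F16,  GGML_TYPE_F16); // 0.0f without GGML_USE_CUDA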