From 3fe00a16a06f34cbac39c037f31c8c14bff3bc29 Mon Sep 17 00:00:00 2001 From: Lizonghang <870644199@qq.com> Date: Sun, 24 Nov 2024 13:13:32 +0400 Subject: [PATCH] count model flops for f32xf32, f16xf32, q4kxf32, q6kxf32 --- common/profiler.cpp | 82 +++++++++++++++----------- common/profiler.h | 82 +++++++++++++++----------- include/llama.h | 3 +- src/llama.cpp | 140 ++++++++++++++++++++++++++++---------------- 4 files changed, 188 insertions(+), 119 deletions(-) diff --git a/common/profiler.cpp b/common/profiler.cpp index 8dcd05e7..d2363a4f 100644 --- a/common/profiler.cpp +++ b/common/profiler.cpp @@ -465,15 +465,15 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); - LOG_INF("| CPU flops (F32 x F32, GFLOPS)"); + LOG_INF("| CPU flops (F32xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f32); + LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f32_f32); } LOG_INF("\n"); - LOG_INF("| CPU flops (F16 x F16, GFLOPS)"); + LOG_INF("| CPU flops (F16xF32, GFLOPS) "); for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f16); + LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f16_f32); } LOG_INF("\n"); @@ -593,13 +593,13 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| Metal flops (F32xF32, GFLOPS)"); for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f32); + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f32_f32); } LOG_INF("\n"); - LOG_INF("| Metal flops (F16xF16, GFLOPS)"); + LOG_INF("| Metal flops (F16xF32, GFLOPS)"); for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f16); + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f16_f32); } LOG_INF("\n"); @@ -617,13 +617,13 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m LOG_INF("| CUDA flops (F32xF32, GFLOPS)"); for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32); + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32_f32); } LOG_INF("\n"); - LOG_INF("| CUDA flops (F16xF16, GFLOPS)"); + LOG_INF("| CUDA flops (F16xF32, GFLOPS)"); for (int i = 0; i < n; ++i) { - LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f16); + LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f16_f32); } LOG_INF("\n"); @@ -639,33 +639,45 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m } LOG_INF("\n"); - LOG_INF("| Model flops (input) "); - LOG_INF("| %-10lu ", dev_info_set[0].model_flops.input_flops); + LOG_INF("| Model flops (output F32xF32) "); + LOG_INF("| %-10lu ", dev_info_set[0].model_flops.output_f32_f32); LOG_INF("\n"); - LOG_INF("| Model flops (each layer) "); - LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_flops); + LOG_INF("| Model flops (output Q6KxF32) "); + LOG_INF("| %-10lu ", dev_info_set[0].model_flops.output_q6k_f32); LOG_INF("\n"); - LOG_INF("| Model flops (output) "); - LOG_INF("| %-10lu ", dev_info_set[0].model_flops.output_flops); + LOG_INF("| Model flops (layer F32xF32) "); + LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_f32_f32); + LOG_INF("\n"); + + LOG_INF("| Model flops (layer F16xF32) "); + LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_f16_f32); + LOG_INF("\n"); + + LOG_INF("| Model flops (layer Q4KxF32) "); + LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_q4k_f32); + LOG_INF("\n"); + + LOG_INF("| Model flops (layer Q6KxF32) "); + LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_q6k_f32); LOG_INF("\n"); LOG_INF("| Model params (input) "); - LOG_INF("| %-10lu ", dev_info_set[0].model_flops.input_params); + LOG_INF("| %-10lu ", dev_info_set[0].model_params.input_params); LOG_INF("\n"); LOG_INF("| Model params (each layer) "); - LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_params); + LOG_INF("| %-10lu ", dev_info_set[0].model_params.layer_params); LOG_INF("\n"); LOG_INF("| Model params (output) "); - LOG_INF("| %-10lu ", dev_info_set[0].model_flops.output_params); + LOG_INF("| %-10lu ", dev_info_set[0].model_params.output_params); LOG_INF("\n"); model_flops ffo = dev_info_set[0].model_flops; - int64_t total_flops = ffo.input_flops + ffo.output_flops + (ffo.layer_flops * llama_model_n_layers(model)); - double cpu_flops_f16 = dev_info_set[0].cpu_props.flops_f16 * 1e9; + int64_t total_flops = ffo.output_f32_f32 + (ffo.layer_f32_f32 * llama_model_n_layers(model)); // todo + double cpu_flops_f16 = dev_info_set[0].cpu_props.flops_f16_f32 * 1e9; LOG_INF("| Token latency (ms) "); LOG_INF("| %-10.2f ", total_flops / cpu_flops_f16 * 1000); @@ -739,10 +751,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->cpu_props.cores, sizeof(uint32_t)); ptr += sizeof(uint32_t); - memcpy(ptr, &dev_info->cpu_props.flops_f32, sizeof(float)); + memcpy(ptr, &dev_info->cpu_props.flops_f32_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->cpu_props.flops_f16, sizeof(float)); + memcpy(ptr, &dev_info->cpu_props.flops_f16_f32, sizeof(float)); ptr += sizeof(float); memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float)); @@ -763,10 +775,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.memory_total, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.metal_flops_f32, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.metal_flops_f32_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.metal_flops_f16, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.metal_flops_f16_f32, sizeof(float)); ptr += sizeof(float); memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float)); @@ -775,10 +787,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.metal_flops_q6k_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.cuda_flops_f32, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_f32_f32, sizeof(float)); ptr += sizeof(float); - memcpy(ptr, &dev_info->gpu_props.cuda_flops_f16, sizeof(float)); + memcpy(ptr, &dev_info->gpu_props.cuda_flops_f16_f32, sizeof(float)); ptr += sizeof(float); memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float)); @@ -786,7 +798,7 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { memcpy(ptr, &dev_info->gpu_props.cuda_flops_q6k_f32, sizeof(float)); - // no need to synchronize model flops + // no need to synchronize model flops and model params return total_size; } @@ -844,10 +856,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->cpu_props.cores, ptr, sizeof(uint32_t)); ptr += sizeof(uint32_t); - memcpy(&dev_info->cpu_props.flops_f32, ptr, sizeof(float)); + memcpy(&dev_info->cpu_props.flops_f32_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->cpu_props.flops_f16, ptr, sizeof(float)); + memcpy(&dev_info->cpu_props.flops_f16_f32, ptr, sizeof(float)); ptr += sizeof(float); memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float)); @@ -868,10 +880,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.memory_total, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.metal_flops_f32, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.metal_flops_f32_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.metal_flops_f16, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.metal_flops_f16_f32, ptr, sizeof(float)); ptr += sizeof(float); memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float)); @@ -880,10 +892,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.metal_flops_q6k_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.cuda_flops_f32, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.cuda_flops_f32_f32, ptr, sizeof(float)); ptr += sizeof(float); - memcpy(&dev_info->gpu_props.cuda_flops_f16, ptr, sizeof(float)); + memcpy(&dev_info->gpu_props.cuda_flops_f16_f32, ptr, sizeof(float)); ptr += sizeof(float); memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float)); @@ -891,5 +903,5 @@ void deserialize(const char * buffer, struct device_info * dev_info) { memcpy(&dev_info->gpu_props.cuda_flops_q6k_f32, ptr, sizeof(float)); - // no need to synchronize model flops + // no need to synchronize model flops and model params } \ No newline at end of file diff --git a/common/profiler.h b/common/profiler.h index f1c79d8d..bda570ff 100644 --- a/common/profiler.h +++ b/common/profiler.h @@ -8,8 +8,8 @@ struct cpu_props { const char * name; const char * description; uint32_t cores; - float flops_f32; // in GFLOPS - float flops_f16; // in GFLOPS + float flops_f32_f32; // in GFLOPS + float flops_f16_f32; // in GFLOPS float flops_q4k_f32; // in GFLOPS float flops_q6k_f32; // in GFLOPS @@ -17,8 +17,8 @@ struct cpu_props { name(""), description(""), cores(0), - flops_f32 (0.0f), - flops_f16 (0.0f), + flops_f32_f32(0.0f), + flops_f16_f32(0.0f), flops_q4k_f32(0.0f), flops_q6k_f32(0.0f) {} }; @@ -62,12 +62,12 @@ struct gpu_props { const char * description; float memory_free; // in GB float memory_total; // in GB - float metal_flops_f32; // in GFLOPS - float metal_flops_f16; // in GFLOPS + float metal_flops_f32_f32; // in GFLOPS + float metal_flops_f16_f32; // in GFLOPS float metal_flops_q4k_f32; // in GFLOPS float metal_flops_q6k_f32; // in GFLOPS - float cuda_flops_f32; // in GFLOPS - float cuda_flops_f16; // in GFLOPS + float cuda_flops_f32_f32; // in GFLOPS + float cuda_flops_f16_f32; // in GFLOPS float cuda_flops_q4k_f32; // in GFLOPS float cuda_flops_q6k_f32; // in GFLOPS @@ -76,45 +76,54 @@ struct gpu_props { description(""), memory_free (0.0f), memory_total (0.0f), - metal_flops_f32 (0.0f), - metal_flops_f16 (0.0f), + metal_flops_f32_f32(0.0f), + metal_flops_f16_f32(0.0f), metal_flops_q4k_f32(0.0f), metal_flops_q6k_f32(0.0f), - cuda_flops_f32 (0.0f), - cuda_flops_f16 (0.0f), + cuda_flops_f32_f32 (0.0f), + cuda_flops_f16_f32 (0.0f), cuda_flops_q4k_f32 (0.0f), cuda_flops_q6k_f32 (0.0f) {} }; struct model_flops { - // model flops - int64_t input_flops; - int64_t output_flops; - int64_t layer_flops; - - // model params + int64_t output_f32_f32; + int64_t output_q6k_f32; + int64_t layer_f32_f32; + int64_t layer_f16_f32; + int64_t layer_q4k_f32; + int64_t layer_q6k_f32; + + model_flops() : + output_f32_f32(0), + output_q6k_f32(0), + layer_f32_f32 (0), + layer_f16_f32 (0), + layer_q4k_f32 (0), + layer_q6k_f32 (0) {} +}; + +struct model_params { int64_t input_params; int64_t output_params; int64_t layer_params; - model_flops() : - input_flops (0), - output_flops (0), - layer_flops (0), - input_params (0), - output_params(0), + model_params() : + input_params (0), + output_params(0), layer_params (0) {} }; struct device_info { - uint32_t rank; - const char * device_name; - float disk_read_bandwidth; // in GB/s - struct cpu_props cpu_props; - struct memory_info memory; - struct gpu_support gpu_support; - struct gpu_props gpu_props; - struct model_flops model_flops; + uint32_t rank; + const char * device_name; + float disk_read_bandwidth; // in GB/s + struct cpu_props cpu_props; + struct memory_info memory; + struct gpu_support gpu_support; + struct gpu_props gpu_props; + struct model_flops model_flops; + struct model_params model_params; device_info() : rank(0), @@ -124,7 +133,8 @@ struct device_info { memory(), gpu_support(), gpu_props(), - model_flops() {} + model_flops(), + model_params() {} }; enum profiler_backend_type { @@ -133,6 +143,12 @@ enum profiler_backend_type { PROFILER_BACKEND_TYPE_CUDA = 2, }; +enum profiler_layer_type { + PROFILER_LAYER_INPUT = 0, + PROFILER_LAYER_OUTPUT = 1, + PROFILER_LAYER_BACKEND = 2, +}; + const char * device_name(void); uint32_t device_cpu_cores (void); diff --git a/include/llama.h b/include/llama.h index 7fe18d39..24663712 100644 --- a/include/llama.h +++ b/include/llama.h @@ -528,7 +528,8 @@ extern "C" { LLAMA_API void llama_model_n_flops( struct llama_model * model, struct llama_model_loader * ml, - struct model_flops * ffo, + struct model_flops * n_flops, + struct model_params * n_params, const int64_t n_input, const int64_t n_history); diff --git a/src/llama.cpp b/src/llama.cpp index 1a3eb1d6..50a3b832 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3549,8 +3549,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_ void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, const char * test_file, int n_threads) { dev_info->device_name = device_name(); dev_info->cpu_props.cores = device_cpu_cores(); - dev_info->cpu_props.flops_f32 = device_cpu_flops(model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads); - dev_info->cpu_props.flops_f16 = device_cpu_flops(model, GGML_TYPE_F16, GGML_TYPE_F16, n_threads); + dev_info->cpu_props.flops_f32_f32 = device_cpu_flops(model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads); + dev_info->cpu_props.flops_f16_f32 = device_cpu_flops(model, GGML_TYPE_F16, GGML_TYPE_F32, n_threads); dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads); dev_info->cpu_props.flops_q6k_f32 = device_cpu_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads); @@ -3582,18 +3582,19 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll dev_info->gpu_props.description = gpu_props.description; dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100; dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100; - dev_info->gpu_props.metal_flops_f32 = device_metal_flops(model, GGML_TYPE_F32, GGML_TYPE_F32); - dev_info->gpu_props.metal_flops_f16 = device_metal_flops(model, GGML_TYPE_F16, GGML_TYPE_F16); + dev_info->gpu_props.metal_flops_f32_f32 = device_metal_flops(model, GGML_TYPE_F32, GGML_TYPE_F32); + dev_info->gpu_props.metal_flops_f16_f32 = device_metal_flops(model, GGML_TYPE_F16, GGML_TYPE_F32); dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32); dev_info->gpu_props.metal_flops_q6k_f32 = device_metal_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_f32 = device_cuda_flops (model, GGML_TYPE_F32, GGML_TYPE_F32); - dev_info->gpu_props.cuda_flops_f16 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F16); + dev_info->gpu_props.cuda_flops_f32_f32 = device_cuda_flops (model, GGML_TYPE_F32, GGML_TYPE_F32); + dev_info->gpu_props.cuda_flops_f16_f32 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F32); dev_info->gpu_props.cuda_flops_q4k_f32 = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32); dev_info->gpu_props.cuda_flops_q6k_f32 = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32); if (dev_info->rank == 0) { - struct model_flops * ffo = &dev_info->model_flops; - llama_model_n_flops(model, ml, ffo, 1, 10); + struct model_flops * n_flops = &dev_info->model_flops; + struct model_params * n_params = &dev_info->model_params; + llama_model_n_flops(model, ml, n_flops, n_params, 1, 10); } } @@ -20669,7 +20670,46 @@ static void llama_model_reset_tensors(struct llama_model * model) { model->cls_out_b = nullptr; } -void llama_model_n_flops(struct llama_model * model, struct llama_model_loader * ml, struct model_flops * ffo, const int64_t n_input, const int64_t n_history) { +static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, enum profiler_layer_type ltype, int64_t n) { + switch (ltype) { + case PROFILER_LAYER_OUTPUT: + switch (dtype) { + case GGML_TYPE_F32: + n_flops->output_f32_f32 += n; + break; + case GGML_TYPE_Q6_K: + n_flops->output_q6k_f32 += n; + break; + default: + throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n"); + } + break; + + case PROFILER_LAYER_BACKEND: + switch (dtype) { + case GGML_TYPE_F32: + n_flops->layer_f32_f32 += n; + break; + case GGML_TYPE_F16: + n_flops->layer_f16_f32 += n; + break; + case GGML_TYPE_Q4_K: + n_flops->layer_q4k_f32 += n; + break; + case GGML_TYPE_Q6_K: + n_flops->layer_q6k_f32 += n; + break; + default: + throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n"); + } + break; + + default: + throw std::runtime_error("Unrecognized profiler layer type\n"); + } +} + +void llama_model_n_flops(struct llama_model * model, struct llama_model_loader * ml, struct model_flops * n_flops, struct model_params * n_params, const int64_t n_input, const int64_t n_history) { const llama_hparams hparams = model->hparams; const int64_t n_layer = hparams.n_layer; const int64_t n_vocab = hparams.n_vocab; @@ -20774,73 +20814,73 @@ void llama_model_n_flops(struct llama_model * model, struct llama_model_loader * if (it != tensor_name_map.end()) { switch (it->second) { case 1: { // "token_embd.weight" - ffo->input_flops += (2 * n_input * n_embd * n_vocab - n_input * n_embd); - ffo->input_params += static_cast(ggml_nelements(cur)); + n_params->input_params += static_cast(ggml_nelements(cur)); break; } case 2: { // "output_norm.weight" - ffo->output_flops += n_input * (8 * n_embd + 1); - ffo->output_params += static_cast(ggml_nelements(cur)); + count_n_flops(n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_input * (4 * n_embd + 1)); + n_params->output_params += static_cast(ggml_nelements(cur)); break; } case 3: { // "output.weight" - ffo->output_flops += 2 * n_input * n_embd * n_vocab; - ffo->output_flops += 5 * n_input * n_vocab; - ffo->output_params += static_cast(ggml_nelements(cur)); + count_n_flops(n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_input * n_embd * n_vocab); + count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_input * n_vocab); // softmax + n_params->output_params += static_cast(ggml_nelements(cur)); break; } case 4: // "blk.0.attn_norm.weight" case 12: // "blk.0.ffn_norm.weight" { - ffo->layer_flops += n_input * (8 * n_embd + 1); - ffo->layer_params += static_cast(ggml_nelements(cur)); + count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * (4 * n_embd + 1)); + n_params->layer_params += static_cast(ggml_nelements(cur)); break; } case 5: { // "blk.0.attn_q.weight" - ffo->layer_flops += 2 * n_input * n_embd * (n_head * n_embd_head_k); - ffo->layer_params += static_cast(ggml_nelements(cur)); + count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_head_k)); + count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_head_k); // rope + n_params->layer_params += static_cast(ggml_nelements(cur)); break; } case 6: { // "blk.0.attn_k.weight" - ffo->layer_flops += 2 * n_input * n_embd * (n_head * n_embd_k_gqa); - ffo->layer_flops += 2 * n_input * (n_input + n_history) * n_embd_head_k * n_head; // Q*K with KVCache - ffo->layer_flops += 7 * n_input * (n_input + n_history) * n_head; // scale, mask, and softmax - ffo->layer_params += static_cast(ggml_nelements(cur)); + count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_k_gqa)); + count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_k_gqa); // rope + count_n_flops(n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kq with kvcache + count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 7 * n_input * (n_input + n_history) * n_head); // scale, mask, and softmax + n_params->layer_params += static_cast(ggml_nelements(cur)); break; } case 7: { // "blk.0.attn_v.weight" - ffo->layer_flops += 2 * n_input * n_embd * (n_head * n_embd_v_gqa); - ffo->layer_flops += n_input * (n_input + n_history) * n_embd_head_k * n_head; // QKV with KVCache - ffo->layer_params += static_cast(ggml_nelements(cur)); + count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_v_gqa)); + count_n_flops(n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kqv with kvcache + n_params->layer_params += static_cast(ggml_nelements(cur)); break; } case 8: { // "blk.0.attn_output.weight" - ffo->layer_flops += 2 * n_input * (n_head * n_embd_head_k) * n_embd; - ffo->layer_flops += n_input * n_embd; // shortcut - ffo->layer_params += static_cast(ggml_nelements(cur)); + count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * (n_head * n_embd_head_k) * n_embd); + count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut + n_params->layer_params += static_cast(ggml_nelements(cur)); break; } case 9: { // "blk.0.ffn_gate.weight" - ffo->layer_flops += 2 * n_input * n_embd * n_ff; - ffo->layer_flops += 5 * n_input * n_ff; // SiLU - ffo->layer_params += static_cast(ggml_nelements(cur)); + count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff); + count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 5 * n_input * n_ff); // SiLU + n_params->layer_params += static_cast(ggml_nelements(cur)); break; } case 10: { // "blk.0.ffn_down.weight" - ffo->layer_flops += 2 * n_input * n_embd * n_ff; - ffo->layer_flops += n_input * n_embd; // shortcut - ffo->layer_params += static_cast(ggml_nelements(cur)); + count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff); + count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut + n_params->layer_params += static_cast(ggml_nelements(cur)); break; } case 11: { // "blk.0.ffn_up.weight" - ffo->layer_flops += 2 * n_input * n_embd * n_ff; - ffo->layer_flops += n_input * n_ff; // silu(gate(x)) * up(x) - ffo->layer_params += static_cast(ggml_nelements(cur)); + count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff); + count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_ff); // silu(gate(x)) * up(x) + n_params->layer_params += static_cast(ggml_nelements(cur)); break; } - case 13: { // rope_freqs.weight, for Q and K - ffo->layer_flops += 8 * n_input * n_head * n_embd_head_k; - ffo->layer_params += static_cast(ggml_nelements(cur)); + case 13: { // rope_freqs.weight, has been counted in q and k + n_params->layer_params += static_cast(ggml_nelements(cur)); break; } // optional: bias tensors @@ -20850,29 +20890,29 @@ void llama_model_n_flops(struct llama_model * model, struct llama_model_loader * case 17: // "blk.0.attn_output.bias" case 19: // "blk.0.ffn_down.bias" { - ffo->layer_flops += n_input * n_embd; - ffo->layer_params += static_cast(ggml_nelements(cur)); + count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_embd); + n_params->layer_params += static_cast(ggml_nelements(cur)); break; } case 18: // "blk.0.ffn_gate.bias" case 20: // "blk.0.ffn_up.bias" { - ffo->layer_flops += n_input * n_ff; - ffo->layer_params += static_cast(ggml_nelements(cur)); + count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_ff); + n_params->layer_params += static_cast(ggml_nelements(cur)); break; } // optional: expert tensors case 21: { // "blk.0.ffn_gate_inp.weight" - ffo->layer_flops += 2 * n_input * n_embd * n_expert; - ffo->layer_params += static_cast(ggml_nelements(cur)); + count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_expert); + n_params->layer_params += static_cast(ggml_nelements(cur)); break; } case 22: // "blk.0.ffn_gate_exps.weight" case 23: // "blk.0.ffn_down_exps.weight" case 24: // "blk.0.ffn_up_exps.weight" { - ffo->layer_flops += 2 * n_input * n_embd * n_ff * n_expert; - ffo->layer_params += static_cast(ggml_nelements(cur)); + count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff * n_expert); + n_params->layer_params += static_cast(ggml_nelements(cur)); break; } default: