diff --git a/common/profiler.cpp b/common/profiler.cpp
index d2363a4f..1bdd88b6 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -31,6 +31,7 @@
 #include
 #include
 #include
+#include <cinttypes> // for PRId64
 
 const char * device_name() {
     static char device_name[256];
@@ -489,6 +490,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");
 
+    LOG_INF("| CPU flops (Q80xF32, GFLOPS)  ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q80_f32);
+    }
+    LOG_INF("\n");
+
     LOG_INF("| Physical Mem Total (GB)      ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical);
@@ -615,6 +622,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");
 
+    LOG_INF("| Metal flops (Q80xF32, GFLOPS)");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q80_f32);
+    }
+    LOG_INF("\n");
+
     LOG_INF("| CUDA flops (F32xF32, GFLOPS) ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32_f32);
@@ -639,49 +652,119 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");
 
+    LOG_INF("| CUDA flops (Q80xF32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q80_f32);
+    }
+    LOG_INF("\n");
+
     LOG_INF("| Model flops (output F32xF32) ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.output_f32_f32);
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f32_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output F16xF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f16_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output Q4KxF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q4k_f32);
     LOG_INF("\n");
 
     LOG_INF("| Model flops (output Q6KxF32) ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.output_q6k_f32);
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q6k_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output Q80xF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q80_f32);
     LOG_INF("\n");
 
     LOG_INF("| Model flops (layer F32xF32)  ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_f32_f32);
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f32_f32);
     LOG_INF("\n");
 
     LOG_INF("| Model flops (layer F16xF32)  ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_f16_f32);
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f16_f32);
     LOG_INF("\n");
 
     LOG_INF("| Model flops (layer Q4KxF32)  ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_q4k_f32);
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q4k_f32);
     LOG_INF("\n");
 
     LOG_INF("| Model flops (layer Q6KxF32)  ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_q6k_f32);
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q6k_f32);
+    LOG_INF("\n");
+
-    LOG_INF("| Model params (input)         ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_params.input_params);
+    LOG_INF("| Model flops (layer Q80xF32)  ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q80_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (each layer)    ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_params.layer_params);
+    LOG_INF("| Model params (input F32)     ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (output)        ");
LOG_INF("| %-10lu ", dev_info_set[0].model_params.output_params); + LOG_INF("| Model params (input F16) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f16); + LOG_INF("\n"); + + LOG_INF("| Model params (input Q4K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q4k); + LOG_INF("\n"); + + LOG_INF("| Model params (input Q6K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q6k); + LOG_INF("\n"); + + LOG_INF("| Model params (input Q80) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q80); + LOG_INF("\n"); + + LOG_INF("| Model params (layer F32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f32); + LOG_INF("\n"); + + LOG_INF("| Model params (layer F16) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f16); + LOG_INF("\n"); + + LOG_INF("| Model params (layer Q4K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q4k); + LOG_INF("\n"); + + LOG_INF("| Model params (layer Q6K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q6k); + LOG_INF("\n"); + + LOG_INF("| Model params (layer Q80) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q80); + LOG_INF("\n"); + + LOG_INF("| Model params (output F32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f32); + LOG_INF("\n"); + + LOG_INF("| Model params (output F16) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f16); + LOG_INF("\n"); + + LOG_INF("| Model params (output Q4K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q4k); + LOG_INF("\n"); + + LOG_INF("| Model params (output Q6K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q6k); + LOG_INF("\n"); + + LOG_INF("| Model params (output Q80) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q80); LOG_INF("\n"); model_flops ffo = dev_info_set[0].model_flops; int64_t total_flops = ffo.output_f32_f32 + (ffo.layer_f32_f32 * llama_model_n_layers(model)); // todo double cpu_flops_f16 = dev_info_set[0].cpu_props.flops_f16_f32 * 1e9; - LOG_INF("| Token latency (ms) "); - LOG_INF("| %-10.2f ", total_flops / cpu_flops_f16 * 1000); - LOG_INF("\n"); + // LOG_INF("| Token latency (ms) "); + // LOG_INF("| %-10.2f ", total_flops / cpu_flops_f16 * 1000); + // LOG_INF("\n"); LOG_INF("-------------------------------------------------------------------------------------------\n\n"); } @@ -704,12 +787,12 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { + gpu_description_len + sizeof(float) // disk_read_bandwidth + sizeof(uint32_t) // cpu_props.cores - + sizeof(float) * 4 // cpu_props.flops_f32, cpu_props.flops_f16, cpu_props.flops_q4k_f32, cpu_props.flops_q6k_f32 + + sizeof(float) * 5 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32 + sizeof(struct memory_info) + sizeof(struct gpu_support) - + sizeof(float) * 10; // gpu_props.memory_free, gpu_props.memory_total, - // gpu_props.metal_flops_f32, gpu_props.metal_flops_f16, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q6k_f32, - // gpu_props.cuda_flops_f32, gpu_props.cuda_flops_f16, gpu_props.cuda_flops_q8, and gpu_props.cuda_flops_q4k + + sizeof(float) * 12; // gpu_props.memory_free, gpu_props.memory_total, + // gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q6k_f32, 
+                                                    // gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32,
+                                                    // gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32
 
     *buffer = (char *)malloc(total_size);
     char * ptr = *buffer;
@@ -763,6 +846,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->cpu_props.flops_q6k_f32, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(ptr, &dev_info->cpu_props.flops_q80_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->memory, sizeof(struct memory_info));
     ptr += sizeof(struct memory_info);
@@ -787,6 +873,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.metal_flops_q6k_f32, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_q80_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_f32_f32, sizeof(float));
     ptr += sizeof(float);
@@ -797,6 +886,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     ptr += sizeof(float);
 
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_q6k_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q80_f32, sizeof(float));
 
     // no need to synchronize model flops and model params
     return total_size;
@@ -868,6 +960,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->cpu_props.flops_q6k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(&dev_info->cpu_props.flops_q80_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->memory, ptr, sizeof(struct memory_info));
     ptr += sizeof(struct memory_info);
@@ -892,6 +987,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.metal_flops_q6k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(&dev_info->gpu_props.metal_flops_q80_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->gpu_props.cuda_flops_f32_f32, ptr, sizeof(float));
     ptr += sizeof(float);
@@ -902,6 +1000,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     ptr += sizeof(float);
 
     memcpy(&dev_info->gpu_props.cuda_flops_q6k_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.cuda_flops_q80_f32, ptr, sizeof(float));
 
     // no need to synchronize model flops and model params
 }
\ No newline at end of file
diff --git a/common/profiler.h b/common/profiler.h
index bda570ff..a2395a14 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -12,6 +12,7 @@ struct cpu_props {
     float flops_f16_f32; // in GFLOPS
     float flops_q4k_f32; // in GFLOPS
     float flops_q6k_f32; // in GFLOPS
+    float flops_q80_f32; // in GFLOPS
 
     cpu_props() :
         name(""),
@@ -20,7 +21,8 @@ struct cpu_props {
         flops_f32_f32(0.0f),
         flops_f16_f32(0.0f),
         flops_q4k_f32(0.0f),
-        flops_q6k_f32(0.0f) {}
+        flops_q6k_f32(0.0f),
+        flops_q80_f32(0.0f) {}
 };
 
 struct memory_info {
@@ -66,10 +68,12 @@ struct gpu_props {
     float metal_flops_f16_f32; // in GFLOPS
     float metal_flops_q4k_f32; // in GFLOPS
     float metal_flops_q6k_f32; // in GFLOPS
+    float metal_flops_q80_f32; // in GFLOPS
     float cuda_flops_f32_f32;  // in GFLOPS
     float cuda_flops_f16_f32;  // in GFLOPS
     float cuda_flops_q4k_f32;  // in GFLOPS
     float cuda_flops_q6k_f32;  // in GFLOPS
+    float cuda_flops_q80_f32;  // in GFLOPS
 
     gpu_props() :
         name(""),
@@ -80,38 +84,72 @@ struct gpu_props {
         metal_flops_f16_f32(0.0f),
         metal_flops_q4k_f32(0.0f),
         metal_flops_q6k_f32(0.0f),
+        metal_flops_q80_f32(0.0f),
         cuda_flops_f32_f32 (0.0f),
         cuda_flops_f16_f32 (0.0f),
         cuda_flops_q4k_f32 (0.0f),
-        cuda_flops_q6k_f32 (0.0f) {}
+        cuda_flops_q6k_f32 (0.0f),
+        cuda_flops_q80_f32 (0.0f) {}
 };
 
 struct model_flops {
     int64_t output_f32_f32;
+    int64_t output_f16_f32;
+    int64_t output_q4k_f32;
     int64_t output_q6k_f32;
+    int64_t output_q80_f32;
     int64_t layer_f32_f32;
     int64_t layer_f16_f32;
     int64_t layer_q4k_f32;
     int64_t layer_q6k_f32;
+    int64_t layer_q80_f32;
 
     model_flops() :
         output_f32_f32(0),
+        output_f16_f32(0),
+        output_q4k_f32(0),
         output_q6k_f32(0),
+        output_q80_f32(0),
         layer_f32_f32 (0),
         layer_f16_f32 (0),
         layer_q4k_f32 (0),
-        layer_q6k_f32 (0) {}
+        layer_q6k_f32 (0),
+        layer_q80_f32 (0) {}
 };
 
 struct model_params {
-    int64_t input_params;
-    int64_t output_params;
-    int64_t layer_params;
+    int64_t input_f32;
+    int64_t input_f16;
+    int64_t input_q4k;
+    int64_t input_q6k;
+    int64_t input_q80;
+    int64_t output_f32;
+    int64_t output_f16;
+    int64_t output_q4k;
+    int64_t output_q6k;
+    int64_t output_q80;
+    int64_t layer_f32;
+    int64_t layer_f16;
+    int64_t layer_q4k;
+    int64_t layer_q6k;
+    int64_t layer_q80;
 
     model_params() :
-        input_params (0),
-        output_params(0),
-        layer_params (0) {}
+        input_f32 (0),
+        input_f16 (0),
+        input_q4k (0),
+        input_q6k (0),
+        input_q80 (0),
+        output_f32(0),
+        output_f16(0),
+        output_q4k(0),
+        output_q6k(0),
+        output_q80(0),
+        layer_f32 (0),
+        layer_f16 (0),
+        layer_q4k (0),
+        layer_q6k (0),
+        layer_q80 (0) {}
 };
 
 struct device_info {
diff --git a/src/llama.cpp b/src/llama.cpp
index 50a3b832..fd7cb279 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -91,6 +91,7 @@
 #include
 #include
 #include
+#include <cinttypes> // for PRId64
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -110,7 +111,7 @@ struct Timer {
     ~Timer() {
         if (enable_timer) {
             int64_t end_time = ggml_time_us();
-            LLAMA_LOG_INFO("Time to run %s: %lu ms\n", name, (end_time - start_time)/1000);
+            LLAMA_LOG_INFO("Time to run %s: %" PRId64 " ms\n", name, (end_time - start_time) / 1000);
         }
     }
 };
@@ -3553,6 +3554,7 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     dev_info->cpu_props.flops_f16_f32 = device_cpu_flops(model, GGML_TYPE_F16,  GGML_TYPE_F32, n_threads);
     dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
     dev_info->cpu_props.flops_q6k_f32 = device_cpu_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads);
+    dev_info->cpu_props.flops_q80_f32 = device_cpu_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads);
 
     dev_info->memory.total_physical     = round(device_physical_memory(false) / (double)(1 << 30) * 100) / 100;
     dev_info->memory.available_physical = round(device_physical_memory(true)  / (double)(1 << 30) * 100) / 100;
@@ -3586,10 +3588,12 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     dev_info->gpu_props.metal_flops_f16_f32 = device_metal_flops(model, GGML_TYPE_F16,  GGML_TYPE_F32);
     dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
     dev_info->gpu_props.metal_flops_q6k_f32 = device_metal_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
+    dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
     dev_info->gpu_props.cuda_flops_f32_f32  = device_cuda_flops (model, GGML_TYPE_F32,  GGML_TYPE_F32);
     dev_info->gpu_props.cuda_flops_f16_f32  = device_cuda_flops (model, GGML_TYPE_F16,  GGML_TYPE_F32);
     dev_info->gpu_props.cuda_flops_q4k_f32  = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
     dev_info->gpu_props.cuda_flops_q6k_f32  = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
+    dev_info->gpu_props.cuda_flops_q80_f32  = device_cuda_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
 
     if (dev_info->rank == 0) {
         struct model_flops * n_flops = &dev_info->model_flops;
@@ -20677,9 +20681,18 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
                 case GGML_TYPE_F32:
                     n_flops->output_f32_f32 += n;
                     break;
+                case GGML_TYPE_F16:
+                    n_flops->output_f16_f32 += n;
+                    break;
+                case GGML_TYPE_Q4_K:
+                    n_flops->output_q4k_f32 += n;
+                    break;
                 case GGML_TYPE_Q6_K:
                     n_flops->output_q6k_f32 += n;
                     break;
+                case GGML_TYPE_Q8_0:
+                    n_flops->output_q80_f32 += n;
+                    break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
             }
@@ -20699,6 +20712,83 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
                 case GGML_TYPE_Q6_K:
                     n_flops->layer_q6k_f32 += n;
                     break;
+                case GGML_TYPE_Q8_0:
+                    n_flops->layer_q80_f32 += n;
+                    break;
+                default:
+                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
+            }
+            break;
+
+        default:
+            throw std::runtime_error("Unrecognized profiler layer type\n");
+    }
+}
+
+static void count_n_params(struct model_params * n_params, enum ggml_type dtype, enum profiler_layer_type ltype, size_t n) {
+    int64_t n_i64t = static_cast<int64_t>(n);
+    switch (ltype) {
+        case PROFILER_LAYER_INPUT:
+            switch (dtype) {
+                case GGML_TYPE_F32:
+                    n_params->input_f32 += n_i64t;
+                    break;
+                case GGML_TYPE_F16:
+                    n_params->input_f16 += n_i64t;
+                    break;
+                case GGML_TYPE_Q4_K:
+                    n_params->input_q4k += n_i64t;
+                    break;
+                case GGML_TYPE_Q6_K:
+                    n_params->input_q6k += n_i64t;
+                    break;
+                case GGML_TYPE_Q8_0:
+                    n_params->input_q80 += n_i64t;
+                    break;
+                default:
+                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_INPUT\n");
+            }
+            break;
+
+        case PROFILER_LAYER_OUTPUT:
+            switch (dtype) {
+                case GGML_TYPE_F32:
+                    n_params->output_f32 += n_i64t;
+                    break;
+                case GGML_TYPE_F16:
+                    n_params->output_f16 += n_i64t;
+                    break;
+                case GGML_TYPE_Q4_K:
+                    n_params->output_q4k += n_i64t;
+                    break;
+                case GGML_TYPE_Q6_K:
+                    n_params->output_q6k += n_i64t;
+                    break;
+                case GGML_TYPE_Q8_0:
+                    n_params->output_q80 += n_i64t;
+                    break; // note: this break was missing and would fall through into the throwing default
+                default:
+                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
+            }
+            break;
+
+        case PROFILER_LAYER_BACKEND:
+            switch (dtype) {
+                case GGML_TYPE_F32:
+                    n_params->layer_f32 += n_i64t;
+                    break;
+                case GGML_TYPE_F16:
+                    n_params->layer_f16 += n_i64t;
+                    break;
+                case GGML_TYPE_Q4_K:
+                    n_params->layer_q4k += n_i64t;
+                    break;
+                case GGML_TYPE_Q6_K:
+                    n_params->layer_q6k += n_i64t;
+                    break;
+                case GGML_TYPE_Q8_0:
+                    n_params->layer_q80 += n_i64t;
+                    break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
             }
@@ -20814,73 +20904,73 @@ void llama_model_n_flops(struct llama_model * model, struct llama_model_loader *
         if (it != tensor_name_map.end()) {
            switch (it->second) {
                 case 1: { // "token_embd.weight"
-                    n_params->input_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_INPUT, ggml_nelements(cur));
                     break;
                 }
                 case 2: { // "output_norm.weight"
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_input * (4 * n_embd + 1));
-                    n_params->output_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_input * (4 * n_embd + 1));
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
                     break;
                 }
                 case 3: { // "output.weight"
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_input * n_embd * n_vocab);
-                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_input * n_vocab); // softmax
-                    n_params->output_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_input * n_embd * n_vocab);
+                    count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_input * n_vocab); // softmax
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
                     break;
                 }
                 case 4:  // "blk.0.attn_norm.weight"
                 case 12: // "blk.0.ffn_norm.weight"
                 {
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * (4 * n_embd + 1));
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * (4 * n_embd + 1));
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 case 5: { // "blk.0.attn_q.weight"
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_head_k));
-                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_head_k); // rope
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_head_k));
+                    count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_head_k); // rope
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 case 6: { // "blk.0.attn_k.weight"
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_k_gqa));
-                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_k_gqa); // rope
-                    count_n_flops(n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kq with kvcache
-                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 7 * n_input * (n_input + n_history) * n_head); // scale, mask, and softmax
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_k_gqa));
+                    count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_k_gqa); // rope
+                    count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kq with kvcache
+                    count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 7 * n_input * (n_input + n_history) * n_head); // scale, mask, and softmax
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 case 7: { // "blk.0.attn_v.weight"
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_v_gqa));
-                    count_n_flops(n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kqv with kvcache
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_v_gqa));
+                    count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kqv with kvcache
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 case 8: { // "blk.0.attn_output.weight"
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * (n_head * n_embd_head_k) * n_embd);
-                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * (n_head * n_embd_head_k) * n_embd);
+                    count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 case 9: { // "blk.0.ffn_gate.weight"
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
-                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 5 * n_input * n_ff); // SiLU
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
+                    count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 5 * n_input * n_ff); // SiLU
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 case 10: { // "blk.0.ffn_down.weight"
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
-                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
+                    count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 case 11: { // "blk.0.ffn_up.weight"
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
-                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_ff); // silu(gate(x)) * up(x)
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
+                    count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_ff); // silu(gate(x)) * up(x)
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 case 13: { // rope_freqs.weight, has been counted in q and k
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 // optional: bias tensors
@@ -20890,29 +20980,29 @@ void llama_model_n_flops(struct llama_model * model, struct llama_model_loader *
                 case 17: // "blk.0.attn_output.bias"
                 case 19: // "blk.0.ffn_down.bias"
                 {
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_embd);
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_embd);
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 case 18: // "blk.0.ffn_gate.bias"
                 case 20: // "blk.0.ffn_up.bias"
                 {
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_ff);
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_ff);
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 // optional: expert tensors
                 case 21: { // "blk.0.ffn_gate_inp.weight"
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_expert);
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_expert);
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 case 22: // "blk.0.ffn_gate_exps.weight"
                 case 23: // "blk.0.ffn_down_exps.weight"
                 case 24: // "blk.0.ffn_up_exps.weight"
                 {
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff * n_expert);
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff * n_expert);
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 default: