From 3fe00a16a06f34cbac39c037f31c8c14bff3bc29 Mon Sep 17 00:00:00 2001
From: Lizonghang <870644199@qq.com>
Date: Sun, 24 Nov 2024 13:13:32 +0400
Subject: [PATCH] count model flops for f32xf32, f16xf32, q4kxf32, q6kxf32

---
 common/profiler.cpp |  82 +++++++++++++++-----------
 common/profiler.h   |  82 +++++++++++++++-----------
 include/llama.h     |   3 +-
 src/llama.cpp       | 140 ++++++++++++++++++++++++++++----------------
 4 files changed, 188 insertions(+), 119 deletions(-)

diff --git a/common/profiler.cpp b/common/profiler.cpp
index 8dcd05e7..d2363a4f 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -465,15 +465,15 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU flops (F32 x F32, GFLOPS)");
+    LOG_INF("| CPU flops (F32xF32, GFLOPS)  ");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f   ", dev_info_set[i].cpu_props.flops_f32);
+        LOG_INF("| %-10.1f   ", dev_info_set[i].cpu_props.flops_f32_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU flops (F16 x F16, GFLOPS)");
+    LOG_INF("| CPU flops (F16xF32, GFLOPS)  ");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f   ", dev_info_set[i].cpu_props.flops_f16);
+        LOG_INF("| %-10.1f   ", dev_info_set[i].cpu_props.flops_f16_f32);
     }
     LOG_INF("\n");
 
@@ -593,13 +593,13 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
 
     LOG_INF("| Metal flops (F32xF32, GFLOPS)");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f   ", dev_info_set[i].gpu_props.metal_flops_f32);
+        LOG_INF("| %-10.1f   ", dev_info_set[i].gpu_props.metal_flops_f32_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Metal flops (F16xF16, GFLOPS)");
+    LOG_INF("| Metal flops (F16xF32, GFLOPS)");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f   ", dev_info_set[i].gpu_props.metal_flops_f16);
+        LOG_INF("| %-10.1f   ", dev_info_set[i].gpu_props.metal_flops_f16_f32);
     }
     LOG_INF("\n");
 
@@ -617,13 +617,13 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
 
     LOG_INF("| CUDA  flops (F32xF32, GFLOPS)");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f   ", dev_info_set[i].gpu_props.cuda_flops_f32);
+        LOG_INF("| %-10.1f   ", dev_info_set[i].gpu_props.cuda_flops_f32_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA  flops (F16xF16, GFLOPS)");
+    LOG_INF("| CUDA  flops (F16xF32, GFLOPS)");
     for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f   ", dev_info_set[i].gpu_props.cuda_flops_f16);
+        LOG_INF("| %-10.1f   ", dev_info_set[i].gpu_props.cuda_flops_f16_f32);
     }
     LOG_INF("\n");
 
@@ -639,33 +639,45 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (input)          ");
-    LOG_INF("| %-10lu   ", dev_info_set[0].model_flops.input_flops);
+    LOG_INF("| Model flops (output F32xF32) ");
+    LOG_INF("| %-10lld   ", (long long)dev_info_set[0].model_flops.output_f32_f32); // int64_t counter: cast so the argument matches %lld on every platform
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (each layer)     ");
-    LOG_INF("| %-10lu   ", dev_info_set[0].model_flops.layer_flops);
+    LOG_INF("| Model flops (output Q6KxF32) ");
+    LOG_INF("| %-10lld   ", (long long)dev_info_set[0].model_flops.output_q6k_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (output)         ");
-    LOG_INF("| %-10lu   ", dev_info_set[0].model_flops.output_flops);
+    LOG_INF("| Model flops (layer F32xF32)  ");
+    LOG_INF("| %-10lld   ", (long long)dev_info_set[0].model_flops.layer_f32_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer F16xF32)  ");
+    LOG_INF("| %-10lld   ", (long long)dev_info_set[0].model_flops.layer_f16_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer Q4KxF32)  ");
+    LOG_INF("| %-10lld   ", (long long)dev_info_set[0].model_flops.layer_q4k_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer Q6KxF32)  ");
+    LOG_INF("| %-10lld   ", (long long)dev_info_set[0].model_flops.layer_q6k_f32);
     LOG_INF("\n");
 
     LOG_INF("| Model params (input)         ");
-    LOG_INF("| %-10lu   ", dev_info_set[0].model_flops.input_params);
+    LOG_INF("| %-10lld   ", (long long)dev_info_set[0].model_params.input_params); // int64_t element count: same cast as the flops rows
     LOG_INF("\n");
 
     LOG_INF("| Model params (each layer)    ");
-    LOG_INF("| %-10lu   ", dev_info_set[0].model_flops.layer_params);
+    LOG_INF("| %-10lld   ", (long long)dev_info_set[0].model_params.layer_params);
     LOG_INF("\n");
 
     LOG_INF("| Model params (output)        ");
-    LOG_INF("| %-10lu   ", dev_info_set[0].model_flops.output_params);
+    LOG_INF("| %-10lld   ", (long long)dev_info_set[0].model_params.output_params);
     LOG_INF("\n");
 
     model_flops ffo  = dev_info_set[0].model_flops;
-    int64_t total_flops = ffo.input_flops + ffo.output_flops + (ffo.layer_flops * llama_model_n_layers(model));
-    double cpu_flops_f16 = dev_info_set[0].cpu_props.flops_f16 * 1e9;
+    int64_t total_flops = ffo.output_f32_f32 + (ffo.layer_f32_f32 * llama_model_n_layers(model)); // todo
+    double cpu_flops_f16 = dev_info_set[0].cpu_props.flops_f16_f32 * 1e9;
 
     LOG_INF("| Token latency (ms)           ");
     LOG_INF("| %-10.2f   ", total_flops / cpu_flops_f16 * 1000);
@@ -739,10 +751,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->cpu_props.cores, sizeof(uint32_t));
     ptr += sizeof(uint32_t);
 
-    memcpy(ptr, &dev_info->cpu_props.flops_f32, sizeof(float));
+    memcpy(ptr, &dev_info->cpu_props.flops_f32_f32, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(ptr, &dev_info->cpu_props.flops_f16, sizeof(float));
+    memcpy(ptr, &dev_info->cpu_props.flops_f16_f32, sizeof(float));
     ptr += sizeof(float);
 
     memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float));
@@ -763,10 +775,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.memory_total, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(ptr, &dev_info->gpu_props.metal_flops_f32, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_f32_f32, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(ptr, &dev_info->gpu_props.metal_flops_f16, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_f16_f32, sizeof(float));
     ptr += sizeof(float);
 
     memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float));
@@ -775,10 +787,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.metal_flops_q6k_f32, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(ptr, &dev_info->gpu_props.cuda_flops_f32, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_f32_f32, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(ptr, &dev_info->gpu_props.cuda_flops_f16, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_f16_f32, sizeof(float));
     ptr += sizeof(float);
 
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float));
@@ -786,7 +798,7 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
 
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_q6k_f32, sizeof(float));
 
-    // no need to synchronize model flops
+    // no need to synchronize model flops and model params
     return total_size;
 }
 
@@ -844,10 +856,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->cpu_props.cores, ptr, sizeof(uint32_t));
     ptr += sizeof(uint32_t);
 
-    memcpy(&dev_info->cpu_props.flops_f32, ptr, sizeof(float));
+    memcpy(&dev_info->cpu_props.flops_f32_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(&dev_info->cpu_props.flops_f16, ptr, sizeof(float));
+    memcpy(&dev_info->cpu_props.flops_f16_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
     memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float));
@@ -868,10 +880,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.memory_total, ptr, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(&dev_info->gpu_props.metal_flops_f32, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.metal_flops_f32_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(&dev_info->gpu_props.metal_flops_f16, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.metal_flops_f16_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
     memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float));
@@ -880,10 +892,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.metal_flops_q6k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(&dev_info->gpu_props.cuda_flops_f32, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.cuda_flops_f32_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(&dev_info->gpu_props.cuda_flops_f16, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.cuda_flops_f16_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
     memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float));
@@ -891,5 +903,5 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
 
     memcpy(&dev_info->gpu_props.cuda_flops_q6k_f32, ptr, sizeof(float));
 
-    // no need to synchronize model flops
+    // no need to synchronize model flops and model params
 }
\ No newline at end of file
diff --git a/common/profiler.h b/common/profiler.h
index f1c79d8d..bda570ff 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -8,8 +8,8 @@ struct cpu_props {
     const char * name;
     const char * description;
     uint32_t     cores;
-    float        flops_f32;     // in GFLOPS
-    float        flops_f16;     // in GFLOPS
+    float        flops_f32_f32; // in GFLOPS
+    float        flops_f16_f32; // in GFLOPS
     float        flops_q4k_f32; // in GFLOPS
     float        flops_q6k_f32; // in GFLOPS
 
@@ -17,8 +17,8 @@ struct cpu_props {
         name(""), 
         description(""), 
         cores(0), 
-        flops_f32    (0.0f), 
-        flops_f16    (0.0f), 
+        flops_f32_f32(0.0f), 
+        flops_f16_f32(0.0f), 
         flops_q4k_f32(0.0f),
         flops_q6k_f32(0.0f) {}
 };
@@ -62,12 +62,12 @@ struct gpu_props {
     const char * description;
     float        memory_free;         // in GB
     float        memory_total;        // in GB
-    float        metal_flops_f32;     // in GFLOPS
-    float        metal_flops_f16;     // in GFLOPS
+    float        metal_flops_f32_f32; // in GFLOPS
+    float        metal_flops_f16_f32; // in GFLOPS
     float        metal_flops_q4k_f32; // in GFLOPS
     float        metal_flops_q6k_f32; // in GFLOPS
-    float        cuda_flops_f32;      // in GFLOPS
-    float        cuda_flops_f16;      // in GFLOPS
+    float        cuda_flops_f32_f32;  // in GFLOPS
+    float        cuda_flops_f16_f32;  // in GFLOPS
     float        cuda_flops_q4k_f32;  // in GFLOPS
     float        cuda_flops_q6k_f32;  // in GFLOPS
 
@@ -76,45 +76,54 @@ struct gpu_props {
         description(""), 
         memory_free        (0.0f), 
         memory_total       (0.0f), 
-        metal_flops_f32    (0.0f), 
-        metal_flops_f16    (0.0f),
+        metal_flops_f32_f32(0.0f), 
+        metal_flops_f16_f32(0.0f),
         metal_flops_q4k_f32(0.0f),
         metal_flops_q6k_f32(0.0f),
-        cuda_flops_f32     (0.0f), 
-        cuda_flops_f16     (0.0f), 
+        cuda_flops_f32_f32 (0.0f), 
+        cuda_flops_f16_f32 (0.0f), 
         cuda_flops_q4k_f32 (0.0f), 
         cuda_flops_q6k_f32 (0.0f) {}
 };
 
 struct model_flops {
-    // model flops
-    int64_t input_flops;
-    int64_t output_flops;
-    int64_t layer_flops;
-    
-    // model params
+    int64_t output_f32_f32; // flops counted for PROFILER_LAYER_OUTPUT with GGML_TYPE_F32
+    int64_t output_q6k_f32; // flops counted for PROFILER_LAYER_OUTPUT with GGML_TYPE_Q6_K
+    int64_t layer_f32_f32;  // flops counted for PROFILER_LAYER_BACKEND with GGML_TYPE_F32
+    int64_t layer_f16_f32;  // flops counted for PROFILER_LAYER_BACKEND with GGML_TYPE_F16
+    int64_t layer_q4k_f32;  // flops counted for PROFILER_LAYER_BACKEND with GGML_TYPE_Q4_K
+    int64_t layer_q6k_f32;  // flops counted for PROFILER_LAYER_BACKEND with GGML_TYPE_Q6_K
+
+    model_flops() : 
+        output_f32_f32(0), 
+        output_q6k_f32(0), 
+        layer_f32_f32 (0),
+        layer_f16_f32 (0),
+        layer_q4k_f32 (0),
+        layer_q6k_f32 (0) {}
+};
+
+struct model_params {
     int64_t input_params;
     int64_t output_params;
     int64_t layer_params;
 
-    model_flops() : 
-        input_flops  (0), 
-        output_flops (0), 
-        layer_flops  (0), 
-        input_params (0), 
-        output_params(0), 
+    model_params() :
+        input_params (0),
+        output_params(0),
         layer_params (0) {}
 };
 
 struct device_info {
-    uint32_t           rank;
-    const char *       device_name;
-    float              disk_read_bandwidth;  // in GB/s
-    struct cpu_props   cpu_props;
-    struct memory_info memory;
-    struct gpu_support gpu_support;
-    struct gpu_props   gpu_props;
-    struct model_flops model_flops;
+    uint32_t            rank;
+    const char *        device_name;
+    float               disk_read_bandwidth;  // in GB/s
+    struct cpu_props    cpu_props;
+    struct memory_info  memory;
+    struct gpu_support  gpu_support;
+    struct gpu_props    gpu_props;
+    struct model_flops  model_flops;
+    struct model_params model_params;
 
     device_info() : 
         rank(0), 
@@ -124,7 +133,8 @@ struct device_info {
         memory(), 
         gpu_support(), 
         gpu_props(), 
-        model_flops() {}
+        model_flops(),
+        model_params() {}
 };
 
 enum profiler_backend_type {
@@ -133,6 +143,12 @@ enum profiler_backend_type {
     PROFILER_BACKEND_TYPE_CUDA  = 2,
 };
 
+enum profiler_layer_type { // selects which group of model_flops counters receives a flop count
+    PROFILER_LAYER_INPUT   = 0, // has no flops counter; count_n_flops() in llama.cpp throws for it
+    PROFILER_LAYER_OUTPUT  = 1, // counted into the model_flops output_* fields
+    PROFILER_LAYER_BACKEND = 2, // counted into the model_flops layer_* fields
+};
+
 const char * device_name(void); 
 
 uint32_t device_cpu_cores      (void);
diff --git a/include/llama.h b/include/llama.h
index 7fe18d39..24663712 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -528,7 +528,8 @@ extern "C" {
     LLAMA_API void llama_model_n_flops(
                         struct llama_model * model, 
                  struct llama_model_loader * ml, 
-                        struct model_flops * ffo,
+                        struct model_flops * n_flops,
+                       struct model_params * n_params,
                              const int64_t   n_input,
                              const int64_t   n_history);
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 1a3eb1d6..50a3b832 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3549,8 +3549,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
 void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, const char * test_file, int n_threads) {
     dev_info->device_name               = device_name();
     dev_info->cpu_props.cores           = device_cpu_cores();
-    dev_info->cpu_props.flops_f32       = device_cpu_flops(model, GGML_TYPE_F32,  GGML_TYPE_F32, n_threads);
-    dev_info->cpu_props.flops_f16       = device_cpu_flops(model, GGML_TYPE_F16,  GGML_TYPE_F16, n_threads);
+    dev_info->cpu_props.flops_f32_f32   = device_cpu_flops(model, GGML_TYPE_F32,  GGML_TYPE_F32, n_threads);
+    dev_info->cpu_props.flops_f16_f32   = device_cpu_flops(model, GGML_TYPE_F16,  GGML_TYPE_F32, n_threads);
     dev_info->cpu_props.flops_q4k_f32   = device_cpu_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
     dev_info->cpu_props.flops_q6k_f32   = device_cpu_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads);
 
@@ -3582,18 +3582,19 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     dev_info->gpu_props.description         = gpu_props.description;
     dev_info->gpu_props.memory_free         = round(gpu_props.memory_free  / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.memory_total        = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
-    dev_info->gpu_props.metal_flops_f32     = device_metal_flops(model, GGML_TYPE_F32,  GGML_TYPE_F32);
-    dev_info->gpu_props.metal_flops_f16     = device_metal_flops(model, GGML_TYPE_F16,  GGML_TYPE_F16);
+    dev_info->gpu_props.metal_flops_f32_f32 = device_metal_flops(model, GGML_TYPE_F32,  GGML_TYPE_F32);
+    dev_info->gpu_props.metal_flops_f16_f32 = device_metal_flops(model, GGML_TYPE_F16,  GGML_TYPE_F32);
     dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
     dev_info->gpu_props.metal_flops_q6k_f32 = device_metal_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
-    dev_info->gpu_props.cuda_flops_f32      = device_cuda_flops (model, GGML_TYPE_F32,  GGML_TYPE_F32);
-    dev_info->gpu_props.cuda_flops_f16      = device_cuda_flops (model, GGML_TYPE_F16,  GGML_TYPE_F16);
+    dev_info->gpu_props.cuda_flops_f32_f32  = device_cuda_flops (model, GGML_TYPE_F32,  GGML_TYPE_F32);
+    dev_info->gpu_props.cuda_flops_f16_f32  = device_cuda_flops (model, GGML_TYPE_F16,  GGML_TYPE_F32);
     dev_info->gpu_props.cuda_flops_q4k_f32  = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
     dev_info->gpu_props.cuda_flops_q6k_f32  = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
 
     if (dev_info->rank == 0) {
-        struct model_flops * ffo = &dev_info->model_flops;
-        llama_model_n_flops(model, ml, ffo, 1, 10);
+        struct model_flops  * n_flops  = &dev_info->model_flops;
+        struct model_params * n_params = &dev_info->model_params;
+        llama_model_n_flops(model, ml, n_flops, n_params, 1, 10);
     }
 }
 
@@ -20669,7 +20670,46 @@ static void llama_model_reset_tensors(struct llama_model * model) {
     model->cls_out_b = nullptr;
 }
 
-void llama_model_n_flops(struct llama_model * model, struct llama_model_loader * ml, struct model_flops * ffo, const int64_t n_input, const int64_t n_history) {
+// Accumulate `n` flops into one counter of `n_flops`.
+//   n_flops: counters to update
+//   dtype:   tensor type that selects the counter within the group
+//   ltype:   counter group, PROFILER_LAYER_OUTPUT or PROFILER_LAYER_BACKEND
+//   n:       number of flops to add
+// Throws std::runtime_error when (ltype, dtype) has no matching counter.
+static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, enum profiler_layer_type ltype, int64_t n) {
+    int64_t * slot = nullptr;
+
+    if (ltype == PROFILER_LAYER_OUTPUT) {
+        // output counters exist for F32 and Q6_K only
+        if (dtype == GGML_TYPE_F32) {
+            slot = &n_flops->output_f32_f32;
+        } else if (dtype == GGML_TYPE_Q6_K) {
+            slot = &n_flops->output_q6k_f32;
+        } else {
+            throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
+        }
+    } else if (ltype == PROFILER_LAYER_BACKEND) {
+        // layer counters exist for F32, F16, Q4_K and Q6_K
+        if (dtype == GGML_TYPE_F32) {
+            slot = &n_flops->layer_f32_f32;
+        } else if (dtype == GGML_TYPE_F16) {
+            slot = &n_flops->layer_f16_f32;
+        } else if (dtype == GGML_TYPE_Q4_K) {
+            slot = &n_flops->layer_q4k_f32;
+        } else if (dtype == GGML_TYPE_Q6_K) {
+            slot = &n_flops->layer_q6k_f32;
+        } else {
+            throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
+        }
+    } else {
+        // any other layer type, including PROFILER_LAYER_INPUT, is rejected
+        throw std::runtime_error("Unrecognized profiler layer type\n");
+    }
+
+    *slot += n;
+}
+
+void llama_model_n_flops(struct llama_model * model, struct llama_model_loader * ml, struct model_flops * n_flops, struct model_params * n_params, const int64_t n_input, const int64_t n_history) {
     const llama_hparams hparams  = model->hparams;
     const int64_t n_layer        = hparams.n_layer;
     const int64_t n_vocab        = hparams.n_vocab;
@@ -20774,73 +20814,73 @@ void llama_model_n_flops(struct llama_model * model, struct llama_model_loader *
             if (it != tensor_name_map.end()) {
                 switch (it->second) {
                     case 1: { // "token_embd.weight"
-                        ffo->input_flops  += (2 * n_input * n_embd * n_vocab - n_input * n_embd);
-                        ffo->input_params += static_cast<int64_t>(ggml_nelements(cur));
+                        n_params->input_params  += static_cast<int64_t>(ggml_nelements(cur));
                         break;
                     }
                     case 2: { // "output_norm.weight"
-                        ffo->output_flops  += n_input * (8 * n_embd + 1);
-                        ffo->output_params += static_cast<int64_t>(ggml_nelements(cur));
+                        count_n_flops(n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_input * (4 * n_embd + 1));
+                        n_params->output_params += static_cast<int64_t>(ggml_nelements(cur));
                         break;
                     }
                     case 3: { // "output.weight"
-                        ffo->output_flops  += 2 * n_input * n_embd * n_vocab;
-                        ffo->output_flops  += 5 * n_input * n_vocab;
-                        ffo->output_params += static_cast<int64_t>(ggml_nelements(cur));
+                        count_n_flops(n_flops, cur->type,     PROFILER_LAYER_OUTPUT, 2 * n_input * n_embd * n_vocab);
+                        count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_input * n_vocab); // softmax
+                        n_params->output_params += static_cast<int64_t>(ggml_nelements(cur));
                         break;
                     }
                     case 4:  // "blk.0.attn_norm.weight"
                     case 12: // "blk.0.ffn_norm.weight"
                     { 
-                        ffo->layer_flops  += n_input * (8 * n_embd + 1);
-                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * (4 * n_embd + 1));
+                        n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                         break;
                     }
                     case 5: { // "blk.0.attn_q.weight"
-                        ffo->layer_flops  += 2 * n_input * n_embd * (n_head * n_embd_head_k);
-                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        count_n_flops(n_flops, cur->type,     PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_head_k));
+                        count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_head_k); // rope
+                        n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                         break;
                     }
                     case 6: { // "blk.0.attn_k.weight"
-                        ffo->layer_flops  += 2 * n_input * n_embd * (n_head * n_embd_k_gqa);
-                        ffo->layer_flops  += 2 * n_input * (n_input + n_history) * n_embd_head_k * n_head; // Q*K with KVCache
-                        ffo->layer_flops  += 7 * n_input * (n_input + n_history) * n_head; // scale, mask, and softmax
-                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        count_n_flops(n_flops, cur->type,     PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_k_gqa));
+                        count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_k_gqa); // rope
+                        count_n_flops(n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kq with kvcache
+                        count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 7 * n_input * (n_input + n_history) * n_head); // scale, mask, and softmax
+                        n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                         break;
                     }
                     case 7: { // "blk.0.attn_v.weight"
-                        ffo->layer_flops  += 2 * n_input * n_embd * (n_head * n_embd_v_gqa);
-                        ffo->layer_flops  += n_input * (n_input + n_history) * n_embd_head_k * n_head; // QKV with KVCache
-                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        count_n_flops(n_flops, cur->type,     PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_v_gqa));
+                        count_n_flops(n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kqv with kvcache
+                        n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                         break;
                     }
                     case 8: { // "blk.0.attn_output.weight"
-                        ffo->layer_flops  += 2 * n_input * (n_head * n_embd_head_k) * n_embd;
-                        ffo->layer_flops  += n_input * n_embd; // shortcut
-                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        count_n_flops(n_flops, cur->type,     PROFILER_LAYER_BACKEND, 2 * n_input * (n_head * n_embd_head_k) * n_embd);
+                        count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
+                        n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                         break;
                     }
                     case 9: { // "blk.0.ffn_gate.weight"
-                        ffo->layer_flops  += 2 * n_input * n_embd * n_ff;
-                        ffo->layer_flops  += 5 * n_input * n_ff; // SiLU
-                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        count_n_flops(n_flops, cur->type,     PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
+                        count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 5 * n_input * n_ff); // SiLU
+                        n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                         break;
                     }
                     case 10: { // "blk.0.ffn_down.weight"
-                        ffo->layer_flops  += 2 * n_input * n_embd * n_ff;
-                        ffo->layer_flops  += n_input * n_embd; // shortcut
-                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        count_n_flops(n_flops, cur->type,     PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
+                        count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
+                        n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                         break;
                     }
                     case 11: { // "blk.0.ffn_up.weight"
-                        ffo->layer_flops  += 2 * n_input * n_embd * n_ff;
-                        ffo->layer_flops  += n_input * n_ff; // silu(gate(x)) * up(x)
-                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        count_n_flops(n_flops, cur->type,     PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
+                        count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_ff); // silu(gate(x)) * up(x)
+                        n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                         break;
                     }
-                    case 13: { // rope_freqs.weight, for Q and K
-                        ffo->layer_flops  += 8 * n_input * n_head * n_embd_head_k;
-                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    case 13: { // rope_freqs.weight, has been counted in q and k
+                        n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                         break;
                     }
                     // optional: bias tensors
@@ -20850,29 +20890,29 @@ void llama_model_n_flops(struct llama_model * model, struct llama_model_loader *
                     case 17: // "blk.0.attn_output.bias"
                     case 19: // "blk.0.ffn_down.bias"
                     {
-                        ffo->layer_flops  += n_input * n_embd; 
-                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_embd);
+                        n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                         break;
                     }
                     case 18: // "blk.0.ffn_gate.bias"
                     case 20: // "blk.0.ffn_up.bias"
                     {
-                        ffo->layer_flops  += n_input * n_ff;
-                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_ff);
+                        n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                         break; 
                     }
                     // optional: expert tensors
                     case 21: { // "blk.0.ffn_gate_inp.weight"
-                        ffo->layer_flops  += 2 * n_input * n_embd * n_expert;
-                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_expert);
+                        n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                         break;
                     }
                     case 22: // "blk.0.ffn_gate_exps.weight"
                     case 23: // "blk.0.ffn_down_exps.weight"
                     case 24: // "blk.0.ffn_up_exps.weight"
                     { 
-                        ffo->layer_flops  += 2 * n_input * n_embd * n_ff * n_expert;
-                        ffo->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                        count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff * n_expert);
+                        n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
                         break;
                     }
                     default: