mirror of https://github.com/Lizonghang/prima.cpp.git
synced 2025-09-06 04:59:05 +00:00

fix flops count and ram/vram speed test

This commit is contained in:
parent 26c2ffb5b7
commit df813675d0

5 changed files with 136 additions and 83 deletions
@@ -896,7 +896,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {

    device_info dev_info;
    dev_info.rank = params.rank;
-   llama_profile_device(&dev_info, model, ml, params.n_predict, params.cpuparams.n_threads);
+   llama_profile_device(&dev_info, model, ml, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);

    // create llama context
    struct llama_context_params cparams = llama_context_params_from_gpt_params(params);

@@ -1133,6 +1133,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    std::strcpy(cparams.next_node_ip, params.next_node_ip.c_str());

    cparams.n_ctx = params.n_ctx;
+   cparams.n_predict = params.n_predict;
    cparams.n_seq_max = params.n_parallel;
    cparams.n_batch = params.n_batch;
    cparams.n_ubatch = params.n_ubatch;

@@ -97,8 +97,9 @@ uint32_t device_cpu_cores() {
 }

 static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, enum profiler_backend_type btype, int n_threads) {
-    const int n_repeat = 1;
-    const int n_embd = std::min(llama_n_embd(model), 4096);
+    int n_repeat = 1;
+    int n_embd = std::min(llama_n_embd(model), 4096);
+    if (btype == PROFILER_BACKEND_TYPE_CPU) n_embd /= 8; // simulate small tensor calculation on cpu
     std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
     std::vector<float> matrix_B(n_embd * n_embd, 1.0f / n_embd);


@@ -142,12 +143,9 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
     struct ggml_cgraph * gf = NULL;
     struct ggml_context * ctx_cgraph = NULL;
     struct ggml_tensor * cur = NULL;
-    struct ggml_tensor * cur1 = NULL;
-    struct ggml_tensor * cur2 = NULL;
-    struct ggml_tensor * cur3 = NULL;
     {
         struct ggml_init_params params0 = {
-            /*.mem_size =*/ ggml_tensor_overhead() * (5 * n_repeat + 1) + ggml_graph_overhead(),
+            /*.mem_size =*/ ggml_tensor_overhead() * (n_repeat + 2) + ggml_graph_overhead(),
             /*.mem_buffer =*/ NULL,
             /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
         };

@@ -155,12 +153,8 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum

         gf = ggml_new_graph(ctx_cgraph);
         cur = ggml_mul_mat(ctx_cgraph, tensor_a, tensor_b);
-        for (int i = 0; i < n_repeat; i++) {
-            cur1 = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
-            cur2 = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
-            cur = ggml_add(ctx_cgraph, cur1, cur2);
-            cur3 = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
-            cur = ggml_add(ctx_cgraph, cur, cur3);
+        for (int i = 0; i < n_repeat - 1; i++) {
+            cur = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
         }
         ggml_build_forward_expand(gf, cur);
     }

@@ -204,15 +198,14 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
     ggml_backend_sched_alloc_graph(sched, gf);

     // warm-up
-    // ggml_backend_graph_compute(backend, gf);
+    ggml_backend_graph_compute(backend, gf);

     const int64_t t_start = ggml_time_us();
     ggml_backend_graph_compute(backend, gf);
     const int64_t t_end = ggml_time_us();

     double elapsed_seconds = ((double)t_end - (double)t_start) / 1e6; // convert to seconds
-    double flops = (2.0 * (double)n_embd * (double)n_embd * (double)n_embd +
-                    n_repeat * 4 * 2.0 * (double)n_embd * (double)n_embd * (double)n_embd) / elapsed_seconds / 1e9; // convert to GFLOPS
+    double flops = (2.0 * (double)n_embd * (double)n_embd * (double)n_embd * n_repeat) / elapsed_seconds / 1e9; // convert to GFLOPS

     ggml_free(ctx_cgraph);
     ggml_gallocr_free(allocr);
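Note (not part of the commit): after this change the benchmark graph is just n_repeat chained square mat-muls of size n_embd, so the estimate reduces to 2 * n_embd^3 * n_repeat FLOPs divided by the measured time. A minimal standalone sketch of that arithmetic, with made-up numbers:

// Standalone sketch (not from this commit): estimate GFLOPS the same way the
// updated device_flops() does, assuming n_repeat chained (n_embd x n_embd) mat-muls.
#include <cstdint>
#include <cstdio>

// hypothetical helper: total FLOPs of n_repeat chained (n_embd x n_embd) mat-muls
static double matmul_chain_flops(int n_embd, int n_repeat) {
    return 2.0 * (double)n_embd * n_embd * n_embd * n_repeat;
}

int main() {
    const int     n_embd     = 4096;   // assumed benchmark size (GPU path)
    const int     n_repeat   = 1;
    const int64_t elapsed_us = 25000;  // made-up timing, replace with a real measurement

    double elapsed_s = elapsed_us / 1e6;
    double gflops    = matmul_chain_flops(n_embd, n_repeat) / elapsed_s / 1e9;
    printf("estimated throughput: %.2f GFLOPS\n", gflops);
    return 0;
}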
@@ -933,8 +926,8 @@ float device_memory_bw(int n_thread) {
     return static_cast<float>(bandwidth);
 }

-static float device_read_vram_bw(struct llama_model * model, enum profiler_backend_type btype) {
-    const int n_embd = std::min(llama_n_embd(model) * 2, 4096 * 2);
+static float device_read_vram_bw(enum profiler_backend_type btype) {
+    const int n_embd = 8192;
     std::vector<float> matrix_A(n_embd * n_embd, 1.0f);

     ggml_backend_t backend = NULL;

@@ -1006,21 +999,19 @@ static float device_read_vram_bw(struct llama_model * model, enum profiler_backe
     return bandwidth;
 }

-float device_metal_read_vram_bw(struct llama_model * model) {
+float device_metal_read_vram_bw() {
 #ifdef GGML_USE_METAL
-    return device_read_vram_bw(model, PROFILER_BACKEND_TYPE_METAL);
+    return device_read_vram_bw(PROFILER_BACKEND_TYPE_METAL);
 #endif

-    (void)model;
     return 0.0f;
 }

-float device_cuda_read_vram_bw(struct llama_model * model) {
+float device_cuda_read_vram_bw() {
 #ifdef GGML_USE_CUDA
-    return device_read_vram_bw(model, PROFILER_BACKEND_TYPE_CUDA);
+    return device_read_vram_bw(PROFILER_BACKEND_TYPE_CUDA);
 #endif

-    (void)model;
     return 0.0f;
 }

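Note (not part of the commit): the VRAM read test now always streams a fixed 8192 x 8192 f32 matrix (8192^2 * 4 bytes, about 256 MiB) instead of a model-dependent size, so the bandwidth estimate is simply bytes read divided by elapsed time. A hedged standalone sketch with a made-up timing:

// Standalone sketch (not from this commit): the bandwidth arithmetic implied by the
// fixed 8192 x 8192 f32 test matrix. The timing value here is made up.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd     = 8192;
    const int64_t bytes_read = n_embd * n_embd * (int64_t)sizeof(float); // ~256 MiB per pass
    const double  elapsed_s  = 0.0008; // hypothetical measured read time

    double gb_per_s = (double)bytes_read / elapsed_s / 1e9;
    printf("read %lld bytes -> %.1f GB/s\n", (long long)bytes_read, gb_per_s);
    return 0;
}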
@@ -1124,7 +1115,7 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
 }

 // estimate the memory access delay, except for the input embedding because it has been considered in n_flops.inp_embd_ms
-static float device_memory_access_delay(struct device_info & dev_info, const struct llama_context_params cparams, int n_layers) {
+static float device_memory_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams, int n_layers) {
     struct model_params n_params = dev_info.model_params;
     int n_gpu_layers = std::min(static_cast<int>(cparams.n_gpu_layers), n_layers);


@@ -1144,9 +1135,14 @@ static float device_memory_access_delay(struct device_info & dev_info, const str
                            n_params.output_q6k * 6 / 8 +
                            n_params.output_q80;

-#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL)
-    int64_t vram_bytes = layer_bytes * n_gpu_layers;
-    int64_t ram_bytes = layer_bytes * (n_layers - n_gpu_layers) + output_bytes;
+    uint64_t cpu_kv_size;
+    uint64_t gpu_kv_size;
+
+#if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
+    llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);

+    int64_t vram_bytes = layer_bytes * n_gpu_layers + gpu_kv_size;
+    int64_t ram_bytes = layer_bytes * (n_layers - n_gpu_layers) + output_bytes + cpu_kv_size;
+
 #ifdef GGML_USE_CUDA
     double vram_access_delay = (double)(vram_bytes) / 1e6 / dev_info.gpu_props.cuda_read_vram_bw;

@@ -1158,8 +1154,11 @@ static float device_memory_access_delay(struct device_info & dev_info, const str
     return static_cast<float>(vram_access_delay + ram_access_delay); // ms

 #else
+    llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
+
     (void)n_gpu_layers;
-    int64_t ram_bytes = layer_bytes * n_layers + output_bytes;
+    (void)gpu_kv_size;
+    int64_t ram_bytes = layer_bytes * n_layers + output_bytes + cpu_kv_size;
     double ram_access_delay = (double)(ram_bytes) / 1e6 / dev_info.memory.cpu_read_ram_bw;
     return static_cast<float>(ram_access_delay); // ms
 #endif
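Note (not part of the commit): the delays above come out in milliseconds because bytes / 1e9 divided by a bandwidth in GB/s gives seconds, so bytes / 1e6 gives ms; the new code simply adds the KV-cache bytes from llama_kv_size() to the weight bytes before applying that formula. A small standalone sketch with placeholder values:

// Standalone sketch (not from this commit): why bytes / 1e6 / (GB/s) yields milliseconds,
// including a hypothetical KV-cache contribution on the CPU side.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t layer_bytes = 200 * 1000 * 1000; // hypothetical bytes per layer
    const int64_t kv_bytes    = 50 * 1000 * 1000;  // hypothetical KV-cache bytes in RAM
    const int     n_layers    = 32;
    const double  ram_bw_gbps = 40.0;              // assumed CPU read bandwidth (GB/s)

    int64_t ram_bytes = layer_bytes * (int64_t)n_layers + kv_bytes;
    // bytes / 1e9 / (GB/s) = seconds, so bytes / 1e6 / (GB/s) = milliseconds
    double ram_access_delay_ms = (double)ram_bytes / 1e6 / ram_bw_gbps;
    printf("estimated RAM access delay: %.2f ms\n", ram_access_delay_ms);
    return 0;
}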
@@ -1191,7 +1190,9 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam

 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
     cpu_total_bytes += layer_bytes * (n_layers - n_gpu_layers);
+#if defined(GGML_USE_METAL)
     int64_t gpu_total_bytes = layer_bytes * n_gpu_layers;
+#endif
 #else
     (void)n_gpu_layers;
     cpu_total_bytes += layer_bytes * n_layers;

@@ -1211,10 +1212,10 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
     uint64_t gpu_compute_buf;

 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
-    llama_model_kvcache_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
+    llama_total_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
     llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true);
 #else
-    llama_model_kvcache_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
+    llama_total_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
     llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false);
 #endif


@@ -1652,7 +1653,7 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     float latency = 0.0f;
     int n_layers = llama_model_n_layers (model);
     latency += device_compute_delay (dev_info_set[0], n_layers,cparams);
-    latency += device_memory_access_delay(dev_info_set[0], cparams, n_layers);
+    latency += device_memory_access_delay(dev_info_set[0], model, cparams, n_layers);
     latency += device_disk_access_delay (dev_info_set[0], model, cparams); // if physical memory is not enough, some mapped data will be released and reloaded later

     LOG_INF("| Token latency (ms) ");

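Note (not part of the commit): device_print_props() reports token latency as the sum of the three per-token estimates above, all in milliseconds. A trivial standalone sketch of that accumulation with placeholder values:

// Standalone sketch (not from this commit): the printed token latency is the sum of the
// compute, memory-access, and disk-access estimates. All numbers below are placeholders.
#include <cstdio>

int main() {
    float compute_delay_ms = 35.0f; // stand-in for device_compute_delay()
    float memory_delay_ms  = 12.0f; // stand-in for device_memory_access_delay()
    float disk_delay_ms    =  0.0f; // stand-in for device_disk_access_delay()

    float latency = 0.0f;
    latency += compute_delay_ms;
    latency += memory_delay_ms;
    latency += disk_delay_ms;
    printf("| Token latency (ms) | %.2f |\n", latency);
    return 0;
}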
@@ -241,8 +241,8 @@ uint64_t device_swap_memory (bool available);
 void device_disk_seq_bw (float * read_seq_bw, float * write_seq_bw, int n_threads);
 void device_disk_rnd_bw (float * read_rnd_bw, float * write_rnd_bw, int n_threads);
 float device_memory_bw (int n_thread);
-float device_metal_read_vram_bw(struct llama_model * model);
-float device_cuda_read_vram_bw (struct llama_model * model);
+float device_metal_read_vram_bw();
+float device_cuda_read_vram_bw ();
 void device_get_props (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
 void device_print_props (struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams);

@@ -325,6 +325,7 @@ extern "C" {
        char * master_ip; // ip address of the master node
        char * next_node_ip; // ip address of the next node
        uint32_t n_ctx; // text context, 0 = from model
+       uint32_t n_predict; // number of tokens to predict
        uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
        uint32_t n_ubatch; // physical maximum batch size
        uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)

@@ -416,7 +417,9 @@ extern "C" {
            struct llama_model * model,
            struct llama_model_loader * ml,
            int n_predict,
-           int n_threads);
+           int n_ctx,
+           int n_threads,
+           bool flash_attn);

    LLAMA_API ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device);


@@ -534,7 +537,14 @@ extern "C" {
            bool use_gpu);

    // Return the size of KV cache in the model
-   LLAMA_API void llama_model_kvcache_size(
+   LLAMA_API void llama_total_kv_size(
+           uint64_t * cpu_cache,
+           uint64_t * gpu_cache,
+           const struct llama_model * model,
+           const struct llama_context_params cparams,
+           bool use_gpu);
+
+   LLAMA_API void llama_kv_size(
            uint64_t * cpu_cache,
            uint64_t * gpu_cache,
            const struct llama_model * model,

@@ -547,9 +557,10 @@ extern "C" {
            struct llama_model_loader * ml,
            struct model_flops * n_flops,
            struct model_params * n_params,
-           const int64_t n_input,
            const int64_t n_history,
-           enum ggml_type * inp_embd_dtype);
+           const int64_t n_ctx,
+           enum ggml_type * inp_embd_dtype,
+           bool flash_attn);

    // Get a llama model tensor
    LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);

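Note (not part of the commit): a hedged usage sketch of the two declarations above. Per the implementation later in this diff, llama_total_kv_size() sizes the KV cache at the full configured n_ctx, while llama_kv_size() sizes it from the padded n_predict history clamped to n_ctx. The helper name report_kv_budget and the surrounding setup are hypothetical; the sketch assumes a model loaded elsewhere and a filled llama_context_params.

// Hedged usage sketch (not from this commit); assumes `model` was loaded elsewhere and
// `cparams` already carries n_ctx, n_predict, type_k/type_v, n_gpu_layers, flash_attn, ...
#include "llama.h"
#include <cstdio>

static void report_kv_budget(struct llama_model * model, struct llama_context_params cparams, bool use_gpu) {
    uint64_t cpu_total = 0, gpu_total = 0;
    uint64_t cpu_used  = 0, gpu_used  = 0;

    // full-capacity KV cache at cparams.n_ctx
    llama_total_kv_size(&cpu_total, &gpu_total, model, cparams, use_gpu);
    // KV cache sized from the padded n_predict history (clamped to n_ctx)
    llama_kv_size(&cpu_used, &gpu_used, model, cparams, use_gpu);

    printf("KV cache bytes: total cpu=%llu gpu=%llu, used cpu=%llu gpu=%llu\n",
           (unsigned long long)cpu_total, (unsigned long long)gpu_total,
           (unsigned long long)cpu_used,  (unsigned long long)gpu_used);
}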
116 src/llama.cpp

@@ -3570,7 +3570,14 @@ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype)
     }
 }

-void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, int n_predict, int n_threads) {
+void llama_profile_device(
+        device_info * dev_info,
+        struct llama_model * model,
+        llama_model_loader * ml,
+        int n_predict,
+        int n_ctx,
+        int n_threads,
+        bool flash_attn) {
     dev_info->device_name = device_name();
     dev_info->cpu_props.cores = device_cpu_cores();


@@ -3584,7 +3591,7 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     struct model_params * n_params = &dev_info->model_params;
     if (dev_info->rank == 0) {
         enum ggml_type inp_embd_dtype = GGML_TYPE_F32;
-        llama_model_n_flops(model, ml, n_flops, n_params, 1, n_predict, &inp_embd_dtype);
+        llama_model_n_flops(model, ml, n_flops, n_params, n_predict, n_ctx, &inp_embd_dtype, flash_attn);
         n_flops->inp_embd_ms = device_inp_embd_delay(model, inp_embd_dtype, 1, n_threads);
     }


@@ -3611,8 +3618,8 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     dev_info->gpu_props.description = gpu_props.description;
     dev_info->gpu_props.memory_free = round(gpu_props.memory_free / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.memory_total = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
-    dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw(model);
-    dev_info->gpu_props.cuda_read_vram_bw = device_cuda_read_vram_bw(model);
+    dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw();
+    dev_info->gpu_props.cuda_read_vram_bw = device_cuda_read_vram_bw();

     if (is_dtype_exist(n_params, GGML_TYPE_F32)) {
         dev_info->cpu_props.flops_f32_f32 = device_cpu_flops (model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads);

@@ -19669,6 +19676,7 @@ struct llama_context_params llama_context_default_params() {
         /*.master_ip =*/ nullptr,
         /*.next_node_ip =*/ nullptr,
         /*.n_ctx =*/ 512,
+        /*.n_predict =*/ 512,
         /*.n_batch =*/ 2048,
         /*.n_ubatch =*/ 512,
         /*.n_seq_max =*/ 1,

@@ -20910,22 +20918,49 @@ void llama_model_compute_buf_size(
     }
 }

-void llama_model_kvcache_size(
+void llama_total_kv_size(
         uint64_t * cpu_cache,
         uint64_t * gpu_cache,
         const struct llama_model * model,
         const struct llama_context_params cparams,
         bool use_gpu) {
     const llama_hparams hparams = model->hparams;
-    uint64_t ne_k = static_cast<uint64_t>(hparams.n_embd_k_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_k);
-    uint64_t ne_v = static_cast<uint64_t>(hparams.n_embd_v_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_v);
+    uint64_t nb_k = static_cast<uint64_t>(hparams.n_embd_k_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_k);
+    uint64_t nb_v = static_cast<uint64_t>(hparams.n_embd_v_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_v);
     if (use_gpu) {
         int n_gpu_layers = std::min(cparams.n_gpu_layers, hparams.n_layer);
-        *gpu_cache = (ne_k + ne_v) * n_gpu_layers;
-        *cpu_cache = (ne_k + ne_v) * (llama_model_n_layers(model) - n_gpu_layers);
+        *gpu_cache = (nb_k + nb_v) * n_gpu_layers;
+        *cpu_cache = (nb_k + nb_v) * (llama_model_n_layers(model) - n_gpu_layers);
     } else {
         *gpu_cache = 0;
-        *cpu_cache = (ne_k + ne_v) * llama_model_n_layers(model);
+        *cpu_cache = (nb_k + nb_v) * llama_model_n_layers(model);
+    }
+}
+
+void llama_kv_size(
+        uint64_t * cpu_cache,
+        uint64_t * gpu_cache,
+        const struct llama_model * model,
+        const struct llama_context_params cparams,
+        bool use_gpu) {
+    const llama_hparams hparams = model->hparams;
+    const int64_t n_layer = llama_model_n_layers(model);
+    const int64_t n_ctx = cparams.n_ctx;
+    const int64_t n_history = cparams.n_predict;
+    const int64_t n_pad = cparams.flash_attn ? 256u : 32u;
+    const int64_t n_kv = std::min(n_ctx, std::max(n_pad, GGML_PAD(n_history, n_pad)));
+    const int64_t n_embd_k_gqa = static_cast<int64_t>(hparams.n_embd_k_gqa());
+    const int64_t n_embd_v_gqa = static_cast<int64_t>(hparams.n_embd_v_gqa());
+
+    const int64_t nb_k = n_embd_k_gqa * n_kv * ggml_type_size(cparams.type_k);
+    const int64_t nb_v = n_embd_v_gqa * n_kv * ggml_type_size(cparams.type_v);
+    if (use_gpu) {
+        const int64_t n_gpu_layers = std::min(n_layer, static_cast<int64_t>(cparams.n_gpu_layers));
+        *gpu_cache = (nb_k + nb_v) * n_gpu_layers;
+        *cpu_cache = (nb_k + nb_v) * (n_layer - n_gpu_layers);
+    } else {
+        *gpu_cache = 0;
+        *cpu_cache = (nb_k + nb_v) * n_layer;
     }
 }

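Note (not part of the commit): a standalone sketch of the llama_kv_size() arithmetic above with made-up hyperparameters; pad_to() stands in for GGML_PAD.

// Standalone sketch (not from this commit): per-layer KV-cache sizing as used above,
// with made-up hyperparameters. GGML_PAD is reimplemented locally for the example.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static int64_t pad_to(int64_t x, int64_t n) { return (x + n - 1) / n * n; } // same idea as GGML_PAD

int main() {
    // hypothetical model / context settings
    const int64_t n_layer      = 32;
    const int64_t n_embd_k_gqa = 1024;  // per-layer K width after GQA
    const int64_t n_embd_v_gqa = 1024;  // per-layer V width after GQA
    const int64_t type_size    = 2;     // e.g. f16 K/V cache
    const int64_t n_ctx        = 4096;
    const int64_t n_history    = 512;   // n_predict used as the history length
    const bool    flash_attn   = false;

    const int64_t n_pad = flash_attn ? 256 : 32;
    const int64_t n_kv  = std::min(n_ctx, std::max(n_pad, pad_to(n_history, n_pad)));

    const int64_t nb_k  = n_embd_k_gqa * n_kv * type_size;
    const int64_t nb_v  = n_embd_v_gqa * n_kv * type_size;
    const int64_t total = (nb_k + nb_v) * n_layer;

    printf("n_kv = %lld, KV cache ~ %.1f MiB across %lld layers\n",
           (long long)n_kv, total / (1024.0 * 1024.0), (long long)n_layer);
    return 0;
}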
@@ -20934,19 +20969,22 @@ void llama_model_n_flops(
         struct llama_model_loader * ml,
         struct model_flops * n_flops,
         struct model_params * n_params,
-        const int64_t n_input,
         const int64_t n_history,
-        enum ggml_type * inp_embd_dtype) {
+        const int64_t n_ctx,
+        enum ggml_type * inp_embd_dtype,
+        bool flash_attn) {
     const llama_hparams hparams = model->hparams;
     const int64_t n_layer = hparams.n_layer;
     const int64_t n_vocab = hparams.n_vocab;
     const int64_t n_embd = hparams.n_embd;
     const int64_t n_head = hparams.n_head();
+    const int64_t n_head_kv = hparams.n_head_kv();
     const int64_t n_ff = hparams.n_ff();
-    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-    const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
     const int64_t n_embd_head_k = hparams.n_embd_head_k;
+    const int64_t n_embd_head_v = hparams.n_embd_head_v;
     const int64_t n_expert = hparams.n_expert;
+    const int64_t n_pad = flash_attn ? 256u : 32u;
+    const int64_t n_kv = std::min(n_ctx, std::max(n_pad, GGML_PAD(n_history, n_pad)));

     // assign all the tensors on CPU by default
     model->buft_input = llama_default_buffer_type_cpu(*model, true);

@@ -21045,64 +21083,66 @@ void llama_model_n_flops(
                 break;
             }
             case 2: { // "output_norm.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_input * (4 * n_embd + 1));
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 4 * n_embd + 1); // rms_norm
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_embd); // norm weights
                 count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
                 break;
             }
             case 3: { // "output.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_input * n_embd * n_vocab);
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_input * n_vocab); // softmax
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_embd * n_vocab);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_vocab); // softmax
                 count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
                 break;
             }
             case 4: // "blk.0.attn_norm.weight"
             case 12: // "blk.0.ffn_norm.weight"
             {
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * (4 * n_embd + 1));
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 4 * n_embd + 1); // rms norm
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_embd); // norm weights
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }
             case 5: { // "blk.0.attn_q.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_head_k));
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_head_k); // rope
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_embd);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd); // rope
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }
             case 6: { // "blk.0.attn_k.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_k_gqa));
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_k_gqa); // rope
-                count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kq with kvcache
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 7 * n_input * (n_input + n_history) * n_head); // scale, mask, and softmax
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_head_kv * n_embd_head_k);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd_head_k * n_head_kv); // rope
+                count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_embd_head_k * n_head * n_kv * n_head_kv); // compute kq with kvcache
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 7 * n_head * n_kv); // scale, mask, and softmax
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }
             case 7: { // "blk.0.attn_v.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_v_gqa));
-                count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kqv with kvcache
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_head_kv * n_embd_head_v);
+                count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_embd_head_v * n_head * n_kv * n_head_kv); // compute kqv with kvcache
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }
             case 8: { // "blk.0.attn_output.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * (n_head * n_embd_head_k) * n_embd);
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_embd);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_embd); // shortcut
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }
             case 9: { // "blk.0.ffn_gate.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 5 * n_input * n_ff); // SiLU
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 8 * n_ff); // SiLU
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }
             case 10: { // "blk.0.ffn_down.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_embd); // shortcut
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }
             case 11: { // "blk.0.ffn_up.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_ff); // silu(gate(x)) * up(x)
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_ff); // silu(gate(x)) * up(x)
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }
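Note (not part of the commit): a standalone sketch that evaluates the per-token attention FLOP terms counted in cases 5-8 above for hypothetical GQA dimensions; it mirrors the count_n_flops expressions in the diff.

// Standalone sketch (not from this commit): per-token, per-layer attention FLOP terms as
// counted in cases 5-8 above, evaluated for hypothetical GQA dimensions.
#include <cstdint>
#include <cstdio>

int main() {
    // hypothetical hyperparameters
    const int64_t n_embd        = 4096;
    const int64_t n_head        = 32;
    const int64_t n_head_kv     = 8;
    const int64_t n_embd_head_k = 128;
    const int64_t n_embd_head_v = 128;
    const int64_t n_kv          = 512; // padded history length

    const double q_proj   = 2.0 * n_embd * n_embd;                           // case 5
    const double k_proj   = 2.0 * n_embd * n_head_kv * n_embd_head_k;        // case 6
    const double v_proj   = 2.0 * n_embd * n_head_kv * n_embd_head_v;        // case 7
    const double kq       = 2.0 * n_embd_head_k * n_head * n_kv * n_head_kv; // case 6, vs. KV cache
    const double softmax  = 7.0 * n_head * n_kv;                             // case 6, scale + mask + softmax
    const double kqv      = 2.0 * n_embd_head_v * n_head * n_kv * n_head_kv; // case 7, vs. KV cache
    const double out_proj = 2.0 * n_embd * n_embd;                           // case 8

    const double total = q_proj + k_proj + v_proj + kq + softmax + kqv + out_proj;
    printf("approx. attention FLOPs per token per layer: %.3f GFLOP\n", total / 1e9);
    return 0;
}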
@@ -21117,20 +21157,20 @@ void llama_model_n_flops(
             case 17: // "blk.0.attn_output.bias"
             case 19: // "blk.0.ffn_down.bias"
             {
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_embd);
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_embd);
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }
             case 18: // "blk.0.ffn_gate.bias"
             case 20: // "blk.0.ffn_up.bias"
             {
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_ff);
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_ff);
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }
             // optional: expert tensors
             case 21: { // "blk.0.ffn_gate_inp.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_expert);
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_expert);
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }

@@ -21138,7 +21178,7 @@ void llama_model_n_flops(
             case 23: // "blk.0.ffn_down_exps.weight"
             case 24: // "blk.0.ffn_up_exps.weight"
             {
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff * n_expert);
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff * n_expert);
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
             }