diff --git a/common/common.cpp b/common/common.cpp
index 9c373a9a..7e09fad2 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -896,7 +896,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
     device_info dev_info;
     dev_info.rank = params.rank;
-    llama_profile_device(&dev_info, model, ml, params.n_predict, params.cpuparams.n_threads);
+    llama_profile_device(&dev_info, model, ml, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
 
     // create llama context
     struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
@@ -1133,6 +1133,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     std::strcpy(cparams.next_node_ip, params.next_node_ip.c_str());
 
     cparams.n_ctx             = params.n_ctx;
+    cparams.n_predict         = params.n_predict;
     cparams.n_seq_max         = params.n_parallel;
     cparams.n_batch           = params.n_batch;
     cparams.n_ubatch          = params.n_ubatch;
diff --git a/common/profiler.cpp b/common/profiler.cpp
index c75e0249..0e041ad3 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -97,8 +97,9 @@ uint32_t device_cpu_cores() {
 }
 
 static float device_flops(struct llama_model * model, enum ggml_type src0t, enum ggml_type src1t, enum profiler_backend_type btype, int n_threads) {
-    const int n_repeat = 1;
-    const int n_embd   = std::min(llama_n_embd(model), 4096);
+    int n_repeat = 1;
+    int n_embd   = std::min(llama_n_embd(model), 4096);
+    if (btype == PROFILER_BACKEND_TYPE_CPU) n_embd /= 8; // simulate small tensor calculation on cpu
 
     std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
     std::vector<float> matrix_B(n_embd * n_embd, 1.0f / n_embd);
@@ -142,12 +143,9 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
     struct ggml_cgraph  * gf         = NULL;
     struct ggml_context * ctx_cgraph = NULL;
     struct ggml_tensor  * cur        = NULL;
-    struct ggml_tensor  * cur1       = NULL;
-    struct ggml_tensor  * cur2       = NULL;
-    struct ggml_tensor  * cur3       = NULL;
     {
         struct ggml_init_params params0 = {
-            /*.mem_size   =*/ ggml_tensor_overhead() * (5 * n_repeat + 1) + ggml_graph_overhead(),
+            /*.mem_size   =*/ ggml_tensor_overhead() * (n_repeat + 2) + ggml_graph_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
        };
@@ -155,12 +153,8 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
 
         gf  = ggml_new_graph(ctx_cgraph);
         cur = ggml_mul_mat(ctx_cgraph, tensor_a, tensor_b);
-        for (int i = 0; i < n_repeat; i++) {
-            cur1 = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
-            cur2 = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
-            cur  = ggml_add(ctx_cgraph, cur1, cur2);
-            cur3 = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
-            cur  = ggml_add(ctx_cgraph, cur, cur3);
+        for (int i = 0; i < n_repeat - 1; i++) {
+            cur = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
         }
         ggml_build_forward_expand(gf, cur);
     }
@@ -204,15 +198,14 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
     ggml_backend_sched_alloc_graph(sched, gf);
 
     // warm-up
-    // ggml_backend_graph_compute(backend, gf);
+    ggml_backend_graph_compute(backend, gf);
 
     const int64_t t_start = ggml_time_us();
     ggml_backend_graph_compute(backend, gf);
     const int64_t t_end = ggml_time_us();
 
     double elapsed_seconds = ((double)t_end - (double)t_start) / 1e6; // convert to seconds
-    double flops = (2.0 * (double)n_embd * (double)n_embd * (double)n_embd +
-                    n_repeat * 4 * 2.0 * (double)n_embd * (double)n_embd * (double)n_embd) / elapsed_seconds / 1e9; // convert to GFLOPS
+    double flops = (2.0 * (double)n_embd * (double)n_embd * (double)n_embd * n_repeat) / elapsed_seconds / 1e9; // convert to GFLOPS
 
     ggml_free(ctx_cgraph);
     ggml_gallocr_free(allocr);
@@ -933,8 +926,8 @@ float device_memory_bw(int n_thread) {
     return static_cast<float>(bandwidth);
 }
 
-static float device_read_vram_bw(struct llama_model * model, enum profiler_backend_type btype) {
-    const int n_embd = std::min(llama_n_embd(model) * 2, 4096 * 2);
+static float device_read_vram_bw(enum profiler_backend_type btype) {
+    const int n_embd = 8192;
     std::vector<float> matrix_A(n_embd * n_embd, 1.0f);
 
     ggml_backend_t backend = NULL;
@@ -1006,21 +999,19 @@ static float device_read_vram_bw(struct llama_model * model, enum profiler_backe
     return bandwidth;
 }
 
-float device_metal_read_vram_bw(struct llama_model * model) {
+float device_metal_read_vram_bw() {
 #ifdef GGML_USE_METAL
-    return device_read_vram_bw(model, PROFILER_BACKEND_TYPE_METAL);
+    return device_read_vram_bw(PROFILER_BACKEND_TYPE_METAL);
 #endif
 
-    (void)model;
     return 0.0f;
 }
 
-float device_cuda_read_vram_bw(struct llama_model * model) {
+float device_cuda_read_vram_bw() {
 #ifdef GGML_USE_CUDA
-    return device_read_vram_bw(model, PROFILER_BACKEND_TYPE_CUDA);
+    return device_read_vram_bw(PROFILER_BACKEND_TYPE_CUDA);
 #endif
 
-    (void)model;
     return 0.0f;
 }
 
@@ -1124,7 +1115,7 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
 }
 
 // estimate the memory access delay, except for the input embedding because it has been considered in n_flops.inp_embd_ms
-static float device_memory_access_delay(struct device_info & dev_info, const struct llama_context_params cparams, int n_layers) {
+static float device_memory_access_delay(struct device_info & dev_info, struct llama_model * model, const struct llama_context_params cparams, int n_layers) {
     struct model_params n_params = dev_info.model_params;
     int n_gpu_layers = std::min(static_cast<int>(cparams.n_gpu_layers), n_layers);
 
@@ -1143,10 +1134,15 @@ static float device_memory_access_delay(struct device_info & dev_info, const str
                            n_params.output_q5k * 5 / 8 +
                            n_params.output_q6k * 6 / 8 +
                            n_params.output_q80;
+
+    uint64_t cpu_kv_size;
+    uint64_t gpu_kv_size;
 
-#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL)
-    int64_t vram_bytes = layer_bytes * n_gpu_layers;
-    int64_t ram_bytes  = layer_bytes * (n_layers - n_gpu_layers) + output_bytes;
+#if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
+    llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
+
+    int64_t vram_bytes = layer_bytes * n_gpu_layers + gpu_kv_size;
+    int64_t ram_bytes  = layer_bytes * (n_layers - n_gpu_layers) + output_bytes + cpu_kv_size;
 
 #ifdef GGML_USE_CUDA
     double vram_access_delay = (double)(vram_bytes) / 1e6 / dev_info.gpu_props.cuda_read_vram_bw;
@@ -1158,8 +1154,11 @@ static float device_memory_access_delay(struct device_info & dev_info, const str
 
     return static_cast<float>(vram_access_delay + ram_access_delay); // ms
 #else
+    llama_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
+
     (void)n_gpu_layers;
-    int64_t ram_bytes = layer_bytes * n_layers + output_bytes;
+    (void)gpu_kv_size;
+    int64_t ram_bytes = layer_bytes * n_layers + output_bytes + cpu_kv_size;
     double ram_access_delay = (double)(ram_bytes) / 1e6 / dev_info.memory.cpu_read_ram_bw;
     return static_cast<float>(ram_access_delay); // ms
 #endif
@@ -1191,7 +1190,9 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
 
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
     cpu_total_bytes += layer_bytes * (n_layers - n_gpu_layers);
+#if defined(GGML_USE_METAL)
     int64_t gpu_total_bytes = layer_bytes * n_gpu_layers;
+#endif
 #else
     (void)n_gpu_layers;
     cpu_total_bytes += layer_bytes * n_layers;
@@ -1211,10 +1212,10 @@ static float device_disk_access_delay(struct device_info & dev_info, struct llam
     uint64_t gpu_compute_buf;
 
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
-    llama_model_kvcache_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
+    llama_total_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, true);
     llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, true);
 #else
-    llama_model_kvcache_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
+    llama_total_kv_size(&cpu_kv_size, &gpu_kv_size, model, cparams, false);
     llama_model_compute_buf_size(&cpu_compute_buf, &gpu_compute_buf, model, cparams, false);
 #endif
 
@@ -1651,9 +1652,9 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     // todo: calculate for each device, not only master
     float latency = 0.0f;
    int n_layers = llama_model_n_layers (model);
-    latency += device_compute_delay      (dev_info_set[0], n_layers, cparams);
-    latency += device_memory_access_delay(dev_info_set[0], cparams, n_layers);
-    latency += device_disk_access_delay  (dev_info_set[0], model, cparams); // if physical memory is not enough, some mapped data will be released and reloaded later
+    latency += device_compute_delay      (dev_info_set[0], n_layers, cparams);
+    latency += device_memory_access_delay(dev_info_set[0], model, cparams, n_layers);
+    latency += device_disk_access_delay  (dev_info_set[0], model, cparams); // if physical memory is not enough, some mapped data will be released and reloaded later
 
     LOG_INF("| Token latency (ms)         ");
     LOG_INF("| %-10.2f   ", latency);
diff --git a/common/profiler.h b/common/profiler.h
index 18176cc4..39d37ce8 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -241,8 +241,8 @@ uint64_t device_swap_memory     (bool available);
 void     device_disk_seq_bw     (float * read_seq_bw, float * write_seq_bw, int n_threads);
 void     device_disk_rnd_bw     (float * read_rnd_bw, float * write_rnd_bw, int n_threads);
 float    device_memory_bw       (int n_thread);
-float    device_metal_read_vram_bw(struct llama_model * model);
-float    device_cuda_read_vram_bw (struct llama_model * model);
+float    device_metal_read_vram_bw();
+float    device_cuda_read_vram_bw ();
 void     device_get_props       (struct llama_model * model, int device, struct ggml_backend_dev_props * props);
 void     device_print_props     (struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams);
 
diff --git a/include/llama.h b/include/llama.h
index 886f696d..b68cc269 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -325,6 +325,7 @@ extern "C" {
         char *   master_ip;    // ip address of the master node
         char *   next_node_ip; // ip address of the next node
         uint32_t n_ctx;        // text context, 0 = from model
+        uint32_t n_predict;    // number of tokens to predict
         uint32_t n_batch;      // logical maximum batch size that can be submitted to llama_decode
         uint32_t n_ubatch;     // physical maximum batch size
         uint32_t n_seq_max;    // max number of sequences (i.e. distinct states for recurrent models)
@@ -412,11 +413,13 @@ extern "C" {
     LLAMA_API void llama_backend_init(void);
 
     LLAMA_API void llama_profile_device(
-            struct device_info * dev_info,
-            struct llama_model * model,
+            struct device_info        * dev_info,
+            struct llama_model        * model,
             struct llama_model_loader * ml,
-            int n_predict,
-            int n_threads);
+            int    n_predict,
+            int    n_ctx,
+            int    n_threads,
+            bool   flash_attn);
 
     LLAMA_API ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device);
 
@@ -534,12 +537,19 @@ extern "C" {
             bool use_gpu);
 
     // Return the size of KV cache in the model
-    LLAMA_API void llama_model_kvcache_size(
+    LLAMA_API void llama_total_kv_size(
             uint64_t * cpu_cache,
             uint64_t * gpu_cache,
             const struct llama_model * model,
             const struct llama_context_params cparams,
             bool use_gpu);
+
+    LLAMA_API void llama_kv_size(
+            uint64_t * cpu_cache,
+            uint64_t * gpu_cache,
+            const struct llama_model * model,
+            const struct llama_context_params cparams,
+            bool use_gpu);
 
     // Return the total number of float operations in the model
     LLAMA_API void llama_model_n_flops(
@@ -547,9 +557,10 @@ extern "C" {
             struct llama_model_loader * ml,
             struct model_flops * n_flops,
             struct model_params * n_params,
-            const int64_t n_input,
             const int64_t n_history,
-            enum ggml_type * inp_embd_dtype);
+            const int64_t n_ctx,
+            enum ggml_type * inp_embd_dtype,
+            bool flash_attn);
 
     // Get a llama model tensor
     LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
diff --git a/src/llama.cpp b/src/llama.cpp
index f5f9889f..1696cd07 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3570,7 +3570,14 @@ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype) {
     }
 }
 
-void llama_profile_device(device_info * dev_info, struct llama_model * model, llama_model_loader * ml, int n_predict, int n_threads) {
+void llama_profile_device(
+        device_info * dev_info,
+        struct llama_model * model,
+        llama_model_loader * ml,
+        int n_predict,
+        int n_ctx,
+        int n_threads,
+        bool flash_attn) {
     dev_info->device_name     = device_name();
     dev_info->cpu_props.cores = device_cpu_cores();
 
@@ -3584,7 +3591,7 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     struct model_params * n_params = &dev_info->model_params;
     if (dev_info->rank == 0) {
         enum ggml_type inp_embd_dtype = GGML_TYPE_F32;
-        llama_model_n_flops(model, ml, n_flops, n_params, 1, n_predict, &inp_embd_dtype);
+        llama_model_n_flops(model, ml, n_flops, n_params, n_predict, n_ctx, &inp_embd_dtype, flash_attn);
         n_flops->inp_embd_ms = device_inp_embd_delay(model, inp_embd_dtype, 1, n_threads);
     }
 
@@ -3611,8 +3618,8 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     dev_info->gpu_props.description        = gpu_props.description;
     dev_info->gpu_props.memory_free        = round(gpu_props.memory_free  / (double)(1 << 30) * 100) / 100;
     dev_info->gpu_props.memory_total       = round(gpu_props.memory_total / (double)(1 << 30) * 100) / 100;
-    dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw(model);
-    dev_info->gpu_props.cuda_read_vram_bw  = device_cuda_read_vram_bw(model);
+    dev_info->gpu_props.metal_read_vram_bw = device_metal_read_vram_bw();
+    dev_info->gpu_props.cuda_read_vram_bw  = device_cuda_read_vram_bw();
 
     if (is_dtype_exist(n_params, GGML_TYPE_F32)) {
         dev_info->cpu_props.flops_f32_f32 = device_cpu_flops (model, GGML_TYPE_F32, GGML_TYPE_F32, n_threads);
@@ -19669,6 +19676,7 @@ struct llama_context_params llama_context_default_params() {
         /*.master_ip                   =*/ nullptr,
         /*.next_node_ip                =*/ nullptr,
         /*.n_ctx                       =*/ 512,
+        /*.n_predict                   =*/ 512,
         /*.n_batch                     =*/ 2048,
         /*.n_ubatch                    =*/ 512,
         /*.n_seq_max                   =*/ 1,
@@ -20910,22 +20918,49 @@ void llama_model_compute_buf_size(
     }
 }
 
-void llama_model_kvcache_size(
+void llama_total_kv_size(
         uint64_t * cpu_cache,
         uint64_t * gpu_cache,
         const struct llama_model * model,
         const struct llama_context_params cparams,
         bool use_gpu) {
     const llama_hparams hparams = model->hparams;
-    uint64_t ne_k = static_cast<uint64_t>(hparams.n_embd_k_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_k);
-    uint64_t ne_v = static_cast<uint64_t>(hparams.n_embd_v_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_v);
+    uint64_t nb_k = static_cast<uint64_t>(hparams.n_embd_k_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_k);
+    uint64_t nb_v = static_cast<uint64_t>(hparams.n_embd_v_gqa()) * cparams.n_ctx * ggml_type_size(cparams.type_v);
 
     if (use_gpu) {
         int n_gpu_layers = std::min(cparams.n_gpu_layers, hparams.n_layer);
-        *gpu_cache = (ne_k + ne_v) * n_gpu_layers;
-        *cpu_cache = (ne_k + ne_v) * (llama_model_n_layers(model) - n_gpu_layers);
+        *gpu_cache = (nb_k + nb_v) * n_gpu_layers;
+        *cpu_cache = (nb_k + nb_v) * (llama_model_n_layers(model) - n_gpu_layers);
     } else {
         *gpu_cache = 0;
-        *cpu_cache = (ne_k + ne_v) * llama_model_n_layers(model);
+        *cpu_cache = (nb_k + nb_v) * llama_model_n_layers(model);
+    }
+}
+
+void llama_kv_size(
+        uint64_t * cpu_cache,
+        uint64_t * gpu_cache,
+        const struct llama_model * model,
+        const struct llama_context_params cparams,
+        bool use_gpu) {
+    const llama_hparams hparams = model->hparams;
+    const int64_t n_layer      = llama_model_n_layers(model);
+    const int64_t n_ctx        = cparams.n_ctx;
+    const int64_t n_history    = cparams.n_predict;
+    const int64_t n_pad        = cparams.flash_attn ? 256u : 32u;
+    const int64_t n_kv         = std::min(n_ctx, std::max(n_pad, GGML_PAD(n_history, n_pad)));
+    const int64_t n_embd_k_gqa = static_cast<int64_t>(hparams.n_embd_k_gqa());
+    const int64_t n_embd_v_gqa = static_cast<int64_t>(hparams.n_embd_v_gqa());
+
+    const int64_t nb_k = n_embd_k_gqa * n_kv * ggml_type_size(cparams.type_k);
+    const int64_t nb_v = n_embd_v_gqa * n_kv * ggml_type_size(cparams.type_v);
+    if (use_gpu) {
+        const int64_t n_gpu_layers = std::min(n_layer, static_cast<int64_t>(cparams.n_gpu_layers));
+        *gpu_cache = (nb_k + nb_v) * n_gpu_layers;
+        *cpu_cache = (nb_k + nb_v) * (n_layer - n_gpu_layers);
+    } else {
+        *gpu_cache = 0;
+        *cpu_cache = (nb_k + nb_v) * n_layer;
     }
 }
 
@@ -20934,20 +20969,23 @@ void llama_model_n_flops(
         struct llama_model_loader * ml,
         struct model_flops * n_flops,
         struct model_params * n_params,
-        const int64_t n_input,
-        const int64_t n_history,
-        enum ggml_type * inp_embd_dtype) {
+        const int64_t n_history,
+        const int64_t n_ctx,
+        enum ggml_type * inp_embd_dtype,
+        bool flash_attn) {
     const llama_hparams hparams = model->hparams;
     const int64_t n_layer       = hparams.n_layer;
     const int64_t n_vocab       = hparams.n_vocab;
     const int64_t n_embd        = hparams.n_embd;
     const int64_t n_head        = hparams.n_head();
+    const int64_t n_head_kv     = hparams.n_head_kv();
     const int64_t n_ff          = hparams.n_ff();
-    const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
-    const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
     const int64_t n_embd_head_k = hparams.n_embd_head_k;
+    const int64_t n_embd_head_v = hparams.n_embd_head_v;
     const int64_t n_expert      = hparams.n_expert;
-
+    const int64_t n_pad         = flash_attn ? 256u : 32u;
+    const int64_t n_kv          = std::min(n_ctx, std::max(n_pad, GGML_PAD(n_history, n_pad)));
+
     // assign all the tensors on CPU by default
     model->buft_input  = llama_default_buffer_type_cpu(*model, true);
     model->buft_output = llama_default_buffer_type_cpu(*model, true);
@@ -21045,64 +21083,66 @@ void llama_model_n_flops(
                 break;
             }
             case 2: { // "output_norm.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_input * (4 * n_embd + 1));
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 4 * n_embd + 1); // rms_norm
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_embd); // norm weights
                 count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
                 break;
             }
             case 3: { // "output.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_input * n_embd * n_vocab);
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_input * n_vocab); // softmax
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_embd * n_vocab);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_vocab); // softmax
                 count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
                 break;
            }
            case 4:  // "blk.0.attn_norm.weight"
            case 12: // "blk.0.ffn_norm.weight"
            {
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * (4 * n_embd + 1));
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 4 * n_embd + 1); // rms norm
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_embd); // norm weights
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
            case 5: { // "blk.0.attn_q.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_head_k));
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_head_k); // rope
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_embd);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd); // rope
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
            case 6: { // "blk.0.attn_k.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_k_gqa));
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_k_gqa); // rope
-                count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kq with kvcache
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 7 * n_input * (n_input + n_history) * n_head); // scale, mask, and softmax
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_head_kv * n_embd_head_k);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2.5 * n_embd_head_k * n_head_kv); // rope
+                count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_embd_head_k * n_head * n_kv * n_head_kv); // compute kq with kvcache
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 7 * n_head * n_kv); // scale, mask, and softmax
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
            case 7: { // "blk.0.attn_v.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_v_gqa));
-                count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kqv with kvcache
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_head_kv * n_embd_head_v);
+                count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_embd_head_v * n_head * n_kv * n_head_kv); // compute kqv with kvcache
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
            case 8: { // "blk.0.attn_output.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * (n_head * n_embd_head_k) * n_embd);
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_embd);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_embd); // shortcut
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
            case 9: { // "blk.0.ffn_gate.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 5 * n_input * n_ff); // SiLU
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 8 * n_ff); // SiLU
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
            case 10: { // "blk.0.ffn_down.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_embd); // shortcut
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
            case 11: { // "blk.0.ffn_up.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
-                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_ff); // silu(gate(x)) * up(x)
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff);
+                count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_ff); // silu(gate(x)) * up(x)
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
@@ -21117,20 +21157,20 @@ void llama_model_n_flops(
            case 17: // "blk.0.attn_output.bias"
            case 19: // "blk.0.ffn_down.bias"
            {
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_embd);
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_embd);
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
            case 18: // "blk.0.ffn_gate.bias"
            case 20: // "blk.0.ffn_up.bias"
            {
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_ff);
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_ff);
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
            // optional: expert tensors
            case 21: { // "blk.0.ffn_gate_inp.weight"
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_expert);
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_expert);
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
@@ -21138,7 +21178,7 @@ void llama_model_n_flops(
            case 23: // "blk.0.ffn_down_exps.weight"
            case 24: // "blk.0.ffn_up_exps.weight"
            {
-                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff * n_expert);
+                count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_embd * n_ff * n_expert);
                 count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                 break;
            }
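For reviewers, here is a standalone sketch (not part of the patch) of how the new llama_kv_size() derives the padded KV length n_kv and the resulting per-layer cache size. The helper pad_up mirrors what GGML_PAD does (round up to a multiple of the pad size); every numeric value below is a hypothetical example, not a model constant.

    // sketch.cpp -- illustrative only, mirrors the n_kv / KV-size arithmetic in llama_kv_size()
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Round x up to a multiple of n, same idea as GGML_PAD in ggml.h.
    static int64_t pad_up(int64_t x, int64_t n) { return (x + n - 1) / n * n; }

    int main() {
        const bool    flash_attn = false;                    // hypothetical setting
        const int64_t n_ctx      = 8192;                     // context window (hypothetical)
        const int64_t n_history  = 1000;                     // plays the role of cparams.n_predict (hypothetical)
        const int64_t n_pad      = flash_attn ? 256 : 32;    // pad size chosen as in the patch
        const int64_t n_kv       = std::min(n_ctx, std::max(n_pad, pad_up(n_history, n_pad))); // -> 1024

        const int64_t n_embd_k_gqa = 1024;                   // hypothetical per-layer K width (GQA)
        const int64_t n_embd_v_gqa = 1024;                   // hypothetical per-layer V width (GQA)
        const int64_t type_size    = 2;                      // bytes per element for f16 K/V

        const int64_t kv_bytes_per_layer = (n_embd_k_gqa + n_embd_v_gqa) * n_kv * type_size;
        std::printf("n_kv = %lld, KV bytes per layer = %lld\n",
                    (long long) n_kv, (long long) kv_bytes_per_layer);
        return 0;
    }

With these example numbers the cache is sized for 1024 positions (1000 rounded up to the 32-token pad), i.e. about 4 MiB of K/V per layer; this is the quantity that the revised memory-access and disk-access delay estimates add to their RAM/VRAM byte counts.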