diff --git a/common/profiler.cpp b/common/profiler.cpp
index d2363a4f..1bdd88b6 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -31,6 +31,7 @@
 #include
 #include
 #include
+#include <cinttypes> // for PRId64
 
 const char * device_name() {
     static char device_name[256];
@@ -489,6 +490,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");
 
+    LOG_INF("| CPU flops (Q80xF32, GFLOPS)  ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q80_f32);
+    }
+    LOG_INF("\n");
+
     LOG_INF("| Physical Mem Total (GB)      ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical);
@@ -615,6 +622,12 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");
 
+    LOG_INF("| Metal flops (Q80xF32, GFLOPS)");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q80_f32);
+    }
+    LOG_INF("\n");
+
     LOG_INF("| CUDA flops (F32xF32, GFLOPS) ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32_f32);
@@ -639,49 +652,119 @@ void device_print_props(struct device_info * dev_info_set, int n, struct llama_m
     }
     LOG_INF("\n");
 
+    LOG_INF("| CUDA flops (Q80xF32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q80_f32);
+    }
+    LOG_INF("\n");
+
     LOG_INF("| Model flops (output F32xF32) ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.output_f32_f32);
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f32_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output F16xF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f16_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output Q4KxF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q4k_f32);
     LOG_INF("\n");
 
     LOG_INF("| Model flops (output Q6KxF32) ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.output_q6k_f32);
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q6k_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output Q80xF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q80_f32);
     LOG_INF("\n");
 
     LOG_INF("| Model flops (layer F32xF32)  ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_f32_f32);
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f32_f32);
     LOG_INF("\n");
 
     LOG_INF("| Model flops (layer F16xF32)  ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_f16_f32);
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f16_f32);
     LOG_INF("\n");
 
     LOG_INF("| Model flops (layer Q4KxF32)  ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_q4k_f32);
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q4k_f32);
     LOG_INF("\n");
 
     LOG_INF("| Model flops (layer Q6KxF32)  ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_flops.layer_q6k_f32);
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q6k_f32);
+    LOG_INF("\n");
+
-    LOG_INF("| Model params (input)         ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_params.input_params);
+    LOG_INF("| Model flops (layer Q80xF32)  ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q80_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (each layer)    ");
-    LOG_INF("| %-10lu ", dev_info_set[0].model_params.layer_params);
+    LOG_INF("| Model params (input F32)     ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (output)        ");
LOG_INF("| %-10lu ", dev_info_set[0].model_params.output_params); + LOG_INF("| Model params (input F16) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f16); + LOG_INF("\n"); + + LOG_INF("| Model params (input Q4K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q4k); + LOG_INF("\n"); + + LOG_INF("| Model params (input Q6K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q6k); + LOG_INF("\n"); + + LOG_INF("| Model params (input Q80) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q80); + LOG_INF("\n"); + + LOG_INF("| Model params (layer F32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f32); + LOG_INF("\n"); + + LOG_INF("| Model params (layer F16) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f16); + LOG_INF("\n"); + + LOG_INF("| Model params (layer Q4K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q4k); + LOG_INF("\n"); + + LOG_INF("| Model params (layer Q6K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q6k); + LOG_INF("\n"); + + LOG_INF("| Model params (layer Q80) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q80); + LOG_INF("\n"); + + LOG_INF("| Model params (output F32) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f32); + LOG_INF("\n"); + + LOG_INF("| Model params (output F16) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f16); + LOG_INF("\n"); + + LOG_INF("| Model params (output Q4K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q4k); + LOG_INF("\n"); + + LOG_INF("| Model params (output Q6K) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q6k); + LOG_INF("\n"); + + LOG_INF("| Model params (output Q80) "); + LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q80); LOG_INF("\n"); model_flops ffo = dev_info_set[0].model_flops; int64_t total_flops = ffo.output_f32_f32 + (ffo.layer_f32_f32 * llama_model_n_layers(model)); // todo double cpu_flops_f16 = dev_info_set[0].cpu_props.flops_f16_f32 * 1e9; - LOG_INF("| Token latency (ms) "); - LOG_INF("| %-10.2f ", total_flops / cpu_flops_f16 * 1000); - LOG_INF("\n"); + // LOG_INF("| Token latency (ms) "); + // LOG_INF("| %-10.2f ", total_flops / cpu_flops_f16 * 1000); + // LOG_INF("\n"); LOG_INF("-------------------------------------------------------------------------------------------\n\n"); } @@ -704,12 +787,12 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) { + gpu_description_len + sizeof(float) // disk_read_bandwidth + sizeof(uint32_t) // cpu_props.cores - + sizeof(float) * 4 // cpu_props.flops_f32, cpu_props.flops_f16, cpu_props.flops_q4k_f32, cpu_props.flops_q6k_f32 + + sizeof(float) * 5 // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32 + sizeof(struct memory_info) + sizeof(struct gpu_support) - + sizeof(float) * 10; // gpu_props.memory_free, gpu_props.memory_total, - // gpu_props.metal_flops_f32, gpu_props.metal_flops_f16, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q6k_f32, - // gpu_props.cuda_flops_f32, gpu_props.cuda_flops_f16, gpu_props.cuda_flops_q8, and gpu_props.cuda_flops_q4k + + sizeof(float) * 12; // gpu_props.memory_free, gpu_props.memory_total, + // gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q6k_f32, 
+                                                    // gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32,
+                                                    // gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32
 
     *buffer = (char *)malloc(total_size);
     char * ptr = *buffer;
@@ -763,6 +846,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->cpu_props.flops_q6k_f32, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(ptr, &dev_info->cpu_props.flops_q80_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->memory, sizeof(struct memory_info));
     ptr += sizeof(struct memory_info);
@@ -787,6 +873,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.metal_flops_q6k_f32, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_q80_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_f32_f32, sizeof(float));
     ptr += sizeof(float);
@@ -797,6 +886,9 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     ptr += sizeof(float);
 
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_q6k_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q80_f32, sizeof(float));
 
     // no need to synchronize model flops and model params
     return total_size;
@@ -868,6 +960,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->cpu_props.flops_q6k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(&dev_info->cpu_props.flops_q80_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->memory, ptr, sizeof(struct memory_info));
     ptr += sizeof(struct memory_info);
@@ -892,6 +987,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.metal_flops_q6k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(&dev_info->gpu_props.metal_flops_q80_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->gpu_props.cuda_flops_f32_f32, ptr, sizeof(float));
     ptr += sizeof(float);
@@ -902,6 +1000,9 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     ptr += sizeof(float);
 
     memcpy(&dev_info->gpu_props.cuda_flops_q6k_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.cuda_flops_q80_f32, ptr, sizeof(float));
 
     // no need to synchronize model flops and model params
 }
\ No newline at end of file
diff --git a/common/profiler.h b/common/profiler.h
index bda570ff..a2395a14 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -12,6 +12,7 @@ struct cpu_props {
     float flops_f16_f32; // in GFLOPS
     float flops_q4k_f32; // in GFLOPS
     float flops_q6k_f32; // in GFLOPS
+    float flops_q80_f32; // in GFLOPS
 
     cpu_props() :
         name(""),
@@ -20,7 +21,8 @@ struct cpu_props {
         flops_f32_f32(0.0f),
         flops_f16_f32(0.0f),
         flops_q4k_f32(0.0f),
-        flops_q6k_f32(0.0f) {}
+        flops_q6k_f32(0.0f),
+        flops_q80_f32(0.0f) {}
 };
 
 struct memory_info {
@@ -66,10 +68,12 @@ struct gpu_props {
     float metal_flops_f16_f32; // in GFLOPS
     float metal_flops_q4k_f32; // in GFLOPS
     float metal_flops_q6k_f32; // in GFLOPS
+    float metal_flops_q80_f32; // in GFLOPS
     float cuda_flops_f32_f32;  // in GFLOPS
     float cuda_flops_f16_f32;  // in GFLOPS
     float cuda_flops_q4k_f32;  // in GFLOPS
     float cuda_flops_q6k_f32;  // in GFLOPS
+    float cuda_flops_q80_f32;  // in GFLOPS
 
     gpu_props() :
         name(""),
@@ -80,38 +84,72 @@ struct gpu_props {
         metal_flops_f16_f32(0.0f),
         metal_flops_q4k_f32(0.0f),
         metal_flops_q6k_f32(0.0f),
+        metal_flops_q80_f32(0.0f),
         cuda_flops_f32_f32 (0.0f),
         cuda_flops_f16_f32 (0.0f),
         cuda_flops_q4k_f32 (0.0f),
-        cuda_flops_q6k_f32 (0.0f) {}
+        cuda_flops_q6k_f32 (0.0f),
+        cuda_flops_q80_f32 (0.0f) {}
 };
 
 struct model_flops {
     int64_t output_f32_f32;
+    int64_t output_f16_f32;
+    int64_t output_q4k_f32;
     int64_t output_q6k_f32;
+    int64_t output_q80_f32;
     int64_t layer_f32_f32;
     int64_t layer_f16_f32;
     int64_t layer_q4k_f32;
     int64_t layer_q6k_f32;
+    int64_t layer_q80_f32;
 
     model_flops() :
         output_f32_f32(0),
+        output_f16_f32(0),
+        output_q4k_f32(0),
         output_q6k_f32(0),
+        output_q80_f32(0),
         layer_f32_f32 (0),
         layer_f16_f32 (0),
         layer_q4k_f32 (0),
-        layer_q6k_f32 (0) {}
+        layer_q6k_f32 (0),
+        layer_q80_f32 (0) {}
 };
 
 struct model_params {
-    int64_t input_params;
-    int64_t output_params;
-    int64_t layer_params;
+    int64_t input_f32;
+    int64_t input_f16;
+    int64_t input_q4k;
+    int64_t input_q6k;
+    int64_t input_q80;
+    int64_t output_f32;
+    int64_t output_f16;
+    int64_t output_q4k;
+    int64_t output_q6k;
+    int64_t output_q80;
+    int64_t layer_f32;
+    int64_t layer_f16;
+    int64_t layer_q4k;
+    int64_t layer_q6k;
+    int64_t layer_q80;
 
     model_params() :
-        input_params (0),
-        output_params(0),
-        layer_params (0) {}
+        input_f32 (0),
+        input_f16 (0),
+        input_q4k (0),
+        input_q6k (0),
+        input_q80 (0),
+        output_f32(0),
+        output_f16(0),
+        output_q4k(0),
+        output_q6k(0),
+        output_q80(0),
+        layer_f32 (0),
+        layer_f16 (0),
+        layer_q4k (0),
+        layer_q6k (0),
+        layer_q80 (0) {}
 };
 
 struct device_info {
diff --git a/src/llama.cpp b/src/llama.cpp
index 50a3b832..fd7cb279 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -91,6 +91,7 @@
 #include
 #include
 #include
+#include <cinttypes> // for PRId64
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -110,7 +111,7 @@ struct Timer {
     ~Timer() {
         if (enable_timer) {
             int64_t end_time = ggml_time_us();
-            LLAMA_LOG_INFO("Time to run %s: %lu ms\n", name, (end_time - start_time)/1000);
+            LLAMA_LOG_INFO("Time to run %s: %" PRId64 " ms\n", name, (end_time - start_time) / 1000);
         }
     }
 };
@@ -3553,6 +3554,7 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     dev_info->cpu_props.flops_f16_f32 = device_cpu_flops(model, GGML_TYPE_F16,  GGML_TYPE_F32, n_threads);
     dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
     dev_info->cpu_props.flops_q6k_f32 = device_cpu_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32, n_threads);
+    dev_info->cpu_props.flops_q80_f32 = device_cpu_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads);
 
     dev_info->memory.total_physical     = round(device_physical_memory(false) / (double)(1 << 30) * 100) / 100;
     dev_info->memory.available_physical = round(device_physical_memory(true)  / (double)(1 << 30) * 100) / 100;
@@ -3586,10 +3588,12 @@ void llama_profile_device(device_info * dev_info, struct llama_model * model, ll
     dev_info->gpu_props.metal_flops_f16_f32 = device_metal_flops(model, GGML_TYPE_F16,  GGML_TYPE_F32);
     dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
     dev_info->gpu_props.metal_flops_q6k_f32 = device_metal_flops(model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
+    dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
     dev_info->gpu_props.cuda_flops_f32_f32  = device_cuda_flops (model, GGML_TYPE_F32,  GGML_TYPE_F32);
     dev_info->gpu_props.cuda_flops_f16_f32  = device_cuda_flops (model, GGML_TYPE_F16,  GGML_TYPE_F32);
     dev_info->gpu_props.cuda_flops_q4k_f32  = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
     dev_info->gpu_props.cuda_flops_q6k_f32  = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
+    dev_info->gpu_props.cuda_flops_q80_f32  = device_cuda_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
 
     if (dev_info->rank == 0) {
         struct model_flops * n_flops = &dev_info->model_flops;
@@ -20677,9 +20681,18 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
                 case GGML_TYPE_F32:
                     n_flops->output_f32_f32 += n;
                     break;
+                case GGML_TYPE_F16:
+                    n_flops->output_f16_f32 += n;
+                    break;
+                case GGML_TYPE_Q4_K:
+                    n_flops->output_q4k_f32 += n;
+                    break;
                 case GGML_TYPE_Q6_K:
                     n_flops->output_q6k_f32 += n;
                     break;
+                case GGML_TYPE_Q8_0:
+                    n_flops->output_q80_f32 += n;
+                    break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
             }
@@ -20699,6 +20712,83 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
                 case GGML_TYPE_Q6_K:
                     n_flops->layer_q6k_f32 += n;
                     break;
+                case GGML_TYPE_Q8_0:
+                    n_flops->layer_q80_f32 += n;
+                    break;
+                default:
+                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
+            }
+            break;
+
+        default:
+            throw std::runtime_error("Unrecognized profiler layer type\n");
+    }
+}
+
+static void count_n_params(struct model_params * n_params, enum ggml_type dtype, enum profiler_layer_type ltype, size_t n) {
+    int64_t n_i64t = static_cast<int64_t>(n);
+    switch (ltype) {
+        case PROFILER_LAYER_INPUT:
+            switch (dtype) {
+                case GGML_TYPE_F32:
+                    n_params->input_f32 += n_i64t;
+                    break;
+                case GGML_TYPE_F16:
+                    n_params->input_f16 += n_i64t;
+                    break;
+                case GGML_TYPE_Q4_K:
+                    n_params->input_q4k += n_i64t;
+                    break;
+                case GGML_TYPE_Q6_K:
+                    n_params->input_q6k += n_i64t;
+                    break;
+                case GGML_TYPE_Q8_0:
+                    n_params->input_q80 += n_i64t;
+                    break;
+                default:
+                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_INPUT\n");
+            }
+            break;
+
+        case PROFILER_LAYER_OUTPUT:
+            switch (dtype) {
+                case GGML_TYPE_F32:
+                    n_params->output_f32 += n_i64t;
+                    break;
+                case GGML_TYPE_F16:
+                    n_params->output_f16 += n_i64t;
+                    break;
+                case GGML_TYPE_Q4_K:
+                    n_params->output_q4k += n_i64t;
+                    break;
+                case GGML_TYPE_Q6_K:
+                    n_params->output_q6k += n_i64t;
+                    break;
+                case GGML_TYPE_Q8_0:
+                    n_params->output_q80 += n_i64t;
+                    break; // note: this break was missing and would fall through into the throwing default
+                default:
+                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
+            }
+            break;
+
+        case PROFILER_LAYER_BACKEND:
+            switch (dtype) {
+                case GGML_TYPE_F32:
+                    n_params->layer_f32 += n_i64t;
+                    break;
+                case GGML_TYPE_F16:
+                    n_params->layer_f16 += n_i64t;
+                    break;
+                case GGML_TYPE_Q4_K:
+                    n_params->layer_q4k += n_i64t;
+                    break;
+                case GGML_TYPE_Q6_K:
+                    n_params->layer_q6k += n_i64t;
+                    break;
+                case GGML_TYPE_Q8_0:
+                    n_params->layer_q80 += n_i64t;
+                    break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
             }
@@ -20814,73 +20904,73 @@ void llama_model_n_flops(struct llama_model * model, struct llama_model_loader *
         if (it != tensor_name_map.end()) {
            switch (it->second) {
                 case 1: { // "token_embd.weight"
-                    n_params->input_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_INPUT, ggml_nelements(cur));
                     break;
                 }
                 case 2: { // "output_norm.weight"
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_input * (4 * n_embd + 1));
-                    n_params->output_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, n_input * (4 * n_embd + 1));
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
                     break;
                 }
                 case 3: { // "output.weight"
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_input * n_embd * n_vocab);
-                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_input * n_vocab); // softmax
-                    n_params->output_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_OUTPUT, 2 * n_input * n_embd * n_vocab);
+                    count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_OUTPUT, 5 * n_input * n_vocab); // softmax
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_OUTPUT, ggml_nelements(cur));
                     break;
                 }
                 case 4:  // "blk.0.attn_norm.weight"
                 case 12: // "blk.0.ffn_norm.weight"
                 {
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * (4 * n_embd + 1));
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * (4 * n_embd + 1));
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 case 5: { // "blk.0.attn_q.weight"
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_head_k));
-                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_head_k); // rope
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_head_k));
+                    count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_head_k); // rope
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 case 6: { // "blk.0.attn_k.weight"
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_k_gqa));
-                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_k_gqa); // rope
-                    count_n_flops(n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kq with kvcache
-                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 7 * n_input * (n_input + n_history) * n_head); // scale, mask, and softmax
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_k_gqa));
+                    count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd_k_gqa); // rope
+                    count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, 2 * n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kq with kvcache
+                    count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 7 * n_input * (n_input + n_history) * n_head); // scale, mask, and softmax
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 case 7: { // "blk.0.attn_v.weight"
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_v_gqa));
-                    count_n_flops(n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kqv with kvcache
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * (n_head * n_embd_v_gqa));
+                    count_n_flops (n_flops, GGML_TYPE_F16, PROFILER_LAYER_BACKEND, n_input * (n_input + n_history) * n_embd_head_k * n_head); // compute kqv with kvcache
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 case 8: { // "blk.0.attn_output.weight"
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * (n_head * n_embd_head_k) * n_embd);
-                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * (n_head * n_embd_head_k) * n_embd);
+                    count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 case 9: { // "blk.0.ffn_gate.weight"
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
-                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 5 * n_input * n_ff); // SiLU
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
+                    count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, 5 * n_input * n_ff); // SiLU
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 case 10: { // "blk.0.ffn_down.weight"
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
-                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
+                    count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_embd); // shortcut
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 case 11: { // "blk.0.ffn_up.weight"
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
-                    count_n_flops(n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_ff); // silu(gate(x)) * up(x)
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff);
+                    count_n_flops (n_flops, GGML_TYPE_F32, PROFILER_LAYER_BACKEND, n_input * n_ff); // silu(gate(x)) * up(x)
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 case 13: { // rope_freqs.weight, has been counted in q and k
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 // optional: bias tensors
@@ -20890,29 +20980,29 @@ void llama_model_n_flops(struct llama_model * model, struct llama_model_loader *
                 case 17: // "blk.0.attn_output.bias"
                 case 19: // "blk.0.ffn_down.bias"
                 {
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_embd);
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_embd);
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 case 18: // "blk.0.ffn_gate.bias"
                 case 20: // "blk.0.ffn_up.bias"
                 {
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_ff);
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, n_input * n_ff);
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 // optional: expert tensors
                 case 21: { // "blk.0.ffn_gate_inp.weight"
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_expert);
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_expert);
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 case 22: // "blk.0.ffn_gate_exps.weight"
                 case 23: // "blk.0.ffn_down_exps.weight"
                 case 24: // "blk.0.ffn_up_exps.weight"
                 {
-                    count_n_flops(n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff * n_expert);
-                    n_params->layer_params += static_cast<int64_t>(ggml_nelements(cur));
+                    count_n_flops (n_flops, cur->type, PROFILER_LAYER_BACKEND, 2 * n_input * n_embd * n_ff * n_expert);
+                    count_n_params(n_params, cur->type, PROFILER_LAYER_BACKEND, ggml_nelements(cur));
                     break;
                 }
                 default: