Added support for IQ1_M and IQ2_XXS quantization types

2025-09-06 10:59:04 +00:00 · 2025-03-07 16:56:16 +00:00 · 2025-03-07 16:56:16 +00:00 · 45ec52c2cb
commit 45ec52c2cb
parent 230c68b80c
4 changed files with 555 additions and 336 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -901,16 +901,18 @@ static bool assign_layers_to_device(
        float t_read_ram_cpu = 0.0f;

        float t_calc_cpu = (
-            master.model_flops.layer_f32_f32   / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
-            master.model_flops.layer_f16_f32   / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
-            master.model_flops.layer_q2k_f32   / (dev.cpu_props.flops_q2k_f32 * 1e9 + EPS) +
-            master.model_flops.layer_q4k_f32   / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
-            master.model_flops.layer_q5k_f32   / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
-            master.model_flops.layer_q6k_f32   / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
-            master.model_flops.layer_q50_f32   / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
-            master.model_flops.layer_q80_f32   / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS) +
-            master.model_flops.layer_iq1s_f32  / (dev.cpu_props.flops_iq1s_f32 * 1e9 + EPS)+
-            master.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms
+            master.model_flops.layer_f32_f32   / (dev.cpu_props.flops_f32_f32   * 1e9 + EPS) +
+            master.model_flops.layer_f16_f32   / (dev.cpu_props.flops_f16_f32   * 1e9 + EPS) +
+            master.model_flops.layer_q2k_f32   / (dev.cpu_props.flops_q2k_f32   * 1e9 + EPS) +
+            master.model_flops.layer_q4k_f32   / (dev.cpu_props.flops_q4k_f32   * 1e9 + EPS) +
+            master.model_flops.layer_q5k_f32   / (dev.cpu_props.flops_q5k_f32   * 1e9 + EPS) +
+            master.model_flops.layer_q6k_f32   / (dev.cpu_props.flops_q6k_f32   * 1e9 + EPS) +
+            master.model_flops.layer_iq2xxs_f32/ (dev.cpu_props.flops_iq2xxs_f32* 1e9 + EPS) +
+            master.model_flops.layer_q50_f32   / (dev.cpu_props.flops_q50_f32   * 1e9 + EPS) +
+            master.model_flops.layer_q80_f32   / (dev.cpu_props.flops_q80_f32   * 1e9 + EPS) +
+            master.model_flops.layer_iq1s_f32  / (dev.cpu_props.flops_iq1s_f32  * 1e9 + EPS) +
+            master.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS) +
+            master.model_flops.layer_iq1m_f32  / (dev.cpu_props.flops_iq1m_f32  * 1e9 + EPS) ) * 1000; // in ms

        float t_kv_cpy_cpu = dev.memory.mem_cpy_delay; // in ms
        // t_read_ram_cpu = b_prime / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms
@ -925,31 +927,35 @@ static bool assign_layers_to_device(

            if (dev.gpu_support.metal) {
                t_calc_gpu = (
-                    master.model_flops.layer_f32_f32   / (dev.gpu_props.metal_flops_f32_f32 * 1e9 + EPS) +
-                    master.model_flops.layer_f16_f32   / (dev.gpu_props.metal_flops_f16_f32 * 1e9 + EPS) +
-                    master.model_flops.layer_q2k_f32   / (dev.gpu_props.metal_flops_q2k_f32 * 1e9 + EPS) +
-                    master.model_flops.layer_q4k_f32   / (dev.gpu_props.metal_flops_q4k_f32 * 1e9 + EPS) +
-                    master.model_flops.layer_q5k_f32   / (dev.gpu_props.metal_flops_q5k_f32 * 1e9 + EPS) +
-                    master.model_flops.layer_q6k_f32   / (dev.gpu_props.metal_flops_q6k_f32 * 1e9 + EPS) +
-                    master.model_flops.layer_q50_f32   / (dev.gpu_props.metal_flops_q50_f32 * 1e9 + EPS) +
-                    master.model_flops.layer_q80_f32   / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS) +
-                    master.model_flops.layer_iq1s_f32  / (dev.gpu_props.metal_flops_iq1s_f32 * 1e9 + EPS) +
-                    master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.metal_flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms
+                    master.model_flops.layer_f32_f32    / (dev.gpu_props.metal_flops_f32_f32    * 1e9 + EPS) +
+                    master.model_flops.layer_f16_f32    / (dev.gpu_props.metal_flops_f16_f32    * 1e9 + EPS) +
+                    master.model_flops.layer_q2k_f32    / (dev.gpu_props.metal_flops_q2k_f32    * 1e9 + EPS) +
+                    master.model_flops.layer_q4k_f32    / (dev.gpu_props.metal_flops_q4k_f32    * 1e9 + EPS) +
+                    master.model_flops.layer_q5k_f32    / (dev.gpu_props.metal_flops_q5k_f32    * 1e9 + EPS) +
+                    master.model_flops.layer_q6k_f32    / (dev.gpu_props.metal_flops_q6k_f32    * 1e9 + EPS) +
+                    master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.metal_flops_iq2xxs_f32 * 1e9 + EPS) +
+                    master.model_flops.layer_q50_f32    / (dev.gpu_props.metal_flops_q50_f32    * 1e9 + EPS) +
+                    master.model_flops.layer_q80_f32    / (dev.gpu_props.metal_flops_q80_f32    * 1e9 + EPS) +
+                    master.model_flops.layer_iq1s_f32   / (dev.gpu_props.metal_flops_iq1s_f32   * 1e9 + EPS) +
+                    master.model_flops.layer_iq4nl_f32  / (dev.gpu_props.metal_flops_iq4nl_f32  * 1e9 + EPS) +
+                    master.model_flops.layer_iq1m_f32   / (dev.gpu_props.metal_flops_iq1m_f32   * 1e9 + EPS) ) * 1000; // in ms

                t_kv_cpy_gpu = dev.gpu_props.metal_mem_cpy_delay; // in ms
                // t_read_ram_gpu = b_prime / (dev.gpu_props.metal_read_vram_bw * 1e9) * 1000; // in ms
            } else {
                t_calc_gpu = (
-                    master.model_flops.layer_f32_f32   / (dev.gpu_props.cuda_flops_f32_f32 * 1e9 + EPS) +
-                    master.model_flops.layer_f16_f32   / (dev.gpu_props.cuda_flops_f16_f32 * 1e9 + EPS) +
-                    master.model_flops.layer_q2k_f32   / (dev.gpu_props.cuda_flops_q2k_f32 * 1e9 + EPS) +
-                    master.model_flops.layer_q4k_f32   / (dev.gpu_props.cuda_flops_q4k_f32 * 1e9 + EPS) +
-                    master.model_flops.layer_q5k_f32   / (dev.gpu_props.cuda_flops_q5k_f32 * 1e9 + EPS) +
-                    master.model_flops.layer_q6k_f32   / (dev.gpu_props.cuda_flops_q6k_f32 * 1e9 + EPS) +
-                    master.model_flops.layer_q50_f32   / (dev.gpu_props.cuda_flops_q50_f32 * 1e9 + EPS) +
-                    master.model_flops.layer_q80_f32   / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS) +
-                    master.model_flops.layer_iq1s_f32  / (dev.gpu_props.cuda_flops_iq1s_f32 * 1e9 + EPS) +
-                    master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.cuda_flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms
+                    master.model_flops.layer_f32_f32    / (dev.gpu_props.cuda_flops_f32_f32    * 1e9 + EPS) +
+                    master.model_flops.layer_f16_f32    / (dev.gpu_props.cuda_flops_f16_f32    * 1e9 + EPS) +
+                    master.model_flops.layer_q2k_f32    / (dev.gpu_props.cuda_flops_q2k_f32    * 1e9 + EPS) +
+                    master.model_flops.layer_q4k_f32    / (dev.gpu_props.cuda_flops_q4k_f32    * 1e9 + EPS) +
+                    master.model_flops.layer_q5k_f32    / (dev.gpu_props.cuda_flops_q5k_f32    * 1e9 + EPS) +
+                    master.model_flops.layer_q6k_f32    / (dev.gpu_props.cuda_flops_q6k_f32    * 1e9 + EPS) +
+                    master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.cuda_flops_iq2xxs_f32 * 1e9 + EPS) +
+                    master.model_flops.layer_q50_f32    / (dev.gpu_props.cuda_flops_q50_f32    * 1e9 + EPS) +
+                    master.model_flops.layer_q80_f32    / (dev.gpu_props.cuda_flops_q80_f32    * 1e9 + EPS) +
+                    master.model_flops.layer_iq1s_f32   / (dev.gpu_props.cuda_flops_iq1s_f32   * 1e9 + EPS) +
+                    master.model_flops.layer_iq4nl_f32  / (dev.gpu_props.cuda_flops_iq4nl_f32  * 1e9 + EPS) +
+                    master.model_flops.layer_iq1m_f32   / (dev.gpu_props.cuda_flops_iq1m_f32   * 1e9 + EPS) ) * 1000; // in ms

                t_kv_cpy_gpu = dev.gpu_props.cuda_mem_cpy_delay; // in ms
                // t_read_ram_gpu = b_prime / (dev.gpu_props.cuda_read_vram_bw * 1e9) * 1000; // in ms
@ -1125,17 +1131,18 @@ static bool assign_layers_to_device(

            if (m == 0) {
                kappa = (
-                    dev.model_flops.layer_f32_f32   / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
-                    dev.model_flops.layer_f16_f32   / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
-                    dev.model_flops.layer_q2k_f32   / (dev.cpu_props.flops_q2k_f32 * 1e9 + EPS) +
-                    dev.model_flops.layer_q4k_f32   / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
-                    dev.model_flops.layer_q5k_f32   / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
-                    dev.model_flops.layer_q6k_f32   / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
-                    dev.model_flops.layer_q50_f32   / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
-                    dev.model_flops.layer_q80_f32   / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS) +
-                    dev.model_flops.layer_iq1s_f32  / (dev.cpu_props.flops_iq1s_f32 * 1e9 + EPS) +
-                    dev.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS)) * 1000; // in ms
-
+                    dev.model_flops.layer_f32_f32    / (dev.cpu_props.flops_f32_f32    * 1e9 + EPS) +
+                    dev.model_flops.layer_f16_f32    / (dev.cpu_props.flops_f16_f32    * 1e9 + EPS) +
+                    dev.model_flops.layer_q2k_f32    / (dev.cpu_props.flops_q2k_f32    * 1e9 + EPS) +
+                    dev.model_flops.layer_q4k_f32    / (dev.cpu_props.flops_q4k_f32    * 1e9 + EPS) +
+                    dev.model_flops.layer_q5k_f32    / (dev.cpu_props.flops_q5k_f32    * 1e9 + EPS) +
+                    dev.model_flops.layer_q6k_f32    / (dev.cpu_props.flops_q6k_f32    * 1e9 + EPS) +
+                    dev.model_flops.layer_iq2xxs_f32 / (dev.cpu_props.flops_iq2xxs_f32 * 1e9 + EPS) +
+                    dev.model_flops.layer_q50_f32    / (dev.cpu_props.flops_q50_f32    * 1e9 + EPS) +
+                    dev.model_flops.layer_q80_f32    / (dev.cpu_props.flops_q80_f32    * 1e9 + EPS) +
+                    dev.model_flops.layer_iq1s_f32   / (dev.cpu_props.flops_iq1s_f32   * 1e9 + EPS) +
+                    dev.model_flops.layer_iq4nl_f32  / (dev.cpu_props.flops_iq4nl_f32  * 1e9 + EPS) +
+                    dev.model_flops.layer_iq1m_f32   / (dev.cpu_props.flops_iq1m_f32   * 1e9 + EPS) ) * 1000; // in ms
                // kappa += (bi / n_vocab + bo) / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms

                kappa += (bi / n_vocab) / (disk_speed[m] * 1e9) * 1000; // in ms
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
--- a/common/profiler.h
+++ b/common/profiler.h
@ -15,16 +15,18 @@ struct cpu_props {
    const char * name;
    const char * description;
    uint32_t     cores;
-    float        flops_f32_f32; // in GFLOPS
-    float        flops_f16_f32; // in GFLOPS
-    float        flops_q2k_f32; // in GFLOPS
-    float        flops_q4k_f32; // in GFLOPS
-    float        flops_q5k_f32; // in GFLOPS
-    float        flops_q6k_f32; // in GFLOPS
-    float        flops_q50_f32; // in GFLOPS    
-    float        flops_q80_f32; // in GFLOPS
-    float        flops_iq1s_f32; // in GFLOPS
-    float        flops_iq4nl_f32; // in GFLOPS
+    float        flops_f32_f32;     // in GFLOPS
+    float        flops_f16_f32;     // in GFLOPS
+    float        flops_q2k_f32;     // in GFLOPS
+    float        flops_q4k_f32;     // in GFLOPS
+    float        flops_q5k_f32;     // in GFLOPS
+    float        flops_q6k_f32;     // in GFLOPS
+    float        flops_iq2xxs_f32;  // in GFLOPS
+    float        flops_q50_f32;     // in GFLOPS    
+    float        flops_q80_f32;     // in GFLOPS
+    float        flops_iq1s_f32;    // in GFLOPS
+    float        flops_iq4nl_f32;   // in GFLOPS
+    float        flops_iq1m_f32;    // in GFLOPS

    cpu_props()
        : name            (""), 
@ -36,10 +38,12 @@ struct cpu_props {
          flops_q4k_f32   (0.0f),
          flops_q5k_f32   (0.0f),
          flops_q6k_f32   (0.0f),
+          flops_iq2xxs_f32(0.0f),
          flops_q50_f32   (0.0f),
          flops_q80_f32   (0.0f),
          flops_iq1s_f32  (0.0f), 
-          flops_iq4nl_f32 (0.0f)
+          flops_iq4nl_f32 (0.0f),
+          flops_iq1m_f32  (0.0f)
    {}
 };

@ -84,32 +88,36 @@ struct gpu_support {
 struct gpu_props {
    const char * name;
    const char * description;
-    float        memory_free;         // in GiB
-    float        memory_total;        // in GiB
-    float        metal_read_vram_bw;  // in GB/s
-    float        metal_flops_f32_f32; // in GFLOPS
-    float        metal_flops_f16_f32; // in GFLOPS
-    float        metal_flops_q2k_f32; // in GFLOPS
-    float        metal_flops_q4k_f32; // in GFLOPS
-    float        metal_flops_q5k_f32; // in GFLOPS
-    float        metal_flops_q6k_f32; // in GFLOPS
-    float        metal_flops_q50_f32; // in GFLOPS
-    float        metal_flops_q80_f32; // in GFLOPS
-    float        metal_flops_iq1s_f32; // in GFLOPS
-    float        metal_flops_iq4nl_f32; // in GFLOPS
-    float        metal_mem_cpy_delay; // in ms
-    float        cuda_read_vram_bw;   // in GB/s
-    float        cuda_flops_f32_f32;  // in GFLOPS
-    float        cuda_flops_f16_f32;  // in GFLOPS
-    float        cuda_flops_q2k_f32;  // in GFLOPS
-    float        cuda_flops_q4k_f32;  // in GFLOPS
-    float        cuda_flops_q5k_f32;  // in GFLOPS
-    float        cuda_flops_q6k_f32;  // in GFLOPS
-    float        cuda_flops_q50_f32;  // in GFLOPS
-    float        cuda_flops_q80_f32;  // in GFLOPS
-    float        cuda_flops_iq1s_f32; // in GFLOPS
-    float        cuda_flops_iq4nl_f32; // in GFLOPS
-    float        cuda_mem_cpy_delay;  // in ms
+    float        memory_free;               // in GiB
+    float        memory_total;              // in GiB
+    float        metal_read_vram_bw;        // in GB/s
+    float        metal_flops_f32_f32;       // in GFLOPS
+    float        metal_flops_f16_f32;       // in GFLOPS
+    float        metal_flops_q2k_f32;       // in GFLOPS
+    float        metal_flops_q4k_f32;     // in GFLOPS
+    float        metal_flops_q5k_f32;     // in GFLOPS
+    float        metal_flops_q6k_f32;     // in GFLOPS
+    float        metal_flops_iq2xxs_f32;  // in GFLOPS
+    float        metal_flops_q50_f32;     // in GFLOPS
+    float        metal_flops_q80_f32;     // in GFLOPS
+    float        metal_flops_iq1s_f32;    // in GFLOPS
+    float        metal_flops_iq4nl_f32;   // in GFLOPS
+    float        metal_flops_iq1m_f32;    // in GFLOPS
+    float        metal_mem_cpy_delay;     // in ms
+    float        cuda_read_vram_bw;       // in GB/s
+    float        cuda_flops_f32_f32;      // in GFLOPS
+    float        cuda_flops_f16_f32;      // in GFLOPS
+    float        cuda_flops_q2k_f32;      // in GFLOPS
+    float        cuda_flops_q4k_f32;      // in GFLOPS
+    float        cuda_flops_q5k_f32;      // in GFLOPS
+    float        cuda_flops_q6k_f32;      // in GFLOPS
+    float        cuda_flops_iq2xxs_f32;   // in GFLOPS
+    float        cuda_flops_q50_f32;      // in GFLOPS
+    float        cuda_flops_q80_f32;      // in GFLOPS
+    float        cuda_flops_iq1s_f32;     // in GFLOPS
+    float        cuda_flops_iq4nl_f32;    // in GFLOPS
+    float        cuda_flops_iq1m_f32;     // in GFLOPS
+    float        cuda_mem_cpy_delay;      // in ms

    gpu_props() : 
        name                    (""), 
@ -123,10 +131,12 @@ struct gpu_props {
        metal_flops_q4k_f32     (0.0f),
        metal_flops_q5k_f32     (0.0f),
        metal_flops_q6k_f32     (0.0f),
+        metal_flops_iq2xxs_f32  (0.0f),
        metal_flops_q50_f32     (0.0f),
        metal_flops_q80_f32     (0.0f),
        metal_flops_iq1s_f32    (0.0f),
        metal_flops_iq4nl_f32   (0.0f),
+        metal_flops_iq1m_f32    (0.0f),
        metal_mem_cpy_delay     (0.0f),
        cuda_read_vram_bw       (0.0f),
        cuda_flops_f32_f32      (0.0f), 
@ -135,10 +145,12 @@ struct gpu_props {
        cuda_flops_q4k_f32      (0.0f),
        cuda_flops_q5k_f32      (0.0f),
        cuda_flops_q6k_f32      (0.0f),
+        cuda_flops_iq2xxs_f32   (0.0f),
        cuda_flops_q50_f32      (0.0f),
        cuda_flops_q80_f32      (0.0f),
        cuda_flops_iq1s_f32     (0.0f),
        cuda_flops_iq4nl_f32    (0.0f),
+        cuda_flops_iq1m_f32     (0.0f),
        cuda_mem_cpy_delay      (0.0f) {}
 };

@ -150,43 +162,52 @@ struct model_flops {
    int64_t output_q4k_f32;
    int64_t output_q5k_f32;
    int64_t output_q6k_f32;
+    int64_t output_iq2xxs_f32;
    int64_t output_q50_f32;
    int64_t output_q80_f32;
    int64_t output_iq1s_f32;
    int64_t output_iq4nl_f32;
+    int64_t output_iq1m_f32;
    int64_t layer_f32_f32;
    int64_t layer_f16_f32;
    int64_t layer_q2k_f32;
    int64_t layer_q4k_f32;
    int64_t layer_q5k_f32;
    int64_t layer_q6k_f32;
+    int64_t layer_iq2xxs_f32;
    int64_t layer_q50_f32;
    int64_t layer_q80_f32;
    int64_t layer_iq1s_f32;
    int64_t layer_iq4nl_f32;
+    int64_t layer_iq1m_f32;

    model_flops() : 
-        inp_embd_ms(0.0f),
-        output_f32_f32(0), 
-        output_f16_f32(0),
-        output_q2k_f32(0),
-        output_q4k_f32(0),
-        output_q5k_f32(0),
-        output_q6k_f32(0), 
-        output_q50_f32(0),
-        output_q80_f32(0),
-        output_iq1s_f32(0),
-        output_iq4nl_f32(0),
-        layer_f32_f32 (0),
-        layer_f16_f32 (0),
-        layer_q2k_f32 (0),
-        layer_q4k_f32 (0),
-        layer_q5k_f32 (0),
-        layer_q6k_f32 (0),
-        layer_q50_f32 (0),
-        layer_q80_f32 (0),
-        layer_iq1s_f32 (0),
-        layer_iq4nl_f32 (0) {}
+        inp_embd_ms        (0.0f),
+        output_f32_f32     (0), 
+        output_f16_f32     (0),
+        output_q2k_f32     (0),
+        output_q4k_f32     (0),
+        output_q5k_f32     (0),
+        output_q6k_f32     (0), 
+        output_iq2xxs_f32  (0),
+        output_q50_f32     (0),
+        output_q80_f32     (0),
+        output_iq1s_f32    (0),
+        output_iq4nl_f32   (0),
+        output_iq1m_f32    (0),
+        layer_f32_f32      (0),
+        layer_f16_f32      (0),
+        layer_q2k_f32      (0),
+        layer_q4k_f32      (0),
+        layer_q5k_f32      (0),
+        layer_q6k_f32      (0),
+        layer_iq2xxs_f32   (0),
+        layer_q50_f32      (0),
+        layer_q80_f32      (0),
+        layer_iq1s_f32     (0),
+        layer_iq4nl_f32    (0), 
+        layer_iq1m_f32     (0)
+        {}
 };

 struct model_params {
@ -196,62 +217,75 @@ struct model_params {
    int64_t input_q4k;
    int64_t input_q5k;
    int64_t input_q6k;
+    int64_t input_iq2xxs;
    int64_t input_q50;
    int64_t input_q80;
    int64_t input_iq1s;
    int64_t input_iq4nl;
+    int64_t input_iq1m;
    int64_t output_f32;
    int64_t output_f16;
    int64_t output_q2k;
    int64_t output_q4k;
    int64_t output_q5k;
    int64_t output_q6k;
+    int64_t output_iq2xxs;
    int64_t output_q50;
    int64_t output_q80;
    int64_t output_iq1s;
    int64_t output_iq4nl;
+    int64_t output_iq1m;
    int64_t layer_f32;
    int64_t layer_f16;
    int64_t layer_q2k;
    int64_t layer_q4k;
    int64_t layer_q5k;
    int64_t layer_q6k;
+    int64_t layer_iq2xxs;
    int64_t layer_q50;
    int64_t layer_q80;
    int64_t layer_iq1s;
    int64_t layer_iq4nl;
+    int64_t layer_iq1m;

    model_params() :
-        input_f32 (0),
-        input_f16 (0),
-        input_q2k (0),
-        input_q4k (0),
-        input_q5k (0),
-        input_q6k (0),
-        input_q50 (0),
-        input_q80 (0),
-        input_iq1s(0),
-        input_iq4nl(0),
-        output_f32(0),
-        output_f16(0),
-        output_q2k(0),
-        output_q4k(0),
-        output_q5k(0),
-        output_q6k(0),
-        output_q50(0),
-        output_q80(0),
-        output_iq1s(0),
-        output_iq4nl(0),
-        layer_f32 (0),
-        layer_f16 (0),
-        layer_q2k (0),
-        layer_q4k (0),
-        layer_q5k (0),
-        layer_q6k (0),
-        layer_q50 (0),
-        layer_q80 (0),
-        layer_iq1s (0),
-        layer_iq4nl (0) {}
+        input_f32       (0),
+        input_f16       (0),
+        input_q2k       (0),
+        input_q4k       (0),
+        input_q5k       (0),
+        input_q6k       (0),
+        input_iq2xxs    (0),
+        input_q50       (0),
+        input_q80       (0),
+        input_iq1s      (0),
+        input_iq4nl     (0),
+        input_iq1m      (0),
+        output_f32      (0),
+        output_f16      (0),
+        output_q2k      (0),
+        output_q4k      (0),
+        output_q5k      (0),
+        output_q6k      (0),
+        output_iq2xxs   (0),
+        output_q50      (0),
+        output_q80      (0),
+        output_iq1s     (0),
+        output_iq4nl    (0),
+        output_iq1m     (0),
+        layer_f32       (0),
+        layer_f16       (0),
+        layer_q2k       (0),
+        layer_q4k       (0),
+        layer_q5k       (0),
+        layer_q6k       (0),
+        layer_iq2xxs    (0),
+        layer_q50       (0),
+        layer_q80       (0),
+        layer_iq1s      (0),
+        layer_iq4nl     (0), 
+        layer_iq1m      (0)
+        {}
 };

 struct model_bytes {
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -3560,21 +3560,25 @@ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype)
        case GGML_TYPE_F16:    
            return true;
        case GGML_TYPE_Q2_K:
-            return n_params->layer_q2k > 0   || n_params->output_q2k   > 0;
+            return n_params->layer_q2k    > 0 || n_params->output_q2k    > 0;
        case GGML_TYPE_Q4_K:
-            return n_params->layer_q4k > 0   || n_params->output_q4k   > 0;
+            return n_params->layer_q4k    > 0 || n_params->output_q4k    > 0;
        case GGML_TYPE_Q5_K:
-            return n_params->layer_q5k > 0   || n_params->output_q5k   > 0;
+            return n_params->layer_q5k    > 0 || n_params->output_q5k    > 0;
        case GGML_TYPE_Q6_K:
-            return n_params->layer_q6k > 0   || n_params->output_q6k   > 0;
+            return n_params->layer_q6k    > 0 || n_params->output_q6k    > 0;
+        case GGML_TYPE_IQ2_XXS:
+            return n_params->layer_iq2xxs > 0 || n_params->output_iq2xxs > 0;
        case GGML_TYPE_Q5_0:
-            return n_params->layer_q50 > 0   || n_params->output_q50   > 0;
+            return n_params->layer_q50    > 0 || n_params->output_q50    > 0;
        case GGML_TYPE_Q8_0:
-            return n_params->layer_q80 > 0   || n_params->output_q80   > 0;
+            return n_params->layer_q80    > 0 || n_params->output_q80    > 0;
        case GGML_TYPE_IQ1_S:
-            return n_params->layer_iq1s  > 0 || n_params->output_iq1s  > 0;
+            return n_params->layer_iq1s   > 0 || n_params->output_iq1s   > 0;
        case GGML_TYPE_IQ4_NL:
-            return n_params->layer_iq4nl > 0 || n_params->output_iq4nl > 0;
+            return n_params->layer_iq4nl  > 0 || n_params->output_iq4nl  > 0;
+        case GGML_TYPE_IQ1_M:
+            return n_params->layer_iq1m   > 0 || n_params->output_iq1m   > 0;
        default:
            throw std::runtime_error("Unrecognized data type\n");
    }
@ -3679,6 +3683,12 @@ void llama_profile_device(
        dev_info->gpu_props.cuda_flops_q6k_f32  = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
    }

+    if (is_dtype_exist(n_params, GGML_TYPE_IQ2_XXS)) {
+        dev_info->cpu_props.flops_iq2xxs_f32    = device_cpu_flops  (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_iq2xxs_f32= device_metal_flops(model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_iq2xxs_f32 = device_cuda_flops (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32);
+    }
+
    if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) {
        dev_info->cpu_props.flops_q50_f32       = device_cpu_flops  (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads);
        dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
@ -3703,6 +3713,12 @@ void llama_profile_device(
        dev_info->gpu_props.metal_flops_iq4nl_f32= device_metal_flops(model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32);
        dev_info->gpu_props.cuda_flops_iq4nl_f32 = device_cuda_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32);
    }
+
+    if (is_dtype_exist(n_params, GGML_TYPE_IQ1_M)) {
+        dev_info->cpu_props.flops_iq1m_f32      = device_cpu_flops  (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_iq1m_f32= device_metal_flops(model, GGML_TYPE_IQ1_M, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_iq1m_f32 = device_cuda_flops (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32);
+    }
 }

 ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
@ -21049,34 +21065,40 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
        case PROFILER_LAYER_OUTPUT:
            switch (dtype) {
                case GGML_TYPE_F32:
-                    n_flops->output_f32_f32 += n;
+                    n_flops->output_f32_f32    += n;
                    break;
                case GGML_TYPE_F16:
-                    n_flops->output_f16_f32 += n;
+                    n_flops->output_f16_f32    += n;
                    break;
                case GGML_TYPE_Q2_K:
-                    n_flops->output_q2k_f32 += n;
+                    n_flops->output_q2k_f32    += n;
                    break;
                case GGML_TYPE_Q4_K:
-                    n_flops->output_q4k_f32 += n;
+                    n_flops->output_q4k_f32    += n;
                    break;
                case GGML_TYPE_Q5_K:
-                    n_flops->output_q5k_f32 += n;
+                    n_flops->output_q5k_f32    += n;
                    break;
                case GGML_TYPE_Q6_K:
-                    n_flops->output_q6k_f32 += n;
+                    n_flops->output_q6k_f32    += n;
+                    break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_flops->output_iq2xxs_f32 += n;
                    break;
                case GGML_TYPE_Q5_0:
-                    n_flops->output_q50_f32 += n;
+                    n_flops->output_q50_f32    += n;
                    break;
                case GGML_TYPE_Q8_0:
-                    n_flops->output_q80_f32 += n;
+                    n_flops->output_q80_f32    += n;
                    break;
                case GGML_TYPE_IQ1_S:
-                    n_flops->output_iq1s_f32 += n;
+                    n_flops->output_iq1s_f32   += n;
                    break;
                case GGML_TYPE_IQ4_NL:
-                    n_flops->output_iq4nl_f32 += n;
+                    n_flops->output_iq4nl_f32  += n;
+                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_flops->output_iq1m_f32   += n;
                    break;
                default:
                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
@ -21086,34 +21108,40 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
        case PROFILER_LAYER_BACKEND:
              switch (dtype) {
                case GGML_TYPE_F32:
-                    n_flops->layer_f32_f32 += n;
+                    n_flops->layer_f32_f32    += n;
                    break;
                case GGML_TYPE_F16:
-                    n_flops->layer_f16_f32 += n;
+                    n_flops->layer_f16_f32    += n;
                    break;
                case GGML_TYPE_Q2_K:
-                    n_flops->layer_q2k_f32 += n;
+                    n_flops->layer_q2k_f32    += n;
                    break;
                case GGML_TYPE_Q4_K:
-                    n_flops->layer_q4k_f32 += n;
+                    n_flops->layer_q4k_f32    += n;
                    break;
                case GGML_TYPE_Q5_K:
-                    n_flops->layer_q5k_f32 += n;
+                    n_flops->layer_q5k_f32    += n;
                    break;
                case GGML_TYPE_Q6_K:
-                    n_flops->layer_q6k_f32 += n;
+                    n_flops->layer_q6k_f32    += n;
+                    break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_flops->layer_iq2xxs_f32 += n;
                    break;
                case GGML_TYPE_Q5_0:
-                    n_flops->layer_q50_f32 += n;
+                    n_flops->layer_q50_f32    += n;
                    break;
                case GGML_TYPE_Q8_0:
-                    n_flops->layer_q80_f32 += n;
+                    n_flops->layer_q80_f32    += n;
                    break;
                case GGML_TYPE_IQ1_S:
-                    n_flops->layer_iq1s_f32 += n;
+                    n_flops->layer_iq1s_f32   += n;
                    break;
                case GGML_TYPE_IQ4_NL:
-                    n_flops->layer_iq4nl_f32 += n;
+                    n_flops->layer_iq4nl_f32  += n;
+                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_flops->layer_iq1m_f32   += n;
                    break;
                default:
                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
@ -21131,34 +21159,40 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
        case PROFILER_LAYER_INPUT:
            switch (dtype) {
                case GGML_TYPE_F32:
-                    n_params->input_f32 += n_i64t;
+                    n_params->input_f32    += n_i64t;
                    break;
                case GGML_TYPE_F16:
-                    n_params->input_f16 += n_i64t;
+                    n_params->input_f16    += n_i64t;
                    break;
                case GGML_TYPE_Q2_K:
-                    n_params->input_q2k += n_i64t;
+                    n_params->input_q2k    += n_i64t;
                    break;
                case GGML_TYPE_Q4_K:
-                    n_params->input_q4k += n_i64t;
+                    n_params->input_q4k    += n_i64t;
                    break;
                case GGML_TYPE_Q5_K:
-                    n_params->input_q5k += n_i64t;
+                    n_params->input_q5k    += n_i64t;
                    break;
                case GGML_TYPE_Q6_K:
-                    n_params->input_q6k += n_i64t;
+                    n_params->input_q6k    += n_i64t;
+                    break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_params->input_iq2xxs += n_i64t;
                    break;
                case GGML_TYPE_Q5_0:
-                    n_params->input_q50 += n_i64t;
+                    n_params->input_q50    += n_i64t;
                    break;
                case GGML_TYPE_Q8_0:
-                    n_params->input_q80 += n_i64t;
+                    n_params->input_q80    += n_i64t;
                    break;
                case GGML_TYPE_IQ1_S:
-                    n_params->input_iq1s += n_i64t;
+                    n_params->input_iq1s   += n_i64t;
                    break;
                case GGML_TYPE_IQ4_NL:
-                    n_params->input_iq4nl += n_i64t;
+                    n_params->input_iq4nl  += n_i64t;
+                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_params->input_iq1m   += n_i64t;
                    break;
                default:
                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
@ -21185,6 +21219,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
                case GGML_TYPE_Q6_K:
                    n_params->output_q6k    += n_i64t;
                    break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_params->output_iq2xxs += n_i64t;
+                    break;
                case GGML_TYPE_Q5_0:
                    n_params->output_q50    += n_i64t;
                    break;
@ -21197,6 +21234,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
                case GGML_TYPE_IQ4_NL:
                    n_params->output_iq4nl  += n_i64t;
                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_params->output_iq1m   += n_i64t;
+                    break;
                default:
                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
            }
@ -21222,6 +21262,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
                case GGML_TYPE_Q6_K:
                    n_params->layer_q6k     += n_i64t;
                    break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_params->layer_iq2xxs  += n_i64t;
+                    break;
                case GGML_TYPE_Q5_0:
                    n_params->layer_q50     += n_i64t;
                    break;
@ -21234,6 +21277,9 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
                case GGML_TYPE_IQ4_NL:
                    n_params->layer_iq4nl   += n_i64t;
                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_params->layer_iq1m    += n_i64t;
+                    break;
                default:
                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
            }
@ -21522,27 +21568,31 @@ void llama_model_n_flops(
    }

    // use average values instead of total values
-    n_flops->layer_f32_f32   = static_cast<int64_t>((double)n_flops->layer_f32_f32  / (double)n_layer);
-    n_flops->layer_f16_f32   = static_cast<int64_t>((double)n_flops->layer_f16_f32  / (double)n_layer);
-    n_flops->layer_q2k_f32   = static_cast<int64_t>((double)n_flops->layer_q2k_f32  / (double)n_layer);
-    n_flops->layer_q4k_f32   = static_cast<int64_t>((double)n_flops->layer_q4k_f32  / (double)n_layer);
-    n_flops->layer_q5k_f32   = static_cast<int64_t>((double)n_flops->layer_q5k_f32  / (double)n_layer);
-    n_flops->layer_q6k_f32   = static_cast<int64_t>((double)n_flops->layer_q6k_f32  / (double)n_layer);
-    n_flops->layer_q50_f32   = static_cast<int64_t>((double)n_flops->layer_q50_f32  / (double)n_layer);
-    n_flops->layer_q80_f32   = static_cast<int64_t>((double)n_flops->layer_q80_f32  / (double)n_layer);
-    n_flops->layer_iq1s_f32  = static_cast<int64_t>((double)n_flops->layer_iq1s_f32 / (double)n_layer);
-    n_flops->layer_iq4nl_f32 = static_cast<int64_t>((double)n_flops->layer_iq4nl_f32 / (double)n_layer);
+    n_flops->layer_f32_f32    = static_cast<int64_t>((double)n_flops->layer_f32_f32    / (double)n_layer);
+    n_flops->layer_f16_f32    = static_cast<int64_t>((double)n_flops->layer_f16_f32    / (double)n_layer);
+    n_flops->layer_q2k_f32    = static_cast<int64_t>((double)n_flops->layer_q2k_f32    / (double)n_layer);
+    n_flops->layer_q4k_f32    = static_cast<int64_t>((double)n_flops->layer_q4k_f32    / (double)n_layer);
+    n_flops->layer_q5k_f32    = static_cast<int64_t>((double)n_flops->layer_q5k_f32    / (double)n_layer);
+    n_flops->layer_q6k_f32    = static_cast<int64_t>((double)n_flops->layer_q6k_f32    / (double)n_layer);
+    n_flops->layer_iq2xxs_f32 = static_cast<int64_t>((double)n_flops->layer_iq2xxs_f32 / (double)n_layer);
+    n_flops->layer_q50_f32    = static_cast<int64_t>((double)n_flops->layer_q50_f32    / (double)n_layer);
+    n_flops->layer_q80_f32    = static_cast<int64_t>((double)n_flops->layer_q80_f32    / (double)n_layer);
+    n_flops->layer_iq1s_f32   = static_cast<int64_t>((double)n_flops->layer_iq1s_f32   / (double)n_layer);
+    n_flops->layer_iq4nl_f32  = static_cast<int64_t>((double)n_flops->layer_iq4nl_f32  / (double)n_layer);
+    n_flops->layer_iq1m_f32   = static_cast<int64_t>((double)n_flops->layer_iq1m_f32   / (double)n_layer);
    
    n_params->layer_f32      = static_cast<int64_t>((double)n_params->layer_f32     / (double)n_layer);
    n_params->layer_f16      = static_cast<int64_t>((double)n_params->layer_f16     / (double)n_layer);
    n_params->layer_q2k      = static_cast<int64_t>((double)n_params->layer_q2k     / (double)n_layer);
    n_params->layer_q4k      = static_cast<int64_t>((double)n_params->layer_q4k     / (double)n_layer);
-    n_params->layer_q50      = static_cast<int64_t>((double)n_params->layer_q50     / (double)n_layer);
    n_params->layer_q5k      = static_cast<int64_t>((double)n_params->layer_q5k     / (double)n_layer);
    n_params->layer_q6k      = static_cast<int64_t>((double)n_params->layer_q6k     / (double)n_layer);
+    n_params->layer_iq2xxs   = static_cast<int64_t>((double)n_params->layer_iq2xxs  / (double)n_layer);
+    n_params->layer_q50      = static_cast<int64_t>((double)n_params->layer_q50     / (double)n_layer);
    n_params->layer_q80      = static_cast<int64_t>((double)n_params->layer_q80     / (double)n_layer);
    n_params->layer_iq1s     = static_cast<int64_t>((double)n_params->layer_iq1s    / (double)n_layer);
    n_params->layer_iq4nl    = static_cast<int64_t>((double)n_params->layer_iq4nl   / (double)n_layer);
+    n_params->layer_iq1m     = static_cast<int64_t>((double)n_params->layer_iq1m    / (double)n_layer);
    
    n_bytes->nb_layer        = static_cast<int64_t>((double)n_bytes->nb_layer       / (double)n_layer);