diff --git a/common/common.cpp b/common/common.cpp
index 704c7335..5c972c90 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -901,13 +901,19 @@ static bool assign_layers_to_device(
     float t_read_ram_cpu = 0.0f;
     float t_calc_cpu = (
-        master.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
-        master.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
-        master.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
-        master.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
-        master.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
-        master.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
-        master.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
+        master.model_flops.layer_f32_f32    / (dev.cpu_props.flops_f32_f32    * 1e9 + EPS) +
+        master.model_flops.layer_f16_f32    / (dev.cpu_props.flops_f16_f32    * 1e9 + EPS) +
+        master.model_flops.layer_q2k_f32    / (dev.cpu_props.flops_q2k_f32    * 1e9 + EPS) +
+        master.model_flops.layer_q4k_f32    / (dev.cpu_props.flops_q4k_f32    * 1e9 + EPS) +
+        master.model_flops.layer_q5k_f32    / (dev.cpu_props.flops_q5k_f32    * 1e9 + EPS) +
+        master.model_flops.layer_q6k_f32    / (dev.cpu_props.flops_q6k_f32    * 1e9 + EPS) +
+        master.model_flops.layer_iq2xxs_f32 / (dev.cpu_props.flops_iq2xxs_f32 * 1e9 + EPS) +
+        master.model_flops.layer_q50_f32    / (dev.cpu_props.flops_q50_f32    * 1e9 + EPS) +
+        master.model_flops.layer_q80_f32    / (dev.cpu_props.flops_q80_f32    * 1e9 + EPS) +
+        master.model_flops.layer_iq1s_f32   / (dev.cpu_props.flops_iq1s_f32   * 1e9 + EPS) +
+        master.model_flops.layer_iq4nl_f32  / (dev.cpu_props.flops_iq4nl_f32  * 1e9 + EPS) +
+        master.model_flops.layer_iq1m_f32   / (dev.cpu_props.flops_iq1m_f32   * 1e9 + EPS)) * 1000; // in ms
+
     float t_kv_cpy_cpu  = dev.memory.mem_cpy_delay; // in ms
     // t_read_ram_cpu = b_prime / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms
@@ -921,24 +927,36 @@ static bool assign_layers_to_device(
         if (dev.gpu_support.metal) {
             t_calc_gpu = (
-                master.model_flops.layer_f32_f32 / (dev.gpu_props.metal_flops_f32_f32 * 1e9 + EPS) +
-                master.model_flops.layer_f16_f32 / (dev.gpu_props.metal_flops_f16_f32 * 1e9 + EPS) +
-                master.model_flops.layer_q4k_f32 / (dev.gpu_props.metal_flops_q4k_f32 * 1e9 + EPS) +
-                master.model_flops.layer_q50_f32 / (dev.gpu_props.metal_flops_q50_f32 * 1e9 + EPS) +
-                master.model_flops.layer_q5k_f32 / (dev.gpu_props.metal_flops_q5k_f32 * 1e9 + EPS) +
-                master.model_flops.layer_q6k_f32 / (dev.gpu_props.metal_flops_q6k_f32 * 1e9 + EPS) +
-                master.model_flops.layer_q80_f32 / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
+                master.model_flops.layer_f32_f32    / (dev.gpu_props.metal_flops_f32_f32    * 1e9 + EPS) +
+                master.model_flops.layer_f16_f32    / (dev.gpu_props.metal_flops_f16_f32    * 1e9 + EPS) +
+                master.model_flops.layer_q2k_f32    / (dev.gpu_props.metal_flops_q2k_f32    * 1e9 + EPS) +
+                master.model_flops.layer_q4k_f32    / (dev.gpu_props.metal_flops_q4k_f32    * 1e9 + EPS) +
+                master.model_flops.layer_q5k_f32    / (dev.gpu_props.metal_flops_q5k_f32    * 1e9 + EPS) +
+                master.model_flops.layer_q6k_f32    / (dev.gpu_props.metal_flops_q6k_f32    * 1e9 + EPS) +
+                master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.metal_flops_iq2xxs_f32 * 1e9 + EPS) +
+                master.model_flops.layer_q50_f32    / (dev.gpu_props.metal_flops_q50_f32    * 1e9 + EPS) +
+                master.model_flops.layer_q80_f32    / (dev.gpu_props.metal_flops_q80_f32    * 1e9 + EPS) +
+                master.model_flops.layer_iq1s_f32   / (dev.gpu_props.metal_flops_iq1s_f32   * 1e9 + EPS) +
+                master.model_flops.layer_iq4nl_f32  / (dev.gpu_props.metal_flops_iq4nl_f32  * 1e9 + EPS) +
+                master.model_flops.layer_iq1m_f32   / (dev.gpu_props.metal_flops_iq1m_f32   * 1e9 + EPS)) * 1000; // in ms
+
             t_kv_cpy_gpu = dev.gpu_props.metal_mem_cpy_delay; // in ms
             // t_read_ram_gpu = b_prime / (dev.gpu_props.metal_read_vram_bw * 1e9) * 1000; // in ms
         } else {
             t_calc_gpu = (
-                master.model_flops.layer_f32_f32 / (dev.gpu_props.cuda_flops_f32_f32 * 1e9 + EPS) +
-                master.model_flops.layer_f16_f32 / (dev.gpu_props.cuda_flops_f16_f32 * 1e9 + EPS) +
-                master.model_flops.layer_q4k_f32 / (dev.gpu_props.cuda_flops_q4k_f32 * 1e9 + EPS) +
-                master.model_flops.layer_q50_f32 / (dev.gpu_props.cuda_flops_q50_f32 * 1e9 + EPS) +
-                master.model_flops.layer_q5k_f32 / (dev.gpu_props.cuda_flops_q5k_f32 * 1e9 + EPS) +
-                master.model_flops.layer_q6k_f32 / (dev.gpu_props.cuda_flops_q6k_f32 * 1e9 + EPS) +
-                master.model_flops.layer_q80_f32 / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
+                master.model_flops.layer_f32_f32    / (dev.gpu_props.cuda_flops_f32_f32    * 1e9 + EPS) +
+                master.model_flops.layer_f16_f32    / (dev.gpu_props.cuda_flops_f16_f32    * 1e9 + EPS) +
+                master.model_flops.layer_q2k_f32    / (dev.gpu_props.cuda_flops_q2k_f32    * 1e9 + EPS) +
+                master.model_flops.layer_q4k_f32    / (dev.gpu_props.cuda_flops_q4k_f32    * 1e9 + EPS) +
+                master.model_flops.layer_q5k_f32    / (dev.gpu_props.cuda_flops_q5k_f32    * 1e9 + EPS) +
+                master.model_flops.layer_q6k_f32    / (dev.gpu_props.cuda_flops_q6k_f32    * 1e9 + EPS) +
+                master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.cuda_flops_iq2xxs_f32 * 1e9 + EPS) +
+                master.model_flops.layer_q50_f32    / (dev.gpu_props.cuda_flops_q50_f32    * 1e9 + EPS) +
+                master.model_flops.layer_q80_f32    / (dev.gpu_props.cuda_flops_q80_f32    * 1e9 + EPS) +
+                master.model_flops.layer_iq1s_f32   / (dev.gpu_props.cuda_flops_iq1s_f32   * 1e9 + EPS) +
+                master.model_flops.layer_iq4nl_f32  / (dev.gpu_props.cuda_flops_iq4nl_f32  * 1e9 + EPS) +
+                master.model_flops.layer_iq1m_f32   / (dev.gpu_props.cuda_flops_iq1m_f32   * 1e9 + EPS)) * 1000; // in ms
+
             t_kv_cpy_gpu = dev.gpu_props.cuda_mem_cpy_delay; // in ms
             // t_read_ram_gpu = b_prime / (dev.gpu_props.cuda_read_vram_bw * 1e9) * 1000; // in ms
         }
@@ -1113,14 +1131,18 @@ static bool assign_layers_to_device(
         if (m == 0) {
             kappa = (
-                dev.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
-                dev.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
-                dev.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
-                dev.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
-                dev.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
-                dev.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
-                dev.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
-
+                dev.model_flops.layer_f32_f32    / (dev.cpu_props.flops_f32_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_f16_f32    / (dev.cpu_props.flops_f16_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_q2k_f32    / (dev.cpu_props.flops_q2k_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_q4k_f32    / (dev.cpu_props.flops_q4k_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_q5k_f32    / (dev.cpu_props.flops_q5k_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_q6k_f32    / (dev.cpu_props.flops_q6k_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_iq2xxs_f32 / (dev.cpu_props.flops_iq2xxs_f32 * 1e9 + EPS) +
+                dev.model_flops.layer_q50_f32    / (dev.cpu_props.flops_q50_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_q80_f32    / (dev.cpu_props.flops_q80_f32    * 1e9 + EPS) +
+                dev.model_flops.layer_iq1s_f32   / (dev.cpu_props.flops_iq1s_f32   * 1e9 + EPS) +
+                dev.model_flops.layer_iq4nl_f32  / (dev.cpu_props.flops_iq4nl_f32  * 1e9 + EPS) +
+                dev.model_flops.layer_iq1m_f32   / (dev.cpu_props.flops_iq1m_f32   * 1e9 + EPS)) * 1000; // in ms
             // kappa += (bi / n_vocab + bo) / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms
             kappa += (bi / n_vocab) / (disk_speed[m] * 1e9) * 1000; // in ms
@@ -1766,33 +1788,25 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     return mparams;
 }
 
-static ggml_type kv_cache_type_from_str(const std::string & s) {
-    if (s == "f32") {
-        return GGML_TYPE_F32;
-    }
-    if (s == "f16") {
-        return GGML_TYPE_F16;
-    }
-    if (s == "q8_0") {
-        return GGML_TYPE_Q8_0;
-    }
-    if (s == "q4_0") {
-        return GGML_TYPE_Q4_0;
-    }
-    if (s == "q4_1") {
-        return GGML_TYPE_Q4_1;
-    }
-    if (s == "iq4_nl") {
-        return GGML_TYPE_IQ4_NL;
-    }
-    if (s == "q5_0") {
-        return GGML_TYPE_Q5_0;
-    }
-    if (s == "q5_1") {
-        return GGML_TYPE_Q5_1;
-    }
+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16, // added BF16 data type support
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
 
-    throw std::runtime_error("Invalid cache type: " + s);
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (ggml_type_name(type) == s) {
+            return type;
+        }
+    }
+    throw std::runtime_error("Unsupported cache type: " + s);
 }
 
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
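Note on the kv-cache refactor above: the if/else chain is replaced by a whitelist that is matched against ggml_type_name(), so the accepted strings are exactly ggml's canonical type names ("f32", "q8_0", "iq4_nl", ...) and a new cache type only needs one line in the table. A minimal standalone sketch of the same pattern, using local stand-ins (fake_type, fake_type_name are hypothetical, not part of this patch):

// sketch.cpp -- standalone analogue of the table-driven kv_cache_type_from_str()
#include <stdexcept>
#include <string>
#include <vector>

enum fake_type { TYPE_F32, TYPE_F16, TYPE_Q8_0 };

static const char * fake_type_name(fake_type t) {
    switch (t) {
        case TYPE_F32:  return "f32";
        case TYPE_F16:  return "f16";
        case TYPE_Q8_0: return "q8_0";
    }
    return "unknown";
}

static const std::vector<fake_type> supported_types = { TYPE_F32, TYPE_F16, TYPE_Q8_0 };

static fake_type type_from_str(const std::string & s) {
    for (const auto & t : supported_types) {
        if (fake_type_name(t) == s) { // const char * vs std::string compares contents
            return t;
        }
    }
    throw std::runtime_error("Unsupported cache type: " + s);
}

int main() {
    return type_from_str("q8_0") == TYPE_Q8_0 ? 0 : 1;
}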
diff --git a/common/profiler.cpp b/common/profiler.cpp
index 69a20af0..18b345a9 100644
--- a/common/profiler.cpp
+++ b/common/profiler.cpp
@@ -188,6 +188,9 @@ static float device_flops(struct llama_model * model, enum ggml_type src0t, enum
     };
     struct ggml_context * ctx = ggml_init(params);
 
+    if (n_embd < ggml_blck_size(src0t)) {
+        n_embd = 2 * ggml_blck_size(src0t);
+    }
     struct ggml_tensor * tensor_a = ggml_new_tensor_2d(ctx, src0t, n_embd, n_embd);
     struct ggml_tensor * tensor_b = ggml_new_tensor_2d(ctx, src1t, n_embd, n_embd);
@@ -208,10 +211,12 @@
     ctx_cgraph = ggml_init(params0);
 
     gf = ggml_new_graph(ctx_cgraph);
+    cur = ggml_mul_mat(ctx_cgraph, tensor_a, tensor_b);
     for (int i = 0; i < n_repeat - 1; i++) {
         cur = ggml_mul_mat(ctx_cgraph, tensor_a, cur);
     }
+
     ggml_build_forward_expand(gf, cur);
 }
@@ -364,14 +369,18 @@ float device_inp_embd_delay(struct llama_model * model, enum ggml_type src0t, in
             ggml_fp32_to_fp16_row(temp_f32.data(), static_cast<ggml_fp16_t *>(matrix_B), embd_size);
             break;
         }
+        case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_Q8_K:
+        case GGML_TYPE_IQ2_XXS:
+        case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q8_0:
-            QK_K = ggml_blck_size(src0t);
-            matrix_B = malloc((embd_size / QK_K) * ggml_type_size(src0t));
+        case GGML_TYPE_IQ1_S:
+        case GGML_TYPE_IQ4_NL:
+        case GGML_TYPE_IQ1_M:
+            matrix_B = malloc((embd_size / ggml_blck_size(src0t)) * ggml_type_size(src0t)); // block sizes differ across quantization types
             break;
         default:
            LOG_INF("Unsupported type: %d\n", src0t);
@@ -1347,33 +1356,47 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
 #ifdef GGML_USE_CUDA
     struct gpu_props gpu = dev_info.gpu_props;
 
-    gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)gpu.cuda_flops_f32_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)gpu.cuda_flops_f16_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)gpu.cuda_flops_q4k_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.cuda_flops_q50_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)gpu.cuda_flops_q5k_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)gpu.cuda_flops_q6k_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)gpu.cuda_flops_q80_f32 + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_f32_f32    / ((double)gpu.cuda_flops_f32_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_f16_f32    / ((double)gpu.cuda_flops_f16_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q2k_f32    / ((double)gpu.cuda_flops_q2k_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q4k_f32    / ((double)gpu.cuda_flops_q4k_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q5k_f32    / ((double)gpu.cuda_flops_q5k_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q6k_f32    / ((double)gpu.cuda_flops_q6k_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq2xxs_f32 / ((double)gpu.cuda_flops_iq2xxs_f32 + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q50_f32    / ((double)gpu.cuda_flops_q50_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q80_f32    / ((double)gpu.cuda_flops_q80_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq1s_f32   / ((double)gpu.cuda_flops_iq1s_f32   + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32  / ((double)gpu.cuda_flops_iq4nl_f32  + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq1m_f32   / ((double)gpu.cuda_flops_iq1m_f32   + EPS) / 1e9;
 #elif GGML_USE_METAL
     struct gpu_props gpu = dev_info.gpu_props;
 
-    gpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)gpu.metal_flops_f32_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)gpu.metal_flops_f16_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)gpu.metal_flops_q4k_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)gpu.metal_flops_q50_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)gpu.metal_flops_q5k_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)gpu.metal_flops_q6k_f32 + EPS) / 1e9;
-    gpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)gpu.metal_flops_q80_f32 + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_f32_f32    / ((double)gpu.metal_flops_f32_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_f16_f32    / ((double)gpu.metal_flops_f16_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q2k_f32    / ((double)gpu.metal_flops_q2k_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q4k_f32    / ((double)gpu.metal_flops_q4k_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q5k_f32    / ((double)gpu.metal_flops_q5k_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q6k_f32    / ((double)gpu.metal_flops_q6k_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq2xxs_f32 / ((double)gpu.metal_flops_iq2xxs_f32 + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q50_f32    / ((double)gpu.metal_flops_q50_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_q80_f32    / ((double)gpu.metal_flops_q80_f32    + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq1s_f32   / ((double)gpu.metal_flops_iq1s_f32   + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32  / ((double)gpu.metal_flops_iq4nl_f32  + EPS) / 1e9;
+    gpu_latency_per_layer += (double)n_flops.layer_iq1m_f32   / ((double)gpu.metal_flops_iq1m_f32   + EPS) / 1e9;
 #endif
 
-    cpu_latency_per_layer += (double)n_flops.layer_f32_f32 / ((double)cpu.flops_f32_f32 + EPS) / 1e9;
-    cpu_latency_per_layer += (double)n_flops.layer_f16_f32 / ((double)cpu.flops_f16_f32 + EPS) / 1e9;
-    cpu_latency_per_layer += (double)n_flops.layer_q4k_f32 / ((double)cpu.flops_q4k_f32 + EPS) / 1e9;
-    cpu_latency_per_layer += (double)n_flops.layer_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9;
-    cpu_latency_per_layer += (double)n_flops.layer_q5k_f32 / ((double)cpu.flops_q5k_f32 + EPS) / 1e9;
-    cpu_latency_per_layer += (double)n_flops.layer_q6k_f32 / ((double)cpu.flops_q6k_f32 + EPS) / 1e9;
-    cpu_latency_per_layer += (double)n_flops.layer_q80_f32 / ((double)cpu.flops_q80_f32 + EPS) / 1e9;
-
+    cpu_latency_per_layer += (double)n_flops.layer_f32_f32    / ((double)cpu.flops_f32_f32    + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_f16_f32    / ((double)cpu.flops_f16_f32    + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_q2k_f32    / ((double)cpu.flops_q2k_f32    + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_q4k_f32    / ((double)cpu.flops_q4k_f32    + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_q5k_f32    / ((double)cpu.flops_q5k_f32    + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_q6k_f32    / ((double)cpu.flops_q6k_f32    + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_iq2xxs_f32 / ((double)cpu.flops_iq2xxs_f32 + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_q50_f32    / ((double)cpu.flops_q50_f32    + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_q80_f32    / ((double)cpu.flops_q80_f32    + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_iq1s_f32   / ((double)cpu.flops_iq1s_f32   + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_iq4nl_f32  / ((double)cpu.flops_iq4nl_f32  + EPS) / 1e9;
+    cpu_latency_per_layer += (double)n_flops.layer_iq1m_f32   / ((double)cpu.flops_iq1m_f32   + EPS) / 1e9;
     double total_latency = 0.0f;
 #if defined(GGML_USE_METAL) || defined(GGML_USE_CUDA)
@@ -1385,13 +1408,18 @@ static float device_compute_delay(struct device_info & dev_info, int n_layers, c
     total_latency += cpu_latency_per_layer * n_layers;
 #endif
 
-    total_latency += (double)n_flops.output_f32_f32 / ((double)cpu.flops_f32_f32 + EPS) / 1e9;
-    total_latency += (double)n_flops.output_f16_f32 / ((double)cpu.flops_f16_f32 + EPS) / 1e9;
-    total_latency += (double)n_flops.output_q4k_f32 / ((double)cpu.flops_q4k_f32 + EPS) / 1e9;
-    total_latency += (double)n_flops.output_q50_f32 / ((double)cpu.flops_q50_f32 + EPS) / 1e9;
-    total_latency += (double)n_flops.output_q5k_f32 / ((double)cpu.flops_q5k_f32 + EPS) / 1e9;
-    total_latency += (double)n_flops.output_q6k_f32 / ((double)cpu.flops_q6k_f32 + EPS) / 1e9;
-    total_latency += (double)n_flops.output_q80_f32 / ((double)cpu.flops_q80_f32 + EPS) / 1e9;
+    total_latency += (double)n_flops.output_f32_f32    / ((double)cpu.flops_f32_f32    + EPS) / 1e9;
+    total_latency += (double)n_flops.output_f16_f32    / ((double)cpu.flops_f16_f32    + EPS) / 1e9;
+    total_latency += (double)n_flops.output_q2k_f32    / ((double)cpu.flops_q2k_f32    + EPS) / 1e9;
+    total_latency += (double)n_flops.output_q4k_f32    / ((double)cpu.flops_q4k_f32    + EPS) / 1e9;
+    total_latency += (double)n_flops.output_q5k_f32    / ((double)cpu.flops_q5k_f32    + EPS) / 1e9;
+    total_latency += (double)n_flops.output_q6k_f32    / ((double)cpu.flops_q6k_f32    + EPS) / 1e9;
+    total_latency += (double)n_flops.output_iq2xxs_f32 / ((double)cpu.flops_iq2xxs_f32 + EPS) / 1e9;
+    total_latency += (double)n_flops.output_q50_f32    / ((double)cpu.flops_q50_f32    + EPS) / 1e9;
+    total_latency += (double)n_flops.output_q80_f32    / ((double)cpu.flops_q80_f32    + EPS) / 1e9;
+    total_latency += (double)n_flops.output_iq1s_f32   / ((double)cpu.flops_iq1s_f32   + EPS) / 1e9;
+    total_latency += (double)n_flops.output_iq4nl_f32  / ((double)cpu.flops_iq4nl_f32  + EPS) / 1e9;
+    total_latency += (double)n_flops.output_iq1m_f32   / ((double)cpu.flops_iq1m_f32   + EPS) / 1e9;
 
     total_latency *= 1000; // convert to ms
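Every term in these hunks is the same latency model: a per-dtype FLOP count divided by that device's measured throughput for the dtype, with EPS keeping the division finite when a dtype was never profiled (0 GFLOPS). A self-contained sketch of one term (the EPS value here is illustrative; the real constant is defined elsewhere in the codebase):

// latency_term.cpp -- the per-dtype latency term in isolation
#include <cstdio>

static double term_seconds(double n_flops, double gflops, double eps) {
    return n_flops / (gflops * 1e9 + eps); // GFLOPS -> FLOPS, then work / rate
}

int main() {
    // e.g. 3.5e9 Q4K x F32 FLOPs at a measured 180 GFLOPS -> ~19.4 ms
    printf("%.3f ms\n", term_seconds(3.5e9, 180.0, 1e-9) * 1e3);
    // an unprofiled dtype reports 0 GFLOPS; EPS keeps the term finite
    printf("%.3f ms\n", term_seconds(1.0e9, 0.0, 1e-9) * 1e3);
    return 0;
}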
@@ -1647,474 +1675,664 @@ static float device_mem_copy_delay(struct device_info & dev_info, struct llama_m
 void device_print_props(struct device_info * dev_info_set, int n, struct llama_model * model, const struct llama_context_params cparams) {
     LOG_INF("\n-------------------------------------------------------------------------------------------\n");
-    LOG_INF("| Property                     ");
+    LOG_INF("| Property                        ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| Rank %-8d", i);
         GGML_ASSERT((int)dev_info_set[i].rank == i);
     }
     LOG_INF("\n-------------------------------------------------------------------------------------------\n");
 
-    LOG_INF("| Device Name                  ");
+    LOG_INF("| Device Name                     ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.10s ", dev_info_set[i].device_name);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Device OS                    ");
+    LOG_INF("| Device OS                       ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.10s ", dev_info_set[i].device_os);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU Name                     ");
+    LOG_INF("| CPU Name                        ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.10s ", dev_info_set[i].cpu_props.name);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU Description              ");
+    LOG_INF("| CPU Description                 ");
    for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.10s ", dev_info_set[i].cpu_props.description);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Number of CPU cores          ");
+    LOG_INF("| Number of CPU cores             ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10u ", dev_info_set[i].cpu_props.cores);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU flops (F32xF32, GFLOPS)  ");
+    LOG_INF("| CPU flops (F32xF32, GFLOPS)     ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f32_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU flops (F16xF32, GFLOPS)  ");
+    LOG_INF("| CPU flops (F16xF32, GFLOPS)     ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_f16_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU flops (Q4K x F32, GFLOPS)");
+    LOG_INF("| CPU flops (Q2K x F32, GFLOPS)   ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q2k_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CPU flops (Q4K x F32, GFLOPS)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q4k_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU flops (Q50 x F32, GFLOPS)");
-    for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q50_f32);
-    }
-    LOG_INF("\n");
-
-    LOG_INF("| CPU flops (Q5K x F32, GFLOPS)");
+    LOG_INF("| CPU flops (Q5K x F32, GFLOPS)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q5k_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU flops (Q6K x F32, GFLOPS)");
+    LOG_INF("| CPU flops (Q6K x F32, GFLOPS)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q6k_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU flops (Q80 x F32, GFLOPS)");
+    LOG_INF("| CPU flops (IQ2XXS x F32, GFLOPS)");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq2xxs_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CPU flops (Q50 x F32, GFLOPS)   ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q50_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CPU flops (Q80 x F32, GFLOPS)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_q80_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Physical Mem Total (GiB)     ");
+    LOG_INF("| CPU flops (IQ1S x F32, GFLOPS)  ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq1s_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CPU flops (IQ4NL x F32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq4nl_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CPU flops (IQ1M x F32, GFLOPS)  ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].cpu_props.flops_iq1m_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Physical Mem Total (GiB)        ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_physical);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Physical Mem Available (GiB) ");
+    LOG_INF("| Physical Mem Available (GiB)    ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.available_physical);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Used Mem Swappable (GiB)     ");
+    LOG_INF("| Used Mem Swappable (GiB)        ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.used_can_swap);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Swap Mem Total (GiB)         ");
+    LOG_INF("| Swap Mem Total (GiB)            ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.total_swap);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Swap Mem Available (GiB)     ");
+    LOG_INF("| Swap Mem Available (GiB)        ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.available_swap);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU RAM Read BW (GB/s)       ");
+    LOG_INF("| CPU RAM Read BW (GB/s)          ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.cpu_read_ram_bw);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CPU KVCache Copy Time (ms/l) ");
+    LOG_INF("| CPU KVCache Copy Time (ms/l)    ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].memory.mem_cpy_delay);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Disk Read Seq Speed (GB/s)   ");
+    LOG_INF("| Disk Read Seq Speed (GB/s)      ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].disk.read_seq_bw);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Disk Write Seq Speed (GB/s)  ");
+    LOG_INF("| Disk Write Seq Speed (GB/s)     ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].disk.write_seq_bw);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Disk Read Rnd Speed (GB/s)   ");
+    LOG_INF("| Disk Read Rnd Speed (GB/s)      ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].disk.read_rnd_bw);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Disk Write Rnd Speed (GB/s)  ");
+    LOG_INF("| Disk Write Rnd Speed (GB/s)     ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].disk.write_rnd_bw);
     }
     LOG_INF("\n");
 
-    LOG_INF("| GPU Metal                    ");
+    LOG_INF("| GPU Metal                       ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10d ", dev_info_set[i].gpu_support.metal);
     }
     LOG_INF("\n");
 
-    LOG_INF("| GPU CUDA                     ");
+    LOG_INF("| GPU CUDA                        ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10d ", dev_info_set[i].gpu_support.cuda);
     }
     LOG_INF("\n");
 
-    LOG_INF("| GPU Vulkan                   ");
+    LOG_INF("| GPU Vulkan                      ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10d ", dev_info_set[i].gpu_support.vulkan);
     }
     LOG_INF("\n");
 
-    LOG_INF("| GPU Kompute                  ");
+    LOG_INF("| GPU Kompute                     ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10d ", dev_info_set[i].gpu_support.kompute);
     }
     LOG_INF("\n");
 
-    LOG_INF("| GPU BLAS                     ");
+    LOG_INF("| GPU BLAS                        ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10d ", dev_info_set[i].gpu_support.gpublas);
     }
     LOG_INF("\n");
 
-    LOG_INF("| BLAS                         ");
+    LOG_INF("| BLAS                            ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10d ", dev_info_set[i].gpu_support.blas);
     }
     LOG_INF("\n");
 
-    LOG_INF("| SYCL                         ");
+    LOG_INF("| SYCL                            ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10d ", dev_info_set[i].gpu_support.sycl);
     }
     LOG_INF("\n");
 
-    LOG_INF("| GPU Name                     ");
+    LOG_INF("| GPU Name                        ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.10s ", dev_info_set[i].gpu_props.name);
     }
     LOG_INF("\n");
 
-    LOG_INF("| GPU Description              ");
+    LOG_INF("| GPU Description                 ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.10s ", dev_info_set[i].gpu_props.description);
     }
     LOG_INF("\n");
 
-    LOG_INF("| GPU Mem Free (GiB)           ");
+    LOG_INF("| GPU Mem Free (GiB)              ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.memory_free);
     }
     LOG_INF("\n");
 
-    LOG_INF("| GPU Mem Total (GiB)          ");
+    LOG_INF("| GPU Mem Total (GiB)             ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.memory_total);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Metal VRAM Read BW (GB/s)    ");
+    LOG_INF("| Metal VRAM Read BW (GB/s)       ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.metal_read_vram_bw);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Metal KVCache Copy Time(ms/l)");
+    LOG_INF("| Metal KVCache Copy Time(ms/l)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.metal_mem_cpy_delay);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Metal flops (F32xF32, GFLOPS)");
+    LOG_INF("| Metal flops (F32xF32, GFLOPS)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f32_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Metal flops (F16xF32, GFLOPS)");
+    LOG_INF("| Metal flops (F16xF32, GFLOPS)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_f16_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Metal flops (Q4KxF32, GFLOPS)");
+    LOG_INF("| Metal flops (Q2KxF32, GFLOPS)   ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q2k_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Metal flops (Q4KxF32, GFLOPS)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q4k_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Metal flops (Q50xF32, GFLOPS)");
-    for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q50_f32);
-    }
-    LOG_INF("\n");
-
-    LOG_INF("| Metal flops (Q5KxF32, GFLOPS)");
+    LOG_INF("| Metal flops (Q5KxF32, GFLOPS)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q5k_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Metal flops (Q6KxF32, GFLOPS)");
+    LOG_INF("| Metal flops (Q6KxF32, GFLOPS)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q6k_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Metal flops (Q80xF32, GFLOPS)");
+    LOG_INF("| Metal flops (IQ2XXSxF32, GFLOPS)");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq2xxs_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Metal flops (Q50xF32, GFLOPS)   ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q50_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Metal flops (Q80xF32, GFLOPS)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_q80_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA VRAM Read BW (GB/s)     ");
+    LOG_INF("| Metal flops (IQ1SxF32, GFLOPS)  ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq1s_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Metal flops (IQ4NLxF32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq4nl_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Metal flops (IQ1MxF32, GFLOPS)  ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.metal_flops_iq1m_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CUDA VRAM Read BW (GB/s)        ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.cuda_read_vram_bw);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA KVCache Copy Time (ms/l)");
+    LOG_INF("| CUDA KVCache Copy Time (ms/l)   ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.2f ", dev_info_set[i].gpu_props.cuda_mem_cpy_delay);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA flops (F32xF32, GFLOPS) ");
+    LOG_INF("| CUDA flops (F32xF32, GFLOPS)    ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f32_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA flops (F16xF32, GFLOPS) ");
+    LOG_INF("| CUDA flops (F16xF32, GFLOPS)    ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_f16_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA flops (Q4KxF32, GFLOPS) ");
+    LOG_INF("| CUDA flops (Q2KxF32, GFLOPS)    ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q2k_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CUDA flops (Q4KxF32, GFLOPS)    ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q4k_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA flops (Q50xF32, GFLOPS) ");
-    for (int i = 0; i < n; ++i) {
-        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q50_f32);
-    }
-    LOG_INF("\n");
-
-    LOG_INF("| CUDA flops (Q5KxF32, GFLOPS) ");
+    LOG_INF("| CUDA flops (Q5KxF32, GFLOPS)    ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q5k_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA flops (Q6KxF32, GFLOPS) ");
+    LOG_INF("| CUDA flops (Q6KxF32, GFLOPS)    ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q6k_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| CUDA flops (Q80xF32, GFLOPS) ");
+    LOG_INF("| CUDA flops (IQ2XXSxF32, GFLOPS) ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq2xxs_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CUDA flops (Q50xF32, GFLOPS)    ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q50_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CUDA flops (Q80xF32, GFLOPS)    ");
     for (int i = 0; i < n; ++i) {
         LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_q80_f32);
     }
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (output F32xF32) ");
+    LOG_INF("| CUDA flops (IQ1SxF32, GFLOPS)   ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq1s_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CUDA flops (IQ4NLxF32, GFLOPS)  ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq4nl_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| CUDA flops (IQ1MxF32, GFLOPS)   ");
+    for (int i = 0; i < n; ++i) {
+        LOG_INF("| %-10.1f ", dev_info_set[i].gpu_props.cuda_flops_iq1m_f32);
+    }
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output F32xF32)    ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f32_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (output F16xF32) ");
+    LOG_INF("| Model flops (output F16xF32)    ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_f16_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (output Q4KxF32) ");
+    LOG_INF("| Model flops (output Q2KxF32)    ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q2k_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output Q4KxF32)    ");
    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q4k_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (output Q50xF32) ");
-    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q50_f32);
-    LOG_INF("\n");
-
-    LOG_INF("| Model flops (output Q5KxF32) ");
+    LOG_INF("| Model flops (output Q5KxF32)    ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q5k_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (output Q6KxF32) ");
+    LOG_INF("| Model flops (output Q6KxF32)    ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q6k_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (output Q80xF32) ");
+    LOG_INF("| Model flops (output IQ2XXSxF32) ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq2xxs_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output Q50xF32)    ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q50_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output Q80xF32)    ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_q80_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (layer F32xF32)  ");
+    LOG_INF("| Model flops (output IQ1SxF32)   ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq1s_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output IQ4NLxF32)  ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq4nl_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (output IQ1MxF32)   ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.output_iq1m_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer F32xF32)     ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f32_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (layer F16xF32)  ");
+    LOG_INF("| Model flops (layer F16xF32)     ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_f16_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (layer Q4KxF32)  ");
+    LOG_INF("| Model flops (layer Q2KxF32)     ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q2k_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer Q4KxF32)     ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q4k_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (layer Q50xF32)  ");
-    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q50_f32);
-    LOG_INF("\n");
-
-    LOG_INF("| Model flops (layer Q5KxF32)  ");
+    LOG_INF("| Model flops (layer Q5KxF32)     ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q5k_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (layer Q6KxF32)  ");
+    LOG_INF("| Model flops (layer Q6KxF32)     ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q6k_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model flops (layer Q80xF32)  ");
+    LOG_INF("| Model flops (layer IQ2XXSxF32)  ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq2xxs_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer Q50xF32)     ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q50_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer Q80xF32)     ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_q80_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (input F32)     ");
+    LOG_INF("| Model flops (layer IQ1SxF32)    ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq1s_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer IQ4NLxF32)   ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq4nl_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model flops (layer IQ1MxF32)    ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_flops.layer_iq1m_f32);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (input F32)        ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (input F16)     ");
+    LOG_INF("| Model params (input F16)        ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_f16);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (input Q4K)     ");
+    LOG_INF("| Model params (input Q2K)        ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q2k);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (input Q4K)        ");
    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q4k);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (input Q50)     ");
-    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q50);
-    LOG_INF("\n");
-
-    LOG_INF("| Model params (input Q5K)     ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q5k);
+    LOG_INF("| Model params (input Q5K)        ");
     LOG_INF("\n");
 
-    LOG_INF("| Model params (input Q6K)     ");
+    LOG_INF("| Model params (input Q6K)        ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q6k);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (input Q80)     ");
+    LOG_INF("| Model params (input IQ2XXS)     ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq2xxs);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (input Q50)        ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q50);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (input Q80)        ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_q80);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (layer F32)     ");
+    LOG_INF("| Model params (input IQ1S)       ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq1s);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (input IQ4NL)      ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq4nl);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (input IQ1M)       ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.input_iq1m);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (layer F32)        ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (layer F16)     ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_f16);
+    LOG_INF("| Model params (layer F16)        ");
     LOG_INF("\n");
 
-    LOG_INF("| Model params (layer Q4K)     ");
+    LOG_INF("| Model params (layer Q2K)        ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q2k);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (layer Q4K)        ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q4k);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (layer Q50)     ");
-    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q50);
-    LOG_INF("\n");
-
-    LOG_INF("| Model params (layer Q5K)     ");
+    LOG_INF("| Model params (layer Q5K)        ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q5k);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (layer Q6K)     ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q6k);
+    LOG_INF("| Model params (layer Q6K)        ");
     LOG_INF("\n");
 
-    LOG_INF("| Model params (layer Q80)     ");
+    LOG_INF("| Model params (layer IQ2XXS)     ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq2xxs);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (layer Q50)        ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q50);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (layer Q80)        ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_q80);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (output F32)    ");
+    LOG_INF("| Model params (layer IQ1S)       ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq1s);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (layer IQ4NL)      ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq4nl);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (layer IQ1M)       ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.layer_iq1m);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (output F32)       ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f32);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (output F16)    ");
+    LOG_INF("| Model params (output F16)       ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_f16);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (output Q4K)    ");
+    LOG_INF("| Model params (output Q2K)       ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q2k);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (output Q4K)       ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q4k);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (output Q50)    ");
-    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q50);
-    LOG_INF("\n");
-
-    LOG_INF("| Model params (output Q5K)    ");
+    LOG_INF("| Model params (output Q5K)       ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q5k);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (output Q6K)    ");
+    LOG_INF("| Model params (output Q6K)       ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q6k);
     LOG_INF("\n");
 
-    LOG_INF("| Model params (output Q80)    ");
+    LOG_INF("| Model params (output IQ2XXS)    ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq2xxs);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (output Q50)       ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q50);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (output Q80)       ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_q80);
     LOG_INF("\n");
 
-    LOG_INF("| Model bytes (input)          ");
+    LOG_INF("| Model params (output IQ1S)      ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq1s);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (output IQ4NL)     ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq4nl);
+    LOG_INF("\n");
+
+    LOG_INF("| Model params (output IQ1M)      ");
+    LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_params.output_iq1m);
+    LOG_INF("\n");
+
+    LOG_INF("| Model bytes (input)             ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_input);
     LOG_INF("\n");
 
-    LOG_INF("| Model bytes (layer)          ");
+    LOG_INF("| Model bytes (layer)             ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_layer);
     LOG_INF("\n");
 
-    LOG_INF("| Model bytes (output)         ");
+    LOG_INF("| Model bytes (output)            ");
     LOG_INF("| %-10" PRId64 " ", dev_info_set[0].model_bytes.nb_output);
     LOG_INF("\n");
 
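The serialize() hunk below grows the fixed-size float sections from 7 to 12 CPU entries and from 20 to 30 GPU entries; those counts must track the memcpy sequence exactly. A compile-time sanity check for the arithmetic (counts only, derived from the comments in the hunk; not part of the patch):

// float_counts.cpp -- sanity check for serialize()'s buffer-size arithmetic
enum {
    N_DTYPES     = 12, // f32, f16, q2k, q4k, q5k, q6k, iq2xxs, q50, q80, iq1s, iq4nl, iq1m
    N_CPU_FLOATS = N_DTYPES,
    N_GPU_FLOATS = 2            // memory_free, memory_total
                 + 2            // metal_read_vram_bw, cuda_read_vram_bw
                 + 2 * N_DTYPES // metal_flops_*, cuda_flops_*
                 + 2            // metal_mem_cpy_delay, cuda_mem_cpy_delay
};
static_assert(N_CPU_FLOATS == 12, "must match sizeof(float) * 12 in serialize()");
static_assert(N_GPU_FLOATS == 30, "must match sizeof(float) * 30 in serialize()");

int main() { return 0; }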
@@ -2155,17 +2373,44 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
                        + gpu_description_len
                        + sizeof(struct disk_props)
                        + sizeof(uint32_t)       // cpu_props.cores
-                       + sizeof(float) * 7      // cpu_props.flops_f32_f32, cpu_props.flops_f16_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q50_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32, cpu_props.flops_q80_f32
+                       + sizeof(float) * 12     // - cpu_props.flops_f32_f32, cpu_props.flops_f16_f32,
+                                                // - cpu_props.flops_q2k_f32, cpu_props.flops_q4k_f32, cpu_props.flops_q5k_f32, cpu_props.flops_q6k_f32
+                                                // - cpu_props.flops_iq2xxs_f32
+                                                // - cpu_props.flops_q50_f32, cpu_props.flops_q80_f32
+                                                // - cpu_props.flops_iq1s_f32, cpu_props.flops_iq4nl_f32
+                                                // - cpu_props.flops_iq1m_f32
                        + sizeof(struct memory_info)
                        + sizeof(struct gpu_support)
-                       + sizeof(float) * 20;    // gpu_props.memory_free, gpu_props.memory_total, gpu_props.metal_read_vram_bw, gpu_props.cuda_read_vram_bw,
-                                                // gpu_props.metal_flops_f32_f32, gpu_props.metal_flops_f16_f32, gpu_props.metal_flops_q4k_f32, gpu_props.metal_flops_q50_f32, gpu_props.metal_flops_q5k_f32, gpu_props.metal_flops_q6k_f32, gpu_props.metal_flops_q80_f32,
-                                                // gpu_props.cuda_flops_f32_f32, gpu_props.cuda_flops_f16_f32, gpu_props.cuda_flops_q4k_f32, gpu_props.cuda_flops_q50_f32, gpu_props.cuda_flops_q5k_f32, gpu_props.cuda_flops_q6k_f32, gpu_props.cuda_flops_q80_f32,
-                                                // gpu_props.metal_mem_cpy_delay, gpu_props.cuda_mem_cpy_delay
+                       + sizeof(float) * 30;    // GPU attributes
+                                                // memory:
+                                                // - memory_free, memory_total
+                                                // - metal_read_vram_bw, cuda_read_vram_bw
+                                                // Metal floating-point performance:
+                                                // - metal_flops_f32_f32, metal_flops_f16_f32
+                                                // - metal_flops_q2k_f32, metal_flops_q4k_f32, metal_flops_q5k_f32, metal_flops_q6k_f32
+                                                // - metal_flops_iq2xxs_f32
+                                                // - metal_flops_q50_f32, metal_flops_q80_f32
+                                                // - metal_flops_iq1s_f32, metal_flops_iq4nl_f32
+                                                // - metal_flops_iq1m_f32
+                                                // CUDA floating-point performance:
+                                                // - cuda_flops_f32_f32, cuda_flops_f16_f32
+                                                // - cuda_flops_q2k_f32, cuda_flops_q4k_f32, cuda_flops_q5k_f32, cuda_flops_q6k_f32
+                                                // - cuda_flops_iq2xxs_f32
+                                                // - cuda_flops_q50_f32, cuda_flops_q80_f32
+                                                // - cuda_flops_iq1s_f32, cuda_flops_iq4nl_f32
+                                                // - cuda_flops_iq1m_f32
+                                                // delay:
+                                                // - metal_mem_cpy_delay, cuda_mem_cpy_delay
 
     *buffer = (char *)malloc(total_size);
     char * ptr = *buffer;
 
+    if (*buffer == NULL) {
+        LOG_ERR("%s: failed to allocate %zu bytes for device info serialization\n",
+                __func__, total_size);
+        return 0;
+    }
+
     // rank
     memcpy(ptr, &dev_info->rank, sizeof(uint32_t));
     ptr += sizeof(uint32_t);
@@ -2214,10 +2459,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->cpu_props.flops_f16_f32, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float));
+    memcpy(ptr, &dev_info->cpu_props.flops_q2k_f32, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(ptr, &dev_info->cpu_props.flops_q50_f32, sizeof(float));
+    memcpy(ptr, &dev_info->cpu_props.flops_q4k_f32, sizeof(float));
     ptr += sizeof(float);
 
     memcpy(ptr, &dev_info->cpu_props.flops_q5k_f32, sizeof(float));
@@ -2226,9 +2471,24 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->cpu_props.flops_q6k_f32, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(ptr, &dev_info->cpu_props.flops_iq2xxs_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->cpu_props.flops_q50_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->cpu_props.flops_q80_f32, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(ptr, &dev_info->cpu_props.flops_iq1s_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->cpu_props.flops_iq4nl_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->cpu_props.flops_iq1m_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->memory, sizeof(struct memory_info));
     ptr += sizeof(struct memory_info);
@@ -2250,10 +2510,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.metal_flops_f16_f32, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_q2k_f32, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(ptr, &dev_info->gpu_props.metal_flops_q50_f32, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_q4k_f32, sizeof(float));
     ptr += sizeof(float);
 
     memcpy(ptr, &dev_info->gpu_props.metal_flops_q5k_f32, sizeof(float));
@@ -2262,9 +2522,24 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.metal_flops_q6k_f32, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_iq2xxs_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_q50_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->gpu_props.metal_flops_q80_f32, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_iq1s_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_iq4nl_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.metal_flops_iq1m_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->gpu_props.metal_mem_cpy_delay, sizeof(float));
     ptr += sizeof(float);
@@ -2277,10 +2552,10 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_f16_f32, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q2k_f32, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q50_f32, sizeof(float));
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q4k_f32, sizeof(float));
     ptr += sizeof(float);
 
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_q5k_f32, sizeof(float));
@@ -2289,9 +2564,24 @@ size_t serialize(const struct device_info * dev_info, char ** buffer) {
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_q6k_f32, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq2xxs_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_q50_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->gpu_props.cuda_flops_q80_f32, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq1s_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq4nl_f32, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(ptr, &dev_info->gpu_props.cuda_flops_iq1m_f32, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(ptr, &dev_info->gpu_props.cuda_mem_cpy_delay, sizeof(float));
 
     // no need to synchronize model flops and model params
@@ -2366,10 +2656,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->cpu_props.flops_f16_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float));
+    memcpy(&dev_info->cpu_props.flops_q2k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(&dev_info->cpu_props.flops_q50_f32, ptr, sizeof(float));
+    memcpy(&dev_info->cpu_props.flops_q4k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
     memcpy(&dev_info->cpu_props.flops_q5k_f32, ptr, sizeof(float));
@@ -2378,9 +2668,24 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->cpu_props.flops_q6k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(&dev_info->cpu_props.flops_iq2xxs_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->cpu_props.flops_q50_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->cpu_props.flops_q80_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(&dev_info->cpu_props.flops_iq1s_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->cpu_props.flops_iq4nl_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->cpu_props.flops_iq1m_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->memory, ptr, sizeof(struct memory_info));
     ptr += sizeof(struct memory_info);
@@ -2402,10 +2707,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.metal_flops_f16_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.metal_flops_q2k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(&dev_info->gpu_props.metal_flops_q50_f32, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.metal_flops_q4k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
     memcpy(&dev_info->gpu_props.metal_flops_q5k_f32, ptr, sizeof(float));
@@ -2414,9 +2719,24 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.metal_flops_q6k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(&dev_info->gpu_props.metal_flops_iq2xxs_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.metal_flops_q50_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->gpu_props.metal_flops_q80_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(&dev_info->gpu_props.metal_flops_iq1s_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.metal_flops_iq4nl_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.metal_flops_iq1m_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->gpu_props.metal_mem_cpy_delay, ptr, sizeof(float));
     ptr += sizeof(float);
@@ -2429,10 +2749,10 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.cuda_flops_f16_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.cuda_flops_q2k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
-    memcpy(&dev_info->gpu_props.cuda_flops_q50_f32, ptr, sizeof(float));
+    memcpy(&dev_info->gpu_props.cuda_flops_q4k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
     memcpy(&dev_info->gpu_props.cuda_flops_q5k_f32, ptr, sizeof(float));
@@ -2441,9 +2761,24 @@ void deserialize(const char * buffer, struct device_info * dev_info) {
     memcpy(&dev_info->gpu_props.cuda_flops_q6k_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(&dev_info->gpu_props.cuda_flops_iq2xxs_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.cuda_flops_q50_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->gpu_props.cuda_flops_q80_f32, ptr, sizeof(float));
     ptr += sizeof(float);
 
+    memcpy(&dev_info->gpu_props.cuda_flops_iq1s_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.cuda_flops_iq4nl_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
+    memcpy(&dev_info->gpu_props.cuda_flops_iq1m_f32, ptr, sizeof(float));
+    ptr += sizeof(float);
+
     memcpy(&dev_info->gpu_props.cuda_mem_cpy_delay, ptr, sizeof(float));
 
     // no need to synchronize model flops and model params
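Because the new fields slot into the middle of the wire format (q2k now occupies the position q4k used to), serialize() and deserialize() must move in lockstep or peers on mismatched builds will silently misread every following field. One way to make that invariant structural rather than manual (a sketch only, not part of this patch) is to generate both directions from a single X-macro field list:

// field_list.cpp -- generating both copy directions from one ordered list
#include <cstring>

struct cpu_props_s { float flops_f32_f32, flops_f16_f32, flops_q2k_f32; /* ... */ };

#define CPU_FLOPS_FIELDS(X) \
    X(flops_f32_f32)        \
    X(flops_f16_f32)        \
    X(flops_q2k_f32)
// ... one entry per serialized float, in wire order

static char * write_cpu_flops(char * ptr, const cpu_props_s & p) {
#define WRITE_FIELD(f) memcpy(ptr, &p.f, sizeof(float)); ptr += sizeof(float);
    CPU_FLOPS_FIELDS(WRITE_FIELD)
#undef WRITE_FIELD
    return ptr;
}

static const char * read_cpu_flops(const char * ptr, cpu_props_s & p) {
#define READ_FIELD(f) memcpy(&p.f, ptr, sizeof(float)); ptr += sizeof(float);
    CPU_FLOPS_FIELDS(READ_FIELD)
#undef READ_FIELD
    return ptr;
}

int main() {
    cpu_props_s a = { 1.0f, 2.0f, 3.0f }, b = {};
    char buf[sizeof(float) * 3];
    write_cpu_flops(buf, a);
    read_cpu_flops(buf, b);
    return b.flops_q2k_f32 == 3.0f ? 0 : 1; // round-trips in the same order
}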
diff --git a/common/profiler.h b/common/profiler.h
index fb9a4ddb..b8fff0d1 100644
--- a/common/profiler.h
+++ b/common/profiler.h
@@ -15,25 +15,36 @@ struct cpu_props {
     const char * name;
     const char * description;
     uint32_t cores;
-    float flops_f32_f32; // in GFLOPS
-    float flops_f16_f32; // in GFLOPS
-    float flops_q4k_f32; // in GFLOPS
-    float flops_q50_f32; // in GFLOPS
-    float flops_q5k_f32; // in GFLOPS
-    float flops_q6k_f32; // in GFLOPS
-    float flops_q80_f32; // in GFLOPS
+    float flops_f32_f32;    // in GFLOPS
+    float flops_f16_f32;    // in GFLOPS
+    float flops_q2k_f32;    // in GFLOPS
+    float flops_q4k_f32;    // in GFLOPS
+    float flops_q5k_f32;    // in GFLOPS
+    float flops_q6k_f32;    // in GFLOPS
+    float flops_iq2xxs_f32; // in GFLOPS
+    float flops_q50_f32;    // in GFLOPS
+    float flops_q80_f32;    // in GFLOPS
+    float flops_iq1s_f32;   // in GFLOPS
+    float flops_iq4nl_f32;  // in GFLOPS
+    float flops_iq1m_f32;   // in GFLOPS

-    cpu_props() :
-        name(""),
-        description(""),
-        cores(0),
-        flops_f32_f32(0.0f),
-        flops_f16_f32(0.0f),
-        flops_q4k_f32(0.0f),
-        flops_q50_f32(0.0f),
-        flops_q5k_f32(0.0f),
-        flops_q6k_f32(0.0f),
-        flops_q80_f32(0.0f) {}
+    cpu_props()
+        : name            (""),
+          description     (""),
+          cores           (0),
+          flops_f32_f32   (0.0f),
+          flops_f16_f32   (0.0f),
+          flops_q2k_f32   (0.0f),
+          flops_q4k_f32   (0.0f),
+          flops_q5k_f32   (0.0f),
+          flops_q6k_f32   (0.0f),
+          flops_iq2xxs_f32(0.0f),
+          flops_q50_f32   (0.0f),
+          flops_q80_f32   (0.0f),
+          flops_iq1s_f32  (0.0f),
+          flops_iq4nl_f32 (0.0f),
+          flops_iq1m_f32  (0.0f)
+    {}
 };

 struct memory_info {
@@ -77,132 +88,204 @@ struct gpu_support {
 struct gpu_props {
     const char * name;
     const char * description;
-    float memory_free;         // in GiB
-    float memory_total;        // in GiB
-    float metal_read_vram_bw;  // in GB/s
-    float metal_flops_f32_f32; // in GFLOPS
-    float metal_flops_f16_f32; // in GFLOPS
-    float metal_flops_q4k_f32; // in GFLOPS
-    float metal_flops_q50_f32; // in GFLOPS
-    float metal_flops_q5k_f32; // in GFLOPS
-    float metal_flops_q6k_f32; // in GFLOPS
-    float metal_flops_q80_f32; // in GFLOPS
-    float metal_mem_cpy_delay; // in ms
-    float cuda_read_vram_bw;   // in GB/s
-    float cuda_flops_f32_f32;  // in GFLOPS
-    float cuda_flops_f16_f32;  // in GFLOPS
-    float cuda_flops_q4k_f32;  // in GFLOPS
-    float cuda_flops_q50_f32;  // in GFLOPS
-    float cuda_flops_q5k_f32;  // in GFLOPS
-    float cuda_flops_q6k_f32;  // in GFLOPS
-    float cuda_flops_q80_f32;  // in GFLOPS
-    float cuda_mem_cpy_delay;  // in ms
+    float memory_free;            // in GiB
+    float memory_total;           // in GiB
+    float metal_read_vram_bw;     // in GB/s
+    float metal_flops_f32_f32;    // in GFLOPS
+    float metal_flops_f16_f32;    // in GFLOPS
+    float metal_flops_q2k_f32;    // in GFLOPS
+    float metal_flops_q4k_f32;    // in GFLOPS
+    float metal_flops_q5k_f32;    // in GFLOPS
+    float metal_flops_q6k_f32;    // in GFLOPS
+    float metal_flops_iq2xxs_f32; // in GFLOPS
+    float metal_flops_q50_f32;    // in GFLOPS
+    float metal_flops_q80_f32;    // in GFLOPS
+    float metal_flops_iq1s_f32;   // in GFLOPS
+    float metal_flops_iq4nl_f32;  // in GFLOPS
+    float metal_flops_iq1m_f32;   // in GFLOPS
+    float metal_mem_cpy_delay;    // in ms
+    float cuda_read_vram_bw;      // in GB/s
+    float cuda_flops_f32_f32;     // in GFLOPS
+    float cuda_flops_f16_f32;     // in GFLOPS
+    float cuda_flops_q2k_f32;     // in GFLOPS
+    float cuda_flops_q4k_f32;     // in GFLOPS
+    float cuda_flops_q5k_f32;     // in GFLOPS
+    float cuda_flops_q6k_f32;     // in GFLOPS
+    float cuda_flops_iq2xxs_f32;  // in GFLOPS
+    float cuda_flops_q50_f32;     // in GFLOPS
+    float cuda_flops_q80_f32;     // in GFLOPS
+    float cuda_flops_iq1s_f32;    // in GFLOPS
+    float cuda_flops_iq4nl_f32;   // in GFLOPS
+    float cuda_flops_iq1m_f32;    // in GFLOPS
+    float cuda_mem_cpy_delay;     // in ms

     gpu_props() :
-        name(""),
-        description(""),
-        memory_free        (0.0f),
-        memory_total       (0.0f),
-        metal_read_vram_bw (0.0f),
-        metal_flops_f32_f32(0.0f),
-        metal_flops_f16_f32(0.0f),
-        metal_flops_q4k_f32(0.0f),
-        metal_flops_q50_f32(0.0f),
-        metal_flops_q5k_f32(0.0f),
-        metal_flops_q6k_f32(0.0f),
-        metal_flops_q80_f32(0.0f),
-        metal_mem_cpy_delay(0.0f),
-        cuda_read_vram_bw  (0.0f),
-        cuda_flops_f32_f32 (0.0f),
-        cuda_flops_f16_f32 (0.0f),
-        cuda_flops_q4k_f32 (0.0f),
-        cuda_flops_q50_f32 (0.0f),
-        cuda_flops_q5k_f32 (0.0f),
-        cuda_flops_q6k_f32 (0.0f),
-        cuda_flops_q80_f32 (0.0f),
-        cuda_mem_cpy_delay (0.0f) {}
+        name                  (""),
+        description           (""),
+        memory_free           (0.0f),
+        memory_total          (0.0f),
+        metal_read_vram_bw    (0.0f),
+        metal_flops_f32_f32   (0.0f),
+        metal_flops_f16_f32   (0.0f),
+        metal_flops_q2k_f32   (0.0f),
+        metal_flops_q4k_f32   (0.0f),
+        metal_flops_q5k_f32   (0.0f),
+        metal_flops_q6k_f32   (0.0f),
+        metal_flops_iq2xxs_f32(0.0f),
+        metal_flops_q50_f32   (0.0f),
+        metal_flops_q80_f32   (0.0f),
+        metal_flops_iq1s_f32  (0.0f),
+        metal_flops_iq4nl_f32 (0.0f),
+        metal_flops_iq1m_f32  (0.0f),
+        metal_mem_cpy_delay   (0.0f),
+        cuda_read_vram_bw     (0.0f),
+        cuda_flops_f32_f32    (0.0f),
+        cuda_flops_f16_f32    (0.0f),
+        cuda_flops_q2k_f32    (0.0f),
+        cuda_flops_q4k_f32    (0.0f),
+        cuda_flops_q5k_f32    (0.0f),
+        cuda_flops_q6k_f32    (0.0f),
+        cuda_flops_iq2xxs_f32 (0.0f),
+        cuda_flops_q50_f32    (0.0f),
+        cuda_flops_q80_f32    (0.0f),
+        cuda_flops_iq1s_f32   (0.0f),
+        cuda_flops_iq4nl_f32  (0.0f),
+        cuda_flops_iq1m_f32   (0.0f),
+        cuda_mem_cpy_delay    (0.0f) {}
 };

 struct model_flops {
     float   inp_embd_ms;
     int64_t output_f32_f32;
     int64_t output_f16_f32;
+    int64_t output_q2k_f32;
     int64_t output_q4k_f32;
-    int64_t output_q50_f32;
     int64_t output_q5k_f32;
     int64_t output_q6k_f32;
+    int64_t output_iq2xxs_f32;
+    int64_t output_q50_f32;
     int64_t output_q80_f32;
+    int64_t output_iq1s_f32;
+    int64_t output_iq4nl_f32;
+    int64_t output_iq1m_f32;
     int64_t layer_f32_f32;
     int64_t layer_f16_f32;
+    int64_t layer_q2k_f32;
     int64_t layer_q4k_f32;
-    int64_t layer_q50_f32;
     int64_t layer_q5k_f32;
     int64_t layer_q6k_f32;
+    int64_t layer_iq2xxs_f32;
+    int64_t layer_q50_f32;
     int64_t layer_q80_f32;
+    int64_t layer_iq1s_f32;
+    int64_t layer_iq4nl_f32;
+    int64_t layer_iq1m_f32;

     model_flops() :
-        inp_embd_ms(0.0f),
-        output_f32_f32(0),
-        output_f16_f32(0),
-        output_q4k_f32(0),
-        output_q50_f32(0),
-        output_q5k_f32(0),
-        output_q6k_f32(0),
-        output_q80_f32(0),
-        layer_f32_f32 (0),
-        layer_f16_f32 (0),
-        layer_q4k_f32 (0),
-        layer_q50_f32 (0),
-        layer_q5k_f32 (0),
-        layer_q6k_f32 (0),
-        layer_q80_f32 (0) {}
+        inp_embd_ms       (0.0f),
+        output_f32_f32    (0),
+        output_f16_f32    (0),
+        output_q2k_f32    (0),
+        output_q4k_f32    (0),
+        output_q5k_f32    (0),
+        output_q6k_f32    (0),
+        output_iq2xxs_f32 (0),
+        output_q50_f32    (0),
+        output_q80_f32    (0),
+        output_iq1s_f32   (0),
+        output_iq4nl_f32  (0),
+        output_iq1m_f32   (0),
+        layer_f32_f32     (0),
+        layer_f16_f32     (0),
+        layer_q2k_f32     (0),
+        layer_q4k_f32     (0),
+        layer_q5k_f32     (0),
+        layer_q6k_f32     (0),
+        layer_iq2xxs_f32  (0),
+        layer_q50_f32     (0),
+        layer_q80_f32     (0),
+        layer_iq1s_f32    (0),
+        layer_iq4nl_f32   (0),
+        layer_iq1m_f32    (0)
+    {}
 };

 struct model_params {
     int64_t input_f32;
     int64_t input_f16;
+    int64_t input_q2k;
     int64_t input_q4k;
-    int64_t input_q50;
     int64_t input_q5k;
     int64_t input_q6k;
+    int64_t input_iq2xxs;
+    int64_t input_q50;
     int64_t input_q80;
+    int64_t input_iq1s;
+    int64_t input_iq4nl;
+    int64_t input_iq1m;
     int64_t output_f32;
     int64_t output_f16;
+    int64_t output_q2k;
     int64_t output_q4k;
-    int64_t output_q50;
     int64_t output_q5k;
     int64_t output_q6k;
+    int64_t output_iq2xxs;
+    int64_t output_q50;
     int64_t output_q80;
+    int64_t output_iq1s;
+    int64_t output_iq4nl;
+    int64_t output_iq1m;
     int64_t layer_f32;
     int64_t layer_f16;
+    int64_t layer_q2k;
     int64_t layer_q4k;
-    int64_t layer_q50;
     int64_t layer_q5k;
     int64_t layer_q6k;
+    int64_t layer_iq2xxs;
+    int64_t layer_q50;
     int64_t layer_q80;
+    int64_t layer_iq1s;
+    int64_t layer_iq4nl;
+    int64_t layer_iq1m;

     model_params() :
-        input_f32 (0),
-        input_f16 (0),
-        input_q4k (0),
-        input_q50 (0),
-        input_q5k (0),
-        input_q6k (0),
-        input_q80 (0),
-        output_f32(0),
-        output_f16(0),
-        output_q4k(0),
-        output_q50(0),
-        output_q5k(0),
-        output_q6k(0),
-        output_q80(0),
-        layer_f32 (0),
-        layer_f16 (0),
-        layer_q4k (0),
-        layer_q50 (0),
-        layer_q5k (0),
-        layer_q6k (0),
-        layer_q80 (0) {}
+        input_f32     (0),
+        input_f16     (0),
+        input_q2k     (0),
+        input_q4k     (0),
+        input_q5k     (0),
+        input_q6k     (0),
+        input_iq2xxs  (0),
+        input_q50     (0),
+        input_q80     (0),
+        input_iq1s    (0),
+        input_iq4nl   (0),
+        input_iq1m    (0),
+        output_f32    (0),
+        output_f16    (0),
+        output_q2k    (0),
+        output_q4k    (0),
+        output_q5k    (0),
+        output_q6k    (0),
+        output_iq2xxs (0),
+        output_q50    (0),
+        output_q80    (0),
+        output_iq1s   (0),
+        output_iq4nl  (0),
+        output_iq1m   (0),
+        layer_f32     (0),
+        layer_f16     (0),
+        layer_q2k     (0),
+        layer_q4k     (0),
+        layer_q5k     (0),
+        layer_q6k     (0),
+        layer_iq2xxs  (0),
+        layer_q50     (0),
+        layer_q80     (0),
+        layer_iq1s    (0),
+        layer_iq4nl   (0),
+        layer_iq1m    (0)
+    {}
 };

 struct model_bytes {
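Each new quant type now costs one field in cpu_props plus one per GPU backend, plus matching constructor entries and switch arms elsewhere. A hedged alternative, not part of this patch, would be to key measured throughput by the ggml_type id itself, so a future type is a one-line addition:

    #include <array>
    #include "ggml.h" // for ggml_type / GGML_TYPE_COUNT

    // Sketch only: per-type GFLOPS indexed by ggml_type, zero-initialized like
    // the constructors above; one instance per backend (cpu / metal / cuda).
    struct backend_flops {
        std::array<float, GGML_TYPE_COUNT> gflops{};
    };

    // usage (device_cpu_flops is the profiler helper used later in this patch):
    // cpu.gflops[GGML_TYPE_IQ1_S] = device_cpu_flops(model, GGML_TYPE_IQ1_S, GGML_TYPE_F32, n_threads);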
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 962dc032..4af68abc 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -385,12 +385,12 @@ extern "C" {
         GGML_TYPE_F64     = 28,
         GGML_TYPE_IQ1_M   = 29,
         GGML_TYPE_BF16    = 30,
-        GGML_TYPE_Q4_0_4_4 = 31,
-        GGML_TYPE_Q4_0_4_8 = 32,
-        GGML_TYPE_Q4_0_8_8 = 33,
+        // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
+        // GGML_TYPE_Q4_0_4_8 = 32,
+        // GGML_TYPE_Q4_0_8_8 = 33,
         GGML_TYPE_TQ1_0   = 34,
         GGML_TYPE_TQ2_0   = 35,
-        GGML_TYPE_COUNT,
+        GGML_TYPE_COUNT = 39,
     };

     // precision
@@ -431,9 +431,6 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M  = 23, // except 1d tensors
         GGML_FTYPE_MOSTLY_BF16   = 24, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
     };

     // available tensor operations:
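Pinning GGML_TYPE_COUNT = 39 is load-bearing: with enumerators 31-33 commented out, a bare GGML_TYPE_COUNT would follow GGML_TYPE_TQ2_0 = 35 and silently collapse to 36, shrinking every array sized with it (such as type_traits[GGML_TYPE_COUNT] below) and destabilizing the numeric type ids stored in gguf files. A guard along these lines, not in the patch, would catch accidental renumbering:

    // Hypothetical compile-time check, placed in any translation unit:
    static_assert(GGML_TYPE_COUNT == 39,
                  "ggml_type ids are serialized in gguf files and must not be renumbered");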
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 7aa6dce8..1c57cb95 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -15725,15 +15725,6 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
             {
                 VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
             } break;
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-            {
-                VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4);
-            } break;
-        case GGML_TYPE_Q4_0_8_8:
-            {
-                VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
-            } break;
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 73426a5d..ffae7f2e 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1076,54 +1076,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_BF16,
         .nrows = 1,
     },
-    [GGML_TYPE_Q4_0_4_4] = {
-        .type_name            = "q4_0_4x4",
-        .blck_size            = QK4_0,
-        .blck_size_interleave = 4,
-        .type_size            = sizeof(block_q4_0),
-        .is_quantized         = true,
-        .to_float             = NULL,
-        .from_float           = NULL,
-        .from_float_ref       = NULL,
-        .vec_dot              = NULL,
-        .vec_dot_type         = GGML_TYPE_Q8_0,
-        .nrows                = 1,
-        .ncols                = 4,
-        .gemv                 = ggml_gemv_q4_0_4x4_q8_0,
-        .gemm                 = ggml_gemm_q4_0_4x4_q8_0,
-    },
-    [GGML_TYPE_Q4_0_4_8] = {
-        .type_name            = "q4_0_4x8",
-        .blck_size            = QK4_0,
-        .blck_size_interleave = 8,
-        .type_size            = sizeof(block_q4_0),
-        .is_quantized         = true,
-        .to_float             = NULL,
-        .from_float           = NULL,
-        .from_float_ref       = NULL,
-        .vec_dot              = NULL,
-        .vec_dot_type         = GGML_TYPE_Q8_0,
-        .nrows                = 1,
-        .ncols                = 4,
-        .gemv                 = ggml_gemv_q4_0_4x8_q8_0,
-        .gemm                 = ggml_gemm_q4_0_4x8_q8_0,
-    },
-    [GGML_TYPE_Q4_0_8_8] = {
-        .type_name            = "q4_0_8x8",
-        .blck_size            = QK4_0,
-        .blck_size_interleave = 8,
-        .type_size            = sizeof(block_q4_0),
-        .is_quantized         = true,
-        .to_float             = NULL,
-        .from_float           = NULL,
-        .from_float_ref       = NULL,
-        .vec_dot              = NULL,
-        .vec_dot_type         = GGML_TYPE_Q8_0,
-        .nrows                = 1,
-        .ncols                = 8,
-        .gemv                 = ggml_gemv_q4_0_8x8_q8_0,
-        .gemm                 = ggml_gemm_q4_0_8x8_q8_0,
-    },
     [GGML_TYPE_TQ1_0] = {
         .type_name = "tq1_0",
         .blck_size = QK_K,
@@ -3472,7 +3424,7 @@ size_t ggml_row_size(enum ggml_type type, int64_t ne) {
 double ggml_type_sizef(enum ggml_type type) {
     return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
 }
- 
+
 const char * ggml_type_name(enum ggml_type type) {
     return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE";
 }
@@ -3578,9 +3530,6 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
         case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
         case GGML_FTYPE_MOSTLY_IQ3_S:  wtype = GGML_TYPE_IQ3_S;  break;
         case GGML_FTYPE_MOSTLY_IQ2_S:  wtype = GGML_TYPE_IQ2_S;  break;
-        case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break;
-        case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break;
-        case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
         case GGML_FTYPE_UNKNOWN:       wtype = GGML_TYPE_COUNT; break;
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
     }
@@ -4107,7 +4056,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.data         =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
         /*.name         =*/ { 0 },
         /*.extra        =*/ NULL,
-        ///*.padding      =*/ { 0 },
+        // /*.padding      =*/ { 0 },
     };

 #ifdef __clang__
@@ -9517,9 +9466,6 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
             {
                 ggml_compute_forward_add_q_f32(params, dst);
             } break;
@@ -9897,9 +9843,6 @@ static void ggml_compute_forward_add1(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
             {
                 ggml_compute_forward_add1_q_f32(params, dst);
             } break;
@@ -10027,9 +9970,6 @@ static void ggml_compute_forward_acc(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
         default:
             {
                 GGML_ABORT("fatal error");
@@ -13093,9 +13033,6 @@ static void ggml_compute_forward_out_prod(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
             {
                 ggml_compute_forward_out_prod_q_f32(params, dst);
             } break;
@@ -13283,9 +13220,6 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
         default:
             {
                 GGML_ABORT("fatal error");
@@ -13547,9 +13481,6 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
             {
                 ggml_compute_forward_get_rows_q(params, dst);
             } break;
@@ -14139,9 +14070,6 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_IQ3_S:
         case GGML_TYPE_IQ2_S:
         case GGML_TYPE_Q8_K:
-        case GGML_TYPE_Q4_0_4_4:
-        case GGML_TYPE_Q4_0_4_8:
-        case GGML_TYPE_Q4_0_8_8:
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
@@ -21941,9 +21869,6 @@ size_t ggml_quantize_chunk(
         case GGML_TYPE_IQ1_M:   result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_NL:  result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_IQ4_XS:  result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case GGML_TYPE_F16:
             {
                 size_t elemsize = sizeof(ggml_fp16_t);
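One side effect worth knowing: because type_traits[] uses designated initializers, deleting the three entries leaves slots 31-33 zero-filled, so ggml_type_name() now returns NULL rather than "NONE" for those ids (the type < GGML_TYPE_COUNT check still passes). A caller printing names of ids read from old files may want a guard; a minimal sketch, where stored_type_id stands in for a raw id from an old gguf file:

    #include <cstdio>
    #include "ggml.h"

    // Defensive lookup: NULL means the slot exists but its traits were removed.
    static void print_type_name(int stored_type_id) {
        const char * name = ggml_type_name((enum ggml_type) stored_type_id);
        printf("tensor type: %s\n", name ? name : "(removed type)");
    }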
diff --git a/include/llama.h b/include/llama.h
index 5c14d2a3..7d7392fe 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -165,18 +165,18 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q2_K_S   = 21, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_XS   = 22, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_XXS  = 23, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ1_S    = 24, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_S    = 24, // except 1d tensors, 1-bit quantization
         LLAMA_FTYPE_MOSTLY_IQ4_NL   = 25, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_S    = 26, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_M    = 27, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_S    = 28, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_M    = 29, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ4_XS   = 30, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ1_M    = 31, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ1_M    = 31, // except 1d tensors, 1-bit quantization
         LLAMA_FTYPE_MOSTLY_BF16     = 32, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
+        // LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack
+        // LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack
+        // LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
         LLAMA_FTYPE_MOSTLY_TQ1_0    = 36, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_TQ2_0    = 37, // except 1d tensors
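The retired ids 33-35 can still appear in pre-existing gguf files. As changed below, the loader now routes such files to its unknown-type warning path; a loader that wanted to keep them working could instead coerce them to plain Q4_0 and rely on runtime repacking. A sketch of that alternative, not this patch's behavior:

    #include "llama.h"

    // Hypothetical normalization of a raw file-type value read from an old gguf.
    static enum llama_ftype normalize_ftype(int raw) {
        if (raw >= 33 && raw <= 35) {        // former Q4_0_4_4 / Q4_0_4_8 / Q4_0_8_8
            return LLAMA_FTYPE_MOSTLY_Q4_0;  // repacked at load time where supported
        }
        return (enum llama_ftype) raw;
    }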
diff --git a/src/llama.cpp b/src/llama.cpp
index 88e13e5a..1aedb6a4 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3560,16 +3560,26 @@ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype)
         case GGML_TYPE_F32:
         case GGML_TYPE_F16:
             return true;
+        case GGML_TYPE_Q2_K:
+            return n_params->layer_q2k    > 0 || n_params->output_q2k    > 0;
         case GGML_TYPE_Q4_K:
-            return n_params->layer_q4k > 0 || n_params->output_q4k > 0;
-        case GGML_TYPE_Q5_0:
-            return n_params->layer_q50 > 0 || n_params->output_q50 > 0;
+            return n_params->layer_q4k    > 0 || n_params->output_q4k    > 0;
         case GGML_TYPE_Q5_K:
-            return n_params->layer_q5k > 0 || n_params->output_q5k > 0;
+            return n_params->layer_q5k    > 0 || n_params->output_q5k    > 0;
         case GGML_TYPE_Q6_K:
-            return n_params->layer_q6k > 0 || n_params->output_q6k > 0;
+            return n_params->layer_q6k    > 0 || n_params->output_q6k    > 0;
+        case GGML_TYPE_IQ2_XXS:
+            return n_params->layer_iq2xxs > 0 || n_params->output_iq2xxs > 0;
+        case GGML_TYPE_Q5_0:
+            return n_params->layer_q50    > 0 || n_params->output_q50    > 0;
         case GGML_TYPE_Q8_0:
-            return n_params->layer_q80 > 0 || n_params->output_q80 > 0;
+            return n_params->layer_q80    > 0 || n_params->output_q80    > 0;
+        case GGML_TYPE_IQ1_S:
+            return n_params->layer_iq1s   > 0 || n_params->output_iq1s   > 0;
+        case GGML_TYPE_IQ4_NL:
+            return n_params->layer_iq4nl  > 0 || n_params->output_iq4nl  > 0;
+        case GGML_TYPE_IQ1_M:
+            return n_params->layer_iq1m   > 0 || n_params->output_iq1m   > 0;
         default:
             throw std::runtime_error("Unrecognized data type\n");
     }
@@ -3650,18 +3660,18 @@ void llama_profile_device(
         dev_info->gpu_props.cuda_flops_f16_f32  = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F32);
     }

+    if (is_dtype_exist(n_params, GGML_TYPE_Q2_K)) {
+        dev_info->cpu_props.flops_q2k_f32       = device_cpu_flops  (model, GGML_TYPE_Q2_K, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_q2k_f32 = device_metal_flops(model, GGML_TYPE_Q2_K, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_q2k_f32  = device_cuda_flops (model, GGML_TYPE_Q2_K, GGML_TYPE_F32);
+    }
+
     if (is_dtype_exist(n_params, GGML_TYPE_Q4_K)) {
         dev_info->cpu_props.flops_q4k_f32       = device_cpu_flops  (model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
         dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
         dev_info->gpu_props.cuda_flops_q4k_f32  = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
     }

-    if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) {
-        dev_info->cpu_props.flops_q50_f32       = device_cpu_flops  (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads);
-        dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
-        dev_info->gpu_props.cuda_flops_q50_f32  = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
-    }
-
     if (is_dtype_exist(n_params, GGML_TYPE_Q5_K)) {
         dev_info->cpu_props.flops_q5k_f32       = device_cpu_flops  (model, GGML_TYPE_Q5_K, GGML_TYPE_F32, n_threads);
         dev_info->gpu_props.metal_flops_q5k_f32 = device_metal_flops(model, GGML_TYPE_Q5_K, GGML_TYPE_F32);
@@ -3674,11 +3684,42 @@ void llama_profile_device(
         dev_info->gpu_props.cuda_flops_q6k_f32  = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
     }

+    if (is_dtype_exist(n_params, GGML_TYPE_IQ2_XXS)) {
+        dev_info->cpu_props.flops_iq2xxs_f32       = device_cpu_flops  (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_iq2xxs_f32 = device_metal_flops(model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_iq2xxs_f32  = device_cuda_flops (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32);
+    }
+
+    if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) {
+        dev_info->cpu_props.flops_q50_f32       = device_cpu_flops  (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_q50_f32  = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
+    }
+
     if (is_dtype_exist(n_params, GGML_TYPE_Q8_0)) {
         dev_info->cpu_props.flops_q80_f32       = device_cpu_flops  (model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads);
         dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
         dev_info->gpu_props.cuda_flops_q80_f32  = device_cuda_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
     }
+
+    if (is_dtype_exist(n_params, GGML_TYPE_IQ1_S)) {
+        dev_info->cpu_props.flops_iq1s_f32       = device_cpu_flops  (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_iq1s_f32 = device_metal_flops(model, GGML_TYPE_IQ1_S, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_iq1s_f32  = device_cuda_flops (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32);
+    }
+
+    if (is_dtype_exist(n_params, GGML_TYPE_IQ4_NL)) {
+        dev_info->cpu_props.flops_iq4nl_f32       = device_cpu_flops  (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_iq4nl_f32 = device_metal_flops(model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_iq4nl_f32  = device_cuda_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32);
+    }
+
+    if (is_dtype_exist(n_params, GGML_TYPE_IQ1_M)) {
+        dev_info->cpu_props.flops_iq1m_f32       = device_cpu_flops  (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_iq1m_f32 = device_metal_flops(model, GGML_TYPE_IQ1_M, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_iq1m_f32  = device_cuda_flops (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32);
+    }
 }

 ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
@@ -4844,9 +4885,7 @@ struct llama_model_loader {
             case GGML_TYPE_IQ4_NL:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL;  break;
             case GGML_TYPE_IQ4_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS;  break;
             case GGML_TYPE_IQ3_S:   ftype = LLAMA_FTYPE_MOSTLY_IQ3_S;   break;
-            case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
-            case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
-            case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
+
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -5654,9 +5693,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ4_XS:   return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S:    return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M:    return "IQ3_S mix - 3.66 bpw";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";

         default: return "unknown, may not work";
     }
@@ -18997,10 +19033,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
         new_type = GGML_TYPE_IQ3_S;
     }
-    else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
-             new_type == GGML_TYPE_Q4_0_8_8) {
-        new_type = GGML_TYPE_Q4_0;
-    }
     else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
         new_type = GGML_TYPE_Q4_K;
     }
@@ -19323,10 +19355,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S:  default_type = GGML_TYPE_IQ3_S;  break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M:  default_type = GGML_TYPE_IQ3_S;  break;
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
-
+
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -19646,14 +19675,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 f32_data = (float *) f32_conv_buf.data();
             }

-            int chunk_size_multiplier = 1;
-            if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
-                if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
-                else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
-                if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
-                else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
-            }
-
             LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
             fflush(stdout);
@@ -19666,8 +19687,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             const int64_t nrows = tensor->ne[1];

             static const int64_t min_chunk_size = 32 * 512;
-            const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
-                                       chunk_size_multiplier;
+            const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));

             const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
             const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
@@ -21049,25 +21069,40 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
         case PROFILER_LAYER_OUTPUT:
             switch (dtype) {
                 case GGML_TYPE_F32:
-                    n_flops->output_f32_f32 += n;
+                    n_flops->output_f32_f32    += n;
                     break;
                 case GGML_TYPE_F16:
-                    n_flops->output_f16_f32 += n;
+                    n_flops->output_f16_f32    += n;
+                    break;
+                case GGML_TYPE_Q2_K:
+                    n_flops->output_q2k_f32    += n;
                     break;
                 case GGML_TYPE_Q4_K:
-                    n_flops->output_q4k_f32 += n;
-                    break;
-                case GGML_TYPE_Q5_0:
-                    n_flops->output_q50_f32 += n;
+                    n_flops->output_q4k_f32    += n;
                     break;
                 case GGML_TYPE_Q5_K:
-                    n_flops->output_q5k_f32 += n;
+                    n_flops->output_q5k_f32    += n;
                     break;
                 case GGML_TYPE_Q6_K:
-                    n_flops->output_q6k_f32 += n;
+                    n_flops->output_q6k_f32    += n;
+                    break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_flops->output_iq2xxs_f32 += n;
+                    break;
+                case GGML_TYPE_Q5_0:
+                    n_flops->output_q50_f32    += n;
                     break;
                 case GGML_TYPE_Q8_0:
-                    n_flops->output_q80_f32 += n;
+                    n_flops->output_q80_f32    += n;
+                    break;
+                case GGML_TYPE_IQ1_S:
+                    n_flops->output_iq1s_f32   += n;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_flops->output_iq4nl_f32  += n;
+                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_flops->output_iq1m_f32   += n;
                     break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
@@ -21075,27 +21110,42 @@
             break;

         case PROFILER_LAYER_BACKEND:
-            switch (dtype) { 
+            switch (dtype) {
                 case GGML_TYPE_F32:
-                    n_flops->layer_f32_f32 += n;
+                    n_flops->layer_f32_f32    += n;
                     break;
                 case GGML_TYPE_F16:
-                    n_flops->layer_f16_f32 += n;
+                    n_flops->layer_f16_f32    += n;
+                    break;
+                case GGML_TYPE_Q2_K:
+                    n_flops->layer_q2k_f32    += n;
                     break;
                 case GGML_TYPE_Q4_K:
-                    n_flops->layer_q4k_f32 += n;
-                    break;
-                case GGML_TYPE_Q5_0:
-                    n_flops->layer_q50_f32 += n;
+                    n_flops->layer_q4k_f32    += n;
                     break;
                 case GGML_TYPE_Q5_K:
-                    n_flops->layer_q5k_f32 += n;
+                    n_flops->layer_q5k_f32    += n;
                     break;
                 case GGML_TYPE_Q6_K:
-                    n_flops->layer_q6k_f32 += n;
+                    n_flops->layer_q6k_f32    += n;
+                    break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_flops->layer_iq2xxs_f32 += n;
+                    break;
+                case GGML_TYPE_Q5_0:
+                    n_flops->layer_q50_f32    += n;
                     break;
                 case GGML_TYPE_Q8_0:
-                    n_flops->layer_q80_f32 += n;
+                    n_flops->layer_q80_f32    += n;
+                    break;
+                case GGML_TYPE_IQ1_S:
+                    n_flops->layer_iq1s_f32   += n;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_flops->layer_iq4nl_f32  += n;
+                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_flops->layer_iq1m_f32   += n;
                     break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
@@ -21113,25 +21163,40 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
         case PROFILER_LAYER_INPUT:
             switch (dtype) {
                 case GGML_TYPE_F32:
-                    n_params->input_f32 += n_i64t;
+                    n_params->input_f32    += n_i64t;
                     break;
                 case GGML_TYPE_F16:
-                    n_params->input_f16 += n_i64t;
+                    n_params->input_f16    += n_i64t;
+                    break;
+                case GGML_TYPE_Q2_K:
+                    n_params->input_q2k    += n_i64t;
                     break;
                 case GGML_TYPE_Q4_K:
-                    n_params->input_q4k += n_i64t;
-                    break;
-                case GGML_TYPE_Q5_0:
-                    n_params->input_q50 += n_i64t;
+                    n_params->input_q4k    += n_i64t;
                     break;
                 case GGML_TYPE_Q5_K:
-                    n_params->input_q5k += n_i64t;
+                    n_params->input_q5k    += n_i64t;
                     break;
                 case GGML_TYPE_Q6_K:
-                    n_params->input_q6k += n_i64t;
+                    n_params->input_q6k    += n_i64t;
+                    break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_params->input_iq2xxs += n_i64t;
+                    break;
+                case GGML_TYPE_Q5_0:
+                    n_params->input_q50    += n_i64t;
                     break;
                 case GGML_TYPE_Q8_0:
-                    n_params->input_q80 += n_i64t;
+                    n_params->input_q80    += n_i64t;
+                    break;
+                case GGML_TYPE_IQ1_S:
+                    n_params->input_iq1s   += n_i64t;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_params->input_iq4nl  += n_i64t;
+                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_params->input_iq1m   += n_i64t;
                     break;
                 default:
-                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
+                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_INPUT\n");
@@ -21141,25 +21206,40 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
         case PROFILER_LAYER_OUTPUT:
             switch (dtype) {
                 case GGML_TYPE_F32:
-                    n_params->output_f32 += n_i64t;
+                    n_params->output_f32    += n_i64t;
                     break;
                 case GGML_TYPE_F16:
-                    n_params->output_f16 += n_i64t;
+                    n_params->output_f16    += n_i64t;
+                    break;
+                case GGML_TYPE_Q2_K:
+                    n_params->output_q2k    += n_i64t;
                     break;
                 case GGML_TYPE_Q4_K:
-                    n_params->output_q4k += n_i64t;
-                    break;
-                case GGML_TYPE_Q5_0:
-                    n_params->output_q50 += n_i64t;
+                    n_params->output_q4k    += n_i64t;
                     break;
                 case GGML_TYPE_Q5_K:
-                    n_params->output_q5k += n_i64t;
+                    n_params->output_q5k    += n_i64t;
                     break;
                 case GGML_TYPE_Q6_K:
-                    n_params->output_q6k += n_i64t;
+                    n_params->output_q6k    += n_i64t;
+                    break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_params->output_iq2xxs += n_i64t;
+                    break;
+                case GGML_TYPE_Q5_0:
+                    n_params->output_q50    += n_i64t;
                     break;
                 case GGML_TYPE_Q8_0:
-                    n_params->output_q80 += n_i64t;
+                    n_params->output_q80    += n_i64t;
+                    break;
+                case GGML_TYPE_IQ1_S:
+                    n_params->output_iq1s   += n_i64t;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_params->output_iq4nl  += n_i64t;
+                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_params->output_iq1m   += n_i64t;
                     break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
@@ -21169,25 +21249,40 @@
         case PROFILER_LAYER_BACKEND:
             switch (dtype) {
                 case GGML_TYPE_F32:
-                    n_params->layer_f32 += n_i64t;
+                    n_params->layer_f32    += n_i64t;
                     break;
                 case GGML_TYPE_F16:
-                    n_params->layer_f16 += n_i64t;
+                    n_params->layer_f16    += n_i64t;
+                    break;
+                case GGML_TYPE_Q2_K:
+                    n_params->layer_q2k    += n_i64t;
                     break;
                 case GGML_TYPE_Q4_K:
-                    n_params->layer_q4k += n_i64t;
-                    break;
-                case GGML_TYPE_Q5_0:
-                    n_params->layer_q50 += n_i64t;
+                    n_params->layer_q4k    += n_i64t;
                     break;
                 case GGML_TYPE_Q5_K:
-                    n_params->layer_q5k += n_i64t;
+                    n_params->layer_q5k    += n_i64t;
                     break;
                 case GGML_TYPE_Q6_K:
-                    n_params->layer_q6k += n_i64t;
+                    n_params->layer_q6k    += n_i64t;
+                    break;
+                case GGML_TYPE_IQ2_XXS:
+                    n_params->layer_iq2xxs += n_i64t;
+                    break;
+                case GGML_TYPE_Q5_0:
+                    n_params->layer_q50    += n_i64t;
                     break;
                 case GGML_TYPE_Q8_0:
-                    n_params->layer_q80 += n_i64t;
+                    n_params->layer_q80    += n_i64t;
+                    break;
+                case GGML_TYPE_IQ1_S:
+                    n_params->layer_iq1s   += n_i64t;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_params->layer_iq4nl  += n_i64t;
+                    break;
+                case GGML_TYPE_IQ1_M:
+                    n_params->layer_iq1m   += n_i64t;
                     break;
                 default:
                     throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
@@ -21477,23 +21572,33 @@ void llama_model_n_flops(
     }

     // use average values instead of total values
-    n_flops->layer_f32_f32 = static_cast<int64_t>((double)n_flops->layer_f32_f32 / (double)n_layer);
-    n_flops->layer_f16_f32 = static_cast<int64_t>((double)n_flops->layer_f16_f32 / (double)n_layer);
-    n_flops->layer_q4k_f32 = static_cast<int64_t>((double)n_flops->layer_q4k_f32 / (double)n_layer);
-    n_flops->layer_q50_f32 = static_cast<int64_t>((double)n_flops->layer_q50_f32 / (double)n_layer);
-    n_flops->layer_q5k_f32 = static_cast<int64_t>((double)n_flops->layer_q5k_f32 / (double)n_layer);
-    n_flops->layer_q6k_f32 = static_cast<int64_t>((double)n_flops->layer_q6k_f32 / (double)n_layer);
-    n_flops->layer_q80_f32 = static_cast<int64_t>((double)n_flops->layer_q80_f32 / (double)n_layer);
-
-    n_params->layer_f32 = static_cast<int64_t>((double)n_params->layer_f32 / (double)n_layer);
-    n_params->layer_f16 = static_cast<int64_t>((double)n_params->layer_f16 / (double)n_layer);
-    n_params->layer_q4k = static_cast<int64_t>((double)n_params->layer_q4k / (double)n_layer);
-    n_params->layer_q50 = static_cast<int64_t>((double)n_params->layer_q50 / (double)n_layer);
-    n_params->layer_q5k = static_cast<int64_t>((double)n_params->layer_q5k / (double)n_layer);
-    n_params->layer_q6k = static_cast<int64_t>((double)n_params->layer_q6k / (double)n_layer);
-    n_params->layer_q80 = static_cast<int64_t>((double)n_params->layer_q80 / (double)n_layer);
-
-    n_bytes->nb_layer = static_cast<int64_t>((double)n_bytes->nb_layer / (double)n_layer);
+    n_flops->layer_f32_f32    = static_cast<int64_t>((double)n_flops->layer_f32_f32    / (double)n_layer);
+    n_flops->layer_f16_f32    = static_cast<int64_t>((double)n_flops->layer_f16_f32    / (double)n_layer);
+    n_flops->layer_q2k_f32    = static_cast<int64_t>((double)n_flops->layer_q2k_f32    / (double)n_layer);
+    n_flops->layer_q4k_f32    = static_cast<int64_t>((double)n_flops->layer_q4k_f32    / (double)n_layer);
+    n_flops->layer_q5k_f32    = static_cast<int64_t>((double)n_flops->layer_q5k_f32    / (double)n_layer);
+    n_flops->layer_q6k_f32    = static_cast<int64_t>((double)n_flops->layer_q6k_f32    / (double)n_layer);
+    n_flops->layer_iq2xxs_f32 = static_cast<int64_t>((double)n_flops->layer_iq2xxs_f32 / (double)n_layer);
+    n_flops->layer_q50_f32    = static_cast<int64_t>((double)n_flops->layer_q50_f32    / (double)n_layer);
+    n_flops->layer_q80_f32    = static_cast<int64_t>((double)n_flops->layer_q80_f32    / (double)n_layer);
+    n_flops->layer_iq1s_f32   = static_cast<int64_t>((double)n_flops->layer_iq1s_f32   / (double)n_layer);
+    n_flops->layer_iq4nl_f32  = static_cast<int64_t>((double)n_flops->layer_iq4nl_f32  / (double)n_layer);
+    n_flops->layer_iq1m_f32   = static_cast<int64_t>((double)n_flops->layer_iq1m_f32   / (double)n_layer);
+
+    n_params->layer_f32    = static_cast<int64_t>((double)n_params->layer_f32    / (double)n_layer);
+    n_params->layer_f16    = static_cast<int64_t>((double)n_params->layer_f16    / (double)n_layer);
+    n_params->layer_q2k    = static_cast<int64_t>((double)n_params->layer_q2k    / (double)n_layer);
+    n_params->layer_q4k    = static_cast<int64_t>((double)n_params->layer_q4k    / (double)n_layer);
+    n_params->layer_q5k    = static_cast<int64_t>((double)n_params->layer_q5k    / (double)n_layer);
+    n_params->layer_q6k    = static_cast<int64_t>((double)n_params->layer_q6k    / (double)n_layer);
+    n_params->layer_iq2xxs = static_cast<int64_t>((double)n_params->layer_iq2xxs / (double)n_layer);
+    n_params->layer_q50    = static_cast<int64_t>((double)n_params->layer_q50    / (double)n_layer);
+    n_params->layer_q80    = static_cast<int64_t>((double)n_params->layer_q80    / (double)n_layer);
+    n_params->layer_iq1s   = static_cast<int64_t>((double)n_params->layer_iq1s   / (double)n_layer);
+    n_params->layer_iq4nl  = static_cast<int64_t>((double)n_params->layer_iq4nl  / (double)n_layer);
+    n_params->layer_iq1m   = static_cast<int64_t>((double)n_params->layer_iq1m   / (double)n_layer);
+
+    n_bytes->nb_layer = static_cast<int64_t>((double)n_bytes->nb_layer / (double)n_layer);

     // reset ml, model, and clear contexts
     ml->n_created = 0;
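The per-layer averages above truncate toward zero, since static_cast<int64_t> drops the fractional part of the double division; with per-layer FLOP and parameter counts in the millions to billions, the bias is negligible. A round-to-nearest variant, shown as a sketch rather than the patch's behavior, would be:

    #include <cmath>
    #include <cstdint>

    // Round-to-nearest per-layer average (the patch truncates toward zero instead).
    static int64_t per_layer_avg(int64_t total, uint32_t n_layer) {
        return (int64_t) llround((double) total / (double) n_layer);
    }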