Merge branch 'dev'

Lizonghang 2025-05-14 14:19:53 +04:00
commit ebd09fc83c
12 changed files with 1140 additions and 546 deletions

.gitignore vendored
View file

@ -67,6 +67,7 @@ autogen-*.md
/main
/server
/profile-tool
# CI
@ -135,4 +136,8 @@ poetry.toml
/lora-tests
# Video
*.mp4
*.mp4
# fio
fio_test*
*.fio

View file

@ -1,5 +1,5 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = llama-cli
BUILD_TARGETS = llama-cli profile-tool
# BUILD_TARGETS = \
# libllava.a \
# llama-baby-llama \
@ -1528,6 +1528,11 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
profile-tool: tools/profile_tool.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
ifeq ($(UNAME_S),Darwin)
swift: examples/batched.swift
(cd examples/batched.swift; make build)

View file

@ -98,7 +98,7 @@ Here are the models we have tested so far. You can also try more on Hugging Face
- **DeepSeek R1-8B (Q4K, Q6K, Q80):** [deepseek-ai.DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Llama-8B-GGUF)
- **DeepSeek R1-14B (Q4K, Q6K, Q80):** [deepseek-ai.DeepSeek-R1-Distill-Qwen-14B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Qwen-14B-GGUF)
- **DeepSeek R1-32B (Q4K, Q6K, Q80):** [deepseek-ai.DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/DevQuasar/deepseek-ai.DeepSeek-R1-Distill-Qwen-32B-GGUF)
- **DeepSeek R1-70B (Q4K, Q6K, Q80):** [DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF)
- **DeepSeek R1-70B (Q4K, Q6K, Q80, [IQ1](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-70B-GGUF)):** [DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/unsloth/DeepSeek-R1-Distill-Llama-70B-GGUF)
## ⚙️ How to Use?
@ -287,7 +287,15 @@ By default, prima.cpp automatically profiles devices and assigns workloads. Howe
> Example: if `-lw "16,16,16,16"` is passed to the head device, then each of the 4 devices will handle 16 model layers. A worker with `-ngl 8` (if a GPU is available) will run 8/16 layers on the GPU.
**2. How to run in chat mode like in llama.cpp?**
**2. How to manually profile my device?**
If `-lw` is set, prima.cpp skips profiling and runs directly with the user-defined `-lw` and `-ngl`. If you wish to profile a device manually, run `profile-tool` on that device.
```shell
./profile-tool -m download/qwq-32b-q4_k_m.gguf
```
**3. How to run in chat mode like in llama.cpp?**
To enable chat (conversation) mode, simply add the `-cnv` flag on the head device:
@ -298,7 +306,7 @@ To enable chat (conversation) mode, simply add the `-cnv` flag on the head devic
To quit the chat mode, input `quit` or `exit`.
**3. How to force prefetching after computing?**
**4. How to force prefetching after computing?**
By default, prima.cpp only advises the OS to prefetch upcoming layer weights. The actual prefetching is then scheduled and handled by the OS, which may introduce some uncertainty. To explicitly trigger prefetching right after computing, you can use the `--force` flag on each device:
@ -309,11 +317,11 @@ By default, prima.cpp only advises the OS to prefetch upcoming layer weights. Th
This enables more aggressive overlap but also introduces extra memory access latency. Use `--force` only after testing, as its effect depends on your hardware and OS behavior.
**4. Does it support Windows?**
**5. Does it support Windows?**
Not yet—but it's on the roadmap. Currently, prima.cpp can run on Linux, macOS, Android and HarmonyOS (via Termux). You can mix heterogeneous devices in the cluster.
**5. Does it support Vulkan or AMD GPUs?**
**6. Does it support Vulkan or AMD GPUs?**
Not yet. For now, prima.cpp supports only CUDA-based GPUs. Vulkan is on our roadmap, and AMD GPUs will be supported once we have such a device.

View file

@ -901,13 +901,19 @@ static bool assign_layers_to_device(
float t_read_ram_cpu = 0.0f;
float t_calc_cpu = (
master.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
master.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
master.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
master.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
master.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
master.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
master.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
master.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
master.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
master.model_flops.layer_q2k_f32 / (dev.cpu_props.flops_q2k_f32 * 1e9 + EPS) +
master.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
master.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
master.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
master.model_flops.layer_iq2xxs_f32/ (dev.cpu_props.flops_iq2xxs_f32* 1e9 + EPS) +
master.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
master.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS) +
master.model_flops.layer_iq1s_f32 / (dev.cpu_props.flops_iq1s_f32 * 1e9 + EPS) +
master.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS) +
master.model_flops.layer_iq1m_f32 / (dev.cpu_props.flops_iq1m_f32 * 1e9 + EPS) ) * 1000; // in ms
float t_kv_cpy_cpu = dev.memory.mem_cpy_delay; // in ms
// t_read_ram_cpu = b_prime / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms
@ -921,24 +927,36 @@ static bool assign_layers_to_device(
if (dev.gpu_support.metal) {
t_calc_gpu = (
master.model_flops.layer_f32_f32 / (dev.gpu_props.metal_flops_f32_f32 * 1e9 + EPS) +
master.model_flops.layer_f16_f32 / (dev.gpu_props.metal_flops_f16_f32 * 1e9 + EPS) +
master.model_flops.layer_q4k_f32 / (dev.gpu_props.metal_flops_q4k_f32 * 1e9 + EPS) +
master.model_flops.layer_q50_f32 / (dev.gpu_props.metal_flops_q50_f32 * 1e9 + EPS) +
master.model_flops.layer_q5k_f32 / (dev.gpu_props.metal_flops_q5k_f32 * 1e9 + EPS) +
master.model_flops.layer_q6k_f32 / (dev.gpu_props.metal_flops_q6k_f32 * 1e9 + EPS) +
master.model_flops.layer_q80_f32 / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
master.model_flops.layer_f32_f32 / (dev.gpu_props.metal_flops_f32_f32 * 1e9 + EPS) +
master.model_flops.layer_f16_f32 / (dev.gpu_props.metal_flops_f16_f32 * 1e9 + EPS) +
master.model_flops.layer_q2k_f32 / (dev.gpu_props.metal_flops_q2k_f32 * 1e9 + EPS) +
master.model_flops.layer_q4k_f32 / (dev.gpu_props.metal_flops_q4k_f32 * 1e9 + EPS) +
master.model_flops.layer_q5k_f32 / (dev.gpu_props.metal_flops_q5k_f32 * 1e9 + EPS) +
master.model_flops.layer_q6k_f32 / (dev.gpu_props.metal_flops_q6k_f32 * 1e9 + EPS) +
master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.metal_flops_iq2xxs_f32 * 1e9 + EPS) +
master.model_flops.layer_q50_f32 / (dev.gpu_props.metal_flops_q50_f32 * 1e9 + EPS) +
master.model_flops.layer_q80_f32 / (dev.gpu_props.metal_flops_q80_f32 * 1e9 + EPS) +
master.model_flops.layer_iq1s_f32 / (dev.gpu_props.metal_flops_iq1s_f32 * 1e9 + EPS) +
master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.metal_flops_iq4nl_f32 * 1e9 + EPS) +
master.model_flops.layer_iq1m_f32 / (dev.gpu_props.metal_flops_iq1m_f32 * 1e9 + EPS) ) * 1000; // in ms
t_kv_cpy_gpu = dev.gpu_props.metal_mem_cpy_delay; // in ms
// t_read_ram_gpu = b_prime / (dev.gpu_props.metal_read_vram_bw * 1e9) * 1000; // in ms
} else {
t_calc_gpu = (
master.model_flops.layer_f32_f32 / (dev.gpu_props.cuda_flops_f32_f32 * 1e9 + EPS) +
master.model_flops.layer_f16_f32 / (dev.gpu_props.cuda_flops_f16_f32 * 1e9 + EPS) +
master.model_flops.layer_q4k_f32 / (dev.gpu_props.cuda_flops_q4k_f32 * 1e9 + EPS) +
master.model_flops.layer_q50_f32 / (dev.gpu_props.cuda_flops_q50_f32 * 1e9 + EPS) +
master.model_flops.layer_q5k_f32 / (dev.gpu_props.cuda_flops_q5k_f32 * 1e9 + EPS) +
master.model_flops.layer_q6k_f32 / (dev.gpu_props.cuda_flops_q6k_f32 * 1e9 + EPS) +
master.model_flops.layer_q80_f32 / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
master.model_flops.layer_f32_f32 / (dev.gpu_props.cuda_flops_f32_f32 * 1e9 + EPS) +
master.model_flops.layer_f16_f32 / (dev.gpu_props.cuda_flops_f16_f32 * 1e9 + EPS) +
master.model_flops.layer_q2k_f32 / (dev.gpu_props.cuda_flops_q2k_f32 * 1e9 + EPS) +
master.model_flops.layer_q4k_f32 / (dev.gpu_props.cuda_flops_q4k_f32 * 1e9 + EPS) +
master.model_flops.layer_q5k_f32 / (dev.gpu_props.cuda_flops_q5k_f32 * 1e9 + EPS) +
master.model_flops.layer_q6k_f32 / (dev.gpu_props.cuda_flops_q6k_f32 * 1e9 + EPS) +
master.model_flops.layer_iq2xxs_f32 / (dev.gpu_props.cuda_flops_iq2xxs_f32 * 1e9 + EPS) +
master.model_flops.layer_q50_f32 / (dev.gpu_props.cuda_flops_q50_f32 * 1e9 + EPS) +
master.model_flops.layer_q80_f32 / (dev.gpu_props.cuda_flops_q80_f32 * 1e9 + EPS) +
master.model_flops.layer_iq1s_f32 / (dev.gpu_props.cuda_flops_iq1s_f32 * 1e9 + EPS) +
master.model_flops.layer_iq4nl_f32 / (dev.gpu_props.cuda_flops_iq4nl_f32 * 1e9 + EPS) +
master.model_flops.layer_iq1m_f32 / (dev.gpu_props.cuda_flops_iq1m_f32 * 1e9 + EPS) ) * 1000; // in ms
t_kv_cpy_gpu = dev.gpu_props.cuda_mem_cpy_delay; // in ms
// t_read_ram_gpu = b_prime / (dev.gpu_props.cuda_read_vram_bw * 1e9) * 1000; // in ms
}
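
The pattern repeated in these hunks (and in the kappa term in the next hunk) estimates a layer's compute time by summing, over every quantization type present in the model, that type's per-layer FLOPs divided by the device's profiled throughput for it; the new Q2_K and IQ-family entries simply extend the sum. A minimal standalone sketch of the idea, using illustrative names rather than the project's structs:

```cpp
#include <cstdio>

// Illustrative sketch of the cost pattern above (not the project's API): a layer's compute
// time is the sum over dtypes of FLOPs / throughput, with a small epsilon guarding against
// unprofiled (zero) throughput entries, converted to milliseconds at the end.
struct dtype_cost {
    double flops;   // per-layer FLOPs of this dtype (e.g. master.model_flops.layer_q4k_f32)
    double gflops;  // profiled device throughput for this dtype (e.g. dev.cpu_props.flops_q4k_f32)
};

static double layer_latency_ms(const dtype_cost * costs, int n, double eps = 1e-10) {
    double t = 0.0;
    for (int i = 0; i < n; ++i) {
        t += costs[i].flops / (costs[i].gflops * 1e9 + eps);  // seconds spent on this dtype
    }
    return t * 1000.0;  // same "* 1000" conversion to ms as in the code above
}

int main() {
    // e.g. a layer dominated by Q4_K matmuls plus a little F32 work
    const dtype_cost costs[] = { { 5.0e9, 120.0 }, { 2.0e8, 35.0 } };
    printf("estimated layer latency: %.3f ms\n", layer_latency_ms(costs, 2));
    return 0;
}
```
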
@ -1113,14 +1131,18 @@ static bool assign_layers_to_device(
if (m == 0) {
kappa = (
dev.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
dev.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
dev.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
dev.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
dev.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
dev.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
dev.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS)) * 1000; // in ms
dev.model_flops.layer_f32_f32 / (dev.cpu_props.flops_f32_f32 * 1e9 + EPS) +
dev.model_flops.layer_f16_f32 / (dev.cpu_props.flops_f16_f32 * 1e9 + EPS) +
dev.model_flops.layer_q2k_f32 / (dev.cpu_props.flops_q2k_f32 * 1e9 + EPS) +
dev.model_flops.layer_q4k_f32 / (dev.cpu_props.flops_q4k_f32 * 1e9 + EPS) +
dev.model_flops.layer_q5k_f32 / (dev.cpu_props.flops_q5k_f32 * 1e9 + EPS) +
dev.model_flops.layer_q6k_f32 / (dev.cpu_props.flops_q6k_f32 * 1e9 + EPS) +
dev.model_flops.layer_iq2xxs_f32 / (dev.cpu_props.flops_iq2xxs_f32 * 1e9 + EPS) +
dev.model_flops.layer_q50_f32 / (dev.cpu_props.flops_q50_f32 * 1e9 + EPS) +
dev.model_flops.layer_q80_f32 / (dev.cpu_props.flops_q80_f32 * 1e9 + EPS) +
dev.model_flops.layer_iq1s_f32 / (dev.cpu_props.flops_iq1s_f32 * 1e9 + EPS) +
dev.model_flops.layer_iq4nl_f32 / (dev.cpu_props.flops_iq4nl_f32 * 1e9 + EPS) +
dev.model_flops.layer_iq1m_f32 / (dev.cpu_props.flops_iq1m_f32 * 1e9 + EPS) ) * 1000; // in ms
// kappa += (bi / n_vocab + bo) / (dev.memory.cpu_read_ram_bw * 1e9) * 1000; // in ms
kappa += (bi / n_vocab) / (disk_speed[m] * 1e9) * 1000; // in ms
@ -1505,6 +1527,12 @@ static bool assign_layers_to_device(
//
struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
#if !(defined(GGML_USE_METAL) || defined(GGML_USE_CUDA))
// reset n_gpu_layers to 0 if GPU is not used
params.n_gpu_layers = 0;
#endif
llama_init_result iparams;
auto mparams = llama_model_params_from_gpt_params(params);
@ -1554,19 +1582,13 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
uint32_t my_rank = params.rank;
bool auto_schedule = params.n_layer_window[0] == 0;
// get device profile
LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
dev_info.rank = params.rank;
if (n_world > 1) {
llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
}
// create llama context
struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
llama_context * lctx = llama_new_context_with_model(model, cparams);
if (n_world == 1) {
uint32_t n_layers = llama_model_n_layers(model);
// assign all layers to this device
params.n_layer_window[0] = n_layers;
cparams.n_layer_window[0] = n_layers;
mparams.n_layer_window[0] = n_layers;
@ -1577,16 +1599,34 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
// initialize sockets
llama_init_sockets(lctx, n_world, my_rank);
// broadcast startup args
struct startup_args args;
if (my_rank == 0){
args.should_profile = auto_schedule;
}
llama_bcast_startup_args(lctx, my_rank, &args);
auto_schedule = args.should_profile;
// if n_world > 1 and auto schedule is needed, then profile
if (auto_schedule){
// get device profile
LOG_INF("\nstart profiling this device, this may take some seconds ...\n");
dev_info.rank = params.rank;
if (n_world > 1) {
llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
}
}
// synchronize device profile to the master node
struct device_info * dev_info_set = nullptr;
if (my_rank == 0) {
dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
dev_info_set[0] = dev_info;
llama_gather_device_info(lctx, dev_info_set);
device_print_props(dev_info_set, n_world, model, cparams);
if (auto_schedule) {
struct device_info * dev_info_set = nullptr;
dev_info_set = (struct device_info *)malloc(n_world * sizeof(struct device_info));
dev_info_set[0] = dev_info;
llama_gather_device_info(lctx, dev_info_set);
device_print_props(dev_info_set, n_world, model, cparams);
// automatically determine n_layer_window and n_gpu_layers
if (!assign_layers_to_device(n_world, my_rank, dev_info_set, n_layer_window, n_gpu_layers, model, cparams)) {
LOG_ERR("%s: Invalid allocation by HiGHS solver\n", __func__);
@ -1601,7 +1641,9 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
llama_bcast_layer_setup(lctx, n_layer_window, nullptr);
}
} else {
llama_send_device_info(lctx, &dev_info);
if (auto_schedule){
llama_send_device_info(lctx, &dev_info);
}
llama_recv_layer_setup(lctx, n_layer_window, n_gpu_layers);
}
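
Taken together, the hunks above rearrange startup so that profiling happens only when it is actually needed: the head sets should_profile only if no user-defined layer window was given, broadcasts that flag, and only then does each rank profile, report, and receive its layer assignment. A condensed, commented outline of the resulting flow (illustrative only; error handling and the exact rank-0 branch structure are simplified):

```cpp
#include <cstdint>

// Outline of the reworked startup sequence in llama_init_from_gpt_params (not the literal code).
void startup_outline(uint32_t n_world, uint32_t my_rank, bool user_gave_lw) {
    bool auto_schedule = !user_gave_lw;          // head: profile only if -lw was not provided

    // llama_bcast_startup_args(): rank 0 sends should_profile, the other ranks receive it
    // and forward it downstream, so every rank ends up with the same auto_schedule value.

    if (auto_schedule && n_world > 1) {
        // llama_profile_device(): measure FLOPS / bandwidth for the dtypes the model uses
    }

    if (my_rank == 0) {
        if (auto_schedule) {
            // llama_gather_device_info() from all ranks, then assign_layers_to_device()
            // (HiGHS solver) to pick n_layer_window / n_gpu_layers per device
        }
        // llama_bcast_layer_setup(): distribute the chosen (or user-provided) window
    } else {
        if (auto_schedule) {
            // llama_send_device_info() to the head
        }
        // llama_recv_layer_setup(): wait for the window decided by the head
    }
}
```
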
@ -1766,33 +1808,25 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
return mparams;
}
static ggml_type kv_cache_type_from_str(const std::string & s) {
if (s == "f32") {
return GGML_TYPE_F32;
}
if (s == "f16") {
return GGML_TYPE_F16;
}
if (s == "q8_0") {
return GGML_TYPE_Q8_0;
}
if (s == "q4_0") {
return GGML_TYPE_Q4_0;
}
if (s == "q4_1") {
return GGML_TYPE_Q4_1;
}
if (s == "iq4_nl") {
return GGML_TYPE_IQ4_NL;
}
if (s == "q5_0") {
return GGML_TYPE_Q5_0;
}
if (s == "q5_1") {
return GGML_TYPE_Q5_1;
}
const std::vector<ggml_type> kv_cache_types = {
GGML_TYPE_F32,
GGML_TYPE_F16,
GGML_TYPE_BF16, // Added BF16 data type support
GGML_TYPE_Q8_0,
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1,
GGML_TYPE_IQ4_NL,
GGML_TYPE_Q5_0,
GGML_TYPE_Q5_1,
};
throw std::runtime_error("Invalid cache type: " + s);
static ggml_type kv_cache_type_from_str(const std::string & s) {
for (const auto & type : kv_cache_types) {
if (ggml_type_name(type) == s) {
return type;
}
}
throw std::runtime_error("Unsupported cache type: " + s);
}
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {

File diff suppressed because it is too large

View file

@ -15,25 +15,36 @@ struct cpu_props {
const char * name;
const char * description;
uint32_t cores;
float flops_f32_f32; // in GFLOPS
float flops_f16_f32; // in GFLOPS
float flops_q4k_f32; // in GFLOPS
float flops_q50_f32; // in GFLOPS
float flops_q5k_f32; // in GFLOPS
float flops_q6k_f32; // in GFLOPS
float flops_q80_f32; // in GFLOPS
float flops_f32_f32; // in GFLOPS
float flops_f16_f32; // in GFLOPS
float flops_q2k_f32; // in GFLOPS
float flops_q4k_f32; // in GFLOPS
float flops_q5k_f32; // in GFLOPS
float flops_q6k_f32; // in GFLOPS
float flops_iq2xxs_f32; // in GFLOPS
float flops_q50_f32; // in GFLOPS
float flops_q80_f32; // in GFLOPS
float flops_iq1s_f32; // in GFLOPS
float flops_iq4nl_f32; // in GFLOPS
float flops_iq1m_f32; // in GFLOPS
cpu_props() :
name(""),
description(""),
cores(0),
flops_f32_f32(0.0f),
flops_f16_f32(0.0f),
flops_q4k_f32(0.0f),
flops_q50_f32(0.0f),
flops_q5k_f32(0.0f),
flops_q6k_f32(0.0f),
flops_q80_f32(0.0f) {}
cpu_props()
: name (""),
description (""),
cores (0),
flops_f32_f32 (0.0f),
flops_f16_f32 (0.0f),
flops_q2k_f32 (0.0f),
flops_q4k_f32 (0.0f),
flops_q5k_f32 (0.0f),
flops_q6k_f32 (0.0f),
flops_iq2xxs_f32(0.0f),
flops_q50_f32 (0.0f),
flops_q80_f32 (0.0f),
flops_iq1s_f32 (0.0f),
flops_iq4nl_f32 (0.0f),
flops_iq1m_f32 (0.0f)
{}
};
struct memory_info {
@ -77,132 +88,204 @@ struct gpu_support {
struct gpu_props {
const char * name;
const char * description;
float memory_free; // in GiB
float memory_total; // in GiB
float metal_read_vram_bw; // in GB/s
float metal_flops_f32_f32; // in GFLOPS
float metal_flops_f16_f32; // in GFLOPS
float metal_flops_q4k_f32; // in GFLOPS
float metal_flops_q50_f32; // in GFLOPS
float metal_flops_q5k_f32; // in GFLOPS
float metal_flops_q6k_f32; // in GFLOPS
float metal_flops_q80_f32; // in GFLOPS
float metal_mem_cpy_delay; // in ms
float cuda_read_vram_bw; // in GB/s
float cuda_flops_f32_f32; // in GFLOPS
float cuda_flops_f16_f32; // in GFLOPS
float cuda_flops_q4k_f32; // in GFLOPS
float cuda_flops_q50_f32; // in GFLOPS
float cuda_flops_q5k_f32; // in GFLOPS
float cuda_flops_q6k_f32; // in GFLOPS
float cuda_flops_q80_f32; // in GFLOPS
float cuda_mem_cpy_delay; // in ms
float memory_free; // in GiB
float memory_total; // in GiB
float metal_read_vram_bw; // in GB/s
float metal_flops_f32_f32; // in GFLOPS
float metal_flops_f16_f32; // in GFLOPS
float metal_flops_q2k_f32; // in GFLOPS
float metal_flops_q4k_f32; // in GFLOPS
float metal_flops_q5k_f32; // in GFLOPS
float metal_flops_q6k_f32; // in GFLOPS
float metal_flops_iq2xxs_f32; // in GFLOPS
float metal_flops_q50_f32; // in GFLOPS
float metal_flops_q80_f32; // in GFLOPS
float metal_flops_iq1s_f32; // in GFLOPS
float metal_flops_iq4nl_f32; // in GFLOPS
float metal_flops_iq1m_f32; // in GFLOPS
float metal_mem_cpy_delay; // in ms
float cuda_read_vram_bw; // in GB/s
float cuda_flops_f32_f32; // in GFLOPS
float cuda_flops_f16_f32; // in GFLOPS
float cuda_flops_q2k_f32; // in GFLOPS
float cuda_flops_q4k_f32; // in GFLOPS
float cuda_flops_q5k_f32; // in GFLOPS
float cuda_flops_q6k_f32; // in GFLOPS
float cuda_flops_iq2xxs_f32; // in GFLOPS
float cuda_flops_q50_f32; // in GFLOPS
float cuda_flops_q80_f32; // in GFLOPS
float cuda_flops_iq1s_f32; // in GFLOPS
float cuda_flops_iq4nl_f32; // in GFLOPS
float cuda_flops_iq1m_f32; // in GFLOPS
float cuda_mem_cpy_delay; // in ms
gpu_props() :
name(""),
description(""),
memory_free (0.0f),
memory_total (0.0f),
metal_read_vram_bw (0.0f),
metal_flops_f32_f32(0.0f),
metal_flops_f16_f32(0.0f),
metal_flops_q4k_f32(0.0f),
metal_flops_q50_f32(0.0f),
metal_flops_q5k_f32(0.0f),
metal_flops_q6k_f32(0.0f),
metal_flops_q80_f32(0.0f),
metal_mem_cpy_delay(0.0f),
cuda_read_vram_bw (0.0f),
cuda_flops_f32_f32 (0.0f),
cuda_flops_f16_f32 (0.0f),
cuda_flops_q4k_f32 (0.0f),
cuda_flops_q50_f32 (0.0f),
cuda_flops_q5k_f32 (0.0f),
cuda_flops_q6k_f32 (0.0f),
cuda_flops_q80_f32 (0.0f),
cuda_mem_cpy_delay (0.0f) {}
name (""),
description (""),
memory_free (0.0f),
memory_total (0.0f),
metal_read_vram_bw (0.0f),
metal_flops_f32_f32 (0.0f),
metal_flops_f16_f32 (0.0f),
metal_flops_q2k_f32 (0.0f),
metal_flops_q4k_f32 (0.0f),
metal_flops_q5k_f32 (0.0f),
metal_flops_q6k_f32 (0.0f),
metal_flops_iq2xxs_f32 (0.0f),
metal_flops_q50_f32 (0.0f),
metal_flops_q80_f32 (0.0f),
metal_flops_iq1s_f32 (0.0f),
metal_flops_iq4nl_f32 (0.0f),
metal_flops_iq1m_f32 (0.0f),
metal_mem_cpy_delay (0.0f),
cuda_read_vram_bw (0.0f),
cuda_flops_f32_f32 (0.0f),
cuda_flops_f16_f32 (0.0f),
cuda_flops_q2k_f32 (0.0f),
cuda_flops_q4k_f32 (0.0f),
cuda_flops_q5k_f32 (0.0f),
cuda_flops_q6k_f32 (0.0f),
cuda_flops_iq2xxs_f32 (0.0f),
cuda_flops_q50_f32 (0.0f),
cuda_flops_q80_f32 (0.0f),
cuda_flops_iq1s_f32 (0.0f),
cuda_flops_iq4nl_f32 (0.0f),
cuda_flops_iq1m_f32 (0.0f),
cuda_mem_cpy_delay (0.0f) {}
};
struct model_flops {
float inp_embd_ms;
int64_t output_f32_f32;
int64_t output_f16_f32;
int64_t output_q2k_f32;
int64_t output_q4k_f32;
int64_t output_q50_f32;
int64_t output_q5k_f32;
int64_t output_q6k_f32;
int64_t output_iq2xxs_f32;
int64_t output_q50_f32;
int64_t output_q80_f32;
int64_t output_iq1s_f32;
int64_t output_iq4nl_f32;
int64_t output_iq1m_f32;
int64_t layer_f32_f32;
int64_t layer_f16_f32;
int64_t layer_q2k_f32;
int64_t layer_q4k_f32;
int64_t layer_q50_f32;
int64_t layer_q5k_f32;
int64_t layer_q6k_f32;
int64_t layer_iq2xxs_f32;
int64_t layer_q50_f32;
int64_t layer_q80_f32;
int64_t layer_iq1s_f32;
int64_t layer_iq4nl_f32;
int64_t layer_iq1m_f32;
model_flops() :
inp_embd_ms(0.0f),
output_f32_f32(0),
output_f16_f32(0),
output_q4k_f32(0),
output_q50_f32(0),
output_q5k_f32(0),
output_q6k_f32(0),
output_q80_f32(0),
layer_f32_f32 (0),
layer_f16_f32 (0),
layer_q4k_f32 (0),
layer_q50_f32 (0),
layer_q5k_f32 (0),
layer_q6k_f32 (0),
layer_q80_f32 (0) {}
inp_embd_ms (0.0f),
output_f32_f32 (0),
output_f16_f32 (0),
output_q2k_f32 (0),
output_q4k_f32 (0),
output_q5k_f32 (0),
output_q6k_f32 (0),
output_iq2xxs_f32 (0),
output_q50_f32 (0),
output_q80_f32 (0),
output_iq1s_f32 (0),
output_iq4nl_f32 (0),
output_iq1m_f32 (0),
layer_f32_f32 (0),
layer_f16_f32 (0),
layer_q2k_f32 (0),
layer_q4k_f32 (0),
layer_q5k_f32 (0),
layer_q6k_f32 (0),
layer_iq2xxs_f32 (0),
layer_q50_f32 (0),
layer_q80_f32 (0),
layer_iq1s_f32 (0),
layer_iq4nl_f32 (0),
layer_iq1m_f32 (0)
{}
};
struct model_params {
int64_t input_f32;
int64_t input_f16;
int64_t input_q2k;
int64_t input_q4k;
int64_t input_q50;
int64_t input_q5k;
int64_t input_q6k;
int64_t input_iq2xxs;
int64_t input_q50;
int64_t input_q80;
int64_t input_iq1s;
int64_t input_iq4nl;
int64_t input_iq1m;
int64_t output_f32;
int64_t output_f16;
int64_t output_q2k;
int64_t output_q4k;
int64_t output_q50;
int64_t output_q5k;
int64_t output_q6k;
int64_t output_iq2xxs;
int64_t output_q50;
int64_t output_q80;
int64_t output_iq1s;
int64_t output_iq4nl;
int64_t output_iq1m;
int64_t layer_f32;
int64_t layer_f16;
int64_t layer_q2k;
int64_t layer_q4k;
int64_t layer_q50;
int64_t layer_q5k;
int64_t layer_q6k;
int64_t layer_iq2xxs;
int64_t layer_q50;
int64_t layer_q80;
int64_t layer_iq1s;
int64_t layer_iq4nl;
int64_t layer_iq1m;
model_params() :
input_f32 (0),
input_f16 (0),
input_q4k (0),
input_q50 (0),
input_q5k (0),
input_q6k (0),
input_q80 (0),
output_f32(0),
output_f16(0),
output_q4k(0),
output_q50(0),
output_q5k(0),
output_q6k(0),
output_q80(0),
layer_f32 (0),
layer_f16 (0),
layer_q4k (0),
layer_q50 (0),
layer_q5k (0),
layer_q6k (0),
layer_q80 (0) {}
input_f32 (0),
input_f16 (0),
input_q2k (0),
input_q4k (0),
input_q5k (0),
input_q6k (0),
input_iq2xxs (0),
input_q50 (0),
input_q80 (0),
input_iq1s (0),
input_iq4nl (0),
input_iq1m (0),
output_f32 (0),
output_f16 (0),
output_q2k (0),
output_q4k (0),
output_q5k (0),
output_q6k (0),
output_iq2xxs (0),
output_q50 (0),
output_q80 (0),
output_iq1s (0),
output_iq4nl (0),
output_iq1m (0),
layer_f32 (0),
layer_f16 (0),
layer_q2k (0),
layer_q4k (0),
layer_q5k (0),
layer_q6k (0),
layer_iq2xxs (0),
layer_q50 (0),
layer_q80 (0),
layer_iq1s (0),
layer_iq4nl (0),
layer_iq1m (0)
{}
};
struct model_bytes {
@ -229,6 +312,10 @@ struct disk_props {
write_rnd_bw(0.0f) {}
};
struct startup_args{
bool should_profile;
};
struct device_info {
uint32_t rank;
const char * device_name;

View file

@ -385,12 +385,12 @@ extern "C" {
GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29,
GGML_TYPE_BF16 = 30,
GGML_TYPE_Q4_0_4_4 = 31,
GGML_TYPE_Q4_0_4_8 = 32,
GGML_TYPE_Q4_0_8_8 = 33,
// GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
// GGML_TYPE_Q4_0_4_8 = 32,
// GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_TQ1_0 = 34,
GGML_TYPE_TQ2_0 = 35,
GGML_TYPE_COUNT,
GGML_TYPE_COUNT = 39,
};
// precision
@ -431,9 +431,6 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
};
// available tensor operations:

View file

@ -15725,15 +15725,6 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
{
VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
} break;
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
{
VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4);
} break;
case GGML_TYPE_Q4_0_8_8:
{
VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
} break;
case GGML_TYPE_I8:
case GGML_TYPE_I16:

View file

@ -1076,54 +1076,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_BF16,
.nrows = 1,
},
[GGML_TYPE_Q4_0_4_4] = {
.type_name = "q4_0_4x4",
.blck_size = QK4_0,
.blck_size_interleave = 4,
.type_size = sizeof(block_q4_0),
.is_quantized = true,
.to_float = NULL,
.from_float = NULL,
.from_float_ref = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 4,
.gemv = ggml_gemv_q4_0_4x4_q8_0,
.gemm = ggml_gemm_q4_0_4x4_q8_0,
},
[GGML_TYPE_Q4_0_4_8] = {
.type_name = "q4_0_4x8",
.blck_size = QK4_0,
.blck_size_interleave = 8,
.type_size = sizeof(block_q4_0),
.is_quantized = true,
.to_float = NULL,
.from_float = NULL,
.from_float_ref = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 4,
.gemv = ggml_gemv_q4_0_4x8_q8_0,
.gemm = ggml_gemm_q4_0_4x8_q8_0,
},
[GGML_TYPE_Q4_0_8_8] = {
.type_name = "q4_0_8x8",
.blck_size = QK4_0,
.blck_size_interleave = 8,
.type_size = sizeof(block_q4_0),
.is_quantized = true,
.to_float = NULL,
.from_float = NULL,
.from_float_ref = NULL,
.vec_dot = NULL,
.vec_dot_type = GGML_TYPE_Q8_0,
.nrows = 1,
.ncols = 8,
.gemv = ggml_gemv_q4_0_8x8_q8_0,
.gemm = ggml_gemm_q4_0_8x8_q8_0,
},
[GGML_TYPE_TQ1_0] = {
.type_name = "tq1_0",
.blck_size = QK_K,
@ -3472,7 +3424,7 @@ size_t ggml_row_size(enum ggml_type type, int64_t ne) {
double ggml_type_sizef(enum ggml_type type) {
return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
}
const char * ggml_type_name(enum ggml_type type) {
return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE";
}
@ -3578,9 +3530,6 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break;
case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break;
case GGML_FTYPE_MOSTLY_IQ2_S: wtype = GGML_TYPE_IQ2_S; break;
case GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = GGML_TYPE_Q4_0_4_4; break;
case GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = GGML_TYPE_Q4_0_4_8; break;
case GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = GGML_TYPE_Q4_0_8_8; break;
case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
}
@ -4107,7 +4056,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
/*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
/*.name =*/ { 0 },
/*.extra =*/ NULL,
///*.padding =*/ { 0 },
// /*.padding =*/ { 0 },
};
#ifdef __clang__
@ -9517,9 +9466,6 @@ static void ggml_compute_forward_add(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
{
ggml_compute_forward_add_q_f32(params, dst);
} break;
@ -9897,9 +9843,6 @@ static void ggml_compute_forward_add1(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
{
ggml_compute_forward_add1_q_f32(params, dst);
} break;
@ -10027,9 +9970,6 @@ static void ggml_compute_forward_acc(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
default:
{
GGML_ABORT("fatal error");
@ -13093,9 +13033,6 @@ static void ggml_compute_forward_out_prod(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
{
ggml_compute_forward_out_prod_q_f32(params, dst);
} break;
@ -13283,9 +13220,6 @@ static void ggml_compute_forward_set(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
default:
{
GGML_ABORT("fatal error");
@ -13547,9 +13481,6 @@ static void ggml_compute_forward_get_rows(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
{
ggml_compute_forward_get_rows_q(params, dst);
} break;
@ -14139,9 +14070,6 @@ static void ggml_compute_forward_clamp(
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q8_K:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
@ -21941,9 +21869,6 @@ size_t ggml_quantize_chunk(
case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
case GGML_TYPE_F16:
{
size_t elemsize = sizeof(ggml_fp16_t);

View file

@ -165,18 +165,18 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ3_XS = 22, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors, 1 bit quantization
LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ3_S = 26, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors, 1 bit quantization
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
// LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack
// LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack
// LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
@ -453,6 +453,7 @@ extern "C" {
LLAMA_API void llama_free_sockets (struct llama_context * ctx, char ** msg);
LLAMA_API int llama_gather_device_info(struct llama_context * ctx, struct device_info * dev_info_set);
LLAMA_API int llama_send_device_info (struct llama_context * ctx, struct device_info * dev_info);
LLAMA_API int llama_bcast_startup_args(struct llama_context * ctx, uint32_t rank, struct startup_args * args);
LLAMA_API int llama_bcast_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);
LLAMA_API int llama_recv_layer_setup (struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers);

View file

@ -3560,16 +3560,26 @@ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype)
case GGML_TYPE_F32:
case GGML_TYPE_F16:
return true;
case GGML_TYPE_Q2_K:
return n_params->layer_q2k > 0 || n_params->output_q2k > 0;
case GGML_TYPE_Q4_K:
return n_params->layer_q4k > 0 || n_params->output_q4k > 0;
case GGML_TYPE_Q5_0:
return n_params->layer_q50 > 0 || n_params->output_q50 > 0;
return n_params->layer_q4k > 0 || n_params->output_q4k > 0;
case GGML_TYPE_Q5_K:
return n_params->layer_q5k > 0 || n_params->output_q5k > 0;
return n_params->layer_q5k > 0 || n_params->output_q5k > 0;
case GGML_TYPE_Q6_K:
return n_params->layer_q6k > 0 || n_params->output_q6k > 0;
return n_params->layer_q6k > 0 || n_params->output_q6k > 0;
case GGML_TYPE_IQ2_XXS:
return n_params->layer_iq2xxs > 0 || n_params->output_iq2xxs > 0;
case GGML_TYPE_Q5_0:
return n_params->layer_q50 > 0 || n_params->output_q50 > 0;
case GGML_TYPE_Q8_0:
return n_params->layer_q80 > 0 || n_params->output_q80 > 0;
return n_params->layer_q80 > 0 || n_params->output_q80 > 0;
case GGML_TYPE_IQ1_S:
return n_params->layer_iq1s > 0 || n_params->output_iq1s > 0;
case GGML_TYPE_IQ4_NL:
return n_params->layer_iq4nl > 0 || n_params->output_iq4nl > 0;
case GGML_TYPE_IQ1_M:
return n_params->layer_iq1m > 0 || n_params->output_iq1m > 0;
default:
throw std::runtime_error("Unrecognized data type\n");
}
@ -3650,18 +3660,18 @@ void llama_profile_device(
dev_info->gpu_props.cuda_flops_f16_f32 = device_cuda_flops (model, GGML_TYPE_F16, GGML_TYPE_F32);
}
if (is_dtype_exist(n_params, GGML_TYPE_Q2_K)) {
dev_info->cpu_props.flops_q2k_f32 = device_cpu_flops (model, GGML_TYPE_Q2_K, GGML_TYPE_F32, n_threads);
dev_info->gpu_props.metal_flops_q2k_f32 = device_metal_flops(model, GGML_TYPE_Q2_K, GGML_TYPE_F32);
dev_info->gpu_props.cuda_flops_q2k_f32 = device_cuda_flops (model, GGML_TYPE_Q2_K, GGML_TYPE_F32);
}
if (is_dtype_exist(n_params, GGML_TYPE_Q4_K)) {
dev_info->cpu_props.flops_q4k_f32 = device_cpu_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
dev_info->gpu_props.cuda_flops_q4k_f32 = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
}
if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) {
dev_info->cpu_props.flops_q50_f32 = device_cpu_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads);
dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
dev_info->gpu_props.cuda_flops_q50_f32 = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
}
if (is_dtype_exist(n_params, GGML_TYPE_Q5_K)) {
dev_info->cpu_props.flops_q5k_f32 = device_cpu_flops (model, GGML_TYPE_Q5_K, GGML_TYPE_F32, n_threads);
dev_info->gpu_props.metal_flops_q5k_f32 = device_metal_flops(model, GGML_TYPE_Q5_K, GGML_TYPE_F32);
@ -3674,11 +3684,42 @@ void llama_profile_device(
dev_info->gpu_props.cuda_flops_q6k_f32 = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
}
if (is_dtype_exist(n_params, GGML_TYPE_IQ2_XXS)) {
dev_info->cpu_props.flops_iq2xxs_f32 = device_cpu_flops (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32, n_threads);
dev_info->gpu_props.metal_flops_iq2xxs_f32= device_metal_flops(model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32);
dev_info->gpu_props.cuda_flops_iq2xxs_f32 = device_cuda_flops (model, GGML_TYPE_IQ2_XXS, GGML_TYPE_F32);
}
if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) {
dev_info->cpu_props.flops_q50_f32 = device_cpu_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads);
dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
dev_info->gpu_props.cuda_flops_q50_f32 = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
}
if (is_dtype_exist(n_params, GGML_TYPE_Q8_0)) {
dev_info->cpu_props.flops_q80_f32 = device_cpu_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads);
dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
dev_info->gpu_props.cuda_flops_q80_f32 = device_cuda_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
}
if (is_dtype_exist(n_params, GGML_TYPE_IQ1_S)) {
dev_info->cpu_props.flops_iq1s_f32 = device_cpu_flops (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32, n_threads);
dev_info->gpu_props.metal_flops_iq1s_f32= device_metal_flops(model, GGML_TYPE_IQ1_S, GGML_TYPE_F32);
dev_info->gpu_props.cuda_flops_iq1s_f32 = device_cuda_flops (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32);
}
if (is_dtype_exist(n_params, GGML_TYPE_IQ4_NL)) {
dev_info->cpu_props.flops_iq4nl_f32 = device_cpu_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32, n_threads);
dev_info->gpu_props.metal_flops_iq4nl_f32= device_metal_flops(model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32);
dev_info->gpu_props.cuda_flops_iq4nl_f32 = device_cuda_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32);
}
if (is_dtype_exist(n_params, GGML_TYPE_IQ1_M)) {
dev_info->cpu_props.flops_iq1m_f32 = device_cpu_flops (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32, n_threads);
dev_info->gpu_props.metal_flops_iq1m_f32= device_metal_flops(model, GGML_TYPE_IQ1_M, GGML_TYPE_F32);
dev_info->gpu_props.cuda_flops_iq1m_f32 = device_cuda_flops (model, GGML_TYPE_IQ1_M, GGML_TYPE_F32);
}
}
ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
@ -4844,9 +4885,7 @@ struct llama_model_loader {
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
default:
{
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@ -5654,9 +5693,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
default: return "unknown, may not work";
}
@ -18997,10 +19033,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
new_type = GGML_TYPE_IQ3_S;
}
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
new_type == GGML_TYPE_Q4_0_8_8) {
new_type = GGML_TYPE_Q4_0;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
new_type = GGML_TYPE_Q4_K;
}
@ -19323,10 +19355,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
}
@ -19646,14 +19675,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
f32_data = (float *) f32_conv_buf.data();
}
int chunk_size_multiplier = 1;
if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
}
LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
fflush(stdout);
@ -19666,8 +19687,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
const int64_t nrows = tensor->ne[1];
static const int64_t min_chunk_size = 32 * 512;
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row)) *
chunk_size_multiplier;
const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
@ -20242,6 +20262,46 @@ int llama_send_device_info(struct llama_context * ctx, struct device_info * dev_
return 0;
}
int llama_bcast_startup_args(llama_context * ctx, uint32_t rank, startup_args * args) {
int32_t n_world = ctx->cparams.n_world;
if (n_world == 1) {
return 0;
}
GGML_ASSERT(ctx != nullptr && ctx->send_socket != nullptr);
if (rank == 0){
// send
try {
std::vector<zmq::message_t> send_msgs;
send_msgs.emplace_back("should_profile", strlen("should_profile"));
send_msgs.emplace_back(&args->should_profile, sizeof(args->should_profile));
zmq::send_multipart(*ctx->send_socket, send_msgs);
} catch (const zmq::error_t& e) {
LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
return -1;
}
}else {
// receive
std::vector<zmq::message_t> recv_msgs;
if (!zmq::recv_multipart(*ctx->recv_socket, std::back_inserter(recv_msgs))) {
return -1;
}
GGML_ASSERT(recv_msgs[0].to_string() == "should_profile");
GGML_ASSERT(recv_msgs[1].size() == sizeof(bool));
bool should_profile = *static_cast<bool*>(recv_msgs[1].data());
args->should_profile = should_profile;
if ((int)rank != (int)n_world - 1){
// send
try {
zmq::send_multipart(*ctx->send_socket, recv_msgs);
} catch (const zmq::error_t& e) {
LLAMA_LOG_INFO("Failed to send data: %s\n", e.what());
return -1;
}
}
}
return 0;
}
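
Note that llama_bcast_startup_args() above relays the flag along the pipeline rather than sending it to every rank directly: rank 0 sends it to its successor, each intermediate rank forwards the message it received, and the last rank only receives. A small standalone sketch of that relay pattern (plain arrays stand in for the ZeroMQ sockets; names are illustrative):

```cpp
#include <cstdio>
#include <cstdint>

// Relay a flag along ranks 0..n_world-1, mirroring llama_bcast_startup_args():
// rank 0 originates, ranks 1..n_world-2 forward, the last rank only receives.
static void relay_bcast(bool * mailbox, uint32_t n_world, bool should_profile) {
    mailbox[0] = should_profile;          // rank 0 originates the flag
    for (uint32_t r = 0; r + 1 < n_world; ++r) {
        mailbox[r + 1] = mailbox[r];      // rank r "sends" the value it holds to rank r+1
    }
}

int main() {
    bool mailbox[4] = { false, false, false, false };
    relay_bcast(mailbox, 4, /*should_profile=*/true);
    for (int r = 0; r < 4; ++r) {
        printf("rank %d should_profile=%d\n", r, (int) mailbox[r]);
    }
    return 0;
}
```
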
int llama_bcast_layer_setup(struct llama_context * ctx, uint32_t * n_layer_window, uint32_t * n_gpu_layers) {
uint32_t n_world = ctx->cparams.n_world;
if (n_world == 1) {
@ -21049,25 +21109,40 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
case PROFILER_LAYER_OUTPUT:
switch (dtype) {
case GGML_TYPE_F32:
n_flops->output_f32_f32 += n;
n_flops->output_f32_f32 += n;
break;
case GGML_TYPE_F16:
n_flops->output_f16_f32 += n;
n_flops->output_f16_f32 += n;
break;
case GGML_TYPE_Q2_K:
n_flops->output_q2k_f32 += n;
break;
case GGML_TYPE_Q4_K:
n_flops->output_q4k_f32 += n;
break;
case GGML_TYPE_Q5_0:
n_flops->output_q50_f32 += n;
n_flops->output_q4k_f32 += n;
break;
case GGML_TYPE_Q5_K:
n_flops->output_q5k_f32 += n;
n_flops->output_q5k_f32 += n;
break;
case GGML_TYPE_Q6_K:
n_flops->output_q6k_f32 += n;
n_flops->output_q6k_f32 += n;
break;
case GGML_TYPE_IQ2_XXS:
n_flops->output_iq2xxs_f32 += n;
break;
case GGML_TYPE_Q5_0:
n_flops->output_q50_f32 += n;
break;
case GGML_TYPE_Q8_0:
n_flops->output_q80_f32 += n;
n_flops->output_q80_f32 += n;
break;
case GGML_TYPE_IQ1_S:
n_flops->output_iq1s_f32 += n;
break;
case GGML_TYPE_IQ4_NL:
n_flops->output_iq4nl_f32 += n;
break;
case GGML_TYPE_IQ1_M:
n_flops->output_iq1m_f32 += n;
break;
default:
throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
@ -21075,27 +21150,42 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
break;
case PROFILER_LAYER_BACKEND:
switch (dtype) {
switch (dtype) {
case GGML_TYPE_F32:
n_flops->layer_f32_f32 += n;
n_flops->layer_f32_f32 += n;
break;
case GGML_TYPE_F16:
n_flops->layer_f16_f32 += n;
n_flops->layer_f16_f32 += n;
break;
case GGML_TYPE_Q2_K:
n_flops->layer_q2k_f32 += n;
break;
case GGML_TYPE_Q4_K:
n_flops->layer_q4k_f32 += n;
break;
case GGML_TYPE_Q5_0:
n_flops->layer_q50_f32 += n;
n_flops->layer_q4k_f32 += n;
break;
case GGML_TYPE_Q5_K:
n_flops->layer_q5k_f32 += n;
n_flops->layer_q5k_f32 += n;
break;
case GGML_TYPE_Q6_K:
n_flops->layer_q6k_f32 += n;
n_flops->layer_q6k_f32 += n;
break;
case GGML_TYPE_IQ2_XXS:
n_flops->layer_iq2xxs_f32 += n;
break;
case GGML_TYPE_Q5_0:
n_flops->layer_q50_f32 += n;
break;
case GGML_TYPE_Q8_0:
n_flops->layer_q80_f32 += n;
n_flops->layer_q80_f32 += n;
break;
case GGML_TYPE_IQ1_S:
n_flops->layer_iq1s_f32 += n;
break;
case GGML_TYPE_IQ4_NL:
n_flops->layer_iq4nl_f32 += n;
break;
case GGML_TYPE_IQ1_M:
n_flops->layer_iq1m_f32 += n;
break;
default:
throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
@ -21113,25 +21203,40 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
case PROFILER_LAYER_INPUT:
switch (dtype) {
case GGML_TYPE_F32:
n_params->input_f32 += n_i64t;
n_params->input_f32 += n_i64t;
break;
case GGML_TYPE_F16:
n_params->input_f16 += n_i64t;
n_params->input_f16 += n_i64t;
break;
case GGML_TYPE_Q2_K:
n_params->input_q2k += n_i64t;
break;
case GGML_TYPE_Q4_K:
n_params->input_q4k += n_i64t;
break;
case GGML_TYPE_Q5_0:
n_params->input_q50 += n_i64t;
n_params->input_q4k += n_i64t;
break;
case GGML_TYPE_Q5_K:
n_params->input_q5k += n_i64t;
n_params->input_q5k += n_i64t;
break;
case GGML_TYPE_Q6_K:
n_params->input_q6k += n_i64t;
n_params->input_q6k += n_i64t;
break;
case GGML_TYPE_IQ2_XXS:
n_params->input_iq2xxs += n_i64t;
break;
case GGML_TYPE_Q5_0:
n_params->input_q50 += n_i64t;
break;
case GGML_TYPE_Q8_0:
n_params->input_q80 += n_i64t;
n_params->input_q80 += n_i64t;
break;
case GGML_TYPE_IQ1_S:
n_params->input_iq1s += n_i64t;
break;
case GGML_TYPE_IQ4_NL:
n_params->input_iq4nl += n_i64t;
break;
case GGML_TYPE_IQ1_M:
n_params->input_iq1m += n_i64t;
break;
default:
throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_INPUT\n");
@ -21141,25 +21246,40 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
case PROFILER_LAYER_OUTPUT:
switch (dtype) {
case GGML_TYPE_F32:
n_params->output_f32 += n_i64t;
n_params->output_f32 += n_i64t;
break;
case GGML_TYPE_F16:
n_params->output_f16 += n_i64t;
n_params->output_f16 += n_i64t;
break;
case GGML_TYPE_Q2_K:
n_params->output_q2k += n_i64t;
break;
case GGML_TYPE_Q4_K:
n_params->output_q4k += n_i64t;
break;
case GGML_TYPE_Q5_0:
n_params->output_q50 += n_i64t;
n_params->output_q4k += n_i64t;
break;
case GGML_TYPE_Q5_K:
n_params->output_q5k += n_i64t;
n_params->output_q5k += n_i64t;
break;
case GGML_TYPE_Q6_K:
n_params->output_q6k += n_i64t;
n_params->output_q6k += n_i64t;
break;
case GGML_TYPE_IQ2_XXS:
n_params->output_iq2xxs += n_i64t;
break;
case GGML_TYPE_Q5_0:
n_params->output_q50 += n_i64t;
break;
case GGML_TYPE_Q8_0:
n_params->output_q80 += n_i64t;
n_params->output_q80 += n_i64t;
break;
case GGML_TYPE_IQ1_S:
n_params->output_iq1s += n_i64t;
break;
case GGML_TYPE_IQ4_NL:
n_params->output_iq4nl += n_i64t;
break;
case GGML_TYPE_IQ1_M:
n_params->output_iq1m += n_i64t;
break;
default:
throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
@ -21169,25 +21289,40 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
case PROFILER_LAYER_BACKEND:
switch (dtype) {
case GGML_TYPE_F32:
n_params->layer_f32 += n_i64t;
n_params->layer_f32 += n_i64t;
break;
case GGML_TYPE_F16:
n_params->layer_f16 += n_i64t;
n_params->layer_f16 += n_i64t;
break;
case GGML_TYPE_Q2_K:
n_params->layer_q2k += n_i64t;
break;
case GGML_TYPE_Q4_K:
n_params->layer_q4k += n_i64t;
break;
case GGML_TYPE_Q5_0:
n_params->layer_q50 += n_i64t;
n_params->layer_q4k += n_i64t;
break;
case GGML_TYPE_Q5_K:
n_params->layer_q5k += n_i64t;
n_params->layer_q5k += n_i64t;
break;
case GGML_TYPE_Q6_K:
n_params->layer_q6k += n_i64t;
n_params->layer_q6k += n_i64t;
break;
case GGML_TYPE_IQ2_XXS:
n_params->layer_iq2xxs += n_i64t;
break;
case GGML_TYPE_Q5_0:
n_params->layer_q50 += n_i64t;
break;
case GGML_TYPE_Q8_0:
n_params->layer_q80 += n_i64t;
n_params->layer_q80 += n_i64t;
break;
case GGML_TYPE_IQ1_S:
n_params->layer_iq1s += n_i64t;
break;
case GGML_TYPE_IQ4_NL:
n_params->layer_iq4nl += n_i64t;
break;
case GGML_TYPE_IQ1_M:
n_params->layer_iq1m += n_i64t;
break;
default:
throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
@ -21477,23 +21612,33 @@ void llama_model_n_flops(
}
// use average values instead of total values
n_flops->layer_f32_f32 = static_cast<int64_t>((double)n_flops->layer_f32_f32 / (double)n_layer);
n_flops->layer_f16_f32 = static_cast<int64_t>((double)n_flops->layer_f16_f32 / (double)n_layer);
n_flops->layer_q4k_f32 = static_cast<int64_t>((double)n_flops->layer_q4k_f32 / (double)n_layer);
n_flops->layer_q50_f32 = static_cast<int64_t>((double)n_flops->layer_q50_f32 / (double)n_layer);
n_flops->layer_q5k_f32 = static_cast<int64_t>((double)n_flops->layer_q5k_f32 / (double)n_layer);
n_flops->layer_q6k_f32 = static_cast<int64_t>((double)n_flops->layer_q6k_f32 / (double)n_layer);
n_flops->layer_q80_f32 = static_cast<int64_t>((double)n_flops->layer_q80_f32 / (double)n_layer);
n_params->layer_f32 = static_cast<int64_t>((double)n_params->layer_f32 / (double)n_layer);
n_params->layer_f16 = static_cast<int64_t>((double)n_params->layer_f16 / (double)n_layer);
n_params->layer_q4k = static_cast<int64_t>((double)n_params->layer_q4k / (double)n_layer);
n_params->layer_q50 = static_cast<int64_t>((double)n_params->layer_q50 / (double)n_layer);
n_params->layer_q5k = static_cast<int64_t>((double)n_params->layer_q5k / (double)n_layer);
n_params->layer_q6k = static_cast<int64_t>((double)n_params->layer_q6k / (double)n_layer);
n_params->layer_q80 = static_cast<int64_t>((double)n_params->layer_q80 / (double)n_layer);
n_bytes->nb_layer = static_cast<int64_t>((double)n_bytes->nb_layer / (double)n_layer);
n_flops->layer_f32_f32 = static_cast<int64_t>((double)n_flops->layer_f32_f32 / (double)n_layer);
n_flops->layer_f16_f32 = static_cast<int64_t>((double)n_flops->layer_f16_f32 / (double)n_layer);
n_flops->layer_q2k_f32 = static_cast<int64_t>((double)n_flops->layer_q2k_f32 / (double)n_layer);
n_flops->layer_q4k_f32 = static_cast<int64_t>((double)n_flops->layer_q4k_f32 / (double)n_layer);
n_flops->layer_q5k_f32 = static_cast<int64_t>((double)n_flops->layer_q5k_f32 / (double)n_layer);
n_flops->layer_q6k_f32 = static_cast<int64_t>((double)n_flops->layer_q6k_f32 / (double)n_layer);
n_flops->layer_iq2xxs_f32 = static_cast<int64_t>((double)n_flops->layer_iq2xxs_f32 / (double)n_layer);
n_flops->layer_q50_f32 = static_cast<int64_t>((double)n_flops->layer_q50_f32 / (double)n_layer);
n_flops->layer_q80_f32 = static_cast<int64_t>((double)n_flops->layer_q80_f32 / (double)n_layer);
n_flops->layer_iq1s_f32 = static_cast<int64_t>((double)n_flops->layer_iq1s_f32 / (double)n_layer);
n_flops->layer_iq4nl_f32 = static_cast<int64_t>((double)n_flops->layer_iq4nl_f32 / (double)n_layer);
n_flops->layer_iq1m_f32 = static_cast<int64_t>((double)n_flops->layer_iq1m_f32 / (double)n_layer);
n_params->layer_f32 = static_cast<int64_t>((double)n_params->layer_f32 / (double)n_layer);
n_params->layer_f16 = static_cast<int64_t>((double)n_params->layer_f16 / (double)n_layer);
n_params->layer_q2k = static_cast<int64_t>((double)n_params->layer_q2k / (double)n_layer);
n_params->layer_q4k = static_cast<int64_t>((double)n_params->layer_q4k / (double)n_layer);
n_params->layer_q5k = static_cast<int64_t>((double)n_params->layer_q5k / (double)n_layer);
n_params->layer_q6k = static_cast<int64_t>((double)n_params->layer_q6k / (double)n_layer);
n_params->layer_iq2xxs = static_cast<int64_t>((double)n_params->layer_iq2xxs / (double)n_layer);
n_params->layer_q50 = static_cast<int64_t>((double)n_params->layer_q50 / (double)n_layer);
n_params->layer_q80 = static_cast<int64_t>((double)n_params->layer_q80 / (double)n_layer);
n_params->layer_iq1s = static_cast<int64_t>((double)n_params->layer_iq1s / (double)n_layer);
n_params->layer_iq4nl = static_cast<int64_t>((double)n_params->layer_iq4nl / (double)n_layer);
n_params->layer_iq1m = static_cast<int64_t>((double)n_params->layer_iq1m / (double)n_layer);
n_bytes->nb_layer = static_cast<int64_t>((double)n_bytes->nb_layer / (double)n_layer);
// reset ml, model, and clear contexts
ml->n_created = 0;

tools/profile_tool.cpp Normal file
View file

@ -0,0 +1,62 @@
#include "arg.h"
#include "common.h"
#include "console.h"
#include "log.h"
#include "llama.h"
static void print_usage(int argc, char ** argv) {
(void) argc;
LOG("\nexample usage:\n");
LOG("\n  device profiling: %s -m your_model.gguf\n", argv[0]);
LOG("\n");
}
int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
return 1;
}
if (params.n_ctx != 0 && params.n_ctx < 8) {
LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
if (params.rope_freq_base != 0.0) {
LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
}
if (params.rope_freq_scale != 0.0) {
LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
}
// load the model and apply lora adapter, if any
auto mparams = llama_model_params_from_gpt_params(params);
struct llama_context_params cparams = llama_context_params_from_gpt_params(params);
struct llama_model * model = nullptr;
if (!params.hf_repo.empty() && !params.hf_file.empty()) {
model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else if (!params.model_url.empty()) {
model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
}
if (model == NULL) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
return -1;
}
llama_model_loader * ml = llama_model_load(params.model.c_str(), model, &mparams);
device_info dev_info;
llama_profile_device(&dev_info, model, ml, params.gpu_mem, params.n_predict, params.n_ctx, params.cpuparams.n_threads, params.flash_attn);
device_print_props(&dev_info, 1, model, cparams);
llama_free_model(model);
return 0;
}