Added support for the Q2_K, IQ1_S, and IQ4_NL quantization types

2025-09-16 07:09:40 +00:00 · 2025-03-04 15:22:55 +00:00 · 2025-03-04 15:22:55 +00:00 · 2f049b8428
commit 2f049b8428
parent e2cda4cfa0
4 changed files with 551 additions and 218 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -3559,16 +3559,22 @@ static bool is_dtype_exist(struct model_params * n_params, enum ggml_type dtype)
        case GGML_TYPE_F32:
        case GGML_TYPE_F16:    
            return true;
+        case GGML_TYPE_Q2_K:
+            return n_params->layer_q2k > 0   || n_params->output_q2k   > 0;
        case GGML_TYPE_Q4_K:
-            return n_params->layer_q4k > 0 || n_params->output_q4k > 0;
-        case GGML_TYPE_Q5_0:
-            return n_params->layer_q50 > 0 || n_params->output_q50 > 0;
+            return n_params->layer_q4k > 0   || n_params->output_q4k   > 0;
        case GGML_TYPE_Q5_K:
-            return n_params->layer_q5k > 0 || n_params->output_q5k > 0;
+            return n_params->layer_q5k > 0   || n_params->output_q5k   > 0;
        case GGML_TYPE_Q6_K:
-            return n_params->layer_q6k > 0 || n_params->output_q6k > 0;
+            return n_params->layer_q6k > 0   || n_params->output_q6k   > 0;
+        case GGML_TYPE_Q5_0:
+            return n_params->layer_q50 > 0   || n_params->output_q50   > 0;
        case GGML_TYPE_Q8_0:
-            return n_params->layer_q80 > 0 || n_params->output_q80 > 0;
+            return n_params->layer_q80 > 0   || n_params->output_q80   > 0;
+        case GGML_TYPE_IQ1_S:
+            return n_params->layer_iq1s > 0  || n_params->output_iq1s  > 0;
+        case GGML_TYPE_IQ4_NL:
+            return n_params->layer_iq4nl > 0 || n_params->output_iq4nl > 0;
        default:
            throw std::runtime_error("Unrecognized data type\n");
    }
@ -3649,18 +3655,18 @@ void llama_profile_device(
        dev_info->gpu_props.cuda_flops_f16_f32  = device_cuda_flops (model, GGML_TYPE_F16,  GGML_TYPE_F32);
    }

+    if (is_dtype_exist(n_params, GGML_TYPE_Q2_K)) {
+        dev_info->cpu_props.flops_q2k_f32       = device_cpu_flops  (model, GGML_TYPE_Q2_K, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_q2k_f32 = device_metal_flops(model, GGML_TYPE_Q2_K, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_q2k_f32  = device_cuda_flops (model, GGML_TYPE_Q2_K, GGML_TYPE_F32);
+    }
+
    if (is_dtype_exist(n_params, GGML_TYPE_Q4_K)) {
        dev_info->cpu_props.flops_q4k_f32       = device_cpu_flops  (model, GGML_TYPE_Q4_K, GGML_TYPE_F32, n_threads);
        dev_info->gpu_props.metal_flops_q4k_f32 = device_metal_flops(model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
        dev_info->gpu_props.cuda_flops_q4k_f32  = device_cuda_flops (model, GGML_TYPE_Q4_K, GGML_TYPE_F32);
    }

-    if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) {
-        dev_info->cpu_props.flops_q50_f32       = device_cpu_flops  (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads);
-        dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
-        dev_info->gpu_props.cuda_flops_q50_f32  = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
-    }
-
    if (is_dtype_exist(n_params, GGML_TYPE_Q5_K)) {
        dev_info->cpu_props.flops_q5k_f32       = device_cpu_flops  (model, GGML_TYPE_Q5_K, GGML_TYPE_F32, n_threads);
        dev_info->gpu_props.metal_flops_q5k_f32 = device_metal_flops(model, GGML_TYPE_Q5_K, GGML_TYPE_F32);
@ -3673,11 +3679,30 @@ void llama_profile_device(
        dev_info->gpu_props.cuda_flops_q6k_f32  = device_cuda_flops (model, GGML_TYPE_Q6_K, GGML_TYPE_F32);
    }

+    if (is_dtype_exist(n_params, GGML_TYPE_Q5_0)) {
+        dev_info->cpu_props.flops_q50_f32       = device_cpu_flops  (model, GGML_TYPE_Q5_0, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_q50_f32 = device_metal_flops(model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_q50_f32  = device_cuda_flops (model, GGML_TYPE_Q5_0, GGML_TYPE_F32);
+    }
+
+
    if (is_dtype_exist(n_params, GGML_TYPE_Q8_0)) {
        dev_info->cpu_props.flops_q80_f32       = device_cpu_flops  (model, GGML_TYPE_Q8_0, GGML_TYPE_F32, n_threads);
        dev_info->gpu_props.metal_flops_q80_f32 = device_metal_flops(model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
        dev_info->gpu_props.cuda_flops_q80_f32  = device_cuda_flops (model, GGML_TYPE_Q8_0, GGML_TYPE_F32);
    }
+
+    if (is_dtype_exist(n_params, GGML_TYPE_IQ1_S)) {
+        dev_info->cpu_props.flops_iq1s_f32      = device_cpu_flops  (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_iq1s_f32= device_metal_flops(model, GGML_TYPE_IQ1_S, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_iq1s_f32 = device_cuda_flops (model, GGML_TYPE_IQ1_S, GGML_TYPE_F32);
+    }
+
+    if (is_dtype_exist(n_params, GGML_TYPE_IQ4_NL)) {
+        dev_info->cpu_props.flops_iq4nl_f32     = device_cpu_flops   (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32, n_threads);
+        dev_info->gpu_props.metal_flops_iq4nl_f32= device_metal_flops(model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32);
+        dev_info->gpu_props.cuda_flops_iq4nl_f32 = device_cuda_flops (model, GGML_TYPE_IQ4_NL, GGML_TYPE_F32);
+    }
 }

 ggml_backend_buffer_type_t llama_dev_buffer_type(struct llama_model * model, int device) {
@ -21029,49 +21054,67 @@ static void count_n_flops(struct model_flops * n_flops, enum ggml_type dtype, en
                case GGML_TYPE_F16:
                    n_flops->output_f16_f32 += n;
                    break;
+                case GGML_TYPE_Q2_K:
+                    n_flops->output_q2k_f32 += n;
+                    break;
                case GGML_TYPE_Q4_K:
                    n_flops->output_q4k_f32 += n;
                    break;
-                case GGML_TYPE_Q5_0:
-                    n_flops->output_q50_f32 += n;
-                    break;
                case GGML_TYPE_Q5_K:
                    n_flops->output_q5k_f32 += n;
                    break;
                case GGML_TYPE_Q6_K:
                    n_flops->output_q6k_f32 += n;
                    break;
+                case GGML_TYPE_Q5_0:
+                    n_flops->output_q50_f32 += n;
+                    break;
                case GGML_TYPE_Q8_0:
                    n_flops->output_q80_f32 += n;
                    break;
+                case GGML_TYPE_IQ1_S:
+                    n_flops->output_iq1s_f32 += n;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_flops->output_iq4nl_f32 += n;
+                    break;
                default:
                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
            }
            break;

        case PROFILER_LAYER_BACKEND:
-            switch (dtype) {
+              switch (dtype) {
                case GGML_TYPE_F32:
                    n_flops->layer_f32_f32 += n;
                    break;
                case GGML_TYPE_F16:
                    n_flops->layer_f16_f32 += n;
                    break;
+                case GGML_TYPE_Q2_K:
+                    n_flops->layer_q2k_f32 += n;
+                    break;
                case GGML_TYPE_Q4_K:
                    n_flops->layer_q4k_f32 += n;
                    break;
-                case GGML_TYPE_Q5_0:
-                    n_flops->layer_q50_f32 += n;
-                    break;
                case GGML_TYPE_Q5_K:
                    n_flops->layer_q5k_f32 += n;
                    break;
                case GGML_TYPE_Q6_K:
                    n_flops->layer_q6k_f32 += n;
                    break;
+                case GGML_TYPE_Q5_0:
+                    n_flops->layer_q50_f32 += n;
+                    break;
                case GGML_TYPE_Q8_0:
                    n_flops->layer_q80_f32 += n;
                    break;
+                case GGML_TYPE_IQ1_S:
+                    n_flops->layer_iq1s_f32 += n;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_flops->layer_iq4nl_f32 += n;
+                    break;
                default:
                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
            }
@ -21093,21 +21136,30 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
                case GGML_TYPE_F16:
                    n_params->input_f16 += n_i64t;
                    break;
+                case GGML_TYPE_Q2_K:
+                    n_params->input_q2k += n_i64t;
+                    break;
                case GGML_TYPE_Q4_K:
                    n_params->input_q4k += n_i64t;
                    break;
-                case GGML_TYPE_Q5_0:
-                    n_params->input_q50 += n_i64t;
-                    break;
                case GGML_TYPE_Q5_K:
                    n_params->input_q5k += n_i64t;
                    break;
                case GGML_TYPE_Q6_K:
                    n_params->input_q6k += n_i64t;
                    break;
+                case GGML_TYPE_Q5_0:
+                    n_params->input_q50 += n_i64t;
+                    break;
                case GGML_TYPE_Q8_0:
                    n_params->input_q80 += n_i64t;
                    break;
+                case GGML_TYPE_IQ1_S:
+                    n_params->input_iq1s += n_i64t;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_params->input_iq4nl += n_i64t;
+                    break;
                default:
                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
            }
@ -21116,25 +21168,34 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
        case PROFILER_LAYER_OUTPUT:
            switch (dtype) {
                case GGML_TYPE_F32:
-                    n_params->output_f32 += n_i64t;
+                    n_params->output_f32    += n_i64t;
                    break;
                case GGML_TYPE_F16:
-                    n_params->output_f16 += n_i64t;
+                    n_params->output_f16    += n_i64t;
+                    break;
+                case GGML_TYPE_Q2_K:
+                    n_params->output_q2k    += n_i64t;
                    break;
                case GGML_TYPE_Q4_K:
-                    n_params->output_q4k += n_i64t;
-                    break;
-                case GGML_TYPE_Q5_0:
-                    n_params->output_q50 += n_i64t;
+                    n_params->output_q4k    += n_i64t;
                    break;
                case GGML_TYPE_Q5_K:
-                    n_params->output_q5k += n_i64t;
+                    n_params->output_q5k    += n_i64t;
                    break;
                case GGML_TYPE_Q6_K:
-                    n_params->output_q6k += n_i64t;
+                    n_params->output_q6k    += n_i64t;
+                    break;
+                case GGML_TYPE_Q5_0:
+                    n_params->output_q50    += n_i64t;
                    break;
                case GGML_TYPE_Q8_0:
-                    n_params->output_q80 += n_i64t;
+                    n_params->output_q80    += n_i64t;
+                    break;
+                case GGML_TYPE_IQ1_S:
+                    n_params->output_iq1s   += n_i64t;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_params->output_iq4nl  += n_i64t;
                    break;
                default:
                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_OUTPUT\n");
@ -21144,25 +21205,34 @@ static void count_n_params(struct model_params * n_params, enum ggml_type dtype,
        case PROFILER_LAYER_BACKEND:
            switch (dtype) {
                case GGML_TYPE_F32:
-                    n_params->layer_f32 += n_i64t;
+                    n_params->layer_f32     += n_i64t;
                    break;
                case GGML_TYPE_F16:
-                    n_params->layer_f16 += n_i64t;
+                    n_params->layer_f16     += n_i64t;
+                    break;
+                case GGML_TYPE_Q2_K:
+                    n_params->layer_q2k     += n_i64t;
                    break;
                case GGML_TYPE_Q4_K:
-                    n_params->layer_q4k += n_i64t;
-                    break;
-                case GGML_TYPE_Q5_0:
-                    n_params->layer_q50 += n_i64t;
+                    n_params->layer_q4k     += n_i64t;
                    break;
                case GGML_TYPE_Q5_K:
-                    n_params->layer_q5k += n_i64t;
+                    n_params->layer_q5k     += n_i64t;
                    break;
                case GGML_TYPE_Q6_K:
-                    n_params->layer_q6k += n_i64t;
+                    n_params->layer_q6k     += n_i64t;
+                    break;
+                case GGML_TYPE_Q5_0:
+                    n_params->layer_q50     += n_i64t;
                    break;
                case GGML_TYPE_Q8_0:
-                    n_params->layer_q80 += n_i64t;
+                    n_params->layer_q80     += n_i64t;
+                    break;
+                case GGML_TYPE_IQ1_S:
+                    n_params->layer_iq1s    += n_i64t;
+                    break;
+                case GGML_TYPE_IQ4_NL:
+                    n_params->layer_iq4nl   += n_i64t;
                    break;
                default:
                    throw std::runtime_error("Unrecognized weight type in PROFILER_LAYER_BACKEND\n");
@ -21452,23 +21522,29 @@ void llama_model_n_flops(
    }

    // use average values instead of total values
-    n_flops->layer_f32_f32 = static_cast<int64_t>((double)n_flops->layer_f32_f32 / (double)n_layer);
-    n_flops->layer_f16_f32 = static_cast<int64_t>((double)n_flops->layer_f16_f32 / (double)n_layer);
-    n_flops->layer_q4k_f32 = static_cast<int64_t>((double)n_flops->layer_q4k_f32 / (double)n_layer);
-    n_flops->layer_q50_f32 = static_cast<int64_t>((double)n_flops->layer_q50_f32 / (double)n_layer);
-    n_flops->layer_q5k_f32 = static_cast<int64_t>((double)n_flops->layer_q5k_f32 / (double)n_layer);
-    n_flops->layer_q6k_f32 = static_cast<int64_t>((double)n_flops->layer_q6k_f32 / (double)n_layer);
-    n_flops->layer_q80_f32 = static_cast<int64_t>((double)n_flops->layer_q80_f32 / (double)n_layer);
-
-    n_params->layer_f32    = static_cast<int64_t>((double)n_params->layer_f32    / (double)n_layer);
-    n_params->layer_f16    = static_cast<int64_t>((double)n_params->layer_f16    / (double)n_layer);
-    n_params->layer_q4k    = static_cast<int64_t>((double)n_params->layer_q4k    / (double)n_layer);
-    n_params->layer_q50    = static_cast<int64_t>((double)n_params->layer_q50    / (double)n_layer);
-    n_params->layer_q5k    = static_cast<int64_t>((double)n_params->layer_q5k    / (double)n_layer);
-    n_params->layer_q6k    = static_cast<int64_t>((double)n_params->layer_q6k    / (double)n_layer);
-    n_params->layer_q80    = static_cast<int64_t>((double)n_params->layer_q80    / (double)n_layer);
-
-    n_bytes->nb_layer      = static_cast<int64_t>((double)n_bytes->nb_layer      / (double)n_layer);
+    n_flops->layer_f32_f32   = static_cast<int64_t>((double)n_flops->layer_f32_f32  / (double)n_layer);
+    n_flops->layer_f16_f32   = static_cast<int64_t>((double)n_flops->layer_f16_f32  / (double)n_layer);
+    n_flops->layer_q2k_f32   = static_cast<int64_t>((double)n_flops->layer_q2k_f32  / (double)n_layer);
+    n_flops->layer_q4k_f32   = static_cast<int64_t>((double)n_flops->layer_q4k_f32  / (double)n_layer);
+    n_flops->layer_q5k_f32   = static_cast<int64_t>((double)n_flops->layer_q5k_f32  / (double)n_layer);
+    n_flops->layer_q6k_f32   = static_cast<int64_t>((double)n_flops->layer_q6k_f32  / (double)n_layer);
+    n_flops->layer_q50_f32   = static_cast<int64_t>((double)n_flops->layer_q50_f32  / (double)n_layer);
+    n_flops->layer_q80_f32   = static_cast<int64_t>((double)n_flops->layer_q80_f32  / (double)n_layer);
+    n_flops->layer_iq1s_f32  = static_cast<int64_t>((double)n_flops->layer_iq1s_f32 / (double)n_layer);
+    n_flops->layer_iq4nl_f32 = static_cast<int64_t>((double)n_flops->layer_iq4nl_f32 / (double)n_layer);
+    
+    n_params->layer_f32      = static_cast<int64_t>((double)n_params->layer_f32     / (double)n_layer);
+    n_params->layer_f16      = static_cast<int64_t>((double)n_params->layer_f16     / (double)n_layer);
+    n_params->layer_q2k      = static_cast<int64_t>((double)n_params->layer_q2k     / (double)n_layer);
+    n_params->layer_q4k      = static_cast<int64_t>((double)n_params->layer_q4k     / (double)n_layer);
+    n_params->layer_q50      = static_cast<int64_t>((double)n_params->layer_q50     / (double)n_layer);
+    n_params->layer_q5k      = static_cast<int64_t>((double)n_params->layer_q5k     / (double)n_layer);
+    n_params->layer_q6k      = static_cast<int64_t>((double)n_params->layer_q6k     / (double)n_layer);
+    n_params->layer_q80      = static_cast<int64_t>((double)n_params->layer_q80     / (double)n_layer);
+    n_params->layer_iq1s     = static_cast<int64_t>((double)n_params->layer_iq1s    / (double)n_layer);
+    n_params->layer_iq4nl    = static_cast<int64_t>((double)n_params->layer_iq4nl   / (double)n_layer);
+    
+    n_bytes->nb_layer        = static_cast<int64_t>((double)n_bytes->nb_layer       / (double)n_layer);

    // reset ml, model, and clear contexts
    ml->n_created = 0;