Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	Makefile
#	docs/build.md
#	examples/rpc/rpc-server.cpp
#	examples/sycl/build.sh
#	ggml/CMakeLists.txt
#	ggml/src/ggml-cann/aclnn_ops.cpp
#	ggml/src/ggml-cann/ggml-cann.cpp
#	ggml/src/ggml-hip/CMakeLists.txt
#	scripts/sync-ggml.last
Concedo 2025-04-17 00:52:37 +08:00
commit 06159939d9
72 changed files with 6549 additions and 5397 deletions


@@ -837,7 +837,7 @@ std::string fs_get_cache_directory() {
    if (getenv("LLAMA_CACHE")) {
        cache_directory = std::getenv("LLAMA_CACHE");
    } else {
-#if defined(__linux__) || defined(__FreeBSD__)
+#if defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)
        if (std::getenv("XDG_CACHE_HOME")) {
            cache_directory = std::getenv("XDG_CACHE_HOME");
        } else {


@@ -4422,6 +4422,10 @@ class DeepseekV2Model(Model):
        self._set_vocab_gpt2()

    def set_gguf_parameters(self):
+
+        # note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group)
+        self.hparams["num_key_value_heads"] = 1
+
        super().set_gguf_parameters()

        hparams = self.hparams

@@ -4430,8 +4434,13 @@ class DeepseekV2Model(Model):
        if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
            self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
-        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
-        self.gguf_writer.add_value_length(hparams["v_head_dim"])
+
+        # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
+        self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
+
        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])

@@ -4500,6 +4509,26 @@ class DeepseekV2Model(Model):
            else:
                return []

+        # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed
+        if name.endswith("kv_b_proj.weight"):
+            name_kb = name.replace("kv_b_proj", "k_b_proj")
+            name_vb = name.replace("kv_b_proj", "v_b_proj")
+
+            n_head_kv = self.hparams["num_key_value_heads"]
+            v_head_dim = self.hparams["v_head_dim"]
+            qk_nope_head_dim = self.hparams["qk_nope_head_dim"]
+
+            assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim)
+
+            kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
+            k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
+            k_b = k_b.transpose(1, 2)
+
+            return [
+                (self.map_tensor_name(name_kb), k_b),
+                (self.map_tensor_name(name_vb), v_b)
+            ]
+
        return [(self.map_tensor_name(name), data_torch)]

    def prepare_tensors(self):
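
For reference, a minimal PyTorch sketch of the kv_b_proj split performed by the converter above; the head count and dimensions below are made up for illustration and are not taken from any particular checkpoint:

    import torch

    # illustrative sizes only (hypothetical, not a real DeepSeek config)
    n_head_kv, qk_nope_head_dim, v_head_dim, kv_lora_rank = 4, 16, 8, 32

    # kv_b_proj.weight: [n_head_kv * (qk_nope_head_dim + v_head_dim), kv_lora_rank]
    w = torch.randn(n_head_kv * (qk_nope_head_dim + v_head_dim), kv_lora_rank)

    # same steps as modify_tensors(): view per head, split off the k/v parts,
    # transpose the k part so the absorbed MLA matmul can use it directly
    kv_b = w.view(n_head_kv, qk_nope_head_dim + v_head_dim, w.shape[-1])
    k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
    k_b = k_b.transpose(1, 2)

    assert k_b.shape == (n_head_kv, kv_lora_rank, qk_nope_head_dim)
    assert v_b.shape == (n_head_kv, v_head_dim, kv_lora_rank)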


@@ -317,6 +317,6 @@ int main(int argc, char ** argv) {
            is_first_msg = false;
        }
    }

+    llama_perf_context_print(ctx.lctx);
    return 0;
}


@@ -10,6 +10,7 @@
#include <fstream>
#include <cmath>
#include <cctype>
+#include <algorithm>

struct quant_option {
    std::string name;
@@ -17,7 +18,7 @@ struct quant_option {
    std::string desc;
};

-static const std::vector<struct quant_option> QUANT_OPTIONS = {
+static const std::vector<quant_option> QUANT_OPTIONS = {
    { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
    { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
    { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
@@ -106,7 +107,8 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
//
[[noreturn]]
static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type]\n", executable);
+    printf("       [--token-embedding-type] [--tensor-type] [--keep-split] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
    printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
    printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
    printf("  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
@@ -115,6 +117,8 @@ static void usage(const char * executable) {
    printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
    printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
    printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+    printf("  --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n");
+    printf("      Advanced option to selectively quantize tensors. May be specified multiple times.\n");
    printf("  --keep-split: will generate quantized model in the same shards as input\n");
    printf("  --override-kv KEY=TYPE:VALUE\n");
    printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
@@ -245,6 +249,107 @@ static ggml_type parse_ggml_type(const char * arg) {
    return GGML_TYPE_COUNT;
}

+// Allowed tensors for arbitrary quantization with --tensor-type option
+static const std::vector<std::string> ALLOWED_TENSOR_TYPE = {
+    "attn_k",
+    "attn_kv_a_mqa",
+    "attn_kv_b",
+    "attn_o",
+    "attn_output",
+    "attn_q",
+    "attn_q_a",
+    "attn_q_b",
+    "attn_qkv",
+    "attn_v",
+    "channel_mix_key",
+    "channel_mix_receptance",
+    "channel_mix_value",
+    "cls",
+    "cls.output",
+    "cross_attn_k",
+    "cross_attn_o",
+    "cross_attn_q",
+    "cross_attn_v",
+    "ffn_act",
+    "ffn_down",
+    "ffn_down_exps",
+    "ffn_down_shexp",
+    "ffn_gate",
+    "ffn_gate_exps",
+    "ffn_gate_shexp",
+    "ffn_up",
+    "ffn_up_exps",
+    "ffn_up_shexp",
+    "ssm_in",
+    "ssm_out",
+    "time_mix_gate",
+    "time_mix_key",
+    "time_mix_output",
+    "time_mix_receptance",
+    "time_mix_value",
+};
+
+// changes to this struct must be replicated in llama-quant.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
+static bool parse_tensor_type(const char * data, std::vector<tensor_quantization> & tensor_type) {
+    const char * sep = strchr(data, '=');
+    if (sep == nullptr) {
+        printf("\n%s: malformed tensor type '%s'\n\n", __func__, data);
+        return false;
+    }
+
+    const size_t tn_len = sep - data;
+    if (tn_len == 0) {
+        printf("\n%s: missing tensor name\n\n", __func__);
+        return false;
+    }
+
+    if (const size_t qt_len = strlen(sep); qt_len == 1) {
+        printf("\n%s: missing quantization type\n\n", __func__);
+        return false;
+    }
+
+    std::string tn(data, tn_len);
+    std::transform(tn.begin(), tn.end(), tn.begin(), tolower);
+    sep++;
+    const std::string qt(sep);
+
+    bool found = false;
+    for (const auto & allowed : ALLOWED_TENSOR_TYPE) {
+        std::string tensor;
+        tensor = tn.rfind('.') != std::string::npos ? tn.substr(tn.rfind('.') + 1) : tn;
+        // handle special case of cls.output
+        std::string cls_output = "cls.output";
+        if (tn.find(cls_output) != std::string::npos) {
+            tensor = "cls.output";
+        }
+        // check if an allowed tensor exists and it's at the end of the kv string
+        if (tensor == allowed) {
+            found = true;
+            break;
+        }
+    }
+    if (!found) {
+        printf("\n%s: invalid tensor name '%s'\n\n", __func__, tn.c_str());
+        return false;
+    }
+
+    if (parse_ggml_type(qt.c_str()) == GGML_TYPE_COUNT) {
+        printf("\n%s: invalid quantization type '%s'\n\n", __func__, qt.c_str());
+        return false;
+    }
+
+    tensor_quantization tqz;
+    tqz.name = tn;
+    tqz.quant = parse_ggml_type(qt.c_str());
+    tensor_type.emplace_back(std::move(tqz));
+    return true;
+}
+
int main(int argc, char ** argv) {
    if (argc < 3) {
        usage(argv[0]);
@@ -256,6 +361,7 @@ int main(int argc, char ** argv) {
    std::string imatrix_file;
    std::vector<std::string> included_weights, excluded_weights;
    std::vector<llama_model_kv_override> kv_overrides;
+    std::vector<tensor_quantization> tensor_types;

    for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
        if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
@@ -278,6 +384,10 @@ int main(int argc, char ** argv) {
            } else {
                usage(argv[0]);
            }
+        } else if (strcmp(argv[arg_idx], "--tensor-type") == 0) {
+            if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) {
+                usage(argv[0]);
+            }
        } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
            if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
                usage(argv[0]);
@@ -362,6 +472,9 @@ int main(int argc, char ** argv) {
        kv_overrides.back().key[0] = 0;
        params.kv_overrides = &kv_overrides;
    }
+    if (!tensor_types.empty()) {
+        params.tensor_types = &tensor_types;
+    }

    llama_backend_init();
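
The new --tensor-type argument is matched by name suffix: everything after the last '.' in the given tensor name must be one of the base names in ALLOWED_TENSOR_TYPE, with "cls.output" handled specially, so per-layer names such as blk.23.ffn_down are accepted. A small Python sketch of just that matching rule (the abbreviated allowed set and the sample names here are illustrative, not the tool's full list):

    ALLOWED = {"attn_q", "attn_k", "attn_v", "ffn_down", "cls", "cls.output"}

    def tensor_type_arg_ok(arg: str) -> bool:
        name, sep, qtype = arg.partition("=")
        if not sep or not name or not qtype:
            return False                      # malformed: expects TENSOR=TYPE
        name = name.lower()
        base = name.rsplit(".", 1)[-1]        # suffix after the last '.'
        if "cls.output" in name:
            base = "cls.output"               # special case, as in parse_tensor_type()
        return base in ALLOWED                # the ggml type is validated separately

    assert tensor_type_arg_ok("attn_q=q8_0")
    assert tensor_type_arg_ok("blk.23.ffn_down=q5_k")
    assert not tensor_type_arg_ok("lm_head=q8_0")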

File diff suppressed because it is too large.


@@ -425,6 +425,8 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
        }
        case GGML_OP_IM2COL_BACK:
            return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
+        case GGML_OP_GET_ROWS_BACK:
+            return src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16;
        case GGML_OP_OUT_PROD:
            return (src0->type == GGML_TYPE_F32 || (ggml_is_quantized(src0->type) && src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3])) &&
                src1->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;


@@ -98,31 +98,32 @@ int ggml_cuda_get_device() {
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
    ggml_cuda_set_device(device);
-#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
-    auto res = hipMallocManaged(ptr, size);
-    if (res == hipSuccess) {
-        // if error we "need" to know why...
-        CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
-    }
-    return res;
-#else
-#if !defined(GGML_USE_HIP)
    cudaError_t err;
    if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
    {
        err = cudaMallocManaged(ptr, size);
+#if defined(GGML_USE_HIP)
+        if (err == hipSuccess) {
+            CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
+        }
+
+        // fall back to cudaMalloc if not supported (e.g. on Windows)
+        if (err == hipErrorNotSupported) {
+            static bool warned_unsupported = false;
+            if (!warned_unsupported) {
+                GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n");
+                warned_unsupported = true;
+            }
+
+            err = cudaMalloc(ptr, size);
+        }
+#endif // defined(GGML_USE_HIP)
    }
    else
    {
        err = cudaMalloc(ptr, size);
    }
    return err;
-#else
-    return cudaMalloc(ptr, size);
-#endif // !defined(GGML_USE_HIP)
-#endif
}

#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
@@ -2493,10 +2494,10 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
#endif
        }

-        if (node->op == GGML_OP_MUL_MAT_ID) {
+        if (node->op == GGML_OP_MUL_MAT_ID || node->op == GGML_OP_CONT || node->op == GGML_OP_DUP) {
            use_cuda_graph = false; // This node type is not supported by CUDA graph capture
#ifndef NDEBUG
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
#endif
        }


@@ -71,6 +71,8 @@
#define cudaLaunchHostFunc hipLaunchHostFunc
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMallocManaged hipMallocManaged
+#define cudaMemAdvise hipMemAdvise
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync


@@ -402,6 +402,13 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H192,
    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128,
    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H96,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H96,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H96,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H96,
    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,
    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128,
    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128,

@@ -1059,6 +1066,13 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H192, flash_attn_ext_q8_0_h192, has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_HK192_HV128, flash_attn_ext_q8_0_hk192_hv128, has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_Q8_0_H256, flash_attn_ext_q8_0_h256, has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96, flash_attn_ext_vec_f16_h96, has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96, flash_attn_ext_vec_bf16_h96, has_simdgroup_reduction && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96, flash_attn_ext_vec_q4_0_h96, has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H96, flash_attn_ext_vec_q4_1_h96, has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H96, flash_attn_ext_vec_q5_0_h96, has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H96, flash_attn_ext_vec_q5_1_h96, has_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H96, flash_attn_ext_vec_q8_0_h96, has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, has_simdgroup_reduction);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H128, flash_attn_ext_vec_bf16_h128, has_simdgroup_reduction && use_bfloat);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H128, flash_attn_ext_vec_q4_0_h128, has_simdgroup_reduction);

@@ -3843,7 +3857,7 @@ static void ggml_metal_encode_node(
            // TODO: add vec kernels for (ne00%64 == 0) and maybe also for (ne00%32 == 0)
            //       for now avoiding mainly to keep the number of templates/kernels a bit lower
            //       these are now trivial to add after: https://github.com/ggml-org/llama.cpp/pull/12612
-            if (ne01 >= 4 || (ne00%128 != 0 && ne00 != 192)) {
+            if (ne01 >= 4 || (ne00%128 != 0 && ne00 != 96 && ne00 != 192)) {
                switch (src1->type) {
                    case GGML_TYPE_F16:
                        {

@@ -4010,6 +4024,24 @@ static void ggml_metal_encode_node(
                use_vec_kernel = true;

                switch (ne00) {
+                    case 96:
+                        {
+                            switch (src1->type) {
+                                case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H96].pipeline;  break;
+                                case GGML_TYPE_BF16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_BF16_H96].pipeline; break;
+                                case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_0_H96].pipeline; break;
+                                case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q4_1_H96].pipeline; break;
+                                case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_0_H96].pipeline; break;
+                                case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q5_1_H96].pipeline; break;
+                                case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_Q8_0_H96].pipeline; break;
+                                default:
+                                    {
+                                        GGML_LOG_ERROR("unsupported type: %d\n", src1->type);
+                                        GGML_LOG_ERROR("add template specialization for this type\n");
+                                        GGML_ABORT("add template specialization for this type");
+                                    }
+                            }
+                        } break;
                    case 128:
                        {
                            switch (src1->type) {


@@ -3959,6 +3959,16 @@ kernel void kernel_flash_attn_ext_vec(
typedef decltype(kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 128, 128, 4>) flash_attn_ext_vec_t;

+template [[host_name("kernel_flash_attn_ext_vec_f16_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 96, 96, 4>;
+#if defined(GGML_METAL_USE_BF16)
+template [[host_name("kernel_flash_attn_ext_vec_bf16_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 96, 96, 4>;
+#endif
+template [[host_name("kernel_flash_attn_ext_vec_q4_0_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_0, 8, dequantize_q4_0_t4, block_q4_0, 8, dequantize_q4_0_t4, 96, 96, 4>;
+template [[host_name("kernel_flash_attn_ext_vec_q4_1_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q4_1, 8, dequantize_q4_1_t4, block_q4_1, 8, dequantize_q4_1_t4, 96, 96, 4>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_0_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_0, 8, dequantize_q5_0_t4, block_q5_0, 8, dequantize_q5_0_t4, 96, 96, 4>;
+template [[host_name("kernel_flash_attn_ext_vec_q5_1_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q5_1, 8, dequantize_q5_1_t4, block_q5_1, 8, dequantize_q5_1_t4, 96, 96, 4>;
+template [[host_name("kernel_flash_attn_ext_vec_q8_0_h96")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, block_q8_0, 8, dequantize_q8_0_t4, block_q8_0, 8, dequantize_q8_0_t4, 96, 96, 4>;
+
template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, half4, 1, dequantize_f16_t4, half4, 1, dequantize_f16_t4, 128, 128, 4>;
#if defined(GGML_METAL_USE_BF16)
template [[host_name("kernel_flash_attn_ext_vec_bf16_h128")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec<FA_TYPES, bfloat4, 1, dequantize_bf16_t4, bfloat4, 1, dequantize_bf16_t4, 128, 128, 4>;


@@ -54,16 +54,41 @@ function(ggml_opencl_add_kernel KNAME)
endfunction()

set(GGML_OPENCL_KERNELS
-    ggml-opencl
-    ggml-opencl_mm
-    ggml-opencl_cvt
-    ggml-opencl_gemv_noshuffle
-    ggml-opencl_gemv_noshuffle_general
-    ggml-opencl_mul_mat_Ab_Bi_8x4
-    ggml-opencl_transpose_16
-    ggml-opencl_transpose_32
-    ggml-opencl_transpose_32_16
-    ggml-opencl_im2col
+    add
+    clamp
+    cpy
+    cvt
+    diag_mask_inf
+    gelu
+    gemv_noshuffle_general
+    gemv_noshuffle
+    get_rows
+    im2col_f32
+    im2col_f16
+    mul_mat_Ab_Bi_8x4
+    mul_mv_f16_f16
+    mul_mv_f16_f32_1row
+    mul_mv_f16_f32_l4
+    mul_mv_f16_f32
+    mul_mv_f32_f32
+    mul_mv_q4_0_f32
+    mul_mv_q4_0_f32_v
+    mul_mv_q4_0_f32_8x_flat
+    mul_mv_q4_0_f32_1d_8x_flat
+    mul_mv_q4_0_f32_1d_16x_flat
+    mul_mv_q6_k
+    mul
+    norm
+    relu
+    rms_norm
+    rope
+    scale
+    silu
+    softmax_4_f32
+    softmax_4_f16
+    softmax_f32
+    softmax_f16
+    transpose
)

foreach (K ${GGML_OPENCL_KERNELS})

File diff suppressed because it is too large.


@@ -0,0 +1,83 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//------------------------------------------------------------------------------
// add
//------------------------------------------------------------------------------
// general-purpose kernel for addition of two tensors
// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3
// cons: not very efficient
kernel void kernel_add(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global char * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
int ne13,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
int ne1,
int ne2,
int ne3,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = src0 + offset0;
src1 = src1 + offset1;
dst = dst + offsetd;
int i03 = get_group_id(2);
int i02 = get_group_id(1);
int i01 = get_group_id(0);
int i13 = i03 % ne13;
int i12 = i02 % ne12;
int i11 = i01 % ne11;
global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
const int i10 = i0 % ne10;
*((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) + *((global float *)(src1_ptr + i10*nb10));
}
}
// assumption: src1 is a row
// broadcast src1 into src0
kernel void kernel_add_row(
global float4 * src0,
ulong offset0,
global float4 * src1,
ulong offset1,
global float4 * dst,
ulong offsetd,
int ne
) {
src0 = (global float4*)((global char*)src0 + offset0);
src1 = (global float4*)((global char*)src1 + offset1);
dst = (global float4*)((global char*)dst + offsetd);
// This performs better than using %.
uint gid = get_global_id(0);
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
dst[gid] = src0[gid] + src1[idx1];
}
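
A side note on kernel_add_row above: the expression gid - (gid/ne)*ne is just an integer-division form of gid % ne, which the comment says performs better than % on the targeted GPUs. A minimal Python sketch of the row-broadcast indexing it implements (sizes are arbitrary):

    ne = 4                                    # row length (arbitrary for the example)
    src0 = [float(i) for i in range(3 * ne)]  # a [3, ne] tensor, flattened
    src1 = [10.0 * i for i in range(ne)]      # the single row being broadcast

    dst = [0.0] * len(src0)
    for gid in range(len(src0)):              # one work-item per element
        idx1 = gid - (gid // ne) * ne         # same value as gid % ne
        dst[gid] = src0[gid] + src1[idx1]

    assert all(dst[g] == src0[g] + src1[g % ne] for g in range(len(src0)))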


@@ -0,0 +1,20 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//------------------------------------------------------------------------------
// clamp
//------------------------------------------------------------------------------
kernel void kernel_clamp(
global float * src0,
ulong offset0,
global float * dst,
ulong offsetd,
float min,
float max
) {
src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
dst[get_global_id(0)] = src0[get_global_id(0)] < min ?
min :
(src0[get_global_id(0)] > max ? max : src0[get_global_id(0)]);
}


@@ -0,0 +1,184 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//------------------------------------------------------------------------------
// cpy
//------------------------------------------------------------------------------
kernel void kernel_cpy_f16_f16(
global half * src0,
ulong offset0,
global half * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne0,
int ne1,
int ne2,
int ne3,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = (global half*)((global char*)src0 + offset0);
dst = (global half*)((global char*)dst + offsetd);
int i03 = get_group_id(2);
int i02 = get_group_id(1);
int i01 = get_group_id(0);
int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
int i3 = n / (ne2*ne1*ne0);
int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
global half * dst_data = (global half *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
global const half * src = (global half *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
dst_data[i00] = src[0];
}
}
kernel void kernel_cpy_f16_f32(
global half * src0,
ulong offset0,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne0,
int ne1,
int ne2,
int ne3,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = (global half*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
int i03 = get_group_id(2);
int i02 = get_group_id(1);
int i01 = get_group_id(0);
int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
int i3 = n / (ne2*ne1*ne0);
int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
global float * dst_data = (global float *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
global half * src = (global half *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
dst_data[i00] = src[0];
}
}
kernel void kernel_cpy_f32_f16(
global float * src0,
ulong offset0,
global half * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne0,
int ne1,
int ne2,
int ne3,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = (global float*)((global char*)src0 + offset0);
dst = (global half*)((global char*)dst + offsetd);
int i03 = get_group_id(2);
int i02 = get_group_id(1);
int i01 = get_group_id(0);
int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
int i3 = n / (ne2*ne1*ne0);
int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
global half * dst_data = (global half *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
global const float * src = (global float *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
dst_data[i00] = src[0];
}
}
kernel void kernel_cpy_f32_f32(
global float * src0,
ulong offset0,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne0,
int ne1,
int ne2,
int ne3,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
int i03 = get_group_id(2);
int i02 = get_group_id(1);
int i01 = get_group_id(0);
int n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
int i3 = n / (ne2*ne1*ne0);
int i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
int i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
int i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
global float * dst_data = (global float *) ((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
global const float * src = (global float *)((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
dst_data[i00] = src[0];
}
}


@@ -1,39 +1,20 @@
//------------------------------------------------------------------------------
-// This file is contains additional kernels for data conversion.
+// This file is contains kernels for data conversion.
// These kernels are used when loading the model, so its performance is less
// important.
//------------------------------------------------------------------------------
-#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#elif defined(cl_amd_fp16)
-#pragma OPENCL EXTENSION cl_amd_fp16 : enable
-#else
-#error "Half precision floating point not supportedby OpenCL implementation on your device."
-#endif
-
-#ifdef cl_khr_subgroups
-#pragma OPENCL EXTENSION cl_khr_subgroups : enable
-#elif defined(cl_intel_subgroups)
-#pragma OPENCL EXTENSION cl_intel_subgroups : enable
-#else
-#error "Subgroup not supported on your device."
-#endif

#ifdef cl_intel_required_subgroup_size
-// Always use subgroup size of 32 on Intel.
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
-// Always use subgroups size of 64 on Adreno.
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
-#else
-// TODO: do not know how to choose subgroup size on other GPUs.
-#error "Selecting subgroup size is not supported on your device."
#endif

#define QK4_0 32
@@ -66,13 +47,44 @@
};

//------------------------------------------------------------------------------
-// mul_vec_q_n_f32_flat_noshuffle
-//
-// This variation uses flat arrays (struct of arrays, SOA) representation for
-// quant tensors. It also uses non shuffled bit order for weights.
-//
-// The shuffled version is kept in the original file because moving it here
-// seems to result in worse performance for adreno.
+// kernel_convert_block_q4_0
+// Convert the block_q4_0 format to 2 separate arrays (AOS -> SOA).
+// This kernel does not deshuffle the bits.
+//------------------------------------------------------------------------------
+kernel void kernel_convert_block_q4_0(
+    global struct block_q4_0 * src0,
+    global uchar * dst_q,
+    global half * dst_d
+) {
+    global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0);
+    global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0);
+    global half * d = (global half *) dst_d + get_global_id(0);
+
+    *d = b->d;
+
+    for (int i = 0; i < QK4_0/2; ++i) {
+        q[i] = b->qs[i];
+    }
+}
+
+kernel void kernel_restore_block_q4_0(
+    global uchar * src_q,
+    global half * src_d,
+    global struct block_q4_0 * dst
+) {
+    global struct block_q4_0 * b = (global struct block_q4_0 *) dst + get_global_id(0);
+    global uchar * q = (global uchar *) src_q + QK4_0/2*get_global_id(0);
+    global half * d = (global half *) src_d + get_global_id(0);
+
+    b->d = *d;
+
+    for (int i = 0; i < QK4_0/2; ++i) {
+        b->qs[i] = q[i];
+    }
+}
+
+//------------------------------------------------------------------------------
+// kernel_convert_block_q4_0_noshuffle
+// Flatten q4_0 weights and unshuffle the bits
//------------------------------------------------------------------------------
kernel void kernel_convert_block_q4_0_noshuffle(


@@ -0,0 +1,58 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//------------------------------------------------------------------------------
// diag_mask_inf kernels
//------------------------------------------------------------------------------
kernel void kernel_diag_mask_inf(
global float * src0,
ulong offset0,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int n_past
) {
src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
int i02 = get_global_id(2);
int i01 = get_global_id(1);
int i00 = get_global_id(0);
if (i00 > n_past + i01) {
dst[i02*ne01*ne00 + i01*ne00 + i00] = -INFINITY;
} else {
dst[i02*ne01*ne00 + i01*ne00 + i00] = src0[i02*ne01*ne00 + i01*ne00 + i00];
}
}
kernel void kernel_diag_mask_inf_8(
global float4 * src0,
ulong offset0,
global float4 * dst,
ulong offsetd,
int ne00,
int ne01,
int n_past
) {
src0 = (global float4*)((global char*)src0 + offset0);
dst = (global float4*)((global char*)dst + offsetd);
int i = 2*get_global_id(0);
dst[i+0] = src0[i+0];
dst[i+1] = src0[i+1];
int i4 = 4*i;
int i02 = i4/(ne00*ne01); i4 -= i02*ne00*ne01;
int i01 = i4/(ne00); i4 -= i01*ne00;
int i00 = i4;
for (int k = 3; k >= 0; --k) {
if (i00 + 4 + k <= n_past + i01) {
break;
}
(&dst[i+1])[k] = -INFINITY;
if (i00 + k > n_past + i01) {
(&dst[i])[k] = -INFINITY;
}
}
}


@@ -0,0 +1,62 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//------------------------------------------------------------------------------
// gelu
//------------------------------------------------------------------------------
#define GELU_COEF_A 0.044715f
#define GELU_QUICK_COEF -1.702f
#define SQRT_2_OVER_PI 0.79788456080286535587989211986876f
kernel void kernel_gelu(
global float * src0,
ulong offset0,
global float * dst,
ulong offsetd
) {
src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
float x = src0[get_global_id(0)];
dst[get_global_id(0)] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}
kernel void kernel_gelu_4(
global float4 * src0,
ulong offset0,
global float4 * dst,
ulong offsetd
) {
src0 = (global float4*)((global char*)src0 + offset0);
dst = (global float4*)((global char*)dst + offsetd);
float4 x = src0[get_global_id(0)];
dst[get_global_id(0)] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}
kernel void kernel_gelu_quick(
global float * src0,
ulong offset0,
global float * dst,
ulong offsetd
) {
src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
float x = src0[get_global_id(0)];
dst[get_global_id(0)] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x)));
}
kernel void kernel_gelu_quick_4(
global float4 * src0,
ulong offset0,
global float4 * dst,
ulong offsetd
) {
src0 = (global float4*)((global char*)src0 + offset0);
dst = (global float4*)((global char*)dst + offsetd);
float4 x = src0[get_global_id(0)];
dst[get_global_id(0)] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x)));
}


@@ -0,0 +1,163 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;
#define QK4_0 32
//------------------------------------------------------------------------------
// block_q4_0
//------------------------------------------------------------------------------
struct block_q4_0
{
half d;
uint8_t qs[QK4_0 / 2];
};
//------------------------------------------------------------------------------
// dequantize_q4_0_f32, dequantize_q4_0_f16
//------------------------------------------------------------------------------
void dequantize_q4_0_f32(global struct block_q4_0 * xb, short il, float16 * reg) {
global ushort * qs = ((global ushort *)xb + 1);
float d1 = il ? (xb->d / 16.h) : xb->d;
float d2 = d1 / 256.f;
float md = -8.h * xb->d;
ushort mask0 = il ? 0x00F0 : 0x000F;
ushort mask1 = mask0 << 8;
reg->s0 = d1 * (qs[0] & mask0) + md;
reg->s1 = d2 * (qs[0] & mask1) + md;
reg->s2 = d1 * (qs[1] & mask0) + md;
reg->s3 = d2 * (qs[1] & mask1) + md;
reg->s4 = d1 * (qs[2] & mask0) + md;
reg->s5 = d2 * (qs[2] & mask1) + md;
reg->s6 = d1 * (qs[3] & mask0) + md;
reg->s7 = d2 * (qs[3] & mask1) + md;
reg->s8 = d1 * (qs[4] & mask0) + md;
reg->s9 = d2 * (qs[4] & mask1) + md;
reg->sa = d1 * (qs[5] & mask0) + md;
reg->sb = d2 * (qs[5] & mask1) + md;
reg->sc = d1 * (qs[6] & mask0) + md;
reg->sd = d2 * (qs[6] & mask1) + md;
reg->se = d1 * (qs[7] & mask0) + md;
reg->sf = d2 * (qs[7] & mask1) + md;
}
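
The mask arithmetic in dequantize_q4_0_f32 above is an unrolled form of the usual Q4_0 rule: each 4-bit value q in a 32-element block is restored as d * (q - 8), with d1/d2 folding the nibble and byte shifts into the scale and md supplying the -8*d offset. A small Python sketch of that underlying formula, using synthetic values (the lane ordering of the real kernel is not reproduced here):

    QK4_0 = 32
    d  = 0.25                         # stands in for the half-precision scale block_q4_0.d
    qs = bytes(range(QK4_0 // 2))     # 16 packed bytes (synthetic)

    # low nibbles hold elements 0..15, high nibbles elements 16..31
    lo = [d * ((b & 0x0F) - 8) for b in qs]
    hi = [d * ((b >> 4) - 8) for b in qs]
    weights = lo + hi
    assert len(weights) == QK4_0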
//------------------------------------------------------------------------------
// get_rows
//------------------------------------------------------------------------------
kernel void kernel_get_rows_f32(
global void * src0,
ulong offset0,
global int * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
ulong nb01,
ulong nb02,
int ne10,
ulong nb10,
ulong nb11,
ulong nb1,
ulong nb2
) {
src0 = (global void*)((global char*)src0 + offset0);
src1 = (global int*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
int i10 = get_group_id(0);
int i11 = get_group_id(1);
int r = ((global int *) ((global char *) src1 + i11*nb11 + i10*nb10))[0];
int i02 = i11;
for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) {
((global float *) ((global char *) dst + i11*nb2 + i10*nb1))[ind] =
((global float *) ((global char *) src0 + r*nb01 + i02*nb02))[ind];
}
}
kernel void kernel_get_rows_f16(
global void * src0,
ulong offset0,
global int * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
ulong nb01,
ulong nb02,
int ne10,
ulong nb10,
ulong nb11,
ulong nb1,
ulong nb2
) {
src0 = (global void*)((global char*)src0 + offset0);
src1 = (global int*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
int i10 = get_group_id(0);
int i11 = get_group_id(1);
int r = ((global int32_t *) ((global char *) src1 + i11*nb11 + i10*nb10))[0];
int i02 = i11;
for (int ind = get_local_id(0); ind < ne00; ind += get_local_size(0)) {
((global float *) ((global char *) dst + i11*nb2 + i10*nb1))[ind] =
((global half *) ((global char *) src0 + r*nb01 + i02*nb02))[ind];
}
}
kernel void kernel_get_rows_q4_0(
global void * src0,
ulong offset0,
global int * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
ulong nb01,
ulong nb02,
int ne10,
ulong nb10,
ulong nb11,
ulong nb1,
ulong nb2
) {
src0 = (global void*)((global char*)src0 + offset0);
src1 = (global int*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
const int NL = 2;
int i10 = get_group_id(0);
int i11 = get_group_id(1);
int r = ((global int32_t *) ((global char *) src1 + i11*nb11 + i10*nb10))[0];
int i02 = i11;
for (int ind = get_local_id(0); ind < ne00/16; ind += get_local_size(0)) {
float16 temp;
dequantize_q4_0_f32(
((global struct block_q4_0 *) ((global char *) src0 + r*nb01 + i02*nb02)) + ind/NL, ind%NL, &temp);
*(((global float16 *) ((global char *) dst + i11*nb2 + i10*nb1)) + ind) = temp;
}
}

File diff suppressed because it is too large.


@@ -1,146 +0,0 @@
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#elif defined(cl_amd_fp16)
#pragma OPENCL EXTENSION cl_amd_fp16 : enable
#else
#error "Half precision floating point not supportedby OpenCL implementation on your device."
#endif
#ifdef cl_khr_subgroups
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#elif defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#error "Subgroup not supported on your device."
#endif
#ifdef cl_intel_required_subgroup_size
// Always use subgroup size of 32 on Intel.
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
// Always use subgroups size of 64 on Adreno.
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#else
// TODO: do not know how to choose subgroup size on other GPUs.
#error "Selecting subgroup size is not supported on your device."
#endif
kernel void kernel_im2col_f32(
global float * src1,
ulong offset1,
global float * dst,
ulong offsetd,
ulong batch_offset,
ulong delta_offset,
long IW,
long IH,
long IC,
long OW,
long OH,
long KW,
long KH,
long pelements,
long CHW,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1
) {
// threadIdx.x + blockIdx.x * blockDim.x
long i = get_global_id(0);
if (i >= pelements) {
return;
}
src1 = (global float*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
long ksize = OW * (KH > 1 ? KW : 1);
long kx = i / ksize;
long kd = kx * ksize;
long ky = (i - kd) / OW;
long ix = i % OW;
long oh = get_group_id(1);
long batch = get_group_id(2) / IC;
long ic = get_group_id(2) % IC;
long iiw = ix * s0 + kx * d0 - p0;
long iih = oh * s1 + ky * d1 - p1;
long offset_dst =
((batch * OH + oh) * OW + ix) * CHW +
(ic * (KW * KH) + ky * KW + kx);
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
dst[offset_dst] = 0.0f;
} else {
long offset_src = ic * delta_offset + batch * batch_offset;
dst[offset_dst] = src1[offset_src + iih * IW + iiw];
}
}
kernel void kernel_im2col_f16(
global float * src1,
ulong offset1,
global half * dst,
ulong offsetd,
ulong batch_offset,
ulong delta_offset,
long IW,
long IH,
long IC,
long OW,
long OH,
long KW,
long KH,
long pelements,
long CHW,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1
) {
long i = get_global_id(0);
if (i >= pelements) {
return;
}
src1 = (global float*)((global char*)src1 + offset1);
dst = (global half*)((global char*)dst + offsetd);
long ksize = OW * (KH > 1 ? KW : 1);
long kx = i / ksize;
long kd = kx * ksize;
long ky = (i - kd) / OW;
long ix = i % OW;
long oh = get_group_id(1);
long batch = get_group_id(2) / IC;
long ic = get_group_id(2) % IC;
long iiw = ix * s0 + kx * d0 - p0;
long iih = oh * s1 + ky * d1 - p1;
long offset_dst =
((batch * OH + oh) * OW + ix) * CHW +
(ic * (KW * KH) + ky * KW + kx);
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
dst[offset_dst] = 0.0f;
} else {
long offset_src = ic * delta_offset + batch * batch_offset;
dst[offset_dst] = src1[offset_src + iih * IW + iiw];
}
}

File diff suppressed because it is too large.


@@ -1,26 +0,0 @@
// 16-bit transpose, loading/storing a 4x4 tile of elements
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
kernel void kernel_transpose_16(
__read_only image1d_buffer_t input,
__write_only image1d_buffer_t output,
const uint rows,
const uint cols
) {
const int i = get_global_id(0);
const int j = get_global_id(1);
const int i_2 = i<<2;
const int j_2 = j<<2;
half4 temp0 = read_imageh(input, (j_2+0)*cols+i);
half4 temp1 = read_imageh(input, (j_2+1)*cols+i);
half4 temp2 = read_imageh(input, (j_2+2)*cols+i);
half4 temp3 = read_imageh(input, (j_2+3)*cols+i);
write_imageh(output, (i_2+0)*rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
write_imageh(output, (i_2+1)*rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
write_imageh(output, (i_2+2)*rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
write_imageh(output, (i_2+3)*rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
}


@@ -1,25 +0,0 @@
// 32-bit transpose, loading/storing a 4x4 tile of elements
kernel void kernel_transpose_32(
__read_only image1d_buffer_t input,
__write_only image1d_buffer_t output,
const uint rows,
const uint cols
) {
const int i = get_global_id(0);
const int j = get_global_id(1);
const int i_2 = i<<2;
const int j_2 = j<<2;
float4 temp0 = read_imagef(input, (j_2+0)*cols+i);
float4 temp1 = read_imagef(input, (j_2+1)*cols+i);
float4 temp2 = read_imagef(input, (j_2+2)*cols+i);
float4 temp3 = read_imagef(input, (j_2+3)*cols+i);
write_imagef(output, (i_2+0)*rows+j, (float4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
write_imagef(output, (i_2+1)*rows+j, (float4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
write_imagef(output, (i_2+2)*rows+j, (float4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
write_imagef(output, (i_2+3)*rows+j, (float4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
}


@@ -1,35 +0,0 @@
// 32-bit transpose, loading/storing a 4x4 tile of elements
// Only used for activations
// converts to FP16
// also adds zero padding for non multiple of 8 prompt lengths
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
kernel void kernel_transpose_32_16(__read_only image1d_buffer_t input, __write_only image1d_buffer_t output, const uint rows, const uint cols, const uint padded_rows) {
const int i = get_global_id(0);
const int j = get_global_id(1);
const int i_2 = i<<2;
const int j_2 = j<<2;
half4 temp0 = {0,0,0,0}; // initialize outputs to 0
half4 temp1 = {0,0,0,0};
half4 temp2 = {0,0,0,0};
half4 temp3 = {0,0,0,0};
if((j_2+0)*cols+i*4+3 < rows*cols*16){ // only load from a valid location. Otherwise keep register data as 0
temp0 = read_imageh(input, (j_2+0)*cols+i);
}
if((j_2+1)*cols+i*4+3 < rows*cols*16){
temp1 = read_imageh(input, (j_2+1)*cols+i);
}
if((j_2+2)*cols+i*4+3 < rows*cols*16){
temp2 = read_imageh(input, (j_2+2)*cols+i);
}
if((j_2+3)*cols+i*4+3 < rows*cols*16){
temp3 = read_imageh(input, (j_2+3)*cols+i);
}
write_imageh(output, (i_2+0)*padded_rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0)); // no conditionals for output, includes zero padding
write_imageh(output, (i_2+1)*padded_rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
write_imageh(output, (i_2+2)*padded_rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
write_imageh(output, (i_2+3)*padded_rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
}


@@ -0,0 +1,57 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
kernel void kernel_im2col_f16(
global float * src1,
ulong offset1,
global half * dst,
ulong offsetd,
ulong batch_offset,
ulong delta_offset,
long IW,
long IH,
long IC,
long OW,
long OH,
long KW,
long KH,
long pelements,
long CHW,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1
) {
long i = get_global_id(0);
if (i >= pelements) {
return;
}
src1 = (global float*)((global char*)src1 + offset1);
dst = (global half*)((global char*)dst + offsetd);
long ksize = OW * (KH > 1 ? KW : 1);
long kx = i / ksize;
long kd = kx * ksize;
long ky = (i - kd) / OW;
long ix = i % OW;
long oh = get_group_id(1);
long batch = get_group_id(2) / IC;
long ic = get_group_id(2) % IC;
long iiw = ix * s0 + kx * d0 - p0;
long iih = oh * s1 + ky * d1 - p1;
long offset_dst =
((batch * OH + oh) * OW + ix) * CHW +
(ic * (KW * KH) + ky * KW + kx);
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
dst[offset_dst] = 0.0f;
} else {
long offset_src = ic * delta_offset + batch * batch_offset;
dst[offset_dst] = src1[offset_src + iih * IW + iiw];
}
}


@@ -0,0 +1,57 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
kernel void kernel_im2col_f32(
global float * src1,
ulong offset1,
global float * dst,
ulong offsetd,
ulong batch_offset,
ulong delta_offset,
long IW,
long IH,
long IC,
long OW,
long OH,
long KW,
long KH,
long pelements,
long CHW,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1
) {
long i = get_global_id(0);
if (i >= pelements) {
return;
}
src1 = (global float*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
long ksize = OW * (KH > 1 ? KW : 1);
long kx = i / ksize;
long kd = kx * ksize;
long ky = (i - kd) / OW;
long ix = i % OW;
long oh = get_group_id(1);
long batch = get_group_id(2) / IC;
long ic = get_group_id(2) % IC;
long iiw = ix * s0 + kx * d0 - p0;
long iih = oh * s1 + ky * d1 - p1;
long offset_dst =
((batch * OH + oh) * OW + ix) * CHW +
(ic * (KW * KH) + ky * KW + kx);
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
dst[offset_dst] = 0.0f;
} else {
long offset_src = ic * delta_offset + batch * batch_offset;
dst[offset_dst] = src1[offset_src + iih * IW + iiw];
}
}


@@ -0,0 +1,79 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//------------------------------------------------------------------------------
// mul
//------------------------------------------------------------------------------
kernel void kernel_mul(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global char * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
int ne13,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
int ne1,
int ne2,
int ne3,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3
) {
src0 = src0 + offset0;
src1 = src1 + offset1;
dst = dst + offsetd;
int i03 = get_group_id(2);
int i02 = get_group_id(1);
int i01 = get_group_id(0);
int i13 = i03 % ne13;
int i12 = i02 % ne12;
int i11 = i01 % ne11;
global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
const int i10 = i0 % ne10;
*((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) * *((global float *)(src1_ptr + i10*nb10));
}
}
// assumption: src1 is a row
// broadcast src1 into src0
kernel void kernel_mul_row(
global float4 * src0,
ulong offset0,
global float4 * src1,
ulong offset1,
global float4 * dst,
ulong offsetd,
int ne
) {
src0 = (global float4*)((global char*)src0 + offset0);
src1 = (global float4*)((global char*)src1 + offset1);
dst = (global float4*)((global char*)dst + offsetd);
// This performs better than using %.
uint gid = get_global_id(0);
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
dst[gid] = src0[gid] * src1[idx1];
}


@@ -0,0 +1,118 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define N_F16_F16 4
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_f16_f16(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
int ne1,
int r2,
int r3)
{
src0 = (global char*)((global char*)src0 + offset0);
src1 = (global char*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
int r0 = get_group_id(0);
int rb = get_group_id(1)*N_F16_F16;
int im = get_group_id(2);
int i12 = im%ne12;
int i13 = im/ne12;
ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
global half * x = (global half *) (src0 + offset_src0);
if (ne00 < 128) {
for (int row = 0; row < N_F16_F16; ++row) {
int r1 = rb + row;
if (r1 >= ne11) {
break;
}
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global half * y = (global half *) (src1 + offset_src1);
float sumf = 0;
for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
sumf += (half) x[i] * (half) y[i];
}
float all_sum = sub_group_reduce_add(sumf);
if (get_sub_group_local_id() == 0) {
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
} else {
global half4 * x4 = (global half4 *)x;
for (int row = 0; row < N_F16_F16; ++row) {
int r1 = rb + row;
if (r1 >= ne11) {
break;
}
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global half * y = (global half *) (src1 + offset_src1);
global half4 * y4 = (global half4 *) y;
float sumf = 0;
for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
sumf += (half) x4[i].s0 * y4[i].s0;
sumf += (half) x4[i].s1 * y4[i].s1;
sumf += (half) x4[i].s2 * y4[i].s2;
sumf += (half) x4[i].s3 * y4[i].s3;
}
float all_sum = sub_group_reduce_add(sumf);
if (get_sub_group_local_id() == 0) {
for (int i = 4*(ne00/4); i < ne00; ++i) {
all_sum += (half) x[i] * y[i];
}
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
}
}
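
A note on the vectorized branch of kernel_mul_mat_f16_f16 above: each subgroup lane accumulates half4 chunks, the partial sums are combined with sub_group_reduce_add, and lane 0 adds the ne00 % 4 tail before storing the dot product. A small Python sketch of that split between the vector body and the scalar tail (sizes are arbitrary; plain Python stands in for the subgroup):

    x = [float(i) for i in range(10)]   # one row of src0 (ne00 = 10, not a multiple of 4)
    y = [0.5] * len(x)                  # one row of src1

    body = 4 * (len(x) // 4)            # elements covered by the half4 loop
    sumf = sum(x[i] * y[i] for i in range(body))
    sumf += sum(x[i] * y[i] for i in range(body, len(x)))   # tail, as lane 0 does

    assert sumf == sum(a * b for a, b in zip(x, y))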


@@ -0,0 +1,118 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define N_F16_F32 4
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_f16_f32(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
int ne1,
int r2,
int r3
) {
src0 = (global char*)((global char*)src0 + offset0);
src1 = (global char*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
int r0 = get_group_id(0);
int rb = get_group_id(1)*N_F16_F32;
int im = get_group_id(2);
int i12 = im%ne12;
int i13 = im/ne12;
ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
global half * x = (global half *) (src0 + offset_src0);
if (ne00 < 128) {
for (int row = 0; row < N_F16_F32; ++row) {
int r1 = rb + row;
if (r1 >= ne11) {
break;
}
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global float * y = (global float *) (src1 + offset_src1);
float sumf = 0;
for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
sumf += convert_float(x[i]) * y[i];
}
float all_sum = sub_group_reduce_add(sumf);
if (get_sub_group_local_id() == 0) {
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
} else {
global half4 * x4 = (global half4 *)x;
for (int row = 0; row < N_F16_F32; ++row) {
int r1 = rb + row;
if (r1 >= ne11) {
break;
}
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global float * y = (global float *) (src1 + offset_src1);
global float4 * y4 = (global float4 *) y;
float sumf = 0;
for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
sumf += convert_float(x4[i].s0) * y4[i].s0;
sumf += convert_float(x4[i].s1) * y4[i].s1;
sumf += convert_float(x4[i].s2) * y4[i].s2;
sumf += convert_float(x4[i].s3) * y4[i].s3;
}
float all_sum = sub_group_reduce_add(sumf);
if (get_sub_group_local_id() == 0) {
for (int i = 4*(ne00/4); i < ne00; ++i) {
all_sum += (float) x[i] * y[i];
}
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
}
}

View file

@ -0,0 +1,94 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_f16_f32_1row(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
int ne1,
int r2,
int r3
) {
src0 = (global char*)((global char*)src0 + offset0);
src1 = (global char*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
int r0 = get_group_id(0);
int r1 = get_group_id(1);
int im = get_group_id(2);
int i12 = im%ne12;
int i13 = im/ne12;
ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global half * x = (global half *) (src0 + offset_src0);
global float * y = (global float *) (src1 + offset_src1);
float sumf = 0;
if (ne00 < 128) {
for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
sumf += (float) x[i] * (float) y[i];
}
float all_sum = sub_group_reduce_add(sumf);
if (get_sub_group_local_id() == 0) {
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
} else {
global half4 * x4 = (global half4 *) x;
global float4 * y4 = (global float4 *) y;
for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
sumf += (float) x4[i].s0 * y4[i].s0;
sumf += (float) x4[i].s1 * y4[i].s1;
sumf += (float) x4[i].s2 * y4[i].s2;
sumf += (float) x4[i].s3 * y4[i].s3;
}
float all_sum = sub_group_reduce_add(sumf);
if (get_sub_group_local_id() == 0) {
for (int i = 4*(ne00/4); i < ne00; ++i) {
all_sum += (float) x[i] * y[i];
}
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
}

View file

@ -0,0 +1,84 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
// Assumes row size (ne00) is a multiple of 4
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_f16_f32_l4(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
int ne1,
int r2,
int r3
) {
src0 = (global char*)((global char*)src0 + offset0);
src1 = (global char*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
int nrows = ne11;
int r0 = get_group_id(0);
int im = get_group_id(2);
int i12 = im%ne12;
int i13 = im/ne12;
ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
global half4 * x4 = (global half4 *) (src0 + offset_src0);
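// Each workgroup handles one src0 row (r0) and loops over all ne11 rows of src1;
// ne00 is assumed to be a multiple of 4 (see the note above), so no scalar tail is needed.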
for (int r1 = 0; r1 < nrows; ++r1) {
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global float4 * y4 = (global float4 *) (src1 + offset_src1);
float sumf = 0;
for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
sumf += convert_float(x4[i].s0) * y4[i].s0;
sumf += convert_float(x4[i].s1) * y4[i].s1;
sumf += convert_float(x4[i].s2) * y4[i].s2;
sumf += convert_float(x4[i].s3) * y4[i].s3;
}
float all_sum = sub_group_reduce_add(sumf);
if (get_sub_group_local_id() == 0) {
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
}

View file

@ -0,0 +1,118 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define N_F32_F32 4
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_f32_f32(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne10,
int ne11,
int ne12,
ulong nb10,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
int ne1,
int r2,
int r3
) {
src0 = (global char*)((global char*)src0 + offset0);
src1 = (global char*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
int r0 = get_group_id(0);
int rb = get_group_id(1)*N_F32_F32;
int im = get_group_id(2);
int i12 = im%ne12;
int i13 = im/ne12;
ulong offset_src0 = r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
global float * x = (global float *) (src0 + offset_src0);
if (ne00 < 128) {
for (int row = 0; row < N_F32_F32; ++row) {
int r1 = rb + row;
if (r1 >= ne11) {
break;
}
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global float * y = (global float *) (src1 + offset_src1);
float sumf = 0;
for (int i = get_sub_group_local_id(); i < ne00; i += get_max_sub_group_size()) {
sumf += (float) x[i] * (float) y[i];
}
float all_sum = sub_group_reduce_add(sumf);
if (get_sub_group_local_id() == 0) {
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
} else {
global float4 * x4 = (global float4 *)x;
for (int row = 0; row < N_F32_F32; ++row) {
int r1 = rb + row;
if (r1 >= ne11) {
break;
}
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global float * y = (global float *) (src1 + offset_src1);
global float4 * y4 = (global float4 *) y;
float sumf = 0;
for (int i = get_sub_group_local_id(); i < ne00/4; i += get_max_sub_group_size()) {
sumf += (float) x4[i].s0 * y4[i].s0;
sumf += (float) x4[i].s1 * y4[i].s1;
sumf += (float) x4[i].s2 * y4[i].s2;
sumf += (float) x4[i].s3 * y4[i].s3;
}
float all_sum = sub_group_reduce_add(sumf);
if (get_sub_group_local_id() == 0) {
for (int i = 4*(ne00/4); i < ne00; ++i) {
all_sum += (float) x[i] * y[i];
}
dst[im*ne1*ne0 + r1*ne0 + r0] = all_sum;
}
}
}
}

View file

@ -0,0 +1,192 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define QR4_1 2
#define QK5_0 32
#define QR5_0 2
#define QK5_1 32
#define QR5_1 2
#define QK8_0 32
#define QR8_0 1
#define QK_K 256
#define K_QUANTS_PER_ITERATION 2
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;
//------------------------------------------------------------------------------
// block_q4_0
//------------------------------------------------------------------------------
struct block_q4_0
{
half d;
uint8_t qs[QK4_0 / 2];
};
//------------------------------------------------------------------------------
// mul_vec_q_n_f32
//------------------------------------------------------------------------------
// function to calculate the inner product between half a q4_0 block and 16 floats (yl); sumy is SUM(yl[i])
// il indicates where the q4 quants begin (0 or QK4_0/4)
// we assume that the yl's have been multiplied with the appropriate scale factor
// that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
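// For illustration: each ushort read from qs packs four 4-bit quants at bit
// offsets 0, 4, 8 and 12; masking with 0x000F, 0x00F0, 0x0F00 and 0xF000 leaves
// those quants scaled by 1, 16, 256 and 4096 respectively, which the pre-scaled
// yl values (1, 1/16, 1/256, 1/4096) cancel out, while the -8 offset of q4_0 is
// folded into the single sumy * -8.f term.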
inline float block_q_4_0_dot_y(
global struct block_q4_0 * qb_curr,
float sumy,
private float * yl,
int il
) {
float d = qb_curr->d;
float2 acc = 0.f;
global ushort * qs = ((global ushort *)qb_curr + 1 + il/2);
for (int i = 0; i < 8; i+=2) {
acc.s0 += yl[i + 0] * (qs[i / 2] & 0x000F)
+ yl[i + 1] * (qs[i / 2] & 0x0F00);
acc.s1 += yl[i + 8] * (qs[i / 2] & 0x00F0)
+ yl[i + 9] * (qs[i / 2] & 0xF000);
}
return d * (sumy * -8.f + acc.s0 + acc.s1);
}
#ifdef INTEL_GPU
#define N_DST 4 // each SIMD group works on 4 rows
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
#elif defined (ADRENO_GPU)
#define N_DST 4
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 64
#endif
inline void mul_vec_q_n_f32(
global void * src0,
global float * src1,
global float * dst,
int ne00,
int ne01,
int ne02,
int ne10,
int ne12,
int ne0,
int ne1,
int r2,
int r3
) {
const ulong nb = ne00/QK4_0;
int r0 = get_group_id(0);
int r1 = get_group_id(1);
int im = get_group_id(2);
// (r0 * N_SIMDGROUP + get_sub_group_id()) is essentially the linear global
// id of a SIMD group in the grid.
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
int i12 = im%ne12;
int i13 = im/ne12;
ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0;
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
float yl[16]; // src1 vector cache
float sumf[N_DST]={0.f};
int ix = get_sub_group_local_id()/2;
int il = 8*(get_sub_group_local_id()%2);
global float * yb = y + ix * QK4_0 + il;
// each thread in a SIMD group deals with half a block.
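// e.g. on a 16-wide subgroup, lane 5 gets ix = 2, il = 8, so it reads elements
// 8..15 and 24..31 of the src1 slice for blocks 2, 10, 18, ...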
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
float sumy = 0;
for (int i = 0; i < 8; i += 2) {
sumy += yb[i] + yb[i+1];
yl[i+0] = yb[i+ 0];
yl[i+1] = yb[i+ 1]/256.f;
sumy += yb[i+16] + yb[i+17];
yl[i+8] = yb[i+16]/16.f;
yl[i+9] = yb[i+17]/4096.f;
}
for (int row = 0; row < N_DST; row++) {
sumf[row] += block_q_4_0_dot_y(x+ib+row*nb, sumy, yl, il);
}
// One thread in a SIMD group (i.e., subgroup) handles a half block,
// hence the entire SIMD group handles SIMDWIDTH/2 blocks.
// y points to the activation matrix (of type float). Therefore for
// one thread, the # of blocks y should advance is SIMDWIDTH/2 (because
// SIMDWIDTH/2 blocks are processed by a SIMD group) - in terms of
// floats, it is QK4_0 * (SIMDWIDTH/2), where QK4_0 is the block size.
yb += QK4_0 * (N_SIMDWIDTH/2);
}
// The above does not work for Adreno - it produces incorrect results for
// row = 1, 2, 3 and only row = 0 gives the correct result.
// If N_DST is changed, the below array must be initialized accordingly.
// This also seems to perform better on Intel.
float tot[N_DST] = {
sub_group_reduce_add(sumf[0]), sub_group_reduce_add(sumf[1]),
sub_group_reduce_add(sumf[2]), sub_group_reduce_add(sumf[3])};
for (int row = 0; row < N_DST; ++row) {
if (get_sub_group_local_id() == 0 && first_row + row < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot[row];
}
}
}
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_q4_0_f32(
global void * src0,
ulong offset0,
global float * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne10,
int ne12,
int ne0,
int ne1,
int r2,
int r3
) {
src0 = (global void*)((global char*)src0 + offset0);
src1 = (global float*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
mul_vec_q_n_f32(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
}

View file

@ -0,0 +1,307 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define QR4_1 2
#define QK5_0 32
#define QR5_0 2
#define QK5_1 32
#define QR5_1 2
#define QK8_0 32
#define QR8_0 1
#define QK_K 256
#define K_QUANTS_PER_ITERATION 2
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;
//------------------------------------------------------------------------------
// block_q4_0
//------------------------------------------------------------------------------
struct block_q4_0
{
half d;
uint8_t qs[QK4_0 / 2];
};
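// Note: in this 'flat' layout the block_q4_0 data is split into two arrays:
// src0_q holds the 4-bit quants (QK4_0/2 bytes per block) and src0_d holds the
// per-block half scales, so x and dh below point into those arrays rather than
// into a struct block_q4_0.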
inline float mm_block_q_4_0_dot_y_flat(
global uchar * x,
global half * dh,
float sumy,
float16 yl,
int il
) {
float d = *dh;
global ushort * qs = ((global ushort *)x + il/2);
float acc = 0.f;
acc += yl.s0 * (qs[0] & 0x000F);
acc += yl.s1 * (qs[0] & 0x0F00);
acc += yl.s8 * (qs[0] & 0x00F0);
acc += yl.s9 * (qs[0] & 0xF000);
acc += yl.s2 * (qs[1] & 0x000F);
acc += yl.s3 * (qs[1] & 0x0F00);
acc += yl.sa * (qs[1] & 0x00F0);
acc += yl.sb * (qs[1] & 0xF000);
acc += yl.s4 * (qs[2] & 0x000F);
acc += yl.s5 * (qs[2] & 0x0F00);
acc += yl.sc * (qs[2] & 0x00F0);
acc += yl.sd * (qs[2] & 0xF000);
acc += yl.s6 * (qs[3] & 0x000F);
acc += yl.s7 * (qs[3] & 0x0F00);
acc += yl.se * (qs[3] & 0x00F0);
acc += yl.sf * (qs[3] & 0xF000);
return d * (sumy * -8.f + acc);
}
#ifdef INTEL_GPU
#define N_DST 16 // each SIMD group works on 16 rows (in the weights matrix)
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
#elif defined (ADRENO_GPU)
#define N_DST 16
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 64
#endif
//
// This variant performs 1d blocking with 16x output.
// Each simdgroup outputs 16 values on the `n0` dim (rows in the output matrix).
//
inline void mul_mat_q_n_f32_1d_16x_flat(
global uchar * src0_q,
global half * src0_d,
global float * src1,
global float * dst,
int ne00,
int ne01,
int ne02,
int ne10,
int ne12,
int ne0,
int ne1,
int r2,
int r3
) {
const int nb = ne00/QK4_0;
int r0 = get_group_id(0);
int r1 = get_group_id(1);
int im = get_group_id(2);
// (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
// a SIMD group in the grid. Each SIMD group produces N_DST values in the
// result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
// Currently with llama2 7B, im is always 0.
// TODO: how to handle im/gqa*(nb*ne0)?
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
int i12 = im%ne12;
int i13 = im/ne12;
// The number of scales is the same as the number of blocks.
ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
// Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
global uchar * x = (global uchar *) src0_q + offset0_q;
global half * d = (global half *) src0_d + offset0_d;
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
float16 yl;
float16 sumf = (float16)(0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f);
int ix = get_sub_group_local_id()/2;
int il = 8*(get_sub_group_local_id()%2);
global float * yb = y + ix*QK4_0 + il;
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
float sumy = 0.f;
sumy += yb[0];
sumy += yb[1];
sumy += yb[2];
sumy += yb[3];
sumy += yb[4];
sumy += yb[5];
sumy += yb[6];
sumy += yb[7];
sumy += yb[16];
sumy += yb[17];
sumy += yb[18];
sumy += yb[19];
sumy += yb[20];
sumy += yb[21];
sumy += yb[22];
sumy += yb[23];
yl.s0 = yb[0];
yl.s1 = yb[1]/256.f;
yl.s2 = yb[2];
yl.s3 = yb[3]/256.f;
yl.s4 = yb[4];
yl.s5 = yb[5]/256.f;
yl.s6 = yb[6];
yl.s7 = yb[7]/256.f;
yl.s8 = yb[16]/16.f;
yl.s9 = yb[17]/4096.f;
yl.sa = yb[18]/16.f;
yl.sb = yb[19]/4096.f;
yl.sc = yb[20]/16.f;
yl.sd = yb[21]/4096.f;
yl.se = yb[22]/16.f;
yl.sf = yb[23]/4096.f;
sumf.s0 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
sumf.s1 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
sumf.s2 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
sumf.s3 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
sumf.s4 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
sumf.s5 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
sumf.s6 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
sumf.s7 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
sumf.s8 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 8*nb*QK4_0/2, d + ib + 8*nb, sumy, yl, il);
sumf.s9 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 9*nb*QK4_0/2, d + ib + 9*nb, sumy, yl, il);
sumf.sa += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 10*nb*QK4_0/2, d + ib + 10*nb, sumy, yl, il);
sumf.sb += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 11*nb*QK4_0/2, d + ib + 11*nb, sumy, yl, il);
sumf.sc += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 12*nb*QK4_0/2, d + ib + 12*nb, sumy, yl, il);
sumf.sd += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 13*nb*QK4_0/2, d + ib + 13*nb, sumy, yl, il);
sumf.se += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 14*nb*QK4_0/2, d + ib + 14*nb, sumy, yl, il);
sumf.sf += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 15*nb*QK4_0/2, d + ib + 15*nb, sumy, yl, il);
yb += QK4_0 * (N_SIMDWIDTH/2);
}
float16 tot = (float16)(
sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7),
sub_group_reduce_add(sumf.s8), sub_group_reduce_add(sumf.s9),
sub_group_reduce_add(sumf.sa), sub_group_reduce_add(sumf.sb),
sub_group_reduce_add(sumf.sc), sub_group_reduce_add(sumf.sd),
sub_group_reduce_add(sumf.se), sub_group_reduce_add(sumf.sf)
);
if (get_sub_group_local_id() == 0) {
if (first_row + 0 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
}
if (first_row + 1 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
}
if (first_row + 2 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
}
if (first_row + 3 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
}
if (first_row + 4 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
}
if (first_row + 5 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
}
if (first_row + 6 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
}
if (first_row + 7 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
}
if (first_row + 8 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 8] = tot.s8;
}
if (first_row + 9 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 9] = tot.s9;
}
if (first_row + 10 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 10] = tot.sa;
}
if (first_row + 11 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 11] = tot.sb;
}
if (first_row + 12 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 12] = tot.sc;
}
if (first_row + 13 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 13] = tot.sd;
}
if (first_row + 14 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 14] = tot.se;
}
if (first_row + 15 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 15] = tot.sf;
}
}
}
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_q4_0_f32_1d_16x_flat(
global uchar * src0_q,
global half * src0_d,
global float * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne10,
int ne12,
int ne0,
int ne1,
int r2,
int r3
) {
src1 = (global float*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
mul_mat_q_n_f32_1d_16x_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
}

View file

@ -0,0 +1,265 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define QR4_1 2
#define QK5_0 32
#define QR5_0 2
#define QK5_1 32
#define QR5_1 2
#define QK8_0 32
#define QR8_0 1
#define QK_K 256
#define K_QUANTS_PER_ITERATION 2
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;
//------------------------------------------------------------------------------
// block_q4_0
//------------------------------------------------------------------------------
struct block_q4_0
{
half d;
uint8_t qs[QK4_0 / 2];
};
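// Note: in this 'flat' layout the 4-bit quants (src0_q) and the per-block half
// scales (src0_d) live in two separate arrays; x and dh below point into those
// arrays rather than into a struct block_q4_0.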
inline float mm_block_q_4_0_dot_y_flat(
global uchar * x,
global half * dh,
float sumy,
float16 yl,
int il
) {
float d = *dh;
global ushort * qs = ((global ushort *)x + il/2);
float acc = 0.f;
acc += yl.s0 * (qs[0] & 0x000F);
acc += yl.s1 * (qs[0] & 0x0F00);
acc += yl.s8 * (qs[0] & 0x00F0);
acc += yl.s9 * (qs[0] & 0xF000);
acc += yl.s2 * (qs[1] & 0x000F);
acc += yl.s3 * (qs[1] & 0x0F00);
acc += yl.sa * (qs[1] & 0x00F0);
acc += yl.sb * (qs[1] & 0xF000);
acc += yl.s4 * (qs[2] & 0x000F);
acc += yl.s5 * (qs[2] & 0x0F00);
acc += yl.sc * (qs[2] & 0x00F0);
acc += yl.sd * (qs[2] & 0xF000);
acc += yl.s6 * (qs[3] & 0x000F);
acc += yl.s7 * (qs[3] & 0x0F00);
acc += yl.se * (qs[3] & 0x00F0);
acc += yl.sf * (qs[3] & 0xF000);
return d * (sumy * -8.f + acc);
}
#ifdef INTEL_GPU
#define N_DST 8 // each SIMD group works on 8 rows (in weights matrix)
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
#elif defined (ADRENO_GPU)
#define N_DST 8
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 64
#endif
//
// This variant performs 1d blocking with 8x output.
// Each simdgroup outputs 8 values on the `n0` dim (rows in the output matrix).
//
inline void mul_mat_q_n_f32_1d_8x_flat(
global uchar * src0_q,
global half * src0_d,
global float * src1,
global float * dst,
int ne00,
int ne01,
int ne02,
int ne10,
int ne12,
int ne0,
int ne1,
int r2,
int r3
) {
const int nb = ne00/QK4_0;
int r0 = get_group_id(0);
int r1 = get_group_id(1);
int im = get_group_id(2);
// (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
// a SIMD group in the grid. Each SIMD group produces N_DST values in the
// result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
// Currently with llama2 7B, im is always 0.
// TODO: how to handle im/gqa*(nb*ne0)?
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
int i12 = im%ne12;
int i13 = im/ne12;
// The number of scales is the same as the number of blocks.
ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
// Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
global uchar * x = (global uchar *) src0_q + offset0_q;
global half * d = (global half *) src0_d + offset0_d;
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
float16 yl;
float8 sumf = (float8)(0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f);
int ix = get_sub_group_local_id()/2;
int il = 8*(get_sub_group_local_id()%2);
global float * yb = y + ix*QK4_0 + il;
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
float sumy = 0.f;
sumy += yb[0];
sumy += yb[1];
sumy += yb[2];
sumy += yb[3];
sumy += yb[4];
sumy += yb[5];
sumy += yb[6];
sumy += yb[7];
sumy += yb[16];
sumy += yb[17];
sumy += yb[18];
sumy += yb[19];
sumy += yb[20];
sumy += yb[21];
sumy += yb[22];
sumy += yb[23];
yl.s0 = yb[0];
yl.s1 = yb[1]/256.f;
yl.s2 = yb[2];
yl.s3 = yb[3]/256.f;
yl.s4 = yb[4];
yl.s5 = yb[5]/256.f;
yl.s6 = yb[6];
yl.s7 = yb[7]/256.f;
yl.s8 = yb[16]/16.f;
yl.s9 = yb[17]/4096.f;
yl.sa = yb[18]/16.f;
yl.sb = yb[19]/4096.f;
yl.sc = yb[20]/16.f;
yl.sd = yb[21]/4096.f;
yl.se = yb[22]/16.f;
yl.sf = yb[23]/4096.f;
sumf.s0 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
sumf.s1 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
sumf.s2 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
sumf.s3 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
sumf.s4 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
sumf.s5 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
sumf.s6 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
sumf.s7 += mm_block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
yb += QK4_0 * (N_SIMDWIDTH/2);
}
float8 tot = (float8)(
sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
);
if (get_sub_group_local_id() == 0) {
if (first_row + 0 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
}
if (first_row + 1 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
}
if (first_row + 2 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
}
if (first_row + 3 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
}
if (first_row + 4 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
}
if (first_row + 5 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
}
if (first_row + 6 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
}
if (first_row + 7 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
}
}
}
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_q4_0_f32_1d_8x_flat(
global uchar * src0_q,
global half * src0_d,
global float * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne10,
int ne12,
int ne0,
int ne1,
int r2,
int r3
) {
src1 = (global float*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
mul_mat_q_n_f32_1d_8x_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
}

View file

@ -0,0 +1,272 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define QR4_1 2
#define QK5_0 32
#define QR5_0 2
#define QK5_1 32
#define QR5_1 2
#define QK8_0 32
#define QR8_0 1
#define QK_K 256
#define K_QUANTS_PER_ITERATION 2
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;
//------------------------------------------------------------------------------
// block_q4_0
//------------------------------------------------------------------------------
struct block_q4_0
{
half d;
uint8_t qs[QK4_0 / 2];
};
// This function requires the original shuffled weights.
// As a reminder, the original weights are shuffled so that (q[0], q[16]) are
// packed together in a byte, so are (q[1], q[17]) and so on.
inline float block_q_4_0_dot_y_flat(
global uchar * x,
global half * dh,
float sumy,
float16 yl,
int il
) {
float d = *dh;
global ushort * qs = ((global ushort *)x + il/2);
float acc = 0.f;
acc += yl.s0 * (qs[0] & 0x000F);
acc += yl.s1 * (qs[0] & 0x0F00);
acc += yl.s8 * (qs[0] & 0x00F0);
acc += yl.s9 * (qs[0] & 0xF000);
acc += yl.s2 * (qs[1] & 0x000F);
acc += yl.s3 * (qs[1] & 0x0F00);
acc += yl.sa * (qs[1] & 0x00F0);
acc += yl.sb * (qs[1] & 0xF000);
acc += yl.s4 * (qs[2] & 0x000F);
acc += yl.s5 * (qs[2] & 0x0F00);
acc += yl.sc * (qs[2] & 0x00F0);
acc += yl.sd * (qs[2] & 0xF000);
acc += yl.s6 * (qs[3] & 0x000F);
acc += yl.s7 * (qs[3] & 0x0F00);
acc += yl.se * (qs[3] & 0x00F0);
acc += yl.sf * (qs[3] & 0xF000);
return d * (sumy * -8.f + acc);
}
//
// This variant outputs 8 values.
//
#undef N_DST
#undef N_SIMDGROUP
#undef N_SIMDWIDTH
#ifdef INTEL_GPU
#define N_DST 8 // each SIMD group works on 8 rows
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
#elif defined (ADRENO_GPU)
#define N_DST 8
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 64
#endif
inline void mul_vec_q_n_f32_8x_flat(
global uchar * src0_q,
global half * src0_d,
global float * src1,
global float * dst,
int ne00,
int ne01,
int ne02,
int ne10,
int ne12,
int ne0,
int ne1,
int r2,
int r3
) {
const ulong nb = ne00/QK4_0;
int r0 = get_group_id(0);
int r1 = get_group_id(1);
int im = get_group_id(2);
// (r0 * N_SIMDGROUP + get_sub_group_id()) is the linear global id of
// a SIMD group in the grid. Each SIMD group produces N_DST values in the
// result, hence uses nb blocks, i.e., the offset becomes first_row*nb.
// Currently with llama2 7B, im is always 0.
// TODO: how to handle im/gqa*(nb*ne0)?
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
int i12 = im%ne12;
int i13 = im/ne12;
// The number of scales is the same as the number of blocks.
ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
// Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
global uchar * x = (global uchar *) src0_q + offset0_q;
global half * d = (global half *) src0_d + offset0_d;
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
float16 yl;
float8 sumf = 0.f;
int ix = get_sub_group_local_id()/2;
int il = 8*(get_sub_group_local_id()%2);
global float * yb = y + ix*QK4_0 + il;
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
float sumy = 0.f;
sumy += yb[0];
sumy += yb[1];
sumy += yb[2];
sumy += yb[3];
sumy += yb[4];
sumy += yb[5];
sumy += yb[6];
sumy += yb[7];
sumy += yb[16];
sumy += yb[17];
sumy += yb[18];
sumy += yb[19];
sumy += yb[20];
sumy += yb[21];
sumy += yb[22];
sumy += yb[23];
yl.s0 = yb[0];
yl.s1 = yb[1]/256.f;
yl.s2 = yb[2];
yl.s3 = yb[3]/256.f;
yl.s4 = yb[4];
yl.s5 = yb[5]/256.f;
yl.s6 = yb[6];
yl.s7 = yb[7]/256.f;
yl.s8 = yb[16]/16.f;
yl.s9 = yb[17]/4096.f;
yl.sa = yb[18]/16.f;
yl.sb = yb[19]/4096.f;
yl.sc = yb[20]/16.f;
yl.sd = yb[21]/4096.f;
yl.se = yb[22]/16.f;
yl.sf = yb[23]/4096.f;
sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
sumf.s4 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
sumf.s5 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
sumf.s6 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
sumf.s7 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
yb += QK4_0 * (N_SIMDWIDTH/2);
}
float8 tot = (float8)(
sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
);
if (get_sub_group_local_id() == 0) {
if (first_row + 0 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
}
if (first_row + 1 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
}
if (first_row + 2 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
}
if (first_row + 3 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
}
if (first_row + 4 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
}
if (first_row + 5 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
}
if (first_row + 6 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
}
if (first_row + 7 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
}
}
}
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_q4_0_f32_8x_flat(
global uchar * src0_q,
global half * src0_d,
global float * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne10,
int ne12,
int ne0,
int ne1,
int r2,
int r3
) {
src1 = (global float*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
mul_vec_q_n_f32_8x_flat(src0_q, src0_d, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
}

View file

@ -0,0 +1,254 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define QR4_1 2
#define QK5_0 32
#define QR5_0 2
#define QK5_1 32
#define QR5_1 2
#define QK8_0 32
#define QR8_0 1
#define QK_K 256
#define K_QUANTS_PER_ITERATION 2
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;
//------------------------------------------------------------------------------
// block_q4_0
//------------------------------------------------------------------------------
struct block_q4_0
{
half d;
uint8_t qs[QK4_0 / 2];
};
//
// This variant unrolls the loops and uses vector types instead of pointers.
// It improves performance on Adreno but not so much on Intel.
//
inline float block_q_4_0_dot_y_v(
global struct block_q4_0 * qb_curr,
float sumy,
float16 yl,
int il
) {
float d = qb_curr->d;
float acc = 0.f;
global ushort * qs = ((global ushort *)qb_curr + 1 + il/2);
acc += yl.s0 * (qs[0] & 0x000F);
acc += yl.s1 * (qs[0] & 0x0F00);
acc += yl.s8 * (qs[0] & 0x00F0);
acc += yl.s9 * (qs[0] & 0xF000);
acc += yl.s2 * (qs[1] & 0x000F);
acc += yl.s3 * (qs[1] & 0x0F00);
acc += yl.sa * (qs[1] & 0x00F0);
acc += yl.sb * (qs[1] & 0xF000);
acc += yl.s4 * (qs[2] & 0x000F);
acc += yl.s5 * (qs[2] & 0x0F00);
acc += yl.sc * (qs[2] & 0x00F0);
acc += yl.sd * (qs[2] & 0xF000);
acc += yl.s6 * (qs[3] & 0x000F);
acc += yl.s7 * (qs[3] & 0x0F00);
acc += yl.se * (qs[3] & 0x00F0);
acc += yl.sf * (qs[3] & 0xF000);
return d * (sumy * -8.f + acc);
}
#undef N_DST
#undef N_SIMDGROUP
#undef N_SIMDWIDTH
#ifdef INTEL_GPU
#define N_DST 4 // each SIMD group works on 4 rows
#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 16 // assuming SIMD group size is 16
#elif defined (ADRENO_GPU)
#define N_DST 4
#define N_SIMDGROUP 1
#define N_SIMDWIDTH 64
#endif
inline void mul_vec_q_n_f32_v(
global void * src0,
global float * src1,
global float * dst,
int ne00,
int ne01,
int ne02,
int ne10,
int ne12,
int ne0,
int ne1,
int r2,
int r3
) {
const ulong nb = ne00/QK4_0;
int r0 = get_group_id(0);
int r1 = get_group_id(1);
int im = get_group_id(2);
// (r0 * N_SIMDGROUP + get_sub_group_id()) is essentially the linear global
// id of a SIMD group in the grid.
int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
int i12 = im%ne12;
int i13 = im/ne12;
ulong offset0 = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
global struct block_q4_0 * x = (global struct block_q4_0 *) src0 + offset0;
global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
float16 yl; // src1 vector cache
float4 sumf = (float4)(0.f, 0.f, 0.f, 0.f);
int ix = get_sub_group_local_id()/2;
int il = 8*(get_sub_group_local_id()%2);
global float * yb = y + ix * QK4_0 + il;
// each thread in a SIMD group deals with half a block.
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
float sumy = 0;
sumy += yb[0];
sumy += yb[1];
sumy += yb[2];
sumy += yb[3];
sumy += yb[4];
sumy += yb[5];
sumy += yb[6];
sumy += yb[7];
sumy += yb[16];
sumy += yb[17];
sumy += yb[18];
sumy += yb[19];
sumy += yb[20];
sumy += yb[21];
sumy += yb[22];
sumy += yb[23];
yl.s0 = yb[0];
yl.s1 = yb[1]/256.f;
yl.s2 = yb[2];
yl.s3 = yb[3]/256.f;
yl.s4 = yb[4];
yl.s5 = yb[5]/256.f;
yl.s6 = yb[6];
yl.s7 = yb[7]/256.f;
yl.s8 = yb[16]/16.f;
yl.s9 = yb[17]/4096.f;
yl.sa = yb[18]/16.f;
yl.sb = yb[19]/4096.f;
yl.sc = yb[20]/16.f;
yl.sd = yb[21]/4096.f;
yl.se = yb[22]/16.f;
yl.sf = yb[23]/4096.f;
sumf.s0 += block_q_4_0_dot_y_v(x+ib+0*nb, sumy, yl, il);
sumf.s1 += block_q_4_0_dot_y_v(x+ib+1*nb, sumy, yl, il);
sumf.s2 += block_q_4_0_dot_y_v(x+ib+2*nb, sumy, yl, il);
sumf.s3 += block_q_4_0_dot_y_v(x+ib+3*nb, sumy, yl, il);
// One thread in a SIMD group (i.e., subgroup) handles a half block,
// hence the entire SIMD group handles SIMDWIDTH/2 blocks.
// y points to the activation matrix (of type float). Therefore for
// one thread, the # of blocks y should advance is SIMDWIDTH/2 (because
// SIMDWIDTH/2 blocks are processed by a SIMD group) - in terms of
// floats, it is QK4_0 * (SIMDWIDTH/2), where QK4_0 is the block size.
yb += QK4_0 * (N_SIMDWIDTH/2);
}
// The above does not work for Adreno - it produces incorrect results for
// row = 1, 2, 3 and only row = 0 gives the correct result.
// If N_DST is changed, the below array must be initialized accordingly.
// This also seems to perform better on Intel.
float4 tot = (float4)(
sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3)
);
if (get_sub_group_local_id() == 0) {
if (first_row + 0 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
}
if (first_row + 1 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
}
if (first_row + 2 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
}
if (first_row + 3 < ne01) {
dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
}
}
}
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mat_q4_0_f32_v(
global void * src0,
ulong offset0,
global float * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne10,
int ne12,
int ne0,
int ne1,
int r2,
int r3
) {
src0 = (global void*)((global char*)src0 + offset0);
src1 = (global float*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
mul_vec_q_n_f32_v(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
}

View file

@ -0,0 +1,190 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
#define QR4_1 2
#define QK5_0 32
#define QR5_0 2
#define QK5_1 32
#define QR5_1 2
#define QK8_0 32
#define QR8_0 1
#define QK_K 256
#define K_QUANTS_PER_ITERATION 2
typedef char int8_t;
typedef uchar uint8_t;
typedef short int16_t;
typedef ushort uint16_t;
typedef int int32_t;
typedef uint uint32_t;
//------------------------------------------------------------------------------
// block_q6_K
//------------------------------------------------------------------------------
// 6-bit quantization
// weight is represented as x = a * q
// 16 blocks of 16 elements each
// Effectively 6.5625 bits per weight
typedef struct {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
half d; // super-block scale
} block_q6_K;
//------------------------------------------------------------------------------
// kernel_mul_mv_q6_K_f32
//------------------------------------------------------------------------------
#undef N_DST
#undef N_SIMDGROUP
#undef N_SIMDWIDTH
#ifdef INTEL_GPU
#define N_DST 1 // number of rows each SIMD group works on
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
#define N_SIMDWIDTH 16 // SIMD group size
#elif defined (ADRENO_GPU)
#define N_DST 1
#define N_SIMDGROUP 2
#define N_SIMDWIDTH 64
#endif
#define BLOCK_STRIDE (N_SIMDWIDTH/16) // number of blocks each subgroup processes
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_q6_K_f32(
global void * src0,
ulong offset0,
global float * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne10,
int ne12,
int ne0,
int ne1,
int r2,
int r3
) {
src0 = (global void*)((global char*)src0 + offset0);
src1 = (global float*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
uchar kmask1 = 0x03;
uchar kmask2 = 0x0C;
uchar kmask3 = 0x30;
uchar kmask4 = 0xC0;
int nb = ne00/QK_K;
int r0 = get_group_id(0);
int r1 = get_group_id(1);
int im = get_group_id(2);
int row = N_SIMDGROUP * r0 + get_sub_group_id();
int i12 = im%ne12;
int i13 = im/ne12;
ulong offset_src0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
global block_q6_K * x = (global block_q6_K *) src0 + row*nb + offset_src0;
global float * yy = (global float *) src1 + r1*ne10 + im*ne00*ne1;
float sumf = 0;
// For Q6_K quantization, 16 values form a subblock and 16 subblocks form a
// (super) block. Values in a subblock share a scale that is quantized with 8 bits;
// the entire block shares a single floating point scale.
// For work distribution, each thread processes a subblock (16 weights), hence
// 16 threads process a (super) block -- a subgroup thus handles SIMDWIDTH/16
// (super) blocks -- this is the block stride.
// The 16 threads that process a (super) block are split into 2 portions of
// 8 threads each; each portion works on 8 subblocks.
// For a subgroup of 16 threads, the entire subgroup works on a single (super) block
// before moving to the next (super) block. Thread0 - thread7 work on the
// first 8 subblocks; thread8 - thread15 work on the last 8 subblocks.
// Thread0 - thread3 work on subblocks 0, 2, 4, 6; thread4 - thread7 work on
// subblocks 1, 3, 5, 7. Each thread does not work on an entire subblock, but
// works on a total of 16 weight values.
int tid = get_sub_group_local_id()/BLOCK_STRIDE; // first block_stride groups have tid=0
int ix = get_sub_group_local_id()%BLOCK_STRIDE; // first block is 0..block_stride-1
int ip = tid/8; // first or second half of (super) block (0 or 1)
int il = tid%8; // each half has 8 parts, one per scale
int n = 4; // 4 scales at a time (and 4 sums)
int l0 = n*il; // offset into half-block, 0..28
int is = 8*ip + l0/16; // 0, 1, 8, 9
int y_offset = 128*ip + l0;
int q_offset_l = 64*ip + l0;
int q_offset_h = 32*ip + l0;
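// e.g. with a 16-wide subgroup (BLOCK_STRIDE = 1), lane 9 gets tid = 9, ix = 0,
// ip = 1, il = 1, l0 = 4, is = 8, y_offset = 132, q_offset_l = 68, q_offset_h = 36.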
for (int i = ix; i < nb; i += BLOCK_STRIDE) {
global uint8_t * q1 = x[i].ql + q_offset_l;
global uint8_t * q2 = q1 + QK_K/8;
global uint8_t * qh = x[i].qh + q_offset_h;
global int8_t * sc = x[i].scales + is;
global float * y = yy + i * QK_K + y_offset;
float dall = x[i].d;
float4 sums = {0.f, 0.f, 0.f, 0.f};
sums.s0 += y[0+ 0] * ((float)((q1[0] & 0xF) | ((qh[0] & kmask1) << 4)) - 32.f);
sums.s1 += y[0+32] * ((float)((q2[0] & 0xF) | ((qh[0] & kmask2) << 2)) - 32.f);
sums.s2 += y[0+64] * ((float)((q1[0] >> 4) | ((qh[0] & kmask3) << 0)) - 32.f);
sums.s3 += y[0+96] * ((float)((q2[0] >> 4) | ((qh[0] & kmask4) >> 2)) - 32.f);
sums.s0 += y[1+ 0] * ((float)((q1[1] & 0xF) | ((qh[1] & kmask1) << 4)) - 32.f);
sums.s1 += y[1+32] * ((float)((q2[1] & 0xF) | ((qh[1] & kmask2) << 2)) - 32.f);
sums.s2 += y[1+64] * ((float)((q1[1] >> 4) | ((qh[1] & kmask3) << 0)) - 32.f);
sums.s3 += y[1+96] * ((float)((q2[1] >> 4) | ((qh[1] & kmask4) >> 2)) - 32.f);
sums.s0 += y[2+ 0] * ((float)((q1[2] & 0xF) | ((qh[2] & kmask1) << 4)) - 32.f);
sums.s1 += y[2+32] * ((float)((q2[2] & 0xF) | ((qh[2] & kmask2) << 2)) - 32.f);
sums.s2 += y[2+64] * ((float)((q1[2] >> 4) | ((qh[2] & kmask3) << 0)) - 32.f);
sums.s3 += y[2+96] * ((float)((q2[2] >> 4) | ((qh[2] & kmask4) >> 2)) - 32.f);
sums.s0 += y[3+ 0] * ((float)((q1[3] & 0xF) | ((qh[3] & kmask1) << 4)) - 32.f);
sums.s1 += y[3+32] * ((float)((q2[3] & 0xF) | ((qh[3] & kmask2) << 2)) - 32.f);
sums.s2 += y[3+64] * ((float)((q1[3] >> 4) | ((qh[3] & kmask3) << 0)) - 32.f);
sums.s3 += y[3+96] * ((float)((q2[3] >> 4) | ((qh[3] & kmask4) >> 2)) - 32.f);
sumf += dall * (sums.s0 * sc[0] + sums.s1 * sc[2] + sums.s2 * sc[4] + sums.s3 * sc[6]);
}
float tot = sub_group_reduce_add(sumf);
if (get_sub_group_local_id() == 0) {
dst[r1*ne0 + im*ne0*ne1 + row] = tot;
}
}

View file

@ -0,0 +1,81 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
//------------------------------------------------------------------------------
// norm
//------------------------------------------------------------------------------
kernel void kernel_norm(
global void * src0,
ulong offset0,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb01,
ulong nb02,
ulong nb03,
float eps,
local float * sum
) {
src0 = (global void*)((global char*)src0 + offset0);
dst = (global void*)((global char*)dst + offsetd);
int i03 = get_group_id(2);
int i02 = get_group_id(1);
int i01 = get_group_id(0);
global float * x = (global float *) ((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01);
// MEAN
// parallel sum
sum[get_local_id(0)] = 0.0f;
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
sum[get_local_id(0)] += x[i00];
}
// reduce
barrier(CLK_LOCAL_MEM_FENCE);
for (uint i = get_local_size(0)/2; i > 0; i /= 2) {
if (get_local_id(0) < i) {
sum[get_local_id(0)] += sum[get_local_id(0) + i];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
float mean = sum[0] / ne00;
// recenter and VARIANCE
barrier(CLK_LOCAL_MEM_FENCE);
global float * y = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
sum[get_local_id(0)] = 0.0f;
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
y[i00] = x[i00] - mean;
sum[get_local_id(0)] += y[i00] * y[i00];
}
// reduce
barrier(CLK_LOCAL_MEM_FENCE);
for (uint i = get_local_size(0)/2; i > 0; i /= 2) {
if (get_local_id(0) < i) {
sum[get_local_id(0)] += sum[get_local_id(0) + i];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
float variance = sum[0] / ne00;
float scale = 1.0f/sqrt(variance + eps);
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
y[i00] = y[i00] * scale;
}
}

View file

@ -0,0 +1,16 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//------------------------------------------------------------------------------
// relu
//------------------------------------------------------------------------------
kernel void kernel_relu(
global float * src0,
ulong offset0,
global float * dst,
ulong offsetd
) {
src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
dst[get_global_id(0)] = fmax(0.0f, src0[get_global_id(0)]);
}

View file

@ -0,0 +1,96 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
//------------------------------------------------------------------------------
// rms_norm
//------------------------------------------------------------------------------
// This kernel depends on subgroup size.
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_32
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_rms_norm(
global void * src0,
ulong offset0,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb01,
ulong nb02,
ulong nb03,
float eps,
local float * sum // Note, the size depends on number of subgroups
) {
src0 = (global void*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
int i03 = get_group_id(2);
int i02 = get_group_id(1);
int i01 = get_group_id(0);
global float4 * x = (global float4 *) ((global char *) src0 + i03*nb03 + i02*nb02 + i01*nb01);
global float * x_scalar = (global float *) x;
float4 sumf = 0;
float all_sum = 0;
// parallel sum
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
sumf += x[i00] * x[i00];
}
all_sum = sumf.s0 + sumf.s1 + sumf.s2 + sumf.s3;
all_sum = sub_group_reduce_add(all_sum);
if (get_sub_group_local_id() == 0) {
sum[get_sub_group_id()] = all_sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
// reduce the per-subgroup partial sums stored in local memory
for (uint i = get_local_size(0) / get_max_sub_group_size() / 2; i > 0; i /= 2) {
if (get_local_id(0) < i) {
sum[get_local_id(0)] += sum[get_local_id(0) + i];
}
}
if (get_local_id(0) == 0) {
for (int i = 4 * (ne00 / 4); i < ne00; i++) {
sum[0] += x_scalar[i];
}
sum[0] /= ne00;
}
barrier(CLK_LOCAL_MEM_FENCE);
const float mean = sum[0];
const float scale = 1.0f/sqrt(mean + eps);
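// Here 'mean' is the mean of the squared elements, so 'scale' is the reciprocal
// RMS (1/sqrt(mean(x^2) + eps)) applied to the row below.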
global float4 * y = (global float4 *) (dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
global float * y_scalar = (global float *) y;
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
y[i00] = x[i00] * scale;
}
if (get_local_id(0) == 0) {
for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
y_scalar[i00] = x_scalar[i00] * scale;
}
}
}

View file

@ -0,0 +1,721 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//------------------------------------------------------------------------------
// kernel_rope
//------------------------------------------------------------------------------
float rope_yarn_ramp(float low, float high, int i0) {
const float y = (i0 / 2 - low) / max(0.001f, high - low);
return 1.0f - min(1.0f, max(0.0f, y));
}
// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
float2 rope_yarn(
float theta_extrap, float freq_scale, float2 corr_dims, int i0, float ext_factor, float mscale
) {
// Get n-d rotational scaling corrected for extrapolation
float theta_interp = freq_scale * theta_extrap;
float theta = theta_interp;
if (ext_factor != 0.0f) {
float ramp_mix = rope_yarn_ramp(corr_dims.s0, corr_dims.s1, i0) * ext_factor;
theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
// Get n-d magnitude scaling corrected for interpolation
mscale *= 1.0f + 0.1f * log(1.0f / freq_scale);
}
return (float2)(cos(theta) * mscale, sin(theta) * mscale);
}
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) {
return n_dims * log(n_ctx_orig / (n_rot * 2 * M_PI_F)) / (2 * log(base));
}
float2 rope_yarn_corr_dims(
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow
) {
// start and end correction dims
return (float2)(
max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base))),
min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base)))
);
}
kernel void kernel_rope_norm_f32(
global void * src0,
ulong offset0,
global int * src1,
ulong offset1,
global float * src2,
ulong offset2,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne0,
int ne1,
int ne2,
int ne3,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3,
int n_past,
int n_dims,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow
) {
src0 = (global void*)((global char*)src0 + offset0);
src1 = (global int*)((global char*)src1 + offset1);
src2 = (global float*)((global char*)src2 + offset2);
dst = (global float*)((global char*)dst + offsetd);
int i3 = get_group_id(2);
int i2 = get_group_id(1);
int i1 = get_group_id(0);
float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
global int * pos = src1;
float theta_base = (float) pos[i2];
float inv_ndims = -1.f/n_dims;
for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
if (i0 < n_dims) {
int ic = i0/2;
float theta = theta_base * pow(freq_base, inv_ndims*i0);
float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
global float * src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
float x0 = src[0];
float x1 = src[1];
dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
dst_data[1] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
} else {
global float * src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
dst_data[0] = src[0];
dst_data[1] = src[1];
}
}
}
kernel void kernel_rope_norm_f16(
global void * src0,
ulong offset0,
global int * src1,
ulong offset1,
global float * src2,
ulong offset2,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne0,
int ne1,
int ne2,
int ne3,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3,
int n_past,
int n_dims,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow
) {
src0 = (global void*)((global char*)src0 + offset0);
src1 = (global int*)((global char*)src1 + offset1);
src2 = (global float*)((global char*)src2 + offset2);
dst = (global float*)((global char*)dst + offsetd);
int i3 = get_group_id(2);
int i2 = get_group_id(1);
int i1 = get_group_id(0);
float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
global int * pos = src1;
float theta_base = (float) pos[i2];
float inv_ndims = -1.f/n_dims;
for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
if (i0 < n_dims) {
int ic = i0/2;
float theta = theta_base * pow(freq_base, inv_ndims*i0);
float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
global half * src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
float x0 = src[0];
float x1 = src[1];
dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
dst_data[1] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
} else {
global half * src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
dst_data[0] = src[0];
dst_data[1] = src[1];
}
}
}
kernel void kernel_rope_neox_f32(
global void * src0,
ulong offset0,
global int * src1,
ulong offset1,
global float * src2,
ulong offset2,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne0,
int ne1,
int ne2,
int ne3,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3,
int n_past,
int n_dims,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow
) {
src0 = (global void*)((global char*)src0 + offset0);
src1 = (global int*)((global char*)src1 + offset1);
src2 = (global float*)((global char*)src2 + offset2);
dst = (global float*)((global char*)dst + offsetd);
int i3 = get_group_id(2);
int i2 = get_group_id(1);
int i1 = get_group_id(0);
float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
global int * pos = src1;
float theta_base = (float) pos[i2];
float inv_ndims = -1.f/n_dims;
for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
if (i0 < n_dims) {
int ic = i0/2;
const float theta = theta_base * pow(freq_base, inv_ndims*i0);
const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
global float * src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
const float x0 = src[0];
const float x1 = src[n_dims/2];
dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
dst_data[n_dims/2] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
} else {
global float * const src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
dst_data[0] = src[0];
dst_data[1] = src[1];
}
}
}
kernel void kernel_rope_neox_f16(
global void * src0,
ulong offset0,
global int * src1,
ulong offset1,
global float * src2,
ulong offset2,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne0,
int ne1,
int ne2,
int ne3,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3,
int n_past,
int n_dims,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow
) {
src0 = (global void*)((global char*)src0 + offset0);
src1 = (global int*)((global char*)src1 + offset1);
src2 = (global float*)((global char*)src2 + offset2);
dst = (global float*)((global char*)dst + offsetd);
int i3 = get_group_id(2);
int i2 = get_group_id(1);
int i1 = get_group_id(0);
float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
global int * pos = src1;
float theta_base = (float) pos[i2];
float inv_ndims = -1.f/n_dims;
for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
if (i0 < n_dims) {
int ic = i0/2;
const float theta = theta_base * pow(freq_base, inv_ndims*i0);
const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
global half * src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
const float x0 = src[0];
const float x1 = src[n_dims/2];
dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
dst_data[n_dims/2] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
} else {
global half * const src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
dst_data[0] = src[0];
dst_data[1] = src[1];
}
}
}
kernel void kernel_rope_multi_f32(
global void * src0,
ulong offset0,
global int * src1,
ulong offset1,
global float * src2,
ulong offset2,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne0,
int ne1,
int ne2,
int ne3,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3,
int n_past,
int n_dims,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow,
int4 sections
) {
src0 = (global void*)((global char*)src0 + offset0);
src1 = (global int*)((global char*)src1 + offset1);
src2 = (global float*)((global char*)src2 + offset2);
dst = (global float*)((global char*)dst + offsetd);
int i3 = get_group_id(2);
int i2 = get_group_id(1);
int i1 = get_group_id(0);
float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
global int * pos = src1;
const int sect_dims = sections.s0 + sections.s1 + sections.s2 + sections.s3;
const int sec_w = sections.s1 + sections.s0;
float inv_ndims = -1.f/n_dims;
for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
if (i0 < n_dims) {
int ic = i0/2;
const int sector = (i0 / 2) % sect_dims;
float theta_base = 0.0f;
if (sector < sections.s0) {
theta_base = pos[i2];
}
else if (sector >= sections.s0 && sector < sec_w) {
theta_base = pos[i2 + ne2 * 1];
}
else if (sector >= sec_w && sector < sec_w + sections.s2) {
theta_base = pos[i2 + ne2 * 2];
}
else if (sector >= sec_w + sections.s2) {
theta_base = pos[i2 + ne2 * 3];
}
const float theta = theta_base * pow(freq_base, inv_ndims*i0);
const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
global float * src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
const float x0 = src[0];
const float x1 = src[n_dims/2];
dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
dst_data[n_dims/2] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
} else {
global float * const src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
dst_data[0] = src[0];
dst_data[1] = src[1];
}
}
}
kernel void kernel_rope_multi_f16(
global void * src0,
ulong offset0,
global int * src1,
ulong offset1,
global float * src2,
ulong offset2,
global half * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne0,
int ne1,
int ne2,
int ne3,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3,
int n_past,
int n_dims,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow,
int4 sections
) {
src0 = (global void*)((global char*)src0 + offset0);
src1 = (global int*)((global char*)src1 + offset1);
src2 = (global float*)((global char*)src2 + offset2);
dst = (global float*)((global char*)dst + offsetd);
int i3 = get_group_id(2);
int i2 = get_group_id(1);
int i1 = get_group_id(0);
float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
global int * pos = src1;
const int sect_dims = sections.s0 + sections.s1 + sections.s2 + sections.s3;
const int sec_w = sections.s1 + sections.s0;
float inv_ndims = -1.f/n_dims;
for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
if (i0 < n_dims) {
int ic = i0/2;
const int sector = (i0 / 2) % sect_dims;
float theta_base = 0.0f;
if (sector < sections.s0) {
theta_base = pos[i2];
}
else if (sector >= sections.s0 && sector < sec_w) {
theta_base = pos[i2 + ne2 * 1];
}
else if (sector >= sec_w && sector < sec_w + sections.s2) {
theta_base = pos[i2 + ne2 * 2];
}
else if (sector >= sec_w + sections.s2) {
theta_base = pos[i2 + ne2 * 3];
}
const float theta = theta_base * pow(freq_base, inv_ndims*i0);
const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
float2 cos_sin_theta = rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
global half * src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
const float x0 = src[0];
const float x1 = src[n_dims/2];
dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
dst_data[n_dims/2] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
} else {
global half * const src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
dst_data[0] = src[0];
dst_data[1] = src[1];
}
}
}
kernel void kernel_rope_vision_f32(
global void * src0,
ulong offset0,
global int * src1,
ulong offset1,
global float * src2,
ulong offset2,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne0,
int ne1,
int ne2,
int ne3,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3,
int n_past,
int n_dims,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow,
int4 sections
) {
src0 = (global void*)((global char*)src0 + offset0);
src1 = (global int*)((global char*)src1 + offset1);
src2 = (global float*)((global char*)src2 + offset2);
dst = (global float*)((global char*)dst + offsetd);
int i3 = get_group_id(2);
int i2 = get_group_id(1);
int i1 = get_group_id(0);
float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
global int * pos = src1;
const int sect_dims = sections.s0 + sections.s1;
const int sec_w = sections.s1 + sections.s0;
float inv_ndims = -1.f/n_dims;
for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
int ic = i0/2;
const int sector = (i0/2) % sect_dims;
float theta_base = 0.0f;
if (sector < sections.s0) {
const int p = sector;
theta_base = pos[i2] * pow(freq_base, inv_ndims*2.0f*p);
} else if (sector >= sections.s0 && sector < sec_w) {
const int p = sector - sections.s0;
theta_base = pos[i2 + ne2] * pow(freq_base, inv_ndims*2.0f*p);
}
const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
float2 cos_sin_theta = rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
global float * src = (global float *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
global float * dst_data = (global float *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
const float x0 = src[0];
const float x1 = src[n_dims];
dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
dst_data[n_dims] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
}
}
kernel void kernel_rope_vision_f16(
global void * src0,
ulong offset0,
global int * src1,
ulong offset1,
global float * src2,
ulong offset2,
global half * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
int ne03,
ulong nb00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne0,
int ne1,
int ne2,
int ne3,
ulong nb0,
ulong nb1,
ulong nb2,
ulong nb3,
int n_past,
int n_dims,
int n_ctx_orig,
float freq_base,
float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow,
int4 sections
) {
src0 = (global void*)((global char*)src0 + offset0);
src1 = (global int*)((global char*)src1 + offset1);
src2 = (global float*)((global char*)src2 + offset2);
dst = (global float*)((global char*)dst + offsetd);
int i3 = get_group_id(2);
int i2 = get_group_id(1);
int i1 = get_group_id(0);
float2 corr_dims = rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow);
global int * pos = src1;
const int sect_dims = sections.s0 + sections.s1;
const int sec_w = sections.s1 + sections.s0;
float inv_ndims = -1.f/n_dims;
for (int i0 = 2*get_local_id(0); i0 < ne0; i0 += 2*get_local_size(0)) {
int ic = i0/2;
const int sector = (i0/2) % sect_dims;
float theta_base = 0.0f;
if (sector < sections.s0) {
const int p = sector;
theta_base = pos[i2] * pow(freq_base, inv_ndims*2.0f*p);
} else if (sector >= sections.s0 && sector < sec_w) {
const int p = sector - sections.s0;
theta_base = pos[i2 + ne2] * pow(freq_base, inv_ndims*2.0f*p);
}
const float freq_factor = src2 != src0 ? src2[ic] : 1.0f;
float2 cos_sin_theta = rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor);
global half * src = (global half *)((global char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
global half * dst_data = (global half *)((global char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
const float x0 = src[0];
const float x1 = src[n_dims];
dst_data[0] = x0*cos_sin_theta.s0 - x1*cos_sin_theta.s1;
dst_data[n_dims] = x0*cos_sin_theta.s1 + x1*cos_sin_theta.s0;
}
}
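All of the rope variants above share the same per-pair frequency schedule: with `inv_ndims = -1/n_dims`, the base angle for pair `i0` at position `pos` is `pos * freq_base^(-i0/n_dims)`, optionally divided by a per-frequency factor from `src2`. A small host-side restatement of that formula (editor's sketch, not part of the commit):

// Reference computation of the base rotation angle used by the rope kernels above.
#include <cmath>

float rope_theta(float pos, int i0, int n_dims, float freq_base, float freq_factor /* 1.0f if unused */) {
    // Matches theta_base * pow(freq_base, inv_ndims * i0) with inv_ndims = -1.0f / n_dims,
    // followed by the optional division by a frequency factor (theta / freq_factor).
    const float theta = pos * std::pow(freq_base, -(float) i0 / (float) n_dims);
    return theta / freq_factor;
}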

View file

@ -0,0 +1,16 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//------------------------------------------------------------------------------
// scale
//------------------------------------------------------------------------------
kernel void kernel_scale(
global float4 * src0,
ulong offset0,
global float4 * dst,
ulong offsetd,
float scale
) {
src0 = (global float4*)((global char*)src0 + offset0);
dst = (global float4*)((global char*)dst + offsetd);
dst[get_global_id(0)] = src0[get_global_id(0)] * scale;
}

View file

@ -0,0 +1,30 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
//------------------------------------------------------------------------------
// silu
//------------------------------------------------------------------------------
kernel void kernel_silu(
global float * src0,
ulong offset0,
global float * dst,
ulong offsetd
) {
src0 = (global float*)((global char*)src0 + offset0);
dst = (global float*)((global char*)dst + offsetd);
float x = src0[get_global_id(0)];
dst[get_global_id(0)] = x / (1.0f + exp(-x));
}
kernel void kernel_silu_4(
global float4 * src0,
ulong offset0,
global float4 * dst,
ulong offsetd
) {
src0 = (global float4*)((global char*)src0 + offset0);
dst = (global float4*)((global char*)dst + offsetd);
float4 x = src0[get_global_id(0)];
dst[get_global_id(0)] = x / (1.0f + exp(-x));
}

View file

@ -0,0 +1,87 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_soft_max_4_f16(
global float * src0,
ulong offset0,
global half * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
float scale,
float max_bias,
float m0,
float m1,
int n_head_log2
) {
src0 = (global float *)((global char *)src0 + offset0);
src1 = (global half *)((global char *)src1 + offset1);
dst = (global float *)((global char *)dst + offsetd);
int i03 = get_group_id(2);
int i02 = get_group_id(1);
int i01 = get_group_id(0);
global float4 * psrc4 = (global float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
global half4 * pmask = (global char *)src1 != (global char *)src0 ? (global half4 *)(src1 + i01*ne00) : 0;
global float4 * pdst4 = (global float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
float slope = 1.0f;
// ALiBi
if (max_bias > 0.0f) {
int h = i02;
float base = h < n_head_log2 ? m0 : m1;
int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
slope = pow(base, exp);
}
// parallel max
float4 lmax4 = -INFINITY;
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
lmax4 = fmax(lmax4, psrc4[i00]*scale + slope*(pmask ? convert_float4(pmask[i00]) : 0.0f));
}
float lmax = fmax(fmax(lmax4.s0, lmax4.s1), fmax(lmax4.s2, lmax4.s3));
const float max = sub_group_reduce_max(lmax);
// parallel sum
float4 lsum4 = 0.0f;
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
const float4 exp_psrc4 = exp((psrc4[i00]*scale + slope*(pmask ? convert_float4(pmask[i00]) : 0.0f)) - max);
lsum4 += exp_psrc4;
pdst4[i00] = exp_psrc4;
}
float lsum = lsum4.s0 + lsum4.s1 + lsum4.s2 + lsum4.s3;
const float sum = sub_group_reduce_add(lsum);
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
pdst4[i00] /= sum;
}
}

View file

@ -0,0 +1,87 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_soft_max_4(
global float * src0,
ulong offset0,
global float * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
float scale,
float max_bias,
float m0,
float m1,
int n_head_log2
) {
src0 = (global float*)((global char*)src0 + offset0);
src1 = (global float*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
int i03 = get_group_id(2);
int i02 = get_group_id(1);
int i01 = get_group_id(0);
global float4 * psrc4 = (global float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
global float4 * pmask = src1 != src0 ? (global float4 *)(src1 + i01*ne00) : 0;
global float4 * pdst4 = (global float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
float slope = 1.0f;
// ALiBi
if (max_bias > 0.0f) {
int h = i02;
float base = h < n_head_log2 ? m0 : m1;
int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
slope = pow(base, exp);
}
// parallel max
float4 lmax4 = -INFINITY;
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
}
float lmax = fmax(fmax(lmax4.s0, lmax4.s1), fmax(lmax4.s2, lmax4.s3));
const float max = sub_group_reduce_max(lmax);
// parallel sum
float4 lsum4 = 0.0f;
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max);
lsum4 += exp_psrc4;
pdst4[i00] = exp_psrc4;
}
float lsum = lsum4.s0 + lsum4.s1 + lsum4.s2 + lsum4.s3;
const float sum = sub_group_reduce_add(lsum);
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
pdst4[i00] /= sum;
}
}

View file

@ -0,0 +1,86 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_soft_max_f16(
global float * src0,
ulong offset0,
global half * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
float scale,
float max_bias,
float m0,
float m1,
int n_head_log2
) {
src0 = (global float *)((global char *)src0 + offset0);
src1 = (global half *)((global char *)src1 + offset1);
dst = (global float *)((global char *)dst + offsetd);
int i03 = get_group_id(2);
int i02 = get_group_id(1);
int i01 = get_group_id(0);
global float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
global half * pmask = (global char *)src1 != (global char *)src0 ? src1 + i01*ne00 : 0;
global float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
float slope = 1.0f;
// ALiBi
if (max_bias > 0.0f) {
int h = i02;
float base = h < n_head_log2 ? m0 : m1;
int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
slope = pow(base, exp);
}
// parallel max
float lmax = -INFINITY;
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
lmax = fmax(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
}
float max = sub_group_reduce_max(lmax);
// parallel sum
float lsum = 0.0f;
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max);
lsum += exp_psrc0;
// Remember the result of exp here. exp is expensive, so we really do not
// wish to compute it twice.
pdst[i00] = exp_psrc0;
}
const float sum = sub_group_reduce_add(lsum);
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
pdst[i00] /= sum;
}
}

View file

@ -0,0 +1,86 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_soft_max(
global float * src0,
ulong offset0,
global float * src1,
ulong offset1,
global float * dst,
ulong offsetd,
int ne00,
int ne01,
int ne02,
float scale,
float max_bias,
float m0,
float m1,
int n_head_log2
) {
src0 = (global float*)((global char*)src0 + offset0);
src1 = (global float*)((global char*)src1 + offset1);
dst = (global float*)((global char*)dst + offsetd);
int i03 = get_group_id(2);
int i02 = get_group_id(1);
int i01 = get_group_id(0);
global float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
global float * pmask = src1 != src0 ? src1 + i01*ne00 : 0;
global float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
float slope = 1.0f;
// ALiBi
if (max_bias > 0.0f) {
int h = i02;
float base = h < n_head_log2 ? m0 : m1;
int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
slope = pow(base, exp);
}
// parallel max
float lmax = -INFINITY;
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
lmax = fmax(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
}
float max = sub_group_reduce_max(lmax);
// parallel sum
float lsum = 0.0f;
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max);
lsum += exp_psrc0;
// Remember the result of exp here. exp is expensive, so we really do not
// wish to compute it twice.
pdst[i00] = exp_psrc0;
}
const float sum = sub_group_reduce_add(lsum);
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
pdst[i00] /= sum;
}
}
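The ALiBi branch in the four soft_max kernels above expects `m0`, `m1` and `n_head_log2` to be precomputed on the host. In ggml these are conventionally derived from the head count and `max_bias` roughly as below; treat this as a hedged sketch rather than a quote of the host code in this commit:

// Illustrative host-side derivation of the ALiBi slope parameters consumed by kernel_soft_max*.
#include <cmath>
#include <cstdint>

void alibi_params(uint32_t n_head, float max_bias, uint32_t & n_head_log2, float & m0, float & m1) {
    // Largest power of two that is <= n_head.
    n_head_log2 = 1u << (uint32_t) std::floor(std::log2((double) n_head));
    m0 = std::pow(2.0f, -(max_bias)        / (float) n_head_log2);
    m1 = std::pow(2.0f, -(max_bias / 2.0f) / (float) n_head_log2);
    // Heads below n_head_log2 then use powers of m0, the rest odd powers of m1, as in the kernels.
}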

View file

@ -0,0 +1,84 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
// 16-bit transpose, loading/storing a 4x4 tile of elements
kernel void kernel_transpose_16(
__read_only image1d_buffer_t input,
__write_only image1d_buffer_t output,
const uint rows,
const uint cols
) {
const int i = get_global_id(0);
const int j = get_global_id(1);
const int i_2 = i<<2;
const int j_2 = j<<2;
half4 temp0 = read_imageh(input, (j_2+0)*cols+i);
half4 temp1 = read_imageh(input, (j_2+1)*cols+i);
half4 temp2 = read_imageh(input, (j_2+2)*cols+i);
half4 temp3 = read_imageh(input, (j_2+3)*cols+i);
write_imageh(output, (i_2+0)*rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
write_imageh(output, (i_2+1)*rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
write_imageh(output, (i_2+2)*rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
write_imageh(output, (i_2+3)*rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
}
// 32-bit transpose, loading/storing a 4x4 tile of elements
kernel void kernel_transpose_32(
__read_only image1d_buffer_t input,
__write_only image1d_buffer_t output,
const uint rows,
const uint cols
) {
const int i = get_global_id(0);
const int j = get_global_id(1);
const int i_2 = i<<2;
const int j_2 = j<<2;
float4 temp0 = read_imagef(input, (j_2+0)*cols+i);
float4 temp1 = read_imagef(input, (j_2+1)*cols+i);
float4 temp2 = read_imagef(input, (j_2+2)*cols+i);
float4 temp3 = read_imagef(input, (j_2+3)*cols+i);
write_imagef(output, (i_2+0)*rows+j, (float4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0));
write_imagef(output, (i_2+1)*rows+j, (float4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
write_imagef(output, (i_2+2)*rows+j, (float4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
write_imagef(output, (i_2+3)*rows+j, (float4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
}
// 32-bit transpose, loading/storing a 4x4 tile of elements
// Only used for activations
// converts to FP16
// also adds zero padding for non multiple of 8 prompt lengths
kernel void kernel_transpose_32_16(__read_only image1d_buffer_t input, __write_only image1d_buffer_t output, const uint rows, const uint cols, const uint padded_rows) {
const int i = get_global_id(0);
const int j = get_global_id(1);
const int i_2 = i<<2;
const int j_2 = j<<2;
half4 temp0 = {0,0,0,0}; // initialize outputs to 0
half4 temp1 = {0,0,0,0};
half4 temp2 = {0,0,0,0};
half4 temp3 = {0,0,0,0};
if((j_2+0)*cols+i*4+3 < rows*cols*16){ // only load from a valid location. Otherwise keep register data as 0
temp0 = read_imageh(input, (j_2+0)*cols+i);
}
if((j_2+1)*cols+i*4+3 < rows*cols*16){
temp1 = read_imageh(input, (j_2+1)*cols+i);
}
if((j_2+2)*cols+i*4+3 < rows*cols*16){
temp2 = read_imageh(input, (j_2+2)*cols+i);
}
if((j_2+3)*cols+i*4+3 < rows*cols*16){
temp3 = read_imageh(input, (j_2+3)*cols+i);
}
write_imageh(output, (i_2+0)*padded_rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0)); // no conditionals for output, includes zero padding
write_imageh(output, (i_2+1)*padded_rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1));
write_imageh(output, (i_2+2)*padded_rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2));
write_imageh(output, (i_2+3)*padded_rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3));
}
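Each work-item above handles a 4x4 tile, so the expected global range is one work-item per vec4 column and per group of four rows. A sketch of that geometry, under the assumption (editor's, suggested by the indexing) that `rows` and `cols` are passed in units of 4-element vectors:

// Hypothetical launch-geometry helper for kernel_transpose_32 / kernel_transpose_16.
#include <CL/cl.h>
#include <cstddef>

cl_int enqueue_transpose(cl_command_queue queue, cl_kernel transpose_kernel,
                         size_t rows_vec4, size_t cols_vec4) {
    // get_global_id(0) walks the vec4 columns, get_global_id(1) walks groups of four rows.
    const size_t global[2] = { cols_vec4, rows_vec4 };
    return clEnqueueNDRangeKernel(queue, transpose_kernel, 2, NULL, global, NULL, 0, NULL, NULL);
}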

View file

@ -1,6 +1,7 @@
#include "ggml-rpc.h" #include "ggml-rpc.h"
#include "ggml-impl.h" #include "ggml-impl.h"
#include "ggml-backend-impl.h" #include "ggml-backend-impl.h"
#include "ggml-cpp.h"
#include <cinttypes> #include <cinttypes>
#include <string> #include <string>
@ -853,12 +854,13 @@ bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_
/*.no_alloc =*/ true, /*.no_alloc =*/ true,
}; };
struct ggml_context * ctx = ggml_init(params); ggml_context_ptr ctx_ptr { ggml_init(params) };
GGML_ASSERT(ctx_ptr != nullptr);
ggml_context * ctx = ctx_ptr.get();
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor); ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
if (tensor == nullptr) { if (tensor == nullptr) {
GGML_LOG_ERROR("Null tensor pointer passed to server get_alloc_size function.\n"); GGML_LOG_ERROR("Null tensor pointer passed to server get_alloc_size function.\n");
ggml_free(ctx);
return false; return false;
} }
@ -871,7 +873,6 @@ bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_
response.alloc_size = ggml_backend_buft_get_alloc_size(buft,tensor); response.alloc_size = ggml_backend_buft_get_alloc_size(buft,tensor);
ggml_free(ctx);
return true; return true;
} }
@ -985,11 +986,12 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
/*.mem_buffer =*/ NULL, /*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true, /*.no_alloc =*/ true,
}; };
struct ggml_context * ctx = ggml_init(params); ggml_context_ptr ctx_ptr { ggml_init(params) };
GGML_ASSERT(ctx_ptr != nullptr);
ggml_context * ctx = ctx_ptr.get();
ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor); ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
if (tensor == nullptr) { if (tensor == nullptr) {
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__); GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
ggml_free(ctx);
return false; return false;
} }
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size); GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
@ -1016,7 +1018,6 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
printf("[%s] saved to '%s'\n", __func__, cache_file.c_str()); printf("[%s] saved to '%s'\n", __func__, cache_file.c_str());
} }
ggml_backend_tensor_set(tensor, data, offset, size); ggml_backend_tensor_set(tensor, data, offset, size);
ggml_free(ctx);
return true; return true;
} }
@ -1060,11 +1061,12 @@ bool rpc_server::set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set
/*.mem_buffer =*/ NULL, /*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true, /*.no_alloc =*/ true,
}; };
struct ggml_context * ctx = ggml_init(params); ggml_context_ptr ctx_ptr { ggml_init(params) };
GGML_ASSERT(ctx_ptr != nullptr);
ggml_context * ctx = ctx_ptr.get();
ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor); ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
if (tensor == nullptr) { if (tensor == nullptr) {
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__); GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
ggml_free(ctx);
return false; return false;
} }
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu, hash: %" PRIx64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size, *hash); GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu, hash: %" PRIx64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size, *hash);
@ -1080,7 +1082,6 @@ bool rpc_server::set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set
} }
ggml_backend_tensor_set(tensor, cached_file.data(), offset, size); ggml_backend_tensor_set(tensor, cached_file.data(), offset, size);
response.result = 1; response.result = 1;
ggml_free(ctx);
return true; return true;
} }
@ -1090,11 +1091,12 @@ bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
/*.mem_buffer =*/ NULL, /*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true, /*.no_alloc =*/ true,
}; };
struct ggml_context * ctx = ggml_init(params); ggml_context_ptr ctx_ptr { ggml_init(params) };
GGML_ASSERT(ctx_ptr != nullptr);
ggml_context * ctx = ctx_ptr.get();
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor); ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
if (tensor == nullptr) { if (tensor == nullptr) {
GGML_LOG_ERROR("Null tensor pointer passed to server init_tensor function.\n"); GGML_LOG_ERROR("Null tensor pointer passed to server init_tensor function.\n");
ggml_free(ctx);
return false; return false;
} }
@ -1110,11 +1112,9 @@ bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
// This pointer can either be passed around client/server, or probably better stored server-side and kept track of. // This pointer can either be passed around client/server, or probably better stored server-side and kept track of.
// Currently unimplemented. // Currently unimplemented.
GGML_LOG_ERROR("tensor->extra populated by the backend, this is currently unsupported.\n"); GGML_LOG_ERROR("tensor->extra populated by the backend, this is currently unsupported.\n");
ggml_free(ctx);
return false; return false;
} }
ggml_free(ctx);
return true; return true;
} }
@ -1124,11 +1124,12 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<
/*.mem_buffer =*/ NULL, /*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true, /*.no_alloc =*/ true,
}; };
struct ggml_context * ctx = ggml_init(params); ggml_context_ptr ctx_ptr { ggml_init(params) };
GGML_ASSERT(ctx_ptr != nullptr);
ggml_context * ctx = ctx_ptr.get();
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor); ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
if (tensor == nullptr) { if (tensor == nullptr) {
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__); GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
ggml_free(ctx);
return false; return false;
} }
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, request.offset, request.size); GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, request.offset, request.size);
@ -1147,7 +1148,6 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<
response.resize(request.size, 0); response.resize(request.size, 0);
ggml_backend_tensor_get(tensor, response.data(), request.offset, request.size); ggml_backend_tensor_get(tensor, response.data(), request.offset, request.size);
ggml_free(ctx);
return true; return true;
} }
@ -1157,12 +1157,14 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
/*.mem_buffer =*/ NULL, /*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true, /*.no_alloc =*/ true,
}; };
struct ggml_context * ctx = ggml_init(params); ggml_context_ptr ctx_ptr { ggml_init(params) };
GGML_ASSERT(ctx_ptr != nullptr);
ggml_context * ctx = ctx_ptr.get();
ggml_tensor * src = deserialize_tensor(ctx, &request.src); ggml_tensor * src = deserialize_tensor(ctx, &request.src);
ggml_tensor * dst = deserialize_tensor(ctx, &request.dst); ggml_tensor * dst = deserialize_tensor(ctx, &request.dst);
if (src == nullptr || dst == nullptr) { if (src == nullptr || dst == nullptr) {
GGML_LOG_ERROR("[%s] error deserializing tensors\n", __func__); GGML_LOG_ERROR("[%s] error deserializing tensors\n", __func__);
ggml_free(ctx);
return false; return false;
} }
@ -1180,7 +1182,6 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
dst_data + src_size, dst_data + src_size,
dst_base, dst_base,
dst_base + dst_buf_sz); dst_base + dst_buf_sz);
ggml_free(ctx);
return false; return false;
} }
@ -1188,7 +1189,6 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
__func__, (void*) src->buffer, (void*) dst->buffer); __func__, (void*) src->buffer, (void*) dst->buffer);
response.result = ggml_backend_buffer_copy_tensor(src, dst); response.result = ggml_backend_buffer_copy_tensor(src, dst);
ggml_free(ctx);
return true; return true;
} }
@ -1242,7 +1242,9 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph
/*.mem_buffer =*/ NULL, /*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true, /*.no_alloc =*/ true,
}; };
struct ggml_context * ctx = ggml_init(params); ggml_context_ptr ctx_ptr { ggml_init(params) };
GGML_ASSERT(ctx_ptr != nullptr);
ggml_context * ctx = ctx_ptr.get();
struct ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false); struct ggml_cgraph * graph = ggml_new_graph_custom(ctx, n_nodes, false);
graph->n_nodes = n_nodes; graph->n_nodes = n_nodes;
std::unordered_map<uint64_t, const rpc_tensor*> tensor_ptrs; std::unordered_map<uint64_t, const rpc_tensor*> tensor_ptrs;
@ -1257,7 +1259,6 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph
} }
ggml_status status = ggml_backend_graph_compute(backend, graph); ggml_status status = ggml_backend_graph_compute(backend, graph);
response.result = status; response.result = status;
ggml_free(ctx);
return true; return true;
} }
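The repeated removal of the explicit `ggml_free(ctx)` calls above is what the new `ggml-cpp.h` include enables: `ggml_context_ptr` is a `std::unique_ptr` whose deleter calls `ggml_free`, so every early `return false` now releases the context automatically. Roughly (editor's sketch of the wrapper, not a verbatim quote of the header):

// Approximate shape of the RAII wrapper used by the rpc-server changes above.
#include <memory>

struct ggml_context;                 // opaque ggml type
extern "C" void ggml_free(struct ggml_context * ctx);

struct ggml_context_deleter {
    void operator()(ggml_context * ctx) const { ggml_free(ctx); }
};
typedef std::unique_ptr<ggml_context, ggml_context_deleter> ggml_context_ptr;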

View file

@ -4009,17 +4009,14 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_ROPE:
{
const int mode = ((const int32_t *) op->op_params)[2];
-if (mode & GGML_ROPE_TYPE_MROPE) {
-return false;
-}
-if (mode & GGML_ROPE_TYPE_VISION) {
+// mode is not used as a bitmask in practice, the various rope type modes are independent implementations
+if (mode == GGML_ROPE_TYPE_MROPE) {
return false;
}
return ggml_is_contiguous(op->src[0]);
}
case GGML_OP_IM2COL:
-// TODO: add support for the new F32 operations
-return op->src[0]->type == GGML_TYPE_F16;
+return true;
case GGML_OP_UPSCALE:
return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_POOL_2D:
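For reference (editor's note): the rope `mode` values in ggml.h make the vision mode a superset of the mrope bit, which is why the old bitmask test also rejected the vision variant and the new code compares for equality instead. A small hedged illustration with copies of what the upstream values are believed to be:

// Why `mode & GGML_ROPE_TYPE_MROPE` used to match the vision rope as well.
enum {
    ROPE_TYPE_NEOX_EX   = 2,   // illustrative copies of the upstream constants, not the real macros
    ROPE_TYPE_MROPE_EX  = 8,
    ROPE_TYPE_VISION_EX = 24,  // 8 | 16 -- shares the mrope bit
};

static_assert((ROPE_TYPE_VISION_EX & ROPE_TYPE_MROPE_EX) != 0,
              "vision mode sets the mrope bit, so a bitmask test cannot tell them apart");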

View file

@ -12,110 +12,125 @@
#include "im2col.hpp" #include "im2col.hpp"
#include <sycl/sycl.hpp>
#include <type_traits> // For std::is_same_v
#include "ggml.h"
template <typename T> template <typename T>
static void im2col_kernel( static void im2col_kernel(const float * x, T * dst, int64_t batch_offset, int64_t offset_delta, int64_t IC, int64_t IW,
const float *x, T *dst, int64_t batch_offset, int64_t offset_delta, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int s0, int s1, int p0, int p1, int d0, int d1, const sycl::nd_item<3> & item_ct1) {
int64_t pelements, int64_t CHW, int s0, int s1, int p0, int p1, int d0, int d1,
const sycl::nd_item<3> &item_ct1) {
const int64_t work_group_size = item_ct1.get_local_range(2); const int64_t work_group_size = item_ct1.get_local_range(2);
const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2); const int64_t global_id = item_ct1.get_local_id(2) + (work_group_size * item_ct1.get_group(2));
// make each work-item deal with more elements since sycl global range can not exceed max int // make each work-item deal with more elements since sycl global range can not exceed max int
for (int64_t i = global_id; i < pelements; i += work_group_size * item_ct1.get_group_range(2)) { for (int64_t i = global_id; i < pelements; i += (work_group_size * item_ct1.get_group_range(2))) {
const int64_t ksize = OW * (KH > 1 ? KW : 1); const int64_t ksize = OW * (KH > 1 ? KW : 1);
const int64_t kx = i / ksize; const int64_t kx = i / ksize;
const int64_t kd = kx * ksize; const int64_t kd = kx * ksize;
const int64_t ky = (i - kd) / OW; const int64_t ky = (i - kd) / OW;
const int64_t ix = i % OW; const int64_t ix = i % OW;
const int64_t oh = item_ct1.get_group(1); const int64_t oh = item_ct1.get_group(1);
const int64_t batch = item_ct1.get_group(0) / IC; const int64_t batch = item_ct1.get_group(0) / IC;
const int64_t ic = item_ct1.get_group(0) % IC; const int64_t ic = item_ct1.get_group(0) % IC;
const int64_t iiw = ix * s0 + kx * d0 - p0; const int64_t iiw = (ix * s0) + (kx * d0) - p0;
const int64_t iih = oh * s1 + ky * d1 - p1; const int64_t iih = (oh * s1) + (ky * d1) - p1;
const int64_t offset_dst = const int64_t offset_dst = (((batch * OH + oh) * OW + ix) * CHW) + (ic * (KW * KH) + ky * KW + kx);
((batch * OH + oh) * OW + ix) * CHW +
(ic * (KW * KH) + ky * KW + kx);
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { const int64_t offset_src_base = (ic * offset_delta) + (batch * batch_offset);
dst[offset_dst] = const int64_t offset_src = offset_src_base + (iih * IW) + iiw;
sycl::vec<float, 1>(0.0f)
.convert<sycl::half, sycl::rounding_mode::automatic>()[0]; const bool out_of_bounds = (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW);
} else { const float src_val = out_of_bounds ? 0.0f : x[offset_src];
const int64_t offset_src = ic * offset_delta + batch * batch_offset;
dst[offset_dst] = if constexpr (std::is_same_v<T, sycl::half>) {
sycl::vec<float, 1>(x[offset_src + iih * IW + iiw]) dst[offset_dst] = sycl::half(src_val);
.convert<sycl::half, sycl::rounding_mode::automatic>()[0]; } else if constexpr (std::is_same_v<T, float>) {
dst[offset_dst] = src_val;
} }
} }
} }
template <typename T> template <typename T>
static void im2col_sycl( static void im2col_sycl_internal(const float * x, T * dst, int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW,
const float *x, T *dst, int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC, int64_t batch, int64_t batch_offset, int64_t offset_delta,
int64_t KH, int64_t IC, int64_t batch, int64_t batch_offset, int64_t offset_delta, int s0, int s1, int p0, int p1, int d0, int d1, queue_ptr stream) {
int s0, int s1, int p0, int p1, int d0, int d1,
queue_ptr stream) {
const int64_t parallel_elements = OW * KW * KH; const int64_t parallel_elements = OW * KW * KH;
const int64_t num_blocks = (parallel_elements + SYCL_IM2COL_BLOCK_SIZE - 1) / SYCL_IM2COL_BLOCK_SIZE; const int64_t num_blocks = (parallel_elements + SYCL_IM2COL_BLOCK_SIZE - 1) / SYCL_IM2COL_BLOCK_SIZE;
// decrease global range when it exceeds the max int // decrease global range when it exceeds the max int
int64_t local_size = downsample_sycl_global_range(batch * IC * OH * num_blocks, SYCL_IM2COL_BLOCK_SIZE); int64_t local_size = downsample_sycl_global_range(batch * IC * OH * num_blocks, SYCL_IM2COL_BLOCK_SIZE);
sycl::range<3> block_nums(batch * IC, OH, num_blocks); sycl::range<3> block_nums(batch * IC, OH, num_blocks);
sycl::range<3> local_range(1, 1, local_size); sycl::range<3> local_range(1, 1, local_size);
{ const int64_t CHW = IC * KH * KW;
dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});
stream->parallel_for( stream->parallel_for(sycl::nd_range<3>(block_nums * local_range, local_range), [=](sycl::nd_item<3> item_ct1) {
sycl::nd_range<3>(block_nums * local_range, local_range), im2col_kernel<T>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, CHW, s0, s1,
[=](sycl::nd_item<3> item_ct1) { p0, p1, d0, d1, item_ct1);
im2col_kernel(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, });
parallel_elements, (IC * KH * KW), s0, s1, p0,
p1, d0, d1, item_ct1);
});
}
} }
void ggml_sycl_op_im2col(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { static void im2col_sycl_f16(const float * x, sycl::half * dst, int64_t IW, int64_t IH, int64_t OW, int64_t OH,
int64_t KW, int64_t KH, int64_t IC, int64_t batch, int64_t batch_offset,
int64_t offset_delta, int s0, int s1, int p0, int p1, int d0, int d1, queue_ptr stream) {
if (!stream->get_device().has(sycl::aspect::fp16)) {
throw sycl::exception(sycl::make_error_code(sycl::errc::kernel_not_supported),
"Device does not support half precision (fp16) operations!");
}
im2col_sycl_internal<sycl::half>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0,
p1, d0, d1, stream);
}
static void im2col_sycl_f32(const float * x, float * dst, int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW,
int64_t KH, int64_t IC, int64_t batch, int64_t batch_offset, int64_t offset_delta, int s0,
int s1, int p0, int p1, int d0, int d1, queue_ptr stream) {
im2col_sycl_internal<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1,
d0, d1, stream);
}
void ggml_sycl_op_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; const int32_t s0 = ((const int32_t *) (dst->op_params))[0];
const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; const int32_t s1 = ((const int32_t *) (dst->op_params))[1];
const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; const int32_t p0 = ((const int32_t *) (dst->op_params))[2];
const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; const int32_t p1 = ((const int32_t *) (dst->op_params))[3];
const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; const int32_t d0 = ((const int32_t *) (dst->op_params))[4];
const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; const int32_t d1 = ((const int32_t *) (dst->op_params))[5];
const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; const bool is_2D = ((const int32_t *) (dst->op_params))[6] == 1;
const int64_t IC = src1->ne[is_2D ? 2 : 1]; const int64_t IC = src1->ne[is_2D ? 2 : 1];
const int64_t IH = is_2D ? src1->ne[1] : 1; const int64_t IH = is_2D ? src1->ne[1] : 1;
const int64_t IW = src1->ne[0]; const int64_t IW = src1->ne[0];
const int64_t KH = is_2D ? src0->ne[1] : 1; const int64_t KH = is_2D ? src0->ne[1] : 1;
const int64_t KW = src0->ne[0]; const int64_t KW = src0->ne[0];
const int64_t OH = is_2D ? dst->ne[2] : 1; const int64_t OH = is_2D ? dst->ne[2] : 1;
const int64_t OW = dst->ne[1]; const int64_t OW = dst->ne[1];
const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32 const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / sizeof(float);
const int64_t batch = src1->ne[3]; const int64_t batch = src1->ne[is_2D ? 3 : 2];
const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32 const size_t batch_offset = src1->nb[is_2D ? 3 : 2] / sizeof(float);
queue_ptr stream = ctx.stream();
if (dst->type == GGML_TYPE_F16) { if (dst->type == GGML_TYPE_F16) {
im2col_sycl((const float *) src1->data, (sycl::half *)dst->data, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, ctx.stream()); im2col_sycl_f16((const float *) src1->data, (sycl::half *) dst->data, IW, IH, OW, OH, KW, KH, IC, batch,
batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
} else { } else {
im2col_sycl((const float *) src1->data, (float *)dst->data, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, ctx.stream()); im2col_sycl_f32((const float *) src1->data, (float *) dst->data, IW, IH, OW, OH, KW, KH, IC, batch,
batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
} }
} }
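As a plain-C++ reference for what the rewritten kernel computes (editor's illustration, same indexing as offset_dst/offset_src above, single batch and contiguous channels assumed): each output row holds one (oh, ow) position, each column one (ic, ky, kx) tap, and out-of-bounds taps are written as zero.

// Naive CPU im2col with the same layout as the SYCL kernel above (single batch for brevity).
#include <vector>
#include <cstdint>

std::vector<float> im2col_ref(const std::vector<float> & x, int64_t IC, int64_t IH, int64_t IW,
                              int64_t KH, int64_t KW, int64_t OH, int64_t OW,
                              int s0, int s1, int p0, int p1, int d0, int d1) {
    const int64_t CHW = IC * KH * KW;
    std::vector<float> dst(OH * OW * CHW, 0.0f);
    for (int64_t oh = 0; oh < OH; ++oh) {
        for (int64_t ow = 0; ow < OW; ++ow) {
            for (int64_t ic = 0; ic < IC; ++ic) {
                for (int64_t ky = 0; ky < KH; ++ky) {
                    for (int64_t kx = 0; kx < KW; ++kx) {
                        const int64_t iih = oh * s1 + ky * d1 - p1;   // input row of this tap
                        const int64_t iiw = ow * s0 + kx * d0 - p0;   // input column of this tap
                        const int64_t offset_dst = (oh * OW + ow) * CHW + (ic * KH * KW + ky * KW + kx);
                        if (iih >= 0 && iih < IH && iiw >= 0 && iiw < IW) {
                            dst[offset_dst] = x[ic * IH * IW + iih * IW + iiw];
                        }
                    }
                }
            }
        }
    }
    return dst;
}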

View file

@ -1,9 +1,15 @@
#include "rope.hpp" #include "rope.hpp"
#include "ggml-sycl/common.hpp"
#include "ggml.h"
struct rope_corr_dims { struct rope_corr_dims {
float v[2]; float v[2];
}; };
struct mrope_sections {
int v[4];
};
static float rope_yarn_ramp(const float low, const float high, const int i0) { static float rope_yarn_ramp(const float low, const float high, const int i0) {
const float y = (i0 / 2 - low) / sycl::max(0.001f, high - low); const float y = (i0 / 2 - low) / sycl::max(0.001f, high - low);
return 1.0f - sycl::min(1.0f, sycl::max(0.0f, y)); return 1.0f - sycl::min(1.0f, sycl::max(0.0f, y));
@ -114,6 +120,48 @@ static void rope_neox(
dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
}
template <typename T, bool has_ff>
static void rope_vision(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1,
const size_t s2, const int n_dims, const int32_t * pos, const float freq_scale,
const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims,
const float theta_scale, const float * freq_factors, const mrope_sections sections,
const sycl::nd_item<3> & item_ct1) {
// get index pos
const int i0 = 2 * (item_ct1.get_group(1) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1));
if (i0 >= ne0) {
return;
}
const int row_dst = (item_ct1.get_group(2) * item_ct1.get_local_range(2)) + item_ct1.get_local_id(2);
const int row_x = row_dst % ne1;
const int channel_x = row_dst / ne1;
const int idst = (row_dst * ne0) + (i0 / 2);
const size_t ix = ((size_t) channel_x * s2) + ((size_t) row_x * s1) + (i0 / 2);
const int sect_dims = sections.v[0] + sections.v[1];
const int sector = (i0 / 2) % sect_dims;
float theta_base = 0.0f;
if (sector < sections.v[0]) {
const int p = sector;
theta_base = pos[channel_x] * sycl::pow(theta_scale, (float) p);
} else {
// Simplified from CUDA backend code: if (sector >= sections.v[0] && sector < sec_w) which is just sector >= sections.v[0]
const int p = sector - sections.v[0];
theta_base = pos[channel_x + ne2] * sycl::pow(theta_scale, (float) p);
}
const float freq_factor = has_ff ? freq_factors[i0 / 2] : 1.0f;
float cos_theta;
float sin_theta;
rope_yarn(theta_base / freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
const float x0 = x[ix + 0];
const float x1 = x[ix + n_dims];
// store results in dst
dst[idst + 0] = x0 * cos_theta - x1 * sin_theta;
dst[idst + n_dims] = x0 * sin_theta + x1 * cos_theta;
}
template <typename T>
static void rope_norm_sycl(
const T *x, T *dst, int ne0, int n_dims, int nr, const int32_t *pos, float freq_scale, int p_delta_rows,
@ -192,21 +240,58 @@ static void rope_neox_sycl(
}
}
// rope vision
template <typename T>
static void rope_vision_sycl(const T * x, T * dst, const int ne0, const int ne1, const int ne2, const size_t s1,
const size_t s2, const int n_dims, const int nr, const int32_t * pos,
const float freq_scale, const float freq_base, const float ext_factor,
const float attn_factor, const rope_corr_dims corr_dims, const float * freq_factors,
const mrope_sections sections, queue_ptr stream) {
GGML_ASSERT(ne0 % 2 == 0);
const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
const int n_blocks_y = (ne0 + 2 * SYCL_ROPE_BLOCK_SIZE - 1) / (2 * SYCL_ROPE_BLOCK_SIZE);
const sycl::range<3> grid_dims(1, n_blocks_y, nr);
const sycl::nd_range<3> nd_range(grid_dims * block_dims, block_dims);
const float theta_scale = std::pow(freq_base, -2.0f / n_dims);
// Add FP16 capability check if T could be sycl::half
if constexpr (std::is_same_v<T, sycl::half>) {
dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
}
// launch kernel
if (freq_factors == nullptr) {
stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
rope_vision<T, false>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
corr_dims, theta_scale, freq_factors, sections, item_ct1);
});
} else {
stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
rope_vision<T, true>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
corr_dims, theta_scale, freq_factors, sections, item_ct1);
});
}
}
void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
GGML_ASSERT(dst->src[0]->type == dst->type); GGML_ASSERT(dst->src[0]->type == dst->type);
const int64_t ne00 = dst->src[0]->ne[0]; // head dims
const int64_t ne00 = dst->src[0]->ne[0]; const int64_t ne01 = dst->src[0]->ne[1]; // num heads
const int64_t ne01 = dst->src[0]->ne[1]; const int64_t ne02 = dst->src[0]->ne[2]; // num heads
const int64_t nr = ggml_nrows(dst->src[0]); const int64_t nr = ggml_nrows(dst->src[0]);
const size_t s01 = dst->src[0]->nb[1] / ggml_type_size(dst->src[0]->type);
const size_t s02 = dst->src[0]->nb[2] / ggml_type_size(dst->src[0]->type);
//const int n_past = ((int32_t *) dst->op_params)[0]; //const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1]; const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2]; const int mode = ((int32_t *) dst->op_params)[2];
//const int n_ctx = ((int32_t *) dst->op_params)[3]; //const int n_ctx = ((int32_t *) dst->op_params)[3];
const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
mrope_sections sections;
// RoPE alteration for extended context // RoPE alteration for extended context
float freq_base; float freq_base;
@ -222,8 +307,10 @@ void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
memcpy(&sections.v, (int32_t *) dst->op_params + 11, sizeof(int)*4);
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
const int32_t * pos = (const int32_t *) dst->src[1]->data; const int32_t * pos = (const int32_t *) dst->src[1]->data;
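
For reference, the op_params slots read by this function, gathered into a single sketch; slots 5-7 are not visible in these hunks and are assumed to follow the usual ggml rope layout.

// illustrative view of the ggml rope op_params consumed above (int32 storage,
// float slots are bit-cast via memcpy)
struct rope_op_params_view {
    int32_t n_dims;      // op_params[1]
    int32_t mode;        // op_params[2]
    int32_t n_ctx_orig;  // op_params[4]
    float   freq_base;   // op_params[5]  (assumed)
    float   freq_scale;  // op_params[6]  (assumed)
    float   ext_factor;  // op_params[7]  (assumed)
    float   attn_factor; // op_params[8]
    float   beta_fast;   // op_params[9]
    float   beta_slow;   // op_params[10]
    int32_t sections[4]; // op_params[11..14] (mrope/vision)
};
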
@ -240,6 +327,7 @@ void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
// compute // compute
if (is_neox) { if (is_neox) {
GGML_SYCL_DEBUG("%s: neox path\n", __func__);
if (dst->src[0]->type == GGML_TYPE_F32) { if (dst->src[0]->type == GGML_TYPE_F32) {
rope_neox_sycl( rope_neox_sycl(
(const float *)dst->src[0]->data, (float *)dst->data, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, (const float *)dst->src[0]->data, (float *)dst->data, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
@ -253,7 +341,19 @@ void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
} else { } else {
GGML_ABORT("fatal error"); GGML_ABORT("fatal error");
} }
} else if (is_vision) {
GGML_SYCL_DEBUG("%s: vision path\n", __func__);
if (dst->src[0]->type == GGML_TYPE_F16) {
rope_vision_sycl((const sycl::half *)dst->src[0]->data, (sycl::half *)dst->data, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, main_stream);
} else if (dst->src[0]->type == GGML_TYPE_F32) {
rope_vision_sycl((const float *) dst->src[0]->data, (float *)dst->data, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, main_stream);
} else {
GGML_ABORT("Fatal error: Tensor type unsupported!");
}
} else { } else {
GGML_SYCL_DEBUG("%s: norm path\n", __func__);
if (dst->src[0]->type == GGML_TYPE_F32) { if (dst->src[0]->type == GGML_TYPE_F32) {
rope_norm_sycl( rope_norm_sycl(
(const float *)dst->src[0]->data, (float *)dst->data, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, (const float *)dst->src[0]->data, (float *)dst->data, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,

View file

@ -139,6 +139,8 @@ class Keys:
REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count" REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
SLIDING_WINDOW = "{arch}.attention.sliding_window" SLIDING_WINDOW = "{arch}.attention.sliding_window"
SCALE = "{arch}.attention.scale" SCALE = "{arch}.attention.scale"
KEY_LENGTH_MLA = "{arch}.attention.key_length_mla"
VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla"
class Rope: class Rope:
DIMENSION_COUNT = "{arch}.rope.dimension_count" DIMENSION_COUNT = "{arch}.rope.dimension_count"
@ -382,6 +384,8 @@ class MODEL_TENSOR(IntEnum):
ATTN_Q_B = auto() ATTN_Q_B = auto()
ATTN_KV_A_MQA = auto() ATTN_KV_A_MQA = auto()
ATTN_KV_B = auto() ATTN_KV_B = auto()
ATTN_K_B = auto()
ATTN_V_B = auto()
ATTN_Q_A_NORM = auto() ATTN_Q_A_NORM = auto()
ATTN_KV_A_NORM = auto() ATTN_KV_A_NORM = auto()
FFN_SUB_NORM = auto() FFN_SUB_NORM = auto()
@ -590,6 +594,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b", MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa", MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b", MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
MODEL_TENSOR.ATTN_K_B: "blk.{bid}.attn_k_b",
MODEL_TENSOR.ATTN_V_B: "blk.{bid}.attn_v_b",
MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm", MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm", MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm", MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
@ -1517,6 +1523,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ATTN_Q_B, MODEL_TENSOR.ATTN_Q_B,
MODEL_TENSOR.ATTN_KV_A_MQA, MODEL_TENSOR.ATTN_KV_A_MQA,
MODEL_TENSOR.ATTN_KV_B, MODEL_TENSOR.ATTN_KV_B,
MODEL_TENSOR.ATTN_K_B,
MODEL_TENSOR.ATTN_V_B,
MODEL_TENSOR.ATTN_Q_A_NORM, MODEL_TENSOR.ATTN_Q_A_NORM,
MODEL_TENSOR.ATTN_KV_A_NORM, MODEL_TENSOR.ATTN_KV_A_NORM,
MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.ATTN_OUT,

View file

@ -689,6 +689,12 @@ class GGUFWriter:
def add_value_length(self, length: int) -> None: def add_value_length(self, length: int) -> None:
self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length) self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length)
def add_key_length_mla(self, length: int) -> None:
self.add_uint32(Keys.Attention.KEY_LENGTH_MLA.format(arch=self.arch), length)
def add_value_length_mla(self, length: int) -> None:
self.add_uint32(Keys.Attention.VALUE_LENGTH_MLA.format(arch=self.arch), length)
def add_max_alibi_bias(self, bias: float) -> None: def add_max_alibi_bias(self, bias: float) -> None:
self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias) self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias)

View file

@ -677,6 +677,14 @@ class TensorNameMap:
"model.layers.{bid}.self_attn.kv_b_proj", # deepseek2 "model.layers.{bid}.self_attn.kv_b_proj", # deepseek2
), ),
MODEL_TENSOR.ATTN_K_B: (
"model.layers.{bid}.self_attn.k_b_proj", # deepseek2
),
MODEL_TENSOR.ATTN_V_B: (
"model.layers.{bid}.self_attn.v_b_proj", # deepseek2
),
MODEL_TENSOR.ATTN_Q_A_NORM: ( MODEL_TENSOR.ATTN_Q_A_NORM: (
"model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2 "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2
), ),

View file

@ -369,17 +369,18 @@ extern "C" {
// model quantization parameters // model quantization parameters
typedef struct llama_model_quantize_params { typedef struct llama_model_quantize_params {
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype enum llama_ftype ftype; // quantize to this llama_ftype
enum ggml_type output_tensor_type; // output tensor type enum ggml_type output_tensor_type; // output tensor type
enum ggml_type token_embedding_type; // token embeddings tensor type enum ggml_type token_embedding_type; // token embeddings tensor type
bool allow_requantize; // allow quantizing non-f32/f16 tensors bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type bool pure; // quantize all tensors to the default type
bool keep_split; // quantize to the same number of shards bool keep_split; // quantize to the same number of shards
void * imatrix; // pointer to importance matrix data void * imatrix; // pointer to importance matrix data
void * kv_overrides; // pointer to vector containing overrides void * kv_overrides; // pointer to vector containing overrides
void * tensor_types; // pointer to vector containing tensor types
} llama_model_quantize_params; } llama_model_quantize_params;
typedef struct llama_logit_bias { typedef struct llama_logit_bias {

View file

@ -50,7 +50,7 @@ logit_bias_max = 512
dry_seq_break_max = 128 dry_seq_break_max = 128
# global vars # global vars
KcppVersion = "1.88" KcppVersion = "1.89"
showdebug = True showdebug = True
kcpp_instance = None #global running instance kcpp_instance = None #global running instance
global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False} global_memory = {"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False}

View file

@ -140,6 +140,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
{ LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
{ LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
{ LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
{ LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@ -1103,6 +1105,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" }, { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
{ LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" }, { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
{ LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" }, { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
{ LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" },
{ LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
@ -1563,23 +1567,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_DOWN_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_GATE_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_UP_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_Q_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},

View file

@ -144,6 +144,8 @@ enum llm_kv {
LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SLIDING_WINDOW,
LLM_KV_ATTENTION_SCALE, LLM_KV_ATTENTION_SCALE,
LLM_KV_ATTENTION_KEY_LENGTH_MLA,
LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_SECTIONS, LLM_KV_ROPE_DIMENSION_SECTIONS,
@ -306,6 +308,8 @@ enum llm_tensor {
LLM_TENSOR_ATTN_Q_B, LLM_TENSOR_ATTN_Q_B,
LLM_TENSOR_ATTN_KV_A_MQA, LLM_TENSOR_ATTN_KV_A_MQA,
LLM_TENSOR_ATTN_KV_B, LLM_TENSOR_ATTN_KV_B,
LLM_TENSOR_ATTN_K_B,
LLM_TENSOR_ATTN_V_B,
LLM_TENSOR_ATTN_Q_A_NORM, LLM_TENSOR_ATTN_Q_A_NORM,
LLM_TENSOR_ATTN_KV_A_NORM, LLM_TENSOR_ATTN_KV_A_NORM,
LLM_TENSOR_ATTN_SUB_NORM, LLM_TENSOR_ATTN_SUB_NORM,

View file

@ -10,6 +10,7 @@
#include <cstring> #include <cstring>
#include <stdexcept> #include <stdexcept>
#include <cinttypes> #include <cinttypes>
#include <cmath>
// //
// llama_context // llama_context
@ -473,7 +474,6 @@ ggml_tensor * llama_context::build_rope_shift(
const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
const auto & yarn_ext_factor = cparams.yarn_ext_factor; const auto & yarn_ext_factor = cparams.yarn_ext_factor;
const auto & yarn_attn_factor = cparams.yarn_attn_factor;
const auto & yarn_beta_fast = cparams.yarn_beta_fast; const auto & yarn_beta_fast = cparams.yarn_beta_fast;
const auto & yarn_beta_slow = cparams.yarn_beta_slow; const auto & yarn_beta_slow = cparams.yarn_beta_slow;
@ -482,6 +482,10 @@ ggml_tensor * llama_context::build_rope_shift(
const auto & n_rot = hparams.n_rot; const auto & n_rot = hparams.n_rot;
const auto & rope_type = hparams.rope_type; const auto & rope_type = hparams.rope_type;
// See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
const float yarn_attn_factor_scaled = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
ggml_tensor * tmp; ggml_tensor * tmp;
if (ggml_is_quantized(cur->type)) { if (ggml_is_quantized(cur->type)) {
@ -500,14 +504,14 @@ ggml_tensor * llama_context::build_rope_shift(
tmp = ggml_rope_ext_inplace(ctx0, tmp, tmp = ggml_rope_ext_inplace(ctx0, tmp,
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); yarn_ext_factor, yarn_attn_factor_scaled, yarn_beta_fast, yarn_beta_slow);
tmp = ggml_cpy(ctx0, tmp, cur); tmp = ggml_cpy(ctx0, tmp, cur);
} else { } else {
// we rotate only the first n_rot dimensions // we rotate only the first n_rot dimensions
tmp = ggml_rope_ext_inplace(ctx0, cur, tmp = ggml_rope_ext_inplace(ctx0, cur,
shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); yarn_ext_factor, yarn_attn_factor_scaled, yarn_beta_fast, yarn_beta_slow);
} }
return tmp; return tmp;
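
A minimal sketch of why the deepseek2 special case above uses a rescaled attention factor for the K-shift, assuming the backend's rope_yarn multiplies the rotation magnitude by 1 + 0.1f*logf(1/freq_scale) whenever ext_factor != 0, as the CPU reference path does.

#include <cmath>

static float deepseek2_shift_attn_factor(float freq_scale) {
    const float yarn_mscale = 1.0f + 0.1f * std::log(1.0f / freq_scale);
    // passing 1/yarn_mscale cancels that internal multiplication, so the shift
    // applies no extra magnitude scaling; the actual YaRN correction is already
    // folded into kq_scale by llm_build_deepseek2()
    return 1.0f / yarn_mscale;
}
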
@ -2275,6 +2279,11 @@ llama_context * llama_init_from_model(
params.flash_attn = false; params.flash_attn = false;
} }
if (params.flash_attn && model->arch == LLM_ARCH_DEEPSEEK2) {
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Deepseek2 - forcing off\n", __func__);
params.flash_attn = false;
}
if (ggml_is_quantized(params.type_v) && !params.flash_attn) { if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__); LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
return nullptr; return nullptr;

View file

@ -1188,6 +1188,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
ggml_tensor * v, ggml_tensor * v,
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * kq_mask, ggml_tensor * kq_mask,
ggml_tensor * v_mla,
bool v_trans, bool v_trans,
float kq_scale) const { float kq_scale) const {
//const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
@ -1199,7 +1200,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
//const auto & n_embd_head_k = hparams.n_embd_head_k; //const auto & n_embd_head_k = hparams.n_embd_head_k;
//const auto & n_embd_head_v = hparams.n_embd_head_v; //const auto & n_embd_head_v = hparams.n_embd_head_v;
const auto n_embd_head_v = v_trans ? v->ne[1] : v->ne[0]; // note: for MLA with the absorption optimization, the final embedding size will be changed via v_mla
const auto n_embd_head_v = v_mla == nullptr ? v_trans ? v->ne[1] : v->ne[0] : v_mla->ne[1];
const auto n_tokens = q->ne[1]; const auto n_tokens = q->ne[1];
const auto n_head = q->ne[2]; const auto n_head = q->ne[2];
@ -1267,6 +1269,11 @@ ggml_tensor * llm_graph_context::build_attn_mha(
ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
// for MLA with the absorption optimization, we need to "decompress" from MQA back to MHA
if (v_mla) {
kqv = ggml_mul_mat(ctx0, v_mla, kqv);
}
ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens);
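
The single extra multiplication is enough because the per-head value projection can be pulled out of the softmax-weighted sum. Writing $W^V_i$ for head $i$'s slice of v_mla and $c_t$ for a cached latent row (notation introduced for this note only):

$$ \mathrm{out}_i \;=\; \sum_t \alpha_{i,t}\,\bigl(W^V_i c_t\bigr) \;=\; W^V_i \sum_t \alpha_{i,t}\, c_t $$

so the weighted sum stays in the kv_lora_rank-wide latent space and the decompression is applied once to its result, which is exactly the ggml_mul_mat with v_mla above.
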
@ -1304,6 +1311,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * k_cur, ggml_tensor * k_cur,
ggml_tensor * v_cur, ggml_tensor * v_cur,
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * v_mla,
float kq_scale, float kq_scale,
int il) const { int il) const {
GGML_UNUSED(n_tokens); GGML_UNUSED(n_tokens);
@ -1325,7 +1333,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
//cb(k, "v", il); //cb(k, "v", il);
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, false, kq_scale); ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
@ -1379,6 +1387,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * k_cur, ggml_tensor * k_cur,
ggml_tensor * v_cur, ggml_tensor * v_cur,
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * v_mla,
float kq_scale, float kq_scale,
int il) const { int il) const {
// these nodes are added to the graph together so that they are not reordered // these nodes are added to the graph together so that they are not reordered
@ -1464,7 +1473,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v, ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v,
0); 0);
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_trans, kq_scale); ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, v_trans, kq_scale);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
if (wo) { if (wo) {
@ -1504,6 +1513,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * k_cur, ggml_tensor * k_cur,
ggml_tensor * v_cur, ggml_tensor * v_cur,
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * v_mla,
float kq_scale, float kq_scale,
int il) const { int il) const {
// these nodes are added to the graph together so that they are not reordered // these nodes are added to the graph together so that they are not reordered
@ -1523,7 +1533,7 @@ ggml_tensor * llm_graph_context::build_attn(
ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3);
//cb(k, "v", il); //cb(k, "v", il);
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, false, kq_scale); ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
@ -1692,4 +1702,3 @@ void llm_graph_context::build_pooling(
ggml_build_forward_expand(gf, cur); ggml_build_forward_expand(gf, cur);
} }

View file

@ -505,11 +505,12 @@ struct llm_graph_context {
ggml_tensor * build_attn_mha( ggml_tensor * build_attn_mha(
ggml_cgraph * gf, ggml_cgraph * gf,
ggml_tensor * q, // [n_embd_head_q, n_tokens, n_head_q] ggml_tensor * q, // [n_embd_head_q, n_tokens, n_head_q]
ggml_tensor * k, // [n_embd_head_k, n_tokens, n_head_k] ggml_tensor * k, // [n_embd_head_k, n_tokens, n_head_k]
ggml_tensor * v, // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false) ggml_tensor * v, // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * kq_mask, ggml_tensor * kq_mask,
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
bool v_trans, bool v_trans,
float kq_scale) const; float kq_scale) const;
@ -524,6 +525,7 @@ struct llm_graph_context {
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale, float kq_scale,
int il) const; int il) const;
@ -538,6 +540,7 @@ struct llm_graph_context {
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale, float kq_scale,
int il) const; int il) const;
@ -552,6 +555,7 @@ struct llm_graph_context {
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
ggml_tensor * kq_b, ggml_tensor * kq_b,
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
float kq_scale, float kq_scale,
int il) const; int il) const;

View file

@ -43,6 +43,10 @@ struct llama_hparams {
uint32_t n_expert_used = 0; uint32_t n_expert_used = 0;
uint32_t n_rel_attn_bkts = 0; uint32_t n_rel_attn_bkts = 0;
// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
uint32_t n_embd_head_k_mla = 0;
uint32_t n_embd_head_v_mla = 0;
// for WavTokenizer // for WavTokenizer
struct llama_hparams_posnet posnet; struct llama_hparams_posnet posnet;
struct llama_hparams_convnext convnext; struct llama_hparams_convnext convnext;

View file

@ -27,7 +27,7 @@ bool llama_kv_cache_unified::init(
recurrent = llama_model_is_recurrent(&model); recurrent = llama_model_is_recurrent(&model);
v_trans = !recurrent && !cparams.flash_attn; v_trans = !recurrent && !cparams.flash_attn;
can_shift = !recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA can_shift = !recurrent;
LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n", LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, can_shift = %d\n",
__func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift); __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift);

View file

@ -1161,6 +1161,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
} }
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale); ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
@ -3300,8 +3302,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
{ {
const bool is_lite = (hparams.n_layer == 27); const bool is_lite = (hparams.n_layer == 27);
const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
// note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
const int64_t n_embd_head_qk_rope = hparams.n_rot; const int64_t n_embd_head_qk_rope = hparams.n_rot;
const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
const int64_t q_lora_rank = hparams.n_lora_q; const int64_t q_lora_rank = hparams.n_lora_q;
const int64_t kv_lora_rank = hparams.n_lora_kv; const int64_t kv_lora_rank = hparams.n_lora_kv;
@ -3327,14 +3335,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
if (!is_lite) { if (!is_lite) {
layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0); layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0); layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
} else { } else {
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0); layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
} }
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0); layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0); // note: only old legacy GGUF files will have the unsplit wkv_b tensor in them
if (is_mla) {
layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
} else {
layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
}
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
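
A worked example of the shapes created above, using assumed DeepSeek-V2-Lite hyperparameters (n_head = 16, kv_lora_rank = 512, qk_nope = 128, qk_rope = 64, v_head_dim = 128; these numbers are an assumption for illustration, not something read from this diff):

// attn_k_b         : {qk_nope, kv_lora_rank, n_head}                = {128, 512, 16}
// attn_v_b         : {kv_lora_rank, v_head_dim, n_head}             = {512, 128, 16}
// legacy attn_kv_b : {kv_lora_rank, n_head * (qk_nope + v_head_dim)} = {512, 4096}
constexpr long n_head = 16, kv_lora_rank = 512, qk_nope = 128, v_head_dim = 128;
static_assert(n_head * (qk_nope + v_head_dim) == 4096, "legacy wkv_b second dim");
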
@ -4386,6 +4402,8 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q); LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv); LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
@ -4599,7 +4617,7 @@ struct llm_build_llama : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, kq_scale, il); Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_out", il); cb(cur, "attn_out", il);
} }
@ -4812,7 +4830,7 @@ struct llm_build_deci : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, kq_scale, il); Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -4954,7 +4972,7 @@ struct llm_build_baichuan : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -5069,7 +5087,7 @@ struct llm_build_xverse : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -5194,7 +5212,7 @@ struct llm_build_falcon : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -5324,7 +5342,7 @@ struct llm_build_grok : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f, il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -5475,7 +5493,7 @@ struct llm_build_dbrx : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -5589,7 +5607,7 @@ struct llm_build_starcoder : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -5688,7 +5706,7 @@ struct llm_build_refact : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -5842,7 +5860,7 @@ struct llm_build_bert : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) { if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
@ -5959,7 +5977,7 @@ struct llm_build_bloom : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -6100,7 +6118,7 @@ struct llm_build_mpt : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -6246,7 +6264,7 @@ struct llm_build_stablelm : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -6369,7 +6387,7 @@ struct llm_build_qwen : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -6489,7 +6507,7 @@ struct llm_build_qwen2 : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -6610,7 +6628,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -6737,7 +6755,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -6890,7 +6908,7 @@ struct llm_build_qwen3 : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -7011,7 +7029,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -7151,7 +7169,7 @@ struct llm_build_phi2 : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f, il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -7280,7 +7298,7 @@ struct llm_build_phi3 : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f, il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -7415,7 +7433,7 @@ struct llm_build_plamo : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
ggml_tensor * sa_out = cur; ggml_tensor * sa_out = cur;
@ -7522,7 +7540,7 @@ struct llm_build_gpt2 : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -7638,7 +7656,7 @@ struct llm_build_codeshell : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -7767,7 +7785,7 @@ struct llm_build_orion : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -7894,7 +7912,7 @@ struct llm_build_internlm2 : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -8091,7 +8109,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
q_states, k_states, v_states, nullptr, kq_scale, il); q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -8221,7 +8239,7 @@ struct llm_build_gemma : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f, il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -8343,7 +8361,7 @@ struct llm_build_gemma2 : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f, il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
} }
cur = build_norm(cur, cur = build_norm(cur,
@ -8484,7 +8502,7 @@ struct llm_build_gemma3 : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, hparams.f_attention_scale, il); Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
} }
cur = build_norm(cur, cur = build_norm(cur,
@ -8624,7 +8642,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -8959,7 +8977,7 @@ struct llm_build_command_r : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -9094,7 +9112,7 @@ struct llm_build_cohere2 : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -9225,7 +9243,7 @@ struct llm_build_olmo : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, nullptr, model.layers[il].wo, nullptr,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -9345,7 +9363,7 @@ struct llm_build_olmo2 : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
cur = build_norm(cur, cur = build_norm(cur,
@ -9478,7 +9496,7 @@ struct llm_build_olmoe : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -9611,7 +9629,7 @@ struct llm_build_openelm : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -9725,7 +9743,7 @@ struct llm_build_gptneox : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -9875,7 +9893,7 @@ struct llm_build_arctic : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -10030,7 +10048,7 @@ struct llm_build_deepseek : public llm_graph_context {
cur = build_attn(inp_attn, gf, cur = build_attn(inp_attn, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, kq_scale, il); Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -10120,16 +10138,23 @@ struct llm_build_deepseek2 : public llm_graph_context {
llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
bool is_lite = (hparams.n_layer == 27); bool is_lite = (hparams.n_layer == 27);
const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
// note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
const int64_t n_embd_head_qk_rope = hparams.n_rot;
const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
const uint32_t kv_lora_rank = hparams.n_lora_kv;
// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly. // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale)); const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k)); const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)); const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
const uint32_t kv_lora_rank = hparams.n_lora_kv;
ggml_tensor * cur; ggml_tensor * cur;
ggml_tensor * inpL; ggml_tensor * inpL;
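
A short sketch of the pre-scaling above, following the visible code (see the linked discussion for the full derivation): with $m = \text{attn\_factor}\,(1 + \text{rope\_yarn\_log\_mul}\cdot\ln(1/\text{freq\_scale}))$ the attention scores are scaled by

$$ \mathrm{kq\_scale} = \frac{m^2}{\sqrt{n\_embd\_head\_k}} $$

while attn_factor_scaled = 1 / (1 + 0.1*ln(1/freq_scale)) is what gets passed to ggml_rope_ext below, so the magnitude correction that rope_yarn would otherwise apply to Q and K individually cancels, and the YaRN correction enters the scores exactly once, squared, through kq_scale.
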
@ -10154,16 +10179,14 @@ struct llm_build_deepseek2 : public llm_graph_context {
{ {
ggml_tensor * q = NULL; ggml_tensor * q = NULL;
if (!is_lite) { if (!is_lite) {
// {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
cb(q, "q", il); cb(q, "q", il);
q = build_norm(q, q = build_norm(q,
model.layers[il].attn_q_a_norm, NULL, model.layers[il].attn_q_a_norm, nullptr,
LLM_NORM_RMS, il); LLM_NORM_RMS, il);
cb(q, "q", il); cb(q, "q", il);
// {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
cb(q, "q", il); cb(q, "q", il);
} else { } else {
@ -10171,96 +10194,125 @@ struct llm_build_deepseek2 : public llm_graph_context {
cb(q, "q", il); cb(q, "q", il);
} }
// split into {n_head * n_embd_head_qk_nope, n_tokens} // split into {n_embd_head_qk_nope, n_head, n_tokens}
ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, ggml_tensor * q_nope = ggml_view_3d(ctx0, q,
ggml_row_size(q->type, hparams.n_embd_head_k), n_embd_head_qk_nope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k * n_head), ggml_row_size(q->type, n_embd_head_k),
ggml_row_size(q->type, n_embd_head_k) * n_head,
0); 0);
cb(q_nope, "q_nope", il); cb(q_nope, "q_nope", il);
// and {n_head * n_embd_head_qk_rope, n_tokens} // and {n_embd_head_qk_rope, n_head, n_tokens}
ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, ggml_tensor * q_pe = ggml_view_3d(ctx0, q,
ggml_row_size(q->type, hparams.n_embd_head_k), n_embd_head_qk_rope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k * n_head), ggml_row_size(q->type, n_embd_head_k),
ggml_row_size(q->type, n_embd_head_k) * n_head,
ggml_row_size(q->type, n_embd_head_qk_nope)); ggml_row_size(q->type, n_embd_head_qk_nope));
cb(q_pe, "q_pe", il); cb(q_pe, "q_pe", il);
// {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); cb(kv_cmpr_pe, "kv_cmpr_pe", il);
cb(kv_pe_compresseed, "kv_pe_compresseed", il);
// split into {kv_lora_rank, n_tokens} // split into {kv_lora_rank, n_tokens}
ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe,
kv_pe_compresseed->nb[1], kv_lora_rank, n_tokens,
ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
0); 0);
cb(kv_compressed, "kv_compressed", il); cb(kv_cmpr, "kv_cmpr", il);
// and {n_embd_head_qk_rope, n_tokens} // and {n_embd_head_qk_rope, 1, n_tokens}
ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe,
kv_pe_compresseed->nb[1], n_embd_head_qk_rope, 1, n_tokens,
kv_pe_compresseed->nb[1], ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
cb(k_pe, "k_pe", il); cb(k_pe, "k_pe", il);
// TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr,
kv_compressed = ggml_cont(ctx0, kv_compressed);
kv_compressed = build_norm(kv_compressed,
model.layers[il].attn_kv_a_norm, NULL,
LLM_NORM_RMS, il);
cb(kv_compressed, "kv_compressed", il);
// {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
cb(kv, "kv", il);
// split into {n_head * n_embd_head_qk_nope, n_tokens}
ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
0);
cb(k_nope, "k_nope", il);
// and {n_head * n_embd_head_v, n_tokens}
ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
cb(v_states, "v_states", il);
v_states = ggml_cont(ctx0, v_states);
cb(v_states, "v_states", il);
v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
0);
cb(v_states, "v_states", il);
q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
q_pe = ggml_rope_ext(
ctx0, q_pe, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor_scaled, beta_fast, beta_slow ext_factor, attn_factor_scaled, beta_fast, beta_slow
); );
cb(q_pe, "q_pe", il); cb(q_pe, "q_pe", il);
// shared RoPE key k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr,
k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
k_pe = ggml_rope_ext(
ctx0, k_pe, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor_scaled, beta_fast, beta_slow ext_factor, attn_factor_scaled, beta_fast, beta_slow
); );
cb(k_pe, "k_pe", il); cb(k_pe, "k_pe", il);
ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); kv_cmpr = build_norm(kv_cmpr,
cb(q_states, "q_states", il); model.layers[il].attn_kv_a_norm, nullptr,
LLM_NORM_RMS, il);
cb(kv_cmpr, "kv_cmpr", il);
ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); if (is_mla) {
cb(k_states, "k_states", il); // {n_embd_head_qk_nope, n_tokens, n_head}
q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
cb(q_nope, "q_nope_perm", il);
cur = build_attn(inp_attn, gf, // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
model.layers[il].wo, NULL, ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
q_states, k_states, v_states, nullptr, kq_scale, il); cb(q_nope_absorbed, "q_nope_absorbed", il);
// {kv_lora_rank, n_head, n_tokens}
q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
// {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
// note: rope must go first for in-place context shifting in build_rope_shift()
ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
cb(Qcur, "Qcur", il);
kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
cb(kv_cmpr, "kv_cmpr_reshape", il);
// {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
cb(Kcur, "Kcur", il);
// {kv_lora_rank, 1, n_tokens}
ggml_tensor * Vcur = kv_cmpr;
cb(Vcur, "Vcur", il);
// note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
} else {
ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
cb(kv, "kv", il);
// split into {n_embd_head_qk_nope, n_head, n_tokens}
ggml_tensor * k_nope = ggml_view_3d(ctx0, kv,
n_embd_head_qk_nope, n_head, n_tokens,
ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
0);
cb(k_nope, "k_nope_view", il);
// and {n_embd_head_v, n_head, n_tokens}
ggml_tensor * Vcur = ggml_view_3d(ctx0, kv,
n_embd_head_v, n_head, n_tokens,
ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
ggml_row_size(kv->type, n_embd_head_qk_nope));
cb(Vcur, "Vcur_view", il);
Vcur = ggml_cont(ctx0, Vcur);
cb(Vcur, "Vcur_cont", il);
// note: rope must go first for in-place context shifting in build_rope_shift()
ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
cb(Kcur, "Kcur", il);
// note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
cur = build_attn(inp_attn, gf,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
}
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
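An aside on why the is_mla branch can score every query head against the same compressed cache: folding wk_b into the query relies on the identity dot(q_nope, W_kb * c) == dot(W_kb^T * q_nope, c), so the per-head keys never need to be materialized and K/V collapse to the single shared latent (plus the RoPE part for K). The following is a minimal standalone C++ sketch of that identity with made-up dimensions; it is not ggml code and does not mirror the exact tensor layouts used above.

// Standalone sketch (not ggml): checks that absorbing the per-head
// decompression matrix W_kb into the query gives the same attention
// score as decompressing the cached latent first.
//   dot(q_nope, W_kb * c) == dot(W_kb^T * q_nope, c)
// Dimensions below are made up for illustration.
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

int main() {
    const int d_nope = 8; // per-head "nope" query/key dim (illustrative)
    const int r      = 4; // kv_lora_rank: width of the compressed latent

    std::mt19937 rng(42);
    std::uniform_real_distribution<float> dist(-1.0f, 1.0f);

    std::vector<float> W(d_nope * r); // W_kb: d_nope x r, row-major
    std::vector<float> q(d_nope);     // q_nope for one head/token
    std::vector<float> c(r);          // compressed KV latent for one cached token
    for (auto & x : W) x = dist(rng);
    for (auto & x : q) x = dist(rng);
    for (auto & x : c) x = dist(rng);

    // Path 1 (no absorption): decompress the key, then dot with q.
    float score_decompress = 0.0f;
    for (int i = 0; i < d_nope; ++i) {
        float k_i = 0.0f;
        for (int j = 0; j < r; ++j) {
            k_i += W[i*r + j] * c[j]; // k_nope = W_kb * c
        }
        score_decompress += q[i] * k_i;
    }

    // Path 2 (absorption): fold W_kb into the query, then dot with the latent.
    float score_absorbed = 0.0f;
    for (int j = 0; j < r; ++j) {
        float q_abs_j = 0.0f;
        for (int i = 0; i < d_nope; ++i) {
            q_abs_j += W[i*r + j] * q[i]; // q_abs = W_kb^T * q_nope
        }
        score_absorbed += q_abs_j * c[j];
    }

    std::printf("decompressed: %.6f  absorbed: %.6f  diff: %.2e\n",
                score_decompress, score_absorbed,
                std::fabs(score_decompress - score_absorbed));
    return 0;
}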
@@ -10426,7 +10478,7 @@ struct llm_build_bitnet : public llm_graph_context {
             cur = build_attn(inp_attn, gf,
                     NULL, NULL,
-                    Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             cur = build_norm(cur,
                     model.layers[il].attn_sub_norm, NULL,
@@ -10549,7 +10601,7 @@ struct llm_build_t5_enc : public llm_graph_context {
             cur = build_attn(inp_attn, gf,
                     model.layers[il].wo_enc, nullptr,
-                    Qcur, Kcur, Vcur, kq_b, 1.0f, il);
+                    Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
             cb(cur, "kqv_out", il);
         }
@@ -10655,7 +10707,7 @@ struct llm_build_t5_dec : public llm_graph_context {
             cur = build_attn(inp_attn_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, Kcur, Vcur, kq_b, 1.0f, il);
+                    Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
             cb(cur, "kqv_out", il);
         }
@@ -10687,7 +10739,7 @@ struct llm_build_t5_dec : public llm_graph_context {
             cur = build_attn(inp_attn_cross, gf,
                     model.layers[il].wo_cross, nullptr,
-                    Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
             cb(cur, "kqv_out", il);
             //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
@@ -10820,7 +10872,7 @@ struct llm_build_jais : public llm_graph_context {
             cur = build_attn(inp_attn, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, Kcur, Vcur, nullptr, 1.0f/float(n_embd_head), il);
+                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
         }
         if (il == n_layer - 1) {
@@ -10952,7 +11004,7 @@ struct llm_build_chatglm : public llm_graph_context {
            cur = build_attn(inp_attn, gf,
                    model.layers[il].wo, NULL,
-                   Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                   Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
         }
         if (il == n_layer - 1) {
@@ -11085,7 +11137,7 @@ struct llm_build_glm4 : public llm_graph_context {
            cur = build_attn(inp_attn, gf,
                    model.layers[il].wo, NULL,
-                   Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                   Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
         }
         if (il == n_layer - 1) {
@@ -11229,7 +11281,7 @@ struct llm_build_nemotron : public llm_graph_context {
             cur = build_attn(inp_attn, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
         }
         if (il == n_layer - 1) {
@@ -11360,7 +11412,7 @@ struct llm_build_exaone : public llm_graph_context {
             cur = build_attn(inp_attn, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
         }
         if (il == n_layer - 1) {
@@ -12262,7 +12314,7 @@ struct llm_build_chameleon : public llm_graph_context {
             cur = build_attn(inp_attn, gf,
                     model.layers[il].wo, nullptr,
-                    Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             if (hparams.swin_norm) {
                 cur = build_norm(cur,
@@ -12618,7 +12670,7 @@ struct llm_build_plm : public llm_graph_context {
             cur = build_attn(inp_attn, gf,
                     model.layers[il].wo, NULL,
-                    q_states, k_states, v_states, nullptr, kq_scale, il);
+                    q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
         }
         if (il == n_layer - 1) {
@@ -12741,7 +12793,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
             cur = build_attn(inp_attn, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_rot)), il);
+                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
         }
         if (il == n_layer - 1) {
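All of the build_attn call sites in the hunks above gain one extra argument in the same slot: the optional per-head value tensor that the deepseek2 MLA branch fills with wv_b and every other architecture passes as nullptr. When it is present, the attention output comes out in the compressed kv_lora_rank space and is expanded back to the per-head value width before the output projection. Below is a minimal standalone sketch of that expansion for a single head; the sizes are illustrative and this is not the ggml implementation.

// Standalone sketch (not ggml): what the optional value tensor does after
// attention when it is non-null. The attention output for one head lives in
// the compressed space (r = kv_lora_rank) and is expanded back to v_head_dim.
#include <cstdio>
#include <vector>

int main() {
    const int r          = 4; // compressed width (kv_lora_rank), illustrative
    const int v_head_dim = 6; // decompressed per-head value width, illustrative

    std::vector<float> W_vb(v_head_dim * r); // per-head decompression matrix, row-major
    std::vector<float> o_cmpr(r);            // attention output in compressed space
    for (size_t i = 0; i < W_vb.size(); ++i)   W_vb[i]   = 0.01f * (float) i;
    for (size_t j = 0; j < o_cmpr.size(); ++j) o_cmpr[j] = 1.0f + (float) j;

    std::vector<float> o(v_head_dim, 0.0f);  // decompressed output, then fed to wo
    for (int i = 0; i < v_head_dim; ++i) {
        for (int j = 0; j < r; ++j) {
            o[i] += W_vb[i*r + j] * o_cmpr[j];
        }
    }

    for (int i = 0; i < v_head_dim; ++i) {
        std::printf("o[%d] = %.3f\n", i, o[i]);
    }
    return 0;
}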

View file

@@ -171,6 +171,8 @@ struct llama_layer {
     struct ggml_tensor * wq_b = nullptr;
     struct ggml_tensor * wkv_a_mqa = nullptr;
     struct ggml_tensor * wkv_b = nullptr;
+    struct ggml_tensor * wk_b = nullptr;
+    struct ggml_tensor * wv_b = nullptr;
     struct ggml_tensor * wq_cross = nullptr;
     struct ggml_tensor * wk_cross = nullptr;
     struct ggml_tensor * wv_cross = nullptr;
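The two new per-layer tensors hold the split halves of wkv_b that the absorbed attention path multiplies against directly (wk_b on the query side, wv_b on the output side). The practical payoff is the KV-cache footprint: only the shared latent plus the RoPE part of K is stored per token, instead of full per-head keys and values. A back-of-the-envelope sketch with illustrative DeepSeek-V2-style sizes (not read from any checkpoint) follows.

// Standalone arithmetic sketch: per-token KV-cache width with and without
// the MLA absorption path. Hyperparameters are illustrative, DeepSeek-V2-style
// values chosen for the example only.
#include <cstdio>

int main() {
    const int n_head              = 128;
    const int n_embd_head_qk_nope = 128;
    const int n_embd_head_qk_rope = 64;
    const int n_embd_head_v       = 128;
    const int kv_lora_rank        = 512;

    // non-MLA branch: decompressed K and V are cached per head
    const int k_mha = n_head * (n_embd_head_qk_nope + n_embd_head_qk_rope);
    const int v_mha = n_head * n_embd_head_v;

    // is_mla branch: a single shared K (latent + rope part) and V (latent)
    const int k_mla = kv_lora_rank + n_embd_head_qk_rope;
    const int v_mla = kv_lora_rank;

    std::printf("per-token cache (values): MHA %d vs MLA %d (%.1fx smaller)\n",
                k_mha + v_mha, k_mla + v_mla,
                (double)(k_mha + v_mha) / (k_mla + v_mla));
    return 0;
}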

View file

@@ -10,6 +10,7 @@
 #include <cinttypes>
 #include <fstream>
 #include <mutex>
+#include <regex>
 #include <thread>
 #include <unordered_map>
@@ -47,8 +48,14 @@ struct quantize_state_impl {
     {}
 };
+// changes to this struct must be replicated in quantize.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
 static void llama_tensor_dequantize_impl(
-    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+    ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
 ) {
     if (output.size() < nelements) {
@@ -539,7 +546,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     model.load_hparams(ml);
     model.load_stats (ml);
-    struct quantize_state_impl qs(model, params);
+    quantize_state_impl qs(model, params);
     if (params->only_copy) {
         ftype = ml.ftype;
@@ -664,7 +671,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     // populate the original tensors so we get an initial meta data
     for (const auto * it : tensors) {
         uint16_t i_split = params->keep_split ? it->idx : 0;
-        struct ggml_tensor * tensor = it->tensor;
+        ggml_tensor * tensor = it->tensor;
         if (!ctx_outs[i_split]) {
             ctx_outs[i_split].reset(gguf_init_empty());
         }
@@ -713,7 +720,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     new_ofstream(0);
     for (const auto * it : tensors) {
         const auto & weight = *it;
-        struct ggml_tensor * tensor = weight.tensor;
+        ggml_tensor * tensor = weight.tensor;
         if (weight.idx != cur_split && params->keep_split) {
             close_ofstream();
             new_ofstream(weight.idx);
@@ -779,7 +786,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
-        enum ggml_type new_type;
+        ggml_type new_type;
         void * new_data;
         size_t new_size;
@@ -789,6 +796,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             // get more optimal quantization type based on the tensor shape, layer, etc.
             if (!params->pure && ggml_is_quantized(default_type)) {
                 new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
+                // unless the user specifies a type
+                if (params->tensor_types) {
+                    const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
+                    for (const auto & [tname, qtype] : tensor_types) {
+                        if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) {
+                            if (qtype != new_type) {
+                                LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype));
+                            }
+                            new_type = qtype;
+                            break;
+                        }
+                    }
+                }
             }
             if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                 new_type = params->token_embedding_type;
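The new block above checks each tensor name against the user-supplied patterns before committing to the heuristic choice: each pattern is treated as a regex, and the first one that matches wins (note the break). A small standalone sketch of that lookup follows; the tensor names are hypothetical, a string stands in for ggml_type, and none of the real quantize CLI plumbing is shown.

// Standalone sketch of the per-tensor override lookup (illustrative only).
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

struct tensor_quantization_sketch {
    std::string pattern; // regex matched against the tensor name
    std::string quant;   // stand-in for ggml_type
};

static std::string pick_type(const std::string & tensor_name,
                             const std::vector<tensor_quantization_sketch> & overrides,
                             const std::string & fallback) {
    for (const auto & [pattern, quant] : overrides) {
        std::regex re(pattern);
        if (std::regex_search(tensor_name, re)) {
            return quant; // first matching pattern wins, mirroring the break above
        }
    }
    return fallback;      // otherwise keep the heuristic choice
}

int main() {
    const std::vector<tensor_quantization_sketch> overrides = {
        { "ffn_down", "q6_k" },
        { "attn_.*",  "q8_0" },
    };
    std::printf("%s\n", pick_type("blk.0.ffn_down.weight", overrides, "q4_k").c_str()); // q6_k
    std::printf("%s\n", pick_type("blk.0.attn_q.weight",   overrides, "q4_k").c_str()); // q8_0
    std::printf("%s\n", pick_type("output.weight",         overrides, "q4_k").c_str()); // q4_k
    return 0;
}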
@@ -913,8 +933,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 // interface implementation
 //
-struct llama_model_quantize_params llama_model_quantize_default_params() {
-    struct llama_model_quantize_params result = {
+llama_model_quantize_params llama_model_quantize_default_params() {
+    llama_model_quantize_params result = {
         /*.nthread              =*/ 0,
         /*.ftype                =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
         /*.output_tensor_type   =*/ GGML_TYPE_COUNT,
@@ -926,6 +946,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.keep_split           =*/ false,
         /*.imatrix              =*/ nullptr,
         /*.kv_overrides         =*/ nullptr,
+        /*.tensor_type          =*/ nullptr,
     };
     return result;