From 13b339bcd91de64d59512f308f6f69eaca688103 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 5 Nov 2025 14:32:55 +0200 Subject: [PATCH 01/32] server : do not default to multiple slots with speculative decoding (#17017) * server : do not default to multiple slots with speculative decoding * cont : fix --- common/common.h | 4 ++++ tools/server/server.cpp | 14 ++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-)
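With this change, attaching a draft model keeps the server at a single slot instead of silently switching to the n_parallel = 4 / kv_unified = true default. For illustration only (the model paths are placeholders; the flags shown are existing llama-server options):

```bash
# speculative decoding: stays at one slot, so the draft context stays affordable
llama-server -m target-model.gguf -md draft-model.gguf -c 8192

# explicitly opt back into multiple slots if that is what you actually want
llama-server -m target-model.gguf -md draft-model.gguf -np 4 -kvu
```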
diff --git a/common/common.h b/common/common.h index 78c568a7b..54b7849b1 100644 --- a/common/common.h +++ b/common/common.h @@ -507,6 +507,10 @@ struct common_params { // return false from callback to abort model loading or true to continue llama_progress_callback load_progress_callback = NULL; void * load_progress_callback_user_data = NULL; + + bool has_speculative() const { + return !speculative.model.path.empty() || !speculative.model.hf_repo.empty(); + } }; // call once at the start of a program if it uses libcommon diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 678aad93b..f5089bef2 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2400,7 +2400,7 @@ struct server_context { add_bos_token = llama_vocab_get_add_bos(vocab); - if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) { + if (params_base.has_speculative()) { SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str()); auto params_dft = params_base; @@ -2476,7 +2476,7 @@ struct server_context { SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled"); } - if (!params_base.speculative.model.path.empty()) { + if (params_base.has_speculative()) { SRV_ERR("%s\n", "err: speculative decode is not supported by multimodal"); return false; } @@ -2520,6 +2520,7 @@ struct server_context { if (model_dft) { slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1); + // TODO: rework speculative decoding [TAG_SERVER_SPEC_REWORK] slot.ctx_dft = llama_init_from_model(model_dft, cparams_dft); if (slot.ctx_dft == nullptr) { SRV_ERR("%s", "failed to create draft context\n"); @@ -2825,6 +2826,7 @@ struct server_context { } // initialize draft batch + // TODO: rework speculative decoding [TAG_SERVER_SPEC_REWORK] if (slot.ctx_dft) { llama_batch_free(slot.batch_spec); @@ -4291,6 +4293,8 @@ struct server_context { } // do speculative decoding + // TODO: rework to have a single draft llama_context shared across all slots [TAG_SERVER_SPEC_REWORK] + // perform the speculative drafting for all sequences at the same time in a single batch for (auto & slot : slots) { if (!slot.is_processing() || !slot.can_speculate()) { continue; } @@ -4445,8 +4449,10 @@ int main(int argc, char ** argv) { // TODO: should we have a separate n_parallel parameter for the server? // https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177 - if (params.n_parallel == 1 && params.kv_unified == false) { - LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true\n", __func__); + // TODO: this is a common configuration that is suitable for most local use cases + // however, overriding the parameters is a bit confusing - figure out something more intuitive + if (params.n_parallel == 1 && params.kv_unified == false && !params.has_speculative()) { + LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to disable this)\n", __func__); params.n_parallel = 4; params.kv_unified = true; From 92bb84f775215cf36e3224708a9a93e2531a0a96 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Wed, 5 Nov 2025 14:26:49 +0100 Subject: [PATCH 02/32] mtmd: allow QwenVL to process larger images by default (#17020) --- tools/mtmd/clip.cpp | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 67a8a470c..16781fb19 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2791,14 +2791,8 @@ struct clip_model_loader { get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json - // the actual max limit is 12845056/14/14/2/2/4 = 4096 tokens - // but we set a lower value to avoid OOM - // TODO: make it configurable by user - // TODO (2): bbox coordinates become inaccurate with small number of tokens, - // therefore we need to increase the min_tokens - // see: https://github.com/ggml-org/llama.cpp/issues/16842#issuecomment-3475144858 - hparams.set_limit_image_tokens(8, 2048); - hparams.set_warmup_n_tokens(256); // avoid OOM on warmup + hparams.set_limit_image_tokens(8, 4096); + hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size; if (hparams.image_min_pixels < warn_min_pixels) { LOG_WRN("%s: Qwen-VL models require at minimum 1024 image tokens to function correctly on grounding tasks\n", __func__); @@ -4814,7 +4808,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN3VL: { - const int merge_ratio = 2; + const int merge_ratio = hparams.n_merge; const int pw = image_size_width / patch_size; const int ph = image_size_height / patch_size; std::vector<int> positions(n_pos * 4); From 5886f4f545591dafbbd9d6117a46d7399ce13bfa Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Wed, 5 Nov 2025 10:58:16 -0700 Subject: [PATCH 03/32] examples(gguf): GGUF example outputs (#17025) * feat(llama-gguf): Print out the tensor type in llama-gguf Branch: Mamba2Perf Signed-off-by: Gabe Goodhart * feat(off-topic): print the number of elements in tensors with llama-gguf Branch: Mamba2SSD Signed-off-by: Gabe Goodhart * style: valign Branch: GGUFToolOutputs Signed-off-by: Gabe Goodhart * Update examples/gguf/gguf.cpp --------- Signed-off-by: Gabe Goodhart Co-authored-by: Georgi Gerganov --- examples/gguf/gguf.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
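One detail worth keeping in mind when reading the n_elts computation below: `ggml_type_size()` returns the byte size of one *block*, which equals one element only for non-quantized types such as the f32 tensors this example writes. A variant that is also exact for quantized tensors (a sketch, not part of the patch) scales by the block size:

```cpp
// elements = bytes / bytes-per-block * elements-per-block;
// ggml_blck_size() is 1 for f32/f16/bf16, so this reduces to size / type_size there
const size_t n_elements = size / ggml_type_size(type) * ggml_blck_size(type);
```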
diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index f31989c8c..499cfacc9 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -184,8 +184,13 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) { const char * name = gguf_get_tensor_name (ctx, i); const size_t size = gguf_get_tensor_size (ctx, i); const size_t offset = gguf_get_tensor_offset(ctx, i); + const auto type = gguf_get_tensor_type (ctx, i); - printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset); + const char * type_name = ggml_type_name(type); + const size_t type_size = ggml_type_size(type); + const size_t n_elements = size / type_size; + + printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu, type = %s, n_elts = %zu\n", __func__, i, name, size, offset, type_name, n_elements); } } From a44d77126c911d105f7f800c17da21b2a5b112d1 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Wed, 5 Nov 2025 12:51:03 -0600 Subject: [PATCH 04/32] vulkan: Fix GGML_VULKAN_CHECK_RESULTS to better handle fusion (#16919) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 644 +++++++++++++-------------- 1 file changed, 318 insertions(+), 326 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 7fc46bc46..ab94bc3d7 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -14104,20 +14104,11 @@ size_t comp_size; size_t comp_nb[GGML_MAX_DIMS]; size_t check_counter = 0; static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) { - ggml_tensor * tensor = cgraph->nodes[tensor_idx]; + ggml_tensor * tensor = cgraph->nodes[tensor_idx + ctx->num_additional_fused_ops]; if (tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_SET_ROWS) { return; } - bool fused_rms_norm_mul = false; - int rms_norm_idx = -1; - if (ctx->num_additional_fused_ops == 1 && - tensor->op == GGML_OP_RMS_NORM && - cgraph->nodes[tensor_idx + 1]->op == GGML_OP_MUL) { - fused_rms_norm_mul = true; - tensor = cgraph->nodes[tensor_idx + 1]; - } - check_counter++; if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) { return; @@ -14125,9 +14116,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * VK_LOG_DEBUG("ggml_vk_check_results_0(" << tensor->name << ")"); - ggml_tensor * src0 = tensor->src[0]; - ggml_tensor * src1 = tensor->src[1]; - struct ggml_init_params iparams = { /*.mem_size =*/ 2ul*1024ul*1024ul*1024ul, /*.mem_buffer =*/ NULL, @@ -14137,328 +14125,339 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * struct ggml_context * ggml_ctx = ggml_init(iparams); std::array<ggml_tensor *, GGML_MAX_SRC> src_clone = {nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr}; - std::array<size_t, GGML_MAX_SRC> src_size = {}; - std::array<void *, GGML_MAX_SRC> src_buffer = {}; const char * srci_name[GGML_MAX_SRC] = {"src0", "src1", "src2", "src3", "src4", "src5", "src6", "src7", "src8", "src9"}; + std::map<ggml_tensor *, ggml_tensor *> cloned_tensors; + std::vector<void *> cloned_mallocs; + struct ggml_tensor * tensor_clone = nullptr; - for (int i = 0; i < GGML_MAX_SRC; i++) { - ggml_tensor * srci = tensor->src[i]; - if (fused_rms_norm_mul) { - rms_norm_idx = tensor->src[0]->op == GGML_OP_RMS_NORM ? 
0 : 1; - ggml_tensor *rms_norm = tensor->src[rms_norm_idx]; - switch (i) { - case 0: srci = rms_norm->src[0]; break; - case 1: srci = tensor->src[1 - rms_norm_idx]; break; - default: continue; + for (int f = 0; f < ctx->num_additional_fused_ops + 1; ++f) { + tensor = cgraph->nodes[tensor_idx + f]; + for (int i = 0; i < GGML_MAX_SRC; i++) { + ggml_tensor * srci = tensor->src[i]; + if (srci == nullptr) { + continue; } - } - if (srci == nullptr) { - continue; - } - ggml_tensor * srci_clone = ggml_dup_tensor(ggml_ctx, srci); - size_t srci_size = ggml_nbytes(srci); + // If a src tensor has been cloned, use that one + auto it = cloned_tensors.find(srci); + if (it != cloned_tensors.end()) { + src_clone[i] = it->second; + continue; + } + ggml_tensor * srci_clone = ggml_dup_tensor(ggml_ctx, srci); + size_t srci_size = ggml_nbytes(srci); - src_clone[i] = srci_clone; - src_size[i] = ggml_nbytes(srci); - src_buffer[i] = malloc(srci_size); + src_clone[i] = srci_clone; + void *src_buffer = malloc(srci_size); + cloned_mallocs.push_back(src_buffer); - srci_clone->data = src_buffer[i]; - if (ggml_backend_buffer_is_host(srci->buffer)) { - memcpy(srci_clone->data, srci->data, srci_size); - memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS); - } else if (ggml_backend_buffer_is_vk(srci->buffer)) { - ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)srci->buffer->context; - vk_buffer& buffer_gpu = buf_ctx->dev_buffer; - uint64_t offset = vk_tensor_offset(srci) + srci->view_offs; - if (!ggml_is_contiguous(srci) && ggml_vk_dim01_contiguous(srci)) { - for (int i3 = 0; i3 < srci->ne[3]; i3++) { - for (int i2 = 0; i2 < srci->ne[2]; i2++) { - const int idx = i3*srci->ne[2] + i2; - ggml_vk_buffer_read(buffer_gpu, offset + idx * srci->nb[2], ((char *)srci_clone->data + idx * srci_clone->nb[2]), srci->ne[1] * srci->nb[1]); - } - } - - srci_clone->nb[0] = srci->nb[0]; - srci_clone->nb[1] = srci->nb[1]; - for (int i = 2; i < GGML_MAX_DIMS; i++) { - srci_clone->nb[i] = srci_clone->nb[i - 1]*srci_clone->ne[i - 1]; - } - } else { - if (offset + srci_size >= buffer_gpu->size) { - srci_size = buffer_gpu->size - offset; - } - ggml_vk_buffer_read(buffer_gpu, offset, srci_clone->data, srci_size); + srci_clone->data = src_buffer; + if (ggml_backend_buffer_is_host(srci->buffer)) { + memcpy(srci_clone->data, srci->data, srci_size); memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS); + } else if (ggml_backend_buffer_is_vk(srci->buffer)) { + ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)srci->buffer->context; + vk_buffer& buffer_gpu = buf_ctx->dev_buffer; + uint64_t offset = vk_tensor_offset(srci) + srci->view_offs; + if (!ggml_is_contiguous(srci) && ggml_vk_dim01_contiguous(srci)) { + for (int i3 = 0; i3 < srci->ne[3]; i3++) { + for (int i2 = 0; i2 < srci->ne[2]; i2++) { + const int idx = i3*srci->ne[2] + i2; + ggml_vk_buffer_read(buffer_gpu, offset + idx * srci->nb[2], ((char *)srci_clone->data + idx * srci_clone->nb[2]), srci->ne[1] * srci->nb[1]); + } + } + + srci_clone->nb[0] = srci->nb[0]; + srci_clone->nb[1] = srci->nb[1]; + for (int i = 2; i < GGML_MAX_DIMS; i++) { + srci_clone->nb[i] = srci_clone->nb[i - 1]*srci_clone->ne[i - 1]; + } + } else { + if (offset + srci_size >= buffer_gpu->size) { + srci_size = buffer_gpu->size - offset; + } + ggml_vk_buffer_read(buffer_gpu, offset, srci_clone->data, srci_size); + memcpy(srci_clone->nb, srci->nb, sizeof(size_t) * GGML_MAX_DIMS); + } + } else { + GGML_ABORT("fatal error"); + } + + if 
(vk_output_tensor > 0 && vk_output_tensor == check_counter) { + ggml_vk_print_tensor(srci, srci_name[i]); } - } else { - GGML_ABORT("fatal error"); } - if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { - ggml_vk_print_tensor(srci, srci_name[i]); - } - } - - if (tensor->op == GGML_OP_FLASH_ATTN_EXT) { - const float * params = (const float *)tensor->op_params; - tensor_clone = ggml_flash_attn_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], params[0], params[1], params[2]); - if (src_clone[4]) { - ggml_flash_attn_ext_add_sinks(tensor_clone, src_clone[4]); - } - } else if (tensor->op == GGML_OP_MUL_MAT) { - tensor_clone = ggml_mul_mat(ggml_ctx, src_clone[0], src_clone[1]); - } else if (tensor->op == GGML_OP_MUL_MAT_ID) { - tensor_clone = ggml_mul_mat_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]); - } else if (tensor->op == GGML_OP_SUB) { - tensor_clone = ggml_sub(ggml_ctx, src_clone[0], src_clone[1]); - } else if (tensor->op == GGML_OP_MUL) { - if (fused_rms_norm_mul) { - tensor_clone = ggml_rms_norm(ggml_ctx, src_clone[0], *(float *)tensor->src[rms_norm_idx]->op_params); - tensor_clone = ggml_mul(ggml_ctx, tensor_clone, src_clone[1 - rms_norm_idx]); - } else { - tensor_clone = ggml_mul(ggml_ctx, src_clone[0], src_clone[1]); - } - } else if (tensor->op == GGML_OP_DIV) { - tensor_clone = ggml_div(ggml_ctx, src_clone[0], src_clone[1]); - } else if (tensor->op == GGML_OP_CONCAT) { - tensor_clone = ggml_concat(ggml_ctx, src_clone[0], src_clone[1], *(int *)tensor->op_params); - } else if (tensor->op == GGML_OP_UPSCALE) { - tensor_clone = ggml_interpolate(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], (ggml_scale_mode) tensor->op_params[0]); - } else if (tensor->op == GGML_OP_SCALE) { - const float * params = (const float *)tensor->op_params; - tensor_clone = ggml_scale_bias(ggml_ctx, src_clone[0], params[0], params[1]); - } else if (tensor->op == GGML_OP_SQR) { - tensor_clone = ggml_sqr(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_SQRT) { - tensor_clone = ggml_sqrt(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_SIN) { - tensor_clone = ggml_sin(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_COS) { - tensor_clone = ggml_cos(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_CLAMP) { - const float * params = (const float *)tensor->op_params; - tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]); - } else if (tensor->op == GGML_OP_PAD) { - tensor_clone = ggml_pad_ext(ggml_ctx, src_clone[0], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3], - tensor->op_params[4], tensor->op_params[5], tensor->op_params[6], tensor->op_params[7]); - } else if (tensor->op == GGML_OP_REPEAT) { - tensor_clone = ggml_repeat(ggml_ctx, src_clone[0], tensor); - } else if (tensor->op == GGML_OP_REPEAT_BACK) { - tensor_clone = ggml_repeat_back(ggml_ctx, src_clone[0], tensor); - } else if (tensor->op == GGML_OP_ADD) { - tensor_clone = ggml_add(ggml_ctx, src_clone[0], src_clone[1]); - } else if (tensor->op == GGML_OP_ACC) { - tensor_clone = ggml_acc(ggml_ctx, src_clone[0], src_clone[1], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]); - } else if (tensor->op == GGML_OP_NORM) { - tensor_clone = ggml_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params); - } else if (tensor->op == GGML_OP_GROUP_NORM) { - const float * float_params = (const float *)tensor->op_params; - tensor_clone = 
ggml_group_norm(ggml_ctx, src_clone[0], tensor->op_params[0], float_params[1]); - } else if (tensor->op == GGML_OP_RMS_NORM) { - tensor_clone = ggml_rms_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params); - } else if (tensor->op == GGML_OP_RMS_NORM_BACK) { - const float eps = ((float *) tensor->op_params)[0]; - tensor_clone = ggml_rms_norm_back(ggml_ctx, src_clone[0], src_clone[1], eps); - } else if (tensor->op == GGML_OP_SILU_BACK) { - tensor_clone = ggml_silu_back(ggml_ctx, src_clone[0], src_clone[1]); - } else if (tensor->op == GGML_OP_L2_NORM) { - const float eps = ((float *) tensor->op_params)[0]; - tensor_clone = ggml_l2_norm(ggml_ctx, src_clone[0], eps); - } else if (tensor->op == GGML_OP_SOFT_MAX) { - if (src1 != nullptr) { + if (tensor->op == GGML_OP_FLASH_ATTN_EXT) { const float * params = (const float *)tensor->op_params; - tensor_clone = ggml_soft_max_ext(ggml_ctx, src_clone[0], src_clone[1], params[0], params[1]); - } else { - tensor_clone = ggml_soft_max(ggml_ctx, src_clone[0]); - } - } else if (tensor->op == GGML_OP_SOFT_MAX_BACK) { - tensor_clone = ggml_soft_max_ext_back(ggml_ctx, src_clone[0], src_clone[1], ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]); - } else if (tensor->op == GGML_OP_DIAG_MASK_INF) { - tensor_clone = ggml_diag_mask_inf(ggml_ctx, src_clone[0], tensor->op_params[0]); - } else if (tensor->op == GGML_OP_ROPE || tensor->op == GGML_OP_ROPE_BACK) { - const int n_dims = ((int32_t *) tensor->op_params)[1]; - const int mode = ((int32_t *) tensor->op_params)[2]; - //const int n_ctx_ggml = ((int32_t *) tensor->op_params)[3]; - const int n_ctx_orig_ggml = ((int32_t *) tensor->op_params)[4]; - const float freq_base = ((float *) tensor->op_params)[5]; - const float freq_scale = ((float *) tensor->op_params)[6]; - const float ext_factor = ((float *) tensor->op_params)[7]; - const float attn_factor = ((float *) tensor->op_params)[8]; - const float beta_fast = ((float *) tensor->op_params)[9]; - const float beta_slow = ((float *) tensor->op_params)[10]; - if (mode & GGML_ROPE_TYPE_MROPE) { - int32_t *sections = ((int32_t *) tensor->op_params) + 11; - if (tensor->op == GGML_OP_ROPE) { - tensor_clone = ggml_rope_multi(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, sections, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); - } else { - tensor_clone = ggml_rope_multi_back(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, sections, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + tensor_clone = ggml_flash_attn_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], params[0], params[1], params[2]); + if (src_clone[4]) { + ggml_flash_attn_ext_add_sinks(tensor_clone, src_clone[4]); } - } else { - if (tensor->op == GGML_OP_ROPE) { - tensor_clone = ggml_rope_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + } else if (tensor->op == GGML_OP_MUL_MAT) { + tensor_clone = ggml_mul_mat(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_MUL_MAT_ID) { + tensor_clone = ggml_mul_mat_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]); + } else if (tensor->op == GGML_OP_SUB) { + tensor_clone = ggml_sub(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_MUL) { + tensor_clone = ggml_mul(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_DIV) { + tensor_clone = 
ggml_div(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_CONCAT) { + tensor_clone = ggml_concat(ggml_ctx, src_clone[0], src_clone[1], *(int *)tensor->op_params); + } else if (tensor->op == GGML_OP_UPSCALE) { + tensor_clone = ggml_interpolate(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], (ggml_scale_mode) tensor->op_params[0]); + } else if (tensor->op == GGML_OP_SCALE) { + const float * params = (const float *)tensor->op_params; + tensor_clone = ggml_scale_bias(ggml_ctx, src_clone[0], params[0], params[1]); + } else if (tensor->op == GGML_OP_SQR) { + tensor_clone = ggml_sqr(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_SQRT) { + tensor_clone = ggml_sqrt(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_SIN) { + tensor_clone = ggml_sin(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_COS) { + tensor_clone = ggml_cos(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_CLAMP) { + const float * params = (const float *)tensor->op_params; + tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]); + } else if (tensor->op == GGML_OP_PAD) { + tensor_clone = ggml_pad_ext(ggml_ctx, src_clone[0], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3], + tensor->op_params[4], tensor->op_params[5], tensor->op_params[6], tensor->op_params[7]); + } else if (tensor->op == GGML_OP_REPEAT) { + tensor_clone = ggml_repeat(ggml_ctx, src_clone[0], tensor); + } else if (tensor->op == GGML_OP_REPEAT_BACK) { + tensor_clone = ggml_repeat_back(ggml_ctx, src_clone[0], tensor); + } else if (tensor->op == GGML_OP_ADD) { + tensor_clone = ggml_add(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_ACC) { + tensor_clone = ggml_acc(ggml_ctx, src_clone[0], src_clone[1], tensor->op_params[0], tensor->op_params[1], tensor->op_params[2], tensor->op_params[3]); + } else if (tensor->op == GGML_OP_NORM) { + tensor_clone = ggml_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params); + } else if (tensor->op == GGML_OP_GROUP_NORM) { + const float * float_params = (const float *)tensor->op_params; + tensor_clone = ggml_group_norm(ggml_ctx, src_clone[0], tensor->op_params[0], float_params[1]); + } else if (tensor->op == GGML_OP_RMS_NORM) { + tensor_clone = ggml_rms_norm(ggml_ctx, src_clone[0], *(float *)tensor->op_params); + } else if (tensor->op == GGML_OP_RMS_NORM_BACK) { + const float eps = ((float *) tensor->op_params)[0]; + tensor_clone = ggml_rms_norm_back(ggml_ctx, src_clone[0], src_clone[1], eps); + } else if (tensor->op == GGML_OP_SILU_BACK) { + tensor_clone = ggml_silu_back(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_L2_NORM) { + const float eps = ((float *) tensor->op_params)[0]; + tensor_clone = ggml_l2_norm(ggml_ctx, src_clone[0], eps); + } else if (tensor->op == GGML_OP_SOFT_MAX) { + if (tensor->src[1] != nullptr) { + const float * params = (const float *)tensor->op_params; + tensor_clone = ggml_soft_max_ext(ggml_ctx, src_clone[0], src_clone[1], params[0], params[1]); } else { - tensor_clone = ggml_rope_ext_back(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + tensor_clone = ggml_soft_max(ggml_ctx, src_clone[0]); } + } else if (tensor->op == GGML_OP_SOFT_MAX_BACK) { + tensor_clone = ggml_soft_max_ext_back(ggml_ctx, src_clone[0], src_clone[1], ((float *)tensor->op_params)[0], ((float *)tensor->op_params)[1]); + 
} else if (tensor->op == GGML_OP_DIAG_MASK_INF) { + tensor_clone = ggml_diag_mask_inf(ggml_ctx, src_clone[0], tensor->op_params[0]); + } else if (tensor->op == GGML_OP_ROPE || tensor->op == GGML_OP_ROPE_BACK) { + const int n_dims = ((int32_t *) tensor->op_params)[1]; + const int mode = ((int32_t *) tensor->op_params)[2]; + //const int n_ctx_ggml = ((int32_t *) tensor->op_params)[3]; + const int n_ctx_orig_ggml = ((int32_t *) tensor->op_params)[4]; + const float freq_base = ((float *) tensor->op_params)[5]; + const float freq_scale = ((float *) tensor->op_params)[6]; + const float ext_factor = ((float *) tensor->op_params)[7]; + const float attn_factor = ((float *) tensor->op_params)[8]; + const float beta_fast = ((float *) tensor->op_params)[9]; + const float beta_slow = ((float *) tensor->op_params)[10]; + if (mode & GGML_ROPE_TYPE_MROPE) { + int32_t *sections = ((int32_t *) tensor->op_params) + 11; + if (tensor->op == GGML_OP_ROPE) { + tensor_clone = ggml_rope_multi(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, sections, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + } else { + tensor_clone = ggml_rope_multi_back(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, sections, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + } + } else { + if (tensor->op == GGML_OP_ROPE) { + tensor_clone = ggml_rope_ext(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + } else { + tensor_clone = ggml_rope_ext_back(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + } + } + } else if (tensor->op == GGML_OP_UNARY) { + switch (ggml_get_unary_op(tensor)) { + case GGML_UNARY_OP_EXP: + tensor_clone = ggml_exp(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_SILU: + tensor_clone = ggml_silu(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_GELU: + tensor_clone = ggml_gelu(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_GELU_ERF: + tensor_clone = ggml_gelu_erf(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_GELU_QUICK: + tensor_clone = ggml_gelu_quick(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_RELU: + tensor_clone = ggml_relu(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_TANH: + tensor_clone = ggml_tanh(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_SIGMOID: + tensor_clone = ggml_sigmoid(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_HARDSIGMOID: + tensor_clone = ggml_hardsigmoid(ggml_ctx, src_clone[0]); + break; + case GGML_UNARY_OP_HARDSWISH: + tensor_clone = ggml_hardswish(ggml_ctx, src_clone[0]); + break; + default: + std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; + GGML_ABORT("fatal error"); + } + } else if (tensor->op == GGML_OP_GLU) { + if (src_clone[1] == nullptr) { + tensor_clone = ggml_glu(ggml_ctx, src_clone[0], (ggml_glu_op) tensor->op_params[0], tensor->op_params[1]); + } else { + tensor_clone = ggml_glu_split(ggml_ctx, src_clone[0], src_clone[1], (ggml_glu_op) tensor->op_params[0]); + } + ggml_set_op_params_i32(tensor_clone, 2, ggml_get_op_params_i32(tensor, 2)); + ggml_set_op_params_i32(tensor_clone, 3, ggml_get_op_params_i32(tensor, 3)); + } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) { + if (tensor->src[1] == nullptr) { + tensor_clone = ggml_dup(ggml_ctx, 
src_clone[0]); + tensor_clone->type = tensor->type; + } else { + tensor_clone = ggml_cpy(ggml_ctx, src_clone[0], src_clone[1]); + } + } else if (tensor->op == GGML_OP_CONT) { + tensor_clone = ggml_cont_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); + } else if (tensor->op == GGML_OP_RESHAPE) { + tensor_clone = ggml_reshape_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); + } else if (tensor->op == GGML_OP_VIEW) { + tensor_clone = ggml_view_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[1], tensor->nb[2], tensor->nb[3], ((int32_t *) tensor->op_params)[0]); + } else if (tensor->op == GGML_OP_PERMUTE) { + int32_t * params = (int32_t *)tensor->op_params; + tensor_clone = ggml_permute(ggml_ctx, src_clone[0], params[0], params[1], params[2], params[3]); + } else if (tensor->op == GGML_OP_TRANSPOSE) { + tensor_clone = ggml_transpose(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_GET_ROWS) { + tensor_clone = ggml_get_rows(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_ARGSORT) { + tensor_clone = ggml_argsort(ggml_ctx, src_clone[0], (ggml_sort_order) *(int *)tensor->op_params); + } else if (tensor->op == GGML_OP_SUM) { + tensor_clone = ggml_sum(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_SUM_ROWS) { + tensor_clone = ggml_sum_rows(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_MEAN) { + tensor_clone = ggml_mean(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_ARGMAX) { + tensor_clone = ggml_argmax(ggml_ctx, src_clone[0]); + } else if (tensor->op == GGML_OP_COUNT_EQUAL) { + tensor_clone = ggml_count_equal(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_IM2COL) { + const int32_t s0 = tensor->op_params[0]; + const int32_t s1 = tensor->op_params[1]; + const int32_t p0 = tensor->op_params[2]; + const int32_t p1 = tensor->op_params[3]; + const int32_t d0 = tensor->op_params[4]; + const int32_t d1 = tensor->op_params[5]; + + const bool is_2D = tensor->op_params[6] == 1; + tensor_clone = ggml_im2col(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1, is_2D, tensor->type); + } else if (tensor->op == GGML_OP_IM2COL_3D) { + const int32_t s0 = tensor->op_params[0]; + const int32_t s1 = tensor->op_params[1]; + const int32_t s2 = tensor->op_params[2]; + const int32_t p0 = tensor->op_params[3]; + const int32_t p1 = tensor->op_params[4]; + const int32_t p2 = tensor->op_params[5]; + const int32_t d0 = tensor->op_params[6]; + const int32_t d1 = tensor->op_params[7]; + const int32_t d2 = tensor->op_params[8]; + const int32_t IC = tensor->op_params[9]; + + tensor_clone = ggml_im2col_3d(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type); + } else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) { + const int32_t dim = tensor->op_params[0]; + const int32_t max_period = tensor->op_params[1]; + tensor_clone = ggml_timestep_embedding(ggml_ctx, src_clone[0], dim, max_period); + } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_1D){ + const int32_t s0 = tensor->op_params[0]; + const int32_t p0 = tensor->op_params[1]; + const int32_t d0 = tensor->op_params[2]; + tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0); + } else if (tensor->op == GGML_OP_POOL_2D) { + enum ggml_op_pool op = static_cast(tensor->op_params[0]); + const int32_t k0 = tensor->op_params[1]; + const int32_t k1 = tensor->op_params[2]; + const 
int32_t s0 = tensor->op_params[3]; + const int32_t s1 = tensor->op_params[4]; + const int32_t p0 = tensor->op_params[5]; + const int32_t p1 = tensor->op_params[6]; + + tensor_clone = ggml_pool_2d(ggml_ctx, src_clone[0], op, k0, k1, s0, s1, p0, p1); + } else if (tensor->op == GGML_OP_CONV_2D) { + const int32_t s0 = tensor->op_params[0]; + const int32_t s1 = tensor->op_params[1]; + const int32_t p0 = tensor->op_params[2]; + const int32_t p1 = tensor->op_params[3]; + const int32_t d0 = tensor->op_params[4]; + const int32_t d1 = tensor->op_params[5]; + tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1); + } else if (tensor->op == GGML_OP_CONV_2D_DW) { + const int32_t s0 = tensor->op_params[0]; + const int32_t s1 = tensor->op_params[1]; + const int32_t p0 = tensor->op_params[2]; + const int32_t p1 = tensor->op_params[3]; + const int32_t d0 = tensor->op_params[4]; + const int32_t d1 = tensor->op_params[5]; + tensor_clone = ggml_conv_2d_dw_direct(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1); + } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_2D) { + const int32_t s = tensor->op_params[0]; + tensor_clone = ggml_conv_transpose_2d_p0(ggml_ctx, src_clone[0], src_clone[1], s); + } else if (tensor->op == GGML_OP_LEAKY_RELU) { + const float * op_params = (const float *)tensor->op_params; + tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false); + } else if (tensor->op == GGML_OP_RWKV_WKV6) { + tensor_clone = ggml_rwkv_wkv6(ggml_ctx, src_clone[0], src_clone[1], + src_clone[2], src_clone[3], src_clone[4], src_clone[5]); + } else if (tensor->op == GGML_OP_RWKV_WKV7) { + tensor_clone = ggml_rwkv_wkv7(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], + src_clone[4], src_clone[5], src_clone[6]); + } else if (tensor->op == GGML_OP_OPT_STEP_ADAMW) { + src_clone[0]->flags = tensor->src[0]->flags; + tensor_clone = ggml_opt_step_adamw(ggml_ctx, src_clone[0], src_clone[1], + src_clone[2], src_clone[3], src_clone[4]); + } else if (tensor->op == GGML_OP_OPT_STEP_SGD) { + src_clone[0]->flags = tensor->src[0]->flags; + tensor_clone = ggml_opt_step_sgd(ggml_ctx, src_clone[0], src_clone[1], + src_clone[2]); + } else if (tensor->op == GGML_OP_ADD_ID) { + tensor_clone = ggml_add_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]); + } else if (tensor->op == GGML_OP_SSM_SCAN) { + tensor_clone = ggml_ssm_scan(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], + src_clone[3], src_clone[4], src_clone[5], src_clone[6]); + } else if (tensor->op == GGML_OP_SSM_CONV) { + tensor_clone = ggml_ssm_conv(ggml_ctx, src_clone[0], src_clone[1]); + } else if (tensor->op == GGML_OP_ROLL) { + const int32_t s0 = tensor->op_params[0]; + const int32_t s1 = tensor->op_params[1]; + const int32_t s2 = tensor->op_params[2]; + const int32_t s3 = tensor->op_params[3]; + tensor_clone = ggml_roll(ggml_ctx, src_clone[0], s0, s1, s2, s3); } - } else if (tensor->op == GGML_OP_UNARY) { - switch (ggml_get_unary_op(tensor)) { - case GGML_UNARY_OP_EXP: - tensor_clone = ggml_exp(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_SILU: - tensor_clone = ggml_silu(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_GELU: - tensor_clone = ggml_gelu(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_GELU_ERF: - tensor_clone = ggml_gelu_erf(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_GELU_QUICK: - tensor_clone = ggml_gelu_quick(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_RELU: - tensor_clone = ggml_relu(ggml_ctx, src_clone[0]); - break; - 
case GGML_UNARY_OP_TANH: - tensor_clone = ggml_tanh(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_SIGMOID: - tensor_clone = ggml_sigmoid(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_HARDSIGMOID: - tensor_clone = ggml_hardsigmoid(ggml_ctx, src_clone[0]); - break; - case GGML_UNARY_OP_HARDSWISH: - tensor_clone = ggml_hardswish(ggml_ctx, src_clone[0]); - break; - default: + else { std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; GGML_ABORT("fatal error"); } - } else if (tensor->op == GGML_OP_GLU) { - if (src_clone[1] == nullptr) { - tensor_clone = ggml_glu(ggml_ctx, src_clone[0], (ggml_glu_op) tensor->op_params[0], tensor->op_params[1]); - } else { - tensor_clone = ggml_glu_split(ggml_ctx, src_clone[0], src_clone[1], (ggml_glu_op) tensor->op_params[0]); - } - ggml_set_op_params_i32(tensor_clone, 2, ggml_get_op_params_i32(tensor, 2)); - ggml_set_op_params_i32(tensor_clone, 3, ggml_get_op_params_i32(tensor, 3)); - } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) { - if (src1 == nullptr) { - tensor_clone = ggml_dup(ggml_ctx, src_clone[0]); - tensor_clone->type = tensor->type; - } else { - tensor_clone = ggml_cpy(ggml_ctx, src_clone[0], src_clone[1]); - } - } else if (tensor->op == GGML_OP_CONT) { - tensor_clone = ggml_cont_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); - } else if (tensor->op == GGML_OP_RESHAPE) { - tensor_clone = ggml_reshape_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); - } else if (tensor->op == GGML_OP_VIEW) { - tensor_clone = ggml_view_4d(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[1], tensor->nb[2], tensor->nb[3], ((int32_t *) tensor->op_params)[0]); - } else if (tensor->op == GGML_OP_PERMUTE) { - int32_t * params = (int32_t *)tensor->op_params; - tensor_clone = ggml_permute(ggml_ctx, src_clone[0], params[0], params[1], params[2], params[3]); - } else if (tensor->op == GGML_OP_TRANSPOSE) { - tensor_clone = ggml_transpose(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_GET_ROWS) { - tensor_clone = ggml_get_rows(ggml_ctx, src_clone[0], src_clone[1]); - } else if (tensor->op == GGML_OP_ARGSORT) { - tensor_clone = ggml_argsort(ggml_ctx, src_clone[0], (ggml_sort_order) *(int *)tensor->op_params); - } else if (tensor->op == GGML_OP_SUM) { - tensor_clone = ggml_sum(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_SUM_ROWS) { - tensor_clone = ggml_sum_rows(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_MEAN) { - tensor_clone = ggml_mean(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_ARGMAX) { - tensor_clone = ggml_argmax(ggml_ctx, src_clone[0]); - } else if (tensor->op == GGML_OP_COUNT_EQUAL) { - tensor_clone = ggml_count_equal(ggml_ctx, src_clone[0], src_clone[1]); - } else if (tensor->op == GGML_OP_IM2COL) { - const int32_t s0 = tensor->op_params[0]; - const int32_t s1 = tensor->op_params[1]; - const int32_t p0 = tensor->op_params[2]; - const int32_t p1 = tensor->op_params[3]; - const int32_t d0 = tensor->op_params[4]; - const int32_t d1 = tensor->op_params[5]; - - const bool is_2D = tensor->op_params[6] == 1; - tensor_clone = ggml_im2col(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1, is_2D, tensor->type); - } else if (tensor->op == GGML_OP_IM2COL_3D) { - const int32_t s0 = tensor->op_params[0]; - const int32_t s1 = tensor->op_params[1]; - const int32_t s2 = tensor->op_params[2]; - const int32_t p0 = 
tensor->op_params[3]; - const int32_t p1 = tensor->op_params[4]; - const int32_t p2 = tensor->op_params[5]; - const int32_t d0 = tensor->op_params[6]; - const int32_t d1 = tensor->op_params[7]; - const int32_t d2 = tensor->op_params[8]; - const int32_t IC = tensor->op_params[9]; - - tensor_clone = ggml_im2col_3d(ggml_ctx, src_clone[0], src_clone[1], IC, s0, s1, s2, p0, p1, p2, d0, d1, d2, tensor->type); - } else if (tensor->op == GGML_OP_TIMESTEP_EMBEDDING) { - const int32_t dim = tensor->op_params[0]; - const int32_t max_period = tensor->op_params[1]; - tensor_clone = ggml_timestep_embedding(ggml_ctx, src_clone[0], dim, max_period); - } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_1D){ - const int32_t s0 = tensor->op_params[0]; - const int32_t p0 = tensor->op_params[1]; - const int32_t d0 = tensor->op_params[2]; - tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0); - } else if (tensor->op == GGML_OP_POOL_2D) { - enum ggml_op_pool op = static_cast(tensor->op_params[0]); - const int32_t k0 = tensor->op_params[1]; - const int32_t k1 = tensor->op_params[2]; - const int32_t s0 = tensor->op_params[3]; - const int32_t s1 = tensor->op_params[4]; - const int32_t p0 = tensor->op_params[5]; - const int32_t p1 = tensor->op_params[6]; - - tensor_clone = ggml_pool_2d(ggml_ctx, src_clone[0], op, k0, k1, s0, s1, p0, p1); - } else if (tensor->op == GGML_OP_CONV_2D) { - const int32_t s0 = tensor->op_params[0]; - const int32_t s1 = tensor->op_params[1]; - const int32_t p0 = tensor->op_params[2]; - const int32_t p1 = tensor->op_params[3]; - const int32_t d0 = tensor->op_params[4]; - const int32_t d1 = tensor->op_params[5]; - tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1); - } else if (tensor->op == GGML_OP_CONV_TRANSPOSE_2D) { - const int32_t s = tensor->op_params[0]; - tensor_clone = ggml_conv_transpose_2d_p0(ggml_ctx, src_clone[0], src_clone[1], s); - } else if (tensor->op == GGML_OP_LEAKY_RELU) { - const float * op_params = (const float *)tensor->op_params; - tensor_clone = ggml_leaky_relu(ggml_ctx, src_clone[0], op_params[0], false); - } else if (tensor->op == GGML_OP_RWKV_WKV6) { - tensor_clone = ggml_rwkv_wkv6(ggml_ctx, src_clone[0], src_clone[1], - src_clone[2], src_clone[3], src_clone[4], src_clone[5]); - } else if (tensor->op == GGML_OP_RWKV_WKV7) { - tensor_clone = ggml_rwkv_wkv7(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], src_clone[3], - src_clone[4], src_clone[5], src_clone[6]); - } else if (tensor->op == GGML_OP_OPT_STEP_ADAMW) { - src_clone[0]->flags = src0->flags; - tensor_clone = ggml_opt_step_adamw(ggml_ctx, src_clone[0], src_clone[1], - src_clone[2], src_clone[3], src_clone[4]); - } else if (tensor->op == GGML_OP_OPT_STEP_SGD) { - src_clone[0]->flags = src0->flags; - tensor_clone = ggml_opt_step_sgd(ggml_ctx, src_clone[0], src_clone[1], - src_clone[2]); - } else if (tensor->op == GGML_OP_ADD_ID) { - tensor_clone = ggml_add_id(ggml_ctx, src_clone[0], src_clone[1], src_clone[2]); - } else if (tensor->op == GGML_OP_SSM_SCAN) { - tensor_clone = ggml_ssm_scan(ggml_ctx, src_clone[0], src_clone[1], src_clone[2], - src_clone[3], src_clone[4], src_clone[5], src_clone[6]); - } else if (tensor->op == GGML_OP_SSM_CONV) { - tensor_clone = ggml_ssm_conv(ggml_ctx, src_clone[0], src_clone[1]); - } - else { - std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; - GGML_ABORT("fatal error"); + cloned_tensors[tensor] = tensor_clone; } ggml_cgraph * cgraph_cpu = ggml_new_graph(ggml_ctx); @@ 
-14476,10 +14475,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * memcpy(comp_result, tensor_clone->data, comp_size); memcpy(comp_nb, tensor_clone->nb, sizeof(size_t) * GGML_MAX_DIMS); - for (int i = 0; i < GGML_MAX_SRC; i++) { - if (src_buffer[i] != nullptr) { - free(src_buffer[i]); - } + for (auto m : cloned_mallocs) { + free(m); } ggml_free(ggml_ctx); @@ -14488,15 +14485,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * } static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, int tensor_idx) { - ggml_tensor * tensor = cgraph->nodes[tensor_idx]; + ggml_tensor * tensor = cgraph->nodes[tensor_idx + ctx->num_additional_fused_ops]; if (tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_SET_ROWS) { return; } - if (ctx->num_additional_fused_ops == 1 && - tensor->op == GGML_OP_RMS_NORM && - cgraph->nodes[tensor_idx + 1]->op == GGML_OP_MUL) { - tensor = cgraph->nodes[tensor_idx + 1]; - } if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) { return; From 230d1169e5bfe04a013b2e20f4662ee56c2454b0 Mon Sep 17 00:00:00 2001 From: bssrdf Date: Wed, 5 Nov 2025 15:55:04 -0500 Subject: [PATCH 05/32] improve CUDA cpy memory bandwidth when copying a transposed tensor (#16841) * WIP * added a cpy kernel specific to a transposed tensor which uses smem to avoid uncoalesced access; test cases also added showing improved memory bandwidth * added BF16 support * more strict check to make sure src0 is a transpose * reformulated to handle more complicated transpose cases * bring back 2D transpose for higher performance * allow build on windows * transpose copy more shapes * minor tweak * final clean up * restore some test cases * keep only the kernel for the true transposed case; updated with review suggestions * make CI happy * remove headers not needed * reduced bank conflicts for fp16 and bf16 * add missing const* * now bank-conflict free * use padding instead of swizzling --------- Co-authored-by: bssrdf --- ggml/src/ggml-cuda/cpy.cu | 103 ++++++++++++++++++++++++++++++++++--- tests/test-backend-ops.cpp | 33 ++++++++++-- 2 files changed, 126 insertions(+), 10 deletions(-)
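The heart of this patch is the classic shared-memory tiled transpose: a 32x32 tile is read with coalesced loads, transposed through shared memory, and written back with coalesced stores, while one extra padding column keeps the column-wise tile reads in distinct banks (the "padding instead of swizzling" item above). Stripped of the batching and multi-type handling of the real kernel (CUDA_CPY_TILE_DIM_2D/CUDA_CPY_BLOCK_ROWS below correspond to TILE_DIM/BLOCK_ROWS here), the idea looks like this, a minimal sketch with names and launch shapes of my own choosing rather than the patch's:

```cuda
#define TILE_DIM   32
#define BLOCK_ROWS 8

// transpose a rows x cols row-major matrix into dst (cols x rows);
// launched with block (TILE_DIM, BLOCK_ROWS) and
// grid (ceil(cols/TILE_DIM), ceil(rows/TILE_DIM))
__global__ void transpose_tiled(const float * __restrict__ src,
                                float * __restrict__ dst,
                                int rows, int cols) {
    __shared__ float tile[TILE_DIM][TILE_DIM + 1]; // +1 column avoids bank conflicts

    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { // coalesced reads
        if (x < cols && y + j < rows) {
            tile[threadIdx.y + j][threadIdx.x] = src[(y + j) * cols + x];
        }
    }
    __syncthreads();

    x = blockIdx.y * TILE_DIM + threadIdx.x; // swap block coordinates
    y = blockIdx.x * TILE_DIM + threadIdx.y;
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { // coalesced writes
        if (x < rows && y + j < cols) {
            dst[(y + j) * rows + x] = tile[threadIdx.x][threadIdx.y + j];
        }
    }
}
```

Without the padding, the `tile[threadIdx.x][...]` reads in the store phase would have a stride of 32 floats and all land in the same bank; the extra column makes the stride 33 and spreads them across all 32 banks.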
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index c5821acbd..1dba60eb1 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -7,6 +7,10 @@ typedef void (*cpy_kernel_t)(const char * cx, char * cdst); +const int CUDA_CPY_TILE_DIM_2D = 32; // 2D tile dimension for transposed blocks +const int CUDA_CPY_BLOCK_NM = 8; // block size of 3rd dimension if available +const int CUDA_CPY_BLOCK_ROWS = 8; // block dimension for marching through rows + template <cpy_kernel_t cpy_1> static __global__ void cpy_flt(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, @@ -35,6 +39,55 @@ static __global__ void cpy_flt(const char * cx, char * cdst, const int ne, cpy_1(cx + x_offset, cdst + dst_offset); } +template <typename T> +static __global__ void cpy_flt_transpose(const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13) { + + const T* src = reinterpret_cast<const T*>(cx); + T* dst = reinterpret_cast<T*>(cdst); + + const int64_t nmat = ne / (ne00 * ne01); + const int64_t n = ne00 * ne01; + + const int x = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.x; + const int y = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.y; + const int tx = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x; // transpose block offset + const int ty = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y; + + __shared__ float tile[CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1]; + +#pragma unroll + for (int i = 0; i < CUDA_CPY_BLOCK_NM; ++i) { + + const unsigned int imat = blockIdx.z * CUDA_CPY_BLOCK_NM + i; + if (imat >= nmat) + break; + +#pragma unroll + for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) { + if(x < ne01 && y + j < ne00){ + const int row = threadIdx.y+j; + const int col = threadIdx.x * sizeof(float)/sizeof(T); + T *tile2 = reinterpret_cast<T *>(tile[row]); + tile2[col] = src[imat*n + (y+j)*ne01 + x]; + } + } + + __syncthreads(); + +#pragma unroll + for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) { + if (ty + j < ne01 && tx < ne00) { + const int col = (threadIdx.y+j)*sizeof(float)/sizeof(T); + const T *tile2 = reinterpret_cast<const T *>(tile[threadIdx.x]); + dst[imat*n + (ty+j)*ne00 + tx] = tile2[col]; + } + } + } +} + static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) { float * cdstf = (float *)(cdsti); @@ -136,15 +189,38 @@ cudaStream_t stream) { (cx, cdst, ne); } -template <typename src_t, typename dst_t> +template <typename src_t, typename dst_t, bool transposed = false> static void ggml_cpy_flt_cuda( const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { - const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; - cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + if (transposed) { + GGML_ASSERT(ne == ne00*ne01*ne02); // ne[3] is 1 assumed + int ne00n, ne01n, ne02n; + if (nb00 < nb02) { + ne00n = ne00; + ne01n = ne01; + ne02n = ne02; + } else if (nb00 > nb02) { + ne00n = ne00; + ne01n = ne01*ne02; + ne02n = 1; + } else { + GGML_ASSERT(false); + } + + dim3 dimGrid( (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D, + (ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D, + (ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM); + dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1); + cpy_flt_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>> + (cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + } else { + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>> + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + } } static void ggml_cpy_f32_q8_0_cuda( @@ -310,6 +386,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg char * src1_ddc = (char *) src1->data; const bool contiguous_srcs = ggml_is_contiguous(src0) && ggml_is_contiguous(src1); + const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) && src0->ne[3] == 1; if (src0->type == src1->type && contiguous_srcs) { GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1)); @@ -322,7 +399,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream)); } } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { - ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, 
nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + if (can_be_transposed) { + ggml_cpy_flt_cuda<float, float, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else { + ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) { if (contiguous_srcs) { ggml_cpy_flt_contiguous_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream); @@ -361,7 +442,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) { ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { - ggml_cpy_flt_cuda<half, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + if (can_be_transposed) { + ggml_cpy_flt_cuda<half, half, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else { + ggml_cpy_flt_cuda<half, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_BF16) { if (contiguous_srcs) { ggml_cpy_flt_contiguous_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream); @@ -375,7 +460,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) { - ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + if (can_be_transposed) { + ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else { + ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) { if (contiguous_srcs) { ggml_cpy_flt_contiguous_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, main_stream); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 967a53c63..f57542027 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2576,9 +2576,10 @@ struct test_cpy : public test_case { const std::array<int64_t, 4> permute_dst; bool _src_use_permute; bool _dst_use_permute; + bool _src_transpose; std::string vars() override { - return VARS_TO_STR5(type_src, type_dst, ne, permute_src, permute_dst); + return VARS_TO_STR6(type_src, type_dst, ne, permute_src, permute_dst, _src_transpose); } double max_nmse_err() override { @@ -2616,10 +2617,12 @@ struct test_cpy : public test_case { test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32, std::array<int64_t, 4> ne = {10, 10, 10, 1}, std::array<int64_t, 4> permute_src = {0, 0, 0, 0}, - std::array<int64_t, 4> permute_dst = {0, 0, 0, 0}) + std::array<int64_t, 4> permute_dst = {0, 0, 0, 0}, + bool transpose_src = false) : type_src(type_src), type_dst(type_dst), ne(ne), permute_src(permute_src), permute_dst(permute_dst), 
_src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0), - _dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0) {} + _dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0), + _src_transpose(transpose_src){} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data()); @@ -2631,6 +2634,11 @@ struct test_cpy : public test_case { ggml_set_name(src, "src_permuted"); } + if (_src_transpose) { + src = ggml_transpose(ctx, src); + ggml_set_name(src, "src_transposed"); + } + ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, src->ne); ggml_set_name(dst, "dst"); @@ -6641,6 +6649,13 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}, {1, 0, 2, 3})); test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4})); test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}, {1, 0, 2, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 3}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); test_cases.emplace_back(new test_cont()); test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1})); @@ -7385,6 +7400,18 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q4_0, {8192, 512, 2, 1})); test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_F32, {8192, 512, 2, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0})); + + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); 
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {12888, 256, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 4096, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); From 6db3d1ffe6d34e8becfe95cf2f109198a7154db3 Mon Sep 17 00:00:00 2001 From: l3utterfly Date: Thu, 6 Nov 2025 13:46:38 +0800 Subject: [PATCH 06/32] ggml-hexagon: graceful fallback for older SoCs where rpcmem_alloc2 and FASTRPC_GET_URI are unsupported (#16987) * support older SoCs where FASTRPC_GET_URI is unsupported * added graceful fallback when FASTRPC_GET_URI call fails * use weak symbols instead of loading libcdsprpc.so dynamically * Add weak pragma for rpcmem_alloc2 * Remove weak declaration for rpcmem_alloc2 in ggml-hexagon.cpp Removed weak declaration for rpcmem_alloc2. * Enforce ndev to 1 for archs below v75 Force ndev to 1 for SoC architectures lower than v75. --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 28 ++++++++++++++++++++------ ggml/src/ggml-hexagon/htp-utils.h | 1 + 2 files changed, 23 insertions(+), 6 deletions(-)
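The core mechanism here is the weak-symbol pattern already used for the other FastRPC entry points in htp-utils.h: declaring the symbol weak lets the binary link and load even when the installed libcdsprpc.so predates the function, and the unresolved symbol then compares as a null pointer at runtime. In isolation the pattern looks like this (a sketch; the rpcmem signatures are assumptions, not copied from the SDK headers):

```cpp
#include <cstddef>

extern "C" {
void * rpcmem_alloc (int heap_id, unsigned flags, int size);
void * rpcmem_alloc2(int heap_id, unsigned flags, size_t size); // may be absent
#pragma weak rpcmem_alloc2
}

static void * alloc_rpc_buffer(int heap_id, unsigned flags, size_t size) {
    if (rpcmem_alloc2) {                             // symbol resolved: newer runtime
        return rpcmem_alloc2(heap_id, flags, size);
    }
    return rpcmem_alloc(heap_id, flags, (int) size); // older SoCs: legacy allocator
}
```

This relies on `#pragma weak` from the GNU toolchain on ELF platforms, which is why the pragma in the patch sits behind the same guard as the existing weak declarations.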
Falling back to single session URI: %s\n", dev_id, err, session_uri); } } @@ -3668,6 +3679,11 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) { } } + if(opt_arch < 75) { + opt_ndev = 1; + GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n"); + } + GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch); // Create devices / sessions diff --git a/ggml/src/ggml-hexagon/htp-utils.h b/ggml/src/ggml-hexagon/htp-utils.h index 66f9fd373..1a48f5dcb 100644 --- a/ggml/src/ggml-hexagon/htp-utils.h +++ b/ggml/src/ggml-hexagon/htp-utils.h @@ -64,6 +64,7 @@ extern "C" { # pragma weak remote_handle64_control # pragma weak fastrpc_mmap # pragma weak fastrpc_munmap +# pragma weak rpcmem_alloc2 #endif #if !defined(_WINDOWS) From 22c8c3c6ad8d6239c366ce3a84ace8b0ac59ef88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 6 Nov 2025 08:14:35 +0100 Subject: [PATCH 07/32] docs: explain CUDA 11 compilation [no ci] (#16824) --- docs/build.md | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/docs/build.md b/docs/build.md index b410c710e..7d244ff01 100644 --- a/docs/build.md +++ b/docs/build.md @@ -178,6 +178,48 @@ GeForce RTX 3070 8.6 cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="86;89" ``` +### Overriding the CUDA Version + +If you have multiple CUDA installations on your system and want to compile llama.cpp for a specific one, e.g. for CUDA 11.7 installed under `/opt/cuda-11.7`: + +```bash +cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/opt/cuda-11.7/bin/nvcc -DCMAKE_INSTALL_RPATH="/opt/cuda-11.7/lib64;\$ORIGIN" -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON +``` + +#### Fixing Compatibility Issues with Old CUDA and New glibc + +If you try to use an old CUDA version (e.g. v11.7) with a new glibc version you can get errors like this: + +``` +/usr/include/bits/mathcalls.h(83): error: exception specification is + incompatible with that of previous function "cospi" + + + /opt/cuda-11.7/bin/../targets/x86_64-linux/include/crt/math_functions.h(5545): + here +``` + +It seems the least bad solution is to patch the CUDA installation to declare the correct signatures. 
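+If you prefer to script the change, a `sed` one-liner along these lines applies the same
+edit that is spelled out below (untested sketch; adjust the path to your installation and
+keep the `.bak` backup it creates):
+
+```bash
+sudo sed -i.bak -E \
+    's/(double (cospi|sinpi|rsqrt)\(double x\));/\1 noexcept (true);/; s/(float (cospif|sinpif|rsqrtf)\(float x\));/\1 noexcept (true);/' \
+    /opt/cuda-11.7/targets/x86_64-linux/include/crt/math_functions.h
+```
+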
+Replace the following lines in `/path/to/your/cuda/installation/targets/x86_64-linux/include/crt/math_functions.h`: + +```C++ +// original lines +extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double cospi(double x); +extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float cospif(float x); +extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double sinpi(double x); +extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float sinpif(float x); +extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double rsqrt(double x); +extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float rsqrtf(float x); + +// edited lines +extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double cospi(double x) noexcept (true); +extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float cospif(float x) noexcept (true); +extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double sinpi(double x) noexcept (true); +extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float sinpif(float x) noexcept (true); +extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double rsqrt(double x) noexcept (true); +extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float rsqrtf(float x) noexcept (true); +``` + ### Runtime CUDA environmental variables You may set the [cuda environmental variables](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) at runtime. From 9d7c518d642db8657e2a53a9793c5f039ed8ea5a Mon Sep 17 00:00:00 2001 From: YehuditE Date: Thu, 6 Nov 2025 12:02:33 +0200 Subject: [PATCH 08/32] sycl: add CONCAT operator support (#16047) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * sycl: add CONCAT operator support * cleanup: remove stray lines added by mistake * fix: code format issues in concat.cpp and tests/test-backend-ops.cpp * chore: fix editorconfig violations * cleanup: drop unnecessary i16 type support * docs: update sycl-csv and regenerate ops.md * update docs/ops.md * fix: adapt to upstream master changes after rebase * fix: remove empty files * fix: drop whitespace --------- Co-authored-by: Sigbjørn Skjæret --- docs/ops.md | 2 +- docs/ops/SYCL.csv | 32 +++++------ ggml/src/ggml-sycl/concat.cpp | 99 ++++++++++++++++++-------------- ggml/src/ggml-sycl/ggml-sycl.cpp | 6 +- 4 files changed, 73 insertions(+), 66 deletions(-) diff --git a/docs/ops.md b/docs/ops.md index b8fcf0463..775b938bd 100644 --- a/docs/ops.md +++ b/docs/ops.md @@ -24,7 +24,7 @@ Legend: | ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | ❌ | ❌ | | CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | -| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | +| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | | CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ | | CONV_2D | ❌ | ❌ | ✅ | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | | CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | diff --git a/docs/ops/SYCL.csv b/docs/ops/SYCL.csv index 101e80f64..d7e71990a 100644 --- a/docs/ops/SYCL.csv +++ b/docs/ops/SYCL.csv @@ -9307,37 +9307,37 @@ "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=1","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=1","support","1","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL" -"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","0","no","SYCL" 
+"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","0","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL" -"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","0","no","SYCL" +"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","0","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL" -"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","0","no","SYCL" +"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","0","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL" -"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","0","no","SYCL" +"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","0","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL" -"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","0","no","SYCL" +"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","0","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL" -"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","0","no","SYCL" +"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","0","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL" -"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","0","no","SYCL" +"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","0","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL" -"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","0","no","SYCL" +"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","0","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL" -"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","0","no","SYCL" +"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","0","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL" -"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","0","no","SYCL" +"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","0","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL" -"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","0","no","SYCL" +"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","0","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL" -"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","0","no","SYCL" +"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","0","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL" -"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","0","no","SYCL" +"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","0","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL" 
-"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","0","no","SYCL" +"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","0","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL" -"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","0","no","SYCL" +"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","0","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL" -"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","0","no","SYCL" +"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","0","yes","SYCL" "SYCL0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","SYCL" "SYCL0","ARGSORT","type=f32,ne=[16,10,10,10],order=0","support","1","yes","SYCL" "SYCL0","ARGSORT","type=f32,ne=[60,10,10,10],order=0","support","1","yes","SYCL" diff --git a/ggml/src/ggml-sycl/concat.cpp b/ggml/src/ggml-sycl/concat.cpp index c76836504..d16215bc9 100644 --- a/ggml/src/ggml-sycl/concat.cpp +++ b/ggml/src/ggml-sycl/concat.cpp @@ -11,9 +11,13 @@ // #include "concat.hpp" -#include "common.hpp" -static void concat_f32_dim0(const float *x, const float *y, float *dst, +static inline size_t elem_size(ggml_type t) { + return ggml_type_size(t) / ggml_blck_size(t); +} + +template +static void concat_T_dim0(const T *x, const T *y, T *dst, const int ne0, const int ne00, const sycl::nd_item<3> &item_ct1) { int nidx = item_ct1.get_local_id(2) + @@ -36,7 +40,8 @@ static void concat_f32_dim0(const float *x, const float *y, float *dst, } } -static void concat_f32_dim1(const float *x, const float *y, float *dst, +template +static void concat_T_dim1(const T *x, const T *y, T *dst, const int ne0, const int ne01, const sycl::nd_item<3> &item_ct1) { int nidx = item_ct1.get_local_id(2) + @@ -59,7 +64,8 @@ static void concat_f32_dim1(const float *x, const float *y, float *dst, } } -static void concat_f32_dim2(const float *x, const float *y, float *dst, +template +static void concat_T_dim2(const T *x, const T *y, T *dst, const int ne0, const int ne02, const sycl::nd_item<3> &item_ct1) { int nidx = item_ct1.get_local_id(2) + @@ -82,45 +88,35 @@ static void concat_f32_dim2(const float *x, const float *y, float *dst, } } -static void concat_f32_sycl(const float *x, const float *y, float *dst, +template +static void concat_T_sycl(const T *x, const T *y, T *dst, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, int dim, queue_ptr stream) { int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE; sycl::range<3> gridDim(ne2, ne1, num_blocks); switch (dim) { case 0: - stream->parallel_for( - sycl::nd_range<3>(gridDim * - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1); - }); - break; + stream->parallel_for(sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { concat_T_dim0(x, y, dst, ne0, ne00, item_ct1); }); + break; case 1: - stream->parallel_for( - sycl::nd_range<3>(gridDim * - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1); - }); - break; + stream->parallel_for(sycl::nd_range<3>(gridDim * 
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { concat_T_dim1(x, y, dst, ne0, ne01, item_ct1); }); + break; // dim >=2 will be dispatched to the default path default: - stream->parallel_for( - sycl::nd_range<3>(gridDim * - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1); - }); - break; + stream->parallel_for(sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { concat_T_dim2(x, y, dst, ne0, ne02, item_ct1); }); + break; } } // non-contiguous kernel (slow) -static void concat_f32_sycl_non_cont( +template +static void concat_T_sycl_non_cont( queue_ptr stream, const char *src0, const char *src1, char *dst, int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, uint64_t nb00, uint64_t nb01, uint64_t nb02, uint64_t nb03, int64_t /*ne10*/, @@ -137,24 +133,25 @@ static void concat_f32_sycl_non_cont( int64_t o[4] = { 0, 0, 0, 0 }; o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03)); - const float * x; + const T * x; for (int i0 = item_ct1.get_local_id(2); i0 < ne0; i0 += item_ct1.get_local_range(2)) { if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { - x = (const float *) (src0 + (i3) *nb03 + (i2) *nb02 + (i1) *nb01 + (i0) *nb00); + x = (const T *) (src0 + (i3) *nb03 + (i2) *nb02 + (i1) *nb01 + (i0) *nb00); } else { - x = (const float *) (src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 + (i1 - o[1]) * nb11 + + x = (const T *) (src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 + (i1 - o[1]) * nb11 + (i0 - o[0]) * nb10); } - float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0); + T *y = (T *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0); *y = *x; } }); } -void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { +template +void concat_impl_sycl(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; @@ -163,15 +160,14 @@ void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { const int32_t dim = ((int32_t *) dst->op_params)[0]; if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { - const float * src0_d = (const float *) src0->data; - const float * src1_d = (const float *) src1->data; - - float * dst_d = (float *) dst->data; - + const T * src0_d = (const T *) src0->data; + const T * src1_d = (const T *) src1->data; + T * dst_d = (T *) dst->data; + size_t type_size = elem_size(dst->type); if (dim != 3) { for (int i3 = 0; i3 < dst->ne[3]; i3++) { - concat_f32_sycl(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4), - dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0], + concat_T_sycl(src0_d + i3 * (src0->nb[3] / type_size), src1_d + i3 * (src1->nb[3] / type_size), + dst_d + i3 * (dst->nb[3] / type_size), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], dim, stream); } } else { @@ -179,13 +175,28 @@ void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { const size_t size1 = ggml_nbytes(src1); SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait())); - SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait())); + 
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d + size0 / type_size, src1_d, size1).wait())); } } else { - concat_f32_sycl_non_cont(stream, (const char *) src0->data, (const char *) src1->data, (char *) dst->data, + concat_T_sycl_non_cont(stream, (const char *) src0->data, (const char *) src1->data, (char *) dst->data, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim); } } + +void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + + switch (dst->type) { + case GGML_TYPE_F32: + concat_impl_sycl(ctx, dst); + break; + case GGML_TYPE_I32: + concat_impl_sycl(ctx, dst); + break; + default: + GGML_ASSERT(false && "ggml_sycl_op_concat: unsupported type"); + break; + } +} diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index c97c58994..f3b3e3657 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4534,16 +4534,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g } return false; } - case GGML_OP_CONCAT: - { - ggml_type src0_type = op->src[0]->type; - return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; - } case GGML_OP_REPEAT_BACK: { ggml_type src0_type = op->src[0]->type; return src0_type == GGML_TYPE_F32; } + case GGML_OP_CONCAT: case GGML_OP_DUP: case GGML_OP_ARGMAX: case GGML_OP_NONE: From 4882f0ff786088871b3ab3abd9e525a4213ee218 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Thu, 6 Nov 2025 11:02:54 +0100 Subject: [PATCH 09/32] clip: implement minicpm-v sinusoidal embd using GGML (#17036) * clip: implement minicpm-v sinusoidal embd using GGML * fix repeat op --- tools/mtmd/clip.cpp | 174 ++++++++++++++++---------------------------- 1 file changed, 62 insertions(+), 112 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 16781fb19..d1423b67f 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1083,16 +1083,24 @@ struct clip_graph { } ggml_cgraph * build_minicpmv() { - const int batch_size = 1; - GGML_ASSERT(model.class_embedding == nullptr); - const int n_pos = n_patches; + const int n_pos = n_patches; + const int n_embd_proj = clip_n_mmproj_embd(ctx); // position embeddings for the projector (not for ViT) - int n_output_dim = clip_n_mmproj_embd(ctx); - ggml_tensor * pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, n_pos, batch_size); - ggml_set_name(pos_embed, "pos_embed"); - ggml_set_input(pos_embed); + // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70 + // base frequency omega + ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4); + ggml_set_name(omega, "omega"); + ggml_set_input(omega); + + // 2D input positions (using float for sinusoidal embeddings) + ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); // for selecting learned pos embd, used by ViT struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); @@ -1103,7 +1111,7 @@ struct clip_graph { ggml_tensor * inp = build_inp(); ggml_tensor * embeddings = build_vit( - inp, n_patches, + inp, n_pos, 
                                NORM_TYPE_NORMAL,
                                hparams.ffn_op,
                                learned_pos_embd,
@@ -1115,17 +1123,39 @@ struct clip_graph {
         ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
 
         // norm
-        q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
+        q = build_norm(q, model.mm_model_ln_q_w,  model.mm_model_ln_q_b,  NORM_TYPE_NORMAL, eps, -1);
         v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);
 
+        // calculate sinusoidal pos embd
+        ggml_tensor * pos_embed = nullptr;
+        {
+            // outer product
+            ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows
+            ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w);
+            ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h);
+            // sin and cos
+            ggml_tensor * pos_embd_x = ggml_concat(
+                ctx0,
+                ggml_sin(ctx0, theta_x),
+                ggml_cos(ctx0, theta_x),
+                0 // concat on first dim
+            );
+            ggml_tensor * pos_embd_y = ggml_concat(
+                ctx0,
+                ggml_sin(ctx0, theta_y),
+                ggml_cos(ctx0, theta_y),
+                0 // concat on first dim
+            );
+            pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0);
+        }
+
         // k = v + pos_embed
         ggml_tensor * k = ggml_add(ctx0, v, pos_embed);
 
         // attention
         {
-            int n_embd = clip_n_mmproj_embd(ctx);
             const int d_head = 128;
-            int n_head = n_embd/d_head;
+            int n_head = n_embd_proj/d_head;
             // Use actual config value if available, otherwise fall back to hardcoded values
             int num_query = ctx->model.hparams.minicpmv_query_num;
             ggml_tensor * Q = ggml_add(ctx0,
@@ -4564,92 +4594,6 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
     return n_patches;
 }
 
-static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
-    assert(embed_dim % 2 == 0);
-    int H = pos.size();
-    int W = pos[0].size();
-
-    std::vector<float> omega(embed_dim / 2);
-    for (int i = 0; i < embed_dim / 2; ++i) {
-        omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / (embed_dim / 2));
-    }
-
-    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
-    for (int h = 0; h < H; ++h) {
-        for (int w = 0; w < W; ++w) {
-            for (int d = 0; d < embed_dim / 2; ++d) {
-                float out_value = pos[h][w] * omega[d];
-                emb[h][w][d] = sin(out_value);
-                emb[h][w][d + embed_dim / 2] = cos(out_value);
-            }
-        }
-    }
-
-    return emb;
-}
-
-static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>> & grid) {
-    assert(embed_dim % 2 == 0);
-    std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
-    std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)
-
-    int H = emb_h.size();
-    int W = emb_h[0].size();
-    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
-
-    for (int h = 0; h < H; ++h) {
-        for (int w = 0; w < W; ++w) {
-            for (int d = 0; d < embed_dim / 2; ++d) {
-                emb[h][w][d] = emb_h[h][w][d];
-                emb[h][w][d + embed_dim / 2] = emb_w[h][w][d];
-            }
-        }
-    }
-    return emb;
-}
-
-static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
-    int grid_h_size = image_size.first;
-    int grid_w_size = image_size.second;
-
-    std::vector<float> grid_h(grid_h_size);
-    std::vector<float> grid_w(grid_w_size);
-
-    for (int i = 0; i < grid_h_size; ++i) {
-        grid_h[i] = static_cast<float>(i);
-    }
-    for (int i = 0; i < grid_w_size; ++i) {
-        grid_w[i] = static_cast<float>(i);
-    }
-
-    std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size));
-    for (int h = 0; h < grid_h_size; ++h) {
-        for (int w = 0; w < grid_w_size; ++w) {
-            grid[h][w] = grid_w[w];
-        }
-    }
-
-    std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid};
-    for (int h = 0; h < grid_h_size; ++h) {
-        for (int w = 0; w < grid_w_size; ++w) {
-            grid_2d[0][h][w] = grid_h[h];
-            grid_2d[1][h][w] = grid_w[w];
-        }
-    }
-
-    std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d);
-
-    int H = image_size.first;
-    int W = image_size.second;
-    std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim));
-    for (int h = 0; h < H; ++h) {
-        for (int w = 0; w < W; ++w) {
-            pos_embed_2d[w * H + h] = pos_embed_3d[h][w];
-        }
-    }
-
-    return pos_embed_2d;
-}
-
 bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
     clip_image_f32_batch imgs;
     clip_image_f32_ptr img_copy(clip_image_f32_init());
@@ -4788,22 +4732,28 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 }
                 set_input_i32("positions", positions);
 
-                // inspired from resampler of Qwen-VL:
-                //    -> https://huggingface.co/Qwen/Qwen-VL/tree/main
-                //    -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
-                int embed_dim = clip_n_mmproj_embd(ctx);
-
-                // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
-                auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
-
-                std::vector<float> pos_embed(embed_dim * pos_w * pos_h);
-                for(int i = 0; i < pos_w * pos_h; ++i){
-                    for(int j = 0; j < embed_dim; ++j){
-                        pos_embed[i * embed_dim + j] = pos_embed_t[i][j];
-                    }
+                // inputs for resampler projector
+                // set the 2D positions (using float for sinusoidal embedding)
+                int n_patches_per_col = image_size_width / patch_size;
+                std::vector<float> pos_data(n_pos);
+                // dimension H
+                for (int i = 0; i < n_pos; i++) {
+                    pos_data[i] = static_cast<float>(i / n_patches_per_col);
                 }
-
-                set_input_f32("pos_embed", pos_embed);
+                set_input_f32("pos_h", pos_data);
+                // dimension W
+                for (int i = 0; i < n_pos; i++) {
+                    pos_data[i] = static_cast<float>(i % n_patches_per_col);
+                }
+                set_input_f32("pos_w", pos_data);
+                // base frequency omega
+                const float base_freq = 10000.0f;
+                const int n_embd_proj = clip_n_mmproj_embd(ctx);
+                std::vector<float> omega(n_embd_proj / 4);
+                for (int i = 0; i < n_embd_proj / 4; ++i) {
+                    omega[i] = 1.0f / std::pow(base_freq, static_cast<float>(i) / (n_embd_proj / 4));
+                }
+                set_input_f32("omega", omega);
             } break;
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN3VL:

From b7f9010d24766792d8887c227a883ed3b315d2be Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 6 Nov 2025 12:09:29 +0200
Subject: [PATCH 10/32] server : disable checkpoints with mtmd (#17045)

---
 tools/server/server.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index f5089bef2..0e1a9afc8 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -3832,7 +3832,9 @@ struct server_context {
             // the largest pos_min required for a checkpoint to be useful
             const auto pos_min_thold = std::max(0, n_past - n_swa);
 
-            if (n_past > 0 && n_past < slot.prompt.n_tokens()) {
+            // note: disallow with mtmd contexts for now
+            //       https://github.com/ggml-org/llama.cpp/issues/17043
+            if (!mctx && n_past > 0 && n_past < slot.prompt.n_tokens()) {
                 const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
                 if (pos_min == -1) {
                     SLT_ERR(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min);

From 5b180c3d60f3df61cd9955bc5c69e64537958f92 Mon Sep 17 00:00:00 2001
From: Georgi
Gerganov Date: Thu, 6 Nov 2025 14:45:10 +0200 Subject: [PATCH 11/32] metal : initial Metal4 tensor API support (#16634) * metal : rework mat-mat multiplication * metal : initial Metal4 support * cont * metal : detect tensor support * cont : better ifdefs * metal : support tensors in mul_mm_id * metal : add env for disabling tensor API * tests : restore * metal : remove unused constants * metal : fix check for bfloat tensor support * cont : handle API incompatibilities * cont : handle even more incompatibilities * metal : use tensor API only on M5 and later --- ggml/src/ggml-metal/ggml-metal-context.m | 5 +- ggml/src/ggml-metal/ggml-metal-device.h | 5 +- ggml/src/ggml-metal/ggml-metal-device.m | 211 ++++++++- ggml/src/ggml-metal/ggml-metal.metal | 543 +++++++++++++++++------ 4 files changed, 617 insertions(+), 147 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-context.m b/ggml/src/ggml-metal/ggml-metal-context.m index 052efb7ac..b8d35b78a 100644 --- a/ggml/src/ggml-metal/ggml-metal-context.m +++ b/ggml/src/ggml-metal/ggml-metal-context.m @@ -35,7 +35,6 @@ struct ggml_metal { // additional, inference-time compiled pipelines ggml_metal_pipelines_t pipelines_ext; - bool use_bfloat; bool use_fusion; bool use_concurrency; bool use_graph_optimize; @@ -121,11 +120,10 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) { } } - const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev); + //const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev); res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT); - res->use_bfloat = props_dev->has_bfloat; res->use_fusion = getenv("GGML_METAL_FUSION_DISABLE") == nil; res->use_concurrency = getenv("GGML_METAL_CONCURRENCY_DISABLE") == nil; @@ -147,7 +145,6 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) { memset(res->fuse_cnt, 0, sizeof(res->fuse_cnt)); - GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, res->use_bfloat ? "true" : "false"); GGML_LOG_INFO("%s: use fusion = %s\n", __func__, res->use_fusion ? "true" : "false"); GGML_LOG_INFO("%s: use concurrency = %s\n", __func__, res->use_concurrency ? "true" : "false"); GGML_LOG_INFO("%s: use graph optimize = %s\n", __func__, res->use_graph_optimize ? 
"true" : "false"); diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h index 4d5829748..cb27dca98 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.h +++ b/ggml/src/ggml-metal/ggml-metal-device.h @@ -95,7 +95,9 @@ void ggml_metal_encoder_end_encoding(ggml_metal_encoder_t encoder); typedef struct ggml_metal_library * ggml_metal_library_t; -ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev); +ggml_metal_library_t ggml_metal_library_init (ggml_metal_device_t dev); +ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev, const char * source, bool verbose); + void ggml_metal_library_free(ggml_metal_library_t lib); ggml_metal_pipeline_t ggml_metal_library_get_pipeline (ggml_metal_library_t lib, const char * name); @@ -193,6 +195,7 @@ struct ggml_metal_device_props { bool has_simdgroup_mm; bool has_unified_memory; bool has_bfloat; + bool has_tensor; bool use_residency_sets; bool use_shared_buffers; diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index 0cadd19a3..606cfd0a5 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -21,8 +21,9 @@ #define GGML_METAL_HAS_RESIDENCY_SETS 1 #endif -// overload of MTLGPUFamilyMetal3 (not available in some environments) +// overload of MTLGPUFamilyMetalX (not available in some environments) static const NSInteger MTLGPUFamilyMetal3_GGML = 5001; +static const NSInteger MTLGPUFamilyMetal4_GGML = 5002; // virtual address for GPU memory allocations static atomic_uintptr_t g_addr_device = 0x000000400ULL; @@ -261,6 +262,10 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) { [prep setObject:@"1" forKey:@"GGML_METAL_HAS_BF16"]; } + if (ggml_metal_device_get_props(dev)->has_tensor) { + [prep setObject:@"1" forKey:@"GGML_METAL_HAS_TENSOR"]; + } + #if GGML_METAL_EMBED_LIBRARY [prep setObject:@"1" forKey:@"GGML_METAL_EMBED_LIBRARY"]; #endif @@ -298,6 +303,72 @@ ggml_metal_library_t ggml_metal_library_init(ggml_metal_device_t dev) { return res; } +ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev, const char * source, bool verbose) { + if (source == NULL) { + GGML_LOG_ERROR("%s: source is NULL\n", __func__); + return NULL; + } + + id device = ggml_metal_device_get_obj(dev); + id library = nil; + NSError * error = nil; + + const int64_t t_start = ggml_time_us(); + + NSString * src = [[NSString alloc] initWithBytes:source + length:strlen(source) + encoding:NSUTF8StringEncoding]; + if (!src) { + GGML_LOG_ERROR("%s: failed to create NSString from source\n", __func__); + return NULL; + } + + @autoreleasepool { + NSMutableDictionary * prep = [NSMutableDictionary dictionary]; + + MTLCompileOptions * options = [MTLCompileOptions new]; + options.preprocessorMacros = prep; + + library = [device newLibraryWithSource:src options:options error:&error]; + if (error) { + if (verbose) { + GGML_LOG_ERROR("%s: error compiling source: %s\n", __func__, [[error description] UTF8String]); + } else { + GGML_LOG_ERROR("%s: error compiling source\n", __func__); + } + library = nil; + } + + [options release]; + } + + [src release]; + + if (!library) { + if (verbose) { + GGML_LOG_ERROR("%s: failed to create Metal library from source\n", __func__); + } + + return NULL; + } + + if (verbose) { + GGML_LOG_INFO("%s: compiled in %.3f sec\n", __func__, (ggml_time_us() - t_start) / 1e6); + } + + ggml_metal_library_t res = calloc(1, sizeof(struct ggml_metal_library)); + if 
(!res) { + GGML_LOG_ERROR("%s: calloc failed\n", __func__); + return NULL; + } + + res->obj = library; + res->device = device; + res->pipelines = ggml_metal_pipelines_init(); + + return res; +} + void ggml_metal_library_free(ggml_metal_library_t lib) { if (!lib) { return; @@ -345,9 +416,9 @@ ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t l if (!mtl_function) { ggml_critical_section_end(); - GGML_LOG_ERROR("%s: error: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name); + GGML_LOG_ERROR("%s: failed to compile pipeline: base = '%s', name = '%s'\n", __func__, base, name); if (error) { - GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]); + GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]); } return nil; @@ -355,13 +426,21 @@ ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t l res->obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error]; - ggml_metal_pipelines_add(lib->pipelines, name, res); - [mtl_function release]; GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name, (void *) res->obj, (int) res->obj.maxTotalThreadsPerThreadgroup, (int) res->obj.threadExecutionWidth); + + if (res->obj.maxTotalThreadsPerThreadgroup == 0 || res->obj.threadExecutionWidth == 0) { + ggml_critical_section_end(); + + GGML_LOG_ERROR("%s: incompatible pipeline %s\n", __func__, name); + + return nil; + } + + ggml_metal_pipelines_add(lib->pipelines, name, res); } ggml_critical_section_end(); @@ -469,6 +548,126 @@ ggml_metal_device_t ggml_metal_device_init(void) { dev->props.has_bfloat = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML]; dev->props.has_bfloat |= [dev->mtl_device supportsFamily:MTLGPUFamilyApple6]; + if (getenv("GGML_METAL_BF16_DISABLE") != NULL) { + dev->props.has_bfloat = false; + } + + dev->props.has_tensor = [dev->mtl_device supportsFamily:MTLGPUFamilyMetal4_GGML]; + if (getenv("GGML_METAL_TENSOR_DISABLE") != NULL) { + dev->props.has_tensor = false; + } + + // note: disable the tensor API by default for old chips because with the current implementation it is not useful + // - M2 Ultra: ~5% slower + // - M4, M4 Max: no significant difference + // + // TODO: try to update the tensor API kernels to at least match the simdgroup performance + if (getenv("GGML_METAL_TENSOR_ENABLE") == NULL && + ![[dev->mtl_device name] containsString:@"M5"] && + ![[dev->mtl_device name] containsString:@"M6"]) { + GGML_LOG_WARN("%s: tensor API disabled for pre-M5 device\n", __func__); + dev->props.has_tensor = false; + } + + // double-check that the tensor API compiles + if (dev->props.has_tensor) { + const char * src_tensor_f16 = "\n" + "#include \n" + "#include \n" + "#include \n" + " \n" + "using namespace metal; \n" + "using namespace mpp::tensor_ops; \n" + " \n" + "kernel void dummy_kernel( \n" + " tensor> A [[buffer(0)]], \n" + " tensor> B [[buffer(1)]], \n" + " device float * C [[buffer(2)]], \n" + " uint2 tgid [[threadgroup_position_in_grid]]) \n" + "{ \n" + " auto tA = A.slice(0, (int)tgid.y); \n" + " auto tB = B.slice((int)tgid.x, 0); \n" + " \n" + " matmul2d< \n" + " matmul2d_descriptor(8, 8, dynamic_extent), \n" + " execution_simdgroups<4>> mm; \n" + " \n" + " auto cT = mm.get_destination_cooperative_tensor(); \n" + " \n" + " auto sA = tA.slice(0, 0); \n" + " auto sB = tB.slice(0, 0); \n" + " mm.run(sB, sA, cT); \n" + " \n" + " auto tC = tensor, tensor_inline>(C, dextents(4, 4)); \n" + " \n" + " cT.store(tC); 
\n" + "}"; + + GGML_LOG_INFO("%s: testing tensor API for f16 support\n", __func__); + ggml_metal_library_t lib = ggml_metal_library_init_from_source(dev, src_tensor_f16, false); + if (lib == NULL) { + GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__); + dev->props.has_tensor = false; + } else { + ggml_metal_pipeline_t ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil); + if (!ppl) { + GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__); + dev->props.has_tensor = false; + } + + ggml_metal_library_free(lib); + } + } + + // try to compile a dummy kernel to determine if the tensor API is supported for bfloat + if (dev->props.has_tensor && dev->props.has_bfloat) { + const char * src_tensor_bf16 = "\n" + "#include \n" + "#include \n" + "#include \n" + " \n" + "using namespace metal; \n" + "using namespace mpp::tensor_ops; \n" + " \n" + "kernel void dummy_kernel( \n" + " tensor> A [[buffer(0)]], \n" + " tensor> B [[buffer(1)]], \n" + " device float * C [[buffer(2)]], \n" + " uint2 tgid [[threadgroup_position_in_grid]]) \n" + "{ \n" + " auto tA = A.slice(0, (int)tgid.y); \n" + " auto tB = B.slice((int)tgid.x, 0); \n" + " \n" + " matmul2d< \n" + " matmul2d_descriptor(8, 8, dynamic_extent), \n" + " execution_simdgroups<4>> mm; \n" + " \n" + " auto cT = mm.get_destination_cooperative_tensor(); \n" + " \n" + " auto sA = tA.slice(0, 0); \n" + " auto sB = tB.slice(0, 0); \n" + " mm.run(sB, sA, cT); \n" + " \n" + " auto tC = tensor, tensor_inline>(C, dextents(4, 4)); \n" + " \n" + " cT.store(tC); \n" + "}"; + + GGML_LOG_INFO("%s: testing tensor API for bfloat support\n", __func__); + ggml_metal_library_t lib = ggml_metal_library_init_from_source(dev, src_tensor_bf16, false); + if (lib == NULL) { + GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__); + dev->props.has_bfloat = false; + } else { + ggml_metal_pipeline_t ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil); + if (!ppl) { + GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__); + dev->props.has_bfloat = false; + } + + ggml_metal_library_free(lib); + } + } dev->props.use_residency_sets = true; #if defined(GGML_METAL_HAS_RESIDENCY_SETS) @@ -476,7 +675,6 @@ ggml_metal_device_t ggml_metal_device_init(void) { #endif dev->props.use_shared_buffers = dev->props.has_unified_memory; - if (getenv("GGML_METAL_SHARED_BUFFERS_DISABLE") != NULL) { dev->props.use_shared_buffers = false; } @@ -529,6 +727,7 @@ ggml_metal_device_t ggml_metal_device_init(void) { GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, dev->props.has_simdgroup_mm ? "true" : "false"); GGML_LOG_INFO("%s: has unified memory = %s\n", __func__, dev->props.has_unified_memory ? "true" : "false"); GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, dev->props.has_bfloat ? "true" : "false"); + GGML_LOG_INFO("%s: has tensor = %s\n", __func__, dev->props.has_tensor ? "true" : "false"); GGML_LOG_INFO("%s: use residency sets = %s\n", __func__, dev->props.use_residency_sets ? "true" : "false"); GGML_LOG_INFO("%s: use shared buffers = %s\n", __func__, dev->props.use_shared_buffers ? 
"true" : "false"); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 424c400f2..cea535ade 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -9,6 +9,12 @@ __embed_ggml-common.h__ #include +#ifdef GGML_METAL_HAS_TENSOR +#include + +#include +#endif + using namespace metal; #define MAX(x, y) ((x) > (y) ? (x) : (y)) @@ -1742,7 +1748,7 @@ kernel void kernel_op_sum_f32( float sumf = 0; - for (int64_t i0 = tpitg.x; i0 < args.np; i0 += ntg.x) { + for (uint64_t i0 = tpitg.x; i0 < args.np; i0 += ntg.x) { sumf += src0[i0]; } @@ -5467,6 +5473,7 @@ template [[host_name("kernel_flash_attn_ext_q8_0_dk576_dv512")]] kernel flash_at #undef FA_TYPES #undef FA_TYPES_BF +#undef FA_TYPES_F32 constant bool FC_flash_attn_ext_vec_has_mask [[function_constant(FC_FLASH_ATTN_EXT_VEC + 0)]]; constant bool FC_flash_attn_ext_vec_has_sinks [[function_constant(FC_FLASH_ATTN_EXT_VEC + 1)]]; @@ -6088,6 +6095,7 @@ template [[host_name("kernel_flash_attn_ext_vec_q5_1_dk576_dv512")]] kernel flas template [[host_name("kernel_flash_attn_ext_vec_q8_0_dk576_dv512")]] kernel flash_attn_ext_vec_t kernel_flash_attn_ext_vec; #undef FA_TYPES +#undef FA_TYPES_F32 constant int32_t FC_flash_attn_ext_vec_reduce_DV [[function_constant(FC_FLASH_ATTN_EXT_VEC_REDUCE + 0)]]; constant int32_t FC_flash_attn_ext_vec_reduce_NWG [[function_constant(FC_FLASH_ATTN_EXT_VEC_REDUCE + 1)]]; @@ -8141,17 +8149,6 @@ kernel void kernel_set_rows_f( constant bool FC_mul_mm_bc_inp [[function_constant(FC_MUL_MM + 0)]]; constant bool FC_mul_mm_bc_out [[function_constant(FC_MUL_MM + 1)]]; -#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A -#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B -#define BLOCK_SIZE_K 32 -#define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A -#define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B -#define THREAD_PER_BLOCK 128 -#define THREAD_PER_ROW 2 // 2 thread for each row in matrix A to load numbers -#define THREAD_PER_COL 4 // 4 thread for each row in matrix B to load numbers -#define SG_MAT_SIZE 64 // simdgroup matrix is of shape 8x8 -#define SG_MAT_ROW 8 - // each block_q contains 16*nl weights template kernel void kernel_mul_mm( @@ -8167,18 +8164,48 @@ kernel void kernel_mul_mm( threadgroup S0 * sa = (threadgroup S0 *)(shmem); threadgroup S1 * sb = (threadgroup S1 *)(shmem + 4096); - const int r0 = tgpig.y; - const int r1 = tgpig.x; + threadgroup float * sc = (threadgroup float *)(shmem); + + constexpr int NR0 = 64; + constexpr int NR1 = 32; + + constexpr int NK = 32; + constexpr int NL0 = NK/16; + constexpr int NL1 = NK/8; + const int im = tgpig.z; + const int r0 = tgpig.y*NR0; + const int r1 = tgpig.x*NR1; // if this block is of 64x32 shape or smaller - const short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M; - const short n_cols = (args.ne1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (args.ne1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N; + const short nr0 = (args.ne0 - r0 < NR0) ? (args.ne0 - r0) : NR0; + const short nr1 = (args.ne1 - r1 < NR1) ? (args.ne1 - r1) : NR1; // a thread shouldn't load data outside of the matrix - const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; - const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1; + const short lr0 = ((short)tiitg/NL0) < nr0 ? ((short)tiitg/NL0) : nr0 - 1; // 0 .. 
63 + const short lr1 = ((short)tiitg/NL1) < nr1 ? ((short)tiitg/NL1) : nr1 - 1; // 0 .. 31 + const short il0 = (tiitg % NL0); + + short il = il0; + + const int i12 = im%args.ne12; + const int i13 = im/args.ne12; + + const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03; + const short offset1 = il0/nl; + + device const block_q * x = (device const block_q *)(src0 + args.nb01*(r0 + lr0) + offset0) + offset1; + + const short iy = 8*(tiitg % NL1); + + device const T1 * y = (device const T1 *)(src1 + + args.nb13*i13 + + args.nb12*i12 + + args.nb11*(r1 + lr1) + + args.nb10*iy); + +#ifndef GGML_METAL_HAS_TENSOR S0_8x8 ma[4]; S1_8x8 mb[2]; @@ -8187,36 +8214,36 @@ kernel void kernel_mul_mm( for (short i = 0; i < 8; i++){ mc[i] = make_filled_simdgroup_matrix(0.f); } +#else + auto tA = tensor, tensor_inline>(sa, dextents(NK, NR0)); + auto tB = tensor, tensor_inline>(sb, dextents(NR1, NK )); - short il = (tiitg % THREAD_PER_ROW); + mpp::tensor_ops::matmul2d< + mpp::tensor_ops::matmul2d_descriptor(NR1, NR0, NK, false, true, false, mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate), + execution_simdgroups<4>> mm; - const int i12 = im%args.ne12; - const int i13 = im/args.ne12; + auto cT = mm.get_destination_cooperative_tensor(); +#endif - const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03; - const short offset1 = il/nl; - - device const block_q * x = (device const block_q *)(src0 - + args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1; - - const short iy = (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)); - - device const T1 * y = (device const T1 *)(src1 - + args.nb13*i13 - + args.nb12*i12 - + args.nb11*(r1*BLOCK_SIZE_N + thread_col) - + args.nb10*iy); - - for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) { + for (int loop_k = 0; loop_k < args.ne00; loop_k += NK) { +#ifndef GGML_METAL_HAS_TENSOR // load data and store to threadgroup memory if (is_same::value && FC_mul_mm_bc_inp) { threadgroup_barrier(mem_flags::mem_threadgroup); // no need for dequantization for (short i = 0; i < 16; i++) { - *(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \ - + (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \ - + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = loop_k + 16*il + i < args.ne00 ? ((device T0 *) x)[i] : 0; + const short sx = 2*il0 + i/8; + const short sy = (tiitg/NL0)/8; + + //const short lx = i%8; + //const short ly = (tiitg/NL0)%8; + const short lx = (tiitg/NL0)%8; + const short ly = i%8; + + const short ib = 8*sx + sy; + + *(sa + 64*ib + 8*ly + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0; } } else { S0_4x4 temp_a; @@ -8225,91 +8252,203 @@ kernel void kernel_mul_mm( threadgroup_barrier(mem_flags::mem_threadgroup); FOR_UNROLL (short i = 0; i < 16; i++) { - *(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \ - + (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \ - + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4]; + const short sx = 2*il0 + i/8; + const short sy = (tiitg/NL0)/8; + + //const short lx = i%8; + //const short ly = (tiitg/NL0)%8; + const short lx = (tiitg/NL0)%8; + const short ly = i%8; + + const short ib = 8*sx + sy; + + // NOTE: this is massively slower.. WTF? + //sa[64*ib + 8*ly + lx] = temp_a[i/4][i%4]; + + *(sa + 64*ib + 8*ly + lx) = temp_a[i/4][i%4]; } } if (FC_mul_mm_bc_inp) { for (short i = 0; i < 8; ++i) { - sb[32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL) + i] = loop_k + iy + i < args.ne00 ? 
(S1) ((device T1 *) y)[i] : 0; + const short sx = (tiitg%NL1); + const short sy = (tiitg/NL1)/8; + + const short lx = i; + const short ly = (tiitg/NL1)%8; + //const short lx = (tiitg/NL1)%8; + //const short ly = i; + + const short ib = 4*sx + sy; + + *(sb + 64*ib + 8*ly + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0; } } else { - *(threadgroup S1_2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = (S1_2x4)(*((device T1_2x4 *) y)); + const short sx = (tiitg%NL1); + const short sy = (tiitg/NL1)/8; + + const short dx = sx; + const short dy = sy; + + const short ly = (tiitg/NL1)%8; + + const short ib = 4*sx + sy; + + *(threadgroup S1_2x4 *)(sb + 64*ib + 8*ly) = (S1_2x4)(*((device T1_2x4 *) y)); } +#else + // load data and store to threadgroup memory + if (is_same::value && FC_mul_mm_bc_inp) { + threadgroup_barrier(mem_flags::mem_threadgroup); + + // no need for dequantization + for (short i = 0; i < 16; i++) { + const short sx = 2*il0 + i/8; + const short sy = (tiitg/NL0)/8; + + const short lx = i%8; + const short ly = (tiitg/NL0)%8; + //const short lx = (tiitg/NL0)%8; + //const short ly = i%8; + + *(sa + NK*(8*sy + ly) + 8*sx + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0; + } + } else { + S0_4x4 temp_a; + dequantize_func(x, il, temp_a); + + threadgroup_barrier(mem_flags::mem_threadgroup); + + FOR_UNROLL (short i = 0; i < 16; i++) { + const short sx = 2*il0 + i/8; + const short sy = (tiitg/NL0)/8; + + const short lx = i%8; + const short ly = (tiitg/NL0)%8; + //const short lx = (tiitg/NL0)%8; + //const short ly = i%8; + + *(sa + NK*(8*sy + ly) + 8*sx + lx) = temp_a[i/4][i%4]; + } + } + + if (FC_mul_mm_bc_inp) { + for (short i = 0; i < 8; ++i) { + const short sx = (tiitg%NL1); + const short sy = (tiitg/NL1)/8; + + const short lx = i; + const short ly = (tiitg/NL1)%8; + //const short lx = (tiitg/NL1)%8; + //const short ly = i; + + *(sb + NK*(8*sy + ly) + 8*sx + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0; + } + } else { + const short sx = (tiitg%NL1); + const short sy = (tiitg/NL1)/8; + + //const short lx = i; + const short ly = (tiitg/NL1)%8; + //const short lx = (tiitg/NL1)%8; + //const short ly = i; + + *(threadgroup S1_2x4 *)(sb + NK*(8*sy + ly) + 8*sx) = (S1_2x4)(*((device T1_2x4 *) y)); + } +#endif il = (il + 2 < nl) ? il + 2 : il % 2; x = (il < 2) ? 
x + (2 + nl - 1)/nl : x; - y += BLOCK_SIZE_K; + + y += NK; threadgroup_barrier(mem_flags::mem_threadgroup); +#ifndef GGML_METAL_HAS_TENSOR // load matrices from threadgroup memory and conduct outer products - threadgroup const S0 * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2)); - threadgroup const S1 * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2)); + threadgroup const S0 * lsma = (sa + 4*64*(sgitg%2)); + threadgroup const S1 * lsmb = (sb + 2*64*(sgitg/2)); - #pragma unroll(4) - for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) { + FOR_UNROLL (short ik = 0; ik < NK/8; ik++) { simdgroup_barrier(mem_flags::mem_none); - #pragma unroll(4) - for (short i = 0; i < 4; i++) { - simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i); - } - - #pragma unroll(2) - for (short i = 0; i < 2; i++) { - simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i); + FOR_UNROLL (short i = 0; i < 4; i++) { + simdgroup_load(ma[i], lsma + 64*i, 8, 0, false); } simdgroup_barrier(mem_flags::mem_none); - #pragma unroll(8) - for (short i = 0; i < 8; i++){ + FOR_UNROLL (short i = 0; i < 2; i++) { + simdgroup_load(mb[i], lsmb + 64*i, 8, 0, false); + } + + simdgroup_barrier(mem_flags::mem_none); + + FOR_UNROLL (short i = 0; i < 8; i++){ simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]); } - lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE; - lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE; + lsma += 8*64; + lsmb += 4*64; } +#else + auto sA = tA.slice(0, 0); + auto sB = tB.slice(0, 0); + + mm.run(sB, sA, cT); +#endif } - if (!FC_mul_mm_bc_out || ((r0 + 1) * BLOCK_SIZE_M <= args.ne0 && (r1 + 1) * BLOCK_SIZE_N <= args.ne1)) { + if (!FC_mul_mm_bc_out || (r0 + NR0 <= args.ne0 && r1 + NR1 <= args.ne1)) { // if no bounds checks on the output are needed, we can directly write to device memory +#ifdef GGML_METAL_HAS_TENSOR device float * C = (device float *) dst + - (BLOCK_SIZE_M * r0 + 32*(sgitg & 1)) + \ - (BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0; + r0 + \ + r1 * args.ne0 + im*args.ne1*args.ne0; + + auto tC = tensor, tensor_inline>(C, dextents(args.ne0, NR1)); + cT.store(tC); +#else + device float * C = (device float *) dst + + (r0 + 32*(sgitg & 1)) + \ + (r1 + 16*(sgitg >> 1)) * args.ne0 + im*args.ne1*args.ne0; for (short i = 0; i < 8; i++) { - simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.ne0 * (i/4), args.ne0); + simdgroup_store(mc[i], C + 8*(i%4) + 8*args.ne0*(i/4), args.ne0, 0, false); } +#endif } else { // block is smaller than 64x32, we should avoid writing data outside of the matrix threadgroup_barrier(mem_flags::mem_threadgroup); - threadgroup float * temp_str = ((threadgroup float *) shmem) \ - + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M; + + threadgroup float * temp_str = ((threadgroup float *) shmem) + 32*(sgitg&1) + (16*(sgitg >> 1))*NR0; + +#ifdef GGML_METAL_HAS_TENSOR + auto tC = tensor, tensor_inline>(sc, dextents(NR0, NR1)); + cT.store(tC); +#else for (short i = 0; i < 8; i++) { - simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M); + simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*NR0*(i/4), NR0, 0, false); } +#endif threadgroup_barrier(mem_flags::mem_threadgroup); if (sgitg == 0) { - for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) { - device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + (r1*BLOCK_SIZE_N + j)*args.ne0 + im*args.ne1*args.ne0; + for (int j = tiitg; j < nr1; j += NR1) { + device float * D = (device float *) dst + r0 + (r1 + j)*args.ne0 + im*args.ne1*args.ne0; device float4 * D4 = (device float4 *) D; - threadgroup float * C = temp_str 
+ (j*BLOCK_SIZE_M); + threadgroup float * C = temp_str + (j*NR0); threadgroup float4 * C4 = (threadgroup float4 *) C; int i = 0; - for (; i < n_rows/4; i++) { + for (; i < nr0/4; i++) { *(D4 + i) = *(C4 + i); } i *= 4; - for (; i < n_rows; i++) { + for (; i < nr0; i++) { *(D + i) = *(C + i); } } @@ -8394,31 +8533,63 @@ kernel void kernel_mul_mm_id( ushort tiitg[[thread_index_in_threadgroup]], ushort tiisg[[thread_index_in_simdgroup]], ushort sgitg[[simdgroup_index_in_threadgroup]]) { - threadgroup S0 * sa = (threadgroup S0 *)(shmem); threadgroup S1 * sb = (threadgroup S1 *)(shmem + 4096); - const int r0 = tgpig.y; - const int r1 = tgpig.x; + threadgroup float * sc = (threadgroup float *)(shmem); + + constexpr int NR0 = 64; + constexpr int NR1 = 32; + + constexpr int NK = 32; + constexpr int NL0 = NK/16; + constexpr int NL1 = NK/8; + const int im = tgpig.z; // expert + const int r0 = tgpig.y*NR0; + const int r1 = tgpig.x*NR1; device const uint32_t * tpe_u32 = (device const uint32_t *) (htpe); device const int32_t * ids_i32 = (device const int32_t *) (hids); const int32_t neh1 = tpe_u32[im]; - if (r1*BLOCK_SIZE_N >= neh1) { + if (r1 >= neh1) { return; } // if this block is of 64x32 shape or smaller - const short n_rows = (args.ne0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.ne0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M; - const short n_cols = ( neh1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? ( neh1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N; + const short nr0 = (args.ne0 - r0 < NR0) ? (args.ne0 - r0) : NR0; + const short nr1 = ( neh1 - r1 < NR1) ? ( neh1 - r1) : NR1; // a thread shouldn't load data outside of the matrix - const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; - const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1; + const short lr0 = ((short)tiitg/NL0) < nr0 ? ((short)tiitg/NL0) : nr0 - 1; // 0 .. 63 + const short lr1 = ((short)tiitg/NL1) < nr1 ? ((short)tiitg/NL1) : nr1 - 1; // 0 .. 
31 + const short il0 = (tiitg % NL0); + + short il = il0; + + const int id = ids_i32[im*args.ne21 + r1 + lr1]; + + const short i11 = (id % args.ne20) % args.ne11; + const short i12 = (id / args.ne20); + const short i13 = 0; + + const uint64_t offset0 = im*args.nb02 + i13*args.nb03; + const short offset1 = il0/nl; + + device const block_q * x = (device const block_q *)(src0 + args.nb01*(r0 + lr0) + offset0) + offset1; + + const short iy = 8*(tiitg % NL1); + + device const T1 * y = (device const T1 *)(src1 + + args.nb13*i13 + + args.nb12*i12 + + args.nb11*i11 + + args.nb10*iy); + +#ifndef GGML_METAL_HAS_TENSOR S0_8x8 ma[4]; S1_8x8 mb[2]; @@ -8427,39 +8598,36 @@ kernel void kernel_mul_mm_id( for (short i = 0; i < 8; i++){ mc[i] = make_filled_simdgroup_matrix(0.f); } +#else + auto tA = tensor, tensor_inline>(sa, dextents(NK, NR0)); + auto tB = tensor, tensor_inline>(sb, dextents(NR1, NK )); - short il = (tiitg % THREAD_PER_ROW); + mpp::tensor_ops::matmul2d< + mpp::tensor_ops::matmul2d_descriptor(NR1, NR0, NK, false, true, false, mpp::tensor_ops::matmul2d_descriptor::mode::multiply_accumulate), + execution_simdgroups<4>> mm; - const int id = ids_i32[im*args.ne21 + r1*BLOCK_SIZE_N + thread_col]; + auto cT = mm.get_destination_cooperative_tensor(); +#endif - const short i11 = (id % args.ne20) % args.ne11; - const short i12 = (id / args.ne20); - const short i13 = 0; - - const uint64_t offset0 = im*args.nb02 + i13*args.nb03; - const short offset1 = il/nl; - - device const block_q * x = (device const block_q *)(src0 - + args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1; - - const short iy = (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)); - - device const T1 * y = (device const T1 *)(src1 - + args.nb13*i13 - + args.nb12*i12 - + args.nb11*i11 - + args.nb10*iy); - - for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) { + for (int loop_k = 0; loop_k < args.ne00; loop_k += NK) { +#ifndef GGML_METAL_HAS_TENSOR // load data and store to threadgroup memory if (is_same::value && FC_mul_mm_bc_inp) { threadgroup_barrier(mem_flags::mem_threadgroup); // no need for dequantization for (short i = 0; i < 16; i++) { - *(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \ - + (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \ - + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = loop_k + 16*il + i < args.ne00 ? ((device T0 *) x)[i] : 0; + const short sx = 2*il0 + i/8; + const short sy = (tiitg/NL0)/8; + + //const short lx = i%8; + //const short ly = (tiitg/NL0)%8; + const short lx = (tiitg/NL0)%8; + const short ly = i%8; + + const short ib = 8*sx + sy; + + *(sa + 64*ib + 8*ly + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0; } } else { S0_4x4 temp_a; @@ -8468,85 +8636,188 @@ kernel void kernel_mul_mm_id( threadgroup_barrier(mem_flags::mem_threadgroup); FOR_UNROLL (short i = 0; i < 16; i++) { - *(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \ - + (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \ - + (tiitg/THREAD_PER_ROW)%8 + (i&7)*8) = temp_a[i/4][i%4]; + const short sx = 2*il0 + i/8; + const short sy = (tiitg/NL0)/8; + + //const short lx = i%8; + //const short ly = (tiitg/NL0)%8; + const short lx = (tiitg/NL0)%8; + const short ly = i%8; + + const short ib = 8*sx + sy; + + // NOTE: this is massively slower.. WTF? + //sa[64*ib + 8*ly + lx] = temp_a[i/4][i%4]; + + *(sa + 64*ib + 8*ly + lx) = temp_a[i/4][i%4]; } } if (FC_mul_mm_bc_inp) { for (short i = 0; i < 8; ++i) { - sb[32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL) + i] = loop_k + iy + i < args.ne00 ? 
(S1) ((device T1 *) y)[i] : 0; + const short sx = (tiitg%NL1); + const short sy = (tiitg/NL1)/8; + + const short lx = i; + const short ly = (tiitg/NL1)%8; + //const short lx = (tiitg/NL1)%8; + //const short ly = i; + + const short ib = 4*sx + sy; + + *(sb + 64*ib + 8*ly + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0; } } else { - *(threadgroup S1_2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = (S1_2x4)(*((device T1_2x4 *) y)); + const short sx = (tiitg%NL1); + const short sy = (tiitg/NL1)/8; + + const short dx = sx; + const short dy = sy; + + const short ly = (tiitg/NL1)%8; + + const short ib = 4*sx + sy; + + *(threadgroup S1_2x4 *)(sb + 64*ib + 8*ly) = (S1_2x4)(*((device T1_2x4 *) y)); } +#else + // load data and store to threadgroup memory + if (is_same::value && FC_mul_mm_bc_inp) { + threadgroup_barrier(mem_flags::mem_threadgroup); + + // no need for dequantization + for (short i = 0; i < 16; i++) { + const short sx = 2*il0 + i/8; + const short sy = (tiitg/NL0)/8; + + const short lx = i%8; + const short ly = (tiitg/NL0)%8; + //const short lx = (tiitg/NL0)%8; + //const short ly = i%8; + + *(sa + NK*(8*sy + ly) + 8*sx + lx) = loop_k + 16*il + i < args.ne00 ? *((device T0 *) x + i) : 0; + } + } else { + S0_4x4 temp_a; + dequantize_func(x, il, temp_a); + + threadgroup_barrier(mem_flags::mem_threadgroup); + + FOR_UNROLL (short i = 0; i < 16; i++) { + const short sx = 2*il0 + i/8; + const short sy = (tiitg/NL0)/8; + + const short lx = i%8; + const short ly = (tiitg/NL0)%8; + //const short lx = (tiitg/NL0)%8; + //const short ly = i%8; + + *(sa + NK*(8*sy + ly) + 8*sx + lx) = temp_a[i/4][i%4]; + } + } + + if (FC_mul_mm_bc_inp) { + for (short i = 0; i < 8; ++i) { + const short sx = (tiitg%NL1); + const short sy = (tiitg/NL1)/8; + + const short lx = i; + const short ly = (tiitg/NL1)%8; + //const short lx = (tiitg/NL1)%8; + //const short ly = i; + + *(sb + NK*(8*sy + ly) + 8*sx + lx) = loop_k + iy + i < args.ne00 ? (S1) *((device T1 *) y + i) : 0; + } + } else { + const short sx = (tiitg%NL1); + const short sy = (tiitg/NL1)/8; + + //const short lx = i; + const short ly = (tiitg/NL1)%8; + //const short lx = (tiitg/NL1)%8; + //const short ly = i; + + *(threadgroup S1_2x4 *)(sb + NK*(8*sy + ly) + 8*sx) = (S1_2x4)(*((device T1_2x4 *) y)); + } +#endif il = (il + 2 < nl) ? il + 2 : il % 2; x = (il < 2) ? 
x + (2 + nl - 1)/nl : x; - y += BLOCK_SIZE_K; + + y += NK; threadgroup_barrier(mem_flags::mem_threadgroup); +#ifndef GGML_METAL_HAS_TENSOR // load matrices from threadgroup memory and conduct outer products - threadgroup const S0 * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2)); - threadgroup const S1 * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2)); + threadgroup const S0 * lsma = (sa + 4*64*(sgitg%2)); + threadgroup const S1 * lsmb = (sb + 2*64*(sgitg/2)); - #pragma unroll(4) - for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) { - #pragma unroll(4) - for (short i = 0; i < 4; i++) { - simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i); + FOR_UNROLL (short ik = 0; ik < NK/8; ik++) { + simdgroup_barrier(mem_flags::mem_none); + + FOR_UNROLL (short i = 0; i < 4; i++) { + simdgroup_load(ma[i], lsma + 64*i, 8, 0, false); } simdgroup_barrier(mem_flags::mem_none); - #pragma unroll(2) - for (short i = 0; i < 2; i++) { - simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i); + FOR_UNROLL (short i = 0; i < 2; i++) { + simdgroup_load(mb[i], lsmb + 64*i, 8, 0, false); } - #pragma unroll(8) - for (short i = 0; i < 8; i++){ + simdgroup_barrier(mem_flags::mem_none); + + FOR_UNROLL (short i = 0; i < 8; i++){ simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]); } - lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE; - lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE; + lsma += 8*64; + lsmb += 4*64; } +#else + auto sA = tA.slice(0, 0); + auto sB = tB.slice(0, 0); + + mm.run(sB, sA, cT); +#endif } + // block is smaller than 64x32, we should avoid writing data outside of the matrix threadgroup_barrier(mem_flags::mem_threadgroup); - threadgroup float * temp_str = ((threadgroup float *) shmem) \ - + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M; +#ifdef GGML_METAL_HAS_TENSOR + auto tC = tensor, tensor_inline>(sc, dextents(NR0, NR1)); + cT.store(tC); +#else + threadgroup float * temp_str = ((threadgroup float *) shmem) + 32*(sgitg&1) + (16*(sgitg >> 1))*NR0; - #pragma unroll(8) for (short i = 0; i < 8; i++) { - simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M); + simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*NR0*(i/4), NR0, 0, false); } +#endif threadgroup_barrier(mem_flags::mem_threadgroup); - for (short j = sgitg; j < n_cols; j += 4) { - const int id = ids_i32[im*args.ne21 + r1*BLOCK_SIZE_N + j]; + for (short j = sgitg; j < nr1; j += 4) { + const int id = ids_i32[im*args.ne21 + r1 + j]; const short ide = id % args.ne20; const short idt = id / args.ne20; - device float * D = (device float *) dst + (r0*BLOCK_SIZE_M) + ide*args.ne0 + idt*args.ne1*args.ne0; + device float * D = (device float *) dst + r0 + ide*args.ne0 + idt*args.ne1*args.ne0; device float4 * D4 = (device float4 *) D; - threadgroup float * C = (threadgroup float *) shmem + (j*BLOCK_SIZE_M); + threadgroup float * C = (threadgroup float *) shmem + j*NR0; threadgroup float4 * C4 = (threadgroup float4 *) C; int i = tiisg; - for (; i < n_rows/4; i += 32) { + for (; i < nr0/4; i += 32) { *(D4 + i) = *(C4 + i); } - i = (4*(n_rows/4)) + tiisg; - for (; i < n_rows; i += 32) { + i = (4*(nr0/4)) + tiisg; + for (; i < nr0; i += 32) { *(D + i) = *(C + i); } } From aa374175c30184aeb1813ec71fc68780dd073906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 6 Nov 2025 14:05:47 +0100 Subject: [PATCH 12/32] CUDA: fix crash on uneven context without FA (#16988) --- ggml/src/ggml-cuda/ggml-cuda.cu | 12 ++++----- ggml/src/ggml-cuda/mmf.cu | 12 ++++++--- ggml/src/ggml-cuda/mmf.cuh | 2 +- ggml/src/ggml-cuda/mmvf.cu | 8 
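The fix below hinges on one precondition: every src0 byte stride must be a multiple of twice the element size, so the float matrix kernels can issue two-element (e.g. half2) loads; an uneven context length can produce KV-cache views that violate this. A minimal sketch of that check, using a hypothetical standalone helper (the real checks are added to ggml_cuda_should_use_mmf/ggml_cuda_should_use_mmvf in the hunks below):

#include <cstddef>

// nb[] holds the byte stride of each tensor dimension, ts the element
// size in bytes. Tensors whose strides are not multiples of 2*ts are
// routed away from the vectorized kernels instead of crashing on a
// misaligned load.
static bool strides_support_paired_loads(const size_t nb[4], size_t ts) {
    for (int i = 0; i < 4; ++i) {
        if (nb[i] % (2 * ts) != 0) {
            return false;
        }
    }
    return true;
}

For GGML_TYPE_F16 (ts == 2), a view with a row stride of 4098 bytes fails the check (4098 % 4 == 2), which is exactly the kind of layout an odd context size can create.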
+++++- ggml/src/ggml-cuda/mmvf.cuh | 2 +- src/llama-context.cpp | 2 ++ tests/test-backend-ops.cpp | 44 ++++++++++++++------------------- 7 files changed, 44 insertions(+), 38 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 415a7e962..049aece1b 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2113,7 +2113,7 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_f(const ggml_tensor * tensor) { src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32; const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; - use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, is_mul_mat_id ? src1->ne[2] : src1->ne[1]); + use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, is_mul_mat_id ? src1->ne[2] : src1->ne[1]); const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft) || ggml_backend_buft_is_cuda_split(src1->buffer->buft); @@ -2207,16 +2207,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor const int cc = ggml_cuda_info().devices[id].cc; const int warp_size = ggml_cuda_info().devices[id].warp_size; use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); - use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1], /*mul_mat_id=*/false); - use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]); + use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false); + use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]); any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc); } } else { const int cc = ggml_cuda_info().devices[ctx.device].cc; const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size; use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]); - use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1], /*mul_mat_id=*/false); - use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]); + use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false); + use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]); any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc); } @@ -2287,7 +2287,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * return; } - if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src1->ne[2], /*mul_mat_id=*/true)) { + if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true)) { ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst); return; } diff --git a/ggml/src/ggml-cuda/mmf.cu b/ggml/src/ggml-cuda/mmf.cu index 2b0a61395..69a60aceb 100644 --- a/ggml/src/ggml-cuda/mmf.cu +++ b/ggml/src/ggml-cuda/mmf.cu @@ -119,15 +119,21 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr } } -bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, const int src1_ncols, bool mul_mat_id) { - +bool 
ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, + const size_t * src0_nb, const int src1_ncols, bool mul_mat_id) { if (ggml_is_quantized(type)) { return false; } - if (src0_ne[0] % (warp_size * (4/ggml_type_size(type))) != 0) { + const size_t ts = ggml_type_size(type); + if (src0_ne[0] % (warp_size * (4/ts)) != 0) { return false; } + for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { + if (src0_nb[i] % (2*ts) != 0) { + return false; + } + } if (src0_ne[1] % MMF_ROWS_PER_BLOCK != 0) { return false; } diff --git a/ggml/src/ggml-cuda/mmf.cuh b/ggml/src/ggml-cuda/mmf.cuh index f7e46e2f6..45724e091 100644 --- a/ggml/src/ggml-cuda/mmf.cuh +++ b/ggml/src/ggml-cuda/mmf.cuh @@ -17,7 +17,7 @@ struct mmf_ids_data { void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); -bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const int src1_ncols, bool mul_mat_id); +bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const size_t * src0_nb, const int src1_ncols, bool mul_mat_id); template __launch_bounds__(ggml_cuda_get_physical_warp_size()*nwarps, 1) diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu index 4e3178343..526d90d7a 100644 --- a/ggml/src/ggml-cuda/mmvf.cu +++ b/ggml/src/ggml-cuda/mmvf.cu @@ -716,10 +716,16 @@ void ggml_cuda_op_mul_mat_vec_f( GGML_UNUSED_VARS(ctx, src1, dst, src1_ddq_i, src1_ncols, src1_padded_row_size); } -bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11) { +bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, const size_t * src0_nb, int64_t ne11) { if (src0_ne[0] % 2 != 0) { return false; } + const size_t ts = ggml_type_size(type); + for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { + if (src0_nb[i] % (2*ts) != 0) { + return false; + } + } switch (type) { case GGML_TYPE_F32: if (GGML_CUDA_CC_IS_NVIDIA(cc)) { diff --git a/ggml/src/ggml-cuda/mmvf.cuh b/ggml/src/ggml-cuda/mmvf.cuh index a205aa8e4..a09fbdc72 100644 --- a/ggml/src/ggml-cuda/mmvf.cuh +++ b/ggml/src/ggml-cuda/mmvf.cuh @@ -9,4 +9,4 @@ void ggml_cuda_op_mul_mat_vec_f( const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, cudaStream_t stream); -bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11); +bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, const size_t * src0_nb, int64_t ne11); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 2b3936627..866514038 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -21,6 +21,8 @@ llama_context::llama_context( llama_context_params params) : model(model), balloc(std::make_unique(model.hparams.n_pos_per_embd())) { + // TODO warning when creating llama_context with awkward ctx size that is not a power of 2, + // may need to be backend-dependent LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__); t_start_us = model.t_start_us; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f57542027..bffd60f38 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3385,11 +3385,11 @@ struct test_mul_mat : public test_case { const std::array bs; // dims 3 and 4 const std::array nr; // repeat in dims 3 and 4 const 
std::array per; // permutation of dimensions - const bool v; // whether a and b are non-contiguous views + const int64_t k_v; // size of k in memory, resulting in a non-contiguous view for k_v > k, no view for k_v == 0 const uint32_t o; // number of outputs std::string vars() override { - return VARS_TO_STR10(type_a, type_b, m, n, k, bs, nr, per, v, o); + return VARS_TO_STR10(type_a, type_b, m, n, k, bs, nr, per, k_v, o); } double max_nmse_err() override { @@ -3410,8 +3410,8 @@ struct test_mul_mat : public test_case { std::array bs = {10, 10}, std::array nr = {2, 2}, std::array per = {0, 1, 2, 3}, - bool v = false, uint32_t o = 1) - : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v), o(o) {} + int64_t k_v = 0, uint32_t o = 1) + : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), k_v(k_v), o(o) {} ggml_tensor * build_graph(ggml_context * ctx) override { // C^T = A * B^T: (k, m) * (k, n) => (m, n) @@ -3421,7 +3421,7 @@ struct test_mul_mat : public test_case { const int npermuted = (per[0] != 0) + (per[1] != 1) + (per[2] != 2) + (per[3] != 3); if (npermuted > 0) { GGML_ASSERT(npermuted == 2); - GGML_ASSERT(!v); // not handled + GGML_ASSERT(k_v == 0); // not handled GGML_ASSERT(!ggml_is_quantized(type_a) || per[0] == 0); GGML_ASSERT(!ggml_is_quantized(type_b) || per[0] == 0); @@ -3445,29 +3445,21 @@ struct test_mul_mat : public test_case { ggml_set_name(a, "a_permuted"); ggml_set_name(b, "b_permuted"); } else { - if (v) { - a = ggml_new_tensor_4d(ctx, type_a, k*2, m, bs[0], bs[1]); - b = ggml_new_tensor_4d(ctx, type_b, k*2, n, bs[0]*nr[0], bs[1]*nr[1]); + const int64_t k_physical = k_v == 0 ? k : k_v; + a = ggml_new_tensor_4d(ctx, type_a, k_physical, m, bs[0], bs[1]); + b = ggml_new_tensor_4d(ctx, type_b, k_physical, n, bs[0]*nr[0], bs[1]*nr[1]); - if (!ggml_is_quantized(type_a)) { - if (bs[1] == 1 && nr[1] == 1) { - ggml_set_param(a); - } - ggml_set_param(b); + if (!ggml_is_quantized(type_a)) { + if (bs[1] == 1 && nr[1] == 1) { + ggml_set_param(a); } + ggml_set_param(b); + } + if (k_v != 0) { + GGML_ASSERT(k_v > k); a = ggml_view_4d(ctx, a, k, m, bs[0], bs[1], a->nb[1], a->nb[2], a->nb[3], 0); b = ggml_view_4d(ctx, b, k, n, bs[0]*nr[0], bs[1]*nr[1], b->nb[1], b->nb[2], b->nb[3], 0); - } else { - a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]); - b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]); - - if (!ggml_is_quantized(type_a)) { - if (bs[1] == 1 && nr[1] == 1) { - ggml_set_param(a); - } - ggml_set_param(b); - } } ggml_set_name(a, "a"); ggml_set_name(b, "b"); @@ -6901,7 +6893,7 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1})); test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1, 1}, {4, 1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, {1, 1}, {4, 1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 16, 32, 32, { 1, 1}, {1, 1}, {0, 1, 2, 3}, true, 3)); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 16, 32, 32, { 1, 1}, {1, 1}, {0, 1, 2, 3}, 64, 3)); test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 64, 77, 77, {12,1}, {1,1})); #if 0 @@ -6927,7 +6919,7 @@ static std::vector> make_test_cases_eval() { for (uint32_t k = 0; k < 2; ++k) { for (ggml_type type: {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) { test_cases.emplace_back(new 
test_mul_mat(type, GGML_TYPE_F32, 1056 + m, 1, 128 + k, {bs, bs2}, {nr, 1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, bs2}, {nr, 1}, {0, 1, 2, 3}, true)); + test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, bs2}, {nr, 1}, {0, 1, 2, 3}, 2*1056 + k)); } } } @@ -7432,7 +7424,7 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1})); test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, true)); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, 2*16416)); for (int bs : {1, 2, 3, 4, 5, 8, 512}) { for (ggml_type type_a : all_types) { From 7f09a680af6e0ef612de81018e1d19c19b8651e8 Mon Sep 17 00:00:00 2001 From: xctan Date: Fri, 7 Nov 2025 00:12:45 +0800 Subject: [PATCH 13/32] ggml-cpu : optimize RVV q2_k and q3_k kernels (#16887) --- ggml/src/ggml-cpu/arch/riscv/quants.c | 163 ++++++++++++++++++-------- 1 file changed, 111 insertions(+), 52 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/riscv/quants.c b/ggml/src/ggml-cpu/arch/riscv/quants.c index ee41a3502..ae0ebb3ca 100644 --- a/ggml/src/ggml-cpu/arch/riscv/quants.c +++ b/ggml/src/ggml-cpu/arch/riscv/quants.c @@ -580,16 +580,19 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi const float dmin = -y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin); uint8_t *patmp = atmp; int vsums; - int tmp; + int tmp, t1, t2, t3, t4, t5, t6, t7; __asm__ __volatile__( "vsetivli zero, 16, e8, m1\n\t" "vmv.v.x v8, zero\n\t" + "lb zero, 15(%[sc])\n\t" "vle8.v v1, (%[sc])\n\t" + "vle8.v v2, (%[bsums])\n\t" + "addi %[tmp], %[bsums], 16\n\t" "vand.vi v0, v1, 0xF\n\t" "vsrl.vi v1, v1, 4\n\t" + "vle8.v v3, (%[tmp])\n\t" "vse8.v v0, (%[scale])\n\t" "vsetivli zero, 16, e16, m2\n\t" - "vle16.v v2, (%[bsums])\n\t" "vzext.vf2 v0, v1\n\t" "vwmul.vv v4, v0, v2\n\t" "vsetivli zero, 16, e32, m4\n\t" @@ -608,46 +611,89 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi for (int j = 0; j < QK_K/128; ++j) { __asm__ __volatile__( - "vsetvli zero, %[vl32], e8, m2\n\t" + "lb zero, 31(%[q2])\n\t" + "addi %[tmp], %[q2], 16\n\t" + "addi %[t1], %[q8], 16\n\t" + "vsetivli zero, 16, e8, m1\n\t" "vle8.v v0, (%[q2])\n\t" + "vle8.v v1, (%[tmp])\n\t" "vsrl.vi v2, v0, 2\n\t" + "vsrl.vi v3, v1, 2\n\t" "vsrl.vi v4, v0, 4\n\t" - "vsrl.vi v6, v0, 6\n\t" - "vand.vi v0, v0, 0x3\n\t" - "vand.vi v2, v2, 0x3\n\t" - "vand.vi v4, v4, 0x3\n\t" - "vsetvli zero, %[vl128], e8, m8\n\t" + "addi %[tmp], %[q8], 32\n\t" "vle8.v v8, (%[q8])\n\t" - "vsetvli zero, %[vl64], e8, m4\n\t" + "vle8.v v9, (%[t1])\n\t" + "addi %[t1], %[t1], 32\n\t" + "vsrl.vi v5, v1, 4\n\t" + "vsrl.vi v6, v0, 6\n\t" + "vsrl.vi v7, v1, 6\n\t" + "vle8.v v10, (%[tmp])\n\t" + "vle8.v v11, (%[t1])\n\t" + "addi %[tmp], %[tmp], 32\n\t" + "addi %[t1], %[t1], 32\n\t" + "vand.vi v0, v0, 0x3\n\t" + "vand.vi v1, v1, 0x3\n\t" + "vand.vi v2, v2, 0x3\n\t" + "vle8.v v12, (%[tmp])\n\t" + "vle8.v v13, (%[t1])\n\t" + "addi %[tmp], %[tmp], 32\n\t" + "addi %[t1], %[t1], 32\n\t" + "vand.vi v3, v3, 0x3\n\t" + "vand.vi v4, v4, 0x3\n\t" + "vand.vi v5, v5, 0x3\n\t" + "vle8.v v14, (%[tmp])\n\t" + "vle8.v v15, (%[t1])\n\t" "vwmul.vv v16, v0, v8\n\t" + "vwmul.vv v18, v1, v9\n\t" + "vwmul.vv 
v20, v2, v10\n\t" + "vwmul.vv v22, v3, v11\n\t" "vwmul.vv v24, v4, v12\n\t" - "vsetivli zero, 16, e16, m2\n\t" + "vwmul.vv v26, v5, v13\n\t" + "vwmul.vv v28, v6, v14\n\t" + "vwmul.vv v30, v7, v15\n\t" + "vsetivli zero, 8, e16, m1\n\t" "vmv.v.x v0, zero\n\t" - "vwredsum.vs v10, v16, v0\n\t" + "lbu %[tmp], 0(%[scale])\n\t" + "vwredsum.vs v8, v16, v0\n\t" "vwredsum.vs v9, v18, v0\n\t" - "vwredsum.vs v8, v20, v0\n\t" - "vwredsum.vs v7, v22, v0\n\t" - "vwredsum.vs v11, v24, v0\n\t" - "vwredsum.vs v12, v26, v0\n\t" - "vwredsum.vs v13, v28, v0\n\t" - "vwredsum.vs v14, v30, v0\n\t" + "lbu %[t1], 1(%[scale])\n\t" + "vwredsum.vs v10, v20, v0\n\t" + "vwredsum.vs v11, v22, v0\n\t" + "lbu %[t2], 2(%[scale])\n\t" + "vwredsum.vs v12, v24, v0\n\t" + "vwredsum.vs v13, v26, v0\n\t" + "lbu %[t3], 3(%[scale])\n\t" + "vwredsum.vs v14, v28, v0\n\t" + "vwredsum.vs v15, v30, v0\n\t" + "lbu %[t4], 4(%[scale])\n\t" + "vwredsum.vs v8, v17, v8\n\t" + "vwredsum.vs v9, v19, v9\n\t" + "lbu %[t5], 5(%[scale])\n\t" + "vwredsum.vs v10, v21, v10\n\t" + "vwredsum.vs v11, v23, v11\n\t" + "lbu %[t6], 6(%[scale])\n\t" + "vwredsum.vs v12, v25, v12\n\t" + "vwredsum.vs v13, v27, v13\n\t" + "lbu %[t7], 7(%[scale])\n\t" + "vwredsum.vs v14, v29, v14\n\t" + "vwredsum.vs v15, v31, v15\n\t" "vsetivli zero, 4, e32, m1\n\t" - "vslideup.vi v10, v9, 1\n\t" - "vslideup.vi v8, v7, 1\n\t" - "vslideup.vi v11, v12, 1\n\t" - "vslideup.vi v13, v14, 1\n\t" - "vslideup.vi v10, v8, 2\n\t" - "vslideup.vi v11, v13, 2\n\t" - "vsetivli zero, 8, e32, m2\n\t" - "vle8.v v15, (%[scale])\n\t" - "vzext.vf4 v12, v15\n\t" - "vmul.vv v10, v10, v12\n\t" - "vredsum.vs v0, v10, v0\n\t" + "vmul.vx v0, v8, %[tmp]\n\t" + "vmul.vx v1, v9, %[t1]\n\t" + "vmacc.vx v0, %[t2], v10\n\t" + "vmacc.vx v1, %[t3], v11\n\t" + "vmacc.vx v0, %[t4], v12\n\t" + "vmacc.vx v1, %[t5], v13\n\t" + "vmacc.vx v0, %[t6], v14\n\t" + "vmacc.vx v1, %[t7], v15\n\t" "vmv.x.s %[tmp], v0\n\t" - "add %[isum], %[isum], %[tmp]" - : [tmp] "=&r" (tmp), [isum] "+&r" (isum) + "vmv.x.s %[t1], v1\n\t" + "add %[isum], %[isum], %[tmp]\n\t" + "add %[isum], %[isum], %[t1]" + : [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3) + , [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7) + , [isum] "+&r" (isum) : [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8) - , [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128) : "memory" , "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" , "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15" @@ -929,7 +975,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi const int8_t * restrict q8 = y[i].qs; int8_t * scale = (int8_t *)utmp; - int tmp; + int tmp, t1, t2, t3, t4, t5, t6, t7; __asm__ __volatile__( "vsetivli zero, 12, e8, m1\n\t" "vle8.v v0, (%[s6b])\n\t" @@ -967,19 +1013,23 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi int isum = 0; for (int j = 0; j < QK_K; j += 128) { __asm__ __volatile__( + "lb zero, 31(%[q3])\n\t" "vsetvli zero, %[vl32], e8, m2, ta, mu\n\t" "vle8.v v8, (%[q3])\n\t" "vsrl.vi v10, v8, 2\n\t" "vsrl.vi v12, v8, 4\n\t" "vsrl.vi v14, v8, 6\n\t" + "lb zero, 64(%[q8])\n\t" "vand.vi v8, v8, 3\n\t" "vand.vi v10, v10, 3\n\t" "vand.vi v12, v12, 3\n\t" "vle8.v v2, (%[qh])\n\t" + "lb zero, 127(%[q8])\n\t" "vand.vx v4, v2, %[m]\n\t" "slli %[m], %[m], 1\n\t" "vmseq.vx v0, v4, zero\n\t" "vadd.vi v8, v8, -4, v0.t\n\t" + "lb zero, 0(%[q8])\n\t" "vand.vx v4, v2, %[m]\n\t" "slli %[m], %[m], 1\n\t" "vmseq.vx v0, v4, zero\n\t" @@ -994,34 +1044,43 @@ void 
ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi "vadd.vi v14, v14, -4, v0.t\n\t" "vsetvli zero, %[vl128], e8, m8\n\t" "vle8.v v0, (%[q8])\n\t" + "lb %[tmp], 0(%[scale])\n\t" + "lb %[t1], 1(%[scale])\n\t" + "lb %[t2], 2(%[scale])\n\t" + "lb %[t3], 3(%[scale])\n\t" "vsetvli zero, %[vl64], e8, m4\n\t" "vwmul.vv v16, v0, v8\n\t" "vwmul.vv v24, v4, v12\n\t" "vsetivli zero, 16, e16, m2\n\t" "vmv.v.x v0, zero\n\t" - "vwredsum.vs v10, v16, v0\n\t" + "vwredsum.vs v8, v16, v0\n\t" + "lb %[t4], 4(%[scale])\n\t" + "lb %[t5], 5(%[scale])\n\t" "vwredsum.vs v9, v18, v0\n\t" - "vwredsum.vs v8, v20, v0\n\t" - "vwredsum.vs v7, v22, v0\n\t" - "vwredsum.vs v11, v24, v0\n\t" - "vwredsum.vs v12, v26, v0\n\t" - "vwredsum.vs v13, v28, v0\n\t" - "vwredsum.vs v14, v30, v0\n\t" + "vwredsum.vs v10, v20, v0\n\t" + "vwredsum.vs v11, v22, v0\n\t" + "vwredsum.vs v12, v24, v0\n\t" + "lb %[t6], 6(%[scale])\n\t" + "lb %[t7], 7(%[scale])\n\t" + "vwredsum.vs v13, v26, v0\n\t" + "vwredsum.vs v14, v28, v0\n\t" + "vwredsum.vs v15, v30, v0\n\t" "vsetivli zero, 4, e32, m1\n\t" - "vslideup.vi v10, v9, 1\n\t" - "vslideup.vi v8, v7, 1\n\t" - "vslideup.vi v11, v12, 1\n\t" - "vslideup.vi v13, v14, 1\n\t" - "vslideup.vi v10, v8, 2\n\t" - "vslideup.vi v11, v13, 2\n\t" - "vsetivli zero, 8, e32, m2\n\t" - "vle8.v v15, (%[scale])\n\t" - "vsext.vf4 v12, v15\n\t" - "vmul.vv v10, v10, v12\n\t" - "vredsum.vs v0, v10, v0\n\t" + "vmul.vx v0, v8, %[tmp]\n\t" + "vmul.vx v1, v9, %[t1]\n\t" + "vmacc.vx v0, %[t2], v10\n\t" + "vmacc.vx v1, %[t3], v11\n\t" + "vmacc.vx v0, %[t4], v12\n\t" + "vmacc.vx v1, %[t5], v13\n\t" + "vmacc.vx v0, %[t6], v14\n\t" + "vmacc.vx v1, %[t7], v15\n\t" "vmv.x.s %[tmp], v0\n\t" - "add %[isum], %[isum], %[tmp]" - : [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum) + "vmv.x.s %[t1], v1\n\t" + "add %[isum], %[isum], %[tmp]\n\t" + "add %[isum], %[isum], %[t1]" + : [tmp] "=&r" (tmp), [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3) + , [t4] "=&r" (t4), [t5] "=&r" (t5), [t6] "=&r" (t6), [t7] "=&r" (t7) + , [m] "+&r" (m), [isum] "+&r" (isum) : [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32) , [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8) : "memory" From 5c9a18e674be444e847d8e07a6b851764708fb12 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Fri, 7 Nov 2025 11:23:34 +0100 Subject: [PATCH 14/32] common: move download functions to download.(cpp|h) (#17059) * common: move download functions to download.(cpp|h) * rm unused includes * minor cleanup --------- Co-authored-by: Georgi Gerganov --- common/CMakeLists.txt | 2 + common/arg.cpp | 1004 +--------------------------------------- common/arg.h | 4 +- common/download.cpp | 1014 +++++++++++++++++++++++++++++++++++++++++ common/download.h | 41 ++ 5 files changed, 1066 insertions(+), 999 deletions(-) create mode 100644 common/download.cpp create mode 100644 common/download.h diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index fe290bf8f..7086d08e5 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -56,6 +56,8 @@ add_library(${TARGET} STATIC common.h console.cpp console.h + download.cpp + download.h http.h json-partial.cpp json-partial.h diff --git a/common/arg.cpp b/common/arg.cpp index 4316917d7..5597de121 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2,10 +2,10 @@ #include "chat.h" #include "common.h" -#include "gguf.h" // for reading GGUF splits #include "json-schema-to-grammar.h" #include "log.h" #include "sampling.h" +#include "download.h" // fix problem with std::min and std::max #if 
defined(_WIN32) @@ -22,23 +22,14 @@ #include #include #include -#include #include -#include #include #include #include #include -#include +#include // for hardware_concurrency #include -#if defined(LLAMA_USE_CURL) -#include -#include -#else -#include "http.h" -#endif - #ifdef __linux__ #include #elif defined(_WIN32) @@ -52,16 +43,9 @@ #endif #define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 -// isatty -#if defined(_WIN32) -#include -#else -#include -#endif - using json = nlohmann::ordered_json; -std::initializer_list mmproj_examples = { +static std::initializer_list mmproj_examples = { LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_SERVER, }; @@ -76,50 +60,13 @@ static std::string read_file(const std::string & fname) { return content; } -static void write_file(const std::string & fname, const std::string & content) { - const std::string fname_tmp = fname + ".tmp"; - std::ofstream file(fname_tmp); - if (!file) { - throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str())); - } - - try { - file << content; - file.close(); - - // Makes write atomic - if (rename(fname_tmp.c_str(), fname.c_str()) != 0) { - LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, fname_tmp.c_str(), fname.c_str()); - // If rename fails, try to delete the temporary file - if (remove(fname_tmp.c_str()) != 0) { - LOG_ERR("%s: unable to delete temporary file: %s\n", __func__, fname_tmp.c_str()); - } - } - } catch (...) { - // If anything fails, try to delete the temporary file - if (remove(fname_tmp.c_str()) != 0) { - LOG_ERR("%s: unable to delete temporary file: %s\n", __func__, fname_tmp.c_str()); - } - - throw std::runtime_error(string_format("error: failed to write file '%s'\n", fname.c_str())); - } -} - -static bool is_output_a_tty() { -#if defined(_WIN32) - return _isatty(_fileno(stdout)); -#else - return isatty(1); -#endif -} - common_arg & common_arg::set_examples(std::initializer_list examples) { - this->examples = std::move(examples); + this->examples = examples; return *this; } common_arg & common_arg::set_excludes(std::initializer_list excludes) { - this->excludes = std::move(excludes); + this->excludes = excludes; return *this; } @@ -142,7 +89,7 @@ bool common_arg::is_exclude(enum llama_example ex) { return excludes.find(ex) != excludes.end(); } -bool common_arg::get_value_from_env(std::string & output) { +bool common_arg::get_value_from_env(std::string & output) const { if (env == nullptr) return false; char * value = std::getenv(env); if (value) { @@ -152,7 +99,7 @@ bool common_arg::get_value_from_env(std::string & output) { return false; } -bool common_arg::has_value_from_env() { +bool common_arg::has_value_from_env() const { return env != nullptr && std::getenv(env); } @@ -220,943 +167,6 @@ std::string common_arg::to_string() { return ss.str(); } -// -// downloader -// - -struct common_hf_file_res { - std::string repo; // repo name with ":tag" removed - std::string ggufFile; - std::string mmprojFile; -}; - -static void write_etag(const std::string & path, const std::string & etag) { - const std::string etag_path = path + ".etag"; - write_file(etag_path, etag); - LOG_DBG("%s: file etag saved: %s\n", __func__, etag_path.c_str()); -} - -static std::string read_etag(const std::string & path) { - std::string none; - const std::string etag_path = path + ".etag"; - - if (std::filesystem::exists(etag_path)) { - std::ifstream etag_in(etag_path); - if (!etag_in) { - LOG_ERR("%s: could not open .etag file for reading: %s\n", __func__, etag_path.c_str()); - return 
none; - } - std::string etag; - std::getline(etag_in, etag); - return etag; - } - - // no etag file, but maybe there is an old .json - // remove this code later - const std::string metadata_path = path + ".json"; - - if (std::filesystem::exists(metadata_path)) { - std::ifstream metadata_in(metadata_path); - try { - nlohmann::json metadata_json; - metadata_in >> metadata_json; - LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), - metadata_json.dump().c_str()); - if (metadata_json.contains("etag") && metadata_json.at("etag").is_string()) { - std::string etag = metadata_json.at("etag"); - write_etag(path, etag); - if (!std::filesystem::remove(metadata_path)) { - LOG_WRN("%s: failed to delete old .json metadata file: %s\n", __func__, metadata_path.c_str()); - } - return etag; - } - } catch (const nlohmann::json::exception & e) { - LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); - } - } - return none; -} - -#ifdef LLAMA_USE_CURL - -// -// CURL utils -// - -using curl_ptr = std::unique_ptr; - -// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one -struct curl_slist_ptr { - struct curl_slist * ptr = nullptr; - ~curl_slist_ptr() { - if (ptr) { - curl_slist_free_all(ptr); - } - } -}; - -static CURLcode common_curl_perf(CURL * curl) { - CURLcode res = curl_easy_perform(curl); - if (res != CURLE_OK) { - LOG_ERR("%s: curl_easy_perform() failed\n", __func__); - } - - return res; -} - -// Send a HEAD request to retrieve the etag and last-modified headers -struct common_load_model_from_url_headers { - std::string etag; - std::string last_modified; - std::string accept_ranges; -}; - -struct FILE_deleter { - void operator()(FILE * f) const { fclose(f); } -}; - -static size_t common_header_callback(char * buffer, size_t, size_t n_items, void * userdata) { - common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata; - static std::regex header_regex("([^:]+): (.*)\r\n"); - static std::regex etag_regex("ETag", std::regex_constants::icase); - static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase); - static std::regex accept_ranges_regex("Accept-Ranges", std::regex_constants::icase); - std::string header(buffer, n_items); - std::smatch match; - if (std::regex_match(header, match, header_regex)) { - const std::string & key = match[1]; - const std::string & value = match[2]; - if (std::regex_match(key, match, etag_regex)) { - headers->etag = value; - } else if (std::regex_match(key, match, last_modified_regex)) { - headers->last_modified = value; - } else if (std::regex_match(key, match, accept_ranges_regex)) { - headers->accept_ranges = value; - } - } - - return n_items; -} - -static size_t common_write_callback(void * data, size_t size, size_t nmemb, void * fd) { - return std::fwrite(data, size, nmemb, static_cast(fd)); -} - -// helper function to hide password in URL -static std::string llama_download_hide_password_in_url(const std::string & url) { - // Use regex to match and replace the user[:password]@ pattern in URLs - // Pattern: scheme://[user[:password]@]host[...] 
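-    // Illustrative input/output (values are made up, not from the commit):
-    //   "https://user:secret@example.com/repo/model.gguf"
-    //   becomes "https://********@example.com/repo/model.gguf";
-    //   a URL without credentials is returned unchanged.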
- static const std::regex url_regex(R"(^(?:[A-Za-z][A-Za-z0-9+.-]://)(?:[^/@]+@)?.$)"); - std::smatch match; - - if (std::regex_match(url, match, url_regex)) { - // match[1] = scheme (e.g., "https://") - // match[2] = user[:password]@ part - // match[3] = rest of URL (host and path) - return match[1].str() + "********@" + match[3].str(); - } - - return url; // No credentials found or malformed URL -} - -static void common_curl_easy_setopt_head(CURL * curl, const std::string & url) { - // Set the URL, allow to follow http redirection - curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); - -# if defined(_WIN32) - // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of - // operating system. Currently implemented under MS-Windows. - curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); -# endif - - curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb - curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress - curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, common_header_callback); -} - -static void common_curl_easy_setopt_get(CURL * curl) { - curl_easy_setopt(curl, CURLOPT_NOBODY, 0L); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, common_write_callback); - - // display download progress - curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); -} - -static bool common_pull_file(CURL * curl, const std::string & path_temporary) { - if (std::filesystem::exists(path_temporary)) { - const std::string partial_size = std::to_string(std::filesystem::file_size(path_temporary)); - LOG_INF("%s: server supports range requests, resuming download from byte %s\n", __func__, partial_size.c_str()); - const std::string range_str = partial_size + "-"; - curl_easy_setopt(curl, CURLOPT_RANGE, range_str.c_str()); - } - - // Always open file in append mode could be resuming - std::unique_ptr outfile(fopen(path_temporary.c_str(), "ab")); - if (!outfile) { - LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path_temporary.c_str()); - return false; - } - - common_curl_easy_setopt_get(curl); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile.get()); - - return common_curl_perf(curl) == CURLE_OK; -} - -static bool common_download_head(CURL * curl, - curl_slist_ptr & http_headers, - const std::string & url, - const std::string & bearer_token) { - if (!curl) { - LOG_ERR("%s: error initializing libcurl\n", __func__); - return false; - } - - http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp"); - // Check if hf-token or bearer-token was specified - if (!bearer_token.empty()) { - std::string auth_header = "Authorization: Bearer " + bearer_token; - http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); - } - - curl_easy_setopt(curl, CURLOPT_HTTPHEADER, http_headers.ptr); - common_curl_easy_setopt_head(curl, url); - return common_curl_perf(curl) == CURLE_OK; -} - -// download one single file from remote URL to local path -static bool common_download_file_single_online(const std::string & url, - const std::string & path, - const std::string & bearer_token) { - static const int max_attempts = 3; - static const int retry_delay_seconds = 2; - for (int i = 0; i < max_attempts; ++i) { - std::string etag; - - // Check if the file already exists locally - const auto file_exists = std::filesystem::exists(path); - if (file_exists) { - etag = read_etag(path); - } else { - LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str()); - } - - 
bool head_request_ok = false; - bool should_download = !file_exists; // by default, we should download if the file does not exist - - // Initialize libcurl - curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); - common_load_model_from_url_headers headers; - curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers); - curl_slist_ptr http_headers; - const bool was_perform_successful = common_download_head(curl.get(), http_headers, url, bearer_token); - if (!was_perform_successful) { - head_request_ok = false; - } - - long http_code = 0; - curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code); - if (http_code == 200) { - head_request_ok = true; - } else { - LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code); - head_request_ok = false; - } - - // if head_request_ok is false, we don't have the etag or last-modified headers - // we leave should_download as-is, which is true if the file does not exist - bool should_download_from_scratch = false; - if (head_request_ok) { - // check if ETag or Last-Modified headers are different - // if it is, we need to download the file again - if (!etag.empty() && etag != headers.etag) { - LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), - headers.etag.c_str()); - should_download = true; - should_download_from_scratch = true; - } - } - - const bool accept_ranges_supported = !headers.accept_ranges.empty() && headers.accept_ranges != "none"; - if (should_download) { - if (file_exists && - !accept_ranges_supported) { // Resumable downloads not supported, delete and start again. - LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str()); - if (remove(path.c_str()) != 0) { - LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str()); - return false; - } - } - - const std::string path_temporary = path + ".downloadInProgress"; - if (should_download_from_scratch) { - if (std::filesystem::exists(path_temporary)) { - if (remove(path_temporary.c_str()) != 0) { - LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str()); - return false; - } - } - - if (std::filesystem::exists(path)) { - if (remove(path.c_str()) != 0) { - LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str()); - return false; - } - } - } - if (head_request_ok) { - write_etag(path, headers.etag); - } - - // start the download - LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", - __func__, llama_download_hide_password_in_url(url).c_str(), path_temporary.c_str(), - headers.etag.c_str(), headers.last_modified.c_str()); - const bool was_pull_successful = common_pull_file(curl.get(), path_temporary); - if (!was_pull_successful) { - if (i + 1 < max_attempts) { - const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1000; - LOG_WRN("%s: retrying after %d milliseconds...\n", __func__, exponential_backoff_delay); - std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay)); - } else { - LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts); - } - - continue; - } - - long http_code = 0; - curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code); - if (http_code < 200 || http_code >= 400) { - LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code); - return false; - } - - if (rename(path_temporary.c_str(), path.c_str()) != 0) { - LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, 
path_temporary.c_str(), path.c_str()); - return false; - } - } else { - LOG_INF("%s: using cached file: %s\n", __func__, path.c_str()); - } - - break; - } - - return true; -} - -std::pair> common_remote_get_content(const std::string & url, const common_remote_params & params) { - curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); - curl_slist_ptr http_headers; - std::vector res_buffer; - - curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); - curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); - curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L); - curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 1L); - typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data); - auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t { - auto data_vec = static_cast *>(data); - data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb); - return size * nmemb; - }; - curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); - curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer); -#if defined(_WIN32) - curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); -#endif - if (params.timeout > 0) { - curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout); - } - if (params.max_size > 0) { - curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size); - } - http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp"); - for (const auto & header : params.headers) { - http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str()); - } - curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); - - CURLcode res = curl_easy_perform(curl.get()); - - if (res != CURLE_OK) { - std::string error_msg = curl_easy_strerror(res); - throw std::runtime_error("error: cannot make GET request: " + error_msg); - } - - long res_code; - curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code); - - return { res_code, std::move(res_buffer) }; -} - -#else - -static void print_progress(size_t current, size_t total) { - if (!is_output_a_tty()) { - return; - } - - if (!total) { - return; - } - - size_t width = 50; - size_t pct = (100 * current) / total; - size_t pos = (width * current) / total; - - std::cout << "[" - << std::string(pos, '=') - << (pos < width ? ">" : "") - << std::string(width - pos, ' ') - << "] " << std::setw(3) << pct << "% (" - << current / (1024 * 1024) << " MB / " - << total / (1024 * 1024) << " MB)\r"; - std::cout.flush(); -} - -static bool common_pull_file(httplib::Client & cli, - const std::string & resolve_path, - const std::string & path_tmp, - bool supports_ranges, - size_t existing_size, - size_t & total_size) { - std::ofstream ofs(path_tmp, std::ios::binary | std::ios::app); - if (!ofs.is_open()) { - LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path_tmp.c_str()); - return false; - } - - httplib::Headers headers; - if (supports_ranges && existing_size > 0) { - headers.emplace("Range", "bytes=" + std::to_string(existing_size) + "-"); - } - - std::atomic downloaded{existing_size}; - - auto res = cli.Get(resolve_path, headers, - [&](const httplib::Response &response) { - if (existing_size > 0 && response.status != 206) { - LOG_WRN("%s: server did not respond with 206 Partial Content for a resume request. 
Status: %d\n", __func__, response.status); - return false; - } - if (existing_size == 0 && response.status != 200) { - LOG_WRN("%s: download received non-successful status code: %d\n", __func__, response.status); - return false; - } - if (total_size == 0 && response.has_header("Content-Length")) { - try { - size_t content_length = std::stoull(response.get_header_value("Content-Length")); - total_size = existing_size + content_length; - } catch (const std::exception &e) { - LOG_WRN("%s: invalid Content-Length header: %s\n", __func__, e.what()); - } - } - return true; - }, - [&](const char *data, size_t len) { - ofs.write(data, len); - if (!ofs) { - LOG_ERR("%s: error writing to file: %s\n", __func__, path_tmp.c_str()); - return false; - } - downloaded += len; - print_progress(downloaded, total_size); - return true; - }, - nullptr - ); - - std::cout << "\n"; - - if (!res) { - LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1); - return false; - } - - return true; -} - -// download one single file from remote URL to local path -static bool common_download_file_single_online(const std::string & url, - const std::string & path, - const std::string & bearer_token) { - static const int max_attempts = 3; - static const int retry_delay_seconds = 2; - - auto [cli, parts] = common_http_client(url); - - httplib::Headers default_headers = {{"User-Agent", "llama-cpp"}}; - if (!bearer_token.empty()) { - default_headers.insert({"Authorization", "Bearer " + bearer_token}); - } - cli.set_default_headers(default_headers); - - const bool file_exists = std::filesystem::exists(path); - - std::string last_etag; - if (file_exists) { - last_etag = read_etag(path); - } else { - LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str()); - } - - for (int i = 0; i < max_attempts; ++i) { - auto head = cli.Head(parts.path); - bool head_ok = head && head->status >= 200 && head->status < 300; - if (!head_ok) { - LOG_WRN("%s: HEAD invalid http status code received: %d\n", __func__, head ? 
head->status : -1); - if (file_exists) { - LOG_INF("%s: Using cached file (HEAD failed): %s\n", __func__, path.c_str()); - return true; - } - } - - std::string etag; - if (head_ok && head->has_header("ETag")) { - etag = head->get_header_value("ETag"); - } - - size_t total_size = 0; - if (head_ok && head->has_header("Content-Length")) { - try { - total_size = std::stoull(head->get_header_value("Content-Length")); - } catch (const std::exception& e) { - LOG_WRN("%s: Invalid Content-Length in HEAD response: %s\n", __func__, e.what()); - } - } - - bool supports_ranges = false; - if (head_ok && head->has_header("Accept-Ranges")) { - supports_ranges = head->get_header_value("Accept-Ranges") != "none"; - } - - bool should_download_from_scratch = false; - if (!last_etag.empty() && !etag.empty() && last_etag != etag) { - LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, - last_etag.c_str(), etag.c_str()); - should_download_from_scratch = true; - } - - if (file_exists) { - if (!should_download_from_scratch) { - LOG_INF("%s: using cached file: %s\n", __func__, path.c_str()); - return true; - } - LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str()); - if (remove(path.c_str()) != 0) { - LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str()); - return false; - } - } - - const std::string path_temporary = path + ".downloadInProgress"; - size_t existing_size = 0; - - if (std::filesystem::exists(path_temporary)) { - if (supports_ranges && !should_download_from_scratch) { - existing_size = std::filesystem::file_size(path_temporary); - } else if (remove(path_temporary.c_str()) != 0) { - LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str()); - return false; - } - } - - // start the download - LOG_INF("%s: trying to download model from %s to %s (etag:%s)...\n", - __func__, common_http_show_masked_url(parts).c_str(), path_temporary.c_str(), etag.c_str()); - const bool was_pull_successful = common_pull_file(cli, parts.path, path_temporary, supports_ranges, existing_size, total_size); - if (!was_pull_successful) { - if (i + 1 < max_attempts) { - const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1000; - LOG_WRN("%s: retrying after %d milliseconds...\n", __func__, exponential_backoff_delay); - std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay)); - } else { - LOG_ERR("%s: download failed after %d attempts\n", __func__, max_attempts); - } - continue; - } - - if (std::rename(path_temporary.c_str(), path.c_str()) != 0) { - LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); - return false; - } - if (!etag.empty()) { - write_etag(path, etag); - } - break; - } - - return true; -} - -std::pair> common_remote_get_content(const std::string & url, - const common_remote_params & params) { - auto [cli, parts] = common_http_client(url); - - httplib::Headers headers = {{"User-Agent", "llama-cpp"}}; - for (const auto & header : params.headers) { - size_t pos = header.find(':'); - if (pos != std::string::npos) { - headers.emplace(header.substr(0, pos), header.substr(pos + 1)); - } else { - headers.emplace(header, ""); - } - } - - if (params.timeout > 0) { - cli.set_read_timeout(params.timeout, 0); - cli.set_write_timeout(params.timeout, 0); - } - - std::vector buf; - auto res = cli.Get(parts.path, headers, - [&](const char *data, size_t len) { - buf.insert(buf.end(), data, data + len); - return params.max_size == 0 || - buf.size() <= 
static_cast(params.max_size); - }, - nullptr - ); - - if (!res) { - throw std::runtime_error("error: cannot make GET request"); - } - - return { res->status, std::move(buf) }; -} - -#endif // LLAMA_USE_CURL - -static bool common_download_file_single(const std::string & url, - const std::string & path, - const std::string & bearer_token, - bool offline) { - if (!offline) { - return common_download_file_single_online(url, path, bearer_token); - } - - if (!std::filesystem::exists(path)) { - LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str()); - return false; - } - - LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str()); - return true; -} - -// download multiple files from remote URLs to local paths -// the input is a vector of pairs -static bool common_download_file_multiple(const std::vector> & urls, const std::string & bearer_token, bool offline) { - // Prepare download in parallel - std::vector> futures_download; - for (auto const & item : urls) { - futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair & it) -> bool { - return common_download_file_single(it.first, it.second, bearer_token, offline); - }, item)); - } - - // Wait for all downloads to complete - for (auto & f : futures_download) { - if (!f.get()) { - return false; - } - } - - return true; -} - -static bool common_download_model( - const common_params_model & model, - const std::string & bearer_token, - bool offline) { - // Basic validation of the model.url - if (model.url.empty()) { - LOG_ERR("%s: invalid model url\n", __func__); - return false; - } - - if (!common_download_file_single(model.url, model.path, bearer_token, offline)) { - return false; - } - - // check for additional GGUFs split to download - int n_split = 0; - { - struct gguf_init_params gguf_params = { - /*.no_alloc = */ true, - /*.ctx = */ NULL, - }; - auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params); - if (!ctx_gguf) { - LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str()); - return false; - } - - auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT); - if (key_n_split >= 0) { - n_split = gguf_get_val_u16(ctx_gguf, key_n_split); - } - - gguf_free(ctx_gguf); - } - - if (n_split > 1) { - char split_prefix[PATH_MAX] = {0}; - char split_url_prefix[LLAMA_MAX_URL_LENGTH] = {0}; - - // Verify the first split file format - // and extract split URL and PATH prefixes - { - if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) { - LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split); - return false; - } - - if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model.url.c_str(), 0, n_split)) { - LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model.url.c_str(), n_split); - return false; - } - } - - std::vector> urls; - for (int idx = 1; idx < n_split; idx++) { - char split_path[PATH_MAX] = {0}; - llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split); - - char split_url[LLAMA_MAX_URL_LENGTH] = {0}; - llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split); - - if (std::string(split_path) == model.path) { - continue; // skip the already downloaded file - } - - urls.push_back({split_url, split_path}); - } - - // Download in parallel - common_download_file_multiple(urls, bearer_token, offline); - } - - return true; -} - -/** - * Allow 
getting the HF file from the HF repo with tag (like ollama), for example: - * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4 - * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M - * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s - * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo) - * - * Return pair of (with "repo" already having tag removed) - * - * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files. - */ -static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) { - auto parts = string_split(hf_repo_with_tag, ':'); - std::string tag = parts.size() > 1 ? parts.back() : "latest"; - std::string hf_repo = parts[0]; - if (string_split(hf_repo, '/').size() != 2) { - throw std::invalid_argument("error: invalid HF repo format, expected /[:quant]\n"); - } - - std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag; - - // headers - std::vector headers; - headers.push_back("Accept: application/json"); - if (!bearer_token.empty()) { - headers.push_back("Authorization: Bearer " + bearer_token); - } - // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response - // User-Agent header is already set in common_remote_get_content, no need to set it here - - // we use "=" to avoid clashing with other component, while still being allowed on windows - std::string cached_response_fname = "manifest=" + hf_repo + "=" + tag + ".json"; - string_replace_all(cached_response_fname, "/", "_"); - std::string cached_response_path = fs_get_cache_file(cached_response_fname); - - // make the request - common_remote_params params; - params.headers = headers; - long res_code = 0; - std::string res_str; - bool use_cache = false; - if (!offline) { - try { - auto res = common_remote_get_content(url, params); - res_code = res.first; - res_str = std::string(res.second.data(), res.second.size()); - } catch (const std::exception & e) { - LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what()); - } - } - if (res_code == 0) { - if (std::filesystem::exists(cached_response_path)) { - LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str()); - res_str = read_file(cached_response_path); - res_code = 200; - use_cache = true; - } else { - throw std::runtime_error( - offline ? 
"error: failed to get manifest (offline mode)" - : "error: failed to get manifest (check your internet connection)"); - } - } - std::string ggufFile; - std::string mmprojFile; - - if (res_code == 200 || res_code == 304) { - try { - auto j = json::parse(res_str); - - if (j.contains("ggufFile") && j["ggufFile"].contains("rfilename")) { - ggufFile = j["ggufFile"]["rfilename"].get(); - } - if (j.contains("mmprojFile") && j["mmprojFile"].contains("rfilename")) { - mmprojFile = j["mmprojFile"]["rfilename"].get(); - } - } catch (const std::exception & e) { - throw std::runtime_error(std::string("error parsing manifest JSON: ") + e.what()); - } - if (!use_cache) { - // if not using cached response, update the cache file - write_file(cached_response_path, res_str); - } - } else if (res_code == 401) { - throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token"); - } else { - throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str())); - } - - // check response - if (ggufFile.empty()) { - throw std::runtime_error("error: model does not have ggufFile"); - } - - return { hf_repo, ggufFile, mmprojFile }; -} - -// -// Docker registry functions -// - -static std::string common_docker_get_token(const std::string & repo) { - std::string url = "https://auth.docker.io/token?service=registry.docker.io&scope=repository:" + repo + ":pull"; - - common_remote_params params; - auto res = common_remote_get_content(url, params); - - if (res.first != 200) { - throw std::runtime_error("Failed to get Docker registry token, HTTP code: " + std::to_string(res.first)); - } - - std::string response_str(res.second.begin(), res.second.end()); - nlohmann::ordered_json response = nlohmann::ordered_json::parse(response_str); - - if (!response.contains("token")) { - throw std::runtime_error("Docker registry token response missing 'token' field"); - } - - return response["token"].get(); -} - -static std::string common_docker_resolve_model(const std::string & docker) { - // Parse ai/smollm2:135M-Q4_0 - size_t colon_pos = docker.find(':'); - std::string repo, tag; - if (colon_pos != std::string::npos) { - repo = docker.substr(0, colon_pos); - tag = docker.substr(colon_pos + 1); - } else { - repo = docker; - tag = "latest"; - } - - // ai/ is the default - size_t slash_pos = docker.find('/'); - if (slash_pos == std::string::npos) { - repo.insert(0, "ai/"); - } - - LOG_INF("%s: Downloading Docker Model: %s:%s\n", __func__, repo.c_str(), tag.c_str()); - try { - // --- helper: digest validation --- - auto validate_oci_digest = [](const std::string & digest) -> std::string { - // Expected: algo:hex ; start with sha256 (64 hex chars) - // You can extend this map if supporting other algorithms in future. 
-            static const std::regex re("^sha256:([a-fA-F0-9]{64})$");
-            std::smatch m;
-            if (!std::regex_match(digest, m, re)) {
-                throw std::runtime_error("Invalid OCI digest format received in manifest: " + digest);
-            }
-            // normalize hex to lowercase
-            std::string normalized = digest;
-            std::transform(normalized.begin()+7, normalized.end(), normalized.begin()+7, [](unsigned char c){
-                return std::tolower(c);
-            });
-            return normalized;
-        };
-
-        std::string token = common_docker_get_token(repo);  // Get authentication token
-
-        // Get manifest
-        const std::string url_prefix = "https://registry-1.docker.io/v2/" + repo;
-        std::string manifest_url = url_prefix + "/manifests/" + tag;
-        common_remote_params manifest_params;
-        manifest_params.headers.push_back("Authorization: Bearer " + token);
-        manifest_params.headers.push_back(
-            "Accept: application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json");
-        auto manifest_res = common_remote_get_content(manifest_url, manifest_params);
-        if (manifest_res.first != 200) {
-            throw std::runtime_error("Failed to get Docker manifest, HTTP code: " + std::to_string(manifest_res.first));
-        }
-
-        std::string manifest_str(manifest_res.second.begin(), manifest_res.second.end());
-        nlohmann::ordered_json manifest = nlohmann::ordered_json::parse(manifest_str);
-        std::string gguf_digest;  // Find the GGUF layer
-        if (manifest.contains("layers")) {
-            for (const auto & layer : manifest["layers"]) {
-                if (layer.contains("mediaType")) {
-                    std::string media_type = layer["mediaType"].get<std::string>();
-                    if (media_type == "application/vnd.docker.ai.gguf.v3" ||
-                        media_type.find("gguf") != std::string::npos) {
-                        gguf_digest = layer["digest"].get<std::string>();
-                        break;
-                    }
-                }
-            }
-        }
-
-        if (gguf_digest.empty()) {
-            throw std::runtime_error("No GGUF layer found in Docker manifest");
-        }
-
-        // Validate & normalize digest
-        gguf_digest = validate_oci_digest(gguf_digest);
-        LOG_DBG("%s: Using validated digest: %s\n", __func__, gguf_digest.c_str());
-
-        // Prepare local filename
-        std::string model_filename = repo;
-        std::replace(model_filename.begin(), model_filename.end(), '/', '_');
-        model_filename += "_" + tag + ".gguf";
-        std::string local_path = fs_get_cache_file(model_filename);
-
-        const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
-        if (!common_download_file_single(blob_url, local_path, token, false)) {
-            throw std::runtime_error("Failed to download Docker Model");
-        }
-
-        LOG_INF("%s: Downloaded Docker Model to: %s\n", __func__, local_path.c_str());
-        return local_path;
-    } catch (const std::exception & e) {
-        LOG_ERR("%s: Docker Model download failed: %s\n", __func__, e.what());
-        throw;
-    }
-}
-
-//
-// utils
-//
diff --git a/common/arg.h b/common/arg.h
index 77997c4ef..7ab7e2cea 100644
--- a/common/arg.h
+++ b/common/arg.h
@@ -59,8 +59,8 @@ struct common_arg {
     common_arg & set_sparam();
     bool in_example(enum llama_example ex);
     bool is_exclude(enum llama_example ex);
-    bool get_value_from_env(std::string & output);
-    bool has_value_from_env();
+    bool get_value_from_env(std::string & output) const;
+    bool has_value_from_env() const;
     std::string to_string();
 };
diff --git a/common/download.cpp b/common/download.cpp
new file mode 100644
index 000000000..02d75fc0d
--- /dev/null
+++ b/common/download.cpp
@@ -0,0 +1,1014 @@
+#include "arg.h"
+
+#include "common.h"
+#include "gguf.h" // for reading GGUF splits
+#include "log.h"
+#include "download.h"
+
+#define JSON_ASSERT GGML_ASSERT
+#include <nlohmann/json.hpp>
+
+#include <cmath>
+#include <filesystem>
+#include <fstream>
+#include <future>
+#include <iomanip>
+#include <iostream>
+#include <regex>
+#include <string>
+#include <thread>
+#include <vector>
+
+#if defined(LLAMA_USE_CURL)
+#include <curl/curl.h>
+#include <curl/easy.h>
+#else
+#include "http.h"
+#endif
+
+#ifdef __linux__
+#include <linux/limits.h>
+#elif defined(_WIN32)
+#   if !defined(PATH_MAX)
+#       define PATH_MAX MAX_PATH
+#   endif
+#elif defined(_AIX)
+#include <sys/limits.h>
+#else
+#include <sys/syslimits.h>
+#endif
+#define LLAMA_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+
+// isatty
+#if defined(_WIN32)
+#include <io.h>
+#else
+#include <unistd.h>
+#endif
+
+using json = nlohmann::ordered_json;
+
+//
+// downloader
+//
+
+static std::string read_file(const std::string & fname) {
+    std::ifstream file(fname);
+    if (!file) {
+        throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
+    }
+    std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+    file.close();
+    return content;
+}
+
+static void write_file(const std::string & fname, const std::string & content) {
+    const std::string fname_tmp = fname + ".tmp";
+    std::ofstream file(fname_tmp);
+    if (!file) {
+        throw std::runtime_error(string_format("error: failed to open file '%s'\n", fname.c_str()));
+    }
+
+    try {
+        file << content;
+        file.close();
+
+        // Makes write atomic
+        if (rename(fname_tmp.c_str(), fname.c_str()) != 0) {
+            LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, fname_tmp.c_str(), fname.c_str());
+            // If rename fails, try to delete the temporary file
+            if (remove(fname_tmp.c_str()) != 0) {
+                LOG_ERR("%s: unable to delete temporary file: %s\n", __func__, fname_tmp.c_str());
+            }
+        }
+    } catch (...) {
+        // If anything fails, try to delete the temporary file
+        if (remove(fname_tmp.c_str()) != 0) {
+            LOG_ERR("%s: unable to delete temporary file: %s\n", __func__, fname_tmp.c_str());
+        }
+
+        throw std::runtime_error(string_format("error: failed to write file '%s'\n", fname.c_str()));
+    }
+}
+
+static void write_etag(const std::string & path, const std::string & etag) {
+    const std::string etag_path = path + ".etag";
+    write_file(etag_path, etag);
+    LOG_DBG("%s: file etag saved: %s\n", __func__, etag_path.c_str());
+}
+
+static std::string read_etag(const std::string & path) {
+    std::string none;
+    const std::string etag_path = path + ".etag";
+
+    if (std::filesystem::exists(etag_path)) {
+        std::ifstream etag_in(etag_path);
+        if (!etag_in) {
+            LOG_ERR("%s: could not open .etag file for reading: %s\n", __func__, etag_path.c_str());
+            return none;
+        }
+        std::string etag;
+        std::getline(etag_in, etag);
+        return etag;
+    }
+
+    // no etag file, but maybe there is an old .json
+    // remove this code later
+    const std::string metadata_path = path + ".json";
+
+    if (std::filesystem::exists(metadata_path)) {
+        std::ifstream metadata_in(metadata_path);
+        try {
+            nlohmann::json metadata_json;
+            metadata_in >> metadata_json;
+            LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(),
+                    metadata_json.dump().c_str());
+            if (metadata_json.contains("etag") && metadata_json.at("etag").is_string()) {
+                std::string etag = metadata_json.at("etag");
+                write_etag(path, etag);
+                if (!std::filesystem::remove(metadata_path)) {
+                    LOG_WRN("%s: failed to delete old .json metadata file: %s\n", __func__, metadata_path.c_str());
+                }
+                return etag;
+            }
+        } catch (const nlohmann::json::exception & e) {
+            LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+        }
+    }
+    return none;
+}
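+
+// note: a model's cache validator lives in a "<path>.etag" side file next to
+// the download itself; read_etag() above also migrates the legacy "<path>.json"
+// metadata files written by older versions, so existing caches keep working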
+
+#ifdef LLAMA_USE_CURL
+
+//
+// CURL utils
+//
+
+using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
+
+// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
+struct curl_slist_ptr {
+    struct curl_slist * ptr = nullptr;
+    ~curl_slist_ptr() {
+        if (ptr) {
+            curl_slist_free_all(ptr);
+        }
+    }
+};
+
+static CURLcode common_curl_perf(CURL * curl) {
+    CURLcode res = curl_easy_perform(curl);
+    if (res != CURLE_OK) {
+        LOG_ERR("%s: curl_easy_perform() failed\n", __func__);
+    }
+
+    return res;
+}
+
+// Send a HEAD request to retrieve the etag and last-modified headers
+struct common_load_model_from_url_headers {
+    std::string etag;
+    std::string last_modified;
+    std::string accept_ranges;
+};
+
+struct FILE_deleter {
+    void operator()(FILE * f) const { fclose(f); }
+};
+
+static size_t common_header_callback(char * buffer, size_t, size_t n_items, void * userdata) {
+    common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
+    static std::regex header_regex("([^:]+): (.*)\r\n");
+    static std::regex etag_regex("ETag", std::regex_constants::icase);
+    static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
+    static std::regex accept_ranges_regex("Accept-Ranges", std::regex_constants::icase);
+    std::string header(buffer, n_items);
+    std::smatch match;
+    if (std::regex_match(header, match, header_regex)) {
+        const std::string & key   = match[1];
+        const std::string & value = match[2];
+        if (std::regex_match(key, match, etag_regex)) {
+            headers->etag = value;
+        } else if (std::regex_match(key, match, last_modified_regex)) {
+            headers->last_modified = value;
+        } else if (std::regex_match(key, match, accept_ranges_regex)) {
+            headers->accept_ranges = value;
+        }
+    }
+
+    return n_items;
+}
+
+static size_t common_write_callback(void * data, size_t size, size_t nmemb, void * fd) {
+    return std::fwrite(data, size, nmemb, static_cast<FILE *>(fd));
+}
+
+// helper function to hide password in URL
+static std::string llama_download_hide_password_in_url(const std::string & url) {
+    // Use regex to match and replace the user[:password]@ pattern in URLs
+    // Pattern: scheme://[user[:password]@]host[...]
+    static const std::regex url_regex(R"(^([A-Za-z][A-Za-z0-9+.-]*://)([^/@]+@)(.*)$)");
+    std::smatch match;
+
+    if (std::regex_match(url, match, url_regex)) {
+        // match[1] = scheme (e.g., "https://")
+        // match[2] = user[:password]@ part
+        // match[3] = rest of URL (host and path)
+        return match[1].str() + "********@" + match[3].str();
+    }
+
+    return url; // No credentials found or malformed URL
+}
+
+static void common_curl_easy_setopt_head(CURL * curl, const std::string & url) {
+    // Set the URL, allow to follow http redirection
+    curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+
+#  if defined(_WIN32)
+    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
+    // operating system. Currently implemented under MS-Windows.
+    curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#  endif
+
+    curl_easy_setopt(curl, CURLOPT_NOBODY, 1L);     // will trigger the HEAD verb
+    curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress
+    curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, common_header_callback);
+}
+
+static void common_curl_easy_setopt_get(CURL * curl) {
+    curl_easy_setopt(curl, CURLOPT_NOBODY, 0L);
+    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, common_write_callback);
+
+    // display download progress
+    curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
+}
+
+static bool common_pull_file(CURL * curl, const std::string & path_temporary) {
+    if (std::filesystem::exists(path_temporary)) {
+        const std::string partial_size = std::to_string(std::filesystem::file_size(path_temporary));
+        LOG_INF("%s: server supports range requests, resuming download from byte %s\n", __func__, partial_size.c_str());
+        const std::string range_str = partial_size + "-";
+        curl_easy_setopt(curl, CURLOPT_RANGE, range_str.c_str());
+    }
+
+    // always open the file in append mode, since we could be resuming a partial download
+    std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "ab"));
+    if (!outfile) {
+        LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path_temporary.c_str());
+        return false;
+    }
+
+    common_curl_easy_setopt_get(curl);
+    curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile.get());
+
+    return common_curl_perf(curl) == CURLE_OK;
+}
+
+static bool common_download_head(CURL * curl,
+                                 curl_slist_ptr & http_headers,
+                                 const std::string & url,
+                                 const std::string & bearer_token) {
+    if (!curl) {
+        LOG_ERR("%s: error initializing libcurl\n", __func__);
+        return false;
+    }
+
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    // Check if hf-token or bearer-token was specified
+    if (!bearer_token.empty()) {
+        std::string auth_header = "Authorization: Bearer " + bearer_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+    }
+
+    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, http_headers.ptr);
+    common_curl_easy_setopt_head(curl, url);
+    return common_curl_perf(curl) == CURLE_OK;
+}
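+
+// overall flow: a HEAD request fetches the server's ETag and Accept-Ranges
+// headers; the download below is skipped when the cached ETag still matches,
+// restarted from scratch when it differs, and resumed from a partial
+// ".downloadInProgress" file when the server supports byte ranges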
+
+// download one single file from remote URL to local path
+static bool common_download_file_single_online(const std::string & url,
+                                               const std::string & path,
+                                               const std::string & bearer_token) {
+    static const int max_attempts        = 3;
+    static const int retry_delay_seconds = 2;
+    for (int i = 0; i < max_attempts; ++i) {
+        std::string etag;
+
+        // Check if the file already exists locally
+        const auto file_exists = std::filesystem::exists(path);
+        if (file_exists) {
+            etag = read_etag(path);
+        } else {
+            LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
+        }
+
+        bool head_request_ok = false;
+        bool should_download = !file_exists; // by default, we should download if the file does not exist
+
+        // Initialize libcurl
+        curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+        common_load_model_from_url_headers headers;
+        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
+        curl_slist_ptr http_headers;
+        const bool was_perform_successful = common_download_head(curl.get(), http_headers, url, bearer_token);
+        if (!was_perform_successful) {
+            head_request_ok = false;
+        }
+
+        long http_code = 0;
+        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+        if (http_code == 200) {
+            head_request_ok = true;
+        } else {
+            LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
+            head_request_ok = false;
+        }
+
+        // if head_request_ok is false, we don't have the etag or last-modified headers
+        // we leave should_download as-is, which is true if the file does not exist
+        bool should_download_from_scratch = false;
+        if (head_request_ok) {
+            // check if ETag or Last-Modified headers are different
+            // if it is, we need to download the file again
+            if (!etag.empty() && etag != headers.etag) {
+                LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(),
+                        headers.etag.c_str());
+                should_download              = true;
+                should_download_from_scratch = true;
+            }
+        }
+
+        const bool accept_ranges_supported = !headers.accept_ranges.empty() && headers.accept_ranges != "none";
+        if (should_download) {
+            if (file_exists &&
+                !accept_ranges_supported) { // Resumable downloads not supported, delete and start again.
+                LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
+                if (remove(path.c_str()) != 0) {
+                    LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
+                    return false;
+                }
+            }
+
+            const std::string path_temporary = path + ".downloadInProgress";
+            if (should_download_from_scratch) {
+                if (std::filesystem::exists(path_temporary)) {
+                    if (remove(path_temporary.c_str()) != 0) {
+                        LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
+                        return false;
+                    }
+                }
+
+                if (std::filesystem::exists(path)) {
+                    if (remove(path.c_str()) != 0) {
+                        LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
+                        return false;
+                    }
+                }
+            }
+            if (head_request_ok) {
+                write_etag(path, headers.etag);
+            }
+
+            // start the download
+            LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n",
+                    __func__, llama_download_hide_password_in_url(url).c_str(), path_temporary.c_str(),
+                    headers.etag.c_str(), headers.last_modified.c_str());
+            const bool was_pull_successful = common_pull_file(curl.get(), path_temporary);
+            if (!was_pull_successful) {
+                if (i + 1 < max_attempts) {
+                    // exponential backoff: 2^i seconds between attempts
+                    const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1000;
+                    LOG_WRN("%s: retrying after %d milliseconds...\n", __func__, exponential_backoff_delay);
+                    std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
+                } else {
+                    LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
+                }
+
+                continue;
+            }
+
+            long http_code = 0;
+            curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+            if (http_code < 200 || http_code >= 400) {
+                LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
+                return false;
+            }
+
+            if (rename(path_temporary.c_str(), path.c_str()) != 0) {
+                LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
+                return false;
+            }
+        } else {
+            LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
+        }
+
+        break;
+    }
+
+    return true;
+}
+
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
+    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    std::vector<char> res_buffer;
+
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
+    curl_easy_setopt(curl.get(), CURLOPT_VERBOSE, 1L);
+    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+        auto data_vec = static_cast<std::vector<char> *>(data);
+        data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb);
+        return size * nmemb;
+    };
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer);
+#if defined(_WIN32)
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+    if (params.timeout > 0) {
+        curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout);
+    }
+    if (params.max_size > 0) {
+        curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size);
+    }
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    for (const auto & header : params.headers) {
+        http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str());
+    }
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+    CURLcode res = curl_easy_perform(curl.get());
+
+    if (res != CURLE_OK) {
+        std::string error_msg = curl_easy_strerror(res);
+        throw std::runtime_error("error: cannot make GET request: " + error_msg);
+    }
+
+    long res_code;
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+
+    return { res_code, std::move(res_buffer) };
+}
+
+#else
+
+static bool is_output_a_tty() {
+#if defined(_WIN32)
+    return _isatty(_fileno(stdout));
+#else
+    return isatty(1);
+#endif
+}
+
+static void print_progress(size_t current, size_t total) {
+    if (!is_output_a_tty()) {
+        return;
+    }
+
+    if (!total) {
+        return;
+    }
+
+    size_t width = 50;
+    size_t pct   = (100 * current) / total;
+    size_t pos   = (width * current) / total;
+
+    std::cout << "["
+              << std::string(pos, '=')
+              << (pos < width ? ">" : "")
+              << std::string(width - pos, ' ')
+              << "] " << std::setw(3) << pct << "% ("
+              << current / (1024 * 1024) << " MB / "
+              << total / (1024 * 1024) << " MB)\r";
+    std::cout.flush();
+}
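+
+// resume semantics: when a partial file exists we send "Range: bytes=N-" and
+// require a 206 Partial Content reply; a plain 200 would mean the server
+// ignored the range, and appending the full body would corrupt the file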
+
+static bool common_pull_file(httplib::Client & cli,
+                             const std::string & resolve_path,
+                             const std::string & path_tmp,
+                             bool supports_ranges,
+                             size_t existing_size,
+                             size_t & total_size) {
+    std::ofstream ofs(path_tmp, std::ios::binary | std::ios::app);
+    if (!ofs.is_open()) {
+        LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path_tmp.c_str());
+        return false;
+    }
+
+    httplib::Headers headers;
+    if (supports_ranges && existing_size > 0) {
+        headers.emplace("Range", "bytes=" + std::to_string(existing_size) + "-");
+    }
+
+    std::atomic<size_t> downloaded{existing_size};
+
+    auto res = cli.Get(resolve_path, headers,
+        [&](const httplib::Response & response) {
+            if (existing_size > 0 && response.status != 206) {
+                LOG_WRN("%s: server did not respond with 206 Partial Content for a resume request. Status: %d\n", __func__, response.status);
+                return false;
+            }
+            if (existing_size == 0 && response.status != 200) {
+                LOG_WRN("%s: download received non-successful status code: %d\n", __func__, response.status);
+                return false;
+            }
+            if (total_size == 0 && response.has_header("Content-Length")) {
+                try {
+                    size_t content_length = std::stoull(response.get_header_value("Content-Length"));
+                    total_size = existing_size + content_length;
+                } catch (const std::exception & e) {
+                    LOG_WRN("%s: invalid Content-Length header: %s\n", __func__, e.what());
+                }
+            }
+            return true;
+        },
+        [&](const char * data, size_t len) {
+            ofs.write(data, len);
+            if (!ofs) {
+                LOG_ERR("%s: error writing to file: %s\n", __func__, path_tmp.c_str());
+                return false;
+            }
+            downloaded += len;
+            print_progress(downloaded, total_size);
+            return true;
+        },
+        nullptr
+    );
+
+    std::cout << "\n";
+
+    if (!res) {
+        LOG_ERR("%s: error during download. Status: %d\n", __func__, res ? res->status : -1);
+        return false;
+    }
+
+    return true;
+}
+
+// download one single file from remote URL to local path
+static bool common_download_file_single_online(const std::string & url,
+                                               const std::string & path,
+                                               const std::string & bearer_token) {
+    static const int max_attempts        = 3;
+    static const int retry_delay_seconds = 2;
+
+    auto [cli, parts] = common_http_client(url);
+
+    httplib::Headers default_headers = {{"User-Agent", "llama-cpp"}};
+    if (!bearer_token.empty()) {
+        default_headers.insert({"Authorization", "Bearer " + bearer_token});
+    }
+    cli.set_default_headers(default_headers);
+
+    const bool file_exists = std::filesystem::exists(path);
+
+    std::string last_etag;
+    if (file_exists) {
+        last_etag = read_etag(path);
+    } else {
+        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
+    }
+
+    for (int i = 0; i < max_attempts; ++i) {
+        auto head = cli.Head(parts.path);
+        bool head_ok = head && head->status >= 200 && head->status < 300;
+        if (!head_ok) {
+            LOG_WRN("%s: HEAD invalid http status code received: %d\n", __func__, head ? head->status : -1);
+            if (file_exists) {
+                LOG_INF("%s: Using cached file (HEAD failed): %s\n", __func__, path.c_str());
+                return true;
+            }
+        }
+
+        std::string etag;
+        if (head_ok && head->has_header("ETag")) {
+            etag = head->get_header_value("ETag");
+        }
+
+        size_t total_size = 0;
+        if (head_ok && head->has_header("Content-Length")) {
+            try {
+                total_size = std::stoull(head->get_header_value("Content-Length"));
+            } catch (const std::exception & e) {
+                LOG_WRN("%s: Invalid Content-Length in HEAD response: %s\n", __func__, e.what());
+            }
+        }
+
+        bool supports_ranges = false;
+        if (head_ok && head->has_header("Accept-Ranges")) {
+            supports_ranges = head->get_header_value("Accept-Ranges") != "none";
+        }
+
+        bool should_download_from_scratch = false;
+        if (!last_etag.empty() && !etag.empty() && last_etag != etag) {
+            LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__,
+                    last_etag.c_str(), etag.c_str());
+            should_download_from_scratch = true;
+        }
+
+        if (file_exists) {
+            if (!should_download_from_scratch) {
+                LOG_INF("%s: using cached file: %s\n", __func__, path.c_str());
+                return true;
+            }
+            LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
+            if (remove(path.c_str()) != 0) {
+                LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
+                return false;
+            }
+        }
+
+        const std::string path_temporary = path + ".downloadInProgress";
+        size_t existing_size = 0;
+
+        if (std::filesystem::exists(path_temporary)) {
+            if (supports_ranges && !should_download_from_scratch) {
+                existing_size = std::filesystem::file_size(path_temporary);
+            } else if (remove(path_temporary.c_str()) != 0) {
+                LOG_ERR("%s: unable to delete file: %s\n", __func__, path_temporary.c_str());
+                return false;
+            }
+        }
+
+        // start the download
+        LOG_INF("%s: trying to download model from %s to %s (etag:%s)...\n",
+                __func__, common_http_show_masked_url(parts).c_str(), path_temporary.c_str(), etag.c_str());
+        const bool was_pull_successful = common_pull_file(cli, parts.path, path_temporary, supports_ranges, existing_size, total_size);
+        if (!was_pull_successful) {
+            if (i + 1 < max_attempts) {
+                // exponential backoff: 2^i seconds between attempts
+                const int exponential_backoff_delay = std::pow(retry_delay_seconds, i) * 1000;
+                LOG_WRN("%s: retrying after %d milliseconds...\n", __func__, exponential_backoff_delay);
+                std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
+            } else {
+                LOG_ERR("%s: download failed after %d attempts\n", __func__, max_attempts);
+            }
+            continue;
+        }
+
+        if (std::rename(path_temporary.c_str(), path.c_str()) != 0) {
+            LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
+            return false;
+        }
+        if (!etag.empty()) {
+            write_etag(path, etag);
+        }
+        break;
+    }
+
+    return true;
+}
+
+std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url,
+                                                             const common_remote_params & params) {
+    auto [cli, parts] = common_http_client(url);
+
+    httplib::Headers headers = {{"User-Agent", "llama-cpp"}};
+    for (const auto & header : params.headers) {
+        size_t pos = header.find(':');
+        if (pos != std::string::npos) {
+            headers.emplace(header.substr(0, pos), header.substr(pos + 1));
+        } else {
+            headers.emplace(header, "");
+        }
+    }
+
+    if (params.timeout > 0) {
+        cli.set_read_timeout(params.timeout, 0);
+        cli.set_write_timeout(params.timeout, 0);
+    }
+
+    std::vector<char> buf;
+    auto res = cli.Get(parts.path, headers,
+        [&](const char * data, size_t len) {
+            buf.insert(buf.end(), data, data + len);
+            return params.max_size == 0 ||
+                   buf.size() <= static_cast<size_t>(params.max_size);
+        },
+        nullptr
+    );
+
+    if (!res) {
+        throw std::runtime_error("error: cannot make GET request");
+    }
+
+    return { res->status, std::move(buf) };
+}
+
+#endif // LLAMA_USE_CURL
+
+static bool common_download_file_single(const std::string & url,
+                                        const std::string & path,
+                                        const std::string & bearer_token,
+                                        bool offline) {
+    if (!offline) {
+        return common_download_file_single_online(url, path, bearer_token);
+    }
+
+    if (!std::filesystem::exists(path)) {
+        LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
+        return false;
+    }
+
+    LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
+    return true;
+}
+
+// download multiple files from remote URLs to local paths
+// the input is a vector of pairs <url, path>
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
+    // Prepare download in parallel
+    std::vector<std::future<bool>> futures_download;
+    for (auto const & item : urls) {
+        futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
+            return common_download_file_single(it.first, it.second, bearer_token, offline);
+        }, item));
+    }
+
+    // Wait for all downloads to complete
+    for (auto & f : futures_download) {
+        if (!f.get()) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+bool common_download_model(
+        const common_params_model & model,
+        const std::string & bearer_token,
+        bool offline) {
+    // Basic validation of the model.url
+    if (model.url.empty()) {
+        LOG_ERR("%s: invalid model url\n", __func__);
+        return false;
+    }
+
+    if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
+        return false;
+    }
+
+    // check for additional GGUF splits to download
+    int n_split = 0;
+    {
+        struct gguf_init_params gguf_params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ NULL,
+        };
+        auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params);
+        if (!ctx_gguf) {
+            LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str());
+            return false;
+        }
+
+        auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
+        if (key_n_split >= 0) {
+            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
+        }
+
+        gguf_free(ctx_gguf);
+    }
+
+    if (n_split > 1) {
+        char split_prefix[PATH_MAX] = {0};
+        char split_url_prefix[LLAMA_MAX_URL_LENGTH] = {0};
+
+        // Verify the first split file format
+        // and extract split URL and PATH prefixes
+        {
+            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) {
+                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split);
+                return false;
+            }
+
+            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model.url.c_str(), 0, n_split)) {
+                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model.url.c_str(), n_split);
+                return false;
+            }
+        }
+
+        std::vector<std::pair<std::string, std::string>> urls;
+        for (int idx = 1; idx < n_split; idx++) {
+            char split_path[PATH_MAX] = {0};
+            llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
+
+            char split_url[LLAMA_MAX_URL_LENGTH] = {0};
+            llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);
+
+            if (std::string(split_path) == model.path) {
+                continue; // skip the already downloaded file
+            }
+
+            urls.push_back({split_url, split_path});
+        }
+
+        // Download in parallel
+        common_download_file_multiple(urls, bearer_token, offline);
+    }
+
+    return true;
+}
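+
+// resolve a "<user>/<model>[:quant]" repo string to concrete gguf/mmproj file
+// names via the HF manifest endpoint; the manifest response is cached on disk
+// so offline runs can still resolve previously used repos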
+common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
+    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+    std::string tag = parts.size() > 1 ? parts.back() : "latest";
+    std::string hf_repo = parts[0];
+    if (string_split<std::string>(hf_repo, '/').size() != 2) {
+        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+    }
+
+    std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;
+
+    // headers
+    std::vector<std::string> headers;
+    headers.push_back("Accept: application/json");
+    if (!bearer_token.empty()) {
+        headers.push_back("Authorization: Bearer " + bearer_token);
+    }
+    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
+    // User-Agent header is already set in common_remote_get_content, no need to set it here
+
+    // we use "=" to avoid clashing with other components, while still being allowed on windows
+    std::string cached_response_fname = "manifest=" + hf_repo + "=" + tag + ".json";
+    string_replace_all(cached_response_fname, "/", "_");
+    std::string cached_response_path = fs_get_cache_file(cached_response_fname);
+
+    // make the request
+    common_remote_params params;
+    params.headers = headers;
+    long res_code = 0;
+    std::string res_str;
+    bool use_cache = false;
+    if (!offline) {
+        try {
+            auto res = common_remote_get_content(url, params);
+            res_code = res.first;
+            res_str = std::string(res.second.data(), res.second.size());
+        } catch (const std::exception & e) {
+            LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what());
+        }
+    }
+    if (res_code == 0) {
+        if (std::filesystem::exists(cached_response_path)) {
+            LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str());
+            res_str = read_file(cached_response_path);
+            res_code = 200;
+            use_cache = true;
+        } else {
+            throw std::runtime_error(
+                offline ? "error: failed to get manifest (offline mode)"
+                        : "error: failed to get manifest (check your internet connection)");
+        }
+    }
+    std::string ggufFile;
+    std::string mmprojFile;
+
+    if (res_code == 200 || res_code == 304) {
+        try {
+            auto j = json::parse(res_str);
+
+            if (j.contains("ggufFile") && j["ggufFile"].contains("rfilename")) {
+                ggufFile = j["ggufFile"]["rfilename"].get<std::string>();
+            }
+            if (j.contains("mmprojFile") && j["mmprojFile"].contains("rfilename")) {
+                mmprojFile = j["mmprojFile"]["rfilename"].get<std::string>();
+            }
+        } catch (const std::exception & e) {
+            throw std::runtime_error(std::string("error parsing manifest JSON: ") + e.what());
+        }
+        if (!use_cache) {
+            // if not using cached response, update the cache file
+            write_file(cached_response_path, res_str);
+        }
+    } else if (res_code == 401) {
+        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
+    } else {
+        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+    }
+
+    // check response
+    if (ggufFile.empty()) {
+        throw std::runtime_error("error: model does not have ggufFile");
+    }
+
+    return { hf_repo, ggufFile, mmprojFile };
+}
+
+//
+// Docker registry functions
+//
+
+static std::string common_docker_get_token(const std::string & repo) {
+    std::string url = "https://auth.docker.io/token?service=registry.docker.io&scope=repository:" + repo + ":pull";
+
+    common_remote_params params;
+    auto res = common_remote_get_content(url, params);
+
+    if (res.first != 200) {
+        throw std::runtime_error("Failed to get Docker registry token, HTTP code: " + std::to_string(res.first));
+    }
+
+    std::string response_str(res.second.begin(), res.second.end());
+    nlohmann::ordered_json response = nlohmann::ordered_json::parse(response_str);
+
+    if (!response.contains("token")) {
+        throw std::runtime_error("Docker registry token response missing 'token' field");
+    }
+
+    return response["token"].get<std::string>();
+}
+
+std::string common_docker_resolve_model(const std::string & docker) {
+    // Parse ai/smollm2:135M-Q4_0
+    size_t colon_pos = docker.find(':');
+    std::string repo, tag;
+    if (colon_pos != std::string::npos) {
+        repo = docker.substr(0, colon_pos);
+        tag = docker.substr(colon_pos + 1);
+    } else {
+        repo = docker;
+        tag = "latest";
+    }
+
+    // ai/ is the default
+    size_t slash_pos = docker.find('/');
+    if (slash_pos == std::string::npos) {
+        repo.insert(0, "ai/");
+    }
+
+    LOG_INF("%s: Downloading Docker Model: %s:%s\n", __func__, repo.c_str(), tag.c_str());
+    try {
+        // --- helper: digest validation ---
+        auto validate_oci_digest = [](const std::string & digest) -> std::string {
+            // Expected: algo:hex ; start with sha256 (64 hex chars)
+            // You can extend this map if supporting other algorithms in future.
+            static const std::regex re("^sha256:([a-fA-F0-9]{64})$");
+            std::smatch m;
+            if (!std::regex_match(digest, m, re)) {
+                throw std::runtime_error("Invalid OCI digest format received in manifest: " + digest);
+            }
+            // normalize hex to lowercase
+            std::string normalized = digest;
+            std::transform(normalized.begin()+7, normalized.end(), normalized.begin()+7, [](unsigned char c){
+                return std::tolower(c);
+            });
+            return normalized;
+        };
+
+        std::string token = common_docker_get_token(repo);  // Get authentication token
+
+        // Get manifest
+        const std::string url_prefix = "https://registry-1.docker.io/v2/" + repo;
+        std::string manifest_url = url_prefix + "/manifests/" + tag;
+        common_remote_params manifest_params;
+        manifest_params.headers.push_back("Authorization: Bearer " + token);
+        manifest_params.headers.push_back(
+            "Accept: application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json");
+        auto manifest_res = common_remote_get_content(manifest_url, manifest_params);
+        if (manifest_res.first != 200) {
+            throw std::runtime_error("Failed to get Docker manifest, HTTP code: " + std::to_string(manifest_res.first));
+        }
+
+        std::string manifest_str(manifest_res.second.begin(), manifest_res.second.end());
+        nlohmann::ordered_json manifest = nlohmann::ordered_json::parse(manifest_str);
+        std::string gguf_digest;  // Find the GGUF layer
+        if (manifest.contains("layers")) {
+            for (const auto & layer : manifest["layers"]) {
+                if (layer.contains("mediaType")) {
+                    std::string media_type = layer["mediaType"].get<std::string>();
+                    if (media_type == "application/vnd.docker.ai.gguf.v3" ||
+                        media_type.find("gguf") != std::string::npos) {
+                        gguf_digest = layer["digest"].get<std::string>();
+                        break;
+                    }
+                }
+            }
+        }
+
+        if (gguf_digest.empty()) {
+            throw std::runtime_error("No GGUF layer found in Docker manifest");
+        }
+
+        // Validate & normalize digest
+        gguf_digest = validate_oci_digest(gguf_digest);
+        LOG_DBG("%s: Using validated digest: %s\n", __func__, gguf_digest.c_str());
+
+        // Prepare local filename
+        std::string model_filename = repo;
+        std::replace(model_filename.begin(), model_filename.end(), '/', '_');
+        model_filename += "_" + tag + ".gguf";
+        std::string local_path = fs_get_cache_file(model_filename);
+
+        const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
+        if (!common_download_file_single(blob_url, local_path, token, false)) {
+            throw std::runtime_error("Failed to download Docker Model");
+        }
+
+        LOG_INF("%s: Downloaded Docker Model to: %s\n", __func__, local_path.c_str());
+        return local_path;
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: Docker Model download failed: %s\n", __func__, e.what());
+        throw;
+    }
+}
diff --git a/common/download.h b/common/download.h
new file mode 100644
index 000000000..ddf36155e
--- /dev/null
+++ b/common/download.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <string>
+
+struct common_params_model;
+
+//
+// download functionalities
+//
+
+struct common_hf_file_res {
+    std::string repo; // repo name with ":tag" removed
+    std::string ggufFile;
+    std::string mmprojFile;
+};
+
+// resolve and download model from Docker registry
+// return local path to downloaded model file
+std::string common_docker_resolve_model(const std::string & docker);
+
+/**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * Tag is optional, defaults to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, returns the first GGUF file in the repo)
+ *
+ * Return pair of <repo, file_name> (with "repo" already having tag removed)
+ *
+ * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+ */
+common_hf_file_res common_get_hf_file(
+    const std::string & hf_repo_with_tag,
+    const std::string & bearer_token,
+    bool offline);
+
+// returns true if download succeeded
+bool common_download_model(
+    const common_params_model & model,
+    const std::string & bearer_token,
+    bool offline);
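The new header boils the download story down to three calls: resolve the repo string, fill in a model entry, and download (split files and ETag caching are handled internally). A minimal usage sketch under the declarations above; the URL shape and the url/path fields of common_params_model mirror the call sites in download.cpp, and the caller function name is hypothetical:

```cpp
// Sketch only: resolve "<user>/<model>[:quant]" and fetch the GGUF into the
// local cache. Error handling beyond the download result is elided.
#include <stdexcept>
#include <string>

#include "common.h"
#include "download.h"

std::string fetch_from_hf(const std::string & repo_with_tag, const std::string & hf_token) {
    // "bartowski/Llama-3.2-3B-Instruct-GGUF:q4" -> repo + concrete .gguf filename
    common_hf_file_res res = common_get_hf_file(repo_with_tag, hf_token, /*offline=*/false);

    common_params_model model;
    model.url  = get_model_endpoint() + res.repo + "/resolve/main/" + res.ggufFile;
    model.path = fs_get_cache_file(res.ggufFile);

    // downloads the file (and any GGUF splits) unless the cached ETag still matches
    if (!common_download_model(model, hf_token, /*offline=*/false)) {
        throw std::runtime_error("failed to download " + repo_with_tag);
    }
    return model.path; // ready to pass to the model loader
}
```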
From 8c0d6bb4559e15f1236035a0465781c1345324e5 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 7 Nov 2025 12:24:47 +0200
Subject: [PATCH 15/32] server : print the samplers chain for each request
 (#17070)

---
 tools/server/server.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 0e1a9afc8..164e8cf4e 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -2823,6 +2823,8 @@ struct server_context {
                 send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
                 return false;
             }
+
+            SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl).c_str());
         }
 
         // initialize draft batch

From 7c23f3f0d4b9f5d6ea140756eb694b562d5acebb Mon Sep 17 00:00:00 2001
From: iron
Date: Sat, 8 Nov 2025 00:18:14 +0800
Subject: [PATCH 16/32] ggml-cpu: detect correct cpu flags for arm64 (#16229)
 (#16239)

When using GCC 9 or GCC 12 on the arm64 platform of Ubuntu 20.04, the
command "gcc -mcpu=native -E -v -" fails to detect the correct CPU flags,
which results in compilation failures for certain extended instructions.
The correct CPU flags can be obtained by using "gcc -march" instead.

Signed-off-by: lizhenneng
Co-authored-by: lizhenneng
---
 ggml/src/ggml-cpu/CMakeLists.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt
index 23ec8bb08..485227d24 100644
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -118,18 +118,18 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
         # so we check for them manually and enable them if available
         execute_process(
-            COMMAND ${CMAKE_C_COMPILER} -mcpu=native -E -v -
+            COMMAND ${CMAKE_C_COMPILER} -march=native -E -v -
             INPUT_FILE "/dev/null"
             OUTPUT_QUIET
             ERROR_VARIABLE ARM_MCPU
             RESULT_VARIABLE ARM_MCPU_RESULT
         )
         if (NOT ARM_MCPU_RESULT)
-            string(REGEX MATCH "-mcpu=[^ ']+" ARM_MCPU_FLAG "${ARM_MCPU}")
+            string(REGEX MATCH "-march=[^ ']+" ARM_MCPU_FLAG "${ARM_MCPU}")
         endif()
         if ("${ARM_MCPU_FLAG}" STREQUAL "")
-            set(ARM_MCPU_FLAG -mcpu=native)
-            message(STATUS "ARM -mcpu not found, -mcpu=native will be used")
+            set(ARM_MCPU_FLAG -march=native)
+            message(STATUS "ARM -mcpu not found, -march=native will be used")
         endif()
 
         include(CheckCXXSourceRuns)

From 9eb9a1331dec83098c858150cd0a8ad9f6d8f46c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?=
Date: Fri, 7 Nov 2025 17:34:05 +0100
Subject: [PATCH 17/32] Revert "ggml-cpu: detect correct cpu flags for arm64
 (#16229) (#16239)" (#17084)

This reverts commit 7c23f3f0d4b9f5d6ea140756eb694b562d5acebb.
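The detection that patch 16 changes and patch 17 reverts boils down to which flag token gets pulled out of the compiler's verbose output. A small C++ illustration of the same extraction the CMake performs with string(REGEX MATCH ...); the sample compiler output line is hypothetical and real output differs per toolchain:

```cpp
// Mirror of the CMake logic: regex-search the compiler's -E -v stderr for the
// first "-mcpu=..." token, falling back to -mcpu=native when none is found.
#include <iostream>
#include <regex>
#include <string>

int main() {
    const std::string compiler_stderr =
        "cc1 -E -quiet -v - -mcpu=cortex-a72+crc+crypto -mlittle-endian -mabi=lp64";

    std::smatch m;
    if (std::regex_search(compiler_stderr, m, std::regex("-mcpu=[^ ']+"))) {
        std::cout << "detected: " << m.str() << "\n"; // -mcpu=cortex-a72+crc+crypto
    } else {
        std::cout << "fallback: -mcpu=native\n";      // mirrors the CMake fallback
    }
}
```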
--- ggml/src/ggml-cpu/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 485227d24..23ec8bb08 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -118,18 +118,18 @@ function(ggml_add_cpu_backend_variant_impl tag_name) # so we check for them manually and enable them if available execute_process( - COMMAND ${CMAKE_C_COMPILER} -march=native -E -v - + COMMAND ${CMAKE_C_COMPILER} -mcpu=native -E -v - INPUT_FILE "/dev/null" OUTPUT_QUIET ERROR_VARIABLE ARM_MCPU RESULT_VARIABLE ARM_MCPU_RESULT ) if (NOT ARM_MCPU_RESULT) - string(REGEX MATCH "-march=[^ ']+" ARM_MCPU_FLAG "${ARM_MCPU}") + string(REGEX MATCH "-mcpu=[^ ']+" ARM_MCPU_FLAG "${ARM_MCPU}") endif() if ("${ARM_MCPU_FLAG}" STREQUAL "") - set(ARM_MCPU_FLAG -march=native) - message(STATUS "ARM -mcpu not found, -march=native will be used") + set(ARM_MCPU_FLAG -mcpu=native) + message(STATUS "ARM -mcpu not found, -mcpu=native will be used") endif() include(CheckCXXSourceRuns) From 16bcc1259d311d0fd37fe00fefcc7900324d38cb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 7 Nov 2025 20:03:25 +0200 Subject: [PATCH 18/32] kv-cache : pad the cache size to 256 for performance (#17046) * kv-cache : pad the size of the small SWA cache for performance * context : pad the total context to 256 * cont : future-proof the swa pad * server : adjust test params to new logic --- include/llama.h | 1 + src/llama-context.cpp | 4 ++++ src/llama-kv-cache-iswa.cpp | 4 +++- tools/server/tests/unit/test_speculative.py | 12 ++++++------ 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/llama.h b/include/llama.h index 98bed9d61..aa9932afb 100644 --- a/include/llama.h +++ b/include/llama.h @@ -463,6 +463,7 @@ extern "C" { // NOTE: After creating a llama_context, it is recommended to query the actual values using these functions // In some cases the requested values via llama_context_params may differ from the actual values used by the context + // ref: https://github.com/ggml-org/llama.cpp/pull/17046#discussion_r2503085732 LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_ctx_seq (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 866514038..e115fcd93 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -114,10 +114,14 @@ llama_context::llama_context( } } + // ref: https://github.com/ggml-org/llama.cpp/pull/17046#discussion_r2503085732 + cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256); + if (cparams.kv_unified) { cparams.n_ctx_seq = cparams.n_ctx; } else { cparams.n_ctx_seq = cparams.n_ctx / cparams.n_seq_max; + cparams.n_ctx_seq = GGML_PAD(cparams.n_ctx_seq, 256); if (cparams.n_ctx_seq == 0) { throw std::runtime_error("n_ctx_seq == 0"); diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp index facba1d00..3a34102a2 100644 --- a/src/llama-kv-cache-iswa.cpp +++ b/src/llama-kv-cache-iswa.cpp @@ -45,7 +45,9 @@ llama_kv_cache_iswa::llama_kv_cache_iswa( const uint32_t size_base = kv_size; - uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*(unified ? n_seq_max : 1) + n_ubatch, n_pad)); + // note: the SWA cache is always padded to 256 for performance + // https://github.com/ggml-org/llama.cpp/issues/17037 + uint32_t size_swa = GGML_PAD(std::min(size_base, hparams.n_swa*(unified ? 
n_seq_max : 1) + n_ubatch), 256); // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size if (swa_full) { diff --git a/tools/server/tests/unit/test_speculative.py b/tools/server/tests/unit/test_speculative.py index 65952de8b..d2f3fba5f 100644 --- a/tools/server/tests/unit/test_speculative.py +++ b/tools/server/tests/unit/test_speculative.py @@ -77,10 +77,10 @@ def test_different_draft_min_draft_max(): def test_slot_ctx_not_exceeded(): global server - server.n_ctx = 64 + server.n_ctx = 256 server.start() res = server.make_request("POST", "/completion", data={ - "prompt": "Hello " * 56, + "prompt": "Hello " * 248, "temperature": 0.0, "top_k": 1, "speculative.p_min": 0.0, @@ -91,19 +91,19 @@ def test_slot_ctx_not_exceeded(): def test_with_ctx_shift(): global server - server.n_ctx = 64 + server.n_ctx = 256 server.enable_ctx_shift = True server.start() res = server.make_request("POST", "/completion", data={ - "prompt": "Hello " * 56, + "prompt": "Hello " * 248, "temperature": 0.0, "top_k": 1, - "n_predict": 64, + "n_predict": 256, "speculative.p_min": 0.0, }) assert res.status_code == 200 assert len(res.body["content"]) > 0 - assert res.body["tokens_predicted"] == 64 + assert res.body["tokens_predicted"] == 256 assert res.body["truncated"] == True From 9008027aa376526819415469f74fb9281136224e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Fri, 7 Nov 2025 19:27:58 +0100 Subject: [PATCH 19/32] hparams : add n_embd_inp() to support extended embed (#16928) * add n_embd_full to support extended embed * don't change output * rename to n_embd_inp * restore n_embd where applicable --- include/llama.h | 1 + src/llama-context.cpp | 6 +++--- src/llama-graph.cpp | 4 ++-- src/llama-hparams.cpp | 10 ++++++++++ src/llama-hparams.h | 3 +++ src/llama-model.cpp | 23 +++++++---------------- src/models/qwen3vl-moe.cpp | 3 +-- src/models/qwen3vl.cpp | 5 +---- tools/mtmd/mtmd.cpp | 2 +- 9 files changed, 29 insertions(+), 28 deletions(-) diff --git a/include/llama.h b/include/llama.h index aa9932afb..8547226ff 100644 --- a/include/llama.h +++ b/include/llama.h @@ -486,6 +486,7 @@ extern "C" { LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model); LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model); + LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model); LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model); LLAMA_API int32_t llama_model_n_head (const struct llama_model * model); LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index e115fcd93..70a3ec62d 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -827,7 +827,7 @@ int llama_context::encode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; - const int64_t n_embd = hparams.n_embd; + const int64_t n_embd = hparams.n_embd_inp(); const int64_t n_vocab = model.vocab.n_tokens(); // note: during encode, we always pass the full sequence starting from pos = 0 @@ -996,7 +996,7 @@ int llama_context::decode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; const int64_t n_vocab = vocab.n_tokens(); - const int64_t n_embd = hparams.n_embd; + const int64_t n_embd = hparams.n_embd_inp(); // when computing embeddings, all tokens are output const bool output_all = cparams.embeddings; @@ -2154,7 +2154,7 @@ void llama_context::opt_epoch_iter( batch.logits [pos_batch] = true; } - if 
(!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) { + if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd_inp(), cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) { LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__); return; } diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index f9751b318..b199e9462 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1142,7 +1142,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( // input embeddings with optional lora ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { - const int64_t n_embd = hparams.n_embd; + const int64_t n_embd = hparams.n_embd_inp(); auto inp = std::make_unique(); @@ -1279,7 +1279,7 @@ ggml_tensor * llm_graph_context::build_inp_cross_embd() const { // return cur; //} - const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd; + const auto n_embd = !cross->v_embd.empty() ? cross->n_embd : hparams.n_embd_inp(); const auto n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train; cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_enc); diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 514d65384..8cdbaf69f 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -60,6 +60,16 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const { return n_head/n_head_kv; } +uint32_t llama_hparams::n_embd_inp() const { + uint32_t n_embd_inp = n_embd; + + if (n_deepstack_layers > 0) { + n_embd_inp += n_embd * n_deepstack_layers; + } + + return n_embd_inp; +} + uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const { const uint32_t n_head_kv = this->n_head_kv(il); diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 539fecb3f..9203af83b 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -227,6 +227,9 @@ struct llama_hparams { uint32_t n_gqa(uint32_t il = 0) const; + // dimension of main + auxiliary input embeddings + uint32_t n_embd_inp() const; + // dimension of key embeddings across all k-v heads uint32_t n_embd_k_gqa(uint32_t il = 0) const; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 1987135ca..829f1e3c1 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -276,8 +276,8 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w } break; case GGML_OP_IM2COL: { - const int n_embd = hparams.n_embd; - ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1); + const int n_embd_inp = hparams.n_embd_inp(); + ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd_inp, w->ne[1], 1, 1); op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16); } break; case GGML_OP_SCALE: @@ -1039,9 +1039,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { case 64: type = LLM_TYPE_32B; break; default: type = LLM_TYPE_UNKNOWN; } - // since vision model stacks deepstack features along feature dim - // we also create a fake "n_embd" for text model to be the main embd + deepstack embds - hparams.n_embd *= hparams.n_deepstack_layers + 1; } break; case LLM_ARCH_QWEN3MOE: { @@ -1065,9 +1062,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { case 94: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; } - // since vision model stacks deepstack features along feature dim - // we also create a fake "n_embd" for text model to be the main embd + deepstack embds - hparams.n_embd *= hparams.n_deepstack_layers + 1; } 
break; case LLM_ARCH_PHI2: { @@ -3341,10 +3335,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { case LLM_ARCH_QWEN3: case LLM_ARCH_QWEN3VL: { - // for model loading, the weights only have the main embd - // so we need to divide by the number of deepstack layers + 1 - // n_embd is const int so we declare a new variable - int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // output @@ -3380,10 +3370,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { case LLM_ARCH_QWEN3MOE: case LLM_ARCH_QWEN3VLMOE: { - // for model loading, the weights only have the main embd - // so we need to divide by the number of deepstack layers + 1 - // n_embd is const int so we declare a new variable - int64_t n_embd = hparams.n_embd / (hparams.n_deepstack_layers + 1); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // output @@ -6535,6 +6521,7 @@ void llama_model::print_info() const { if (!hparams.vocab_only) { LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); + LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp()); LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str()); LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str()); @@ -7380,6 +7367,10 @@ int32_t llama_model_n_embd(const llama_model * model) { return model->hparams.n_embd; } +int32_t llama_model_n_embd_inp(const llama_model * model) { + return model->hparams.n_embd_inp(); +} + int32_t llama_model_n_layer(const llama_model * model) { return model->hparams.n_layer; } diff --git a/src/models/qwen3vl-moe.cpp b/src/models/qwen3vl-moe.cpp index c48643c0c..f72f80a83 100644 --- a/src/models/qwen3vl-moe.cpp +++ b/src/models/qwen3vl-moe.cpp @@ -1,9 +1,8 @@ #include "models.h" llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds const size_t n_deepstack_layers = hparams.n_deepstack_layers; - const int64_t n_embd = n_embd_full / (n_deepstack_layers + 1); + const int64_t n_embd = hparams.n_embd; const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); diff --git a/src/models/qwen3vl.cpp b/src/models/qwen3vl.cpp index 10b36c1f6..0bae52239 100644 --- a/src/models/qwen3vl.cpp +++ b/src/models/qwen3vl.cpp @@ -1,13 +1,10 @@ #include "models.h" llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - - const int64_t n_embd_full = hparams.n_embd; // main embd + deepstack embds const size_t n_deepstack_layers = hparams.n_deepstack_layers; - const int64_t n_embd = n_embd_full / (n_deepstack_layers + 1); + const int64_t n_embd = hparams.n_embd; const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 4343f3b6f..e59913776 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -163,7 +163,7 @@ struct mtmd_context { print_timings(ctx_params.print_timings), n_threads 
(ctx_params.n_threads), media_marker (ctx_params.media_marker), - n_embd_text (llama_model_n_embd(text_model)) + n_embd_text (llama_model_n_embd_inp(text_model)) { if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) { throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead"); From 7956bb4d7f430f23f3c4726f7e6404b89f6e20a4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 7 Nov 2025 21:23:11 +0200 Subject: [PATCH 20/32] bench : cache the llama_context state at computed depth (#16944) * bench : cache llama_context state at depth * cont : handle failures to restore the old state * cont : print information when the state is being reused --- tools/llama-bench/llama-bench.cpp | 47 ++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 0de07b981..852a51245 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -1919,6 +1919,12 @@ struct sql_printer : public printer { } }; +struct ctx_state { + int depth = 0; // in tokens + + std::vector buf; // the llama_context state buffer +}; + static bool test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) { llama_set_n_threads(ctx, n_threads, n_threads); @@ -2051,6 +2057,10 @@ int main(int argc, char ** argv) { llama_model * lmodel = nullptr; const cmd_params_instance * prev_inst = nullptr; + // store the llama_context state at the previous depth that we performed a test + // ref: https://github.com/ggml-org/llama.cpp/pull/16944#issuecomment-3478151721 + ctx_state cstate; + int params_idx = 0; auto params_count = params_instances.size(); for (const auto & inst : params_instances) { @@ -2134,14 +2144,37 @@ int main(int argc, char ** argv) { llama_memory_clear(llama_get_memory(ctx), false); if (t.n_depth > 0) { - if (params.progress) { - fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count, - i + 1, params.reps); + bool is_cached = t.n_depth == cstate.depth; + + if (is_cached) { + // if previously we have computed at this depth, just restore the state + const size_t ret = llama_state_seq_set_data(ctx, cstate.buf.data(), cstate.buf.size(), 0); + if (ret == 0) { + // if the old state is incompatible with the current context - reprocess from scratch + is_cached = false; + } } - bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads); - if (!res) { - fprintf(stderr, "%s: error: failed to run depth\n", __func__); - exit(1); + + if (!is_cached) { + if (params.progress) { + fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count, + i + 1, params.reps); + } + bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads); + if (!res) { + fprintf(stderr, "%s: error: failed to run depth\n", __func__); + exit(1); + } + + // store the context state for reuse in later runs + cstate.depth = t.n_depth; + cstate.buf.resize(llama_state_seq_get_size(ctx, 0)); + llama_state_seq_get_data(ctx, cstate.buf.data(), cstate.buf.size(), 0); + } else { + if (params.progress) { + fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d (cached)\n", params_idx, params_count, + i + 1, params.reps); + } } } From 65156105069fa86a4a81b6cb0e8cb583f6420677 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 7 Nov 2025 20:53:14 +0100 Subject: [PATCH 21/32] CUDA: fix should_use_mmvf for ne11 == 1 (#17085) * CUDA: fix should_use_mmvf for ne11 
== 1 * Apply suggestion from @am17an Co-authored-by: Aman Gupta --------- Co-authored-by: Aman Gupta --- ggml/src/ggml-cuda/mmf.cu | 8 +++++++- ggml/src/ggml-cuda/mmvf.cu | 9 ++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/mmf.cu b/ggml/src/ggml-cuda/mmf.cu index 69a60aceb..153dd5a97 100644 --- a/ggml/src/ggml-cuda/mmf.cu +++ b/ggml/src/ggml-cuda/mmf.cu @@ -129,7 +129,13 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const if (src0_ne[0] % (warp_size * (4/ts)) != 0) { return false; } - for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { + + if (src0_nb[0] != ts) { + return false; + } + + // Pointers not aligned to the size of half2/nv_bfloat162/float2 would result in a crash: + for (size_t i = 1; i < GGML_MAX_DIMS; ++i) { if (src0_nb[i] % (2*ts) != 0) { return false; } diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu index 526d90d7a..6238ce7eb 100644 --- a/ggml/src/ggml-cuda/mmvf.cu +++ b/ggml/src/ggml-cuda/mmvf.cu @@ -720,12 +720,19 @@ bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0 if (src0_ne[0] % 2 != 0) { return false; } + const size_t ts = ggml_type_size(type); - for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { + if (src0_nb[0] != ts) { + return false; + } + + // Pointers not aligned to the size of half2/nv_bfloat162/float2 would result in a crash: + for (size_t i = 1; i < GGML_MAX_DIMS; ++i) { if (src0_nb[i] % (2*ts) != 0) { return false; } } + switch (type) { case GGML_TYPE_F32: if (GGML_CUDA_CC_IS_NVIDIA(cc)) { From ac76d3620158aa191118abaf5603479646969543 Mon Sep 17 00:00:00 2001 From: Acly Date: Fri, 7 Nov 2025 21:08:50 +0100 Subject: [PATCH 22/32] vulkan : refactor buffer handling in vk_op_f32 (#16840) * vulkan : refactor/simplify buffer handling in vk_op_* functions * Combine UMA handling into ggml_vk_tensor_subbuffer --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 556 +++++---------------------- 1 file changed, 97 insertions(+), 459 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index ab94bc3d7..a0a05f2e5 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -5387,7 +5387,7 @@ static void ggml_vk_host_free(vk_device& device, void* ptr) { device->pinned_memory.erase(device->pinned_memory.begin() + index); } -static void ggml_vk_host_get(vk_device& device, const void * ptr, vk_buffer& buf, size_t& buf_offset) { +static void ggml_vk_host_get(const vk_device& device, const void * ptr, vk_buffer& buf, size_t& buf_offset) { std::lock_guard guard(device->mutex); buf = nullptr; buf_offset = 0; @@ -5402,6 +5402,32 @@ static void ggml_vk_host_get(vk_device& device, const void * ptr, vk_buffer& buf } } +static vk_subbuffer ggml_vk_tensor_subbuffer( + const ggml_backend_vk_context * ctx, const ggml_tensor * tensor, bool allow_misalign = false) { + + vk_buffer buffer = nullptr; + size_t offset = 0; + if (ctx->device->uma) { + ggml_vk_host_get(ctx->device, tensor->data, buffer, offset); + } + if (!buffer) { + auto buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; + buffer = buf_ctx->dev_buffer; + offset = vk_tensor_offset(tensor) + tensor->view_offs; + } + GGML_ASSERT(buffer != nullptr); + + size_t size = ggml_nbytes(tensor); + + size_t misalign_bytes = offset & (ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1); + // The shader must support misaligned offsets when indexing into the buffer + GGML_ASSERT(allow_misalign || misalign_bytes == 0); + 
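+    // [editor's sketch] minStorageBufferOffsetAlignment is a power of two, so the
+    // rounding below is the usual mask trick:
+    //   misalign_bytes = offset & (align - 1)  -- bytes past the last aligned boundary
+    //   offset &= ~misalign_bytes              -- round down to that boundary (offset - misalign_bytes)
+    //   size += misalign_bytes                 -- keep the range ending at the same byte
+    // Shaders that allow misalignment are expected to add the dropped bytes back via
+    // the per-tensor offsets stored in push constants (see init_pushconst_tensor_offsets).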
offset &= ~misalign_bytes; + size += misalign_bytes; + + return vk_subbuffer{buffer, offset, size}; +} + static vk_submission ggml_vk_begin_submission(vk_device& device, vk_command_pool& p, bool one_time = true) { vk_submission s; s.buffer = ggml_vk_create_cmd_buffer(device, p); @@ -7953,72 +7979,12 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - vk_buffer d_Q = nullptr, d_K = nullptr, d_V = nullptr, d_D = nullptr, d_M = nullptr, d_S = nullptr; - size_t q_buf_offset = 0, k_buf_offset = 0, v_buf_offset = 0, d_buf_offset = 0, m_buf_offset = 0, s_buf_offset = 0; - - bool Q_uma = false, K_uma = false, V_uma = false, D_uma = false, M_uma = false, S_uma = false; - - if (ctx->device->uma) { - ggml_vk_host_get(ctx->device, q->data, d_Q, q_buf_offset); - ggml_vk_host_get(ctx->device, k->data, d_K, k_buf_offset); - ggml_vk_host_get(ctx->device, v->data, d_V, v_buf_offset); - ggml_vk_host_get(ctx->device, dst->data, d_D, d_buf_offset); - Q_uma = d_Q != nullptr; - K_uma = d_K != nullptr; - V_uma = d_V != nullptr; - D_uma = d_D != nullptr; - if (mask) { - ggml_vk_host_get(ctx->device, mask->data, d_M, m_buf_offset); - M_uma = d_M != nullptr; - } - if (sinks) { - ggml_vk_host_get(ctx->device, sinks->data, d_S, s_buf_offset); - S_uma = d_S != nullptr; - } - } - - - ggml_backend_vk_buffer_context * d_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; - ggml_backend_vk_buffer_context * q_buf_ctx = (ggml_backend_vk_buffer_context *)q->buffer->context; - ggml_backend_vk_buffer_context * k_buf_ctx = (ggml_backend_vk_buffer_context *)k->buffer->context; - ggml_backend_vk_buffer_context * v_buf_ctx = (ggml_backend_vk_buffer_context *)v->buffer->context; - - if (!Q_uma) { - d_Q = q_buf_ctx->dev_buffer; - q_buf_offset = vk_tensor_offset(q) + q->view_offs; - } - if (!K_uma) { - d_K = k_buf_ctx->dev_buffer; - k_buf_offset = vk_tensor_offset(k) + k->view_offs; - } - if (!V_uma) { - d_V = v_buf_ctx->dev_buffer; - v_buf_offset = vk_tensor_offset(v) + v->view_offs; - } - if (!D_uma) { - d_D = d_buf_ctx->dev_buffer; - d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; - } - - if (!M_uma) { - d_M = d_Q; - m_buf_offset = q_buf_offset; - if (mask) { - ggml_backend_vk_buffer_context * m_buf_ctx = (ggml_backend_vk_buffer_context*)mask->buffer->context; - d_M = m_buf_ctx->dev_buffer; - m_buf_offset = vk_tensor_offset(mask) + mask->view_offs; - } - } - - if (!S_uma) { - d_S = d_Q; - s_buf_offset = q_buf_offset; - if (sinks) { - ggml_backend_vk_buffer_context * s_buf_ctx = (ggml_backend_vk_buffer_context*)sinks->buffer->context; - d_S = s_buf_ctx->dev_buffer; - s_buf_offset = vk_tensor_offset(sinks) + sinks->view_offs; - } - } + vk_subbuffer q_buf = ggml_vk_tensor_subbuffer(ctx, q); + vk_subbuffer k_buf = ggml_vk_tensor_subbuffer(ctx, k); + vk_subbuffer v_buf = ggml_vk_tensor_subbuffer(ctx, v); + vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); + vk_subbuffer mask_buf = mask ? ggml_vk_tensor_subbuffer(ctx, mask) : q_buf; + vk_subbuffer sinks_buf = sinks ? 
ggml_vk_tensor_subbuffer(ctx, sinks) : q_buf; uint32_t mask_n_head_log2 = ((sinks != nullptr) << 24) | ((mask != nullptr) << 16) | n_head_log2; @@ -8040,15 +8006,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx ggml_vk_sync_buffers(ctx, subctx); } + vk_subbuffer split_k_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, - { - ggml_vk_subbuffer(ctx, d_Q, q_buf_offset), - ggml_vk_subbuffer(ctx, d_K, k_buf_offset), - ggml_vk_subbuffer(ctx, d_V, v_buf_offset), - ggml_vk_subbuffer(ctx, d_M, m_buf_offset), - ggml_vk_subbuffer(ctx, d_S, s_buf_offset), - ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0), - }, + {q_buf, k_buf, v_buf, mask_buf, sinks_buf, split_k_buf}, // We only use split_k when group query attention is enabled, which means // there's no more than one tile of rows (i.e. workgroups_x would have been // one). We reuse workgroups_x to mean the number of splits, so we need to @@ -8058,23 +8018,12 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx ggml_vk_sync_buffers(ctx, subctx); const std::array pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne3, split_k, (sinks != nullptr) }; ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce, - { - ggml_vk_subbuffer(ctx, ctx->prealloc_split_k, 0), - ggml_vk_subbuffer(ctx, d_S, s_buf_offset), - ggml_vk_subbuffer(ctx, d_D, d_buf_offset), - }, + {split_k_buf, sinks_buf, dst_buf}, pc2, { (uint32_t)ne1, HSV, (uint32_t)ne3 }); ctx->prealloc_split_k_need_sync = true; } else { ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, - { - ggml_vk_subbuffer(ctx, d_Q, q_buf_offset), - ggml_vk_subbuffer(ctx, d_K, k_buf_offset), - ggml_vk_subbuffer(ctx, d_V, v_buf_offset), - ggml_vk_subbuffer(ctx, d_M, m_buf_offset), - ggml_vk_subbuffer(ctx, d_S, s_buf_offset), - ggml_vk_subbuffer(ctx, d_D, d_buf_offset), - }, + {q_buf, k_buf, v_buf, mask_buf, sinks_buf, dst_buf}, pc, { workgroups_x, workgroups_y, workgroups_z }); } } @@ -8757,35 +8706,15 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co const uint64_t ne01 = src0->ne[1]; const uint64_t ne02 = src0->ne[2]; const uint64_t ne03 = src0->ne[3]; - const uint64_t ne0 = ne00 * ne01; const bool use_src1 = src1 != nullptr; const uint64_t ne10 = use_src1 ? src1->ne[0] : 0; const uint64_t ne11 = use_src1 ? src1->ne[1] : 0; const uint64_t ne12 = use_src1 ? src1->ne[2] : 0; const uint64_t ne13 = use_src1 ? src1->ne[3] : 0; - const uint64_t ne1 = ne10 * ne11; - // const uint64_t nb10 = use_src1 ? src1->nb[0] : 0; const bool use_src2 = src2 != nullptr; - const uint64_t ne20 = use_src2 ? src2->ne[0] : 0; - const uint64_t ne21 = use_src2 ? src2->ne[1] : 0; - const uint64_t ne22 = use_src2 ? src2->ne[2] : 0; - const uint64_t ne23 = use_src2 ? src2->ne[3] : 0; - const uint64_t ne2 = ne20 * ne21; - const bool use_src3 = src3 != nullptr; - const uint64_t ne30 = use_src3 ? src3->ne[0] : 0; - const uint64_t ne31 = use_src3 ? src3->ne[1] : 0; - const uint64_t ne32 = use_src3 ? src3->ne[2] : 0; - const uint64_t ne33 = use_src3 ? 
src3->ne[3] : 0; - const uint64_t ne3 = ne30 * ne31; - - const uint64_t ned0 = dst->ne[0]; - const uint64_t ned1 = dst->ne[1]; - const uint64_t ned2 = dst->ne[2]; - const uint64_t ned3 = dst->ne[3]; - const uint64_t ned = ned0 * ned1; init_pushconst_fastdiv(pc); @@ -8804,74 +8733,14 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co const bool op_supports_incontiguous = ggml_vk_op_supports_incontiguous(op); - ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; - ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; - ggml_backend_vk_buffer_context * src1_buf_ctx = use_src1 ? (ggml_backend_vk_buffer_context *)src1->buffer->context : nullptr; - ggml_backend_vk_buffer_context * src2_buf_ctx = use_src2 ? (ggml_backend_vk_buffer_context *)src2->buffer->context : nullptr; - ggml_backend_vk_buffer_context * src3_buf_ctx = use_src3 ? (ggml_backend_vk_buffer_context *)src3->buffer->context : nullptr; + vk_subbuffer src0_buf = ggml_vk_tensor_subbuffer(ctx, src0, op_supports_incontiguous); + vk_subbuffer src1_buf = use_src1 ? ggml_vk_tensor_subbuffer(ctx, src1, op_supports_incontiguous) : vk_subbuffer{}; + vk_subbuffer src2_buf = use_src2 ? ggml_vk_tensor_subbuffer(ctx, src2, op_supports_incontiguous) : vk_subbuffer{}; + vk_subbuffer src3_buf = use_src3 ? ggml_vk_tensor_subbuffer(ctx, src3, op_supports_incontiguous) : vk_subbuffer{}; + vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst, op_supports_incontiguous); - vk_buffer d_X = nullptr; - size_t x_buf_offset = 0; - vk_buffer d_Y = nullptr; - size_t y_buf_offset = 0; - vk_buffer d_Z = nullptr; - size_t z_buf_offset = 0; - vk_buffer d_W = nullptr; - size_t w_buf_offset = 0; - - bool src0_uma = false; - bool src1_uma = false; - bool src2_uma = false; - bool src3_uma = false; - - if (ctx->device->uma) { - ggml_vk_host_get(ctx->device, src0->data, d_X, x_buf_offset); - src0_uma = d_X != nullptr; - if (use_src1) { - ggml_vk_host_get(ctx->device, src1->data, d_Y, y_buf_offset); - src1_uma = d_Y != nullptr; - } - if (use_src2) { - ggml_vk_host_get(ctx->device, src2->data, d_Z, z_buf_offset); - src2_uma = d_Z != nullptr; - } - if (use_src3) { - ggml_vk_host_get(ctx->device, src3->data, d_W, w_buf_offset); - src3_uma = d_W != nullptr; - } - } - - vk_buffer d_D = dst_buf_ctx->dev_buffer; - - GGML_ASSERT(d_D != nullptr); - uint64_t d_buf_offset = vk_tensor_offset(dst) + dst->view_offs; - if(!src0_uma) { - d_X = src0_buf_ctx->dev_buffer; - x_buf_offset = vk_tensor_offset(src0) + src0->view_offs; - GGML_ASSERT(d_X != nullptr); - } - if (use_src1 && !src1_uma) { - d_Y = src1_buf_ctx->dev_buffer; - y_buf_offset = vk_tensor_offset(src1) + src1->view_offs; - GGML_ASSERT(d_Y != nullptr); - } - if (use_src2 && !src2_uma) { - d_Z = src2_buf_ctx->dev_buffer; - z_buf_offset = vk_tensor_offset(src2) + src2->view_offs; - GGML_ASSERT(d_Z != nullptr); - } - if (use_src3 && !src3_uma) { - d_W = src3_buf_ctx->dev_buffer; - w_buf_offset = vk_tensor_offset(src3) + src3->view_offs; - GGML_ASSERT(d_W != nullptr); - } - // Compute misalignment offset for descriptors and store it in in push constants, then align the descriptor offsets. + // Compute misalignment offset for descriptors and store it in in push constants. 
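+    // [editor's sketch, names hypothetical] conceptually this records, per tensor,
+    //   misalign_elems = (buf_offset & (align - 1)) / type_size
+    // so a shader can index data[misalign_elems + i] after the descriptor offset has
+    // been rounded down to the alignment boundary in ggml_vk_tensor_subbuffer() above.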
init_pushconst_tensor_offsets(ctx, pc, src0, src1, src2, src3, dst); - x_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1); - y_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1); - z_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1); - w_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1); - d_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1); std::array elements; @@ -8955,9 +8824,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co const uint32_t KH = ne01; const uint32_t KW = ne00; - const uint32_t OD = ned3 / N; - const uint32_t OH = ned2; - const uint32_t OW = ned1; + const uint32_t OD = dst->ne[3] / N; + const uint32_t OH = dst->ne[2]; + const uint32_t OW = dst->ne[1]; const uint32_t IC_KD_KH_KW = IC*KD*KH*KW; const uint32_t N_OD_OH = N*OD*OH; @@ -9072,112 +8941,50 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co break; } - uint64_t x_sz, y_sz, z_sz, w_sz, d_sz; - - if (op_supports_incontiguous) { - x_sz = ggml_nbytes(src0) + get_misalign_bytes(ctx, src0); - y_sz = use_src1 ? ggml_nbytes(src1) + get_misalign_bytes(ctx, src1) : 0; - z_sz = use_src2 ? ggml_nbytes(src2) + get_misalign_bytes(ctx, src2) : 0; - w_sz = use_src3 ? ggml_nbytes(src3) + get_misalign_bytes(ctx, src3) : 0; - d_sz = ggml_nbytes(dst) + get_misalign_bytes(ctx, dst); - - if (x_buf_offset + x_sz >= d_X->size) { - x_sz = ggml_vk_get_max_buffer_range(ctx, d_X, x_buf_offset); - } - if (use_src1 && y_buf_offset + y_sz >= d_Y->size) { - y_sz = ggml_vk_get_max_buffer_range(ctx, d_Y, y_buf_offset); - } - if (use_src2 && z_buf_offset + z_sz >= d_Z->size) { - z_sz = ggml_vk_get_max_buffer_range(ctx, d_Z, z_buf_offset); - } - if (use_src3 && w_buf_offset + w_sz >= d_W->size) { - w_sz = ggml_vk_get_max_buffer_range(ctx, d_W, w_buf_offset); - } - if (d_buf_offset + d_sz >= d_D->size) { - d_sz = ggml_vk_get_max_buffer_range(ctx, d_D, d_buf_offset); - } - } else { - x_sz = ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0 * ne02 * ne03; - y_sz = use_src1 ? ggml_type_size(src1->type) * ne1 * ne12 * ne13 : 0; - z_sz = use_src2 ? ggml_type_size(src2->type) * ne2 * ne22 * ne23 : 0; - w_sz = use_src3 ? ggml_type_size(src3->type) * ne3 * ne32 * ne33 : 0; - d_sz = ggml_type_size(dst->type) * ned * ned2 * ned3; - } - if (op == GGML_OP_ADD || op == GGML_OP_RMS_NORM) { - vk_buffer d_A = ctx->do_add_rms_partials ? ctx->prealloc_add_rms_partials : d_X; - size_t a_buf_offset = ctx->do_add_rms_partials ? 
ctx->prealloc_size_add_rms_partials_offset : 0; + vk_subbuffer a_buf = src0_buf; + if (ctx->do_add_rms_partials) { + a_buf = ggml_vk_subbuffer(ctx, ctx->prealloc_add_rms_partials, ctx->prealloc_size_add_rms_partials_offset); + } ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, - { vk_subbuffer{ d_X, x_buf_offset, x_sz }, - vk_subbuffer{ d_Y, y_buf_offset, y_sz }, - vk_subbuffer{ d_D, d_buf_offset, d_sz }, - ggml_vk_subbuffer(ctx, d_A, a_buf_offset), - }, pc, elements); + { src0_buf, src1_buf, dst_buf, a_buf }, pc, elements); } else if (op == GGML_OP_GLU) { // Empty src1 is possible in glu, but the shader needs a buffer - vk_subbuffer subbuf_y; - if (use_src1) { - subbuf_y = { d_Y, y_buf_offset, y_sz }; - } else { - subbuf_y = { d_X, 0, x_sz }; - } - - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); + vk_subbuffer subbuf1 = use_src1 ? src1_buf : src0_buf; + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, dst_buf }, pc, elements); } else if (op == GGML_OP_SOFT_MAX) { // Empty src1 and src2 is possible in soft_max, but the shader needs a buffer - vk_subbuffer subbuf_y; - if (use_src1) { - subbuf_y = { d_Y, y_buf_offset, y_sz }; - } else { - subbuf_y = { d_X, 0, x_sz }; - } - - vk_subbuffer subbuf_z; - if (use_src2) { - subbuf_z = { d_Z, z_buf_offset, z_sz }; - } else { - subbuf_z = { d_X, 0, x_sz }; - } - - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); + vk_subbuffer subbuf1 = use_src1 ? src1_buf : src0_buf; + vk_subbuffer subbuf2 = use_src2 ? src2_buf : src0_buf; + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, subbuf1, subbuf2, dst_buf }, pc, elements); } else if (op == GGML_OP_ROPE || op == GGML_OP_ROPE_BACK) { - // Empty src2 is possible in rope, but the shader needs a buffer - vk_subbuffer subbuf_z, subbuf_w; - if (use_src2) { - subbuf_z = { d_Z, z_buf_offset, z_sz }; - } else { - subbuf_z = { d_X, 0, x_sz }; - } - if (use_src3) { - subbuf_w = { d_W, w_buf_offset, w_sz }; - } else { - subbuf_w = { d_X, 0, x_sz }; - } - - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz }, subbuf_w }, pc, elements); + // Empty src2 and src3 is possible in rope, but the shader needs a buffer + vk_subbuffer subbuf2 = use_src2 ? src2_buf : src0_buf; + vk_subbuffer subbuf3 = use_src3 ? 
src3_buf : src0_buf; + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, subbuf2, dst_buf, subbuf3 }, pc, elements); } else if (op == GGML_OP_IM2COL || op == GGML_OP_IM2COL_3D) { if (ctx->device->shader_int64 && ctx->device->buffer_device_address) { // buffer device address path doesn't use dst buffer - d_sz = 1; + dst_buf.size = 1; } // im2col uses only src1 and dst buffers - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src1_buf, dst_buf }, pc, elements); } else if (op == GGML_OP_COUNT_EQUAL) { // count_equal assumes that destination buffer is initialized with zeroes - ggml_vk_buffer_memset_async(subctx, d_D, d_buf_offset, 0, d_sz); + ggml_vk_buffer_memset_async(subctx, dst_buf.buffer, dst_buf.offset, 0, dst_buf.size); ggml_vk_sync_buffers(ctx, subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, dst_buf }, pc, elements); } else if (op == GGML_OP_OPT_STEP_SGD) { // OPT_STEP_SGD works on src0, it does not need dst - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz } }, pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, src2_buf }, pc, elements); } else if (use_src3) { - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_W, w_buf_offset, w_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, src2_buf, src3_buf, dst_buf }, pc, elements); } else if (use_src2) { - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, src2_buf, dst_buf }, pc, elements); } else if (use_src1) { - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, src1_buf, dst_buf }, pc, elements); } else { - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { src0_buf, dst_buf }, pc, elements); } } @@ -9413,39 +9220,10 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; - ggml_backend_vk_buffer_context * src_buf_ctxs[7] = { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr }; + vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); + vk_subbuffer src_buf[7] = {}; for (int i = 0; i < num_srcs; i++) { - src_buf_ctxs[i] = (ggml_backend_vk_buffer_context 
*)dst->src[i]->buffer->context; - } - - vk_buffer d_D = nullptr, d_srcs[7] = { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr }; - size_t dst_offset = 0, src_offsets[7] = { 0, 0, 0, 0, 0, 0, 0 }; - bool dst_uma = false, srcs_uma[7] = { false, false, false, false, false, false, false }; - - if (ctx->device->uma) { - for (int i = 0; i < num_srcs; i++) { - ggml_vk_host_get(ctx->device, dst->src[i]->data, d_srcs[i], src_offsets[i]); - srcs_uma[i] = d_srcs[i] != nullptr; - } - - ggml_vk_host_get(ctx->device, dst->data, d_D, dst_offset); - dst_uma = d_D != nullptr; - } - - uint64_t src_sizes[7] = { 0, 0, 0, 0, 0, 0, 0 }; - for (int i = 0; i < num_srcs; i++) { - src_sizes[i] = ggml_nbytes(dst->src[i]); - if (!srcs_uma[i]) { - d_srcs[i] = src_buf_ctxs[i]->dev_buffer; - src_offsets[i] = vk_tensor_offset(dst->src[i]) + dst->src[i]->view_offs; - } - } - - const uint64_t dst_size = ggml_nbytes(dst); - if (!dst_uma) { - d_D = dst_buf_ctx->dev_buffer; - dst_offset = vk_tensor_offset(dst) + dst->view_offs; + src_buf[i] = ggml_vk_tensor_subbuffer(ctx, dst->src[i]); } std::array elements = { @@ -9455,26 +9233,13 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx }; if (version == 6) { - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { - vk_subbuffer{ d_srcs[0], src_offsets[0], src_sizes[0] }, - vk_subbuffer{ d_srcs[1], src_offsets[1], src_sizes[1] }, - vk_subbuffer{ d_srcs[2], src_offsets[2], src_sizes[2] }, - vk_subbuffer{ d_srcs[3], src_offsets[3], src_sizes[3] }, - vk_subbuffer{ d_srcs[4], src_offsets[4], src_sizes[4] }, - vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] }, - vk_subbuffer{ d_D, dst_offset, dst_size } - }, pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + {src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], dst_buf}, + pc, elements); } else if (version == 7) { - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { - vk_subbuffer{ d_srcs[0], src_offsets[0], src_sizes[0] }, - vk_subbuffer{ d_srcs[1], src_offsets[1], src_sizes[1] }, - vk_subbuffer{ d_srcs[2], src_offsets[2], src_sizes[2] }, - vk_subbuffer{ d_srcs[3], src_offsets[3], src_sizes[3] }, - vk_subbuffer{ d_srcs[4], src_offsets[4], src_sizes[4] }, - vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] }, - vk_subbuffer{ d_srcs[6], src_offsets[6], src_sizes[6] }, - vk_subbuffer{ d_D, dst_offset, dst_size } - }, pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + {src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], src_buf[6], dst_buf}, + pc, elements); } else { // shouldn't happen GGML_ASSERT(false); @@ -9554,40 +9319,10 @@ static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx, n_head, head_dim, n_group, n_tok }; - ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; - ggml_backend_vk_buffer_context * src_buf_ctxs[GGML_MAX_SRC]; - for (int i = 0; i < GGML_MAX_SRC && dst->src[i] != nullptr; i++) { - src_buf_ctxs[i] = (ggml_backend_vk_buffer_context *)dst->src[i]->buffer->context; - } - - vk_buffer d_D = nullptr, d_srcs[GGML_MAX_SRC] = { nullptr }; - size_t dst_offset = 0, src_offsets[GGML_MAX_SRC] = { 0 }; - bool dst_uma = false, srcs_uma[GGML_MAX_SRC] = { false }; - - if (ctx->device->uma) { - for (int i = 0; i < GGML_MAX_SRC && dst->src[i] != nullptr; i++) { - ggml_vk_host_get(ctx->device, dst->src[i]->data, d_srcs[i], src_offsets[i]); - srcs_uma[i] = d_srcs[i] != nullptr; - } - ggml_vk_host_get(ctx->device, dst->data, d_D, 
dst_offset); - dst_uma = d_D != nullptr; - } - - if (!dst_uma) { - d_D = dst_buf_ctx->dev_buffer; - dst_offset = vk_tensor_offset(dst) + dst->view_offs; - } - for (int i = 0; i < GGML_MAX_SRC && dst->src[i] != nullptr; i++) { - if (!srcs_uma[i]) { - d_srcs[i] = src_buf_ctxs[i]->dev_buffer; - src_offsets[i] = vk_tensor_offset(dst->src[i]) + dst->src[i]->view_offs; - } - } - - size_t dst_size = ggml_nbytes(dst); - size_t src_sizes[GGML_MAX_SRC]; - for (int i = 0; i < GGML_MAX_SRC && dst->src[i] != nullptr; i++) { - src_sizes[i] = ggml_nbytes(dst->src[i]); + vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, dst); + vk_subbuffer src_buf[7] = {}; + for (int i = 0; i < 7 && dst->src[i] != nullptr; i++) { + src_buf[i] = ggml_vk_tensor_subbuffer(ctx, dst->src[i]); } std::array elements; @@ -9597,16 +9332,9 @@ static void ggml_vk_ssm_scan(ggml_backend_vk_context * ctx, vk_context& subctx, const uint32_t num_workgroups_y = n_seq; elements = { num_workgroups_x, num_workgroups_y, 1 }; - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { - vk_subbuffer{ d_srcs[0], src_offsets[0], src_sizes[0] }, - vk_subbuffer{ d_srcs[1], src_offsets[1], src_sizes[1] }, - vk_subbuffer{ d_srcs[2], src_offsets[2], src_sizes[2] }, - vk_subbuffer{ d_srcs[3], src_offsets[3], src_sizes[3] }, - vk_subbuffer{ d_srcs[4], src_offsets[4], src_sizes[4] }, - vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] }, - vk_subbuffer{ d_srcs[6], src_offsets[6], src_sizes[6] }, - vk_subbuffer{ d_D, dst_offset, dst_size } - }, pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + {src_buf[0], src_buf[1], src_buf[2], src_buf[3], src_buf[4], src_buf[5], src_buf[6], dst_buf}, + pc, elements); } static void ggml_vk_ssm_conv(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { @@ -9653,66 +9381,17 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - ggml_backend_vk_buffer_context * x_buf_ctx = (ggml_backend_vk_buffer_context *)x->buffer->context; - ggml_backend_vk_buffer_context * g_buf_ctx = (ggml_backend_vk_buffer_context *)g->buffer->context; - ggml_backend_vk_buffer_context * gm_buf_ctx = (ggml_backend_vk_buffer_context *)gm->buffer->context; - ggml_backend_vk_buffer_context * gv_buf_ctx = (ggml_backend_vk_buffer_context *)gv->buffer->context; - ggml_backend_vk_buffer_context * p_buf_ctx = (ggml_backend_vk_buffer_context *)p->buffer->context; - - vk_buffer d_X = nullptr, d_G = nullptr, d_GM = nullptr, d_GV = nullptr, d_P = nullptr; - size_t x_offset = 0, g_offset = 0, gm_offset = 0, gv_offset = 0, p_offset = 0; - bool X_uma = false, G_uma = false, GM_uma = false, GV_uma = false, P_uma = false; - - if (ctx->device->uma) { - ggml_vk_host_get(ctx->device, x->data, d_X, x_offset); - ggml_vk_host_get(ctx->device, g->data, d_G, g_offset); - ggml_vk_host_get(ctx->device, gm->data, d_GM, gm_offset); - ggml_vk_host_get(ctx->device, gv->data, d_GV, gv_offset); - ggml_vk_host_get(ctx->device, p->data, d_P, p_offset); - - X_uma = d_X != nullptr; - G_uma = d_G != nullptr; - GM_uma = d_GM != nullptr; - GV_uma = d_GV != nullptr; - P_uma = d_P != nullptr; - } - - if (!X_uma) { - d_X = x_buf_ctx->dev_buffer; - x_offset = vk_tensor_offset(x) + x->view_offs; - } - if (!G_uma) { - d_G = g_buf_ctx->dev_buffer; - g_offset = vk_tensor_offset(g) + g->view_offs; - } - if (!GM_uma) { - d_GM = gm_buf_ctx->dev_buffer; - gm_offset = vk_tensor_offset(gm) + gm->view_offs; - } - if (!GV_uma) { - d_GV = gv_buf_ctx->dev_buffer; - gv_offset = 
vk_tensor_offset(gv) + gv->view_offs; - } - if (!P_uma) { - d_P = p_buf_ctx->dev_buffer; - p_offset = vk_tensor_offset(p) + p->view_offs; - } - - const uint64_t x_size = ggml_nbytes(x); - const uint64_t g_size = ggml_nbytes(g); - const uint64_t gm_size = ggml_nbytes(gm); - const uint64_t gv_size = ggml_nbytes(gv); - const uint64_t p_size = ggml_nbytes(p); + vk_subbuffer x_buf = ggml_vk_tensor_subbuffer(ctx, x); + vk_subbuffer g_buf = ggml_vk_tensor_subbuffer(ctx, g); + vk_subbuffer gm_buf = ggml_vk_tensor_subbuffer(ctx, gm); + vk_subbuffer gv_buf = ggml_vk_tensor_subbuffer(ctx, gv); + vk_subbuffer p_buf = ggml_vk_tensor_subbuffer(ctx, p); std::array elements = { (uint32_t)ggml_nelements(x), 1, 1 }; - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { - vk_subbuffer{ d_X, x_offset, x_size }, - vk_subbuffer{ d_G, g_offset, g_size }, - vk_subbuffer{ d_GM, gm_offset, gm_size }, - vk_subbuffer{ d_GV, gv_offset, gv_size }, - vk_subbuffer{ d_P, p_offset, p_size }, - }, pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + {x_buf, g_buf, gm_buf, gv_buf, p_buf}, + pc, elements); } static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst) { @@ -10044,45 +9723,9 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); - ggml_backend_vk_buffer_context * logits_buf_ctx = (ggml_backend_vk_buffer_context *)logits->buffer->context; - ggml_backend_vk_buffer_context * weights_buf_ctx = (ggml_backend_vk_buffer_context *)weights->buffer->context; - ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context; - - vk_buffer d_logits = nullptr; - size_t logits_buf_offset = 0; - vk_buffer d_weights = nullptr; - size_t weights_buf_offset = 0; - vk_buffer d_ids = nullptr; - size_t ids_buf_offset = 0; - - bool logits_uma = false; - bool weights_uma = false; - bool ids_uma = false; - - if (ctx->device->uma) { - ggml_vk_host_get(ctx->device, logits->data, d_logits, logits_buf_offset); - ggml_vk_host_get(ctx->device, weights->data, d_weights, weights_buf_offset); - ggml_vk_host_get(ctx->device, ids->data, d_ids, ids_buf_offset); - logits_uma = d_logits != nullptr; - weights_uma = d_weights != nullptr; - ids_uma = d_ids != nullptr; - } - - if (!logits_uma) { - d_logits = logits_buf_ctx->dev_buffer; - logits_buf_offset = vk_tensor_offset(logits) + logits->view_offs; - GGML_ASSERT(d_logits != nullptr); - } - if (!weights_uma) { - d_weights = weights_buf_ctx->dev_buffer; - weights_buf_offset = vk_tensor_offset(weights) + weights->view_offs; - GGML_ASSERT(d_weights != nullptr); - } - if (!ids_uma) { - d_ids = ids_buf_ctx->dev_buffer; - ids_buf_offset = vk_tensor_offset(ids) + ids->view_offs; - GGML_ASSERT(d_ids != nullptr); - } + vk_subbuffer logits_buf = ggml_vk_tensor_subbuffer(ctx, logits); + vk_subbuffer weights_buf = ggml_vk_tensor_subbuffer(ctx, weights); + vk_subbuffer ids_buf = ggml_vk_tensor_subbuffer(ctx, ids); vk_op_topk_moe_push_constants pc {}; pc.n_rows = n_rows; @@ -10098,12 +9741,7 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, const uint32_t rows_per_block = 4; std::array elements = { CEIL_DIV(n_rows, rows_per_block), 1, 1 }; - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, - { - ggml_vk_subbuffer(ctx, d_logits, logits_buf_offset), - ggml_vk_subbuffer(ctx, d_weights, weights_buf_offset), - ggml_vk_subbuffer(ctx, d_ids, ids_buf_offset), - }, pc, elements); + ggml_vk_dispatch_pipeline(ctx, 
subctx, pipeline, {logits_buf, weights_buf, ids_buf}, pc, elements); } static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_cgraph * cgraph, int node_idx, bool backprop) { From 299f5d782c8ffd7195a1ed6a6d5561f759beb07e Mon Sep 17 00:00:00 2001 From: bssrdf Date: Fri, 7 Nov 2025 17:41:58 -0500 Subject: [PATCH 23/32] CUDA: properly handle nb00=nb02 case for cpy (#17081) --- ggml/src/ggml-cuda/cpy.cu | 4 +--- tests/test-backend-ops.cpp | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index 1dba60eb1..50612237c 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -198,7 +198,7 @@ static void ggml_cpy_flt_cuda( if (transposed) { GGML_ASSERT(ne == ne00*ne01*ne02); // ne[3] is 1 assumed int ne00n, ne01n, ne02n; - if (nb00 < nb02) { + if (nb00 <= nb02) { // most likely safe to handle nb00 = nb02 case here ne00n = ne00; ne01n = ne01; ne02n = ne02; @@ -206,8 +206,6 @@ static void ggml_cpy_flt_cuda( ne00n = ne00; ne01n = ne01*ne02; ne02n = 1; - } else { - GGML_ASSERT(false); } dim3 dimGrid( (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D, diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index bffd60f38..b9ae82eed 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -6648,6 +6648,7 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0})); test_cases.emplace_back(new test_cont()); test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1})); From 647b960bd8017ee882d6633bc2e43e2ae82ee85c Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Fri, 7 Nov 2025 19:27:20 -0800 Subject: [PATCH 24/32] ggml webgpu: faster matrix multiplication/matrix-vector multiplication (#17031) * Faster tensors (#8) Add fast matrix and matrix/vector multiplication. 
* Use map for shader replacements instead of pair of strings --- .github/workflows/build.yml | 18 +- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 326 +++++++++++++++++- .../ggml-webgpu/wgsl-shaders/embed_wgsl.py | 9 +- .../wgsl-shaders/mul_mat.tmpl.wgsl | 10 +- .../wgsl-shaders/mul_mat_decls.tmpl | 97 ++++++ .../wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl | 247 +++++++++++++ .../mul_mat_subgroup_matrix.tmpl.wgsl | 302 ++++++++++++++++ .../wgsl-shaders/mul_mat_vec.tmpl.wgsl | 267 ++++++++++++++ 8 files changed, 1247 insertions(+), 29 deletions(-) create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 15e113309..36084c550 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -161,15 +161,16 @@ jobs: - name: Dawn Dependency id: dawn-depends run: | - DAWN_VERSION="v1.0.0" + DAWN_VERSION="v2.0.0" DAWN_OWNER="reeselevine" DAWN_REPO="dawn" - DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-macos-latest-Release.tar.gz" + DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.zip" echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}" - curl -L -o artifact.tar.gz \ + curl -L -o artifact.zip \ "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}" mkdir dawn - tar -xvf artifact.tar.gz -C dawn --strip-components=1 + unzip artifact.zip + tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.tar.gz -C dawn --strip-components=1 - name: Build id: cmake_build @@ -521,15 +522,16 @@ jobs: id: dawn-depends run: | sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev - DAWN_VERSION="v1.0.0" + DAWN_VERSION="v2.0.0" DAWN_OWNER="reeselevine" DAWN_REPO="dawn" - DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-ubuntu-latest-Release.tar.gz" + DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.zip" echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}" - curl -L -o artifact.tar.gz \ + curl -L -o artifact.zip \ "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}" mkdir dawn - tar -xvf artifact.tar.gz -C dawn --strip-components=1 + unzip artifact.zip + tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.tar.gz -C dawn --strip-components=1 - name: Build id: cmake_build diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 1a1575673..9e8cbc477 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -73,6 +74,30 @@ // For operations which process a row in parallel, this seems like a reasonable default #define WEBGPU_ROW_SPLIT_WG_SIZE 64 +// Matrix multiplication parameters + +// Register tiling parameters +#define WEBGPU_MUL_MAT_TILE_M 8 +#define WEBGPU_MUL_MAT_TILE_N 8 +#define WEBGPU_MUL_MAT_WG_SIZE_M 8 +#define WEBGPU_MUL_MAT_WG_SIZE_N 8 +#define WEBGPU_MUL_MAT_TILE_K 32 + +// 
Subgroup matrix parameters +// The number of subgroups in the M dimension +#define WEBGPU_MUL_MAT_SUBGROUP_M 2 +// The number of subgroups in the N dimension +#define WEBGPU_MUL_MAT_SUBGROUP_N 2 +// The number of subgroup matrices each subgroup accumulates over +#define WEBGPU_MUL_MAT_SUBGROUP_MATRIX_M 4 +#define WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N 2 + +// Matrix-vector multiplication parameters +#define WEBGPU_MUL_MAT_VEC_WG_SIZE 256 +// Must be multiple of 4 to work with vectorized paths, and must divide mul_mat_vec wg size +#define WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG 64 +#define WEBGPU_MUL_MAT_VEC_TILE_K 256 + /* End Constants */ // This is a "fake" base pointer, since WebGPU buffers do not have pointers to their locations. @@ -236,6 +261,10 @@ struct webgpu_context_struct { wgpu::Queue queue; wgpu::Limits limits; + bool supports_subgroup_matrix = false; + uint32_t subgroup_size; + wgpu::SubgroupMatrixConfig subgroup_matrix_config; + // Separate this out from limits since on some Metal systems, the limit returned by // querying the limits is higher than the actual allowed maximum. uint32_t max_wg_size_x; @@ -247,6 +276,11 @@ struct webgpu_context_struct { webgpu_buf_pool set_rows_error_buf_pool; webgpu_pipeline memset_pipeline; + + std::map>> mul_mat_pipelines; // src0_type, src1_type, vectorized + std::map>> + mul_mat_vec_pipelines; // src0_type, src1_type, vectorized + webgpu_pipeline mul_mat_pipeline[30][2]; webgpu_pipeline set_rows_pipeline[1][2]; // dst->type, vectorized webgpu_pipeline get_rows_pipeline[30]; @@ -321,6 +355,25 @@ struct ggml_backend_webgpu_buffer_context { /* WebGPU object initializations */ +// Process a WGSL shader string, replacing tokens of the form {{KEY}} with +// the corresponding values provided in `repls`. +static std::string ggml_webgpu_process_shader_repls(const char * src, + const std::map & repls) { + if (!src) { + return std::string(); + } + std::string s = src; + for (const auto & kv : repls) { + std::string token = "{{" + kv.first + "}}"; + size_t pos = 0; + while ((pos = s.find(token, pos)) != std::string::npos) { + s.replace(pos, token.length(), kv.second); + pos += kv.second.length(); + } + } + return s; +} + static void ggml_webgpu_create_pipeline(wgpu::Device & device, webgpu_pipeline & pipeline, const char * shader_code, @@ -346,6 +399,30 @@ static void ggml_webgpu_create_pipeline(wgpu::Device & pipeline = { device.CreateComputePipeline(&pipeline_desc), label }; } +static webgpu_pipeline ggml_webgpu_create_pipeline2(wgpu::Device & device, + const char * shader_code, + const char * label, + const std::vector & constants = {}) { + wgpu::ShaderSourceWGSL shader_source; + shader_source.code = shader_code; + + wgpu::ShaderModuleDescriptor shader_desc; + shader_desc.nextInChain = &shader_source; + + wgpu::ShaderModule shader_module = device.CreateShaderModule(&shader_desc); + + wgpu::ComputePipelineDescriptor pipeline_desc; + pipeline_desc.label = label; + pipeline_desc.compute.module = shader_module; + pipeline_desc.compute.entryPoint = "main"; // Entry point in the WGSL code + pipeline_desc.layout = nullptr; // nullptr means auto layout + if (constants.size() > 0) { + pipeline_desc.compute.constants = constants.data(); + pipeline_desc.compute.constantCount = constants.size(); + } + return { device.CreateComputePipeline(&pipeline_desc), label }; +} + static void ggml_webgpu_create_buffer(wgpu::Device & device, wgpu::Buffer & buffer, size_t size, @@ -512,6 +589,7 @@ static webgpu_command ggml_backend_webgpu_build(webgpu_context & std::vector params, 
std::vector bind_group_entries, uint32_t wg_x, + uint32_t wg_y = 1, std::optional set_rows_error_bufs = std::nullopt) { webgpu_pool_bufs params_bufs = ctx->param_buf_pool.alloc_bufs(); @@ -557,7 +635,7 @@ static webgpu_command ggml_backend_webgpu_build(webgpu_context & #endif pass.SetPipeline(pipeline.pipeline); pass.SetBindGroup(0, bind_group); - pass.DispatchWorkgroups(wg_x, 1, 1); + pass.DispatchWorkgroups(wg_x, wg_y, 1); pass.End(); #ifdef GGML_WEBGPU_GPU_PROFILE @@ -779,7 +857,7 @@ static std::optional ggml_webgpu_set_rows(webgpu_context & ctx, uint32_t wg_x = (threads + max_wg_size - 1) / max_wg_size; - return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, error_bufs); + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, 1, error_bufs); } static webgpu_command ggml_webgpu_get_rows(webgpu_context & ctx, @@ -835,8 +913,8 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx, (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), - (uint32_t) dst->ne[1], // number of rows in result (M) - (uint32_t) dst->ne[0], // number of columns in result (N) + (uint32_t) dst->ne[0], // number of rows in result (M, transposed) + (uint32_t) dst->ne[1], // number of columns in result (N) (uint32_t) src0->ne[0], // number of columns in src0/src1 (K) (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)), // stride (elements/blocks) of src0 in dimension 1 (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)), // stride (elements/blocks) of src1 in dimension 1 @@ -865,9 +943,67 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx, .size = ggml_webgpu_tensor_binding_size(ctx, dst) }, }; + webgpu_pipeline pipeline = ctx->mul_mat_pipeline[src0->type][src1->type]; + uint32_t wg_x = (dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE; - return ggml_backend_webgpu_build(ctx, ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x); + uint32_t wg_y = 1; + + bool use_fast = false; + switch (src1->type) { + case GGML_TYPE_F16: + use_fast = (src0->type == GGML_TYPE_F16); + break; + case GGML_TYPE_F32: + switch (src0->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + use_fast = true; + break; + default: + break; + } + break; + default: + break; + } + + if (use_fast) { + int vectorized = src0->ne[0] % 4 == 0 && dst->ne[0] % 4 == 0 && dst->ne[1] % 4 == 0; + if (dst->ne[1] == 1) { + // We don't support vectorized mul_mat_vec for quantized types + vectorized = vectorized && (src0->type < 2); + pipeline = ctx->mul_mat_vec_pipelines[src0->type][src1->type][vectorized]; + uint32_t batches = dst->ne[2] * dst->ne[3]; + uint32_t output_groups = + (dst->ne[0] + WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG - 1) / WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG; + uint32_t total_wg = output_groups * batches; + wg_x = total_wg % ctx->limits.maxComputeWorkgroupsPerDimension; + wg_y = (total_wg + ctx->limits.maxComputeWorkgroupsPerDimension - 1) / + ctx->limits.maxComputeWorkgroupsPerDimension; + } else { + pipeline = ctx->mul_mat_pipelines[src0->type][src1->type][vectorized]; + uint32_t wg_m; + uint32_t wg_n; + if (ctx->supports_subgroup_matrix) { + // The total number of subgroups/workgroups needed per matrix. 
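+                // [editor's note] e.g. with the defaults above (WEBGPU_MUL_MAT_SUBGROUP_M = 2,
+                // WEBGPU_MUL_MAT_SUBGROUP_MATRIX_M = 4) and an 8x8x8 subgroup matrix config,
+                // each workgroup covers a 2 * 4 * 8 = 64-row tile of dst, so
+                // wg_m = ceil(dst->ne[0] / 64); wg_n is derived the same way in N.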
+ uint32_t wg_m_sg_tile = + WEBGPU_MUL_MAT_SUBGROUP_M * WEBGPU_MUL_MAT_SUBGROUP_MATRIX_M * ctx->subgroup_matrix_config.M; + wg_m = (dst->ne[0] + wg_m_sg_tile - 1) / wg_m_sg_tile; + uint32_t wg_n_sg_tile = + WEBGPU_MUL_MAT_SUBGROUP_N * WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N * ctx->subgroup_matrix_config.N; + wg_n = (dst->ne[1] + wg_n_sg_tile - 1) / wg_n_sg_tile; + } else { + uint32_t tile_m_s = WEBGPU_MUL_MAT_TILE_M * WEBGPU_MUL_MAT_WG_SIZE_M; + uint32_t tile_n_s = WEBGPU_MUL_MAT_TILE_N * WEBGPU_MUL_MAT_WG_SIZE_N; + wg_m = (dst->ne[0] + tile_m_s - 1) / tile_m_s; + wg_n = (dst->ne[1] + tile_n_s - 1) / tile_n_s; + } + wg_x = wg_m * wg_n * dst->ne[2] * dst->ne[3]; + } + } + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y); } static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx, @@ -1583,12 +1719,6 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) { } static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F32][GGML_TYPE_F32], - wgsl_mul_mat_f32_f32, "mul_mat_f32_f32"); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F16], - wgsl_mul_mat_f16_f16, "mul_mat_f16_f16"); - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F32], - wgsl_mul_mat_f16_f32, "mul_mat_f16_f32"); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_0][GGML_TYPE_F32], wgsl_mul_mat_q4_0_f32, "mul_mat_q4_0_f32"); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_1][GGML_TYPE_F32], @@ -1627,6 +1757,136 @@ static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { wgsl_mul_mat_iq4_nl_f32, "mul_mat_iq4_nl_f32"); ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_XS][GGML_TYPE_F32], wgsl_mul_mat_iq4_xs_f32, "mul_mat_iq4_xs_f32"); + + if (webgpu_ctx->supports_subgroup_matrix) { + std::map sg_matrix_repls; + sg_matrix_repls["WEBGPU_MAX_SUBGROUP_SIZE"] = std::to_string(webgpu_ctx->subgroup_size); + sg_matrix_repls["WEBGPU_TILE_K"] = std::to_string(WEBGPU_MUL_MAT_TILE_K); + sg_matrix_repls["WEBGPU_SUBGROUP_M"] = std::to_string(WEBGPU_MUL_MAT_SUBGROUP_M); + sg_matrix_repls["WEBGPU_SUBGROUP_N"] = std::to_string(WEBGPU_MUL_MAT_SUBGROUP_N); + sg_matrix_repls["WEBGPU_SUBGROUP_MATRIX_M"] = std::to_string(WEBGPU_MUL_MAT_SUBGROUP_MATRIX_M); + sg_matrix_repls["WEBGPU_SUBGROUP_MATRIX_N"] = std::to_string(WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N); + sg_matrix_repls["WEBGPU_SG_MAT_M_SIZE"] = std::to_string(webgpu_ctx->subgroup_matrix_config.M); + sg_matrix_repls["WEBGPU_SG_MAT_N_SIZE"] = std::to_string(webgpu_ctx->subgroup_matrix_config.N); + sg_matrix_repls["WEBGPU_SG_MAT_K_SIZE"] = std::to_string(webgpu_ctx->subgroup_matrix_config.K); + + std::string proc_mul_mat_subgroup_matrix_f32_f32 = + ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32, sg_matrix_repls); + std::string proc_mul_mat_subgroup_matrix_f32_f32_vec = + ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32_vec, sg_matrix_repls); + std::string proc_mul_mat_subgroup_matrix_f16_f32 = + ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32, sg_matrix_repls); + std::string proc_mul_mat_subgroup_matrix_f16_f32_vec = + ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32_vec, sg_matrix_repls); + std::string 
proc_mul_mat_subgroup_matrix_f16_f16 = + ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16, sg_matrix_repls); + std::string proc_mul_mat_subgroup_matrix_f16_f16_vec = + ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16_vec, sg_matrix_repls); + std::string proc_mul_mat_subgroup_matrix_q4_0_f32 = + ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32, sg_matrix_repls); + std::string proc_mul_mat_subgroup_matrix_q4_0_f32_vec = + ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32_vec, sg_matrix_repls); + + webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2( + webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f32_f32.c_str(), "mul_mat_subgroup_matrix_f32_f32"); + webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] = + ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f32_f32_vec.c_str(), + "mul_mat_subgroup_matrix_f32_f32_vec"); + webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2( + webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f32.c_str(), "mul_mat_subgroup_matrix_f16_f32"); + webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] = + ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f32_vec.c_str(), + "mul_mat_subgroup_matrix_f16_f32_vec"); + webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = ggml_webgpu_create_pipeline2( + webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f16.c_str(), "mul_mat_subgroup_matrix_f16_f16"); + webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] = + ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f16_vec.c_str(), + "mul_mat_subgroup_matrix_f16_f16_vec"); + webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2( + webgpu_ctx->device, proc_mul_mat_subgroup_matrix_q4_0_f32.c_str(), "mul_mat_subgroup_matrix_q4_0_f32"); + webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] = + ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_q4_0_f32_vec.c_str(), + "mul_mat_subgroup_matrix_q4_0_f32_vec"); + } else { + std::vector mul_mat_reg_tile_constants(3); + mul_mat_reg_tile_constants[0].key = "TILE_K"; + mul_mat_reg_tile_constants[0].value = WEBGPU_MUL_MAT_TILE_K; + mul_mat_reg_tile_constants[1].key = "WORKGROUP_SIZE_M"; + mul_mat_reg_tile_constants[1].value = WEBGPU_MUL_MAT_WG_SIZE_M; + mul_mat_reg_tile_constants[2].key = "WORKGROUP_SIZE_N"; + mul_mat_reg_tile_constants[2].value = WEBGPU_MUL_MAT_WG_SIZE_N; + + std::map reg_repls; + reg_repls["WEBGPU_TILE_M"] = std::to_string(WEBGPU_MUL_MAT_TILE_M); + reg_repls["WEBGPU_TILE_N"] = std::to_string(WEBGPU_MUL_MAT_TILE_N); + + // Process each reg-tile shader with tile replacements. + // Keep the processed strings in-scope so .c_str() remains valid. 
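+        // [editor's example, shader line hypothetical] the {{KEY}} tokens are plain
+        // text substitution: with reg_repls["WEBGPU_TILE_M"] = "8", a template line like
+        //     var acc: array<f32, {{WEBGPU_TILE_M}}>;
+        // is emitted as
+        //     var acc: array<f32, 8>;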
+ std::string proc_mul_mat_reg_tile_f32_f32 = + ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32, reg_repls); + std::string proc_mul_mat_reg_tile_f32_f32_vec = + ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32_vec, reg_repls); + std::string proc_mul_mat_reg_tile_f16_f32 = + ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32, reg_repls); + std::string proc_mul_mat_reg_tile_f16_f32_vec = + ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32_vec, reg_repls); + std::string proc_mul_mat_reg_tile_f16_f16 = + ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16, reg_repls); + std::string proc_mul_mat_reg_tile_f16_f16_vec = + ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16_vec, reg_repls); + std::string proc_mul_mat_reg_tile_q4_0_f32 = + ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32, reg_repls); + std::string proc_mul_mat_reg_tile_q4_0_f32_vec = + ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32_vec, reg_repls); + + webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = + ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f32_f32.c_str(), + "mul_mat_reg_tile_f32_f32", mul_mat_reg_tile_constants); + webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] = + ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f32_f32_vec.c_str(), + "mul_mat_reg_tile_f32_f32_vec", mul_mat_reg_tile_constants); + webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = + ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f32.c_str(), + "mul_mat_reg_tile_f16_f32", mul_mat_reg_tile_constants); + webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] = + ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f32_vec.c_str(), + "mul_mat_reg_tile_f16_f32_vec", mul_mat_reg_tile_constants); + webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = + ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f16.c_str(), + "mul_mat_reg_tile_f16_f16", mul_mat_reg_tile_constants); + webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] = + ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f16_vec.c_str(), + "mul_mat_reg_tile_f16_f16_vec", mul_mat_reg_tile_constants); + webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = + ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32.c_str(), + "mul_mat_reg_tile_q4_0_f32", mul_mat_reg_tile_constants); + webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] = + ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32_vec.c_str(), + "mul_mat_reg_tile_q4_0_f32_vec", mul_mat_reg_tile_constants); + } + + std::vector mul_mat_vec_constants(3); + mul_mat_vec_constants[0].key = "WORKGROUP_SIZE"; + mul_mat_vec_constants[0].value = WEBGPU_MUL_MAT_VEC_WG_SIZE; + mul_mat_vec_constants[1].key = "TILE_K"; + mul_mat_vec_constants[1].value = WEBGPU_MUL_MAT_VEC_TILE_K; + mul_mat_vec_constants[2].key = "OUTPUTS_PER_WG"; + mul_mat_vec_constants[2].value = WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG; + + webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2( + webgpu_ctx->device, wgsl_mul_mat_vec_f32_f32, "mul_mat_vec_f32_f32", mul_mat_vec_constants); + webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2( + webgpu_ctx->device, wgsl_mul_mat_vec_f32_f32_vec, 
"mul_mat_vec_f32_f32_vec", mul_mat_vec_constants); + webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2( + webgpu_ctx->device, wgsl_mul_mat_vec_f16_f32, "mul_mat_vec_f16_f32", mul_mat_vec_constants); + webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2( + webgpu_ctx->device, wgsl_mul_mat_vec_f16_f32_vec, "mul_mat_vec_f16_f32_vec", mul_mat_vec_constants); + webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = ggml_webgpu_create_pipeline2( + webgpu_ctx->device, wgsl_mul_mat_vec_f16_f16, "mul_mat_vec_f16_f16", mul_mat_vec_constants); + webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] = ggml_webgpu_create_pipeline2( + webgpu_ctx->device, wgsl_mul_mat_vec_f16_f16_vec, "mul_mat_vec_f16_f16_vec", mul_mat_vec_constants); + webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2( + webgpu_ctx->device, wgsl_mul_mat_vec_q4_0_f32, "mul_mat_vec_q4_0_f32", mul_mat_vec_constants); } static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) { @@ -2124,7 +2384,13 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t webgpu_context ctx = reg_ctx->webgpu_ctx; - wgpu::RequestAdapterOptions options = {}; + // TODO: track need for these toggles: https://issues.chromium.org/issues/42251215 + const char * const adapterEnabledToggles[] = { "vulkan_enable_f16_on_nvidia", "use_vulkan_memory_model" }; + wgpu::DawnTogglesDescriptor adapterTogglesDesc; + adapterTogglesDesc.enabledToggles = adapterEnabledToggles; + adapterTogglesDesc.enabledToggleCount = 2; + wgpu::RequestAdapterOptions options = {}; + options.nextInChain = &adapterTogglesDesc; ctx->instance.WaitAny(ctx->instance.RequestAdapter( &options, wgpu::CallbackMode::AllowSpontaneous, [&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) { @@ -2140,12 +2406,46 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t ctx->adapter.GetLimits(&ctx->limits); ctx->max_wg_size_x = 288; // default value - wgpu::AdapterInfo info{}; + wgpu::AdapterInfo info{}; + wgpu::AdapterPropertiesSubgroupMatrixConfigs subgroup_matrix_configs{}; + if (ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) { + info.nextInChain = &subgroup_matrix_configs; + } ctx->adapter.GetInfo(&info); + wgpu::SupportedFeatures features; + ctx->adapter.GetFeatures(&features); + // we require f16 support + GGML_ASSERT(ctx->adapter.HasFeature(wgpu::FeatureName::ShaderF16)); + + // Only support square f16 matrices of size 8 or 16 for now + bool valid_subgroup_matrix_config = false; + if (ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) { + for (size_t i = 0; i < subgroup_matrix_configs.configCount; i++) { + const wgpu::SubgroupMatrixConfig config = subgroup_matrix_configs.configs[i]; + if (config.M == config.N && config.N == config.K && (config.K == 8 || config.K == 16) && + config.componentType == wgpu::SubgroupMatrixComponentType::F16 && + config.resultComponentType == wgpu::SubgroupMatrixComponentType::F16) { + ctx->subgroup_matrix_config = config; + valid_subgroup_matrix_config = true; + break; + } + } + } + + // For subgroup matrix code to be the most efficient, we would like the subgroup size to be consistent and accurate. + // Unfortunately, that is not possible, so we use the maximum subgroup size reported by the adapter. 
+    ctx->subgroup_size = info.subgroupMaxSize;
+    ctx->supports_subgroup_matrix = valid_subgroup_matrix_config;
+
     // Initialize device
     std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16,
                                                          wgpu::FeatureName::ImplicitDeviceSynchronization };
+    if (ctx->supports_subgroup_matrix) {
+        required_features.push_back(wgpu::FeatureName::Subgroups);
+        required_features.push_back(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix);
+    }
+
 #ifdef GGML_WEBGPU_GPU_PROFILE
     required_features.push_back(wgpu::FeatureName::TimestampQuery);
 #endif
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py
index 251051eae..ed8068d41 100755
--- a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py
@@ -72,9 +72,12 @@ def generate_variants(fname, input_dir, output_dir, outfile):
     except ValueError:
         decls_map = {}
 
-    with open(os.path.join(input_dir, "common_decls.tmpl"), "r", encoding="utf-8") as f:
-        common_decls = f.read()
-        decls_map.update(parse_decls(common_decls))
+    for fname in sorted(os.listdir(input_dir)):
+        if fname.endswith(".tmpl"):
+            tmpl_path = os.path.join(input_dir, fname)
+            with open(tmpl_path, "r", encoding="utf-8") as f_tmpl:
+                decls = f_tmpl.read()
+                decls_map.update(parse_decls(decls))
 
     shader_template = extract_block(text, "SHADER")
     for variant in variants:
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl
index 141db9b39..0f8e6e5ac 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl
@@ -864,8 +864,8 @@ struct MulMatParams {
     broadcast3: u32
 };
 
-@group(0) @binding(0) var<storage, read_write> src0: array<{{SRC0_TYPE}}>; // N rows, K columns
-@group(0) @binding(1) var<storage, read_write> src1: array<{{SRC1_TYPE}}>; // M rows, K columns (transposed)
+@group(0) @binding(0) var<storage, read_write> src0: array<{{SRC0_TYPE}}>; // M rows, K columns
+@group(0) @binding(1) var<storage, read_write> src1: array<{{SRC1_TYPE}}>; // K rows, N columns (transposed)
 @group(0) @binding(2) var<storage, read_write> dst: array<f32>; // M rows, N columns
 @group(0) @binding(3) var<uniform> params: MulMatParams;
 
@@ -891,8 +891,8 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
 
     let dst2_rem = dst3_rem % dst2_stride;
 
-    let row = dst2_rem / params.n; // output row
-    let col = dst2_rem % params.n; // output column
+    let row = dst2_rem / params.m; // output row
+    let col = dst2_rem % params.m; // output column
 
     let src0_idx_base = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02 + col * params.stride_01;
     let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12 + row * params.stride_11;
@@ -901,7 +901,7 @@ fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
     for (var i: u32 = 0u; i < params.k/{{BLOCK_SIZE}}; i = i + 1u) {
         sum += multiply_add(src0_idx_base, src1_idx_base, i);
     }
-    dst[params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.n + col] = sum;
+    dst[params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.m + col] = sum;
 }
 
 #end(SHADER)
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl
new file mode 100644
index 000000000..109ff8d61
--- /dev/null
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl
@@ -0,0 +1,97 @@
+#decl(SHMEM_VEC)
+fn store_shmem(val: vec4<f16>, idx: u32) {
+    shmem[idx] = val.x;
+    shmem[idx + 1] = val.y;
+    shmem[idx + 2] = val.z;
+    shmem[idx + 3] = val.w;
+} +#enddecl(SHMEM_VEC) + +#decl(SHMEM_SCALAR) +fn store_shmem(val: f16, idx: u32) { + shmem[idx] = val; +} +#enddecl(SHMEM_SCALAR) + +#decl(INIT_SRC0_SHMEM_FLOAT) + +fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) { + for (var elem_idx = thread_id * {{VEC_SIZE}}; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE * {{VEC_SIZE}}) { + let tile_m = elem_idx / TILE_K; + let tile_k = elem_idx % TILE_K; + let global_m = offset_m + tile_m; + let global_k = k_outer + tile_k; + let src0_idx = batch_offset + global_m * params.stride_01 + global_k; + let src0_val = select( // taking a slight performance hit to avoid oob + {{SRC0_TYPE}}(0.0), + src0[src0_idx/{{VEC_SIZE}}], + global_m < params.m && global_k < params.k); + store_shmem({{SHMEM_TYPE}}(src0_val), elem_idx); + } +} + +#enddecl(INIT_SRC0_SHMEM_FLOAT) + +#decl(INIT_SRC1_SHMEM) + +fn init_shmem_src1(thread_id: u32, batch_offset: u32, offset_n: u32, k_outer: u32) { + for (var elem_idx = thread_id * {{VEC_SIZE}}; elem_idx < TILE_SRC1_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE * {{VEC_SIZE}}) { + let tile_n = elem_idx / TILE_K; + let tile_k = elem_idx % TILE_K; + let global_n = offset_n + tile_n; + let global_k = k_outer + tile_k; + let src1_idx = batch_offset + global_n * params.stride_11 + global_k; + let src1_val = select( + {{SRC1_TYPE}}(0.0), + src1[src1_idx/{{VEC_SIZE}}], + global_n < params.n && global_k < params.k); + store_shmem({{SHMEM_TYPE}}(src1_val), TILE_SRC0_SHMEM + elem_idx); + } +} + +#enddecl(INIT_SRC1_SHMEM) + +#decl(INIT_SRC0_SHMEM_Q4_0) + +const BLOCK_SIZE = 32u; +// the number of blocks per k-tile. Note that this currently only works if TILE_K is a multiple of BLOCK_SIZE, which may need to be rethought for larger quantized types. +override BLOCKS_K = TILE_K/BLOCK_SIZE; +const NQ = 16u; +const F16_PER_BLOCK = 9u; // 1 scale + 8x4 packed weights +const WEIGHTS_PER_F16 = 4u; // 4 weights per f16 +const F16_PER_THREAD = NQ / WEIGHTS_PER_F16; + +fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) { + for (var i = thread_id * NQ; i < TILE_SRC0_SHMEM; i += TOTAL_WORKGROUP_SIZE * NQ) { + let blck_idx = i / BLOCK_SIZE; + let block_offset = (i % BLOCK_SIZE) / WEIGHTS_PER_F16; + let shmem_idx = blck_idx * BLOCK_SIZE + block_offset * 2u; + + let tile_m = blck_idx / BLOCKS_K; + let global_m = offset_m + tile_m; + let block_k = blck_idx % BLOCKS_K; + let global_k = k_outer / BLOCK_SIZE + block_k; + + if (global_m < params.m && global_k < params.k / BLOCK_SIZE) { + let src0_idx = batch_offset + global_m * params.stride_01 + global_k; + let scale_idx = src0_idx * F16_PER_BLOCK; + let d = src0[scale_idx]; + + for (var j = 0u; j < F16_PER_THREAD; j += 2) { + let q_0 = src0[scale_idx + 1u + block_offset + j]; + let q_1 = src0[scale_idx + 1u + block_offset + j + 1]; + + let q_packed = bitcast(vec2(q_0, q_1)); + for (var k = 0u; k < 4u; k++) { + let q_byte = get_byte(q_packed, k); + let q_hi = (f16((q_byte >> 4) & 0xF) - 8.0) * d; + let q_lo = (f16(q_byte & 0xF) - 8.0) * d; + shmem[shmem_idx + j * 2 + k] = q_lo; + shmem[shmem_idx + j * 2 + k + 16u] = q_hi; + } + } + } + } +} + +#enddecl(INIT_SRC0_SHMEM_Q4_0) diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl new file mode 100644 index 000000000..6b1dd26cd --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl @@ -0,0 +1,247 @@ +#define(VARIANTS) +[ + { + "SHADER_SUFFIX": "f32_f32_vec", + "REPLS": { + "SRC0_TYPE" 
: "vec4", + "SRC1_TYPE" : "vec4", + "DST_TYPE" : "vec4", + "SHMEM_TYPE" : "vec4", + "VEC_SIZE" : 4, + }, + "DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "f32_f32", + "REPLS": { + "SRC0_TYPE" : "f32", + "SRC1_TYPE" : "f32", + "DST_TYPE" : "f32", + "SHMEM_TYPE" : "f16", + "VEC_SIZE" : 1, + }, + "DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "f16_f32_vec", + "REPLS": { + "SRC0_TYPE" : "vec4", + "SRC1_TYPE" : "vec4", + "DST_TYPE" : "vec4", + "SHMEM_TYPE" : "vec4", + "VEC_SIZE" : 4, + }, + "DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "f16_f32", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f32", + "DST_TYPE" : "f32", + "SHMEM_TYPE" : "f16", + "VEC_SIZE" : 1, + }, + "DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "f16_f16_vec", + "REPLS": { + "SRC0_TYPE" : "vec4", + "SRC1_TYPE" : "vec4", + "DST_TYPE" : "vec4", + "SHMEM_TYPE" : "vec4", + "VEC_SIZE" : 4, + }, + "DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "f16_f16", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f16", + "DST_TYPE" : "f32", + "SHMEM_TYPE" : "f16", + "VEC_SIZE" : 1, + }, + "DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "q4_0_f32_vec", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "vec4", + "DST_TYPE" : "vec4", + "SHMEM_TYPE" : "vec4", + "VEC_SIZE" : 4, + }, + "DECLS": ["BYTE_HELPERS", "VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_Q4_0", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "q4_0_f32", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f32", + "DST_TYPE" : "f32", + "SHMEM_TYPE" : "f16", + "VEC_SIZE" : 1, + }, + "DECLS": ["BYTE_HELPERS", "SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_Q4_0", "INIT_SRC1_SHMEM"] + } +] + +#end(VARIANTS) + +#define(DECLS) + +#decl(VEC) +fn store_val(acc: array, TILE_M>, tn: u32, tm: u32) -> vec4 { + return vec4(f32(acc[tm][tn]), f32(acc[tm + 1][tn]), f32(acc[tm + 2][tn]), f32(acc[tm + 3][tn])); +} +#enddecl(VEC) + +#decl(SCALAR) +fn store_val(acc: array, TILE_M>, tn: u32, tm: u32) -> f32 { + return f32(acc[tm][tn]); +} +#enddecl(SCALAR) + +#end(DECLS) + +#define(SHADER) +enable f16; + +struct MulMatParams { + offset_src0: u32, + offset_src1: u32, + offset_dst: u32, + m: u32, + n: u32, + k: u32, + stride_01: u32, + stride_11: u32, + stride_02: u32, + stride_12: u32, + stride_03: u32, + stride_13: u32, + bs02: u32, + bs03: u32, + broadcast2: u32, + broadcast3: u32 +}; + +@group(0) @binding(0) var src0: array<{{SRC0_TYPE}}>; // M rows, K columns +@group(0) @binding(1) var src1: array<{{SRC1_TYPE}}>; // K rows, N columns (transposed) +@group(0) @binding(2) var dst: array<{{DST_TYPE}}>; // M rows, N columns (transposed) + +@group(0) @binding(3) var params: MulMatParams; + +DECLS + +fn get_local_n(thread_id: u32) -> u32 { + return thread_id / WORKGROUP_SIZE_M; +} +fn get_local_m(thread_id: u32) -> u32 { + return thread_id % WORKGROUP_SIZE_M; +} + +// TILE_M must be multiple of 4 for vec4 loads +const TILE_M = {{WEBGPU_TILE_M}}u; +const TILE_N = {{WEBGPU_TILE_N}}u; + +override WORKGROUP_SIZE_M: u32; +override WORKGROUP_SIZE_N: u32; +override TILE_K: u32; + +override TOTAL_WORKGROUP_SIZE = WORKGROUP_SIZE_M * WORKGROUP_SIZE_N; +override TILE_SRC0_SHMEM = TILE_K * WORKGROUP_SIZE_M * TILE_M; +override TILE_SRC1_SHMEM = TILE_K * WORKGROUP_SIZE_N * 
TILE_N; + +var shmem: array; + +@compute @workgroup_size(TOTAL_WORKGROUP_SIZE) +fn main(@builtin(workgroup_id) wg_id: vec3, + @builtin(local_invocation_id) local_id: vec3) { + + let thread_id = local_id.x; + let local_m = get_local_m(thread_id); + let local_n = get_local_n(thread_id); + + let wg_n_count = (params.n + WORKGROUP_SIZE_N * TILE_N - 1u) / (WORKGROUP_SIZE_N * TILE_N); + let wg_m_count = (params.m + WORKGROUP_SIZE_M * TILE_M - 1u) / (WORKGROUP_SIZE_M * TILE_M); + let wg_per_matrix = wg_m_count * wg_n_count; + + let batch_idx = wg_id.x / wg_per_matrix; + + let wg_in_batch = wg_id.x % wg_per_matrix; + let wg_m = wg_in_batch % wg_m_count; + let wg_n = wg_in_batch / wg_m_count; + + let output_row_base = wg_m * WORKGROUP_SIZE_M * TILE_M + local_m * TILE_M; + let output_col_base = wg_n * WORKGROUP_SIZE_N * TILE_N + local_n * TILE_N; + + let dst2_stride = params.m * params.n; + let dst3_stride = dst2_stride * params.bs02 * params.broadcast2; + + let dst3_idx = batch_idx / (params.bs02 * params.broadcast2); + let src03_idx = dst3_idx / params.broadcast3; + let src13_idx = dst3_idx; + let dst2_idx = batch_idx % (params.bs02 * params.broadcast2); + let src02_idx = dst2_idx / params.broadcast2; + let src12_idx = dst2_idx; + + let src0_batch_offset = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02; + let src1_batch_offset = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12; + + let offset_m = wg_m * WORKGROUP_SIZE_M * TILE_M; + let offset_n = wg_n * WORKGROUP_SIZE_N * TILE_N; + + var acc: array, TILE_M>; + + for (var k_outer = 0u; k_outer < params.k; k_outer += TILE_K) { + + // see mul_mat_decls.tmpl + init_shmem_src0(thread_id, src0_batch_offset, offset_m, k_outer); + init_shmem_src1(thread_id, src1_batch_offset, offset_n, k_outer); + + workgroupBarrier(); + + let k_end = min(TILE_K, params.k - k_outer); + + for (var k_inner = 0u; k_inner < k_end; k_inner++) { + var src0_tile: array; + for (var tm = 0u; tm < TILE_M; tm++) { + let src0_m = local_m * TILE_M + tm; + let src0_idx = k_inner + src0_m * TILE_K; + src0_tile[tm] = shmem[src0_idx]; + } + for (var tn = 0u; tn < TILE_N; tn++) { + let src1_n = local_n * TILE_N + tn; + let src1_idx = src1_n * TILE_K + k_inner; + let src1_val = shmem[TILE_SRC0_SHMEM + src1_idx]; + for (var tm = 0u; tm < TILE_M; tm++) { + acc[tm][tn] += src0_tile[tm] * src1_val; + } + } + } + + workgroupBarrier(); + } + + let dst_batch_offset = params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride; + + for (var tn = 0u; tn < TILE_N; tn++) { + let global_col = output_col_base + tn; + if (global_col < params.n) { + for (var tm = 0u; tm < TILE_M; tm += {{VEC_SIZE}}) { + let global_row = output_row_base + tm; + if (global_row < params.m) { + let dst_idx = dst_batch_offset + global_col * params.m + global_row; + dst[dst_idx/{{VEC_SIZE}}] = store_val(acc, tn, tm); + } + } + } + } +} + +#end(SHADER) diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl new file mode 100644 index 000000000..47c8ce36a --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl @@ -0,0 +1,302 @@ +#define(VARIANTS) +[ + { + "SHADER_SUFFIX": "f32_f32_vec", + "REPLS": { + "SRC0_TYPE" : "vec4", + "SRC1_TYPE" : "vec4", + "DST_TYPE" : "vec4", + "SHMEM_TYPE" : "vec4", + "VEC_SIZE" : 4, + }, + "DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "f32_f32", + 
"REPLS": { + "SRC0_TYPE" : "f32", + "SRC1_TYPE" : "f32", + "DST_TYPE" : "f32", + "SHMEM_TYPE" : "f16", + "VEC_SIZE" : 1, + }, + "DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "f16_f32_vec", + "REPLS": { + "SRC0_TYPE" : "vec4", + "SRC1_TYPE" : "vec4", + "DST_TYPE" : "vec4", + "SHMEM_TYPE" : "vec4", + "VEC_SIZE" : 4, + }, + "DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "f16_f32", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f32", + "DST_TYPE" : "f32", + "SHMEM_TYPE" : "f16", + "VEC_SIZE" : 1, + }, + "DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "f16_f16_vec", + "REPLS": { + "SRC0_TYPE" : "vec4", + "SRC1_TYPE" : "vec4", + "DST_TYPE" : "vec4", + "SHMEM_TYPE" : "vec4", + "VEC_SIZE" : 4, + }, + "DECLS": ["VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "f16_f16", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f16", + "DST_TYPE" : "f32", + "SHMEM_TYPE" : "f16", + "VEC_SIZE" : 1, + }, + "DECLS": ["SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_FLOAT", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "q4_0_f32_vec", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "vec4", + "DST_TYPE" : "vec4", + "SHMEM_TYPE" : "vec4", + "VEC_SIZE" : 4, + }, + "DECLS": ["BYTE_HELPERS", "VEC", "SHMEM_VEC", "INIT_SRC0_SHMEM_Q4_0", "INIT_SRC1_SHMEM"] + }, + { + "SHADER_SUFFIX": "q4_0_f32", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f32", + "DST_TYPE" : "f32", + "SHMEM_TYPE" : "f16", + "VEC_SIZE" : 1, + }, + "DECLS": ["BYTE_HELPERS", "SCALAR", "SHMEM_SCALAR", "INIT_SRC0_SHMEM_Q4_0", "INIT_SRC1_SHMEM"] + } +] + +#end(VARIANTS) + +#define(DECLS) + +#decl(VEC) +fn store_dst(shmem_idx: u32, dst_idx: u32) { + dst[dst_idx] = vec4( + f32(shmem[shmem_idx]), + f32(shmem[shmem_idx + 1]), + f32(shmem[shmem_idx + 2]), + f32(shmem[shmem_idx + 3]) + ); +} +#enddecl(VEC) + +#decl(SCALAR) +fn store_dst(shmem_idx: u32, dst_idx: u32) { + dst[dst_idx] = f32(shmem[shmem_idx]); +} +#enddecl(SCALAR) + +#end(DECLS) + +#define(SHADER) +diagnostic(off, chromium.subgroup_matrix_uniformity); +enable f16; +enable subgroups; +enable chromium_experimental_subgroup_matrix; + +struct MulMatParams { + offset_src0: u32, + offset_src1: u32, + offset_dst: u32, + m: u32, + n: u32, + k: u32, + stride_01: u32, + stride_11: u32, + stride_02: u32, + stride_12: u32, + stride_03: u32, + stride_13: u32, + bs02: u32, + bs03: u32, + broadcast2: u32, + broadcast3: u32 +}; + +@group(0) @binding(0) var src0: array<{{SRC0_TYPE}}>; // M rows, K columns +@group(0) @binding(1) var src1: array<{{SRC1_TYPE}}>; // K rows, N columns (transposed) +@group(0) @binding(2) var dst: array<{{DST_TYPE}}>; // M rows, N columns (transposed) + +@group(0) @binding(3) var params: MulMatParams; + +DECLS + +// Note: These are string interpolated at build time, cannot use override constants due to limitations in +// current Dawn version type definitions/matrix load requirements for constant memory sizes. +const SUBGROUP_M = {{WEBGPU_SUBGROUP_M}}u; +const SUBGROUP_N = {{WEBGPU_SUBGROUP_N}}u; +// For portability we assume the max subgroup size, meaning some subgroups will be masked out if the +// runtime subgroup size is smaller. 
+const MAX_SUBGROUP_SIZE = {{WEBGPU_MAX_SUBGROUP_SIZE}}u; + +const EXPECTED_SUBGROUPS = SUBGROUP_M * SUBGROUP_N; + +const SUBGROUP_MATRIX_M_SIZE = {{WEBGPU_SG_MAT_M_SIZE}}u; +const SUBGROUP_MATRIX_N_SIZE = {{WEBGPU_SG_MAT_N_SIZE}}u; +const SUBGROUP_MATRIX_K_SIZE = {{WEBGPU_SG_MAT_K_SIZE}}u; + +const SUBGROUP_MATRIX_M = {{WEBGPU_SUBGROUP_MATRIX_M}}u; +const SUBGROUP_MATRIX_N = {{WEBGPU_SUBGROUP_MATRIX_N}}u; + +const TILE_K = {{WEBGPU_TILE_K}}u; + +const WG_M_SG_TILE_SIZE = SUBGROUP_M * SUBGROUP_MATRIX_M * SUBGROUP_MATRIX_M_SIZE; +const WG_N_SG_TILE_SIZE = SUBGROUP_N * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_N_SIZE; + +const TOTAL_WORKGROUP_SIZE = SUBGROUP_M * SUBGROUP_N * MAX_SUBGROUP_SIZE; +const TILE_SRC0_SHMEM = TILE_K * SUBGROUP_M * SUBGROUP_MATRIX_M * SUBGROUP_MATRIX_M_SIZE; +const TILE_SRC1_SHMEM = TILE_K * SUBGROUP_N * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_N_SIZE; + +const SG_MAT_ACCUM_SHMEM = SUBGROUP_M * SUBGROUP_MATRIX_M * SUBGROUP_N * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_M_SIZE * SUBGROUP_MATRIX_N_SIZE; + +// We reuse shmem for accumulation matrices +const SHMEM_SIZE = max(TILE_SRC0_SHMEM + TILE_SRC1_SHMEM, SG_MAT_ACCUM_SHMEM); + +var shmem: array; + +@compute @workgroup_size(TOTAL_WORKGROUP_SIZE) +fn main(@builtin(workgroup_id) wg_id: vec3, + @builtin(local_invocation_id) local_id: vec3, + @builtin(subgroup_id) subgroup_id: u32) { + + let thread_id = local_id.x; + let subgroup_m = subgroup_id % SUBGROUP_M; + let subgroup_n = subgroup_id / SUBGROUP_M; + + let wg_m_count = (params.m + WG_M_SG_TILE_SIZE - 1) / WG_M_SG_TILE_SIZE; + let wg_n_count = (params.n + WG_N_SG_TILE_SIZE - 1) / WG_N_SG_TILE_SIZE; + let wg_per_matrix = wg_m_count * wg_n_count; + + let batch_idx = wg_id.x / wg_per_matrix; + + let wg_in_batch = wg_id.x % wg_per_matrix; + let wg_m = wg_in_batch % wg_m_count; + let wg_n = wg_in_batch / wg_m_count; + + let dst2_stride = params.m * params.n; + let dst3_stride = dst2_stride * params.bs02 * params.broadcast2; + + let dst3_idx = batch_idx / (params.bs02 * params.broadcast2); + let src03_idx = dst3_idx / params.broadcast3; + let src13_idx = dst3_idx; + let dst2_idx = batch_idx % (params.bs02 * params.broadcast2); + let src02_idx = dst2_idx / params.broadcast2; + let src12_idx = dst2_idx; + + let src0_batch_offset = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02; + let src1_batch_offset = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12; + + let offset_m = wg_m * SUBGROUP_M * SUBGROUP_MATRIX_M * SUBGROUP_MATRIX_M_SIZE; + let offset_n = wg_n * SUBGROUP_N * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_N_SIZE; + + var acc_sg_mat : array, SUBGROUP_MATRIX_N>, SUBGROUP_MATRIX_M>; + + for (var k_outer = 0u; k_outer < params.k; k_outer += TILE_K) { + + // see mul_mat_decls.tmpl + init_shmem_src0(thread_id, src0_batch_offset, offset_m, k_outer); + init_shmem_src1(thread_id, src1_batch_offset, offset_n, k_outer); + + workgroupBarrier(); + + if (subgroup_id < EXPECTED_SUBGROUPS) { + + for (var k_inner = 0u; k_inner < TILE_K; k_inner += SUBGROUP_MATRIX_K_SIZE) { + + let src0_shmem_idx_base = subgroup_m * SUBGROUP_MATRIX_M * SUBGROUP_MATRIX_M_SIZE * TILE_K + k_inner; + var src0_sg_mats: array, SUBGROUP_MATRIX_M>; + for (var m = 0u; m < SUBGROUP_MATRIX_M; m++) { + src0_sg_mats[m] = subgroupMatrixLoad>( + &shmem, + src0_shmem_idx_base + m * SUBGROUP_MATRIX_M_SIZE * TILE_K, + false, + TILE_K + ); + } + + let src1_shmem_idx_base = TILE_SRC0_SHMEM + subgroup_n * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_N_SIZE * TILE_K + k_inner; + for (var n = 
0u; n < SUBGROUP_MATRIX_N; n++) { + let src1_sg_mat = subgroupMatrixLoad>( + &shmem, + src1_shmem_idx_base + n * SUBGROUP_MATRIX_N_SIZE * TILE_K, + true, + TILE_K + ); + for (var m = 0u; m < SUBGROUP_MATRIX_M; m++) { + acc_sg_mat[m][n] = subgroupMatrixMultiplyAccumulate(src0_sg_mats[m], src1_sg_mat, acc_sg_mat[m][n]); + } + } + } + } + + workgroupBarrier(); + } + + let dst_batch_offset = params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride; + + // Stage the subgroup matrix tiles into shared memory + // This uses WG_M_SG_TILE_SIZE as the stride (number of columns in the workgroup tile). + let WG_TILE_STRIDE = WG_M_SG_TILE_SIZE; + let tile_row_base_local = subgroup_n * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_N_SIZE; + let tile_col_base_local = subgroup_m * SUBGROUP_MATRIX_M * SUBGROUP_MATRIX_M_SIZE; + + if (subgroup_id < EXPECTED_SUBGROUPS) { // 2-5% performance hit :( + for (var n = 0u; n < SUBGROUP_MATRIX_N; n++) { + for (var m = 0u; m < SUBGROUP_MATRIX_M; m++) { + let local_row = tile_row_base_local + n * SUBGROUP_MATRIX_N_SIZE; + let local_col = tile_col_base_local + m * SUBGROUP_MATRIX_M_SIZE; + let out_base = local_row * WG_TILE_STRIDE + local_col; + subgroupMatrixStore(&shmem, out_base, acc_sg_mat[m][n], true, WG_TILE_STRIDE); + } + } + } + + workgroupBarrier(); + + // Cooperative write: iterate over the entire workgroup tile + let tile_rows = WG_N_SG_TILE_SIZE; + let tile_cols = WG_M_SG_TILE_SIZE; + let total_tile_elems = tile_rows * tile_cols; + let tile_dst_row_base = wg_m * SUBGROUP_M * SUBGROUP_MATRIX_M * SUBGROUP_MATRIX_M_SIZE; + let tile_dst_col_base = wg_n * SUBGROUP_N * SUBGROUP_MATRIX_N * SUBGROUP_MATRIX_N_SIZE; + + for (var idx = thread_id * {{VEC_SIZE}}; idx < total_tile_elems; idx += TOTAL_WORKGROUP_SIZE * {{VEC_SIZE}}) { + let local_row = idx % WG_TILE_STRIDE; + let local_col = idx / WG_TILE_STRIDE; + + let global_row = tile_dst_row_base + local_row; + let global_col = tile_dst_col_base + local_col; + + if (global_col < params.n && global_row < params.m) { + let dst_idx = dst_batch_offset + global_col * params.m + global_row; + store_dst(idx, dst_idx/{{VEC_SIZE}}); + } + } +} + +#end(SHADER) diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl new file mode 100644 index 000000000..ffbb64032 --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl @@ -0,0 +1,267 @@ +#define(VARIANTS) +[ + { + "SHADER_SUFFIX": "f32_f32_vec", + "REPLS": { + "SRC0_TYPE" : "vec4", + "SRC1_TYPE" : "vec4", + "DST_TYPE": "vec4", + "VEC_SIZE" : 4, + }, + "DECLS": ["VEC", "MUL_ACC_FLOAT"] + }, + { + "SHADER_SUFFIX": "f32_f32", + "REPLS": { + "SRC0_TYPE" : "f32", + "SRC1_TYPE" : "f32", + "DST_TYPE": "f32", + "VEC_SIZE" : 1, + }, + "DECLS": ["SCALAR", "MUL_ACC_FLOAT"] + }, + { + "SHADER_SUFFIX": "f16_f32_vec", + "REPLS": { + "SRC0_TYPE" : "vec4", + "SRC1_TYPE" : "vec4", + "DST_TYPE": "vec4", + "VEC_SIZE" : 4, + }, + "DECLS": ["VEC", "MUL_ACC_FLOAT"] + }, + { + "SHADER_SUFFIX": "f16_f32", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f32", + "DST_TYPE": "f32", + "VEC_SIZE" : 1, + }, + "DECLS": ["SCALAR", "MUL_ACC_FLOAT"] + }, + { + "SHADER_SUFFIX": "f16_f16_vec", + "REPLS": { + "SRC0_TYPE" : "vec4", + "SRC1_TYPE" : "vec4", + "DST_TYPE": "vec4", + "VEC_SIZE" : 4, + }, + "DECLS": ["VEC", "MUL_ACC_FLOAT"] + }, + { + "SHADER_SUFFIX": "f16_f16", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f16", + "DST_TYPE": "f32", + "VEC_SIZE" : 1, + }, + "DECLS": ["SCALAR", "MUL_ACC_FLOAT"] + }, + { + 
"SHADER_SUFFIX": "q4_0_f32", + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f32", + "DST_TYPE": "f32", + "VEC_SIZE" : 1, + }, + "DECLS": ["BYTE_HELPERS", "SCALAR", "MUL_ACC_Q4_0"] + } +] + +#end(VARIANTS) + +#define(DECLS) + +#decl(VEC) +fn inner_dot(src0_val: {{SRC0_TYPE}}, src1_val: {{SRC1_TYPE}}) -> f32 { + return f32(dot({{SRC1_TYPE}}(src0_val), src1_val)); +} + +fn store_val(group_base: u32) -> vec4 { + return vec4(partial_sums[group_base], + partial_sums[group_base + THREADS_PER_OUTPUT], + partial_sums[group_base + THREADS_PER_OUTPUT * 2], + partial_sums[group_base + THREADS_PER_OUTPUT * 3]); +} +#enddecl(VEC) + +#decl(SCALAR) +fn inner_dot(src0_val: {{SRC0_TYPE}}, src1_val: {{SRC1_TYPE}}) -> f32 { + return f32(src0_val) * f32(src1_val); +} + +fn store_val(group_base: u32) -> f32 { + return partial_sums[group_base]; +} +#enddecl(SCALAR) + +#decl(MUL_ACC_FLOAT) + +fn mul_acc(tig:u32, tile_size: u32, idx_base: u32, k_outer: u32) -> f32 { + var local_sum = 0.0; + for (var i = tig * {{VEC_SIZE}}; i < tile_size; i += THREADS_PER_OUTPUT * {{VEC_SIZE}}) { + let a = src0[(idx_base + k_outer + i) / {{VEC_SIZE}}]; + let b = shared_vector[i / {{VEC_SIZE}}]; + local_sum += inner_dot(a, b); + } + return local_sum; +} + +#enddecl(MUL_ACC_FLOAT) + +#decl(MUL_ACC_Q4_0) + +const BLOCK_SIZE = 32; +const NQ = 16u; // number of weights per thread +const F16_PER_BLOCK = 9u; // 1 scale + 8x4 packed weights +const WEIGHTS_PER_F16 = 4u; // 4 weights per f16 +const F16_PER_THREAD = NQ / WEIGHTS_PER_F16; + +fn mul_acc(tig:u32, tile_size: u32, idx_base: u32, k_outer: u32) -> f32 { + var local_sum = 0.0; + for (var i = tig * NQ; i < tile_size; i += THREADS_PER_OUTPUT * NQ) { + let blck_idx = i / BLOCK_SIZE; + let block_offset = (i % BLOCK_SIZE) / WEIGHTS_PER_F16; + let scale_idx = (idx_base + k_outer / BLOCK_SIZE + blck_idx) * F16_PER_BLOCK; + // each f16 contains offsets [block_offset, block_offset + 1] and [block_offset + 16, block_offset + 17] + let shmem_idx = blck_idx * BLOCK_SIZE + block_offset * 2u; + let d = f32(src0[scale_idx]); + for (var j = 0u; j < F16_PER_THREAD; j += 2) { + let q_0 = src0[scale_idx + 1 + block_offset + j]; + let q_1 = src0[scale_idx + 1 + block_offset + j + 1]; + let q_packed = bitcast(vec2(q_0, q_1)); + for (var k: u32 = 0; k < 4; k++) { + let q_byte = get_byte(q_packed, k); + let q_hi = (f32((q_byte >> 4) & 0xF) - 8.0) * d; + let q_lo = (f32(q_byte & 0xF) - 8.0) * d; + local_sum += q_lo * shared_vector[shmem_idx + j * 2 + k]; + local_sum += q_hi * shared_vector[shmem_idx + j * 2 + k + 16]; + } + } + } + return local_sum; +} + +#enddecl(MUL_ACC_Q4_0) + +#end(DECLS) + +#define(SHADER) +enable f16; + +DECLS + +struct MulMatParams { + offset_src0: u32, + offset_src1: u32, + offset_dst: u32, + m: u32, + n: u32, + k: u32, + stride_01: u32, + stride_11: u32, + stride_02: u32, + stride_12: u32, + stride_03: u32, + stride_13: u32, + bs02: u32, + bs03: u32, + broadcast2: u32, + broadcast3: u32 +}; + +@group(0) @binding(0) var src0: array<{{SRC0_TYPE}}>; // Matrix (M x K) +@group(0) @binding(1) var src1: array<{{SRC1_TYPE}}>; // Vector (K x 1, transposed) +@group(0) @binding(2) var dst: array<{{DST_TYPE}}>; // Result vector (transposed) + +@group(0) @binding(3) var params: MulMatParams; + +override WORKGROUP_SIZE: u32; +override TILE_K: u32; +override OUTPUTS_PER_WG: u32; +override THREADS_PER_OUTPUT = WORKGROUP_SIZE / OUTPUTS_PER_WG; + +// Shared memory for collaborative loading and reduction +var shared_vector: array<{{SRC1_TYPE}}, TILE_K/{{VEC_SIZE}}>; // Cache vector tile +var 
partial_sums: array; // For reduction + +@compute @workgroup_size(WORKGROUP_SIZE) +fn main( + @builtin(local_invocation_id) local_id: vec3, + @builtin(workgroup_id) wg_id: vec3, + @builtin(num_workgroups) num_wg: vec3) { + let thread_id = local_id.x; + + // Handle batch dimensions + let total_batches = params.bs02 * params.broadcast2 * params.bs03 * params.broadcast3; + let wg_linear = wg_id.y * num_wg.x + wg_id.x; + let output_groups = (params.m + OUTPUTS_PER_WG - 1u) / OUTPUTS_PER_WG; + let batch_idx = wg_linear / output_groups; + if (batch_idx >= total_batches) { + return; + } + + // Which of the outputs does this thread belong to? + let thread_group = thread_id / THREADS_PER_OUTPUT; + let thread_in_group = thread_id % THREADS_PER_OUTPUT; + + // Each workgroup computes OUTPUTS_PER_WG consecutive outputs + let output_row = (wg_linear % output_groups) * OUTPUTS_PER_WG + thread_group; + + let dst2_stride = params.m * params.n; + let dst2_idx = batch_idx % (params.bs02 * params.broadcast2); + let dst3_stride = dst2_stride * params.bs02 * params.broadcast2; + let dst3_idx = batch_idx / (params.bs02 * params.broadcast2); + let src03_idx = dst3_idx / params.broadcast3; + let src13_idx = dst3_idx; + let src02_idx = dst2_idx / params.broadcast2; + let src12_idx = dst2_idx; + + let src0_idx_base = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02 + output_row * params.stride_01; + let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12; + let dst_idx = params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + output_row; + + var local_sum = 0.0; + + // Each thread processes multiple K elements and accumulates + for (var k_tile = 0u; k_tile < params.k; k_tile += TILE_K) { + let tile_size = min(TILE_K, params.k - k_tile); + + // Cooperatively load vector tile into shared memory (all threads) + for (var i = thread_id * {{VEC_SIZE}}; i < tile_size; i += WORKGROUP_SIZE * {{VEC_SIZE}}) { + shared_vector[i / {{VEC_SIZE}}] = src1[(src1_idx_base + k_tile + i) / {{VEC_SIZE}}]; + } + + workgroupBarrier(); + + if (output_row < params.m) { + local_sum += mul_acc(thread_in_group, tile_size, src0_idx_base, k_tile); + } + + workgroupBarrier(); + } + + // Store partial sums and reduce within each partition + partial_sums[thread_id] = local_sum; + workgroupBarrier(); + let group_base = thread_group * THREADS_PER_OUTPUT; + let thread_base = group_base + thread_in_group; + var offset = THREADS_PER_OUTPUT / 2; + while (offset > 0) { + if (thread_in_group < offset) { + partial_sums[thread_base] += partial_sums[thread_base + offset]; + } + offset = offset / 2; + workgroupBarrier(); + } + + // Store back to global memory + if (output_row < params.m && thread_group % {{VEC_SIZE}} == 0 && thread_in_group == 0) { + dst[dst_idx / {{VEC_SIZE}}] = store_val(group_base); + } +} +#end(SHADER) From e14e842e87103ec2a004770cec95a3f94f861bda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 8 Nov 2025 08:26:18 +0100 Subject: [PATCH 25/32] CUDA: fix MMQ stream-k fixup ne1 indices (#17089) --- ggml/src/ggml-cuda/mmq.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index c9a07e82f..2e133b6bd 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -3494,7 +3494,7 @@ static __global__ void mul_mat_q_stream_k_fixup( const int col_diff = col_high - col_low; for (int j = threadIdx.y*warp_size + threadIdx.x; j < mmq_x; j += 
nwarps*warp_size) {
-        ids_dst_shared[j] = ids_dst[col_low + j];
+        ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
     }
 
     __syncthreads();

From d6fe40fa0040bf7207b2380d0f3628fb56d6a86f Mon Sep 17 00:00:00 2001
From: Jeff Bolz
Date: Sat, 8 Nov 2025 01:39:45 -0600
Subject: [PATCH 26/32] vulkan: Fix test-thread-safety crashes (#17024)

The std::map pipeline_flash_attn_f32_f16 could be searched and inserted
into at the same time, so both accesses need to hold the lock. To be
safe, hold the lock for all of ggml_vk_load_shaders.
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index a0a05f2e5..2646e80be 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -130,9 +130,9 @@ struct vk_pipeline_struct {
     // true if fields have been set by ggml_vk_create_pipeline
     bool initialized {};
     // set to true to request the pipeline is compiled
-    bool needed {};
+    std::atomic<bool> needed {};
     // set to true when the shader has been compiled
-    bool compiled {};
+    std::atomic<bool> compiled {};
     // number of registers used, extracted from pipeline executable properties
     uint32_t register_count {};
 };
@@ -1842,10 +1842,7 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
         }
     }
 
-    {
-        std::lock_guard guard(device->mutex);
-        device->all_pipelines.push_back(pipeline);
-    }
+    device->all_pipelines.push_back(pipeline);
 
     {
         std::lock_guard guard(compile_count_mutex);
@@ -2536,6 +2533,7 @@ static uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_dev
 
 static void ggml_vk_load_shaders(vk_device& device) {
     VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
+    std::lock_guard guard(device->mutex);
 
     // some shaders have a minimum subgroup size
     const uint32_t subgroup_size_8 = std::max(device->subgroup_size, 8u);
     const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u);
@@ -2729,6 +2727,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
         if (!pipeline->needed || pipeline->compiled) {
             return;
         }
+        // TODO: We're no longer benefitting from the async compiles (shaders are
+        // compiled individually, as needed) and this complexity can be removed.
         {
             // wait until fewer than N compiles are in progress
             uint32_t N = std::max(1u, std::thread::hardware_concurrency());
@@ -7914,12 +7914,15 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
 
     vk_pipeline pipeline = nullptr;
 
-    auto &pipelines = ctx->device->pipeline_flash_attn_f32_f16[k->type];
-    auto it = pipelines.find(fa_pipeline_state);
-    if (it != pipelines.end()) {
-        pipeline = it->second;
-    } else {
-        pipelines[fa_pipeline_state] = pipeline = std::make_shared<vk_pipeline_struct>();
+    {
+        std::lock_guard guard(ctx->device->mutex);
+        auto &pipelines = ctx->device->pipeline_flash_attn_f32_f16[k->type];
+        auto it = pipelines.find(fa_pipeline_state);
+        if (it != pipelines.end()) {
+            pipeline = it->second;
+        } else {
+            pipelines[fa_pipeline_state] = pipeline = std::make_shared<vk_pipeline_struct>();
+        }
     }
 
     assert(pipeline);

From b4e335d8dc503ec0adf76fa4053ab7094b6310dd Mon Sep 17 00:00:00 2001
From: Jeff Bolz
Date: Sat, 8 Nov 2025 01:52:15 -0600
Subject: [PATCH 27/32] vulkan: fuse rms_norm + mul + rope (+ view + set_rows) (#16977)

This change combines the rms_norm+mul and rope+view+set_rows fusions to
allow fusing the whole sequence together. This comes up in Qwen3, Bailing,
and some other models.
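
As a rough sketch (the tensor names and parameter values below are
illustrative, not taken from this patch), the fused node sequence
corresponds to graph-building code of this shape:

    // hypothetical attention prologue, as in Qwen3-style models
    ggml_tensor * cur = ggml_rms_norm(ctx0, attn_input, rms_eps);   // GGML_OP_RMS_NORM
    cur = ggml_mul(ctx0, cur, attn_norm_weight);                    // GGML_OP_MUL
    cur = ggml_rope_ext(ctx0, cur, pos, freq_factors,               // GGML_OP_ROPE
                        n_rot, GGML_ROPE_TYPE_NEOX, n_ctx_orig,
                        freq_base, freq_scale, ext_factor,
                        attn_factor, beta_fast, beta_slow);
    // when the result is written straight into the KV cache, this is
    // followed by GGML_OP_VIEW + GGML_OP_SET_ROWS, which the backend can
    // now fuse into the same dispatch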
--- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 290 ++++++++++++++++-- .../vulkan-shaders/generic_binary_head.glsl | 8 + .../ggml-vulkan/vulkan-shaders/rms_norm.comp | 44 ++- .../vulkan-shaders/rope_funcs.glsl | 227 ++++++++++++++ .../ggml-vulkan/vulkan-shaders/rope_head.glsl | 56 +--- .../vulkan-shaders/rope_multi.comp | 67 +--- .../ggml-vulkan/vulkan-shaders/rope_neox.comp | 45 +-- .../ggml-vulkan/vulkan-shaders/rope_norm.comp | 45 +-- .../vulkan-shaders/rope_params.glsl | 27 ++ .../vulkan-shaders/rope_vision.comp | 44 +-- .../vulkan-shaders/vulkan-shaders-gen.cpp | 34 +- tests/test-backend-ops.cpp | 89 ++++++ 12 files changed, 695 insertions(+), 281 deletions(-) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 2646e80be..9c2aeb57f 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -466,6 +466,14 @@ static constexpr std::initializer_list> rope_view_set_rows_ed { 2, 0, 1 }, // set_rows->src[0] == view }; +static constexpr std::initializer_list> rms_norm_mul_rope_view_set_rows_edges { + { 1, 0, 0 }, // mul->src[0] == rms + { 2, 0, 1 }, // rope->src[0] == mul + { 3, 0, 2 }, // view->src[0] == rope + { 4, 0, 3 }, // set_rows->src[0] == view +}; + + struct vk_device_struct { std::recursive_mutex mutex; @@ -617,6 +625,8 @@ struct vk_device_struct { vk_pipeline pipeline_rms_norm_mul_f32; vk_pipeline pipeline_rms_norm_partials_f32; vk_pipeline pipeline_rms_norm_mul_partials_f32; + vk_pipeline pipeline_rms_norm_mul_rope_f32_f32; + vk_pipeline pipeline_rms_norm_mul_rope_f32_f16; vk_pipeline pipeline_rms_norm_back_f32; vk_pipeline pipeline_l2_norm_f32; @@ -1060,6 +1070,7 @@ struct vk_op_diag_mask_push_constants { }; struct vk_op_rope_push_constants { + uint32_t rope_mode; uint32_t ncols; uint32_t n_dims; float freq_scale; @@ -1079,6 +1090,12 @@ struct vk_op_rope_push_constants { uint32_t set_rows_stride; }; +// For fused rms_norm+mul+rope(+view+set_rows) +struct vk_op_rms_norm_mul_rope_push_constants { + vk_op_binary_push_constants bin; + vk_op_rope_push_constants rope; +}; + struct vk_op_soft_max_push_constants { uint32_t KX; uint32_t KY; @@ -3557,6 +3574,12 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_rms_norm_partials_f32, "rms_norm_partials_f32", rms_norm_partials_f32_len, rms_norm_partials_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 0}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_partials_f32, "rms_norm_mul_partials_f32", rms_norm_partials_f32_len, rms_norm_partials_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 1}, 1, true); + if (device->float_controls_rte_fp16 && + sizeof(vk_op_rms_norm_mul_rope_push_constants) <= device->properties.limits.maxPushConstantsSize) { + ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_rope_f32_f32, "rms_norm_mul_rope_f32_f32", rms_norm_mul_rope_f32_f32_len, rms_norm_mul_rope_f32_f32_data, "main", 7, sizeof(vk_op_rms_norm_mul_rope_push_constants), {1, 1, 1}, {0, 1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_rope_f32_f16, "rms_norm_mul_rope_f32_f16", rms_norm_mul_rope_f32_f16_rte_len, rms_norm_mul_rope_f32_f16_rte_data, "main", 7, sizeof(vk_op_rms_norm_mul_rope_push_constants), {1, 1, 1}, {0, 1}, 1, true); + } + ggml_vk_create_pipeline(device, 
device->pipeline_rms_norm_back_f32, "rms_norm_back_f32", rms_norm_back_f32_len, rms_norm_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_l2_norm_f32, "l2_norm_f32", l2_norm_f32_len, l2_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); @@ -9590,21 +9613,149 @@ static uint32_t ggml_vk_rms_partials_size(ggml_backend_vk_context * ctx, const g return num_bytes; } -static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, float * op_params) { +static vk_op_rope_push_constants ggml_vk_make_rope_constants(const ggml_tensor *dst, const ggml_tensor *src0, const bool has_ff, bool backprop, const uint32_t set_rows_stride) { + const int n_dims = ((const int32_t *) dst->op_params)[1]; + const int mode = ((const int32_t *) dst->op_params)[2]; + // const int n_ctx = ((const int32_t *) dst->op_params)[3]; + const int n_ctx_orig = ((const int32_t *) dst->op_params)[4]; + const float freq_base = ((const float *) dst->op_params)[5]; + const float freq_scale = ((const float *) dst->op_params)[6]; + const float ext_factor = ((const float *) dst->op_params)[7]; + const float attn_factor = ((const float *) dst->op_params)[8]; + const float beta_fast = ((const float *) dst->op_params)[9]; + const float beta_slow = ((const float *) dst->op_params)[10]; + int sections[4] {}; + if (mode & GGML_ROPE_TYPE_MROPE) { + memcpy(sections, (const int32_t *) dst->op_params + 11, sizeof(int)*4); + } + + const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; + + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + + uint32_t nb01 = src0->nb[1] / ggml_type_size(src0->type); + uint32_t nb02 = src0->nb[2] / ggml_type_size(src0->type); + + vk_op_rope_push_constants rope { + (uint32_t)mode, (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], + freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale, + has_ff, (uint32_t)src0->ne[2], nb01, nb02, + { sections[0], sections[1], sections[2], sections[3] }, is_imrope, backprop, set_rows_stride, + }; + + return rope; +} + +static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const struct ggml_cgraph * cgraph, int node_idx, float * op_params) { + ggml_tensor * dst; + const ggml_tensor * src0; + const ggml_tensor * src1; + + if (ctx->num_additional_fused_ops > 0) { + // fused rms_norm + mul + ggml_tensor *mul = cgraph->nodes[node_idx + 1]; + ggml_tensor *other_src = mul->src[0] == cgraph->nodes[node_idx + 0] ? mul->src[1] : mul->src[0]; + dst = mul; + src0 = cgraph->nodes[node_idx]->src[0]; + src1 = other_src; + } else { + dst = cgraph->nodes[node_idx]; + src0 = src1 = dst->src[0]; + } + const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); uint32_t param3 = ctx->do_add_rms_partials ? 
ggml_vk_rms_num_partials(ctx, dst) : 0; - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { + vk_op_binary_push_constants bin { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, op_params[0], 0.0f, (int32_t)param3, - }); + }; + + // more than one fused op means rms_norm+mul+rope + if (ctx->num_additional_fused_ops > 1) { + static constexpr uint32_t max_tensors = 7; + const ggml_tensor *tensors[max_tensors] {}; + + ggml_tensor *rms = cgraph->nodes[node_idx + 0]; + ggml_tensor *mul = cgraph->nodes[node_idx + 1]; + ggml_tensor *rope = cgraph->nodes[node_idx + 2]; + + ggml_tensor *other_src = mul->src[0] == rms ? mul->src[1] : mul->src[0]; + + bool do_set_rows = ctx->num_additional_fused_ops == 4; + + tensors[0] = rms->src[0]; + tensors[1] = other_src; + tensors[2] = mul; + tensors[3] = rope->src[1]; // pos + tensors[4] = rope->src[2]; // ff + tensors[5] = cgraph->nodes[node_idx + ctx->num_additional_fused_ops]; // dst + tensors[6] = do_set_rows ? tensors[5]->src[1] : nullptr; + const uint32_t set_rows_stride = do_set_rows ? tensors[5]->nb[1] / ggml_type_size(tensors[5]->type) : 0; + + vk_op_rms_norm_mul_rope_push_constants pc; + pc.bin = bin; + pc.rope = ggml_vk_make_rope_constants(rope, rope->src[0], tensors[4] != nullptr, false, set_rows_stride); + + vk_pipeline pipeline = tensors[5]->type == GGML_TYPE_F16 ? 
ctx->device->pipeline_rms_norm_mul_rope_f32_f16 : ctx->device->pipeline_rms_norm_mul_rope_f32_f32; + + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + + ggml_backend_vk_buffer_context * buf_ctx[max_tensors]; + vk_buffer buf[max_tensors]; + size_t offset[max_tensors]; + bool uma[max_tensors]; + + for (uint32_t i = 0; i < max_tensors; ++i) { + if (!tensors[i]) { + // If any remaining descriptors are unused, just point them at src[0] + buf[i] = buf[0]; + offset[i] = 0; + continue; + } + buf_ctx[i] = (ggml_backend_vk_buffer_context *)tensors[i]->buffer->context; + buf[i] = nullptr; + offset[i] = 0; + uma[i] = false; + + if (ctx->device->uma) { + ggml_vk_host_get(ctx->device, tensors[i]->data, buf[i], offset[i]); + uma[i] = buf[i] != nullptr; + } + if (!uma[i]) { + buf[i] = buf_ctx[i]->dev_buffer; + offset[i] = vk_tensor_offset(tensors[i]) + tensors[i]->view_offs; + } + GGML_ASSERT(buf[i] != nullptr); + } + + std::array elements; + elements = { (uint32_t)rms->src[0]->ne[1], (uint32_t)rms->src[0]->ne[2], (uint32_t)rms->src[0]->ne[3] }; + + static_assert(max_tensors == 7); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + { + ggml_vk_subbuffer(ctx, buf[0], offset[0]), + ggml_vk_subbuffer(ctx, buf[1], offset[1]), + ggml_vk_subbuffer(ctx, buf[2], offset[2]), + ggml_vk_subbuffer(ctx, buf[3], offset[3]), + ggml_vk_subbuffer(ctx, buf[4], offset[4]), + ggml_vk_subbuffer(ctx, buf[5], offset[5]), + ggml_vk_subbuffer(ctx, buf[6], offset[6]), + }, pc, elements); + } else { + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_RMS_NORM, std::move(bin)); + } if (ctx->do_add_rms_partials_offset_calculation) { ctx->prealloc_size_add_rms_partials_offset += ggml_vk_rms_partials_size(ctx, src0); @@ -9758,9 +9909,6 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, cons // const int n_ctx = ((int32_t *) dst->op_params)[3]; const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; const float freq_base = ((float *) dst->op_params)[5]; - const float freq_scale = ((float *) dst->op_params)[6]; - const float ext_factor = ((float *) dst->op_params)[7]; - const float attn_factor = ((float *) dst->op_params)[8]; const float beta_fast = ((float *) dst->op_params)[9]; const float beta_slow = ((float *) dst->op_params)[10]; int sections[4] {}; @@ -9768,16 +9916,9 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, cons memcpy(sections, (int32_t *) dst->op_params + 11, sizeof(int)*4); } - const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE; - float corr_dims[2]; ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); - const float theta_scale = powf(freq_base, -2.0f/n_dims); - - uint32_t s1 = src0->nb[1] / ggml_type_size(src0->type); - uint32_t s2 = src0->nb[2] / ggml_type_size(src0->type); - uint32_t set_rows_stride = 0; // Fused rope + view + set_rows passes the set_rows destination stride in set_rows_stride // and overrides the dst and sets src3=row_indices @@ -9787,12 +9928,8 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, cons dst = cgraph->nodes[node_idx + 2]; } - ggml_vk_op_f32(ctx, subctx, src0, src1, src2, src3, dst, GGML_OP_ROPE, { - (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], - freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale, - src2 != nullptr, (uint32_t)src0->ne[2], s1, s2, - { sections[0], sections[1], sections[2], sections[3] }, is_imrope, backprop, set_rows_stride, - }); + ggml_vk_op_f32(ctx, subctx, 
src0, src1, src2, src3, dst, GGML_OP_ROPE,
+        ggml_vk_make_rope_constants(cgraph->nodes[node_idx], src0, src2 != nullptr, backprop, set_rows_stride));
 }
 
 static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -11307,6 +11444,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
         if (n->op == GGML_OP_GLU) {
             std::cerr << " " << ggml_glu_op_name(ggml_get_glu_op(n)) << " " << (n->src[1] ? "split" : "single") << " ";
         }
+        if (n->op == GGML_OP_ROPE) {
+            const int mode = ((const int32_t *) n->op_params)[2];
+            std::cerr << " rope mode: " << mode;
+        }
         std::cerr << std::endl;
     }
 #endif
@@ -11414,14 +11555,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
             break;
         case GGML_OP_RMS_NORM:
-            if (ctx->num_additional_fused_ops > 0) {
-                // fused rms_norm + mul
-                ggml_tensor *mul = cgraph->nodes[node_idx + 1];
-                ggml_tensor *other_src = mul->src[0] == node ? mul->src[1] : mul->src[0];
-                ggml_vk_rms_norm(ctx, compute_ctx, src0, other_src, mul, (float *)node->op_params);
-            } else {
-                ggml_vk_rms_norm(ctx, compute_ctx, src0, src0, node, (float *)node->op_params);
-            }
+            ggml_vk_rms_norm(ctx, compute_ctx, cgraph, node_idx, (float *)node->op_params);
             break;
         case GGML_OP_RMS_NORM_BACK:
             ggml_vk_rms_norm_back(ctx, compute_ctx, src0, src1, node);
@@ -12407,6 +12541,70 @@ static bool ggml_vk_can_fuse_rope_set_rows(ggml_backend_vk_context * ctx, const
     return true;
 }
 
+// Check whether the tensors overlap in memory but are not equal.
+// Fusions can potentially overwrite src tensors in ways that are not prevented
+// by ggml-alloc. If the fusion is entirely elementwise, then it's OK for them
+// to overlap if they are exactly equal.
+// XXX TODO this check is probably missing from several fusion optimizations.
+static bool ggml_vk_tensors_overlap_but_not_equal(const ggml_tensor * a, const ggml_tensor * b) { + ggml_backend_vk_buffer_context * a_buf_ctx = (ggml_backend_vk_buffer_context *)a->buffer->context; + vk_buffer a_buf = a_buf_ctx->dev_buffer; + ggml_backend_vk_buffer_context * b_buf_ctx = (ggml_backend_vk_buffer_context *)b->buffer->context; + vk_buffer b_buf = b_buf_ctx->dev_buffer; + if (a_buf == b_buf) { + auto a_base = vk_tensor_offset(a) + a->view_offs; + auto a_size = ggml_nbytes(a); + auto b_base = vk_tensor_offset(b) + b->view_offs; + auto b_size = ggml_nbytes(b); + + if (a_base == b_base && a_size == b_size) { + return false; + } + + if ((b_base <= a_base && a_base < b_base + b_size) || + (a_base <= b_base && b_base < a_base + a_size)) { + return true; + } + } + return false; +} + +static bool ggml_vk_can_fuse_rms_norm_mul_rope(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, + int node_idx) { + GGML_UNUSED(ctx); + const ggml_tensor *rms = cgraph->nodes[node_idx + 0]; + const ggml_tensor *mul = cgraph->nodes[node_idx + 1]; + const ggml_tensor *rope = cgraph->nodes[node_idx + 2]; + + const int mode = ((const int32_t *) rope->op_params)[2]; + + // noncontig tensors aren't tested, and don't seem common in practice + if (!ggml_is_contiguous(rms) || + !ggml_is_contiguous(mul) || + !ggml_is_contiguous(rope)) { + return false; + } + + // only norm/neox are handled in the shader + if (mode != GGML_ROPE_TYPE_NEOX && mode != GGML_ROPE_TYPE_NORMAL) { + return false; + } + + // shared memory size for passing data from mul->rope + if (mul->ne[0] > 1024) { + return false; + } + + // must not overwrite srcs in a way that's not elementwise + ggml_tensor *other_src = mul->src[0] == rms ? mul->src[1] : mul->src[0]; + if (ggml_vk_tensors_overlap_but_not_equal(rms->src[0], rope) || + ggml_vk_tensors_overlap_but_not_equal(other_src, rope)) { + return false; + } + + return true; +} + static uint32_t ggml_vk_fuse_multi_add(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx) { const ggml_tensor *first_node = cgraph->nodes[node_idx]; @@ -12552,12 +12750,20 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg uint32_t num_adds = ggml_vk_fuse_multi_add(ctx, cgraph, i); if (num_adds) { ctx->num_additional_fused_ops = num_adds - 1; - } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { - ctx->num_additional_fused_ops = 1; } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT, GGML_OP_ADD })) { ctx->num_additional_fused_ops = 1; } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID })) { ctx->num_additional_fused_ops = 1; + } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 4 }) && + ggml_check_edges(cgraph, i, rms_norm_mul_rope_view_set_rows_edges) && + ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i) && + ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i + 2)) { + ctx->num_additional_fused_ops = 4; + } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ROPE })&& + ggml_vk_can_fuse_rms_norm_mul_rope(ctx, cgraph, i)) { + ctx->num_additional_fused_ops = 2; + } else if (ggml_vk_can_fuse(ctx, cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + ctx->num_additional_fused_ops = 1; } else if (ggml_can_fuse_subgraph(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, { i + 2 }) && ggml_check_edges(cgraph, i, rope_view_set_rows_edges) && 
ggml_vk_can_fuse_rope_set_rows(ctx, cgraph, i)) { @@ -12790,14 +12996,34 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * } if (ok) { current_set.push_back(j); + + int rope_idx = j; + + // When we've found RMS_NORM + MUL, try to find a ROPE that uses it + if (j > 0 && + graph->nodes[j]->op == GGML_OP_MUL && + graph->nodes[j-1]->op == GGML_OP_RMS_NORM) { + for (int k = j + 1; k < std::min(j + 15, graph->n_nodes); ++k) { + if (graph->nodes[k]->op == GGML_OP_ROPE && + graph->nodes[k]->src[0] == graph->nodes[j] && + // Check that other srcs are already valid + graph->nodes[k]->src[1]->op == GGML_OP_NONE && + (graph->nodes[k]->src[2] == nullptr || graph->nodes[k]->src[2]->op == GGML_OP_NONE)) { + rope_idx = k; + current_set.push_back(rope_idx); + used[rope_idx] = true; + break; + } + } + } // Look for ROPE + VIEW + SET_ROWS and make them consecutive - if (graph->nodes[j]->op == GGML_OP_ROPE) { + if (graph->nodes[rope_idx]->op == GGML_OP_ROPE) { int view_idx = -1; int set_rows_idx = -1; - for (int k = j+1; k < std::min(j + 10, graph->n_nodes); ++k) { + for (int k = rope_idx+1; k < std::min(rope_idx + 10, graph->n_nodes); ++k) { if (view_idx == -1 && graph->nodes[k]->op == GGML_OP_VIEW && - graph->nodes[k]->src[0] == graph->nodes[j]) { + graph->nodes[k]->src[0] == graph->nodes[rope_idx]) { view_idx = k; continue; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl index 99595fc68..c1ad51725 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/generic_binary_head.glsl @@ -3,6 +3,9 @@ #include "rte.glsl" #include "utils.glsl" +#if RMS_NORM_ROPE_FUSION +#include "rope_params.glsl" +#endif layout (push_constant) uniform parameter { @@ -12,11 +15,16 @@ layout (push_constant) uniform parameter uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23; uint misalign_offsets; float param1; float param2; int param3; +#if RMS_NORM_ROPE_FUSION + rope_params rope; +#endif } p; +#if !RMS_NORM_ROPE_FUSION layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; +#endif // true if src0/src1 are the same shape and the indices can be reused without additional modulus layout(constant_id = 0) const bool norepeat = false; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp index d5b211ffa..3a47949d5 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp @@ -3,6 +3,32 @@ #include "generic_binary_head.glsl" #include "types.glsl" +#if RMS_NORM_ROPE_FUSION + +layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; +layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; + +// data is passed from rms_norm -> rope through shared memory. +// rms_norm calls this data_d, rope calls this rope_data_a. 
+// Binding 2 is not used +shared FLOAT_TYPE rope_data_a[1024]; +#define data_d rope_data_a + +layout (binding = 3) readonly buffer R_Y {int rope_data_pos[];}; +layout (binding = 4) readonly buffer R_Z {float rope_data_ff[];}; +layout (binding = 5) writeonly buffer R_D {ROPE_D_TYPE rope_data_d[];}; +layout (binding = 6) readonly buffer R_I {uvec2 rope_data_i[];}; // indices for set_rows + +#include "rope_params.glsl" +#include "rope_funcs.glsl" + +#define GGML_ROPE_TYPE_NORMAL 0 +#define GGML_ROPE_TYPE_NEOX 2 +#define GGML_ROPE_TYPE_MROPE 8 +#define GGML_ROPE_TYPE_VISION 24 + +#endif + #extension GL_EXT_control_flow_attributes : enable #define BLOCK_SIZE 512 @@ -28,8 +54,12 @@ void rms_norm(uint num_iters) { uint32_t a_offset = samp*stride_sample + channel*stride_channel + row*stride_row + get_aoffset(); uint32_t b_offset = src1_idx(0, row, channel, samp) + get_boffset(); +#if RMS_NORM_ROPE_FUSION + // Per-row offset in shared memory + uint32_t d_offset = 0; +#else uint32_t d_offset = ((samp*nchannels + channel)*nrows + row)*ncols + get_doffset(); - +#endif FLOAT_TYPE sum = FLOAT_TYPE(0.0f); // partial sum for thread in warp [[unroll]] for (uint col = tid, idx = 0; idx < num_iters; col += BLOCK_SIZE, ++idx) { @@ -79,6 +109,18 @@ void rms_norm(uint num_iters) { data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col])); } } +#if RMS_NORM_ROPE_FUSION + barrier(); + rope_params rp = p.rope; + uint rope_row = (samp*nchannels + channel)*nrows + row; + for (uint t = 2*tid; t < ncols; t += 2*BLOCK_SIZE) { + if (rp.rope_mode == GGML_ROPE_TYPE_NEOX) { + rope_neox(t, rope_row, rp); + } else if (rp.rope_mode == GGML_ROPE_TYPE_NORMAL) { + rope_norm(t, rope_row, rp); + } + } +#endif } void main() { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl new file mode 100644 index 000000000..9726b722d --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl @@ -0,0 +1,227 @@ + +float rope_yarn_ramp(const float low, const float high, const uint i0) { + const float y = (i0 / 2 - low) / max(0.001f, high - low); + return 1.0f - min(1.0f, max(0.0f, y)); +} + +uint rope_a_coord(const uint i0, const uint i01, const uint i02, rope_params p) { +#if RMS_NORM_ROPE_FUSION + // Per-row offset in shared memory + const uint ix = i0; +#else + const uint ix = i02*p.nb02 + i01*p.nb01 + i0; +#endif + return ix; +} + +void rope_yarn(const float theta_extrap, const uint i0, out float cos_theta, out float sin_theta, rope_params p) { + float mscale = p.attn_factor; + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = p.freq_scale * theta_extrap; + float theta = theta_interp; + if (p.ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(p.corr_dims[0], p.corr_dims[1], i0) * p.ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * log(1.0f / p.freq_scale); + } + // Backprogagation uses inverted rotation + if (p.is_back != 0) { + theta = -theta; + } + cos_theta = cos(theta) * mscale; + sin_theta = sin(theta) * mscale; +} + +void rope_norm(const uint i0, const uint i1, rope_params p) { + uint ne0 = p.ncols; + uint ne1 = p.p_delta_rows; + + if (i0 >= ne0) { + return; + } + + // i1 is actually i2*nb2+i1, but the rows are contiguous + const uint i01 = i1 % ne1; + const uint i02 = i1 / ne1; + + uint idst = i1*ne0 + i0; + const uint ix = rope_a_coord(i0, i01, i02, p); + + // Fusion optimization: ROPE 
+ VIEW + SET_ROWS.. + // The rope output is viewed as a 1D tensor and offset based on a row index in data_i. + if (p.set_rows_stride != 0) { + idst = i01*ne0 + i0; + idst += rope_data_i[i02].x * p.set_rows_stride; + } + + if (i0 >= p.n_dims) { + rope_data_d[idst + 0] = ROPE_D_TYPE(rope_data_a[ix + 0]); + rope_data_d[idst + 1] = ROPE_D_TYPE(rope_data_a[ix + 1]); + + return; + } + + const float theta_base = rope_data_pos[i02] * pow(p.theta_scale, i0/2.0f); + + const float freq_factor = p.has_ff != 0 ? rope_data_ff[i0/2] : 1.0f; + + float cos_theta, sin_theta; + rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta, p); + + const float x0 = float(rope_data_a[ix + 0]); + const float x1 = float(rope_data_a[ix + 1]); + + rope_data_d[idst + 0] = ROPE_D_TYPE(x0*cos_theta - x1*sin_theta); + rope_data_d[idst + 1] = ROPE_D_TYPE(x0*sin_theta + x1*cos_theta); +} + +void rope_neox(const uint i0, const uint i1, rope_params p) { + uint ne0 = p.ncols; + uint ne1 = p.p_delta_rows; + + if (i0 >= ne0) { + return; + } + + const uint i01 = i1 % ne1; + const uint i02 = i1 / ne1; + + uint idst = i1*ne0 + i0/2; + const uint ix = rope_a_coord(i0/2, i01, i02, p); + + // Fusion optimization: ROPE + VIEW + SET_ROWS.. + // The rope output is viewed as a 1D tensor and offset based on a row index in rope_data_i. + if (p.set_rows_stride != 0) { + idst = i01*ne0 + i0/2; + idst += rope_data_i[i02].x * p.set_rows_stride; + } + + if (i0 >= p.n_dims) { + rope_data_d[idst + i0/2 + 0] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 0]); + rope_data_d[idst + i0/2 + 1] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 1]); + + return; + } + + const float theta_base = rope_data_pos[i02] * pow(p.theta_scale, i0/2.0f); + + const float freq_factor = p.has_ff != 0 ? rope_data_ff[i0/2] : 1.0f; + + float cos_theta, sin_theta; + rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta, p); + + const float x0 = float(rope_data_a[ix + 0]); + const float x1 = float(rope_data_a[ix + p.n_dims/2]); + + rope_data_d[idst + 0] = ROPE_D_TYPE(x0*cos_theta - x1*sin_theta); + rope_data_d[idst + p.n_dims/2] = ROPE_D_TYPE(x0*sin_theta + x1*cos_theta); +} + + +void rope_multi(const uint i0, const uint i1, rope_params p) { + uint ne0 = p.ncols; + uint ne1 = p.p_delta_rows; + uint ne2 = p.ne02; + + if (i0 >= ne0) { + return; + } + + const uint i01 = i1 % ne1; + const uint i02 = i1 / ne1; + + const uint idst = i1*ne0 + i0/2; + const uint ix = rope_a_coord(i0/2, i01, i02, p); + + if (i0 >= p.n_dims) { + rope_data_d[idst + i0/2 + 0] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 0]); + rope_data_d[idst + i0/2 + 1] = ROPE_D_TYPE(rope_data_a[ix + i0/2 + 1]); + + return; + } + + const int sect_dims = p.sections[0] + p.sections[1] + p.sections[2] + p.sections[3]; + const int sec_w = p.sections[1] + p.sections[0]; + const uint sector = (i0 / 2) % sect_dims; + + float theta_base = 0.0; + if (p.is_imrope != 0) { + if (sector % 3 == 1 && sector < 3 * p.sections[1]) { + theta_base = rope_data_pos[i02 + ne2 * 1]*pow(p.theta_scale, i0/2.0f); + } else if (sector % 3 == 2 && sector < 3 * p.sections[2]) { + theta_base = rope_data_pos[i02 + ne2 * 2]*pow(p.theta_scale, i0/2.0f); + } else if (sector % 3 == 0 && sector < 3 * p.sections[0]) { + theta_base = rope_data_pos[i02]*pow(p.theta_scale, i0/2.0f); + } else { + theta_base = rope_data_pos[i02 + ne2 * 3]*pow(p.theta_scale, i0/2.0f); + } + } else { + if (sector < p.sections[0]) { + theta_base = rope_data_pos[i02]*pow(p.theta_scale, i0/2.0f); + } + else if (sector >= p.sections[0] && sector < sec_w) { + theta_base = rope_data_pos[i02 + ne2 * 
1]*pow(p.theta_scale, i0/2.0f); + } + else if (sector >= sec_w && sector < sec_w + p.sections[2]) { + theta_base = rope_data_pos[i02 + ne2 * 2]*pow(p.theta_scale, i0/2.0f); + } + else if (sector >= sec_w + p.sections[2]) { + theta_base = rope_data_pos[i02 + ne2 * 3]*pow(p.theta_scale, i0/2.0f); + } + } + + const float freq_factor = p.has_ff != 0 ? rope_data_ff[i0/2] : 1.0f; + + float cos_theta, sin_theta; + rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta, p); + + const float x0 = float(rope_data_a[ix + 0]); + const float x1 = float(rope_data_a[ix + p.n_dims/2]); + + rope_data_d[idst + 0] = ROPE_D_TYPE(x0*cos_theta - x1*sin_theta); + rope_data_d[idst + p.n_dims/2] = ROPE_D_TYPE(x0*sin_theta + x1*cos_theta); +} + +void rope_vision(const uint i0, const uint i1, rope_params p) { + uint ne0 = p.ncols; + uint ne1 = p.p_delta_rows; + uint ne2 = p.ne02; + + if (i0 >= ne0) { + return; + } + + const uint i01 = i1 % ne1; + const uint i02 = i1 / ne1; + + const uint idst = i1*ne0 + i0/2; + const uint ix = rope_a_coord(i0/2, i01, i02, p); + + const int sect_dims = p.sections[0] + p.sections[1]; + const int sec_w = p.sections[1] + p.sections[0]; + const uint sector = (i0 / 2) % sect_dims; + + float theta_base = 0.0; + if (sector < p.sections[0]) { + const uint p0 = sector; + theta_base = rope_data_pos[i02]*pow(p.theta_scale, p0); + } + else if (sector >= p.sections[0] && sector < sec_w) { + const uint p0 = sector - p.sections[0]; + theta_base = rope_data_pos[i02 + ne2]*pow(p.theta_scale, p0); + } + + const float freq_factor = p.has_ff != 0 ? rope_data_ff[i0/2] : 1.0f; + + float cos_theta, sin_theta; + rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta, p); + + const float x0 = float(rope_data_a[ix + 0]); + const float x1 = float(rope_data_a[ix + p.n_dims]); + + rope_data_d[idst + 0] = ROPE_D_TYPE(x0*cos_theta - x1*sin_theta); + rope_data_d[idst + p.n_dims] = ROPE_D_TYPE(x0*sin_theta + x1*cos_theta); +} + diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl index fa2bb3339..d9b4d4c03 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl @@ -3,56 +3,18 @@ #extension GL_EXT_shader_16bit_storage : require #include "rte.glsl" +#include "rope_params.glsl" layout(local_size_x = 1, local_size_y = 256, local_size_z = 1) in; -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) readonly buffer Y {int data_pos[];}; -layout (binding = 2) readonly buffer Z {float data_ff[];}; -layout (binding = 3) writeonly buffer D {D_TYPE data_d[];}; -layout (binding = 4) readonly buffer I {uvec2 data_i[];}; // indices for set_rows +layout (binding = 0) readonly buffer X {A_TYPE rope_data_a[];}; +layout (binding = 1) readonly buffer Y {int rope_data_pos[];}; +layout (binding = 2) readonly buffer Z {float rope_data_ff[];}; +layout (binding = 3) writeonly buffer D {ROPE_D_TYPE rope_data_d[];}; +layout (binding = 4) readonly buffer I {uvec2 rope_data_i[];}; // indices for set_rows + layout (push_constant) uniform parameter { - uint ncols; - uint n_dims; - float freq_scale; - uint p_delta_rows; - float freq_base; - float ext_factor; - float attn_factor; - float corr_dims[2]; - float theta_scale; - uint has_ff; - uint ne02; - uint s1; - uint s2; - int sections[4]; - uint is_imrope; - uint is_back; - uint set_rows_stride; -} p; + rope_params pc; +}; -float rope_yarn_ramp(const float low, const float high, const uint i0) { - const float y = (i0 / 2 - low) / 
max(0.001f, high - low); - return 1.0f - min(1.0f, max(0.0f, y)); -} - -void rope_yarn(const float theta_extrap, const uint i0, out float cos_theta, out float sin_theta) { - float mscale = p.attn_factor; - // Get n-d rotational scaling corrected for extrapolation - float theta_interp = p.freq_scale * theta_extrap; - float theta = theta_interp; - if (p.ext_factor != 0.0f) { - float ramp_mix = rope_yarn_ramp(p.corr_dims[0], p.corr_dims[1], i0) * p.ext_factor; - theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; - - // Get n-d magnitude scaling corrected for interpolation - mscale *= 1.0f + 0.1f * log(1.0f / p.freq_scale); - } - // Backprogagation uses inverted rotation - if (p.is_back != 0) { - theta = -theta; - } - cos_theta = cos(theta) * mscale; - sin_theta = sin(theta) * mscale; -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp index 54aabcf22..7c1fb1cd2 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp @@ -1,70 +1,11 @@ #version 450 #include "rope_head.glsl" +#include "rope_funcs.glsl" void main() { const uint i0 = 2*gl_GlobalInvocationID.y; - uint ne0 = p.ncols; - uint ne1 = p.p_delta_rows; - uint ne2 = p.ne02; - - if (i0 >= ne0) { - return; - } - - const uint row_dst = gl_GlobalInvocationID.x; - - const uint row_x = row_dst % ne1; - const uint channel_x = row_dst / ne1; - - const uint idst = row_dst*ne0 + i0/2; - const uint ix = channel_x*p.s2 + row_x*p.s1 + i0/2; - - if (i0 >= p.n_dims) { - data_d[idst + i0/2 + 0] = data_a[ix + i0/2 + 0]; - data_d[idst + i0/2 + 1] = data_a[ix + i0/2 + 1]; - - return; - } - - const int sect_dims = p.sections[0] + p.sections[1] + p.sections[2] + p.sections[3]; - const int sec_w = p.sections[1] + p.sections[0]; - const uint sector = (i0 / 2) % sect_dims; - - float theta_base = 0.0; - if (p.is_imrope != 0) { - if (sector % 3 == 1 && sector < 3 * p.sections[1]) { - theta_base = data_pos[channel_x + ne2 * 1]*pow(p.theta_scale, i0/2.0f); - } else if (sector % 3 == 2 && sector < 3 * p.sections[2]) { - theta_base = data_pos[channel_x + ne2 * 2]*pow(p.theta_scale, i0/2.0f); - } else if (sector % 3 == 0 && sector < 3 * p.sections[0]) { - theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f); - } else { - theta_base = data_pos[channel_x + ne2 * 3]*pow(p.theta_scale, i0/2.0f); - } - } else { - if (sector < p.sections[0]) { - theta_base = data_pos[channel_x]*pow(p.theta_scale, i0/2.0f); - } - else if (sector >= p.sections[0] && sector < sec_w) { - theta_base = data_pos[channel_x + ne2 * 1]*pow(p.theta_scale, i0/2.0f); - } - else if (sector >= sec_w && sector < sec_w + p.sections[2]) { - theta_base = data_pos[channel_x + ne2 * 2]*pow(p.theta_scale, i0/2.0f); - } - else if (sector >= sec_w + p.sections[2]) { - theta_base = data_pos[channel_x + ne2 * 3]*pow(p.theta_scale, i0/2.0f); - } - } - - const float freq_factor = p.has_ff != 0 ? 
data_ff[i0/2] : 1.0f; - - float cos_theta, sin_theta; - rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta); - - const float x0 = float(data_a[ix + 0]); - const float x1 = float(data_a[ix + p.n_dims/2]); - - data_d[idst + 0] = D_TYPE(x0*cos_theta - x1*sin_theta); - data_d[idst + p.n_dims/2] = D_TYPE(x0*sin_theta + x1*cos_theta); + // i1 is actually i2*nb2+i1, but the rows are contiguous + const uint i1 = gl_GlobalInvocationID.x; + rope_multi(i0, i1, pc); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp index 9f4538155..68f00c180 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp @@ -1,48 +1,11 @@ #version 450 #include "rope_head.glsl" +#include "rope_funcs.glsl" void main() { const uint i0 = 2*gl_GlobalInvocationID.y; - uint ne0 = p.ncols; - uint ne1 = p.p_delta_rows; - - if (i0 >= ne0) { - return; - } - - const uint row_dst = gl_GlobalInvocationID.x; - - const uint row_x = row_dst % ne1; - const uint channel_x = row_dst / ne1; - - uint idst = row_dst*ne0 + i0/2; - const uint ix = channel_x*p.s2 + row_x*p.s1 + i0/2; - - // Fusion optimization: ROPE + VIEW + SET_ROWS.. - // The rope output is viewed as a 1D tensor and offset based on a row index in data_i. - if (p.set_rows_stride != 0) { - idst = row_x*ne0 + i0/2; - idst += data_i[channel_x].x * p.set_rows_stride; - } - - if (i0 >= p.n_dims) { - data_d[idst + i0/2 + 0] = D_TYPE(data_a[ix + i0/2 + 0]); - data_d[idst + i0/2 + 1] = D_TYPE(data_a[ix + i0/2 + 1]); - - return; - } - - const float theta_base = data_pos[channel_x] * pow(p.theta_scale, i0/2.0f); - - const float freq_factor = p.has_ff != 0 ? data_ff[i0/2] : 1.0f; - - float cos_theta, sin_theta; - rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta); - - const float x0 = float(data_a[ix + 0]); - const float x1 = float(data_a[ix + p.n_dims/2]); - - data_d[idst + 0] = D_TYPE(x0*cos_theta - x1*sin_theta); - data_d[idst + p.n_dims/2] = D_TYPE(x0*sin_theta + x1*cos_theta); + // i1 is actually i2*nb2+i1, but the rows are contiguous + const uint i1 = gl_GlobalInvocationID.x; + rope_neox(i0, i1, pc); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp index f4209ed95..28a939ec6 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp @@ -1,48 +1,11 @@ #version 450 #include "rope_head.glsl" +#include "rope_funcs.glsl" void main() { const uint i0 = 2*gl_GlobalInvocationID.y; - uint ne0 = p.ncols; - uint ne1 = p.p_delta_rows; - - if (i0 >= ne0) { - return; - } - - const uint row_dst = gl_GlobalInvocationID.x; - - const uint row_x = row_dst % ne1; - const uint channel_x = row_dst / ne1; - - uint idst = row_dst*ne0 + i0; - const uint ix = channel_x*p.s2 + row_x*p.s1 + i0; - - // Fusion optimization: ROPE + VIEW + SET_ROWS.. - // The rope output is viewed as a 1D tensor and offset based on a row index in data_i. - if (p.set_rows_stride != 0) { - idst = row_x*ne0 + i0; - idst += data_i[channel_x].x * p.set_rows_stride; - } - - if (i0 >= p.n_dims) { - data_d[idst + 0] = D_TYPE(data_a[ix + 0]); - data_d[idst + 1] = D_TYPE(data_a[ix + 1]); - - return; - } - - const float theta_base = data_pos[channel_x] * pow(p.theta_scale, i0/2.0f); - - const float freq_factor = p.has_ff != 0 ? 
data_ff[i0/2] : 1.0f; - - float cos_theta, sin_theta; - rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta); - - const float x0 = float(data_a[ix + 0]); - const float x1 = float(data_a[ix + 1]); - - data_d[idst + 0] = D_TYPE(x0*cos_theta - x1*sin_theta); - data_d[idst + 1] = D_TYPE(x0*sin_theta + x1*cos_theta); + // i1 is actually i2*nb2+i1, but the rows are contiguous + const uint i1 = gl_GlobalInvocationID.x; + rope_norm(i0, i1, pc); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl new file mode 100644 index 000000000..82f39cee3 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl @@ -0,0 +1,27 @@ +#if !defined(GGML_ROPE_PARAMS) +#define GGML_ROPE_PARAMS + +#include "rte.glsl" + +struct rope_params { + uint rope_mode; + uint ncols; + uint n_dims; + float freq_scale; + uint p_delta_rows; + float freq_base; + float ext_factor; + float attn_factor; + float corr_dims[2]; + float theta_scale; + uint has_ff; + uint ne02; + uint nb01; + uint nb02; + int sections[4]; + uint is_imrope; + uint is_back; + uint set_rows_stride; +}; + +#endif // !defined(GGML_ROPE_PARAMS) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp index d37d1c104..ea1e0fdb4 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp @@ -1,47 +1,11 @@ #version 450 #include "rope_head.glsl" +#include "rope_funcs.glsl" void main() { const uint i0 = 2*gl_GlobalInvocationID.y; - uint ne0 = p.ncols; - uint ne1 = p.p_delta_rows; - uint ne2 = p.ne02; - - if (i0 >= ne0) { - return; - } - - const uint row_dst = gl_GlobalInvocationID.x; - - const uint row_x = row_dst % ne1; - const uint channel_x = row_dst / ne1; - - const uint idst = row_dst*ne0 + i0/2; - const uint ix = channel_x*p.s2 + row_x*p.s1 + i0/2; - - const int sect_dims = p.sections[0] + p.sections[1]; - const int sec_w = p.sections[1] + p.sections[0]; - const uint sector = (i0 / 2) % sect_dims; - - float theta_base = 0.0; - if (sector < p.sections[0]) { - const uint p0 = sector; - theta_base = data_pos[channel_x]*pow(p.theta_scale, p0); - } - else if (sector >= p.sections[0] && sector < sec_w) { - const uint p0 = sector - p.sections[0]; - theta_base = data_pos[channel_x + ne2]*pow(p.theta_scale, p0); - } - - const float freq_factor = p.has_ff != 0 ? 
data_ff[i0/2] : 1.0f; - - float cos_theta, sin_theta; - rope_yarn(theta_base / freq_factor, i0, cos_theta, sin_theta); - - const float x0 = float(data_a[ix + 0]); - const float x1 = float(data_a[ix + p.n_dims]); - - data_d[idst + 0] = D_TYPE(x0*cos_theta - x1*sin_theta); - data_d[idst + p.n_dims] = D_TYPE(x0*sin_theta + x1*cos_theta); + // i1 is actually i2*nb2+i1, but the rows are contiguous + const uint i1 = gl_GlobalInvocationID.x; + rope_vision(i0, i1, pc); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index bd178875d..c2e42cf00 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -695,6 +695,8 @@ void process_shaders() { string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("rms_norm_partials_f32", "rms_norm_partials.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("rms_norm_mul_rope_f32_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"ROPE_D_TYPE", "float"}, {"RMS_NORM_ROPE_FUSION", "1"}})); + string_to_spv("rms_norm_mul_rope_f32_f16_rte", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"ROPE_D_TYPE", "float16_t"}, {"RMS_NORM_ROPE_FUSION", "1"}, {"RTE16", "1"}})); string_to_spv("rms_norm_back_f32", "rms_norm_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("l2_norm_f32", "l2_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); @@ -840,25 +842,25 @@ void process_shaders() { string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}})); string_to_spv("soft_max_back_f32", "soft_max_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); - string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("rope_norm_f16_rte", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}}); - string_to_spv("rope_norm_f32_f16", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}); - string_to_spv("rope_norm_f32_f16_rte", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}}); + string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}}); + string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}}); + string_to_spv("rope_norm_f16_rte", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}}); + string_to_spv("rope_norm_f32_f16", "rope_norm.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float16_t"}}); + string_to_spv("rope_norm_f32_f16_rte", "rope_norm.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}}); - string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", 
"float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("rope_neox_f16_rte", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}}); - string_to_spv("rope_neox_f32_f16", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}); - string_to_spv("rope_neox_f32_f16_rte", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}}); + string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}}); + string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}}); + string_to_spv("rope_neox_f16_rte", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}}); + string_to_spv("rope_neox_f32_f16", "rope_neox.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float16_t"}}); + string_to_spv("rope_neox_f32_f16_rte", "rope_neox.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}}); - string_to_spv("rope_multi_f32", "rope_multi.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("rope_multi_f16", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("rope_multi_f16_rte", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}}); + string_to_spv("rope_multi_f32", "rope_multi.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}}); + string_to_spv("rope_multi_f16", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}}); + string_to_spv("rope_multi_f16_rte", "rope_multi.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}}); - string_to_spv("rope_vision_f32", "rope_vision.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); - string_to_spv("rope_vision_f16", "rope_vision.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); - string_to_spv("rope_vision_f16_rte", "rope_vision.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"RTE16", "1"}}); + string_to_spv("rope_vision_f32", "rope_vision.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}}); + string_to_spv("rope_vision_f16", "rope_vision.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}}); + string_to_spv("rope_vision_f16_rte", "rope_vision.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}}); string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}}); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index b9ae82eed..80216f6f3 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2294,6 +2294,79 @@ struct test_rope_set_rows : public test_case { } }; +// GGML_OP_RMS_NORM + GGML_OP_MUL + GGML_OP_ROPE (+ GGML_OP_VIEW + GGML_OP_SET_ROWS) +struct test_rms_norm_mul_rope : public test_case { + const std::array ne; + const float eps; + const bool multi_add; // test a sequence of adds feeding into rms_norm + const bool set_rows; + int mode; + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return "RMS_NORM_MUL_ROPE"; + } + + bool run_whole_graph() override { return true; } + + std::string vars() override { + return VARS_TO_STR5(ne, eps, multi_add, set_rows, mode); + } + + test_rms_norm_mul_rope(std::array ne, float eps = 1e-6f, bool multi_add = false, + bool set_rows = false, int mode = GGML_ROPE_TYPE_NORMAL) + : ne(ne), eps(eps), multi_add(multi_add), set_rows(set_rows), mode(mode) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, ne[0], ne[1], ne[2], 1); 
+        ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, ne[0], ne[1], ne[2], 1);
+        ggml_tensor * c = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, ne[0], ne[1], ne[2], 1);
+
+        if (multi_add) {
+            a = ggml_add(ctx, ggml_add(ctx, a, b), c);
+        }
+
+        a = ggml_mul(ctx, ggml_rms_norm(ctx, a, eps), b);
+
+        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
+
+        ggml_tensor * rope = ggml_rope(ctx, a, pos, ne[0], mode);
+
+        ggml_tensor * out;
+
+        if (set_rows) {
+            ggml_tensor * view = ggml_view_2d(ctx, rope, ne[0] * ne[1], ne[2], rope->nb[2], 0);
+
+            ggml_tensor * dst = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, ne[0] * ne[1], ne[2] * ne[3], 1, 1);
+            ggml_set_name(dst, "dst");
+
+            ggml_tensor * row_idxs = ggml_new_tensor_3d(ctx, GGML_TYPE_I64, ne[2], 1, 1);
+            ggml_set_name(row_idxs, "row_idxs");
+
+            out = ggml_set_rows(ctx, dst, view, row_idxs);
+            ggml_set_name(out, "out");
+        } else {
+            out = rope;
+        }
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            if (t->type == GGML_TYPE_I64 || t->type == GGML_TYPE_I32) {
+                if (ggml_is_view_op(t->op)) {
+                    continue;
+                }
+
+                init_set_rows_row_ids(t, ne[2]);
+            } else {
+                init_tensor_uniform(t);
+            }
+        }
+    }
+};
+
 // GGML_OP_ARGMAX
 struct test_argmax : public test_case {
     const ggml_type type;
@@ -6751,6 +6824,22 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         }
     }

+    for (auto multi_add : {false, true}) {
+        for (auto set_rows : {false, true}) {
+            for (auto rope : {GGML_ROPE_TYPE_NORMAL, GGML_ROPE_TYPE_NEOX}) {
+                test_cases.emplace_back(new test_rms_norm_mul_rope({768, 1, 1, 1}, 1e-6f, multi_add, set_rows, rope));
+                test_cases.emplace_back(new test_rms_norm_mul_rope({768, 3, 1, 1}, 1e-6f, multi_add, set_rows, rope));
+                test_cases.emplace_back(new test_rms_norm_mul_rope({768, 3, 5, 1}, 1e-6f, multi_add, set_rows, rope));
+                test_cases.emplace_back(new test_rms_norm_mul_rope({128, 32, 2, 1}, 1e-6f, multi_add, set_rows, rope));
+                test_cases.emplace_back(new test_rms_norm_mul_rope({128, 4, 2, 1}, 1e-6f, multi_add, set_rows, rope));
+                test_cases.emplace_back(new test_rms_norm_mul_rope({128, 32, 50, 1}, 1e-6f, multi_add, set_rows, rope));
+                test_cases.emplace_back(new test_rms_norm_mul_rope({128, 4, 50, 1}, 1e-6f, multi_add, set_rows, rope));
+                test_cases.emplace_back(new test_rms_norm_mul_rope({8192, 2, 2, 1}, 1e-6f, multi_add, set_rows, rope));
+                test_cases.emplace_back(new test_rms_norm_mul_rope({8192, 2, 2, 1}, 1e-6f, multi_add, set_rows, rope));
+            }
+        }
+    }
+
     test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, {64, 5, 4, 3}, 1e-12f));

     for (int64_t d_conv : {3, 4}) {

From 08416ebe7f7c0f344ada15489730d69eda096304 Mon Sep 17 00:00:00 2001
From: Aleksei Nikiforov <103434461+AlekseiNikiforovIBM@users.noreply.github.com>
Date: Sat, 8 Nov 2025 09:00:20 +0100
Subject: [PATCH 28/32] ggml: disable vxe for cross-compilation by default (#16966)

Otherwise compilation will fail because -mvx and -mzvector are enabled
without setting the corresponding -march options.
--- ggml/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 181f179ed..869796f0e 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -168,7 +168,7 @@ option(GGML_RV_ZFH "ggml: enable riscv zfh" ON) option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON) option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON) option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF) -option(GGML_VXE "ggml: enable vxe" ON) +option(GGML_VXE "ggml: enable vxe" ${GGML_NATIVE}) option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF) set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM") From b8a5cfd11abc066f165e924136d0fa1db44d2b95 Mon Sep 17 00:00:00 2001 From: SavicStefan <50296686+SavicStefan@users.noreply.github.com> Date: Sat, 8 Nov 2025 09:28:22 +0100 Subject: [PATCH 29/32] vulkan: Increase BK to 32; use BK/4 for non-CM mul_mm.comp (#16636) Signed-off-by: Stefan Savic Co-authored-by: Stefan Savic --- .../ggml-vulkan/vulkan-shaders/mul_mm.comp | 33 +++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp index d260969f0..5c5251da3 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp @@ -100,7 +100,6 @@ layout (push_constant) uniform parameter layout (constant_id = 0) const uint BLOCK_SIZE = 64; layout (constant_id = 1) const uint BM = 64; layout (constant_id = 2) const uint BN = 64; -layout (constant_id = 3) const uint BK = 16; // Assumed to be 32 if working with a quant layout (constant_id = 4) const uint WM = 32; layout (constant_id = 5) const uint WN = 32; layout (constant_id = 6) const uint WMITER = 2; @@ -109,6 +108,14 @@ layout (constant_id = 8) const uint TN = 2; layout (constant_id = 9) const uint TK = 1; // Only needed for coopmat layout (constant_id = 10) const uint WARP = 32; +#if defined(DATA_A_F32) || defined(DATA_A_F16) +#define BK 32 +#define BK_STEP 4 +#else +layout (constant_id = 3) const uint BK = 16; // Assumed to be 32 if working with a quant +#define BK_STEP 2 +#endif + #ifdef COOPMAT #define SHMEM_STRIDE (BK / 2 + 4) #else @@ -244,8 +251,13 @@ void main() { } #else ACC_TYPE_VEC2 sums[WMITER * TM * WNITER * TN/2]; +#if defined(DATA_A_F32) || defined(DATA_A_F16) + FLOAT_TYPE_VEC4 cache_a[WMITER * TM]; + FLOAT_TYPE_VEC4 cache_b; +#else FLOAT_TYPE_VEC2 cache_a[WMITER * TM]; FLOAT_TYPE_VEC2 cache_b; +#endif [[unroll]] for (uint i = 0; i < WMITER*TM*WNITER*TN/2; i++) { sums[i] = ACC_TYPE_VEC2(0.0f, 0.0f); @@ -283,24 +295,41 @@ void main() { } } #else - [[unroll]] for (uint i = 0; i < BK / 2; i++) { + [[unroll]] for (uint i = 0; i < BK / BK_STEP; i++) { // Load from shared into cache [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) { [[unroll]] for (uint j = 0; j < TM; j++) { + #if defined(DATA_A_F32) || defined(DATA_A_F16) + cache_a[wsir * TM + j].xy = buf_a[(warp_r * WM + wsir * WSUBM + tiwr * TM + j) * SHMEM_STRIDE + 2 * i ]; + cache_a[wsir * TM + j].zw = buf_a[(warp_r * WM + wsir * WSUBM + tiwr * TM + j) * SHMEM_STRIDE + 2 * i + 1]; + #else cache_a[wsir * TM + j] = buf_a[(warp_r * WM + wsir * WSUBM + tiwr * TM + j) * SHMEM_STRIDE + i]; + #endif } } [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) { [[unroll]] for (uint cc = 0; cc < TN; cc++) { + #if defined(DATA_A_F32) || defined(DATA_A_F16) + cache_b.xy = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN 
+ cc) * SHMEM_STRIDE + 2 * i ]; + cache_b.zw = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + cc) * SHMEM_STRIDE + 2 * i + 1]; + #else cache_b = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + cc) * SHMEM_STRIDE + i]; + #endif [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) { [[unroll]] for (uint cr = 0; cr < TM / 2; cr++) { // [WNITER][TN][WMITER][TM / 2] -> [wsic][cc][wsir][cr] const uint sums_idx = (wsic * TN + cc) * WMITER * (TM / 2) + wsir * (TM / 2) + cr; + #if defined(DATA_A_F32) || defined(DATA_A_F16) + sums[sums_idx].x = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].x), ACC_TYPE(cache_b.x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].y), ACC_TYPE(cache_b.y), + fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].z), ACC_TYPE(cache_b.z), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].w), ACC_TYPE(cache_b.w), sums[sums_idx].x)))); + sums[sums_idx].y = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].x), ACC_TYPE(cache_b.x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].y), ACC_TYPE(cache_b.y), + fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].z), ACC_TYPE(cache_b.z), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].w), ACC_TYPE(cache_b.w), sums[sums_idx].y)))); + #else sums[sums_idx].x = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].x), ACC_TYPE(cache_b.x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].y), ACC_TYPE(cache_b.y), sums[sums_idx].x)); sums[sums_idx].y = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].x), ACC_TYPE(cache_b.x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].y), ACC_TYPE(cache_b.y), sums[sums_idx].y)); + #endif } } } From c1b187688dac5e0f12cb38a63515aa20732d15a8 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sat, 8 Nov 2025 16:58:05 +0800 Subject: [PATCH 30/32] CUDA: skip fusion for repeating adds in bias (#17080) --- ggml/src/ggml-cuda/CMakeLists.txt | 1 + ggml/src/ggml-cuda/ggml-cuda.cu | 13 +++++++++++-- tests/test-backend-ops.cpp | 10 ++++++---- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index 302477513..67af1d8cc 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -124,6 +124,7 @@ if (CUDAToolkit_FOUND) if (GGML_CUDA_DEBUG) list(APPEND CUDA_FLAGS -lineinfo) + add_compile_definitions(GGML_CUDA_DEBUG) endif() if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8") diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 049aece1b..2d4314fba 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3152,8 +3152,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; - - #ifdef GGML_CUDA_DEBUG const int nodes_fused = i - prev_i - 1; prev_i = i; @@ -3302,6 +3300,13 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx continue; } + // we don't support repeating adds + if (bias_op == GGML_OP_ADD && + (!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) || + !ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) { + continue; + } + const ggml_tensor * src0 = up_n->src[0]; const ggml_tensor * src1 = up_n->src[1]; const ggml_tensor * ids = up_n->src[2]; @@ -3411,6 +3416,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx continue; } + if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) { + continue; + } + ggml_cuda_mm_fusion_args_host fusion_data{}; fusion_data.x_bias = bias_tensor; diff 
--git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 80216f6f3..31625bcc7 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -4984,8 +4984,10 @@ struct test_mul_mat_vec_fusion : public test_case {

     ggml_tensor * build_graph(ggml_context * ctx) override {
         if (!use_id) {
-            std::array<int64_t, 4> ne = {k, m, 1, 1};
-            std::array<int64_t, 4> ne0 = {k, n, 1, 1};
+            const int channels = 4;
+            const int samples = 2;
+            std::array<int64_t, 4> ne = { k, m, channels, samples };
+            std::array<int64_t, 4> ne0 = { k, n, channels, samples };

             ggml_tensor * cur = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne.data());
             ggml_tensor * gate = with_gate ? ggml_new_tensor(ctx, type, 4, ne0.data()) : nullptr;
@@ -4993,14 +4995,14 @@ struct test_mul_mat_vec_fusion : public test_case {
             ggml_tensor * ffn_up = ggml_mul_mat(ctx, up, cur);

             if (with_bias) {
-                std::array<int64_t, 4> bias_ne = {ffn_up->ne[0], 1, 1, 1};
+                std::array<int64_t, 4> bias_ne = { ffn_up->ne[0], 1, channels, samples };
                 ggml_tensor * up_bias = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, bias_ne.data());
                 ffn_up = ggml_add(ctx, ffn_up, up_bias);
             }

             ggml_tensor * ffn_gate = with_gate ? ggml_mul_mat(ctx, gate, cur) : nullptr;

             if (with_bias && with_gate) {
-                std::array<int64_t, 4> bias_ne = {ffn_gate->ne[0], 1, 1, 1};
+                std::array<int64_t, 4> bias_ne = { ffn_gate->ne[0], 1, channels, samples };
                 ggml_tensor * gate_bias = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, bias_ne.data());
                 ffn_gate = ggml_add(ctx, ffn_gate, gate_bias);
             }

From 64fe17fbb84f493dbc33e4c13042953c4f5bfaeb Mon Sep 17 00:00:00 2001
From: Aman Gupta
Date: Sat, 8 Nov 2025 21:05:19 +0800
Subject: [PATCH 31/32] Revert "CUDA: add expert reduce kernel (#16857)" (#17100)

---
 ggml/src/ggml-cuda/ggml-cuda.cu          |  26 ----
 ggml/src/ggml-cuda/moe-expert-reduce.cu  | 168 -----------------------
 ggml/src/ggml-cuda/moe-expert-reduce.cuh |  11 --
 tests/test-backend-ops.cpp               |  58 --------
 4 files changed, 263 deletions(-)
 delete mode 100644 ggml/src/ggml-cuda/moe-expert-reduce.cu
 delete mode 100644 ggml/src/ggml-cuda/moe-expert-reduce.cuh

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 2d4314fba..68dc57843 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -27,7 +27,6 @@
 #include "ggml-cuda/mmq.cuh"
 #include "ggml-cuda/mmvf.cuh"
 #include "ggml-cuda/mmvq.cuh"
-#include "ggml-cuda/moe-expert-reduce.cuh"
 #include "ggml-cuda/norm.cuh"
 #include "ggml-cuda/opt-step-adamw.cuh"
 #include "ggml-cuda/opt-step-sgd.cuh"
@@ -3197,31 +3196,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             continue;
         }

-        if (node->op == GGML_OP_MUL) {
-            int current_node = i + 1;
-            int num_views    = 0;
-            int num_adds     = 0;
-            while (current_node < cgraph->n_nodes && cgraph->nodes[current_node]->op == GGML_OP_VIEW) {
-                num_views++;
-                current_node++;
-            }
-
-            while (current_node < cgraph->n_nodes && cgraph->nodes[current_node]->op == GGML_OP_ADD &&
-                   num_adds < num_views - 1) {
-                num_adds++;
-                current_node++;
-            }
-
-            if (num_adds == num_views - 1 && num_views > 0) {
-                ggml_tensor * dst_node = cgraph->nodes[current_node - 1];
-                if (ggml_cuda_should_use_moe_expert_reduce(cgraph, i, current_node)) {
-                    ggml_cuda_op_moe_expert_reduce(*cuda_ctx, node->src[0], node->src[1], dst_node);
-                    i += num_views + num_adds;
-                    continue;
-                }
-            }
-        }
-
         if (node->op == GGML_OP_ADD) {
             int n_fuse = 0;
             ggml_op ops[8];
diff --git a/ggml/src/ggml-cuda/moe-expert-reduce.cu b/ggml/src/ggml-cuda/moe-expert-reduce.cu
deleted file mode 100644
index a97c5d573..000000000
--- a/ggml/src/ggml-cuda/moe-expert-reduce.cu
+++ /dev/null
@@ -1,168 +0,0 @@
-#include "moe-expert-reduce.cuh" - -// This kernel is a fusion of the expert weight reduce, common in MoE models - -template -__global__ void moe_expert_reduce_cuda(const float * __restrict__ experts, - const float * __restrict__ weights, - float * __restrict__ dst, - const int n_expert_used, - const int n_cols) { - const int row = blockIdx.x; - const int col = blockIdx.y * blockDim.x + threadIdx.x; - if (col >= n_cols) { - return; - } - - experts += row * n_cols * n_expert_used; - weights += row * n_expert_used; - dst += row * n_cols; - - float acc = 0.f; - if constexpr (n_expert_used_template == 0) { - for (int expert = 0; expert < n_expert_used; ++expert) { - ggml_cuda_mad(acc, experts[col], weights[expert]); - experts += n_cols; - } - dst[col] = acc; - } else { -#pragma unroll - for (int i = 0; i < n_expert_used_template; ++i) { - ggml_cuda_mad(acc, experts[col], weights[i]); - experts += n_cols; - } - dst[col] = acc; - } -} - -static void launch_moe_expert_reduce(ggml_backend_cuda_context & ctx, - const float * experts, - const float * weights, - float * dst, - const int n_expert_used, - const int n_cols, - const int n_rows) { - const int block_size = 32; - - const int n_blocks_x = n_rows; - const int n_blocks_y = (n_cols + block_size - 1) / block_size; - - dim3 block_dims(block_size); - dim3 grid_dims(n_blocks_x, n_blocks_y); - - cudaStream_t stream = ctx.stream(); - switch (n_expert_used) { - case 1: - moe_expert_reduce_cuda<1> - <<>>(experts, weights, dst, n_expert_used, n_cols); - break; - case 2: - moe_expert_reduce_cuda<2> - <<>>(experts, weights, dst, n_expert_used, n_cols); - break; - case 4: - moe_expert_reduce_cuda<4> - <<>>(experts, weights, dst, n_expert_used, n_cols); - break; - case 6: - moe_expert_reduce_cuda<6> - <<>>(experts, weights, dst, n_expert_used, n_cols); - break; - case 8: - moe_expert_reduce_cuda<8> - <<>>(experts, weights, dst, n_expert_used, n_cols); - break; - case 16: - moe_expert_reduce_cuda<16> - <<>>(experts, weights, dst, n_expert_used, n_cols); - break; - case 32: - moe_expert_reduce_cuda<32> - <<>>(experts, weights, dst, n_expert_used, n_cols); - break; - case 64: - moe_expert_reduce_cuda<64> - <<>>(experts, weights, dst, n_expert_used, n_cols); - break; - case 128: - moe_expert_reduce_cuda<128> - <<>>(experts, weights, dst, n_expert_used, n_cols); - break; - default: - moe_expert_reduce_cuda<0> - <<>>(experts, weights, dst, n_expert_used, n_cols); - break; - } -} - -bool ggml_cuda_should_use_moe_expert_reduce(const ggml_cgraph * cgraph, int start_index, int end_index) { - const ggml_tensor * mul = cgraph->nodes[start_index]; - - if (mul->op != GGML_OP_MUL || !ggml_is_contiguous(mul->src[0]) || !ggml_is_contiguous(mul->src[1])) { - return false; - } - - int current_node = start_index + 1; - size_t current_offset = 0; - - std::vector view_nodes; - //check if all are views of the expert in increasing order - while (current_node < end_index && cgraph->nodes[current_node]->op == GGML_OP_VIEW) { - const ggml_tensor * node = cgraph->nodes[current_node]; - if (node->view_src != mul) { - return false; - } - if (node->view_offs < current_offset) { - return false; - } - current_offset = node->view_offs; - current_node++; - view_nodes.push_back(node); - } - - //check if all the adds are in increasing order - const ggml_tensor * prev_add_src = view_nodes.empty() ? 
nullptr : view_nodes[0];
-    int num_adds = 0;
-    int num_views = view_nodes.size();
-    while (current_node < end_index && cgraph->nodes[current_node]->op == GGML_OP_ADD) {
-        const ggml_tensor * add_node = cgraph->nodes[current_node];
-
-        bool is_first_op_ok = num_views > num_adds ? add_node->src[0] == prev_add_src : false;
-        bool is_second_op_ok = num_views > num_adds ? add_node->src[1] == view_nodes[num_adds + 1] : false;
-
-        if (!is_first_op_ok || !is_second_op_ok) {
-            return false;
-        }
-        prev_add_src = add_node;
-
-        num_adds++;
-        current_node++;
-    }
-
-    if (num_views != num_adds + 1) {
-        return false;
-    }
-
-    return true;
-}
-
-void ggml_cuda_op_moe_expert_reduce(ggml_backend_cuda_context & ctx,
-                                    const ggml_tensor * experts,
-                                    const ggml_tensor * weights,
-                                    ggml_tensor * dst) {
-    const int n_rows        = experts->ne[2];
-    const int n_expert_used = experts->ne[1];
-    const int n_cols        = experts->ne[0];
-
-    GGML_ASSERT(experts->type == GGML_TYPE_F32);
-    GGML_ASSERT(weights->type == GGML_TYPE_F32);
-    GGML_ASSERT(ggml_is_contiguous(experts));
-    GGML_ASSERT(ggml_is_contiguous(weights));
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    const float * experts_d = (const float *) experts->data;
-    const float * weights_d = (const float *) weights->data;
-    float * dst_d = (float *) dst->data;
-
-    launch_moe_expert_reduce(ctx, experts_d, weights_d, dst_d, n_expert_used, n_cols, n_rows);
-}
diff --git a/ggml/src/ggml-cuda/moe-expert-reduce.cuh b/ggml/src/ggml-cuda/moe-expert-reduce.cuh
deleted file mode 100644
index cafc50e10..000000000
--- a/ggml/src/ggml-cuda/moe-expert-reduce.cuh
+++ /dev/null
@@ -1,11 +0,0 @@
-#include "common.cuh"
-#include "ggml.h"
-
-#include <vector>
-
-void ggml_cuda_op_moe_expert_reduce(ggml_backend_cuda_context & ctx,
-                                    const ggml_tensor * experts,
-                                    const ggml_tensor * weights,
-                                    ggml_tensor * dst);
-
-bool ggml_cuda_should_use_moe_expert_reduce(const ggml_cgraph * cgraph, int start_index, int end_index);
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 31625bcc7..2470c148d 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -4882,60 +4882,6 @@ struct test_topk_moe: public test_case {
     }
 };

-struct test_moe_expert_reduce : public test_case {
-    const int64_t n_embd;
-    const int64_t n_tokens;
-    const int64_t n_expert_used;
-
-    test_moe_expert_reduce(int64_t n_embd = 64, int64_t n_tokens = 5, int64_t n_expert_used = 4)
-        : n_embd(n_embd), n_tokens(n_tokens), n_expert_used(n_expert_used) {
-        GGML_ASSERT(n_expert_used > 1);
-    }
-
-    std::string vars() override {
-        return VARS_TO_STR3(n_embd, n_tokens, n_expert_used);
-    }
-
-    std::string op_desc(ggml_tensor * t) override {
-        GGML_UNUSED(t);
-        return "MOE_EXPERT_REDUCE";
-    }
-
-    bool run_whole_graph() override { return true; }
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * experts = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_expert_used, n_tokens);
-        ggml_set_name(experts, "experts");
-
-        ggml_tensor * weights = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, n_expert_used, n_tokens);
-        ggml_set_name(weights, "weights");
-
-        ggml_tensor * weighted = ggml_mul(ctx, experts, weights);
-        ggml_set_name(weighted, "weighted_experts");
-
-        std::vector<ggml_tensor *> expert_views(n_expert_used);
-        for (int64_t i = 0; i < n_expert_used; ++i) {
-            expert_views[i] = ggml_view_2d(ctx, weighted, n_embd, n_tokens, weighted->nb[2], i * weighted->nb[1]);
-
-            std::string name = "expert_view_" + std::to_string(i);
-            ggml_set_name(expert_views[i], name.c_str());
-            ggml_build_forward_expand(gf, 
expert_views[i]);
-        }
-
-        ggml_tensor * moe_out = expert_views[0];
-        for (int64_t i = 1; i < n_expert_used; ++i) {
-            moe_out = ggml_add(ctx, moe_out, expert_views[i]);
-
-            std::string name = "expert_add_" + std::to_string(i - 1);
-            ggml_set_name(moe_out, name.c_str());
-        }
-
-        ggml_set_name(moe_out, "moe_out");
-
-        return moe_out;
-    }
-};
-
 struct test_mul_mat_vec_fusion : public test_case {
     const ggml_type type;
     const ggml_glu_op glu_op;
@@ -7415,10 +7361,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_topk_moe({ 8, 22, 1, 1 }, 4, /*with_norm*/ false, /*delayed_softmax*/ true));
     test_cases.emplace_back(new test_topk_moe({ 32, 22, 1, 1 }, 8, /*with_norm*/ false, /*delayed_softmax*/ true));

-    test_cases.emplace_back(new test_moe_expert_reduce(1024, 5, 4));
-    test_cases.emplace_back(new test_moe_expert_reduce(80, 3, 6));
-    test_cases.emplace_back(new test_moe_expert_reduce(80, 3, 7));
-
 #if 0
     // these tests are disabled to save execution time, but they can be handy for debugging
     test_cases.emplace_back(new test_llama(2, true));

From eeee367de51fb34d46c8103fc0ae827e84d94470 Mon Sep 17 00:00:00 2001
From: Aidan <99101158+gSUz92nc@users.noreply.github.com>
Date: Sat, 8 Nov 2025 13:12:11 +0000
Subject: [PATCH 32/32] server: fix time_ms calculation in prompt_progress (#17093)

* fix: correct time_ms calculation in send_partial_response

The time_ms field was incorrectly calculated. The division was
happening before the subtraction, leading to incorrect values.

Before: (ggml_time_us() - slot.t_start_process_prompt / 1000)
After: (ggml_time_us() - slot.t_start_process_prompt) / 1000

* docs : document time_ms field in prompt_progress
---
 tools/server/README.md  | 2 +-
 tools/server/server.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/server/README.md b/tools/server/README.md
index 6828ef738..8fd478eb3 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -512,7 +512,7 @@ These words will not be included in the completion, so make sure to add them to

 `timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`

-`return_progress`: Include prompt processing progress in `stream` mode. The progress will be contained inside `prompt_progress` with 3 values: `total`, `cache` and `processed`. The overall progress is `processed/total`, while the actual timed progress is `(processed-cache)/(total-cache)`. Default: `false`
+`return_progress`: Include prompt processing progress in `stream` mode. The progress will be contained inside `prompt_progress` with 4 values: `total`, `cache`, `processed`, and `time_ms`. The overall progress is `processed/total`, while the actual timed progress is `(processed-cache)/(total-cache)`. The `time_ms` field contains the elapsed time in milliseconds since prompt processing started. Default: `false`

 `post_sampling_probs`: Returns the probabilities of top `n_probs` tokens after applying sampling chain. 
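The operator-precedence bug addressed in the server.cpp hunk below is easy to verify in isolation. A minimal standalone C++ sketch (the microsecond timestamps are invented example values; only the arithmetic mirrors the patched line):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t t_start_us = 8'000'000;  // hypothetical prompt-processing start, in microseconds
        const int64_t t_now_us   = 8'250'000;  // 250 ms later
        // buggy: '/' binds tighter than '-', so only t_start_us gets scaled
        const int64_t wrong = t_now_us - t_start_us / 1000;   // 8'242'000
        // fixed: subtract first, then convert microseconds to milliseconds
        const int64_t right = (t_now_us - t_start_us) / 1000; // 250
        printf("wrong=%lld right=%lld\n", (long long) wrong, (long long) right);
        return 0;
    }
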
diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 164e8cf4e..9d91e32d1 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3078,7 +3078,7 @@ struct server_context { res->progress.total = slot.task->n_tokens(); res->progress.cache = slot.n_prompt_tokens_cache; res->progress.processed = slot.prompt.tokens.size(); - res->progress.time_ms = (ggml_time_us() - slot.t_start_process_prompt / 1000); + res->progress.time_ms = (ggml_time_us() - slot.t_start_process_prompt) / 1000; } else { res->content = tkn.text_to_send; res->tokens = { tkn.tok };
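
For reference, the YaRN angle computation shared by the rope_* shaders in the earlier Vulkan patch translates almost one-to-one to plain C++. The sketch below is a standalone CPU-side illustration for spot-checking shader outputs, not backend code; the parameter names follow the rope_params struct, and the inputs in main are arbitrary example values:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // mirrors rope_yarn_ramp in rope_funcs.glsl (i0/2 uses integer division, as in GLSL)
    static float rope_yarn_ramp(float low, float high, unsigned i0) {
        const float y = (i0 / 2 - low) / std::max(0.001f, high - low);
        return 1.0f - std::min(1.0f, std::max(0.0f, y));
    }

    // mirrors rope_yarn: blends the interpolated and extrapolated angles and
    // applies the magnitude correction; is_back negates the angle so the
    // backward pass applies the inverse rotation
    static void rope_yarn(float theta_extrap, unsigned i0, float freq_scale,
                          float ext_factor, float attn_factor,
                          float corr_lo, float corr_hi, bool is_back,
                          float & cos_theta, float & sin_theta) {
        float mscale = attn_factor;
        float theta  = freq_scale * theta_extrap;
        if (ext_factor != 0.0f) {
            const float ramp_mix = rope_yarn_ramp(corr_lo, corr_hi, i0) * ext_factor;
            theta   = theta * (1 - ramp_mix) + theta_extrap * ramp_mix;
            mscale *= 1.0f + 0.1f * std::log(1.0f / freq_scale);
        }
        if (is_back) {
            theta = -theta;
        }
        cos_theta = std::cos(theta) * mscale;
        sin_theta = std::sin(theta) * mscale;
    }

    int main() {
        float c, s;
        // example values: theta_extrap = pos * theta_scale^(i0/2) = 1.5, dim index i0 = 4
        rope_yarn(1.5f, 4, 1.0f, 0.0f, 1.0f, 0.0f, 16.0f, false, c, s);
        printf("cos=%f sin=%f\n", c, s);
        return 0;
    }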