From 6da7168312a01da0f1ea50c8eb909340bb192c4e Mon Sep 17 00:00:00 2001 From: Masashi Yoshimura Date: Thu, 23 Apr 2026 02:51:40 +0900 Subject: [PATCH 01/35] ggml-webgpu: Add fused RMS_NORM + MUL (#21983) * fused rms_norm_mul + mul * Add GGML_WEBGPU_DISABLE_FUSION for being able to disable kernel fusion. * Decouple num_fused_ops from webgpu_context; misc cleanup * Fix eps handling and remove disable_fusion. * Fix not to use c++20 initializers. --- .../ggml-webgpu/ggml-webgpu-shader-lib.hpp | 71 +++++++- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 157 ++++++++++++++++-- .../wgsl-shaders/rms_norm_mul.wgsl | 139 ++++++++++++++++ 3 files changed, 349 insertions(+), 18 deletions(-) create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl diff --git a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp index f84dfee9d..6593a9fe1 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp @@ -194,6 +194,26 @@ struct ggml_webgpu_row_norm_pipeline_key_hash { } }; +/** RMS_NORM + MUL **/ + +struct ggml_webgpu_rms_norm_mul_pipeline_key { + bool inplace; + bool src_overlap; + + bool operator==(const ggml_webgpu_rms_norm_mul_pipeline_key & other) const { + return inplace == other.inplace && src_overlap == other.src_overlap; + } +}; + +struct ggml_webgpu_rms_norm_mul_pipeline_key_hash { + size_t operator()(const ggml_webgpu_rms_norm_mul_pipeline_key & key) const { + size_t seed = 0; + ggml_webgpu_hash_combine(seed, key.inplace); + ggml_webgpu_hash_combine(seed, key.src_overlap); + return seed; + } +}; + /** Pad **/ struct ggml_webgpu_pad_pipeline_key { bool circular; @@ -517,7 +537,7 @@ inline uint32_t ggml_webgpu_flash_attn_max_kv_tile(const ggml_webgpu_shader_lib_ const size_t q_tile = context.sg_mat_m; const size_t base_q_bytes = (key.head_dim_qk + key.head_dim_v) * q_tile * GGML_WEBGPU_F16_SIZE_BYTES + 2 * q_tile * GGML_WEBGPU_F32_SIZE_BYTES; - size_t bytes_per_kv = 0; + size_t bytes_per_kv = 0; if (!key.kv_direct) { bytes_per_kv += std::max(key.head_dim_qk, key.head_dim_v); } @@ -755,16 +775,17 @@ class ggml_webgpu_shader_lib { std::unordered_map cumsum_pipelines; // key is fixed, no variants yet std::unordered_map row_norm_pipelines; // op/inplace + std::unordered_map - get_rows_pipelines; // src_type, vectorized + get_rows_pipelines; // src_type, vectorized std::unordered_map - unary_pipelines; // type/op/inplace + unary_pipelines; // type/op/inplace std::unordered_map - scale_pipelines; // inplace + scale_pipelines; // inplace std::unordered_map - solve_tri_pipelines; // type + solve_tri_pipelines; // type std::unordered_map - ssm_conv_pipelines; // type/vectorized + ssm_conv_pipelines; // type/vectorized std::unordered_map @@ -813,6 +834,11 @@ class ggml_webgpu_shader_lib { std::unordered_map conv2d_pipelines; + std::unordered_map + rms_norm_mul_pipelines; + public: ggml_webgpu_shader_lib(wgpu::Device device) { this->device = device; } @@ -1828,6 +1854,39 @@ class ggml_webgpu_shader_lib { return unary_pipelines[key]; } + webgpu_pipeline get_rms_norm_mul_pipeline(const ggml_webgpu_shader_lib_context & context) { + ggml_webgpu_rms_norm_mul_pipeline_key key = {}; + key.inplace = context.inplace; + key.src_overlap = context.src_overlap; + + auto it = rms_norm_mul_pipelines.find(key); + if (it != rms_norm_mul_pipelines.end()) { + return it->second; + } + + std::vector defines; + std::string op_name = "RMS_NORM_MUL"; + std::string variant = op_name; + + if (key.inplace) { + 
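+            // INPLACE: the mul operand's buffer doubles as dst, so the shader
+            // binds only two tensors; the SRC_OVERLAP variant instead reads both
+            // sources from a single merged binding (see rms_norm_mul.wgsl below).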
defines.push_back("INPLACE"); + variant += "_inplace"; + } else if (key.src_overlap) { + defines.push_back("SRC_OVERLAP"); + variant += "_src_overlap"; + } + + defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size)); + + auto processed = preprocessor.preprocess(wgsl_rms_norm_mul, defines); + auto decisions = std::make_shared(); + decisions->wg_size = context.max_wg_size; + webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant); + pipeline.context = decisions; + rms_norm_mul_pipelines[key] = pipeline; + return rms_norm_mul_pipelines[key]; + } + webgpu_pipeline get_binary_pipeline(const ggml_webgpu_shader_lib_context & context) { ggml_webgpu_binary_pipeline_key key = {}; key.type = context.dst->type; diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 551586751..5d3169904 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -1972,6 +1972,94 @@ static webgpu_encoded_op ggml_webgpu_repeat(webgpu_context & ctx, ggml_tensor * return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } +static std::optional ggml_webgpu_rms_norm_mul(webgpu_context & ctx, + ggml_tensor * rn_src, + ggml_tensor * rn_dst, + ggml_tensor * mul_src0, + ggml_tensor * mul_src1, + ggml_tensor * dst) { + ggml_tensor * mul_src; + + if (ggml_webgpu_tensor_equal(rn_dst, mul_src0)) { + mul_src = mul_src1; + } else if (ggml_webgpu_tensor_equal(rn_dst, mul_src1)) { + mul_src = mul_src0; + } else { + GGML_ABORT("rms_norm must be equal to the one of mul_src0 and mul_src1"); + } + + bool inplace = (ggml_webgpu_tensor_equal(rn_dst, mul_src0) && ggml_webgpu_tensor_equal(mul_src1, dst)) || + (ggml_webgpu_tensor_equal(rn_dst, mul_src1) && ggml_webgpu_tensor_equal(mul_src0, dst)); + bool src_overlap = ggml_webgpu_tensor_overlap(rn_src, mul_src); + + uint32_t offset_merged_rn_src = 0; + uint32_t offset_merged_mul_src = 0; + size_t rn_src_webgpu_tensor_align_offset = ggml_webgpu_tensor_align_offset(ctx, rn_src); + size_t mul_src_webgpu_tensor_align_offset = ggml_webgpu_tensor_align_offset(ctx, mul_src); + + if (src_overlap) { + size_t min_offset = std::min(rn_src_webgpu_tensor_align_offset, mul_src_webgpu_tensor_align_offset); + offset_merged_rn_src = + (uint32_t) ((rn_src_webgpu_tensor_align_offset - min_offset) / ggml_type_size(rn_src->type)); + offset_merged_mul_src = + (uint32_t) ((mul_src_webgpu_tensor_align_offset - min_offset) / ggml_type_size(mul_src->type)); + } + + std::vector params = { + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, rn_src) / ggml_type_size(rn_src->type)), + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, mul_src) / ggml_type_size(mul_src->type)), + offset_merged_rn_src, + offset_merged_mul_src, + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), + (uint32_t) (rn_src->nb[1] / ggml_type_size(rn_src->type)), + (uint32_t) (rn_src->nb[2] / ggml_type_size(rn_src->type)), + (uint32_t) (rn_src->nb[3] / ggml_type_size(rn_src->type)), + (uint32_t) (mul_src->nb[1] / ggml_type_size(mul_src->type)), + (uint32_t) (mul_src->nb[2] / ggml_type_size(mul_src->type)), + (uint32_t) (mul_src->nb[3] / ggml_type_size(mul_src->type)), + (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)), + (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)), + (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)), + (uint32_t) mul_src->ne[0], + (uint32_t) mul_src->ne[1], + (uint32_t) mul_src->ne[2], + (uint32_t) mul_src->ne[3], + (uint32_t) dst->ne[0], + (uint32_t) 
dst->ne[1], + (uint32_t) dst->ne[2], + (uint32_t) dst->ne[3], + ggml_webgpu_u32_from_f32(ggml_get_op_params_f32(rn_dst, 0)) // epsilon, treated as f32 in the shader + }; + + std::vector entries; + + if (inplace) { + entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, rn_src)); + entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, mul_src)); + } else if (src_overlap) { + size_t merged_offset = std::min(rn_src_webgpu_tensor_align_offset, mul_src_webgpu_tensor_align_offset); + size_t merged_end = + std::max(rn_src_webgpu_tensor_align_offset + ggml_webgpu_tensor_binding_size(ctx, rn_src), + mul_src_webgpu_tensor_align_offset + ggml_webgpu_tensor_binding_size(ctx, mul_src)); + entries.push_back(ggml_webgpu_make_bind_group_entry(0, ggml_webgpu_tensor_buf(rn_src), merged_offset, + merged_end - merged_offset)); + entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, dst)); + } else { + entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, rn_src)); + entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, mul_src)); + entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst)); + } + + ggml_webgpu_shader_lib_context shader_lib_ctx = {}; + shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup; + shader_lib_ctx.inplace = inplace; + shader_lib_ctx.src_overlap = src_overlap; + + webgpu_pipeline pipeline = ctx->shader_lib->get_rms_norm_mul_pipeline(shader_lib_ctx); + + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, ggml_nrows(dst)); +} + static webgpu_encoded_op ggml_webgpu_row_norm(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { bool inplace = ggml_webgpu_tensor_equal(src, dst); @@ -2468,15 +2556,48 @@ static webgpu_encoded_op ggml_webgpu_sum_rows(webgpu_context & ctx, ggml_tensor return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } +static bool ggml_webgpu_can_fuse_rms_norm_mul(const struct ggml_cgraph * cgraph, int node_idx) { + if (!ggml_can_fuse(cgraph, node_idx, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + return false; + } + + // additional constraints specific to this fusion + const ggml_tensor * rms_norm = cgraph->nodes[node_idx]; + const ggml_tensor * mul = cgraph->nodes[node_idx + 1]; + + GGML_ASSERT(rms_norm->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(rms_norm->type == GGML_TYPE_F32); + // rms_norm only supports f32 + if (mul->src[0]->type != GGML_TYPE_F32 || mul->src[1]->type != GGML_TYPE_F32 || mul->type != GGML_TYPE_F32) { + return false; + } + // if rms_norm is the B operand, then we don't handle broadcast + if (rms_norm == mul->src[1] && !ggml_are_same_shape(mul->src[0], rms_norm)) { + return false; + } + // rms_norm shader assumes contiguous rows + if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) { + return false; + } + + return true; +} + // Returns the encoded command, or std::nullopt if the operation is a no-op -static std::optional ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) { +static std::optional ggml_webgpu_encode(webgpu_context ctx, + ggml_cgraph * cgraph, + int node_idx, + int & num_encoded_ops) { + ggml_tensor ** nodes = cgraph->nodes; + ggml_tensor * node = nodes[node_idx]; + if (ggml_is_empty(node)) { return std::nullopt; } if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { return std::nullopt; } - WEBGPU_LOG_DEBUG("ggml_webgpu_encode_node(" << node << ", " << ggml_op_name(node->op) << ")"); + WEBGPU_LOG_DEBUG("ggml_webgpu_encode(" << node << 
", " << ggml_op_name(node->op) << ")"); ggml_tensor * src0 = node->src[0]; ggml_tensor * src1 = node->src[1]; @@ -2519,6 +2640,13 @@ static std::optional ggml_webgpu_encode_node(webgpu_context c case GGML_OP_REPEAT: return ggml_webgpu_repeat(ctx, src0, node); case GGML_OP_RMS_NORM: + if (ggml_webgpu_can_fuse_rms_norm_mul(cgraph, node_idx)) { + num_encoded_ops = 2; + ggml_tensor * mul_node = nodes[node_idx + 1]; + return ggml_webgpu_rms_norm_mul(ctx, src0, node, mul_node->src[0], mul_node->src[1], mul_node); + } else { + return ggml_webgpu_row_norm(ctx, src0, node); + } case GGML_OP_L2_NORM: return ggml_webgpu_row_norm(ctx, src0, node); case GGML_OP_ROPE: @@ -2629,6 +2757,8 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str uint32_t num_inflight_batches = 0; bool contains_set_rows = false; bool batch_compute_passes = true; + int num_encoded_ops = 1; + int node_idx = 0; #ifdef GGML_WEBGPU_GPU_PROFILE ctx->profile_timestamp_query_count = 0; @@ -2641,11 +2771,11 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str ctx->active_compute_pass = ctx->active_command_encoder.BeginComputePass(); } - for (int i = 0; i < cgraph->n_nodes; i++) { - if (cgraph->nodes[i]->op == GGML_OP_SET_ROWS) { + while (node_idx < cgraph->n_nodes) { + if (cgraph->nodes[node_idx]->op == GGML_OP_SET_ROWS) { contains_set_rows = true; } - if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) { + if (auto cmd = ggml_webgpu_encode(ctx, cgraph, node_idx, num_encoded_ops)) { commands.push_back(*cmd); num_batched_kernels += cmd.value().num_kernels; #ifdef GGML_WEBGPU_GPU_PROFILE @@ -2670,6 +2800,9 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str ctx->param_arena.reset(); commands.clear(); } + + node_idx += num_encoded_ops; + num_encoded_ops = 1; } if (ctx->active_compute_pass) { @@ -3237,7 +3370,7 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) { ggml_backend_webgpu_device_context * dev_ctx = (ggml_backend_webgpu_device_context *) dev->context; webgpu_context webgpu_ctx = std::make_shared(); webgpu_ctx->global_ctx = dev_ctx->webgpu_global_ctx; - webgpu_ctx->shader_lib = std::make_unique(dev_ctx->webgpu_global_ctx->device); + webgpu_ctx->shader_lib = std::make_unique(dev_ctx->webgpu_global_ctx->device); webgpu_ctx->param_arena.init( webgpu_ctx->global_ctx->device, WEBGPU_PARAMS_BUF_SIZE_BYTES, webgpu_ctx->global_ctx->command_submit_batch_size + WEBGPU_NUM_PARAM_SLOT_SAFETY_MARGIN, @@ -3487,12 +3620,12 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const break; } // Head dimensions must fit in workgroup memory with minimum tile sizes - size_t limit_bytes = ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize; - const bool has_mask = op->src[3] != nullptr; - const bool kv_direct = src1->type == GGML_TYPE_F16 && - (src0->ne[0] % ctx->webgpu_global_ctx->capabilities.sg_mat_k) == 0 && - (src1->ne[1] % GGML_WEBGPU_KV_SEQ_PAD) == 0; - const size_t min_bytes = ggml_webgpu_flash_attn_wg_mem_bytes( + size_t limit_bytes = ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize; + const bool has_mask = op->src[3] != nullptr; + const bool kv_direct = src1->type == GGML_TYPE_F16 && + (src0->ne[0] % ctx->webgpu_global_ctx->capabilities.sg_mat_k) == 0 && + (src1->ne[1] % GGML_WEBGPU_KV_SEQ_PAD) == 0; + const size_t min_bytes = ggml_webgpu_flash_attn_wg_mem_bytes( ctx->webgpu_global_ctx->capabilities.sg_mat_m, 
ctx->webgpu_global_ctx->capabilities.sg_mat_n, (uint32_t) src0->ne[0], (uint32_t) src2->ne[0], has_mask, kv_direct); if (min_bytes > limit_bytes) { diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl new file mode 100644 index 000000000..71f063b51 --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl @@ -0,0 +1,139 @@ +#ifdef INPLACE + +@group(0) @binding(0) +var rn_src: array; + +@group(0) @binding(1) +var mul_src: array; + +@group(0) @binding(2) +var params: Params; + +fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32) { + mul_src[dst_offset] = scale * rn_src[rn_src_offset] * mul_src[mul_src_offset]; +} + +#elif SRC_OVERLAP + +@group(0) @binding(0) +var merged_src: array; + +@group(0) @binding(1) +var dst: array; + +@group(0) @binding(2) +var params: Params; + +fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32) { + dst[dst_offset] = scale * merged_src[rn_src_offset] * merged_src[mul_src_offset]; +} + +#else + +@group(0) @binding(0) +var rn_src: array; + +@group(0) @binding(1) +var mul_src: array; + +@group(0) @binding(2) +var dst: array; + +@group(0) @binding(3) +var params: Params; + +fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32) { + dst[dst_offset] = scale * rn_src[rn_src_offset] * mul_src[mul_src_offset]; +} + +#endif + +struct Params { + offset_rn_src: u32, + offset_mul_src: u32, + offset_merged_rn_src: u32, + offset_merged_mul_src: u32, + offset_dst: u32, + + stride_rn_src1: u32, + stride_rn_src2: u32, + stride_rn_src3: u32, + + stride_mul_src1: u32, + stride_mul_src2: u32, + stride_mul_src3: u32, + + stride_dst1: u32, + stride_dst2: u32, + stride_dst3: u32, + + mul_src_ne0: u32, + mul_src_ne1: u32, + mul_src_ne2: u32, + mul_src_ne3: u32, + + ne0: u32, + ne1: u32, + ne2: u32, + ne3: u32, + + eps: f32 +}; + +var scratch: array; + +@compute @workgroup_size(WG_SIZE) +fn main(@builtin(workgroup_id) wid: vec3, + @builtin(local_invocation_id) lid: vec3) { + + // one thread per row + var i = wid.x; + let i3 = i / (params.ne2 * params.ne1); + i = i % (params.ne2 * params.ne1); + let i2 = i / params.ne1; + let i1 = i % params.ne1; + let i_rn_src_row = params.offset_rn_src + params.offset_merged_rn_src + i3 * params.stride_rn_src3 + i2 * params.stride_rn_src2 + i1 * params.stride_rn_src1; + let i_mul_src_row = params.offset_mul_src + params.offset_merged_mul_src + (i3 % params.mul_src_ne3) * params.stride_mul_src3 + (i2 % params.mul_src_ne2) * params.stride_mul_src2 + (i1 % params.mul_src_ne1) * params.stride_mul_src1; + let i_dst_row = params.offset_dst + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1; + + let elems = (params.ne0 + WG_SIZE - 1) / WG_SIZE; + + var sum = 0.0f; + var col = lid.x; + for (var j: u32 = 0; j < elems; j++) { + if (col >= params.ne0) { + break; + } +#ifdef SRC_OVERLAP + sum += pow(merged_src[i_rn_src_row + col], 2.0); +#else + sum += pow(rn_src[i_rn_src_row + col], 2.0); +#endif + col += WG_SIZE; + } + + scratch[lid.x] = sum; + + workgroupBarrier(); + + var offset: u32 = WG_SIZE / 2; + while (offset > 0) { + if (lid.x < offset) { + scratch[lid.x] += scratch[lid.x + offset]; + } + offset = offset / 2; + workgroupBarrier(); + } + sum = scratch[0]; + + let scale = 1.0/sqrt(sum/f32(params.ne0) + params.eps); + + col = lid.x; + for (var j: u32 = 0; j < elems; j++) { + if (col >= params.ne0) { + break; + } + update(i_rn_src_row + col, i_dst_row + col, scale, 
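+               // col % mul_src_ne0 implements ggml's repeat/broadcast of the
+               // mul operand along dim 0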
i_mul_src_row + col % params.mul_src_ne0); + col += WG_SIZE; + } +} From 0d0764dfd257c0ae862525c05778207f87b99b1c Mon Sep 17 00:00:00 2001 From: Nikhil Jain Date: Wed, 22 Apr 2026 10:52:01 -0700 Subject: [PATCH 02/35] [WebGPU] Implement async tensor api and event api (#22099) * Only run webgpu CI on my fork * Implement set_tensor_async * Implement synchronize api * Implement event creation and deletion API * Cleanup * Cleanup * Comment out jobs for local CI run * Add webgpu only workflow * Delete .github/workflows/build-webgpu.yml * Cleanup * Cleanup * Update API with function handlers * Run clang-format * Replace one-shot buffer with a direct queue.WriteBuffer using the buffer context --- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 99 ++++++++++++++++++++++++++-- 1 file changed, 92 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 5d3169904..44e3bf822 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -2832,22 +2832,107 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str return GGML_STATUS_SUCCESS; } +struct ggml_backend_webgpu_event_context { + webgpu_global_context global_ctx; + wgpu::Future future; + bool recorded = false; +}; + +static ggml_backend_event_t ggml_backend_webgpu_device_event_new(ggml_backend_dev_t device) { + ggml_backend_webgpu_device_context * dev_ctx = (ggml_backend_webgpu_device_context *) device->context; + + auto * event_ctx = new ggml_backend_webgpu_event_context(); + event_ctx->global_ctx = dev_ctx->webgpu_global_ctx; + + auto * event = new ggml_backend_event; + event->device = device; + event->context = event_ctx; + return event; +} + +static void ggml_backend_webgpu_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) { + GGML_UNUSED(dev); + delete static_cast(event->context); + delete event; +} + +static void ggml_backend_webgpu_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) { + GGML_UNUSED(dev); + ggml_backend_webgpu_event_context * event_ctx = (ggml_backend_webgpu_event_context *) event->context; + if (!event_ctx->recorded) { + return; + } + wgpu::WaitStatus status = + event_ctx->global_ctx->instance.WaitAny(event_ctx->future, WEBGPU_RUNTIME_WAIT_TIMEOUT_NS); + if (status == wgpu::WaitStatus::TimedOut) { + GGML_ABORT("ggml_webgpu: event_synchronize timed out after %u ms\n", WEBGPU_RUNTIME_WAIT_TIMEOUT_MS); + } + event_ctx->recorded = false; +} + +static void ggml_backend_webgpu_event_record(ggml_backend_t backend, ggml_backend_event_t event) { + ggml_backend_webgpu_context * backend_ctx = (ggml_backend_webgpu_context *) backend->context; + ggml_backend_webgpu_event_context * event_ctx = (ggml_backend_webgpu_event_context *) event->context; + + event_ctx->future = backend_ctx->webgpu_ctx->global_ctx->queue.OnSubmittedWorkDone( + wgpu::CallbackMode::AllowSpontaneous, [](wgpu::QueueWorkDoneStatus, wgpu::StringView) {}); + event_ctx->recorded = true; +} + +static void ggml_backend_webgpu_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { + GGML_UNUSED(backend); + ggml_backend_webgpu_device_event_synchronize(nullptr, event); +} + +static void ggml_backend_webgpu_set_tensor_async(ggml_backend_t backend, + ggml_tensor * tensor, + const void * data, + size_t offset, + size_t size) { + GGML_UNUSED(backend); + auto * buf_ctx = (ggml_backend_webgpu_buffer_context *) tensor->buffer->context; + size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + 
offset; + + // Write aligned portion + buf_ctx->global_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, (size / 4) * 4); + + if (size % 4 != 0) { + // If size is not a multiple of 4, we need to memset the remaining bytes + size_t remaining_size = size % 4; + + // pack the remaining bytes into a uint32_t + uint32_t val32 = 0; + + for (size_t i = 0; i < remaining_size; i++) { + ((uint8_t *) &val32)[i] = ((const uint8_t *) data)[size - remaining_size + i]; + } + // memset the remaining bytes + ggml_backend_webgpu_buffer_memset(buf_ctx->global_ctx, buf_ctx->buffer, val32, + total_offset + (size - remaining_size), remaining_size); + } +} + +static void ggml_backend_webgpu_synchronize(ggml_backend_t backend) { + ggml_backend_webgpu_context * backend_ctx = (ggml_backend_webgpu_context *) backend->context; + ggml_backend_webgpu_wait_queue(backend_ctx->webgpu_ctx->global_ctx); +} + static ggml_backend_i ggml_backend_webgpu_i = { /* .get_name = */ ggml_backend_webgpu_name, /* .free = */ ggml_backend_webgpu_free, - /* .set_tensor_async = */ NULL, + /* .set_tensor_async = */ ggml_backend_webgpu_set_tensor_async, /* .get_tensor_async = */ NULL, /* .get_tensor_2d_async = */ NULL, /* .set_tensor_2d_async = */ NULL, /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, + /* .synchronize = */ ggml_backend_webgpu_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, /* .graph_plan_update = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_webgpu_graph_compute, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, + /* .event_record = */ ggml_backend_webgpu_event_record, + /* .event_wait = */ ggml_backend_webgpu_event_wait, /* .graph_optimize = */ NULL, }; @@ -3810,9 +3895,9 @@ static struct ggml_backend_device_i ggml_backend_webgpu_device_i = { /* .supports_op = */ ggml_backend_webgpu_device_supports_op, /* .supports_buft = */ ggml_backend_webgpu_device_supports_buft, /* .offload_op = */ NULL, - /* .event_new = */ NULL, - /* .event_free = */ NULL, - /* .event_synchronize = */ NULL, + /* .event_new = */ ggml_backend_webgpu_device_event_new, + /* .event_free = */ ggml_backend_webgpu_device_event_free, + /* .event_synchronize = */ ggml_backend_webgpu_device_event_synchronize, }; /* End GGML Backend Device Interface */ From 6217b49583432f55014c2a0551f453d42b300530 Mon Sep 17 00:00:00 2001 From: uvos Date: Thu, 23 Apr 2026 02:34:31 +0200 Subject: [PATCH 03/35] HIP: flip GGML_HIP_GRAPHS to default on (#22254) In #11362 hip graph was disabled by default as, at the time, its performance impact was negative. Due to improvements in rocm and our usage and construction of graphs this is no longer true, so lets change the default. --- ggml/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 2effd587b..b9f7deb15 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -213,7 +213,7 @@ set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size") option(GGML_HIP "ggml: use HIP" OFF) -option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF) +option(GGML_HIP_GRAPHS "ggml: use HIP graph" ON) option(GGML_HIP_RCCL "ggml: use ROCm Collective Comm. 
Library" OFF) option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON) option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF) From 86db42e97f6f20330b1a54653eeff6814162c39b Mon Sep 17 00:00:00 2001 From: Anav Prasad Date: Thu, 23 Apr 2026 02:28:56 +0000 Subject: [PATCH 04/35] CUDA: fuse relu + sqr (#22249) --- ggml/src/ggml-cuda/ggml-cuda.cu | 30 +++++++++++++++++++++++++ ggml/src/ggml-cuda/unary.cu | 23 +++++++++++++++++++ ggml/src/ggml-cuda/unary.cuh | 2 ++ tests/test-backend-ops.cpp | 40 +++++++++++++++++++++++++++++++++ 4 files changed, 95 insertions(+) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 185956317..1c2c3b4ac 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3592,6 +3592,30 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, return true; } + if (ops.size() == 2 && ops.begin()[0] == GGML_OP_UNARY && ops.begin()[1] == GGML_OP_SQR + && unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_RELU) { + const ggml_tensor * unary = cgraph->nodes[node_idx]; + const ggml_tensor * sqr = cgraph->nodes[node_idx+1]; + + if (ggml_get_unary_op(unary) != GGML_UNARY_OP_RELU) { + return false; + } + + if (unary->type != GGML_TYPE_F32 && unary->type != GGML_TYPE_F16) { + return false; + } + + if (unary->type != sqr->type) { + return false; + } + + if (!ggml_is_contiguous(unary->src[0])) { + return false; + } + + return true; + } + if (ops.size() == 3 && ops.begin()[0] == GGML_OP_SCALE && ops.begin()[1] == GGML_OP_UNARY && ops.begin()[2] == GGML_OP_SCALE && unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_TANH) { const ggml_tensor *scale = cgraph->nodes[node_idx]; @@ -4100,6 +4124,12 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud continue; } + if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_SQR }, { GGML_UNARY_OP_RELU })) { + ggml_cuda_op_relu_sqr(*cuda_ctx, node, cgraph->nodes[i+1]); + i++; + continue; + } + if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) { i += 2; ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node); diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu index 4ad30fa1f..2aeba26f4 100644 --- a/ggml/src/ggml-cuda/unary.cu +++ b/ggml/src/ggml-cuda/unary.cu @@ -65,6 +65,11 @@ static __device__ __forceinline__ float op_sqr(float x) { return x * x; } +static __device__ __forceinline__ float op_relu_sqr(float x) { + const float r = fmaxf(x, 0.0f); + return r * r; +} + static __device__ __forceinline__ float op_sqrt(float x) { return sqrtf(x); } @@ -615,3 +620,21 @@ void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary GGML_ABORT("Unsupported unary op for fused unary+mul"); } } + +/* fused relu + sqr */ + +void ggml_cuda_op_relu_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * relu_node, ggml_tensor * sqr_node) { + const ggml_tensor * src = relu_node->src[0]; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(ggml_is_contiguous(src)); + GGML_ASSERT(src->type == GGML_TYPE_F32 || src->type == GGML_TYPE_F16); + GGML_ASSERT(src->type == sqr_node->type); + + const int k = ggml_nelements(src); + if (src->type == GGML_TYPE_F16) { + unary_cuda((const half *)src->data, (half *)sqr_node->data, k, stream); + } else { + unary_cuda((const float *)src->data, (float *)sqr_node->data, k, stream); + } +} diff --git a/ggml/src/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh index f1dd2183a..81ed873ec 
100644 --- a/ggml/src/ggml-cuda/unary.cuh +++ b/ggml/src/ggml-cuda/unary.cuh @@ -91,6 +91,8 @@ void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary_node, ggml_tensor * mul_node); +void ggml_cuda_op_relu_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * relu_node, ggml_tensor * sqr_node); + __device__ __forceinline__ float ggml_cuda_op_silu_single(float x) { return x / (1.0f + expf(-x)); } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 828a9c14a..716011316 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3522,6 +3522,40 @@ struct test_add_rms_norm : public test_case { } }; +// GGML_OP_UNARY(RELU) + GGML_OP_SQR (fused operation) +struct test_relu_sqr : public test_case { + const ggml_type type; + const std::array ne; + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return "RELU_SQR"; + } + + bool run_whole_graph() override { return true; } + + std::string vars() override { + return VARS_TO_STR2(type, ne); + } + + test_relu_sqr(ggml_type type = GGML_TYPE_F32, + std::array ne = {128, 2, 2, 2}) + : type(type), ne(ne) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + ggml_set_name(a, "a"); + + ggml_tensor * r = ggml_relu(ctx, a); + ggml_set_name(r, "relu"); + + ggml_tensor * out = ggml_sqr(ctx, r); + ggml_set_name(out, "out"); + + return out; + } +}; + // GGML_OP_SSM_CONV struct test_ssm_conv : public test_case { const ggml_type type; @@ -7311,6 +7345,12 @@ static std::vector> make_test_cases_eval() { } } + // fused relu + sqr (squared ReLU) + for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) { + test_cases.emplace_back(new test_relu_sqr(type, { 128, 2, 2, 2 })); + test_cases.emplace_back(new test_relu_sqr(type, { 5, 7, 11, 13 })); + } + // glu ops for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) { for (int v : {0, 1}) { From b76429a69c1bfc12e56f65cd707b7b8a4260c86d Mon Sep 17 00:00:00 2001 From: Chen Yuan Date: Wed, 22 Apr 2026 23:17:41 -0400 Subject: [PATCH 05/35] ggml-webgpu: add support for im2col (#22259) * shader(im2col): implement the im2col shader * shader(im2col): clean the formatting issues * shader(im2col): clean the editorconfig checker warning * fix(shader): address the workgroup issues of im2col and conv2d --- .../ggml-webgpu/ggml-webgpu-shader-lib.hpp | 59 ++++++++ ggml/src/ggml-webgpu/ggml-webgpu.cpp | 127 +++++++++++++++--- ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl | 101 ++++++++++++++ 3 files changed, 268 insertions(+), 19 deletions(-) create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl diff --git a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp index 6593a9fe1..efc5b8c97 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp @@ -281,6 +281,25 @@ struct ggml_webgpu_conv2d_pipeline_key_hash { } }; +/** Im2Col **/ +struct ggml_webgpu_im2col_pipeline_key { + ggml_type input_type; + ggml_type output_type; + + bool operator==(const ggml_webgpu_im2col_pipeline_key & other) const { + return input_type == other.input_type && output_type == other.output_type; + } +}; + +struct ggml_webgpu_im2col_pipeline_key_hash { + size_t operator()(const ggml_webgpu_im2col_pipeline_key & key) const { + size_t seed = 0; + ggml_webgpu_hash_combine(seed, key.input_type); + ggml_webgpu_hash_combine(seed, 
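+        // the output type is part of the key: im2col may emit f16 from f32 input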
key.output_type); + return seed; + } +}; + /** Gated Delta Net **/ struct ggml_webgpu_gated_delta_net_pipeline_key { int type; @@ -833,6 +852,8 @@ class ggml_webgpu_shader_lib { soft_max_pipelines; std::unordered_map conv2d_pipelines; + std::unordered_map + im2col_pipelines; std::unordered_maptype; + key.output_type = context.dst->type; + + auto it = im2col_pipelines.find(key); + if (it != im2col_pipelines.end()) { + return it->second; + } + + std::vector defines; + std::string variant = "im2col"; + + auto push_type_defines = [&](const char * prefix, ggml_type type) { + std::string s_prefix = prefix; + if (type == GGML_TYPE_F32) { + defines.push_back(s_prefix + "_F32"); + } else if (type == GGML_TYPE_F16) { + defines.push_back(s_prefix + "_F16"); + } else { + GGML_ABORT("Unsupported type for IM2COL shader"); + } + }; + + push_type_defines("INPUT", key.input_type); + push_type_defines("OUTPUT", key.output_type); + + defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size)); + + auto processed = preprocessor.preprocess(wgsl_im2col, defines); + auto decisions = std::make_shared(); + decisions->wg_size = context.max_wg_size; + webgpu_pipeline pipeline = ggml_webgpu_create_pipeline(device, processed, variant); + pipeline.context = decisions; + im2col_pipelines[key] = pipeline; + return im2col_pipelines[key]; + } + private: static webgpu_pipeline ggml_webgpu_create_pipeline(wgpu::Device & device, std::string shader_code, diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 44e3bf822..bcca2bd46 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -979,25 +979,108 @@ static webgpu_encoded_op ggml_webgpu_conv_2d(webgpu_context & ctx, ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, dst), }; - uint32_t max_wg_size = - std::min((uint32_t) WEBGPU_MAX_WG_SIZE, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupSizeX); - uint32_t wg_size = - std::min((uint32_t) ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup, max_wg_size); - ggml_webgpu_shader_lib_context shader_lib_ctx = {}; shader_lib_ctx.src0 = src0; shader_lib_ctx.src1 = src1; shader_lib_ctx.dst = dst; - shader_lib_ctx.max_wg_size = wg_size; + shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup; webgpu_pipeline pipeline = ctx->shader_lib->get_conv2d_pipeline(shader_lib_ctx); auto * decisions = static_cast(pipeline.context.get()); - uint32_t n_out = ggml_nelements(dst); - uint32_t total_wg = CEIL_DIV(n_out, decisions->wg_size); - uint32_t max_wg = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension; - uint32_t wg_x = std::min(total_wg, max_wg); + uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size); + uint32_t wg_x = std::min(ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, total_wg); + uint32_t wg_y = CEIL_DIV(total_wg, wg_x); + + return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y); +} + +static webgpu_encoded_op ggml_webgpu_im2col(webgpu_context & ctx, + ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * dst) { + const int32_t s0 = ggml_get_op_params_i32(dst, 0); + const int32_t s1 = ggml_get_op_params_i32(dst, 1); + const int32_t p0 = ggml_get_op_params_i32(dst, 2); + const int32_t p1 = ggml_get_op_params_i32(dst, 3); + const int32_t d0 = ggml_get_op_params_i32(dst, 4); + const int32_t d1 = ggml_get_op_params_i32(dst, 5); + const bool is_2D = 
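+    // op_params[6] carries the is_2D flag written by ggml_im2col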
ggml_get_op_params_i32(dst, 6) == 1; + + const uint32_t KW = src0->ne[0]; + const uint32_t KH = is_2D ? src0->ne[1] : 1; + const uint32_t IC = is_2D ? src0->ne[2] : src0->ne[1]; + + const uint32_t IW = src1->ne[0]; + const uint32_t IH = is_2D ? src1->ne[1] : 1; + const uint32_t N = is_2D ? src1->ne[3] : src1->ne[2]; + + const uint32_t OW = dst->ne[1]; + const uint32_t OH = is_2D ? dst->ne[2] : 1; + + const uint32_t si0 = (uint32_t) (src1->nb[0] / ggml_type_size(src1->type)); + const uint32_t si1 = is_2D ? (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)) : 0; + const uint32_t si2 = is_2D ? (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)) : + (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)); + const uint32_t si3 = is_2D ? (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)) : + (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)); + + const uint32_t so0 = (uint32_t) (dst->nb[0] / ggml_type_size(dst->type)); + const uint32_t so1 = (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)); + const uint32_t so2 = is_2D ? (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)) : 0; + const uint32_t so3 = is_2D ? (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)) : + (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)); + + std::vector params = { + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)), + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), + + si0, + si1, + si2, + si3, + so0, + so1, + so2, + so3, + + KW, + KH, + IC, + + IW, + IH, + N, + + OW, + OH, + + (uint32_t) s0, + (uint32_t) s1, + (uint32_t) p0, + (uint32_t) p1, + (uint32_t) d0, + (uint32_t) d1, + }; + + std::vector entries = { + ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, src1), + ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, dst), + }; + + ggml_webgpu_shader_lib_context shader_lib_ctx = {}; + shader_lib_ctx.src0 = src0; + shader_lib_ctx.src1 = src1; + shader_lib_ctx.dst = dst; + shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup; + + webgpu_pipeline pipeline = ctx->shader_lib->get_im2col_pipeline(shader_lib_ctx); + + auto * decisions = static_cast(pipeline.context.get()); + + uint32_t total_wg = CEIL_DIV((uint32_t) ggml_nelements(dst), decisions->wg_size); + uint32_t wg_x = std::min(ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension, total_wg); uint32_t wg_y = CEIL_DIV(total_wg, wg_x); return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y); @@ -1988,8 +2071,8 @@ static std::optional ggml_webgpu_rms_norm_mul(webgpu_context GGML_ABORT("rms_norm must be equal to the one of mul_src0 and mul_src1"); } - bool inplace = (ggml_webgpu_tensor_equal(rn_dst, mul_src0) && ggml_webgpu_tensor_equal(mul_src1, dst)) || - (ggml_webgpu_tensor_equal(rn_dst, mul_src1) && ggml_webgpu_tensor_equal(mul_src0, dst)); + bool inplace = (ggml_webgpu_tensor_equal(rn_dst, mul_src0) && ggml_webgpu_tensor_equal(mul_src1, dst)) || + (ggml_webgpu_tensor_equal(rn_dst, mul_src1) && ggml_webgpu_tensor_equal(mul_src0, dst)); bool src_overlap = ggml_webgpu_tensor_overlap(rn_src, mul_src); uint32_t offset_merged_rn_src = 0; @@ -2689,6 +2772,8 @@ static std::optional ggml_webgpu_encode(webgpu_context ctx, return ggml_webgpu_sum_rows(ctx, src0, node); case GGML_OP_CONV_2D: return ggml_webgpu_conv_2d(ctx, src0, src1, node); + case GGML_OP_IM2COL: + return ggml_webgpu_im2col(ctx, src0, src1, node); default: return std::nullopt; } @@ -3455,7 +3540,7 @@ static webgpu_context 
initialize_webgpu_context(ggml_backend_dev_t dev) { ggml_backend_webgpu_device_context * dev_ctx = (ggml_backend_webgpu_device_context *) dev->context; webgpu_context webgpu_ctx = std::make_shared(); webgpu_ctx->global_ctx = dev_ctx->webgpu_global_ctx; - webgpu_ctx->shader_lib = std::make_unique(dev_ctx->webgpu_global_ctx->device); + webgpu_ctx->shader_lib = std::make_unique(dev_ctx->webgpu_global_ctx->device); webgpu_ctx->param_arena.init( webgpu_ctx->global_ctx->device, WEBGPU_PARAMS_BUF_SIZE_BYTES, webgpu_ctx->global_ctx->command_submit_batch_size + WEBGPU_NUM_PARAM_SLOT_SAFETY_MARGIN, @@ -3705,12 +3790,12 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const break; } // Head dimensions must fit in workgroup memory with minimum tile sizes - size_t limit_bytes = ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize; - const bool has_mask = op->src[3] != nullptr; - const bool kv_direct = src1->type == GGML_TYPE_F16 && - (src0->ne[0] % ctx->webgpu_global_ctx->capabilities.sg_mat_k) == 0 && - (src1->ne[1] % GGML_WEBGPU_KV_SEQ_PAD) == 0; - const size_t min_bytes = ggml_webgpu_flash_attn_wg_mem_bytes( + size_t limit_bytes = ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize; + const bool has_mask = op->src[3] != nullptr; + const bool kv_direct = src1->type == GGML_TYPE_F16 && + (src0->ne[0] % ctx->webgpu_global_ctx->capabilities.sg_mat_k) == 0 && + (src1->ne[1] % GGML_WEBGPU_KV_SEQ_PAD) == 0; + const size_t min_bytes = ggml_webgpu_flash_attn_wg_mem_bytes( ctx->webgpu_global_ctx->capabilities.sg_mat_m, ctx->webgpu_global_ctx->capabilities.sg_mat_n, (uint32_t) src0->ne[0], (uint32_t) src2->ne[0], has_mask, kv_direct); if (min_bytes > limit_bytes) { @@ -3802,6 +3887,10 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); break; + case GGML_OP_IM2COL: + supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && + (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); + break; case GGML_OP_SSM_CONV: supports_op = op->type == GGML_TYPE_F32; break; diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl new file mode 100644 index 000000000..386ebab87 --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/im2col.wgsl @@ -0,0 +1,101 @@ +#include "common_decls.tmpl" +enable f16; + +@group(0) @binding(0) +#if defined(INPUT_F32) +var input: array; +#elif defined(INPUT_F16) +var input: array; +#endif + +@group(0) @binding(1) +#if defined(OUTPUT_F32) +var output: array; +#elif defined(OUTPUT_F16) +var output: array; +#endif + +struct Params { + offset_i: u32, + offset_o: u32, + + // element strides + si0: u32, si1: u32, si2: u32, si3: u32, + so0: u32, so1: u32, so2: u32, so3: u32, + + KW: u32, KH: u32, IC: u32, + IW: u32, IH: u32, N: u32, + OW: u32, OH: u32, + + // stride + s0: u32, s1: u32, + // padding + p0: u32, p1: u32, + // dilation + d0: u32, d1: u32, +} + +@group(0) @binding(2) +var params: Params; + +fn load_input(idx: u32) -> f32 { + #if defined(INPUT_F32) + return input[idx]; + #elif defined(INPUT_F16) + return f32(input[idx]); + #endif +} + +fn store_output(idx: u32, val: f32) { + #if defined(OUTPUT_F32) + output[idx] = val; + #elif defined(OUTPUT_F16) + output[idx] = f16(val); + #endif +} + +@compute @workgroup_size(WG_SIZE) +fn main( + @builtin(global_invocation_id) gid: vec3, + 
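+    // num_workgroups is bound so the 2-D dispatch (wg_x, wg_y) can be flattened
+    // back into a single linear element index below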
@builtin(num_workgroups) num_wg: vec3 +) { + + let threads_per_group = u32(WG_SIZE); + let i_out = gid.x + (num_wg.x * threads_per_group) * gid.y; + let K = params.KW * params.KH * params.IC; + let M = params.OW * params.OH; + let total = K * M * params.N; + + if (i_out >= total) { + return; + } + + // decode (k, m, n) + var i = i_out; + let n = i / (K * M); + i = i % (K * M); + let m = i / K; + let k = i % K; + + // decode (oh, ow) + let oh = m / params.OW; + let ow = m % params.OW; + + // decode (kw, kh, ic) + let kw = k % params.KW; + let tmp = k / params.KW; + let kh = tmp % params.KH; + let ic = tmp / params.KH; + + let iw_i32 = i32(ow * params.s0 + kw * params.d0) - i32(params.p0); + let ih_i32 = i32(oh * params.s1 + kh * params.d1) - i32(params.p1); + + if (iw_i32 >= 0 && iw_i32 < i32(params.IW) && ih_i32 >= 0 && ih_i32 < i32(params.IH)) { + let iw = u32(iw_i32); + let ih = u32(ih_i32); + let in_idx = params.offset_i + iw * params.si0 + ih * params.si1 + ic * params.si2 + n * params.si3; + store_output(params.offset_o + k * params.so0 + ow * params.so1 + oh * params.so2 + n * params.so3, load_input(in_idx)); + } else { + store_output(params.offset_o + k * params.so0 + ow * params.so1 + oh * params.so2 + n * params.so3, 0.0); + } +} From 60b68a62792b952149631e1753ea12bb0a9de966 Mon Sep 17 00:00:00 2001 From: abotsis Date: Wed, 22 Apr 2026 23:18:56 -0600 Subject: [PATCH 06/35] sycl : fused MoE mul_mat_vec_q for TG (#21920) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * sycl : fused MoE mul_mat_vec_q for TG Create an MMVQ kernel so ggml_sycl_mul_mat_id can consolidate n_experts_used matmuls in a single kernel launch. The kernel also reads expert IDs directly, removing a per-call host sync. This is similar to the CUDA backend's ggml_cuda_mul_mat_vec_q* paths. All types supported in the current MMVQ are supported here as well: Q2_K, Q3_K, Q4_K, Q5_K, Q6_K, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0 It will fall back to the existing per-expert path when src0 has been rewritten by opt_for_reorder(), and for any shape the fused path doesn't handle. test-backend-ops passes for supported type/shape combos. Benchmark: Qwen3-Next-35B-A3B Q4_K_M on Intel Arc B70 (SYCL0), baseline 707c0b7a6, 16k context, -fa 0. build/bin/llama-bench -hf unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_M \ -p 1024 -n 128 -d 16384 -ngl 99 -fa 0 -ub 2048 -r 2 -dev SYCL0 Before (3 runs on 707c0b7a6): | test | run 1 | run 2 | run 3 | | --------------- | ----------------:| ----------------:| ----------------:| | pp1024 @ d16384 | 533.26 ± 4.87 | 535.20 ± 2.78 | 524.27 ± 3.10 | | tg128 @ d16384 | 33.47 ± 0.02 | 33.31 ± 0.02 | 33.17 ± 0.05 | After (3 runs on 707c0b7a6 + this patch): | test | run 1 | run 2 | run 3 | | --------------- | ----------------:| ----------------:| ----------------:| | pp1024 @ d16384 | 534.06 ± 0.97 | 531.95 ± 0.02 | 520.94 ± 20.10 | | tg128 @ d16384 | 45.85 ± 0.21 | 45.95 ± 0.45 | 46.22 ± 0.12 | disclosure: Claude wrote it, but I reviewed and understand the implementation (albeit my C is a little rusty). 
* sycl: also support nvfp4 and mxfp4 expert types * sycl: terser comments/nested dispatch in response to review * sycl: more comment cleanup in mmvq.cpp/hpp --------- Co-authored-by: Debian --- ggml/src/ggml-sycl/ggml-sycl.cpp | 51 +++++++++++ ggml/src/ggml-sycl/mmvq.cpp | 151 +++++++++++++++++++++++++++++++ ggml/src/ggml-sycl/mmvq.hpp | 16 ++++ 3 files changed, 218 insertions(+) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 3829da879..36923160d 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3808,6 +3808,51 @@ __dpct_inline__ static void k_copy_dst_from_contiguous( } } +// Fused MoE TG fast path. Returns false to fall back to the per-expert loop below. +static bool ggml_sycl_mul_mat_id_mmvq_fused( + ggml_backend_sycl_context & ctx, const ggml_tensor * src0, + const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) +{ + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + if (ne12 != 1) return false; + if (src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) return false; + if (ne10 != src0->ne[0] || ne10 % QK8_1 != 0) return false; + if (!ggml_is_contiguous(src1)) return false; + + // Reorder layout not supported; fall back. + const ggml_tensor_extra_gpu * src0_extra = + static_cast(src0->extra); + if (src0_extra && src0_extra->optimized_feature.reorder) return false; + + const int64_t n_ids_per_group = ids->ne[0]; + if (ids->ne[1] != 1) return false; + if (ne11 != 1 && ne11 != n_ids_per_group) return false; + + const queue_ptr stream = ctx.stream(); + const int src1_padded_cols = GGML_PAD((int) ne10, MATRIX_ROW_PADDING); + const int n_experts_used = (int) n_ids_per_group; + const int nrows = (int) src0->ne[1]; + + ggml_sycl_pool_alloc src1_q8_alloc(ctx.pool(), + (size_t) ne11 * src1_padded_cols * sizeof(block_q8_1) / QK8_1); + char * src1_ddq = src1_q8_alloc.get(); + quantize_row_q8_1_sycl( + (const float *) src1->data, src1_ddq, (int) ne10, (int) ne11, + src1_padded_cols, stream); + + const size_t bytes_per_qrow = (size_t) src1_padded_cols * sizeof(block_q8_1) / QK8_1; + const size_t src1_row_stride = (ne11 == 1) ? 0 : bytes_per_qrow; + + return ggml_sycl_mul_mat_vec_q_id( + src0->type, src0->data, src1_ddq, (const int32_t *) ids->data, + (float *) dst->data, (int) ne10, nrows, n_experts_used, + /*expert_weight_stride=*/ src0->nb[2], + /*dst_row_stride=*/ dst->nb[1], + src1_row_stride, stream); +} + static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, ggml_tensor *dst) try { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3); @@ -3823,6 +3868,12 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, const int64_t n_as = ne02; const int64_t n_ids = ids->ne[0]; + if (ne12 == 1) { + if (ggml_sycl_mul_mat_id_mmvq_fused(ctx, src0, src1, ids, dst)) { + return; + } + } + std::vector ids_host(ggml_nbytes(ids)); const char * ids_dev = (const char *) ids->data; diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index 3a4577ecb..8fa2198f3 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -1199,3 +1199,154 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens GGML_UNUSED(src1_ddf_i); GGML_UNUSED(ctx); } + +// src1_row_stride: 0 for shared src1 (gate/up proj), else per-expert stride (down proj). 
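+// Dispatch shape: group(1) indexes the expert slot and group(2) x local(1) the
+// dst row; each WARP_SIZE sub-group dot-products one row, then reduces via
+// sub-group shuffles, mirroring the non-MoE mul_mat_vec_q kernel above.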
+template +static void mul_mat_vec_q_moe( + const void * __restrict__ vx_base, const void * __restrict__ vy_base, + float * __restrict__ dst_base, const int32_t * __restrict__ ids_dev, + const int ncols, const int nrows, + const size_t expert_weight_stride, const size_t dst_row_stride, + const size_t src1_row_stride, + const sycl::nd_item<3> & item_ct1) { + + const int expert_idx = item_ct1.get_group(1); + const int i02 = ids_dev[expert_idx]; + + const char * vx = (const char *) vx_base + (size_t) i02 * expert_weight_stride; + const char * vy = (const char *) vy_base + (size_t) expert_idx * src1_row_stride; + float * dst = (float *) ((char *) dst_base + (size_t) expert_idx * dst_row_stride); + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + constexpr int blocks_per_warp = (vdr * WARP_SIZE + qi - 1) / qi; + + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; i += blocks_per_warp) { + const int ibx = row * blocks_per_row + i; + const int iby = i * (qk / QK8_1); + + for (size_t elem = 0; elem < qi / vdr; elem += WARP_SIZE) { + const int iqs = elem + vdr * (item_ct1.get_local_id(2) % (qi / vdr)); + tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs); + } + } + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template +static void launch_mul_mat_vec_q_moe( + const void * vx_base, const void * vy, const int32_t * ids_dev, + float * dst_base, const int ncols, const int nrows, const int n_experts_used, + const size_t expert_weight_stride, const size_t dst_row_stride, + const size_t src1_row_stride, + dpct::queue_ptr stream) { + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, (unsigned) n_experts_used, (unsigned) block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_moe( + vx_base, vy, dst_base, ids_dev, ncols, nrows, + expert_weight_stride, dst_row_stride, src1_row_stride, item); + }); + }); +} + +bool ggml_sycl_mul_mat_vec_q_id( + enum ggml_type src0_type, + const void * vx_base, + const void * vy, + const int32_t * ids_dev, + float * dst_base, + int ncols, + int nrows, + int n_experts_used, + size_t expert_weight_stride, + size_t dst_row_stride, + size_t src1_row_stride, + dpct::queue_ptr stream) { + switch (src0_type) { + case GGML_TYPE_Q4_0: + launch_mul_mat_vec_q_moe( + vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used, + expert_weight_stride, dst_row_stride, src1_row_stride, stream); + return true; + case GGML_TYPE_Q4_1: + launch_mul_mat_vec_q_moe( + vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used, + expert_weight_stride, dst_row_stride, src1_row_stride, stream); + return true; + case GGML_TYPE_Q5_0: + launch_mul_mat_vec_q_moe( + vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used, + expert_weight_stride, dst_row_stride, src1_row_stride, stream); + return true; + case GGML_TYPE_Q5_1: + launch_mul_mat_vec_q_moe( + vx_base, vy, 
ids_dev, dst_base, ncols, nrows, n_experts_used, + expert_weight_stride, dst_row_stride, src1_row_stride, stream); + return true; + case GGML_TYPE_Q8_0: + launch_mul_mat_vec_q_moe( + vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used, + expert_weight_stride, dst_row_stride, src1_row_stride, stream); + return true; + case GGML_TYPE_Q2_K: + launch_mul_mat_vec_q_moe( + vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used, + expert_weight_stride, dst_row_stride, src1_row_stride, stream); + return true; + case GGML_TYPE_Q3_K: + launch_mul_mat_vec_q_moe( + vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used, + expert_weight_stride, dst_row_stride, src1_row_stride, stream); + return true; + case GGML_TYPE_Q4_K: + launch_mul_mat_vec_q_moe( + vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used, + expert_weight_stride, dst_row_stride, src1_row_stride, stream); + return true; + case GGML_TYPE_Q5_K: + launch_mul_mat_vec_q_moe( + vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used, + expert_weight_stride, dst_row_stride, src1_row_stride, stream); + return true; + case GGML_TYPE_Q6_K: + launch_mul_mat_vec_q_moe( + vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used, + expert_weight_stride, dst_row_stride, src1_row_stride, stream); + return true; + case GGML_TYPE_MXFP4: + launch_mul_mat_vec_q_moe( + vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used, + expert_weight_stride, dst_row_stride, src1_row_stride, stream); + return true; + case GGML_TYPE_NVFP4: + launch_mul_mat_vec_q_moe( + vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used, + expert_weight_stride, dst_row_stride, src1_row_stride, stream); + return true; + default: + return false; + } +} diff --git a/ggml/src/ggml-sycl/mmvq.hpp b/ggml/src/ggml-sycl/mmvq.hpp index 049b43d45..d674dc1d6 100644 --- a/ggml/src/ggml-sycl/mmvq.hpp +++ b/ggml/src/ggml-sycl/mmvq.hpp @@ -24,4 +24,20 @@ void ggml_sycl_op_mul_mat_vec_q( const int64_t src1_ncols, const int64_t src1_padded_row_size, const dpct::queue_ptr &stream); +// Requires standard (non-reorder) block layout for src0. +// Returns false if src0_type isn't handled; caller should fall back. 
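+// Illustrative call (argument names hypothetical), mirroring the fused path in
+// ggml-sycl.cpp for a gate/up projection where all experts share one src1 row:
+//   ggml_sycl_mul_mat_vec_q_id(src0->type, src0->data, src1_q8_1, ids_dev,
+//                              dst, ne10, nrows, n_experts_used,
+//                              src0->nb[2], dst->nb[1],
+//                              /*src1_row_stride=*/0, stream);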
+bool ggml_sycl_mul_mat_vec_q_id( + enum ggml_type src0_type, + const void * vx_base, // start of stacked expert weights + const void * vy, // pre-quantized src1 (Q8_1) + const int32_t * ids_dev, // device-side int32, length n_experts_used + float * dst_base, + int ncols, + int nrows, + int n_experts_used, + size_t expert_weight_stride, // bytes between experts in vx_base + size_t dst_row_stride, // bytes between dst rows + size_t src1_row_stride, // 0 = shared src1, else per-expert stride in bytes + dpct::queue_ptr stream); + #endif // GGML_SYCL_MMVQ_HPP From 5eaee65384f8bcf3564a92c7f9c33b5d80b6267f Mon Sep 17 00:00:00 2001 From: ynankani Date: Thu, 23 Apr 2026 05:19:51 +0000 Subject: [PATCH 07/35] convert : Handle ModelOpt produced mixed precision model during convert to GGUF (#22247) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Handle ModelOpt produced mixed precision model during convert to GGUF * Apply suggestions from code review Co-authored-by: Sigbjørn Skjæret * Apply suggestions from code review Co-authored-by: Sigbjørn Skjæret --------- Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 090686b15..93d5509e6 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -746,7 +746,12 @@ class ModelBase: if (not quant_algo or not quant_layers) and quant_config_file.is_file(): with open(quant_config_file, "r", encoding="utf-8") as f: - quant_config = json.load(f).get("quantization") or {} + hf_quant_config = json.load(f) + quant_config = hf_quant_config.get("quantization") or {} + producer = hf_quant_config.get("producer") or {} + producer_name = (producer.get("name") or "").lower() + if quant_method is None: + self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name quant_algo = quant_config.get("quant_algo", quant_algo) quant_layers = quant_config.get("quantized_layers", quant_layers) or {} From 4ead6fd9571df75c21599f3bafabb457e16b22c4 Mon Sep 17 00:00:00 2001 From: Neo Zhang Jianyu Date: Thu, 23 Apr 2026 13:21:36 +0800 Subject: [PATCH 08/35] [SYCL] Update oneapi 2025.3.3, Seperate SYCL build, release Ubuntu 24 package. 
(#22078) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * upgrade oneAPI to 2025.3.3 * update * seperate SYCL CI and support release binary package for ubuntu 24 * add dependence * remove wrong copy lines * add missed line * remove other task to test the release for SYCL * rm more for test release * fix file name * correct the error in running * support build for fp32/fp16 * rm ubuntu-24-sycl-fp16 for duplicated * refactor build setting * update guide for ubuntu 24 release package, restore the release.yml for other backend * user docker replace to install oneAPI * use download installation package to replace docker * use wget to download and install oneapi, replace the apt cmd * enable ccache for oneAPI installation * fix format error * enable cache for oneAPI installation * update guide * Update .github/workflows/release.yml Co-authored-by: Sigbjørn Skjæret * Update .github/workflows/release.yml Co-authored-by: Sigbjørn Skjæret * Update .github/workflows/build-sycl.yml Co-authored-by: Sigbjørn Skjæret * Update .github/workflows/release.yml Co-authored-by: Sigbjørn Skjæret --------- Co-authored-by: Sigbjørn Skjæret --- .devops/intel.Dockerfile | 2 +- .github/workflows/build-sycl.yml | 142 +++++++++++++++++++++++++++++++ .github/workflows/build.yml | 133 ----------------------------- .github/workflows/release.yml | 99 +++++++++++++++++++-- docs/backend/SYCL.md | 10 +++ 5 files changed, 247 insertions(+), 139 deletions(-) create mode 100644 .github/workflows/build-sycl.yml diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile index 955a2962f..8e830d462 100644 --- a/.devops/intel.Dockerfile +++ b/.devops/intel.Dockerfile @@ -1,4 +1,4 @@ -ARG ONEAPI_VERSION=2025.3.2-0-devel-ubuntu24.04 +ARG ONEAPI_VERSION=2025.3.3-0-devel-ubuntu24.04 ## Build Image diff --git a/.github/workflows/build-sycl.yml b/.github/workflows/build-sycl.yml new file mode 100644 index 000000000..2a6642292 --- /dev/null +++ b/.github/workflows/build-sycl.yml @@ -0,0 +1,142 @@ +name: CI (sycl) + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + paths: [ + '.github/workflows/build-sycl.yml', + '**/CMakeLists.txt', + '**/.cmake', + '**/*.h', + '**/*.hpp', + '**/*.c', + '**/*.cpp' + ] + + pull_request: + types: [opened, synchronize, reopened] + paths: [ + '.github/workflows/build-sycl.yml', + 'ggml/src/ggml-sycl/**' + ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +env: + GGML_NLOOP: 3 + GGML_N_THREADS: 1 + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 + +jobs: + + ubuntu-24-sycl: + strategy: + matrix: + build: [fp32, fp16] + include: + - build: fp32 + fp16: OFF + - build: fp16 + fp16: ON + + runs-on: ubuntu-24.04 + + env: + ONEAPI_ROOT: /opt/intel/oneapi/ + ONEAPI_INSTALLER_VERSION: "2025.3.3" + + continue-on-error: true + + steps: + - uses: actions/checkout@v6 + + - name: Use oneAPI Installation Cache + uses: actions/cache@v5 + id: cache-sycl + with: + path: ${{ env.ONEAPI_ROOT }} + key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }} + + - name: Download & Install oneAPI + shell: bash + if: steps.cache-sycl.outputs.cache-hit != 'true' + run: | + cd /tmp + wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh + sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent 
--eula accept + + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: ubuntu-24-sycl-${{ matrix.build }} + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Build + id: cmake_build + run: | + source /opt/intel/oneapi/setvars.sh + cmake -B build \ + -G "Ninja" \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_SYCL=ON \ + -DCMAKE_C_COMPILER=icx \ + -DCMAKE_CXX_COMPILER=icpx \ + -DLLAMA_OPENSSL=OFF \ + -DGGML_NATIVE=OFF \ + -DGGML_SYCL_F16=${{ matrix.fp16 }} + time cmake --build build --config Release -j $(nproc) + + windows-latest-sycl: + runs-on: windows-2022 + + defaults: + run: + shell: bash + + env: + WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe + WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel + ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" + ONEAPI_INSTALLER_VERSION: "2025.3.3" + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Use oneAPI Installation Cache + uses: actions/cache@v5 + id: cache-sycl + with: + path: ${{ env.ONEAPI_ROOT }} + key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }} + + - name: Download & Install oneAPI + shell: bash + if: steps.cache-sycl.outputs.cache-hit != 'true' + run: | + scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: windows-latest-sycl + variant: ccache + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args + + - name: Build + id: cmake_build + run: examples/sycl/win-build-sycl.bat diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c7f00e359..21eb4d97b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -555,106 +555,6 @@ jobs: -DGGML_MUSA=ON time cmake --build build --config Release -j $(nproc) - ubuntu-22-sycl: - runs-on: ubuntu-22.04 - - continue-on-error: true - - steps: - - uses: actions/checkout@v6 - - - name: add oneAPI to apt - shell: bash - run: | - cd /tmp - wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" - - - name: install oneAPI dpcpp compiler - shell: bash - run: | - sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev - - - name: install oneAPI MKL library - shell: bash - run: | - sudo apt install intel-oneapi-mkl-devel - - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ubuntu-22-sycl - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Build - id: cmake_build - run: | - source /opt/intel/oneapi/setvars.sh - cmake -B build \ - -DGGML_SYCL=ON \ - -DCMAKE_C_COMPILER=icx \ - -DCMAKE_CXX_COMPILER=icpx - time cmake --build build --config Release -j $(nproc) - - ubuntu-22-sycl-fp16: - runs-on: ubuntu-22.04 - - continue-on-error: true - - steps: - - uses: actions/checkout@v6 - - - name: add oneAPI to apt - 
shell: bash - run: | - cd /tmp - wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB - sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main" - - - name: install oneAPI dpcpp compiler - shell: bash - run: | - sudo apt update - sudo apt install intel-oneapi-compiler-dpcpp-cpp libssl-dev ninja-build - - - name: install oneAPI MKL library - shell: bash - run: | - sudo apt install intel-oneapi-mkl-devel - - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: ubuntu-22-sycl-fp16 - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Build - id: cmake_build - run: | - source /opt/intel/oneapi/setvars.sh - cmake -B build \ - -G "Ninja" \ - -DCMAKE_BUILD_TYPE=Release \ - -DGGML_SYCL=ON \ - -DCMAKE_C_COMPILER=icx \ - -DCMAKE_CXX_COMPILER=icpx \ - -DGGML_SYCL_F16=ON - time cmake --build build --config Release -j $(nproc) windows-latest: runs-on: windows-2025 @@ -863,39 +763,6 @@ jobs: cmake --build build --config Release -j %NINJA_JOBS% -t ggml cmake --build build --config Release - windows-latest-sycl: - runs-on: windows-2022 - - defaults: - run: - shell: bash - - env: - WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe - WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel - ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" - steps: - - name: Clone - id: checkout - uses: actions/checkout@v6 - - - name: ccache - uses: ggml-org/ccache-action@v1.2.21 - with: - key: windows-latest-sycl - variant: ccache - evict-old-files: 1d - save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} - - - name: Install - run: | - scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - - # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args - - - name: Build - id: cmake_build - run: examples/sycl/win-build-sycl.bat windows-latest-hip: runs-on: windows-2022 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f1cc12cd4..89563c51c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -598,15 +598,29 @@ jobs: shell: bash env: - WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe + WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" + ONEAPI_INSTALLER_VERSION: "2025.3.3" steps: - name: Clone id: checkout uses: actions/checkout@v6 + - name: Use oneAPI Installation Cache + uses: actions/cache@v5 + id: cache-sycl + with: + path: ${{ env.ONEAPI_ROOT }} + key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }} + + - name: Download & Install oneAPI + shell: bash + if: steps.cache-sycl.outputs.cache-hit != 'true' + run: | + scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL + - name: 
ccache uses: ggml-org/ccache-action@v1.2.21 with: @@ -614,10 +628,6 @@ jobs: variant: ccache evict-old-files: 1d - - name: Install - run: | - scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - - name: Build id: cmake_build shell: cmd @@ -670,6 +680,82 @@ jobs: path: llama-bin-win-sycl-x64.zip name: llama-bin-win-sycl-x64.zip + ubuntu-24-sycl: + strategy: + matrix: + build: [fp32, fp16] + include: + - build: fp32 + fp16: OFF + - build: fp16 + fp16: ON + + runs-on: ubuntu-24.04 + + env: + ONEAPI_ROOT: /opt/intel/oneapi/ + ONEAPI_INSTALLER_VERSION: "2025.3.3" + + steps: + - uses: actions/checkout@v6 + + - name: Use oneAPI Installation Cache + uses: actions/cache@v5 + id: cache-sycl + with: + path: ${{ env.ONEAPI_ROOT }} + key: oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }} + + - name: Download & Install oneAPI + shell: bash + if: steps.cache-sycl.outputs.cache-hit != 'true' + run: | + cd /tmp + wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh + sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept + + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: ubuntu-24-sycl-${{ matrix.build }} + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Build + id: cmake_build + run: | + source /opt/intel/oneapi/setvars.sh + cmake -B build \ + -G "Ninja" \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_SYCL=ON \ + -DCMAKE_C_COMPILER=icx \ + -DCMAKE_CXX_COMPILER=icpx \ + -DLLAMA_OPENSSL=OFF \ + -DGGML_NATIVE=OFF \ + -DGGML_SYCL_F16=${{ matrix.fp16 }} + time cmake --build build --config Release -j $(nproc) + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + run: | + cp LICENSE ./build/bin/ + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin . 
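+ # note: --transform above rewrites the leading "./" of archived paths so the tarball extracts into a llama-${{ steps.tag.outputs.name }}/ directory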
+ + - name: Upload artifacts + uses: actions/upload-artifact@v6 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz + name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz + ubuntu-22-rocm: runs-on: ubuntu-22.04 @@ -1045,6 +1131,7 @@ jobs: - ubuntu-cpu - ubuntu-vulkan - ubuntu-24-openvino + - ubuntu-24-sycl - android-arm64 - macOS-cpu - ios-xcode-build @@ -1133,6 +1220,8 @@ jobs: - [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz) - [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz) - [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz) + - [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz) + - [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz) **Android:** - [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index d52c61acb..1b86b3d4a 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -31,6 +31,8 @@ SYCL cross-platform capabilities enable support for other vendor GPUs as well. ## Recommended Release +### Windows + The following releases are verified and recommended: |Commit ID|Tag|Release|Verified Platform| Update date| @@ -39,6 +41,13 @@ The following releases are verified and recommended: |3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19| |fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc A770/Linux/oneAPI 2024.1
MTL Arc GPU/Windows 11/oneAPI 2024.1|| +### Ubuntu 24.04 + +The release packages for Ubuntu 24.04 x64 (FP32/FP16) only include the binary files of the llama.cpp SYCL backend. They require the target machine to have Intel GPU drivers and oneAPI packages pre-installed, matching the version used to build the package. For the exact version and installation steps, refer to release.yml: ubuntu-24-sycl -> Download & Install oneAPI. + +It is recommended to use them with Intel Docker. + +The FP32 and FP16 packages differ in accuracy and performance on LLMs. Please choose one according to your test results. ## News @@ -229,6 +238,7 @@ Upon a successful installation, SYCL is enabled for the available intel devices, |Verified release| |-| +|2025.3.3| |2025.2.1| |2025.1| |2024.1| From 96c1db26c4441b10bb592aaac368842fc01e5617 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Apr 2026 08:22:08 +0300 Subject: [PATCH 09/35] ggml-base: use MATH_LIBRARY variable instead of hardcoded 'm' (#22239) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #22237 — the find_library(MATH_LIBRARY m) result was being discarded and the target linked against the literal 'm' string. This prevented users from overriding the math library (e.g. for AMD AOCL) via CMake variables. Now the discovered MATH_LIBRARY is used directly. --- ggml/src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 48fbe208d..52754e1b9 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -473,7 +473,7 @@ target_link_libraries(ggml-base PRIVATE Threads::Threads) find_library(MATH_LIBRARY m) if (MATH_LIBRARY) if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT}) - target_link_libraries(ggml-base PRIVATE m) + target_link_libraries(ggml-base PRIVATE ${MATH_LIBRARY}) endif() endif() From 930e0210d1ba38ff8f6e49f6c4be19833014b1e7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Apr 2026 08:22:24 +0300 Subject: [PATCH 10/35] gitignore: add AGENTS.local.md (#22246) * gitignore: add AGENTS.local Assisted-by: llama.cpp:local pi Signed-off-by: Georgi Gerganov * gitignore: rename AGENTS.local to AGENTS.local.md Assisted-by: llama.cpp:local pi Signed-off-by: Georgi Gerganov --------- Signed-off-by: Georgi Gerganov --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 15dc4014f..6136524d7 100644 --- a/.gitignore +++ b/.gitignore @@ -145,3 +145,5 @@ poetry.toml /.windsurf/ # emscripten a.out.* + +AGENTS.local.md From 8635e221c8e074f8dbf5b7014ae0b6ccc7803812 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Apr 2026 08:22:49 +0300 Subject: [PATCH 11/35] metal : fix event synchronization (#22260) --- ggml/src/ggml-metal/ggml-metal-device.m | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index 27cb16835..f17f7e2e0 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -931,13 +931,13 @@ void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) { } struct ggml_metal_event { - void * obj; // id<MTLEvent> + void * obj; // id<MTLSharedEvent> atomic_int value; }; void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) { - id<MTLEvent> event = (id<MTLEvent>)ev->obj; + id<MTLSharedEvent> event = (id<MTLSharedEvent>)ev->obj; id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw; @@ -945,7 +945,7 @@ void
ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t } void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) { - id<MTLEvent> event = (id<MTLEvent>)ev->obj; + id<MTLSharedEvent> event = (id<MTLSharedEvent>)ev->obj; id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw; @@ -953,7 +953,7 @@ void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cm } ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) { - id<MTLEvent> event = [dev->mtl_device newEvent]; + id<MTLSharedEvent> event = [dev->mtl_device newSharedEvent]; ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event)); @@ -964,7 +964,7 @@ ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) { } void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev) { - id<MTLEvent> event = ev->obj; + id<MTLSharedEvent> event = ev->obj; [event release]; free(ev); @@ -973,14 +973,13 @@ void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev } void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev) { - @autoreleasepool { - id<MTLEvent> event = ev->obj; - - id<MTLCommandBuffer> cmd_buf = [dev->mtl_queue commandBuffer]; - [cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)]; - [cmd_buf commit]; - [cmd_buf waitUntilCompleted]; + id<MTLSharedEvent> event = ev->obj; + const bool res = [event waitUntilSignaledValue:atomic_load_explicit(&ev->value, memory_order_relaxed) timeoutMS:60000]; + if (!res) { + GGML_ABORT("%s: failed to wait for event\n", __func__); } + + GGML_UNUSED(dev); } void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) { From 550d684bd13132d4af744cee64c1a3acb0ceaf09 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Thu, 23 Apr 2026 10:47:26 +0200 Subject: [PATCH 12/35] server: Enable transcriptions API for LFM2-Audio (#22000) --- common/chat.cpp | 26 +++++++++++++++++++++----- common/chat.h | 8 ++++++++ tools/server/server-chat.cpp | 21 ++++++++++++--------- tools/server/server-chat.h | 1 + tools/server/server-context.cpp | 1 + 5 files changed, 43 insertions(+), 14 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index 7c071560f..159d625de 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -544,6 +544,26 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp return tmpls->has_explicit_template; } +// LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list +// and <|tool_call_start|>[...]<|tool_call_end|> around each tool call +static bool is_lfm2_template(const std::string & src) { + return src.find("<|tool_list_start|>") != std::string::npos && + src.find("<|tool_list_end|>") != std::string::npos; +} + +common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates) { + common_chat_prompt_preset asr_preset; + asr_preset.system = ""; + asr_preset.user = "Transcribe audio to text"; + + if (chat_templates && chat_templates->template_default && is_lfm2_template(chat_templates->template_default->source())) { + asr_preset.system = "Perform ASR."; + asr_preset.user = ""; + } + + return asr_preset; +} + std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) { if (!variant.empty()) { if (variant == "tool_use") { @@ -2053,10 +2073,7 @@ std::optional common_chat_try_specialized_template( return common_chat_params_init_kimi_k2(tmpl, params); } - // LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list - // and
<|tool_call_start|>[...]<|tool_call_end|> around each tool call - if (src.find("<|tool_list_start|>") != std::string::npos && - src.find("<|tool_list_end|>") != std::string::npos) { + if (is_lfm2_template(src)) { LOG_DBG("Using specialized template: LFM2\n"); return common_chat_params_init_lfm2(tmpl, params); } @@ -2365,4 +2382,3 @@ std::map common_chat_templates_get_caps(const common_chat_tem GGML_ASSERT(chat_templates->template_default != nullptr); return chat_templates->template_default->caps.to_map(); } - diff --git a/common/chat.h b/common/chat.h index 9122f2967..01a47b383 100644 --- a/common/chat.h +++ b/common/chat.h @@ -274,3 +274,11 @@ std::optional common_chat_try_specialized_template( const common_chat_template & tmpl, const std::string & src, autoparser::generation_params & params); + +// specialized per-task preset +struct common_chat_prompt_preset { + std::string system; + std::string user; +}; + +common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates); diff --git a/tools/server/server-chat.cpp b/tools/server/server-chat.cpp index 4fe81553c..ef586d1e1 100644 --- a/tools/server/server-chat.cpp +++ b/tools/server/server-chat.cpp @@ -535,6 +535,7 @@ json server_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) { json convert_transcriptions_to_chatcmpl( const json & inp_body, + const common_chat_templates * tmpls, const std::map & in_files, std::vector & out_files) { // TODO @ngxson : this function may need to be improved in the future @@ -548,27 +549,29 @@ json convert_transcriptions_to_chatcmpl( } // handle input data - std::string prompt = json_value(inp_body, "prompt", std::string()); - std::string language = json_value(inp_body, "language", std::string()); + std::string prompt = json_value(inp_body, "prompt", std::string()); + std::string language = json_value(inp_body, "language", std::string()); std::string response_format = json_value(inp_body, "response_format", std::string("json")); if (response_format != "json") { throw std::invalid_argument("Only 'json' response_format is supported for transcription"); } + const common_chat_prompt_preset preset = common_chat_get_asr_prompt(tmpls); if (prompt.empty()) { - prompt = "Transcribe audio to text"; + prompt = preset.user; } if (!language.empty()) { prompt += string_format(" (language: %s)", language.c_str()); } prompt += get_media_marker(); + json messages = json::array(); + if (!preset.system.empty()) { + messages.push_back({{"role", "system"}, {"content", preset.system}}); + } + messages.push_back({{"role", "user"}, {"content", prompt}}); + json chatcmpl_body = inp_body; // copy all fields - chatcmpl_body["messages"] = json::array({ - { - {"role", "user"}, - {"content", prompt}, - }, - }); + chatcmpl_body["messages"] = messages; // because input from form-data, everything is string, we need to correct the types here std::string stream = json_value(inp_body, "stream", std::string("false")); diff --git a/tools/server/server-chat.h b/tools/server/server-chat.h index ecb8907c4..5c5b792cf 100644 --- a/tools/server/server-chat.h +++ b/tools/server/server-chat.h @@ -18,6 +18,7 @@ json server_chat_convert_anthropic_to_oai(const json & body); // convert OpenAI transcriptions API format to OpenAI Chat Completions API format json convert_transcriptions_to_chatcmpl( const json & body, + const common_chat_templates * tmpls, const std::map & in_files, std::vector & out_files); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 
b8c05cd80..67a92755b 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -3807,6 +3807,7 @@ void server_routes::init_routes() { std::vector files; json body = convert_transcriptions_to_chatcmpl( json::parse(req.body), + meta->chat_params.tmpls.get(), req.files, files); SRV_DBG("%s\n", "Request converted: OpenAI Transcriptions -> OpenAI Chat Completions"); From 0dd7f915fd072fa3779a5d94436cdbec5c74cf22 Mon Sep 17 00:00:00 2001 From: Matthias Straka <59084281+matthiasstraka@users.noreply.github.com> Date: Thu, 23 Apr 2026 15:03:28 +0200 Subject: [PATCH 13/35] cli : cleanup auto-completion code (#21745) --- common/common.h | 5 +++++ tools/cli/cli.cpp | 28 ++++++++++++++-------------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/common/common.h b/common/common.h index 9a8218433..d2d3c1061 100644 --- a/common/common.h +++ b/common/common.h @@ -746,6 +746,11 @@ inline bool string_starts_with(std::string_view str, std::string_view prefix) { str.compare(0, prefix.size(), prefix) == 0; } +// remove when moving to c++20 +inline bool string_starts_with(std::string_view str, char prefix) { + return !str.empty() && str.front() == prefix; +} + // remove when moving to c++20 inline bool string_ends_with(std::string_view str, std::string_view suffix) { return str.size() >= suffix.size() && diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 5136e52a7..cd635a624 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -228,7 +228,7 @@ struct cli_context { }; // TODO?: Make this reusable, enums, docs -static const std::array cmds = { +static const std::array cmds = { "/audio ", "/clear", "/exit", @@ -242,19 +242,19 @@ static std::vector> auto_completion_callback(std: std::vector> matches; std::string cmd; - if (line.length() > 1 && line[0] == '/' && !std::any_of(cmds.begin(), cmds.end(), [line](const std::string & prefix) { + if (line.length() > 1 && line.front() == '/' && !std::any_of(cmds.begin(), cmds.end(), [line](std::string_view prefix) { return string_starts_with(line, prefix); })) { auto it = cmds.begin(); - while ((it = std::find_if(it, cmds.end(), [line](const std::string & cmd_line) { + while ((it = std::find_if(it, cmds.end(), [line](std::string_view cmd_line) { return string_starts_with(cmd_line, line); })) != cmds.end()) { - matches.emplace_back(*it, (*it).length()); + matches.emplace_back(*it, it->length()); ++it; } } else { - auto it = std::find_if(cmds.begin(), cmds.end(), [line](const std::string & prefix) { + auto it = std::find_if(cmds.begin(), cmds.end(), [line](std::string_view prefix) { return prefix.back() == ' ' && string_starts_with(line, prefix); }); @@ -271,18 +271,18 @@ static std::vector> auto_completion_callback(std: std::string expanded_prefix = path_prefix; #if !defined(_WIN32) - if (string_starts_with(path_prefix, "~")) { + if (string_starts_with(path_prefix, '~')) { const char * home = std::getenv("HOME"); if (home && home[0]) { - expanded_prefix = std::string(home) + path_prefix.substr(1); + expanded_prefix = home + path_prefix.substr(1); } } - if (string_starts_with(expanded_prefix, "/")) { + if (string_starts_with(expanded_prefix, '/')) { #else if (std::isalpha(expanded_prefix[0]) && expanded_prefix.find(':') == 1) { #endif cur_dir = std::filesystem::path(expanded_prefix).parent_path(); - cur_dir_str = ""; + cur_dir_str.clear(); } else if (!path_prefix.empty()) { cur_dir /= std::filesystem::path(path_prefix).parent_path(); } @@ -305,7 +305,7 @@ static std::vector> auto_completion_callback(std: } if 
(expanded_prefix.empty() || string_starts_with(path_entry, expanded_prefix)) { - std::string updated_line = cmd + path_entry; + const std::string updated_line = cmd + path_entry; matches.emplace_back(updated_line + path_postfix, updated_line.length()); @@ -315,7 +315,7 @@ static std::vector> auto_completion_callback(std: } if (matches.empty()) { - std::string updated_line = cmd + path_prefix; + const std::string updated_line = cmd + path_prefix; matches.emplace_back(updated_line + path_postfix, updated_line.length()); @@ -332,7 +332,7 @@ static std::vector> auto_completion_callback(std: len = std::min(len, static_cast<size_t>(cmp.first - match0.begin())); } - std::string updated_line = std::string(match0.substr(0, len)); + const std::string updated_line = std::string(match0.substr(0, len)); matches.emplace_back(updated_line + path_postfix, updated_line.length()); @@ -569,10 +569,10 @@ int main(int argc, char ** argv) { if (endpath != std::string::npos) { std::string rel_pattern = pattern.substr(0, endpath); #if !defined(_WIN32) - if (string_starts_with(rel_pattern, "~")) { + if (string_starts_with(rel_pattern, '~')) { const char * home = std::getenv("HOME"); if (home && home[0]) { - rel_pattern = std::string(home) + rel_pattern.substr(1); + rel_pattern = home + rel_pattern.substr(1); } } #endif From 9012c50fc846bcbca12f97b7bcaeb8e68be8fc19 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 23 Apr 2026 15:07:38 +0200 Subject: [PATCH 14/35] model-conversion : fix mmproj output file name [no ci] (#22274) * model-conversion : fix mmproj output file name [no ci] This commit updates the convert-model.sh script to properly handle mmproj output files. The motivation for this is that currently the same name as the original model is used for the mmproj file, which causes the original model to be overwritten and no mmproj-.gguf to be created.
* model-conversion : use MODEL_NAME [no ci] --- .../model-conversion/scripts/causal/convert-model.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/model-conversion/scripts/causal/convert-model.sh b/examples/model-conversion/scripts/causal/convert-model.sh index a5865f6ac..4aa722062 100755 --- a/examples/model-conversion/scripts/causal/convert-model.sh +++ b/examples/model-conversion/scripts/causal/convert-model.sh @@ -25,7 +25,11 @@ MODEL_NAME="${MODEL_NAME:-$(basename "$MODEL_PATH")}" OUTPUT_DIR="${OUTPUT_DIR:-../../models}" TYPE="${OUTTYPE:-f16}" METADATA_OVERRIDE="${METADATA_OVERRIDE:-}" -CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf" +if [[ -n "$MMPROJ" ]]; then + CONVERTED_MODEL="${OUTPUT_DIR}/mmproj-${MODEL_NAME}.gguf" +else + CONVERTED_MODEL="${OUTPUT_DIR}/${MODEL_NAME}.gguf" +fi echo "Model path: ${MODEL_PATH}" echo "Model name: ${MODEL_NAME}" @@ -38,6 +42,7 @@ if [[ -n "$DEBUG" ]]; then else CMD_ARGS=("python") fi + CMD_ARGS+=("../../convert_hf_to_gguf.py" "--verbose") CMD_ARGS+=("${MODEL_PATH}") CMD_ARGS+=("--outfile" "${CONVERTED_MODEL}") @@ -50,7 +55,3 @@ CMD_ARGS+=("--outtype" "${TYPE}") echo "" echo "The environment variable CONVERTED_MODEL can be set to this path using:" echo "export CONVERTED_MODEL=$(realpath ${CONVERTED_MODEL})" -if [[ -n "$MMPROJ" ]]; then - mmproj_file="${OUTPUT_DIR}/mmproj-$(basename "${CONVERTED_MODEL}")" - echo "The mmproj model was created in $(realpath "$mmproj_file")" -fi From 0949beb5a3fc02adf0558d0f6736f1011c10891c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Thu, 23 Apr 2026 15:38:58 +0200 Subject: [PATCH 15/35] fix build number for sycl release (#22283) --- .github/workflows/release.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 89563c51c..924f6cd3f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -697,7 +697,11 @@ jobs: ONEAPI_INSTALLER_VERSION: "2025.3.3" steps: - - uses: actions/checkout@v6 + - name: Clone + id: checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 - name: Use oneAPI Installation Cache uses: actions/cache@v5 @@ -714,10 +718,6 @@ jobs: wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept - - name: Clone - id: checkout - uses: actions/checkout@v6 - - name: ccache uses: ggml-org/ccache-action@v1.2.21 with: From c807c6e3b0c74b77ad4c7a8213a1d5690d34e462 Mon Sep 17 00:00:00 2001 From: kvc0 <3454741+kvc0@users.noreply.github.com> Date: Thu, 23 Apr 2026 08:45:02 -0700 Subject: [PATCH 16/35] server: (anthropic API) fix prefix caching (#21793) When testing claude code against llama.cpp, I noticed that only n_past 18577 was used even when context was 60k or more. The log in llama-server says: ``` slot update_slots: id 3 | task 10342 | old: ... ; cch= | defa0;You are slot update_slots: id 3 | task 10342 | new: ... ; cch= | 1c8b4; ``` I observed that the cch value changed every time. Reading up on it, the x-anthropic-billing-header system message seems to be specially handled inside the anthropic api. I could remove it, but there is a meaningful string sometimes included at the end. So instead, I just replace the changing cch checksum with fffff.
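For example, a system prompt that starts with

```
x-anthropic-billing-header: cc_version=2.1.101.e51; cc_entrypoint=cli; cch=a5145;You are Claude Code, Anthropic's official CLI for Claude.
```

is normalized to

```
x-anthropic-billing-header: cc_version=2.1.101.e51; cc_entrypoint=cli; cch=fffff;You are Claude Code, Anthropic's official CLI for Claude.
```

so the cached prefix stays byte-identical across requests.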
I'm treating this as an anthropic message body API detail - I think this is the right way to do this, but by all means please correct me! It's always 5 hexadecimal characters, but I've written the replacement defensively in case they change the protocol. --- tools/server/server-chat.cpp | 41 +++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/tools/server/server-chat.cpp b/tools/server/server-chat.cpp index ef586d1e1..34ec7982b 100644 --- a/tools/server/server-chat.cpp +++ b/tools/server/server-chat.cpp @@ -281,6 +281,42 @@ json server_chat_convert_responses_to_chatcmpl(const json & response_body) { return chatcmpl_body; } +// Edits the cch section of an "x-anthropic-billing-header" system prompt. +// Does nothing to any other prompt. +// +// This is a claude message with a "cch=ef01a" attribute that breaks prefix caching. +// The cch stamp is a whitebox end-to-end integrity hint. It's not meaningful as +// system prompt data, particularly to llama.cpp, but its presence means the prefix +// cache will not get past it: it changes on each request. +// +// Reference: https://github.com/ggml-org/llama.cpp/pull/21793 +// Example header: +// ``` +// x-anthropic-billing-header: cc_version=2.1.101.e51; cc_entrypoint=cli; cch=a5145;You are Claude Code, Anthropic's official CLI for Claude. +// ^^^^^ +// ``` +static void normalize_anthropic_billing_header(std::string & system_text) { + if (system_text.rfind("x-anthropic-billing-header:", 0) != 0) { + return; + } + + const size_t header_prefix_length = strlen("x-anthropic-billing-header:"); + const size_t cch_length = 5; + const size_t index_cch = system_text.find("cch=", header_prefix_length); + if (index_cch == std::string::npos) { + return; + } + + const size_t index_replace = index_cch + 4; + if (index_replace + cch_length < system_text.length() && system_text[index_replace + cch_length] == ';') { + for (size_t i = 0; i < cch_length; ++i) { + system_text[index_replace + i] = 'f'; + } + } else { + LOG_ERR("anthropic string not as expected: %s\n", system_text.c_str()); + } +} json server_chat_convert_anthropic_to_oai(const json & body) { json oai_body; @@ -292,10 +328,13 @@ json server_chat_convert_anthropic_to_oai(const json & body) { if (system_param.is_string()) { system_content = system_param.get<std::string>(); + normalize_anthropic_billing_header(system_content); } else if (system_param.is_array()) { for (const auto & block : system_param) { if (json_value(block, "type", std::string()) == "text") { - system_content += json_value(block, "text", std::string()); + auto system_text = json_value(block, "text", std::string()); + normalize_anthropic_billing_header(system_text); + system_content += system_text; } } } From 12568ca8c8176785f5da005a5be17064c72c5536 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Thu, 23 Apr 2026 17:45:56 +0200 Subject: [PATCH 17/35] vendor : update LibreSSL to 4.3.1 (#22285) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- vendor/cpp-httplib/CMakeLists.txt | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/vendor/cpp-httplib/CMakeLists.txt b/vendor/cpp-httplib/CMakeLists.txt index 28485a0ce..df4b9ecce 100644 --- a/vendor/cpp-httplib/CMakeLists.txt +++ b/vendor/cpp-httplib/CMakeLists.txt @@ -81,7 +81,7 @@ if (LLAMA_BUILD_BORINGSSL) target_link_libraries(${TARGET} PUBLIC ssl crypto) elseif (LLAMA_BUILD_LIBRESSL) - set(LIBRESSL_VERSION "4.2.1" CACHE STRING
"LibreSSL version") + set(LIBRESSL_VERSION "4.3.1" CACHE STRING "LibreSSL version") message(STATUS "Fetching LibreSSL version ${LIBRESSL_VERSION}") @@ -161,12 +161,24 @@ if(LLAMA_BUILD_BORINGSSL OR LLAMA_BUILD_LIBRESSL) if(LLAMA_BUILD_BORINGSSL) target_compile_options(fipsmodule PRIVATE /w) endif() + if(LLAMA_BUILD_LIBRESSL) + target_compile_options(ssl_obj PRIVATE /w) + target_compile_options(bs_obj PRIVATE /w) + target_compile_options(compat_obj PRIVATE /w) + target_compile_options(crypto_obj PRIVATE /w) + endif() else() target_compile_options(ssl PRIVATE -w) target_compile_options(crypto PRIVATE -w) if(LLAMA_BUILD_BORINGSSL) target_compile_options(fipsmodule PRIVATE -w) endif() + if(LLAMA_BUILD_LIBRESSL) + target_compile_options(ssl_obj PRIVATE -w) + target_compile_options(bs_obj PRIVATE -w) + target_compile_options(compat_obj PRIVATE -w) + target_compile_options(crypto_obj PRIVATE -w) + endif() endif() endif() From c78fb909b23758f5e418cf98a69bc8a0ef142fb8 Mon Sep 17 00:00:00 2001 From: Song Li Date: Thu, 23 Apr 2026 12:39:07 -0400 Subject: [PATCH 18/35] server: fix heap-buffer-overflow from negative n_discard (CVE-2026-21869) (#22267) * server: clamp n_discard to non-negative at JSON parse boundary (CVE-2026-21869) A negative n_discard from client JSON causes heap-buffer-overflow in update_slots() context-shift loop (CWE-787, CVSS 8.8). Clamp to 0 at ingress; n_discard=0 already triggers auto-discard (n_left/2). Ref: GHSA-8947-pfff-2f3c * cont : cleaner * cont : cleanerer * cont : cleanest --------- Co-authored-by: Georgi Gerganov --- tools/server/server-task.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 9380792c0..4c341d7c5 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -270,6 +270,7 @@ task_params server_task::params_from_json_cmpl( params.n_indent = json_value(data, "n_indent", defaults.n_indent); params.n_keep = json_value(data, "n_keep", defaults.n_keep); params.n_discard = json_value(data, "n_discard", defaults.n_discard); + params.n_discard = std::max(0, params.n_discard); params.n_cmpl = json_value(data, "n_cmpl", json_value(data, "n", 1)); params.n_cache_reuse = json_value(data, "n_cache_reuse", defaults.n_cache_reuse); //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement From 185cbff6f1b460e2ac193fbe7c55dd6c17ff0b1b Mon Sep 17 00:00:00 2001 From: srkizer Date: Fri, 24 Apr 2026 03:32:46 +0900 Subject: [PATCH 19/35] server : convert_anthropic_to_oai: also copy chat_template_kwargs (#22154) --- tools/server/server-chat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/server-chat.cpp b/tools/server/server-chat.cpp index 34ec7982b..a15583469 100644 --- a/tools/server/server-chat.cpp +++ b/tools/server/server-chat.cpp @@ -514,7 +514,7 @@ json server_chat_convert_anthropic_to_oai(const json & body) { } // Pass through common params - for (const auto & key : {"temperature", "top_p", "top_k", "stream"}) { + for (const auto & key : {"temperature", "top_p", "top_k", "stream", "chat_template_kwargs"}) { if (body.contains(key)) { oai_body[key] = body.at(key); } From 187a45637054881ecacf17f8e2f6f8f2ba7df1c7 Mon Sep 17 00:00:00 2001 From: Shreya Jain Date: Thu, 23 Apr 2026 13:08:10 -0700 Subject: [PATCH 20/35] Enable testing on Snapdragon devices (#21051) * Add the tests that we want to run on external CI * remove extra files * Fixes python issues, reove the deadlock on CI * remove unecessary 
changes * use override to ty.toml * fix pre-commit and try tests with secret in external repo not upstream * skip if key is unavailable * Fix feedback * switch hexagon to snapdragon * cleanup * fix secrets * remove the copyrights at the top of the files --- .../workflows/build-and-test-snapdragon.yml | 113 +++++ .github/workflows/build-android.yml | 49 +-- scripts/snapdragon/qdc/readme.md | 1 - scripts/snapdragon/qdc/requirements.txt | 3 - scripts/snapdragon/qdc/run_qdc_jobs.py | 401 ++++++++++++++++++ scripts/snapdragon/qdc/tests/conftest.py | 20 + .../qdc/tests/run_backend_ops_posix.py | 41 ++ .../qdc/tests/run_bench_tests_posix.py | 76 ++++ scripts/snapdragon/qdc/tests/test_bench.py | 63 --- scripts/snapdragon/qdc/tests/utils.py | 93 ++++ ty.toml | 3 +- 11 files changed, 764 insertions(+), 99 deletions(-) create mode 100644 .github/workflows/build-and-test-snapdragon.yml delete mode 100644 scripts/snapdragon/qdc/readme.md create mode 100644 scripts/snapdragon/qdc/run_qdc_jobs.py create mode 100644 scripts/snapdragon/qdc/tests/conftest.py create mode 100644 scripts/snapdragon/qdc/tests/run_backend_ops_posix.py create mode 100644 scripts/snapdragon/qdc/tests/run_bench_tests_posix.py delete mode 100644 scripts/snapdragon/qdc/tests/test_bench.py create mode 100644 scripts/snapdragon/qdc/tests/utils.py diff --git a/.github/workflows/build-and-test-snapdragon.yml b/.github/workflows/build-and-test-snapdragon.yml new file mode 100644 index 000000000..7eb204ea2 --- /dev/null +++ b/.github/workflows/build-and-test-snapdragon.yml @@ -0,0 +1,113 @@ +name: CI (snapdragon) + +on: + workflow_dispatch: + push: + branches: + - master + paths: + - '.github/workflows/build-and-test-snapdragon.yml' + - 'ggml/include/ggml-hexagon.h' + - 'ggml/src/ggml-hexagon/**' + - 'docs/backend/snapdragon/**' + - 'scripts/snapdragon/**' + - 'CMakePresets.json' + + pull_request: + types: [opened, synchronize, reopened] + paths: + - '.github/workflows/build-and-test-snapdragon.yml' + - 'ggml/include/ggml-hexagon.h' + - 'ggml/src/ggml-hexagon/**' + - 'docs/backend/snapdragon/**' + - 'scripts/snapdragon/**' + - 'CMakePresets.json' + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + +jobs: + android-ndk-snapdragon: + runs-on: ubuntu-latest + container: + image: 'ghcr.io/snapdragon-toolchain/arm64-android:v0.3' + defaults: + run: + shell: bash + + steps: + - name: Clone + uses: actions/checkout@v6 + with: + fetch-depth: 0 + lfs: false + + - name: Build Llama.CPP for Snapdragon Android + id: build_llama_cpp_snapdragon_android + run: | + cp docs/backend/snapdragon/CMakeUserPresets.json . 
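+ # the copied CMakeUserPresets.json provides the arm64-android-snapdragon-release preset used below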
+ cmake --preset arm64-android-snapdragon-release -B build + cmake --build build + cmake --install build --prefix pkg-adb/llama.cpp + + - name: Upload Llama.CPP Snapdragon Android Build Artifact + if: ${{ always() && steps.build_llama_cpp_snapdragon_android.outcome == 'success' }} + uses: actions/upload-artifact@v6 + with: + name: llama-cpp-android-arm64-snapdragon + path: pkg-adb/llama.cpp + + check-secret: + runs-on: ubuntu-latest + outputs: + has-key: ${{ steps.check.outputs.has-key }} + steps: + - id: check + run: echo "has-key=${{ secrets.QDC_API_KEY != '' }}" >> "$GITHUB_OUTPUT" + + test-snapdragon-qdc: + name: Test on QDC Android Device (${{ matrix.device }}) + needs: [android-ndk-snapdragon, check-secret] + if: needs.check-secret.outputs.has-key == 'true' + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + device: [SM8750, SM8650, SM8850] + + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Download build artifact + uses: actions/download-artifact@v4 + with: + name: llama-cpp-android-arm64-snapdragon + path: pkg-snapdragon/ + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + cache: pip + + - name: Install QDC SDK wheel + run: | + curl -fSL -o qdc_sdk.zip https://softwarecenter.qualcomm.com/api/download/software/tools/Qualcomm_Device_Cloud_SDK/All/0.2.3/qualcomm_device_cloud_sdk-0.2.3.zip + unzip qdc_sdk.zip -d qdc_sdk + pip install qdc_sdk/qualcomm_device_cloud_sdk-0.2.3-py3-none-any.whl + + - name: Run QDC tests (${{ matrix.device }}) + run: | + python scripts/snapdragon/qdc/run_qdc_jobs.py \ + --test all \ + --pkg-dir pkg-snapdragon/llama.cpp \ + --model-url "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf" \ + --device ${{ matrix.device }} + env: + QDC_API_KEY: ${{ secrets.QDC_API_KEY }} + + - name: Cleanup + if: always() + run: rm -rf pkg-snapdragon qdc_sdk qdc_sdk.zip diff --git a/.github/workflows/build-android.yml b/.github/workflows/build-android.yml index b38a793f1..5d88305a4 100644 --- a/.github/workflows/build-android.yml +++ b/.github/workflows/build-android.yml @@ -1,26 +1,24 @@ name: CI (android) on: - workflow_dispatch: # allows manual triggering + workflow_dispatch: push: branches: - master - paths: [ - '.github/workflows/build-android.yml', - '**/CMakeLists.txt', - '**/.cmake', - '**/*.h', - '**/*.hpp', - '**/*.c', - '**/*.cpp' - ] + paths: + - '.github/workflows/build-android.yml' + - '**/CMakeLists.txt' + - '**/.cmake' + - '**/*.h' + - '**/*.hpp' + - '**/*.c' + - '**/*.cpp' pull_request: types: [opened, synchronize, reopened] - paths: [ - '.github/workflows/build-android.yml', - 'examples/llama.android/**' - ] + paths: + - '.github/workflows/build-android.yml' + - 'examples/llama.android/**' concurrency: group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} @@ -67,35 +65,24 @@ jobs: defaults: run: shell: bash - strategy: - matrix: - include: - - build: 'arm64-cpu' - defines: '-D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF' - - build: 'arm64-snapdragon' - defines: '--preset arm64-android-snapdragon-release' steps: - name: Clone - id: checkout uses: actions/checkout@v6 with: fetch-depth: 0 lfs: false - - name: Build Llama.CPP for Hexagon Android - id: build_llama_cpp_hexagon_android + - name: Build + id: ndk_build 
run: | - if [[ "${{ matrix.build }}" == "arm64-snapdragon" ]]; then - cp docs/backend/snapdragon/CMakeUserPresets.json . - fi - cmake ${{ matrix.defines }} -B build + cmake -D ANDROID_ABI=arm64-v8a -D ANDROID_PLATFORM=android-31 -D CMAKE_TOOLCHAIN_FILE=${ANDROID_NDK_ROOT}/build/cmake/android.toolchain.cmake -D GGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm -G Ninja -D LLAMA_OPENSSL=OFF -D GGML_OPENMP=OFF -B build cmake --build build cmake --install build --prefix pkg-adb/llama.cpp - - name: Upload Llama.CPP Hexagon Android Build Artifact - if: ${{ always() && steps.build_llama_cpp_hexagon_android.outcome == 'success' }} + - name: Upload Android Build Artifact + if: ${{ always() && steps.ndk_build.outcome == 'success' }} uses: actions/upload-artifact@v6 with: - name: llama-cpp-android-${{ matrix.build }} + name: llama-cpp-android-arm64-cpu path: pkg-adb/llama.cpp diff --git a/scripts/snapdragon/qdc/readme.md b/scripts/snapdragon/qdc/readme.md deleted file mode 100644 index b92cf243a..000000000 --- a/scripts/snapdragon/qdc/readme.md +++ /dev/null @@ -1 +0,0 @@ -This directory includes pytest based scripts for running CI jobs on Qualcomm Device Cloud (QDC). diff --git a/scripts/snapdragon/qdc/requirements.txt b/scripts/snapdragon/qdc/requirements.txt index f04bd682e..5e0f85917 100644 --- a/scripts/snapdragon/qdc/requirements.txt +++ b/scripts/snapdragon/qdc/requirements.txt @@ -8,12 +8,9 @@ iniconfig==2.1.0 outcome==1.3.0.post0 packaging==25.0 pluggy==1.6.0 -Pygments==2.19.2 PySocks==1.7.1 pytest==8.4.2 -pytest-dependency==0.6.0 selenium==4.36.0 -setuptools==80.9.0 sniffio==1.3.1 sortedcontainers==2.4.0 tomli==2.3.0 diff --git a/scripts/snapdragon/qdc/run_qdc_jobs.py b/scripts/snapdragon/qdc/run_qdc_jobs.py new file mode 100644 index 000000000..b4eede3d0 --- /dev/null +++ b/scripts/snapdragon/qdc/run_qdc_jobs.py @@ -0,0 +1,401 @@ +"""Run llama.cpp Hexagon Android tests in a single QDC Appium job. + +Bundles test scripts into one artifact and submits a single QDC job: + + 1. run_bench_tests_posix.py — llama-cli and llama-bench on CPU / GPU / NPU + (from scripts/snapdragon/qdc/) + +Results are written to $GITHUB_STEP_SUMMARY when set (GitHub Actions). 
+ +Prerequisites: + pip install /path/to/qualcomm_device_cloud_sdk*.whl + +Required environment variables: + QDC_API_KEY API key from QDC UI -> Users -> Settings -> API Keys + +Usage: + python run_qdc_jobs.py \\ + --pkg-dir pkg-snapdragon/llama.cpp \\ + --model-url https://.../Llama-3.2-1B-Instruct-Q4_0.gguf \\ + --device SM8750 +""" + +from __future__ import annotations + +import argparse +import logging +import os +import re +import shutil +import sys +import tempfile +import time +import xml.etree.ElementTree as ET +from dataclasses import dataclass, field +from pathlib import Path + +from qualcomm_device_cloud_sdk.api import qdc_api # ty: ignore[unresolved-import] +from qualcomm_device_cloud_sdk.logging import configure_logging # ty: ignore[unresolved-import] +from qualcomm_device_cloud_sdk.models import ArtifactType, JobMode, JobState, JobSubmissionParameter, JobType, TestFramework # ty: ignore[unresolved-import] + +configure_logging(level=logging.INFO, handlers=[logging.StreamHandler()]) +log = logging.getLogger(__name__) + +POLL_INTERVAL = 30 +JOB_TIMEOUT = 3600 +LOG_UPLOAD_TIMEOUT = 600 +CAPACITY_TIMEOUT = 1800 +CAPACITY_POLL = 60 +MAX_CONCURRENT_JOBS = 5 +TERMINAL_STATES = {JobState.COMPLETED, JobState.CANCELED} +NON_TERMINAL_STATES = {JobState.DISPATCHED, JobState.RUNNING, JobState.SETUP, JobState.SUBMITTED} + +_SCRIPTS_DIR = Path(__file__).parent +_TESTS_DIR = _SCRIPTS_DIR / "tests" +_RUN_BENCH = _TESTS_DIR / "run_bench_tests_posix.py" +_RUN_BACKEND_OPS = _TESTS_DIR / "run_backend_ops_posix.py" +_UTILS = _TESTS_DIR / "utils.py" +_CONFTEST = _TESTS_DIR / "conftest.py" +_REQUIREMENTS = _SCRIPTS_DIR / "requirements.txt" + +_PYTEST_LINE_RE = re.compile( + r"(?:[\w/]+\.py::)?(?:\w+::)?([\w\[\].-]+)\s+(PASSED|FAILED|ERROR|SKIPPED)" +) +_EXCLUDED_LOGS = {"qdc_android_whole_host-000.log", "qdc_kernel_host-000.log"} +_NON_TERMINAL_STATE_VALUES = {s.value for s in NON_TERMINAL_STATES} + + +@dataclass +class JobResult: + passed: bool + tests: dict[str, bool] = field(default_factory=dict) + raw_logs: dict[str, str] = field(default_factory=dict) + failure_details: dict[str, str] = field(default_factory=dict) + + +def build_artifact_zip( + pkg_dir: Path, + stage_dir: Path, + *, + test_mode: str = "bench", + model_url: str | None = None, +) -> Path: + """Bundle everything into a single QDC artifact zip. 
+ + Zip structure (extracted by QDC to /qdc/appium/ on the runner): + llama_cpp_bundle/ installed package (adb pushed to /data/local/tmp/) + tests/ + utils.py shared helpers (paths, run_adb_command, …) + conftest.py shared pytest fixtures (driver) + test_bench_posix.py bench + cli tests (<> substituted) + AND/OR + test_backend_ops_posix.py test-backend-ops -b HTP0 + requirements.txt + """ + shutil.copytree(pkg_dir, stage_dir / "llama_cpp_bundle") + + tests_dir = stage_dir / "tests" + tests_dir.mkdir() + + shutil.copy(_UTILS, tests_dir / "utils.py") + shutil.copy(_CONFTEST, tests_dir / "conftest.py") + + if test_mode in ("bench", "all"): + assert model_url is not None, "--model-url is required for bench/all test modes" + (tests_dir / "test_bench_posix.py").write_text( + _RUN_BENCH.read_text().replace("<>", model_url) + ) + if test_mode in ("backend-ops", "all"): + shutil.copy(_RUN_BACKEND_OPS, tests_dir / "test_backend_ops_posix.py") + + shutil.copy(_REQUIREMENTS, stage_dir / "requirements.txt") + (stage_dir / "pytest.ini").write_text("[pytest]\naddopts = --junitxml=results.xml\n") + + zip_base = str(stage_dir / "artifact") + shutil.make_archive(zip_base, "zip", stage_dir) + return Path(f"{zip_base}.zip") + + +def wait_for_job(client, job_id: str, timeout: int) -> str: + elapsed = 0 + while elapsed < timeout: + raw = qdc_api.get_job_status(client, job_id) + try: + status = JobState(raw) + except ValueError: + status = raw + if status in TERMINAL_STATES: + return raw.lower() + log.info("Job %s: %s", job_id, raw) + time.sleep(POLL_INTERVAL) + elapsed += POLL_INTERVAL + raise TimeoutError(f"Job {job_id} did not finish within {timeout}s") + + +def wait_for_log_upload(client, job_id: str) -> None: + elapsed = 0 + while elapsed <= LOG_UPLOAD_TIMEOUT: + status = (qdc_api.get_job_log_upload_status(client, job_id) or "").lower() + if status in {"completed", "failed"}: + return + log.info("Waiting for log upload (status=%s) ...", status) + time.sleep(POLL_INTERVAL) + elapsed += POLL_INTERVAL + log.warning("Timed out waiting for log upload after %ds", LOG_UPLOAD_TIMEOUT) + + +def wait_for_capacity(client, max_jobs: int = MAX_CONCURRENT_JOBS) -> None: + """Block until the user's active (non-terminal) QDC job count is below max_jobs.""" + elapsed = 0 + while elapsed < CAPACITY_TIMEOUT: + jobs_page = qdc_api.get_jobs_list(client, page_number=0, page_size=50) + if jobs_page is None: + log.warning("Could not retrieve job list; proceeding without capacity check") + return + items = getattr(jobs_page, "data", []) or [] + active = sum(1 for j in items if getattr(j, "state", None) in _NON_TERMINAL_STATE_VALUES) + if active < max_jobs: + log.info("Active QDC jobs: %d / %d — proceeding", active, max_jobs) + return + log.info("Active QDC jobs: %d / %d — waiting %ds ...", active, max_jobs, CAPACITY_POLL) + time.sleep(CAPACITY_POLL) + elapsed += CAPACITY_POLL + log.warning("Capacity wait timed out after %ds; proceeding anyway", CAPACITY_TIMEOUT) + + +def _parse_junit_xml(content: str) -> tuple[dict[str, bool], dict[str, str]]: + try: + root = ET.fromstring(content) + except ET.ParseError: + return {}, {} + results: dict[str, bool] = {} + failures: dict[str, str] = {} + for tc in root.iter("testcase"): + name = tc.get("name", "") + if classname := tc.get("classname", ""): + name = f"{classname}.{name}" + failure_el = tc.find("failure") + if failure_el is None: + failure_el = tc.find("error") + results[name] = failure_el is None + if failure_el is not None: + parts = [failure_el.get("message", ""), failure_el.text or 
""] + failures[name] = "\n".join(p for p in parts if p).strip() + return results, failures + + +def _parse_pytest_output(content: str) -> dict[str, bool]: + results: dict[str, bool] = {} + for m in _PYTEST_LINE_RE.finditer(content): + results[m.group(1)] = m.group(2) == "PASSED" + return results + + +def fetch_logs_and_parse_tests( + client, job_id: str +) -> tuple[dict[str, bool], dict[str, str], dict[str, str]]: + """Returns (test_results, raw_logs, failure_details).""" + log_files = qdc_api.get_job_log_files(client, job_id) + if not log_files: + log.warning("No log files returned for job %s", job_id) + return {}, {}, {} + + test_results: dict[str, bool] = {} + pytest_fallback: dict[str, bool] = {} + raw_logs: dict[str, str] = {} + failure_details: dict[str, str] = {} + + with tempfile.TemporaryDirectory() as tmpdir: + for lf in log_files: + log.info("Downloading log file: %s", lf.filename) + zip_path = os.path.join(tmpdir, "log.zip") + qdc_api.download_job_log_files(client, lf.filename, zip_path) + try: + shutil.unpack_archive(zip_path, tmpdir, "zip") + except Exception as e: + log.warning("Could not unpack %s as zip: %s", lf.filename, e) + + for root_dir, _, files in os.walk(tmpdir): + for fname in sorted(files): + fpath = os.path.join(root_dir, fname) + content = Path(fpath).read_text(errors="replace") + if fname.endswith(".xml"): + results, failures = _parse_junit_xml(content) + test_results.update(results) + failure_details.update(failures) + elif fname.endswith(".log"): + if fname in _EXCLUDED_LOGS: + continue + log.info("--- %s ---", fname) + log.info("%s", content) + raw_logs[fname] = content + pytest_fallback.update(_parse_pytest_output(content)) + + return (test_results if test_results else pytest_fallback), raw_logs, failure_details + + +def write_summary(result: JobResult, title: str = "QDC Test Results") -> None: + summary_path = os.environ.get("GITHUB_STEP_SUMMARY") + if not summary_path: + return + + icon = "✅" if result.passed else "❌" + + lines = [ + f"## {title}\n", + f"Overall: {icon} {'PASSED' if result.passed else 'FAILED'}\n", + ] + reportable = {n: ok for n, ok in result.tests.items() if "test_install" not in n} + if reportable: + lines += ["| Test | Result |", "| ---- | ------ |"] + for name, ok in reportable.items(): + lines.append(f"| `{name}` | {'✅' if ok else '❌'} |") + passed_n = sum(1 for v in reportable.values() if v) + failed_n = sum(1 for v in reportable.values() if not v) + lines += ["", f"**{passed_n} passed, {failed_n} failed**"] + else: + lines.append("_No per-test data available._") + + failed_names = [n for n, ok in reportable.items() if not ok] + if failed_names: + lines += ["", "### Failures"] + for name in failed_names: + detail = result.failure_details.get(name) + if detail: + lines += [ + f"
{name}", + "", + "```", + detail, + "```", + "", + "
", + ] + + if result.raw_logs: + lines += ["", "### Raw Logs"] + for fname, content in sorted(result.raw_logs.items()): + lines += [ + f"
{fname}", + "", + "```", + content.rstrip(), + "```", + "", + "
", + ] + + with open(summary_path, "a") as f: + f.write("\n".join(lines) + "\n") + + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + p.add_argument("--pkg-dir", required=True, type=Path, + help="Installed llama.cpp package directory (contains bin/ and lib/)") + p.add_argument("--model-url", + help="Direct URL to the GGUF model file (required for --test bench)") + p.add_argument("--device", required=True, + help="QDC chipset name, e.g. SM8750") + p.add_argument("--test", choices=["bench", "backend-ops", "all"], default="bench", + help="Test suite to run (default: bench)") + p.add_argument("--job-timeout", type=int, default=JOB_TIMEOUT, metavar="SECONDS", + help=f"Max seconds to wait for job completion (default: {JOB_TIMEOUT})") + args = p.parse_args() + if args.test in ("bench", "all") and not args.model_url: + p.error("--model-url is required when --test bench or --test all") + return args + + +def main() -> int: + args = parse_args() + + api_key = os.environ.get("QDC_API_KEY") + if not api_key: + log.error("QDC_API_KEY environment variable must be set") + return 1 + if not args.pkg_dir.is_dir(): + log.error("--pkg-dir %s does not exist", args.pkg_dir) + return 1 + + client = qdc_api.get_public_api_client_using_api_key( + api_key_header=api_key, + app_name_header="llama-cpp-ci", + on_behalf_of_header="llama-cpp-ci", + client_type_header="Python", + ) + + target_id = qdc_api.get_target_id(client, args.device) + if target_id is None: + log.error("Could not find QDC target for device %r", args.device) + return 1 + + with tempfile.TemporaryDirectory() as tmpdir: + log.info("Building artifact ...") + zip_path = build_artifact_zip( + args.pkg_dir, Path(tmpdir), + test_mode=args.test, model_url=args.model_url, + ) + log.info("Uploading artifact (%d MB) ...", zip_path.stat().st_size // 1_000_000) + artifact_id = qdc_api.upload_file(client, str(zip_path), ArtifactType.TESTSCRIPT) + + if artifact_id is None: + log.error("Artifact upload failed") + return 1 + + wait_for_capacity(client) + + job_id = qdc_api.submit_job( + public_api_client=client, + target_id=target_id, + job_name="llama.cpp Hexagon tests", + external_job_id=None, + job_type=JobType.AUTOMATED, + job_mode=JobMode.APPLICATION, + timeout=max(1, args.job_timeout // 60), + test_framework=TestFramework.APPIUM, + entry_script=None, + job_artifacts=[artifact_id], + monkey_events=None, + monkey_session_timeout=None, + job_parameters=[JobSubmissionParameter.WIFIENABLED], + ) + if job_id is None: + log.error("Job submission failed") + return 1 + log.info("Job submitted: %s (device=%s)", job_id, args.device) + + try: + job_status = wait_for_job(client, job_id, timeout=args.job_timeout) + except TimeoutError as e: + log.error("%s", e) + write_summary(JobResult(passed=False, tests={}), title=f"QDC Job Timed Out ({args.device})") + return 1 + log.info("Job %s finished: %s", job_id, job_status) + + wait_for_log_upload(client, job_id) + tests, raw_logs, failure_details = fetch_logs_and_parse_tests(client, job_id) + + passed = job_status == JobState.COMPLETED.value.lower() + if tests: + passed = passed and all(tests.values()) + if not passed: + log.error("Job did not complete successfully or tests failed (status=%s)", job_status) + + result = JobResult(passed=passed, tests=tests, raw_logs=raw_logs, failure_details=failure_details) + if args.test == "backend-ops": + title = f"Backend Ops — HTP0 ({args.device})" + elif args.test == "all": + title 
= f"QDC Tests ({args.device})" + else: + title = f"QDC Test Results ({args.device})" + write_summary(result, title=title) + + return 0 if passed else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/snapdragon/qdc/tests/conftest.py b/scripts/snapdragon/qdc/tests/conftest.py new file mode 100644 index 000000000..0fc5b3e5f --- /dev/null +++ b/scripts/snapdragon/qdc/tests/conftest.py @@ -0,0 +1,20 @@ +"""Shared pytest fixtures for QDC on-device test runners.""" + +import os + +import pytest +from appium import webdriver + +from utils import options, write_qdc_log + + +@pytest.fixture(scope="session", autouse=True) +def driver(): + return webdriver.Remote(command_executor="http://127.0.0.1:4723/wd/hub", options=options) + + +def pytest_sessionfinish(session, exitstatus): + xml_path = getattr(session.config.option, "xmlpath", None) or "results.xml" + if os.path.exists(xml_path): + with open(xml_path) as f: + write_qdc_log("results.xml", f.read()) diff --git a/scripts/snapdragon/qdc/tests/run_backend_ops_posix.py b/scripts/snapdragon/qdc/tests/run_backend_ops_posix.py new file mode 100644 index 000000000..958fc0747 --- /dev/null +++ b/scripts/snapdragon/qdc/tests/run_backend_ops_posix.py @@ -0,0 +1,41 @@ +""" +On-device test-backend-ops runner for llama.cpp (HTP0 backend). + +Executed by QDC's Appium test framework on the QDC runner. +The runner has ADB access to the allocated device. +""" + +import os +import sys + +import pytest + +from utils import BIN_PATH, CMD_PREFIX, push_bundle_if_needed, run_adb_command, write_qdc_log + + +@pytest.fixture(scope="session", autouse=True) +def install(driver): + push_bundle_if_needed(f"{BIN_PATH}/test-backend-ops") + + +@pytest.mark.parametrize("type_a", ["mxfp4", "fp16", "q4_0"]) +def test_backend_ops_htp0(type_a): + cmd = f"{CMD_PREFIX} GGML_HEXAGON_HOSTBUF=0 GGML_HEXAGON_EXPERIMENTAL=1 {BIN_PATH}/test-backend-ops -b HTP0 -o MUL_MAT" + if type_a == "q4_0": + cmd += r' -p "^(?=.*type_a=q4_0)(?!.*type_b=f32,m=576,n=512,k=576).*$"' + else: + cmd += f" -p type_a={type_a}" + result = run_adb_command( + cmd, + check=False, + ) + write_qdc_log(f"backend_ops_{type_a}.log", result.stdout or "") + assert result.returncode == 0, f"test-backend-ops type_a={type_a} failed (exit {result.returncode})" + + +if __name__ == "__main__": + ret = pytest.main(["-s", "--junitxml=results.xml", os.path.realpath(__file__)]) + if os.path.exists("results.xml"): + with open("results.xml") as f: + write_qdc_log("results.xml", f.read()) + sys.exit(ret) diff --git a/scripts/snapdragon/qdc/tests/run_bench_tests_posix.py b/scripts/snapdragon/qdc/tests/run_bench_tests_posix.py new file mode 100644 index 000000000..44802c313 --- /dev/null +++ b/scripts/snapdragon/qdc/tests/run_bench_tests_posix.py @@ -0,0 +1,76 @@ +""" +On-device bench and completion test runner for llama.cpp (CPU, GPU, NPU backends). + +Executed by QDC's Appium test framework on the QDC runner. +The runner has ADB access to the allocated device. + +Placeholders replaced at artifact creation time by run_qdc_jobs.py: + <> Direct URL to the GGUF model file (downloaded on-device via curl) +""" + +import os +import subprocess +import sys + +import pytest + +from utils import BIN_PATH, CMD_PREFIX, push_bundle_if_needed, run_adb_command, write_qdc_log + +MODEL_PATH = "/data/local/tmp/model.gguf" +PROMPT = "What is the capital of France?" 
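+# Deterministic single-shot run: fixed seed, conversation mode disabled,
+# 128-token batches and 128 generated tokens.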
+CLI_OPTS = "--batch-size 128 -n 128 -no-cnv --seed 42" + + +@pytest.fixture(scope="session", autouse=True) +def install(driver): + push_bundle_if_needed(f"{BIN_PATH}/llama-cli") + + # Skip model download if already present + check = subprocess.run( + ["adb", "shell", f"ls {MODEL_PATH}"], + text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + ) + if check.returncode != 0: + run_adb_command(f'curl -L -J --output {MODEL_PATH} "<>"') + + +@pytest.mark.parametrize("device,extra_flags", [ + pytest.param("none", "-ctk q8_0 -ctv q8_0", id="cpu"), + pytest.param("GPUOpenCL", "", id="gpu"), + pytest.param("HTP0", "-ctk q8_0 -ctv q8_0", id="npu"), +]) +def test_llama_completion(device, extra_flags): + result = run_adb_command( + f'{CMD_PREFIX} {BIN_PATH}/llama-completion' + f' -m {MODEL_PATH} --device {device} -ngl 99 -t 4 {CLI_OPTS} {extra_flags} -fa on' + f' -p "{PROMPT}"', + check=False, + ) + write_qdc_log(f"llama_completion_{device}.log", result.stdout or "") + assert result.returncode == 0, f"llama-completion {device} failed (exit {result.returncode})" + + +_DEVICE_LOG_NAME = {"none": "cpu", "GPUOpenCL": "gpu", "HTP0": "htp"} + + +@pytest.mark.parametrize("device", [ + pytest.param("none", id="cpu"), + pytest.param("GPUOpenCL", id="gpu"), + pytest.param("HTP0", id="npu"), +]) +def test_llama_bench(device): + result = run_adb_command( + f"{CMD_PREFIX} {BIN_PATH}/llama-bench" + f" -m {MODEL_PATH} --device {device} -ngl 99 --batch-size 128 -t 4 -p 128 -n 32", + check=False, + ) + write_qdc_log(f"llama_bench_{_DEVICE_LOG_NAME[device]}.log", result.stdout or "") + assert result.returncode == 0, f"llama-bench {device} failed (exit {result.returncode})" + + +if __name__ == "__main__": + ret = pytest.main(["-s", "--junitxml=results.xml", os.path.realpath(__file__)]) + if os.path.exists("results.xml"): + with open("results.xml") as f: + write_qdc_log("results.xml", f.read()) + sys.exit(ret) diff --git a/scripts/snapdragon/qdc/tests/test_bench.py b/scripts/snapdragon/qdc/tests/test_bench.py deleted file mode 100644 index 651ab5b71..000000000 --- a/scripts/snapdragon/qdc/tests/test_bench.py +++ /dev/null @@ -1,63 +0,0 @@ -import pytest -import subprocess -import sys - -tmp_path='/data/local/tmp' -pkg_path=f'{tmp_path}/llama.cpp' -lib_path=f'{pkg_path}/lib' -bin_path=f'{pkg_path}/bin' - -model='../gguf/Llama-3.2-1B-Instruct-Q4_0.gguf' -cli_pref=f'cd {pkg_path} && LD_LIBRARY_PATH={lib_path} ADSP_LIBRARY_PATH={lib_path} {bin_path}' - - -def run_cmd(cmd): - p = subprocess.run(cmd, text = True, stdout = subprocess.PIPE, stderr = subprocess.STDOUT) - sys.stdout.write(p.stdout) - assert(p.returncode == 0) - - -@pytest.mark.dependency() -def test_install(): - run_cmd(['adb', 'push', 'llama.cpp', f'{tmp_path}']) - run_cmd(['adb', 'shell', f'chmod 755 {bin_path}/*']) - - -## Basic cli tests -def run_llama_cli(dev, opts): - prompt='what is the most popular cookie in the world?\nPlease provide a very brief bullet point summary.\nBegin your answer with **BEGIN**.' 
- opts = '--batch-size 128 -n 128 -no-cnv --seed 42 ' + opts - run_cmd(['adb', 'shell', f'{cli_pref}/llama-cli -m {model} --device {dev} -ngl 99 -t 4 {opts} -p "{prompt}"']) - - -@pytest.mark.dependency(depends=['test_install']) -def test_llama_cli_cpu(): - run_llama_cli('none', '-ctk q8_0 -ctv q8_0 -fa on') - - -@pytest.mark.dependency(depends=['test_install']) -def test_llama_cli_gpu(): - run_llama_cli('GPUOpenCL', '-fa on') - - -@pytest.mark.dependency(depends=['test_install']) -def test_llama_cli_npu(): - run_llama_cli('HTP0', '-ctk q8_0 -ctv q8_0 -fa on') - - -## Basic bench tests -def run_llama_bench(dev): - run_cmd(['adb', 'shell', f'{cli_pref}/llama-bench -m {model} --device {dev} -ngl 99 --batch-size 128 -t 4 -p 128 -n 32']) - - -@pytest.mark.dependency(depends=['test_install']) -def test_llama_bench_cpu(): - run_llama_bench('none') - - -def test_llama_bench_gpu(): - run_llama_bench('GPUOpenCL') - - -def test_llama_bench_npu(): - run_llama_bench('HTP0') diff --git a/scripts/snapdragon/qdc/tests/utils.py b/scripts/snapdragon/qdc/tests/utils.py new file mode 100644 index 000000000..00f0f1b2f --- /dev/null +++ b/scripts/snapdragon/qdc/tests/utils.py @@ -0,0 +1,93 @@ +"""Shared helpers for QDC on-device test runners.""" + +import logging +import os +import subprocess +import tempfile + +from appium.options.common import AppiumOptions + +log = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# On-device paths +# --------------------------------------------------------------------------- + +BUNDLE_PATH = "/data/local/tmp/llama_cpp_bundle" +QDC_LOGS_PATH = "/data/local/tmp/QDC_logs" +LIB_PATH = f"{BUNDLE_PATH}/lib" +BIN_PATH = f"{BUNDLE_PATH}/bin" +ENV_PREFIX = ( + f"export LD_LIBRARY_PATH={LIB_PATH} && " + f"export ADSP_LIBRARY_PATH={LIB_PATH} && " + f"chmod +x {BIN_PATH}/* &&" +) +CMD_PREFIX = f"cd {BUNDLE_PATH} && {ENV_PREFIX}" + +# --------------------------------------------------------------------------- +# Appium session options +# --------------------------------------------------------------------------- + +options = AppiumOptions() +options.set_capability("automationName", "UiAutomator2") +options.set_capability("platformName", "Android") +options.set_capability("deviceName", os.getenv("ANDROID_DEVICE_VERSION")) + +# --------------------------------------------------------------------------- +# ADB helpers +# --------------------------------------------------------------------------- + + +def run_adb_command(cmd: str, *, check: bool = True) -> subprocess.CompletedProcess: + # Append exit-code sentinel because `adb shell` doesn't reliably propagate + # the on-device exit code (older ADB versions always return 0). 
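+    # A failing command therefore ends with e.g. "__RC__:1" as its last output
+    # line; the sentinel is stripped below and used as the effective exit code.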
+    raw = subprocess.run(
+        ["adb", "shell", f"{cmd}; echo __RC__:$?"],
+        text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+    )
+    stdout = raw.stdout
+    returncode = raw.returncode
+    if stdout:
+        lines = stdout.rstrip("\n").split("\n")
+        if lines and lines[-1].startswith("__RC__:"):
+            try:
+                returncode = int(lines[-1][7:])
+                stdout = "\n".join(lines[:-1]) + "\n"
+            except ValueError:
+                pass
+    log.info("%s", stdout)
+    result = subprocess.CompletedProcess(raw.args, returncode, stdout=stdout)
+    if check:
+        assert returncode == 0, f"Command failed (exit {returncode})"
+    return result
+
+
+def write_qdc_log(filename: str, content: str) -> None:
+    """Push content as a log file to QDC_LOGS_PATH on the device for QDC log collection."""
+    subprocess.run(
+        ["adb", "shell", f"mkdir -p {QDC_LOGS_PATH}"],
+        stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+    )
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False) as f:
+        f.write(content)
+        tmp_path = f.name
+    try:
+        subprocess.run(
+            ["adb", "push", tmp_path, f"{QDC_LOGS_PATH}/{filename}"],
+            stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+        )
+    finally:
+        os.unlink(tmp_path)
+
+
+def push_bundle_if_needed(check_binary: str) -> None:
+    """Push llama_cpp_bundle to the device if check_binary is not already present."""
+    result = subprocess.run(
+        ["adb", "shell", f"ls {check_binary}"],
+        text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+    )
+    if result.returncode != 0:
+        subprocess.run(
+            ["adb", "push", "/qdc/appium/llama_cpp_bundle/", "/data/local/tmp"],
+            text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+        )
diff --git a/ty.toml b/ty.toml
index bcd23db9b..a07d7485d 100644
--- a/ty.toml
+++ b/ty.toml
@@ -1,5 +1,5 @@
 [environment]
-extra-paths = ["./gguf-py", "./examples/model-conversion/scripts", "./tools/server/tests"]
+extra-paths = ["./gguf-py", "./examples/model-conversion/scripts", "./tools/server/tests", "./scripts/snapdragon/qdc/tests"]
 python-version = "3.10"
 
 [rules]
@@ -13,6 +13,7 @@ exclude = [
 
 [[overrides]]
 include = [
     "./tools/server/tests/**",
+    "./scripts/snapdragon/qdc/tests/**",
 ]
 
 [overrides.rules]

From 5d2b52d80d9f375a6e81d07e212d047d8ee4f76e Mon Sep 17 00:00:00 2001
From: Max Krasnyansky
Date: Thu, 23 Apr 2026 14:17:21 -0700
Subject: [PATCH 21/35] hexagon: add support for basic and extended Op profiling (#22269)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* hexagon: restore HTP_OPMASK_QUEUE

* hexagon: honor OPMASK_SKIP_COMPUTE in hmx-matmul

* hex-prof: restore op profiling

* hex-prof: enable PMU

* hexagon: simplify and improve op-queuing with full profiling support

Add separate profile descriptors.

* hexagon: remove opsync and rename opmask to opstage

opsync is no longer needed since the profiler is fully async now.
The opmask name was confusing and opstage is more accurate.

* hexagon: refactor opbatch queue handling

* hexagon: add iface hooks for enabling profiler from the host

Also move all the PMU setup stuff out of the hex-utils since it's not
intended for normal use.

* hexagon: make profiler mode configurable

On older devices getting PMU counters is expensive, so it's now optional.
* hexagon: add support for setting profiler pmu events from env

* hexagon: simplify profiler output (no need to print buffs, etc)

* hexagon: simplify pmu counter formatting

* hexagon: add a simple profile post-proc tool

* hex-prof: add support for reading logs from stdin

* hexagon: document GGML_HEXAGON_PROFILE

* hex-prof: update default width for dims field

* hex-prof: fix linter warnings and errors

* Update ggml/src/ggml-hexagon/htp/htp-ops.h

Co-authored-by: Sigbjørn Skjæret

* Update scripts/snapdragon/ggml-hexagon-profile.py

Co-authored-by: Sigbjørn Skjæret

---------

Co-authored-by: Trivikram Reddy
Co-authored-by: Sigbjørn Skjæret
---
 docs/backend/snapdragon/README.md             |  19 +-
 ggml/src/ggml-hexagon/ggml-hexagon.cpp        | 402 +++++++++++-------
 ggml/src/ggml-hexagon/htp/hex-utils.h         |  28 ++
 ggml/src/ggml-hexagon/htp/htp-ctx.h           |   5 +-
 ggml/src/ggml-hexagon/htp/htp-ops.h           |  36 +-
 ggml/src/ggml-hexagon/htp/htp_iface.idl       |   8 +-
 ggml/src/ggml-hexagon/htp/main.c              | 176 +++++---
 ggml/src/ggml-hexagon/htp/matmul-ops.c        |   4 +
 scripts/snapdragon/adb/run-bench.sh           |   4 +-
 scripts/snapdragon/adb/run-cli.sh             |   4 +-
 scripts/snapdragon/adb/run-completion.sh      |   4 +-
 scripts/snapdragon/adb/run-mtmd.sh            |   4 +-
 scripts/snapdragon/adb/run-tool.sh            |   4 +-
 scripts/snapdragon/ggml-hexagon-profile.py    | 188 ++++++++
 scripts/snapdragon/windows/run-bench.ps1      |   6 +-
 scripts/snapdragon/windows/run-cli.ps1        |   6 +-
 scripts/snapdragon/windows/run-completion.ps1 |   6 +-
 scripts/snapdragon/windows/run-mtmd.ps1       |   6 +-
 scripts/snapdragon/windows/run-tool.ps1       |   6 +-
 19 files changed, 671 insertions(+), 245 deletions(-)
 create mode 100755 scripts/snapdragon/ggml-hexagon-profile.py

diff --git a/docs/backend/snapdragon/README.md b/docs/backend/snapdragon/README.md
index e13fdfd05..2414eeaf6 100644
--- a/docs/backend/snapdragon/README.md
+++ b/docs/backend/snapdragon/README.md
@@ -249,18 +249,27 @@ build: 6a8cf8914 (6733)
 ```
 
 - `GGML_HEXAGON_PROFILE=1`
-  Generates a host-side profile for the ggml-hexagon Ops.
+  Enables Op profiling:
 
-- `GGML_HEXAGON_OPMASK=0x0`
-  Allows enabling specific stages of the processing pipeline:
+  - `1` Basic profile with per-op `usecs` and `cycles` counters
+  - `2` Extended profile with per-op `usecs`, `cycles` and default PMU counter data
+  - `0x1,...,0x8` (a comma-separated list of eight PMU event IDs) Extended profile with per-op `usecs`, `cycles` and custom PMU counter data
+
+  The log output can either be saved to a file for post-processing or piped directly into the post-processing tool to generate the report.
+  Examples:
+
+  `GGML_HEXAGON_PROFILE=1 llama-completion ... |& ./scripts/snapdragon/ggml-hexagon-profile.py -`
+
+- `GGML_HEXAGON_OPSTAGE=0x0`
+  Allows enabling specific stages of the Op processing pipeline:
   - `0x1` Enable Op Queue (i.e., queuing Ops into NPU)
   - `0x2` Enable Op Compute (MUL_MAT, etc.)
Examples: - `GGML_HEXAGON_OPMASK=0x1 llama-completion ...` - Ops are enqueued but NPU-side processing is stubbed out - `GGML_HEXAGON_OPMASK=0x3 llama-completion ...` - Full queuing and processing of Ops (default) + `GGML_HEXAGON_OPSTAGE=0x1 llama-completion ...` - Ops are enqueued to the NPU but dma & compute are disabled + `GGML_HEXAGON_OPSTAGE=0x3 llama-completion ...` - Full queuing and processing of Ops (default) - `GGML_HEXAGON_OPFILTER=regex` Allows filtering (disabling) Ops that match the regex pattern: diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index cdd9fcf59..955903418 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -12,9 +12,12 @@ #include #include #include +#include +#include #include #include #include +#include #ifdef _WIN32 # include @@ -41,18 +44,26 @@ #include "htp_iface.h" #include "htp-drv.h" +using intvec = std::vector; +using uintvec = std::vector; +using u32vec = std::vector; + static size_t opt_ndev = 1; static size_t opt_nhvx = 0; // use all static int opt_arch = 0; // autodetect static int opt_etm = 0; static int opt_verbose = 0; -static int opt_profile = 0; +static int opt_profile = 0; // profiling mode (0-disabled, 1-basic, 2-pmu) static int opt_hostbuf = 1; // hostbuf ON by default static int opt_use_hmx = 1; // when set, enable HMX; when 0, use HVX only +// Default PMU events, if profiling with PMU (mode=2) is enabled +// See https://docs.qualcomm.com/doc/80-N2040-60/topic/pmu-events.html +// https://docs.qualcomm.com/doc/80-N2040-61/topic/hvx-pmu-events.html +static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C }; + // Enable all stages by default -static int opt_opmask = HTP_OPMASK_QUEUE | HTP_OPMASK_COMPUTE; -static int opt_opsync = 0; // synchronous ops +static int opt_opstage = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE; static int opt_opbatch = 1024; // max number of ops in a batch static int opt_opqueue = 16; // max number of pending batches static std::regex* opt_opfilter = NULL; // regex of ops to not claim @@ -104,19 +115,26 @@ static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct } static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const ggml_tensor * op, - uint32_t op_usec, uint32_t op_cycles, uint32_t op_pkts, uint64_t call_usec) { + uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) { if (!opt_profile) return; op_desc desc(op); - GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : %s : op-usec %u op-cycles %u op-pkts %u (%f) call-usec %llu\n", sess_name.c_str(), - ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, desc.buffs, - op_usec, op_cycles, op_pkts, (float) op_cycles / op_pkts, (unsigned long long) call_usec); + + char pmu_str[256] = ""; + if (opt_profile > 1) { + static_assert(HTP_PROF_PMU_NCNT == 8, "current implementation assumes 8 PMU counters"); + sprintf(pmu_str, " pmu [%u,%u,%u,%u,%u,%u,%u,%u]", + pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]); + } + + GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(), + ggml_op_desc(op), desc.names, desc.dims, desc.types, desc.strides, op_usec, op_cycles, pmu_str); } // ** backend sessions struct ggml_hexagon_opbatch; -struct ggml_hexagon_opshm; +struct ggml_hexagon_opqueue; struct ggml_hexagon_session { std::string name; @@ -132,8 +150,8 @@ struct ggml_hexagon_session { bool valid_iface; std::atomic op_pending; - 
ggml_hexagon_opbatch *op_batch; - ggml_hexagon_opshm *op_shm; + ggml_hexagon_opbatch* op_batch; + ggml_hexagon_opqueue* op_queue; ggml_backend_buffer_type buffer_type = {}; ggml_backend_buffer_type repack_buffer_type = {}; @@ -1521,65 +1539,14 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf // Backend session implementation -struct ggml_hexagon_opshm { - ggml_hexagon_shared_buffer *sbuf; - - std::vector block_mask; - size_t block_size; - - uint8_t * base() const { return this->sbuf->base; } - int fd() const { return this->sbuf->fd; } - size_t n_blocks() const { return this->block_mask.size(); } - - ggml_hexagon_opshm(ggml_hexagon_session *sess, size_t max_batch, size_t max_pending) { - size_t n_bufs = HTP_OP_MAX_BUFS; - size_t n_ops = max_batch; - size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS; - - block_mask.resize(max_pending, true); - - block_size = sizeof(htp_buf_desc) * n_bufs + - sizeof(htp_tensor) * n_tensors + - sizeof(htp_op_desc) * n_ops; - - sbuf = new ggml_hexagon_shared_buffer(sess, block_size * block_mask.size(), true /* pinned */); - - if (opt_verbose) { - GGML_LOG_INFO("ggml-hex: %s allocated shared buf %zu : block-size %zu max-batch %zu max-pending %zu\n", - sess->c_name(), (size_t) sbuf->size, block_size, max_batch, max_pending); - } - } - - ~ggml_hexagon_opshm() { - delete sbuf; - } - - uint8_t * allocate() { - auto it = std::find(block_mask.begin(), block_mask.end(), true); - if (it == block_mask.end()) - return nullptr; - - unsigned int i = std::distance(block_mask.begin(), it); - uint8_t* addr = sbuf->base + (i * block_size); - block_mask[i] = false; - - HEX_VERBOSE("ggml-hex: %s allocated op shm #%u %p\n", sbuf->sess->c_name(), i, (void*) addr); - return addr; - } - - void release(uint8_t * addr) { - int i = (addr - sbuf->base) / block_size; - block_mask[i] = true; - HEX_VERBOSE("ggml-hex: %s released op shm #%u %p\n", sbuf->sess->c_name(), i, (void*) addr); - } -}; - struct ggml_hexagon_opbatch { - const char* name; + ggml_hexagon_session* sess; - std::vector buffers; - std::vector tensors; - std::vector ops; + std::vector ops; // pointers to original ops + + std::vector h_bufs; // htp buffer descriptors + std::vector h_tens; // htp tensor descriptors + std::vector h_ops; // htp op descriptors std::unordered_map b_map; // buffer fd to index std::unordered_map t_map; // tensor ptr to index @@ -1606,19 +1573,21 @@ struct ggml_hexagon_opbatch { d_map.clear(); } - ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t max_batch) { - name = sess->c_name(); + ggml_hexagon_opbatch(ggml_hexagon_session *sess, size_t batch_size) { + this->sess = sess; n_bufs_max = HTP_OP_MAX_BUFS; - n_ops_max = max_batch; + n_ops_max = batch_size; n_tens_max = n_ops_max + n_ops_max * HTP_OP_MAX_INPUTS; b_vmem_max = HTP_OP_MAX_VMEM; - buffers.resize(n_bufs_max); - tensors.resize(n_tens_max); ops.resize(n_ops_max); + h_bufs.resize(n_bufs_max); + h_tens.resize(n_tens_max); + h_ops.resize(n_ops_max); + b_map.reserve(n_bufs_max); t_map.reserve(n_tens_max); d_map.reserve(n_tens_max); @@ -1640,7 +1609,7 @@ struct ggml_hexagon_opbatch { b_map.insert({sbuf->fd, bi}); - htp_buf_desc &b = buffers[bi]; + htp_buf_desc &b = h_bufs[bi]; b.base = (uint64_t) sbuf->base; b.fd = sbuf->fd; b.size = sbuf->size; @@ -1664,7 +1633,7 @@ struct ggml_hexagon_opbatch { // First lookup by tensor data auto range = d_map.equal_range(t->data); for (auto it = range.first; it != range.second; ++it) { - htp_tensor * h = &tensors[it->second]; + htp_tensor * h = &h_tens[it->second]; if 
(same_shape(h, t)) { return it->second; } } @@ -1682,7 +1651,7 @@ struct ggml_hexagon_opbatch { uint64_t t_offset = (uint8_t *) t->data - sbuf->base; size_t t_size = ggml_nbytes(t); - htp_tensor &h = tensors[ti]; + htp_tensor &h = h_tens[ti]; h.bi = add_buffer(sbuf); h.data = t_offset; h.size = t_size; @@ -1737,65 +1706,170 @@ struct ggml_hexagon_opbatch { // assumes that fit_op() was called first and returned true void add_op(htp_op_code opcode, const struct ggml_tensor * t) { // Add new op - htp_op_desc &o = ops[n_ops++]; + + unsigned int n = n_ops++; GGML_ASSERT(n_ops <= n_ops_max); + ops[n] = t; + + htp_op_desc &o = h_ops[n]; memcpy(&o.params, &t->op_params, sizeof(t->op_params)); o.opcode = opcode; o.flags = 0; - if (!(opt_opmask & HTP_OPMASK_COMPUTE)) { + if (!(opt_opstage & HTP_OPSTAGE_COMPUTE)) { o.flags |= HTP_OPFLAGS_SKIP_COMPUTE; } - ggml_hexagon_dump_op_exec(name, t, o.flags); + ggml_hexagon_dump_op_exec(sess->c_name(), t, o.flags); for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) { o.src[i] = t->src[i] ? add_tensor(t->src[i]) : 0xffff; } o.dst = add_tensor(t); } +}; - size_t flush(uint8_t * mem_addr, size_t mem_size) { - static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8"); - static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8"); - static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8"); +struct ggml_hexagon_opqueue { + // Shared buffer for storing batches + ggml_hexagon_shared_buffer *shm_buf; + size_t shm_blk_size; - const size_t b_size = sizeof(htp_buf_desc) * n_bufs; - const size_t t_size = sizeof(htp_tensor) * n_tens; - const size_t o_size = sizeof(htp_op_desc) * n_ops; + using opvec = std::vector; - const size_t m_size = b_size + t_size + o_size; - GGML_ASSERT(m_size <= mem_size); + std::queue done; // completed batch ids + std::vector op_cache; // per batch op cache + std::vector start_usec; // per batch start time - uint8_t * b_ptr = (uint8_t *) mem_addr; - uint8_t * t_ptr = (uint8_t *) b_ptr + b_size; - uint8_t * o_ptr = (uint8_t *) t_ptr + t_size; + ggml_hexagon_opqueue(ggml_hexagon_session *sess, size_t batch_size, size_t depth) { + size_t n_bufs = HTP_OP_MAX_BUFS; + size_t n_ops = batch_size; + size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS; - memcpy(b_ptr, (void *) buffers.data(), b_size); - memcpy(t_ptr, (void *) tensors.data(), t_size); - memcpy(o_ptr, (void *) ops.data(), o_size); + shm_blk_size = sizeof(htp_buf_desc) * n_bufs + + sizeof(htp_tensor) * n_tensors + + sizeof(htp_op_desc) * n_ops + + sizeof(htp_prof_desc) * n_ops; - HEX_VERBOSE("ggml-hex: %s flush-opbatch : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu\n", - name, n_bufs, n_tens, n_ops, b_vmem, b_size, t_size, o_size); + shm_buf = new ggml_hexagon_shared_buffer(sess, shm_blk_size * depth, true /* pinned */); + + op_cache.resize(depth); + start_usec.resize(depth, 0); + + // init done queue + for (unsigned int i = 0; i < depth; i++) { done.push(i); } + + if (opt_verbose) { + GGML_LOG_INFO("ggml-hex: %s allocated op-queue : batch-size %zu depth %zu shm-size %zu shm-block-size %zu\n", + sess->c_name(), batch_size, depth, shm_buf->size, shm_blk_size); + } + } + + ~ggml_hexagon_opqueue() { + delete shm_buf; + } + + // push new batch + bool push(htp_opbatch_req& req, dspqueue_buffer& dbuf, ggml_hexagon_opbatch* op_batch) { + static_assert(sizeof(htp_opbatch_req) % 8 == 0, "sizeof(htp_opbatch_req) must be multiple of 8"); + static_assert(sizeof(htp_opbatch_rsp) % 8 
== 0, "sizeof(htp_opbatch_rsp) must be multiple of 8"); + static_assert(sizeof(htp_buf_desc) % 8 == 0, "sizeof(htp_buf_desc) must be multiple of 8"); + static_assert(sizeof(htp_tensor) % 8 == 0, "sizeof(htp_tensor) must be multiple of 8"); + static_assert(sizeof(htp_op_desc) % 8 == 0, "sizeof(htp_op_desc) must be multiple of 8"); + static_assert(sizeof(htp_prof_desc) % 8 == 0, "sizeof(htp_prof_desc) must be multiple of 8"); + + if (done.empty()) { return false; } + + req.id = done.front(); done.pop(); // batch id + req.n_bufs = op_batch->n_bufs; + req.n_tensors = op_batch->n_tens; + req.n_ops = op_batch->n_ops; + + op_cache[req.id] = op_batch->ops; + start_usec[req.id] = ggml_time_us(); + + const size_t b_size = sizeof(htp_buf_desc) * req.n_bufs; + const size_t t_size = sizeof(htp_tensor) * req.n_tensors; + const size_t o_size = sizeof(htp_op_desc) * req.n_ops; + const size_t p_size = sizeof(htp_prof_desc) * req.n_ops; + + dbuf.ptr = shm_buf->base + (req.id * shm_blk_size); + dbuf.fd = shm_buf->fd; + dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT; + dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) shm_buf->base; + dbuf.size = b_size + t_size + o_size + p_size; + + GGML_ASSERT(dbuf.size <= shm_blk_size); + + uint8_t * m_ptr = (uint8_t*) dbuf.ptr; + uint8_t * b_ptr = m_ptr; m_ptr += b_size; + uint8_t * t_ptr = m_ptr; m_ptr += t_size; + uint8_t * o_ptr = m_ptr; + + memcpy(b_ptr, (void *) op_batch->h_bufs.data(), b_size); + memcpy(t_ptr, (void *) op_batch->h_tens.data(), t_size); + memcpy(o_ptr, (void *) op_batch->h_ops.data(), o_size); + + HEX_VERBOSE("ggml-hex: %s op-queue push batch #%u : n-bufs %u n-tensors %u n-ops %u vmem %zu : b-size %zu t-size %zu o-size %zu m-size %zu\n", + shm_buf->sess->c_name(), req.id, req.n_bufs, req.n_tensors, req.n_ops, op_batch->b_vmem, + b_size, t_size, o_size, (size_t) dbuf.size); + + op_batch->reset(); if (opt_verbose > 1) { htp_buf_desc *b = (htp_buf_desc*) b_ptr; - for (unsigned int i=0; i < n_bufs; i++) { - GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", name, i, + for (unsigned int i=0; i < req.n_bufs; i++) { + GGML_LOG_DEBUG("ggml-hex: %s htp-buf #%u : fd %d base %p size %zu\n", shm_buf->sess->c_name(), i, b[i].fd, (void *) b[i].base, (size_t) b[i].size); } htp_tensor *t = (htp_tensor*) t_ptr; - for (unsigned int i=0; i < n_tens; i++) { + for (unsigned int i=0; i < req.n_tensors; i++) { GGML_LOG_DEBUG("ggml-hex: %s htp-tensor #%u : bi %u offset %u size %u : %zu:%zu:%zu:%zu\n", - name, i, t[i].bi, t[i].data, t[i].size, + shm_buf->sess->c_name(), i, t[i].bi, t[i].data, t[i].size, (size_t) t[i].ne[0], (size_t) t[i].ne[1], (size_t) t[i].ne[2], (size_t) t[i].ne[3]); } } - reset(); + return true; + } - return m_size; + void pop(htp_opbatch_rsp rsp, dspqueue_buffer dbuf) { + GGML_ASSERT(rsp.id < op_cache.size()); + + done.push(rsp.id); + + const size_t b_size = sizeof(htp_buf_desc) * rsp.n_bufs; + const size_t t_size = sizeof(htp_tensor) * rsp.n_tensors; + const size_t o_size = sizeof(htp_op_desc) * rsp.n_ops; + const size_t p_size = sizeof(htp_prof_desc) * rsp.n_ops; + + const size_t m_size = b_size + t_size + o_size + p_size; + GGML_ASSERT(m_size <= shm_blk_size); + + HEX_VERBOSE("ggml-hex: %s op-queue pop batch #%u : n-bufs %u n-tensors %u n-ops %u : m-size %zu b-size %zu t-size %zu o-size %zu\n", + shm_buf->sess->c_name(), rsp.id, rsp.n_bufs, rsp.n_tensors, rsp.n_ops, + (size_t) dbuf.size, b_size, t_size, o_size); + + uint8_t * m_ptr = (uint8_t*) dbuf.ptr; + uint8_t * p_ptr = m_ptr + (b_size 
+ t_size + o_size); + + if (opt_profile && rsp.n_ops > 0) { + auto & ops = op_cache[rsp.id]; + + uint64_t batch_usec = ggml_time_us() - start_usec[rsp.id]; + uint32_t htp_usec = 0; + + GGML_ASSERT(rsp.n_ops <= ops.size()); + + const htp_prof_desc * pd = (const htp_prof_desc *) p_ptr; + for (uint32_t i = 0; i < rsp.n_ops; i++) { + htp_usec += pd[i].usecs; + ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i].usecs, pd[i].cycles, pd[i].pmu); + } + + GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u\n", + shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec); + } } }; @@ -1824,17 +1898,12 @@ void ggml_hexagon_session::flush_pending(bool all) { GGML_ABORT("ggml-hex: %s dspcall : bad response : size %u dspbufs %u\n", this->c_name(), rsp_size, n_dbufs); } - op_shm->release((uint8_t*) dbuf.ptr); - if (rsp.status != HTP_STATUS_OK) { GGML_LOG_ERROR("ggml-hex: %s dspcall : dsp-rsp: %s\n", this->c_name(), status_to_str(rsp.status)); // TODO: handle errors } - // FIXME: profile will be per opreq - // this->prof_usecs = rsp.prof_usecs; - // this->prof_cycles = rsp.prof_cycles; - // this->prof_pkts = rsp.prof_pkts; + op_queue->pop(rsp, dbuf); this->op_pending--; // atomic dec @@ -1845,28 +1914,17 @@ void ggml_hexagon_session::flush_pending(bool all) { void ggml_hexagon_session::flush_batch() { if (op_batch->empty()) { return; } - htp_opbatch_req req; - req.n_bufs = op_batch->n_bufs; - req.n_tensors = op_batch->n_tens; - req.n_ops = op_batch->n_ops; + htp_opbatch_req req {}; + dspqueue_buffer dbuf{}; - dspqueue_buffer dbuf; - dbuf.fd = op_shm->fd(); - dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT; - dbuf.ptr = op_shm->allocate(); - if (!dbuf.ptr) { + if (!op_queue->push(req, dbuf, op_batch)) { flush_pending(false); - dbuf.ptr = op_shm->allocate(); + op_queue->push(req, dbuf, op_batch); } - dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) op_shm->base(); - dbuf.size = op_batch->flush((uint8_t*) dbuf.ptr, op_shm->block_size); - // Bump pending flag (cleared in the session::flush once we get the response) this->op_pending++; // atomic inc - HEX_VERBOSE("ggml-hex: %s: queue-opbatch : %p size %u\n", this->c_name(), dbuf.ptr, dbuf.size); - int err = dspqueue_write(this->queue, 0, 1, &dbuf, sizeof(req), (const uint8_t*) &req, DSPQUEUE_TIMEOUT); if (err != 0) { GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->c_name(), (unsigned) err); @@ -2016,25 +2074,33 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { } if (opt_etm) { - err = htp_iface_enable_etm(this->handle); + err = htp_iface_etm(this->handle, 1); if (err != 0) { GGML_LOG_ERROR("ggml-hex: failed to enable ETM tracing: 0x%08x\n", (unsigned) err); } } - // Start the DSP-side service. We need to pass the queue ID to the - // DSP in a FastRPC call; the DSP side will import the queue and start - // listening for packets in a callback. 
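+    // Push the profiler mode and PMU event selection to the NPU before
+    // starting the DSP-side service below.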
+ if (opt_profile) { + htp_iface_pmu_conf pmu_conf{}; + std::copy(opt_pmu_evt.begin(), opt_pmu_evt.end(), pmu_conf.events); + + err = htp_iface_profiler(this->handle, opt_profile, &pmu_conf); + if (err != 0) { + GGML_LOG_ERROR("ggml-hex: failed to enable profiling: 0x%08x\n", (unsigned) err); + } + } + + // Allocate buffers and state for op batching + this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch); + this->op_queue = new ggml_hexagon_opqueue(this, opt_opbatch, opt_opqueue); + + // Start processing op batch requests err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx, opt_use_hmx); if (err != 0) { GGML_LOG_ERROR("ggml-hex: failed to start session: 0x%08x\n", (unsigned) err); throw std::runtime_error("ggml-hex: iface start failed (see log for details)"); } this->valid_iface = true; - - // Allocate buffers and state for op batching - this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch); - this->op_shm = new ggml_hexagon_opshm(this, opt_opbatch, opt_opqueue); } void ggml_hexagon_session::release() noexcept(true) { @@ -2043,7 +2109,7 @@ void ggml_hexagon_session::release() noexcept(true) { int err; delete this->op_batch; - delete this->op_shm; + delete this->op_queue; // Stop the DSP-side service and close the queue if (this->valid_iface) { @@ -2054,12 +2120,20 @@ void ggml_hexagon_session::release() noexcept(true) { } if (opt_etm) { - err = htp_iface_disable_etm(this->handle); + err = htp_iface_etm(this->handle, 0); if (err != 0) { GGML_LOG_ERROR("ggml-hex: warn : failed to disable ETM tracing: 0x%08x\n", (unsigned) err); } } + if (opt_profile) { + htp_iface_pmu_conf pmu_conf{}; + err = htp_iface_profiler(this->handle, 0, &pmu_conf); + if (err != 0) { + GGML_LOG_ERROR("ggml-hex: warn : failed to disable profiling: 0x%08x\n", (unsigned) err); + } + } + if (this->valid_queue) { err = dspqueue_close(queue); if (err != 0) { @@ -2077,7 +2151,7 @@ ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) n repack_buffer_type.device = dev; op_batch = nullptr; - op_shm = nullptr; + op_queue = nullptr; try { allocate(dev_id); @@ -2698,7 +2772,7 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg for (int i = 0; i < graph->n_nodes; ++i) { ggml_tensor * n = graph->nodes[i]; - if (op_is_compute(n)) { + if (op_is_compute(n) && (opt_opstage & HTP_OPSTAGE_QUEUE)) { sess->enqueue_op(op_remap_to_htp(n), n); } } @@ -3338,6 +3412,26 @@ static void * ggml_backend_hexagon_get_proc_address(ggml_backend_reg_t reg, cons return NULL; } +template std::vector str_to_vec(const char* str) { + std::stringstream ss(str); + std::vector v; + std::string t; + + while (std::getline(ss, t, ',')) { + v.push_back(std::stoul(t, nullptr, 0)); + } + + return v; +} + +template std::string vec_to_str(std::vector v) { + std::stringstream ss; + ss << std::setbase(BASE) << std::showbase; + for (auto i : v) { ss << i << ','; } + auto str = ss.str(); str.pop_back(); // drop last comma + return str; +} + static void ggml_hexagon_init(ggml_backend_reg * reg) { // Basic sanity checks to make sure definitions match static_assert((unsigned int) HTP_TYPE_Q4_0 == (unsigned int) GGML_TYPE_Q4_0, @@ -3351,8 +3445,7 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { const char * str_verbose = getenv("GGML_HEXAGON_VERBOSE"); const char * str_hostbuf = getenv("GGML_HEXAGON_HOSTBUF"); - const char * str_opmask = getenv("GGML_HEXAGON_OPMASK"); - const char * str_opsync = getenv("GGML_HEXAGON_OPSYNC"); + const char * str_opstage = 
getenv("GGML_HEXAGON_OPSTAGE"); const char * str_opbatch = getenv("GGML_HEXAGON_OPBATCH"); const char * str_opqueue = getenv("GGML_HEXAGON_OPQUEUE"); const char * str_opfilter= getenv("GGML_HEXAGON_OPFILTER"); @@ -3365,19 +3458,30 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { auto RE_ICASE = std::regex_constants::icase; - opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL; - opt_verbose = str_verbose ? atoi(str_verbose) : 0; - opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf; - opt_opmask = str_opmask ? strtoul(str_opmask, NULL, 0) : opt_opmask; - opt_opsync = str_opsync ? atoi(str_opsync) : opt_opsync; - opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch; - opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue; - opt_profile = str_profile ? atoi(str_profile) : 0; - opt_etm = str_etm ? atoi(str_etm) : 0; - opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx; - opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx; - opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev; - opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf; + opt_opfilter = str_opfilter ? new std::regex(str_opfilter, RE_ICASE) : NULL; + opt_verbose = str_verbose ? atoi(str_verbose) : 0; + opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf; + opt_opstage = str_opstage ? strtoul(str_opstage, NULL, 0) : opt_opstage; + opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch; + opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue; + opt_etm = str_etm ? atoi(str_etm) : 0; + opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx; + opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx; + opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev; + opt_hostbuf = str_hostbuf ? 
atoi(str_hostbuf) : opt_hostbuf; + + if (str_profile) { + opt_pmu_evt = [&]() -> std::vector { + auto v = str_to_vec(str_profile); + switch (v.size()) { + case 1: opt_profile = v[0]; return opt_pmu_evt; // mode with default pmu events + case 8: opt_profile = 2; return v; // mode with custom pmu events + default: opt_profile = 0; return {}; // garbage input + }}(); + if (opt_profile == 1) opt_pmu_evt = {}; + GGML_LOG_INFO("ggml-hex: Profiling mode %u : pmu-evt [ %s ]\n", opt_profile, + vec_to_str(opt_pmu_evt).c_str()); + } if (opt_ndev > GGML_HEXAGON_MAX_SESSIONS) { opt_ndev = GGML_HEXAGON_MAX_SESSIONS; diff --git a/ggml/src/ggml-hexagon/htp/hex-utils.h b/ggml/src/ggml-hexagon/htp/hex-utils.h index f6713c5cf..329249e11 100644 --- a/ggml/src/ggml-hexagon/htp/hex-utils.h +++ b/ggml/src/ggml-hexagon/htp/hex-utils.h @@ -4,6 +4,7 @@ #include #include #include +#include #include "hexagon_types.h" #include "hexagon_protos.h" @@ -100,4 +101,31 @@ static inline void hex_pause() { asm volatile(" pause(#255)\n"); } +#ifndef HEX_NUM_PMU_COUNTERS +#define HEX_NUM_PMU_COUNTERS 8 +#endif + +static inline void hex_get_pmu(uint32_t counters[]) { +#if __HVX_ARCH__ >= 79 + asm volatile("%0 = upmucnt0" : "=r"(counters[0])); + asm volatile("%0 = upmucnt1" : "=r"(counters[1])); + asm volatile("%0 = upmucnt2" : "=r"(counters[2])); + asm volatile("%0 = upmucnt3" : "=r"(counters[3])); + asm volatile("%0 = upmucnt4" : "=r"(counters[4])); + asm volatile("%0 = upmucnt5" : "=r"(counters[5])); + asm volatile("%0 = upmucnt6" : "=r"(counters[6])); + asm volatile("%0 = upmucnt7" : "=r"(counters[7])); +#else + counters[0] = qurt_pmu_get(QURT_PMUCNT0); + counters[1] = qurt_pmu_get(QURT_PMUCNT1); + counters[2] = qurt_pmu_get(QURT_PMUCNT2); + counters[3] = qurt_pmu_get(QURT_PMUCNT3); + counters[4] = qurt_pmu_get(QURT_PMUCNT4); + counters[5] = qurt_pmu_get(QURT_PMUCNT5); + counters[6] = qurt_pmu_get(QURT_PMUCNT6); + counters[7] = qurt_pmu_get(QURT_PMUCNT7); + // qurt_pmu_get_pmucnt(counters); +#endif +} + #endif /* HEX_UTILS_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h index 78455e6b0..f8c89211a 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ctx.h +++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h @@ -10,6 +10,7 @@ #include #include #include +#include #define HTP_MAX_NTHREADS 10 #define HTP_MAX_MMAPS 16 @@ -66,7 +67,9 @@ struct htp_context { int thread_id; int thread_prio; - int hmx_enabled; + bool hmx_enabled; + bool etm; + uint32_t profiler; uint8_t * vtcm_base; size_t vtcm_size; diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index 62d6ec022..56d7b398d 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -42,9 +42,9 @@ enum htp_data_type { // Mask to enable various stages of the Ops. // Used for debugging and profiling. -enum htp_op_mask { - HTP_OPMASK_QUEUE = (1 << 0), // Enable Queueing (ie calls into the DSP) - HTP_OPMASK_COMPUTE = (1 << 1), // Enable Compute +enum htp_op_stage { + HTP_OPSTAGE_QUEUE = (1 << 0), // Enable Queueing (ie calls into NPU) + HTP_OPSTAGE_COMPUTE = (1 << 1), // Enable Compute }; // Do not reorder first 4 (used as an index) @@ -137,27 +137,45 @@ struct htp_op_desc { int32_t params[HTP_OP_MAX_PARAMS]; // Params for the op, e.g. 
epsilon of RMS norm uint16_t src[HTP_OP_MAX_INPUTS]; // Input tensors indices uint16_t dst; // Output tensor index +}; - // the rest is filled in-place by the NPU - uint32_t prof_usecs; // Number of usec per request - uint32_t prof_cycles; // Number of cycles per request - uint32_t prof_pkts; // Number of instruction packets per request - uint32_t unused; +enum htp_profiler_mode { + HTP_PROF_DISABLED = 0, + HTP_PROF_BASIC = 1, + HTP_PROF_PMU = 2, +}; + +#define HTP_PROF_PMU_NCNT 8 + +// Profile descriptor +struct htp_prof_desc { + uint32_t opcode; // GGML/HTP Op + uint32_t usecs; // Number of usec + uint32_t cycles; // Number of cycles + uint32_t pad; // Unused + uint32_t pmu[HTP_PROF_PMU_NCNT]; // PMU counters }; struct htp_opbatch_req { + uint32_t id; // Batch id uint32_t n_bufs; // Number of buffers uint32_t n_tensors; // Number of tensors uint32_t n_ops; // Number of ops uint32_t flags; // unused + uint32_t pad; // unused // struct htp_buf_desc bufs[]; -- dspqueue buf 0 // struct htp_tensor tensors[]; -- dspqueue buf 0 // struct htp_op_desc ops[]; -- dspqueue buf 0 }; struct htp_opbatch_rsp { + uint32_t id; // Batch id uint32_t status; // HTP_STATUS_... - // struct htp_op_req ops[]; -- dspqueue buf 0 + uint32_t n_bufs; // Number of buffers + uint32_t n_tensors; // Number of tensors + uint32_t n_ops; // Number of op profile descriptors + uint32_t pad; // unused + // struct htp_prof_desc profs[]; -- dspqueue buf 0 }; #endif /* HTP_OPS_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp_iface.idl b/ggml/src/ggml-hexagon/htp/htp_iface.idl index 3eb5d5a69..dbcafd1d8 100644 --- a/ggml/src/ggml-hexagon/htp/htp_iface.idl +++ b/ggml/src/ggml-hexagon/htp/htp_iface.idl @@ -6,13 +6,17 @@ #include "AEEStdDef.idl" #include "remote.idl" +struct htp_iface_pmu_conf { + uint32 events[8]; +}; + interface htp_iface : remote_handle64 { AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx); AEEResult stop(); AEEResult mmap(in uint32 fd, in uint32 size, in uint32 pinned); AEEResult munmap(in uint32 fd); - AEEResult enable_etm(); - AEEResult disable_etm(); + AEEResult profiler(in uint32 mode, in htp_iface_pmu_conf pmu); + AEEResult etm(in uint32 enable); }; #endif /* HTP_IDL */ diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index 9185c9ffe..088434a63 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -27,6 +27,7 @@ #include "htp-ctx.h" #include "htp-ops.h" #include "htp-ops.h" +#include "htp_iface.h" #include "worker-pool.h" AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) { @@ -103,6 +104,54 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) { return AEE_SUCCESS; } +AEEResult htp_iface_etm(remote_handle64 handle, uint32_t enable) { + int err = enable ? 
HAP_user_etm_enable() : HAP_user_etm_disable();
+    if (err) {
+        if (err == AEE_EVERSIONNOTSUPPORT) {
+            FARF(ERROR, "API HAP_user_etm_enable/disable is not supported\n");
+        } else {
+            FARF(ERROR, "Error executing HAP_user_etm_enable/disable with error code : 0x%x\n", err);
+        }
+    }
+    return err;
+}
+
+AEEResult htp_iface_profiler(remote_handle64 handle, uint32_t mode, const htp_iface_pmu_conf* pmu_conf) {
+    struct htp_context * ctx = (struct htp_context *) handle;
+    if (!ctx) {
+        return AEE_EBADPARM;
+    }
+
+    if (mode == HTP_PROF_PMU) {
+        const uint32_t* events = pmu_conf->events;
+
+        // Pack 4 event IDs (low 8 bits) into each 32-bit config register
+        uint32_t evtcfg = 0, evtcfg1 = 0, cfg = 0, i = 0;
+        for (; i < HEX_NUM_PMU_COUNTERS/2; i++) {
+            evtcfg  |= ((events[i + 0] & 0xFF) << (i * 8));
+            evtcfg1 |= ((events[i + 4] & 0xFF) << (i * 8));
+        }
+
+        // For events >255 pack high 2 bits of all 8 event IDs into cfg register
+        // 2 bits per counter: bits [1:0] for counter 0, [3:2] for counter 1, etc.
+        for (i = 0; i < HEX_NUM_PMU_COUNTERS; i++) {
+            cfg |= (((events[i] >> 8) & 3) << (i * 2));
+        }
+
+        FARF(ALWAYS, "Configuring PMU registers: evtcfg = 0x%x, evtcfg1 = 0x%x, pmucfg = 0x%x", evtcfg, evtcfg1, cfg);
+
+        // Configure PMU registers
+        qurt_pmu_set(QURT_PMUCFG, cfg);
+        qurt_pmu_set(QURT_PMUEVTCFG, evtcfg);
+        qurt_pmu_set(QURT_PMUEVTCFG1, evtcfg1);
+        qurt_pmu_enable(1);
+    }
+
+    ctx->profiler = mode;
+
+    return AEE_SUCCESS;
+}
+
 AEEResult htp_iface_close(remote_handle64 handle) {
     struct htp_context * ctx = (struct htp_context *) handle;
 
@@ -129,35 +178,19 @@ AEEResult htp_iface_close(remote_handle64 handle) {
         }
     }
 
+    if (ctx->profiler) {
+        qurt_pmu_enable(0); // disable the PMU that htp_iface_profiler() enabled
+    }
+
+    if (ctx->etm) {
+        HAP_user_etm_disable();
+    }
+
     free(ctx);
 
     return AEE_SUCCESS;
 }
 
-AEEResult htp_iface_enable_etm(remote_handle64 handle) {
-    int err = HAP_user_etm_enable();
-    if (err) {
-        if (err == AEE_EVERSIONNOTSUPPORT) {
-            FARF(ERROR, "API HAP_user_etm_enable is not supported\n");
-        } else {
-            FARF(ERROR, "Error executing HAP_user_etm_enable with error code : 0x%x\n", err);
-        }
-    }
-    return err;
-}
-
-AEEResult htp_iface_disable_etm(remote_handle64 handle) {
-    int err = HAP_user_etm_disable();
-    if (err) {
-        if (err == AEE_EVERSIONNOTSUPPORT) {
-            FARF(ERROR, "API HAP_user_etm_disable is not supported\n");
-        } else {
-            FARF(ERROR, "Error executing HAP_user_etm_disable with error code : 0x%x\n", err);
-        }
-    }
-    return err;
-}
-
-AEEResult htp_iface_mmap(remote_handle64 handle, int fd, uint32_t size, uint32_t pinned) {
+AEEResult htp_iface_mmap(remote_handle64 handle, uint32 fd, uint32 size, uint32 pinned) {
     struct htp_context * ctx = (struct htp_context *) handle;
     if (!ctx) {
         return AEE_EBADPARM;
@@ -204,7 +237,7 @@ AEEResult htp_iface_mmap(remote_handle64 handle, int fd, uint32_t size, uint32_t
     return AEE_ENOMEMORY;
 }
 
-AEEResult htp_iface_munmap(remote_handle64 handle, int fd) {
+AEEResult htp_iface_munmap(remote_handle64 handle, uint32 fd) {
     struct htp_context * ctx = (struct htp_context *) handle;
     if (!ctx) {
         return AEE_EBADPARM;
@@ -434,19 +467,39 @@ static void htp_error_callback(dspqueue_t queue, int error, void * context) {
 struct profile_data {
     uint64_t usecs;
     uint64_t cycles;
-    uint64_t pkts;
+    uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS];
 };
 
-static inline void profile_start(struct profile_data * d) {
-    d->usecs = HAP_perf_get_qtimer_count();
-    d->cycles = hex_get_cycles();
-    d->pkts = hex_get_pktcnt();
+static inline void profile_start(uint32_t mode, struct profile_data * d) {
+    switch (mode) {
+        case HTP_PROF_PMU:
hex_get_pmu(d->pmu_counters); + // fallthrough + case HTP_PROF_BASIC: + d->usecs = HAP_perf_get_qtimer_count(); + d->cycles = hex_get_cycles(); + break; + default: + break; + } } -static inline void profile_stop(struct profile_data * d) { - d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs); - d->cycles = hex_get_cycles() - d->cycles; - d->pkts = hex_get_pktcnt() - d->pkts; +static inline void profile_stop(uint32_t mode, struct profile_data * d) { + uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS]; + switch (mode) { + case HTP_PROF_PMU: + hex_get_pmu(pmu_counters); + for (int i = 0; i < HEX_NUM_PMU_COUNTERS; i++) { + d->pmu_counters[i] = pmu_counters[i] - d->pmu_counters[i]; + } + // fallthrough + case HTP_PROF_BASIC: + d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs); + d->cycles = hex_get_cycles() - d->cycles; + break; + default: + break; + } } static int execute_op(struct htp_ops_context * octx) { @@ -726,30 +779,33 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) { continue; } + // Reset poll count for valid requests + poll_count = DSPQUEUE_POLL_COUNT; + const uint32_t n_bufs = req.n_bufs; const uint32_t n_tens = req.n_tensors; const uint32_t n_ops = req.n_ops; - const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs; - const uint32_t t_size = sizeof(struct htp_tensor) * n_tens; - const uint32_t o_size = sizeof(struct htp_op_desc) * n_ops; + const uint32_t b_size = sizeof(struct htp_buf_desc) * n_bufs; + const uint32_t t_size = sizeof(struct htp_tensor) * n_tens; + const uint32_t o_size = sizeof(struct htp_op_desc) * n_ops; + const uint32_t p_size = sizeof(struct htp_prof_desc) * n_ops; - if (dbuf.size < b_size + t_size + o_size) { + if (dbuf.size < b_size + t_size + o_size + p_size) { FARF(ERROR, "invalid opbatch memory block size %u", dbuf.size); break; } - // Reset poll count for valid requests - poll_count = DSPQUEUE_POLL_COUNT; - - uint8_t * m_ptr = dbuf.ptr; - struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size; - struct htp_tensor* tens = (struct htp_tensor*) m_ptr; m_ptr += t_size; - struct htp_op_desc* ops = (struct htp_op_desc*) m_ptr; - - FARF(HIGH, "processing opbatch: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u", + FARF(HIGH, "processing opbatch #%u: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u", req.id, n_bufs, n_tens, n_ops, dbuf.size, b_size, t_size, o_size); + // Setup descriptor pointers + uint8_t * m_ptr = dbuf.ptr; + struct htp_buf_desc* bufs = (struct htp_buf_desc*) m_ptr; m_ptr += b_size; + struct htp_tensor* tens = (struct htp_tensor*) m_ptr; m_ptr += t_size; + struct htp_op_desc* ops = (struct htp_op_desc*) m_ptr; m_ptr += o_size; + struct htp_prof_desc* pds = (struct htp_prof_desc*) m_ptr; + prep_op_bufs(ctx, bufs, n_bufs); prep_tensors(ctx, bufs, tens, n_tens); @@ -760,22 +816,34 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) { for (uint32_t i=0; i < n_ops; i++) { struct profile_data prof; - profile_start(&prof); + + profile_start(ctx->profiler, &prof); proc_op_req(octx, tens, i, &ops[i]); - profile_stop(&prof); - ops[i].prof_usecs = prof.usecs; - ops[i].prof_cycles = prof.cycles; - ops[i].prof_pkts = prof.pkts; + profile_stop(ctx->profiler, &prof); + + if (ctx->profiler) { + pds[i].opcode = ops[i].opcode; + pds[i].usecs = prof.usecs; + pds[i].cycles = prof.cycles; + for (int j = 0; j < HEX_NUM_PMU_COUNTERS; j++) { + pds[i].pmu[j] = prof.pmu_counters[j]; + } + 
} } // dspqueue_write_early_wakeup_noblock(ctx->queue, 10, 0); struct htp_opbatch_rsp rsp; - rsp.status = HTP_STATUS_OK; // FIXME + rsp.id = req.id; + rsp.status = HTP_STATUS_OK; + rsp.n_bufs = n_bufs; + rsp.n_tensors = n_tens; + rsp.n_ops = n_ops; dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT; + err = dspqueue_write(queue, 0, 1, &dbuf, sizeof(rsp), (const uint8_t *) &rsp, DSPQUEUE_TIMEOUT_NONE); if (err != 0) { FARF(ERROR, "dspqueue_write failed: 0x%08x", (unsigned) err); diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index bac06693d..a0c265132 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -3017,6 +3017,10 @@ int op_matmul(struct htp_ops_context * octx) { const int act_stride = (int)(src1->nb[1] / sizeof(float)); const int wgt_stride = (int)(src0->nb[1] / sizeof(__fp16)); + if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) { + return HTP_STATUS_OK; + } + if (src0->type == HTP_TYPE_F16) { if (is_batched) { hmx_matmul_w16a32_batched_params_t batch_params = { diff --git a/scripts/snapdragon/adb/run-bench.sh b/scripts/snapdragon/adb/run-bench.sh index 36c908da7..27459df24 100755 --- a/scripts/snapdragon/adb/run-bench.sh +++ b/scripts/snapdragon/adb/run-bench.sh @@ -23,10 +23,10 @@ verbose= [ "$V" != "" ] && verbose="GGML_HEXAGON_VERBOSE=$V" cli_opts="$cli_opts -v" profile= -[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v" +[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF" cli_opts="$cli_opts -v" opmask= -[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK" +[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE" nhvx= [ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX" diff --git a/scripts/snapdragon/adb/run-cli.sh b/scripts/snapdragon/adb/run-cli.sh index 901d7eff1..e1f0ac0eb 100755 --- a/scripts/snapdragon/adb/run-cli.sh +++ b/scripts/snapdragon/adb/run-cli.sh @@ -28,10 +28,10 @@ sched= [ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v" profile= -[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v" +[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF" cli_opts="$cli_opts -v" opmask= -[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK" +[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE" nhvx= [ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX" diff --git a/scripts/snapdragon/adb/run-completion.sh b/scripts/snapdragon/adb/run-completion.sh index f7290825a..7b84106dc 100755 --- a/scripts/snapdragon/adb/run-completion.sh +++ b/scripts/snapdragon/adb/run-completion.sh @@ -28,10 +28,10 @@ sched= [ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v" profile= -[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1" cli_opts="$cli_opts -v" +[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF" cli_opts="$cli_opts -v" opmask= -[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK" +[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE" nhvx= [ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX" diff --git a/scripts/snapdragon/adb/run-mtmd.sh b/scripts/snapdragon/adb/run-mtmd.sh index 0c1cf8928..38467beba 100755 --- a/scripts/snapdragon/adb/run-mtmd.sh +++ b/scripts/snapdragon/adb/run-mtmd.sh @@ -37,10 +37,10 @@ sched= [ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v" profile= -[ "$PROF" != "" ] && 
profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"
+[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF"
 
 opmask=
-[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
+[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
 
 nhvx=
 [ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
diff --git a/scripts/snapdragon/adb/run-tool.sh b/scripts/snapdragon/adb/run-tool.sh
index 70ed407e8..27cbb2b6d 100755
--- a/scripts/snapdragon/adb/run-tool.sh
+++ b/scripts/snapdragon/adb/run-tool.sh
@@ -25,10 +25,10 @@ sched=
 [ "$SCHED" != "" ] && sched="GGML_SCHED_DEBUG=2" cli_opts="$cli_opts -v"
 
 profile=
-[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF GGML_HEXAGON_OPSYNC=1"
+[ "$PROF" != "" ] && profile="GGML_HEXAGON_PROFILE=$PROF"
 
 opmask=
-[ "$OPMASK" != "" ] && opmask="GGML_HEXAGON_OPMASK=$OPMASK"
+[ "$OPSTAGE" != "" ] && opmask="GGML_HEXAGON_OPSTAGE=$OPSTAGE"
 
 nhvx=
 [ "$NHVX" != "" ] && nhvx="GGML_HEXAGON_NHVX=$NHVX"
diff --git a/scripts/snapdragon/ggml-hexagon-profile.py b/scripts/snapdragon/ggml-hexagon-profile.py
new file mode 100755
index 000000000..3edaacd27
--- /dev/null
+++ b/scripts/snapdragon/ggml-hexagon-profile.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+
+import sys
+import os
+import re
+import argparse
+import statistics
+import logging
+
+from collections import defaultdict
+
+# Mapping of cli-friendly names to (internal_data_key, Display Header, numeric_sort_key)
+COL_MAP = {
+    "op": ("op", "Op", "op"),
+    "dims": ("dims", "Dims", "dims"),
+    "dtypes": ("dtypes", "DTypes", "dtypes"),
+    "count": ("count", "Count", "_sort_count"),
+    "max-usec": ("max_usec", "Max usec", "_sort_max_usec"),
+    "avg-usec": ("avg_usec", "Avg usec", "_sort_avg_usec"),
+    "max-cycles": ("max_cycles", "Max Cycles", "_sort_max_cycles"),
+    "avg-cycles": ("avg_cycles", "Avg Cycles", "_sort_avg_cycles"),
+    "max-pmu": ("max_pmu", "Max PMU", "_sort_max_pmu"),
+    "avg-pmu": ("avg_pmu", "Avg PMU", "_sort_avg_pmu"),
+}
+
+op_pattern = re.compile(
+    r"profile-op\s+(?P<op_name>[A-Z_0-9]+):\s+.*?\s+:\s+(?P<dims>[\d:x\s\->!]+)\s+:\s+(?P<types>[a-z\d_\s\->x]+)\s+:\s+.*?\s+usec\s+(?P<usec>\d+)\s+cycles\s+(?P<cycles>\d+)(?:\s+pmu\s+\[(?P<pmu>[\d,\s]+)\])?"
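+    # Matches the host-side profiler log lines, e.g. (fields elided):
+    #   ggml-hex: <sess> profile-op MUL_MAT: <names> : <dims> : <types> : <strides> : usec 103 cycles 51234 pmu [..]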
+) + +logger = logging.getLogger("ggml-hexagon-profile") + + +def parse_log(file_path, pmu_index=None): + try: + if file_path != "-": + f = open(file_path, 'r', encoding='utf-8', errors='ignore') + else: + f = os.fdopen(0, 'r', encoding='utf-8', errors='ignore') + except FileNotFoundError: + logger.error(f"file '{file_path}' not found.") + sys.exit(1) + + all_ops = [] + for line in f: + match = op_pattern.search(line) + if not match: continue + + pmu_raw = match.group('pmu') + pmu_val = None + if pmu_raw and pmu_index is not None: + try: + pmu_list = [int(x.strip()) for x in pmu_raw.split(',')] + if len(pmu_list) > pmu_index: + pmu_val = pmu_list[pmu_index] + except (ValueError, IndexError): + pmu_val = None + + all_ops.append({ + 'name': match.group('op_name'), + 'dims': match.group('dims').strip(), + 'types': match.group('types').strip(), + 'usec': int(match.group('usec')), + 'cycles': int(match.group('cycles')), + 'pmu_val': pmu_val + }) + + f.close() + + return all_ops + + +def generate_report(ops, top_n, width_overrides, sort_col, pmu_name=None): + if not ops: + logger.info("No valid records found.") + return + + grouped = defaultdict(list) + for op in ops: + key = (op['name'], op['dims'], op['types']) + grouped[key].append(op) + + group_stats = [] + for (name, dims, types), group_ops in grouped.items(): + usecs = [o['usec'] for o in group_ops] + cycles = [o['cycles'] for o in group_ops] + pmu_vals = [o['pmu_val'] for o in group_ops if o['pmu_val'] is not None] + + group_stats.append({ + 'op': name, + 'dims': dims, + 'dtypes': types, + 'count': str(len(group_ops)), + 'max_usec': str(max(usecs)), + 'avg_usec': f"{statistics.mean(usecs):.2f}", + 'max_cycles': str(max(cycles)), + 'avg_cycles': f"{statistics.mean(cycles):.2f}", + 'max_pmu': str(max(pmu_vals)) if pmu_vals else "0", + 'avg_pmu': f"{statistics.mean(pmu_vals):.2f}" if pmu_vals else "0.00", + # Numeric values for accurate sorting + '_sort_count': len(group_ops), + '_sort_max_usec': max(usecs), + '_sort_avg_usec': statistics.mean(usecs), + '_sort_max_cycles': max(cycles), + '_sort_avg_cycles': statistics.mean(cycles), + '_sort_max_pmu': max(pmu_vals) if pmu_vals else 0, + '_sort_avg_pmu': statistics.mean(pmu_vals) if pmu_vals else 0 + }) + + # Sorting logic + actual_sort_key = COL_MAP[sort_col][2] + # We sort numeric fields descending, strings (op/dims) ascending + is_numeric = actual_sort_key.startswith("_") or actual_sort_key == "count" + sorted_groups = sorted(group_stats, key=lambda x: x[actual_sort_key], reverse=is_numeric)[:top_n] + + # Define initial column order + active_cols = ["op", "dims", "dtypes"] + if pmu_name: + active_cols += ["max-pmu", "avg-pmu"] + active_cols += ["max-usec", "avg-usec", "max-cycles", "avg-cycles", "count"] + + final_headers, final_keys, final_widths = [], [], [] + + for col_name in active_cols: + data_key, header_text, _ = COL_MAP[col_name] + if "pmu" in col_name and pmu_name: + header_text = header_text.replace("PMU", pmu_name) + + natural_width = max([len(row[data_key]) for row in sorted_groups] + [len(header_text)]) + target_width = width_overrides.get(col_name, natural_width) + + if target_width == 0: + continue + + final_headers.append(header_text) + final_keys.append(data_key) + final_widths.append(target_width) + + # Print Report + logger.info(f"\n# Profile Report (Top {top_n} Ops sorted by {sort_col})\n") + header_line = "| " + " | ".join(f"{h:<{final_widths[i]}}" for i, h in enumerate(final_headers)) + " |" + sep_line = "| " + " | ".join("-" * final_widths[i] for i in 
range(len(final_headers))) + " |" + logger.info(header_line) + logger.info(sep_line) + + for group in sorted_groups: + row_vals = [] + for i, key in enumerate(final_keys): + val = group[key] + if len(val) > final_widths[i]: + val = val[:final_widths[i] - 3] + "..." + row_vals.append(f"{val:<{final_widths[i]}}") + logger.info("| " + " | ".join(row_vals) + " |") + + +def main(): + parser = argparse.ArgumentParser(description="Post-process Op profile info.") + parser.add_argument("logfile") + parser.add_argument("-n", "--top", type=int, default=100) + parser.add_argument("--sort", type=str, default="max-usec", choices=list(COL_MAP.keys())) + parser.add_argument("--pmu-index", type=int) + parser.add_argument("--pmu-name", type=str) + parser.add_argument("--width", action='append', default=['dims:40'], help="Override column width, e.g. --width dims:50") + + args = parser.parse_args() + + logging.basicConfig(level=logging.INFO, format='%(message)s') + + # Sort validation: can't sort by PMU if index isn't provided + if "pmu" in args.sort and args.pmu_index is None: + logger.error(f"Cannot sort by '{args.sort}' without --pmu-index.") + sys.exit(1) + + overrides = {} + if args.width: + for w in args.width: + try: + name, val = w.split(':') + overrides[name.lower()] = int(val) + except ValueError: + logger.warning(f"Invalid width format '{w}'") + + final_pmu_name = (args.pmu_name or f"#{args.pmu_index}") if args.pmu_index is not None else None + ops = parse_log(args.logfile, pmu_index=args.pmu_index) + generate_report(ops, args.top, overrides, args.sort, pmu_name=final_pmu_name) + + +if __name__ == "__main__": + main() diff --git a/scripts/snapdragon/windows/run-bench.ps1 b/scripts/snapdragon/windows/run-bench.ps1 index 5a3a9074d..8bf6939d2 100644 --- a/scripts/snapdragon/windows/run-bench.ps1 +++ b/scripts/snapdragon/windows/run-bench.ps1 @@ -21,11 +21,11 @@ if ($null -ne $env:V) { } if ($null -ne $env:PROF) { - $env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1 + $env:GGML_HEXAGON_PROFILE=$env:PROF } -if ($null -ne $env:OPMASK) { - $env:GGML_HEXAGON_OPMASK=$env:OPMASK +if ($null -ne $env:OPSTAGE) { + $env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE } if ($null -ne $env:NHVX) { diff --git a/scripts/snapdragon/windows/run-cli.ps1 b/scripts/snapdragon/windows/run-cli.ps1 index c64aaf725..104452f9b 100644 --- a/scripts/snapdragon/windows/run-cli.ps1 +++ b/scripts/snapdragon/windows/run-cli.ps1 @@ -25,11 +25,11 @@ if ($null -ne $env:SCHED) { } if ($null -ne $env:PROF) { - $env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1 + $env:GGML_HEXAGON_PROFILE=$env:PROF } -if ($null -ne $env:OPMASK) { - $env:GGML_HEXAGON_OPMASK=$env:OPMASK +if ($null -ne $env:OPSTAGE) { + $env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE } if ($null -ne $env:NHVX) { diff --git a/scripts/snapdragon/windows/run-completion.ps1 b/scripts/snapdragon/windows/run-completion.ps1 index a896cd352..5841a82fa 100644 --- a/scripts/snapdragon/windows/run-completion.ps1 +++ b/scripts/snapdragon/windows/run-completion.ps1 @@ -25,11 +25,11 @@ if ($null -ne $env:SCHED) { } if ($null -ne $env:PROF) { - $env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1 + $env:GGML_HEXAGON_PROFILE=$env:PROF } -if ($null -ne $env:OPMASK) { - $env:GGML_HEXAGON_OPMASK=$env:OPMASK +if ($null -ne $env:OPSTAGE) { + $env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE } if ($null -ne $env:NHVX) { diff --git a/scripts/snapdragon/windows/run-mtmd.ps1 b/scripts/snapdragon/windows/run-mtmd.ps1 index f230ac5a6..be8178751 100644 --- a/scripts/snapdragon/windows/run-mtmd.ps1 
+++ b/scripts/snapdragon/windows/run-mtmd.ps1 @@ -34,11 +34,11 @@ if ($null -ne $env:SCHED) { } if ($null -ne $env:PROF) { - $env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1 + $env:GGML_HEXAGON_PROFILE=$env:PROF } -if ($null -ne $env:OPMASK) { - $env:GGML_HEXAGON_OPMASK=$env:OPMASK +if ($null -ne $env:OPSTAGE) { + $env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE } if ($null -ne $env:NHVX) { diff --git a/scripts/snapdragon/windows/run-tool.ps1 b/scripts/snapdragon/windows/run-tool.ps1 index 39edbfcf7..15c880f2d 100644 --- a/scripts/snapdragon/windows/run-tool.ps1 +++ b/scripts/snapdragon/windows/run-tool.ps1 @@ -31,11 +31,11 @@ if ($null -ne $env:SCHED) { } if ($null -ne $env:PROF) { - $env:GGML_HEXAGON_PROFILE=$env:PROF; $env:GGML_HEXAGON_OPSYNC=1 + $env:GGML_HEXAGON_PROFILE=$env:PROF } -if ($null -ne $env:OPMASK) { - $env:GGML_HEXAGON_OPMASK=$env:OPMASK +if ($null -ne $env:OPSTAGE) { + $env:GGML_HEXAGON_OPSTAGE=$env:OPSTAGE } if ($null -ne $env:NHVX) { From fa0b8a70a8d686cd11bad1579080d546d681a53e Mon Sep 17 00:00:00 2001 From: Ethan Turner Date: Thu, 23 Apr 2026 15:53:23 -0700 Subject: [PATCH 22/35] cli: Remove redundant local sampling variables (#20429) (#22264) This change implements the third requested change in issue 20429. Because defaults.sampling contains the reasoning budget token count and the reasoning budget message, it's not necessary to assign them to struct variables. --- tools/cli/cli.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index cd635a624..369c24216 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -59,8 +59,6 @@ struct cli_context { std::vector input_files; task_params defaults; bool verbose_prompt; - int reasoning_budget = -1; - std::string reasoning_budget_message; // thread for showing "loading" animation std::atomic loading_show; @@ -77,8 +75,6 @@ struct cli_context { // defaults.return_progress = true; // TODO: show progress verbose_prompt = params.verbose_prompt; - reasoning_budget = params.sampling.reasoning_budget_tokens; - reasoning_budget_message = params.sampling.reasoning_budget_message; } std::string generate_completion(result_timings & out_timings) { @@ -106,7 +102,7 @@ struct cli_context { const llama_vocab * vocab = llama_model_get_vocab( llama_get_model(ctx_server.get_llama_context())); - task.params.sampling.reasoning_budget_tokens = reasoning_budget; + task.params.sampling.reasoning_budget_tokens = defaults.sampling.reasoning_budget_tokens; task.params.sampling.generation_prompt = chat_params.generation_prompt; if (!chat_params.thinking_start_tag.empty()) { @@ -116,7 +112,7 @@ struct cli_context { task.params.sampling.reasoning_budget_end = common_tokenize(vocab, chat_params.thinking_end_tag, false, true); task.params.sampling.reasoning_budget_forced = - common_tokenize(vocab, reasoning_budget_message + chat_params.thinking_end_tag, false, true); + common_tokenize(vocab, defaults.sampling.reasoning_budget_message + chat_params.thinking_end_tag, false, true); } rd.post_task({std::move(task)}); From e5f070a1dca19baf3ae983273846b9a8c7c4231f Mon Sep 17 00:00:00 2001 From: Chen Yuan Date: Thu, 23 Apr 2026 19:32:59 -0400 Subject: [PATCH 23/35] fix(shader): handle the buffer aliasing for rms fuse (#22266) --- ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp | 14 ++++++++++---- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 6 ++++-- .../ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl | 17 ++++++++++++++++- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git 
a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp index efc5b8c97..449eae808 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp @@ -197,11 +197,12 @@ struct ggml_webgpu_row_norm_pipeline_key_hash { /** RMS_NORM + MUL **/ struct ggml_webgpu_rms_norm_mul_pipeline_key { - bool inplace; - bool src_overlap; + bool inplace; // rn_src == dst + bool overlap; // mul_src == dst + bool src_overlap; // rn_src == mul_src bool operator==(const ggml_webgpu_rms_norm_mul_pipeline_key & other) const { - return inplace == other.inplace && src_overlap == other.src_overlap; + return inplace == other.inplace && overlap == other.overlap && src_overlap == other.src_overlap; } }; @@ -209,6 +210,7 @@ struct ggml_webgpu_rms_norm_mul_pipeline_key_hash { size_t operator()(const ggml_webgpu_rms_norm_mul_pipeline_key & key) const { size_t seed = 0; ggml_webgpu_hash_combine(seed, key.inplace); + ggml_webgpu_hash_combine(seed, key.overlap); ggml_webgpu_hash_combine(seed, key.src_overlap); return seed; } @@ -556,7 +558,7 @@ inline uint32_t ggml_webgpu_flash_attn_max_kv_tile(const ggml_webgpu_shader_lib_ const size_t q_tile = context.sg_mat_m; const size_t base_q_bytes = (key.head_dim_qk + key.head_dim_v) * q_tile * GGML_WEBGPU_F16_SIZE_BYTES + 2 * q_tile * GGML_WEBGPU_F32_SIZE_BYTES; - size_t bytes_per_kv = 0; + size_t bytes_per_kv = 0; if (!key.kv_direct) { bytes_per_kv += std::max(key.head_dim_qk, key.head_dim_v); } @@ -1878,6 +1880,7 @@ class ggml_webgpu_shader_lib { webgpu_pipeline get_rms_norm_mul_pipeline(const ggml_webgpu_shader_lib_context & context) { ggml_webgpu_rms_norm_mul_pipeline_key key = {}; key.inplace = context.inplace; + key.overlap = context.overlap; key.src_overlap = context.src_overlap; auto it = rms_norm_mul_pipelines.find(key); @@ -1892,6 +1895,9 @@ class ggml_webgpu_shader_lib { if (key.inplace) { defines.push_back("INPLACE"); variant += "_inplace"; + } else if (key.overlap) { + defines.push_back("OVERLAP"); + variant += "_overlap"; } else if (key.src_overlap) { defines.push_back("SRC_OVERLAP"); variant += "_src_overlap"; diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index bcca2bd46..acc486cfd 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -2071,8 +2071,9 @@ static std::optional ggml_webgpu_rms_norm_mul(webgpu_context GGML_ABORT("rms_norm must be equal to the one of mul_src0 and mul_src1"); } - bool inplace = (ggml_webgpu_tensor_equal(rn_dst, mul_src0) && ggml_webgpu_tensor_equal(mul_src1, dst)) || + bool overlap = (ggml_webgpu_tensor_equal(rn_dst, mul_src0) && ggml_webgpu_tensor_equal(mul_src1, dst)) || (ggml_webgpu_tensor_equal(rn_dst, mul_src1) && ggml_webgpu_tensor_equal(mul_src0, dst)); + bool inplace = ggml_webgpu_tensor_equal(rn_src, dst); bool src_overlap = ggml_webgpu_tensor_overlap(rn_src, mul_src); uint32_t offset_merged_rn_src = 0; @@ -2116,7 +2117,7 @@ static std::optional ggml_webgpu_rms_norm_mul(webgpu_context std::vector entries; - if (inplace) { + if (inplace || overlap) { entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, rn_src)); entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, mul_src)); } else if (src_overlap) { @@ -2136,6 +2137,7 @@ static std::optional ggml_webgpu_rms_norm_mul(webgpu_context ggml_webgpu_shader_lib_context shader_lib_ctx = {}; shader_lib_ctx.max_wg_size = 
ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup;
     shader_lib_ctx.inplace     = inplace;
+    shader_lib_ctx.overlap     = overlap;
     shader_lib_ctx.src_overlap = src_overlap;
 
     webgpu_pipeline pipeline = ctx->shader_lib->get_rms_norm_mul_pipeline(shader_lib_ctx);
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl
index 71f063b51..74aaa2753 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm_mul.wgsl
@@ -1,4 +1,4 @@
-#ifdef INPLACE
+#ifdef OVERLAP
 
 @group(0) @binding(0)
 var rn_src: array;
@@ -13,6 +13,21 @@ fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32)
     mul_src[dst_offset] = scale * rn_src[rn_src_offset] * mul_src[mul_src_offset];
 }
 
+#elif INPLACE
+
+@group(0) @binding(0)
+var rn_src: array;
+
+@group(0) @binding(1)
+var mul_src: array;
+
+@group(0) @binding(2)
+var params: Params;
+
+fn update(rn_src_offset: u32, dst_offset: u32, scale: f32, mul_src_offset: u32) {
+    rn_src[dst_offset] = scale * rn_src[rn_src_offset] * mul_src[mul_src_offset];
+}
+
 #elif SRC_OVERLAP
 
 @group(0) @binding(0)

From 8bc492ebb407dfc8a61b8d92eced1b094ac1c823 Mon Sep 17 00:00:00 2001
From: Mengsheng Wu
Date: Fri, 24 Apr 2026 09:39:13 +0800
Subject: [PATCH 24/35] hexagon: add SOLVE_TRI op (#21974)

* hexagon: add SOLVE_TRI op
* ggml: fix TODO description for solve_tri
* hexagon: rm unused variable/function warnings
* hexagon: chunk vs batch processing for better thread utilization
* hexagon: vectorize partial f32 loads
* hexagon: move HVX f32 add/sub/mul wrappers to hvx-base.h

---------

Co-authored-by: Todor Boinovski

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp    |  39 +++-
 ggml/src/ggml-hexagon/htp/CMakeLists.txt  |   1 +
 ggml/src/ggml-hexagon/htp/htp-ctx.h       |   1 +
 ggml/src/ggml-hexagon/htp/htp-ops.h       |   2 +-
 ggml/src/ggml-hexagon/htp/hvx-base.h      |  24 ++
 ggml/src/ggml-hexagon/htp/main.c          |   3 +
 ggml/src/ggml-hexagon/htp/solve-tri-ops.c | 267 ++++++++++++++++++++++
 7 files changed, 335 insertions(+), 2 deletions(-)
 create mode 100644 ggml/src/ggml-hexagon/htp/solve-tri-ops.c

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 955903418..0d9b5e289 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2693,6 +2693,39 @@ static bool ggml_hexagon_supported_diag(const struct ggml_hexagon_session * sess
     return true;
 }
 
+static bool ggml_hexagon_supported_solve_tri(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) {
+    const struct ggml_tensor * src0 = op->src[0]; // A
+    const struct ggml_tensor * src1 = op->src[1]; // B
+    const struct ggml_tensor * dst  = op;         // X
+
+    if (!src0 || !src1) {
+        return false;
+    }
+
+    if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
+        return false;
+    }
+
+    if (src0->ne[0] != src0->ne[1]) {
+        return false;
+    }
+
+    if (src0->ne[1] != src1->ne[1]) {
+        return false;
+    }
+
+    if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
+        return false;
+    }
+
+    if (dst->ne[0] != src1->ne[0] || dst->ne[1] != src1->ne[1] || dst->ne[2] != src1->ne[2] || dst->ne[3] != src1->ne[3]) {
+        return false;
+    }
+
+    GGML_UNUSED(sess);
+    return true;
+}
+
 static const char * ggml_backend_hexagon_name(ggml_backend_t backend) {
     auto sess = static_cast<ggml_hexagon_session *>(backend->context);
     return sess->c_name();
 }
@@ -2731,7 +2764,7 @@ static htp_op_code op_remap_to_htp(const ggml_tensor * t) {
         case GGML_OP_CUMSUM: return
HTP_OP_CUMSUM; case GGML_OP_FILL: return HTP_OP_FILL; case GGML_OP_DIAG: return HTP_OP_DIAG; - + case GGML_OP_SOLVE_TRI: return HTP_OP_SOLVE_TRI; case GGML_OP_UNARY: switch (ggml_get_unary_op(t)) { case GGML_UNARY_OP_SILU: return HTP_OP_UNARY_SILU; @@ -3277,6 +3310,10 @@ static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, cons supp = ggml_hexagon_supported_diag(sess, op); break; + case GGML_OP_SOLVE_TRI: + supp = ggml_hexagon_supported_solve_tri(sess, op); + break; + default: break; } diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt index b1ae60a9c..8bd528478 100644 --- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt @@ -36,6 +36,7 @@ add_library(${HTP_LIB} SHARED cumsum-ops.c fill-ops.c diag-ops.c + solve-tri-ops.c ) target_compile_definitions(${HTP_LIB} PRIVATE diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h index f8c89211a..d704fedee 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ctx.h +++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h @@ -103,5 +103,6 @@ int op_ssm_conv(struct htp_ops_context * octx); int op_cumsum(struct htp_ops_context * octx); int op_fill(struct htp_ops_context * octx); int op_diag(struct htp_ops_context * octx); +int op_solve_tri(struct htp_ops_context * octx); #endif /* HTP_CTX_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index 56d7b398d..4397245c5 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -82,7 +82,7 @@ enum htp_op_code { HTP_OP_CUMSUM, HTP_OP_FILL, HTP_OP_DIAG, - + HTP_OP_SOLVE_TRI, HTP_OP_INVALID }; diff --git a/ggml/src/ggml-hexagon/htp/hvx-base.h b/ggml/src/ggml-hexagon/htp/hvx-base.h index ed6026e76..d0926dedd 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-base.h +++ b/ggml/src/ggml-hexagon/htp/hvx-base.h @@ -256,6 +256,18 @@ static inline HVX_Vector hvx_vec_mul_f16_f16(HVX_Vector a, HVX_Vector b) return Q6_Vhf_equals_Wqf32(Q6_Wqf32_vmpy_VhfVhf(a, b)); } +static inline HVX_Vector hvx_vec_add_f32_f32(HVX_Vector a, HVX_Vector b) { + return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b)); +} + +static inline HVX_Vector hvx_vec_sub_f32_f32(HVX_Vector a, HVX_Vector b) { + return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a, b)); +} + +static inline HVX_Vector hvx_vec_mul_f32_f32(HVX_Vector a, HVX_Vector b) { + return Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a, b)); +} + #else static inline HVX_Vector hvx_vec_add_f16_f16(HVX_Vector a, HVX_Vector b) @@ -273,6 +285,18 @@ static inline HVX_Vector hvx_vec_mul_f16_f16(HVX_Vector a, HVX_Vector b) return Q6_Vhf_vmpy_VhfVhf(a, b); } +static inline HVX_Vector hvx_vec_add_f32_f32(HVX_Vector a, HVX_Vector b) { + return Q6_Vsf_vadd_VsfVsf(a, b); +} + +static inline HVX_Vector hvx_vec_sub_f32_f32(HVX_Vector a, HVX_Vector b) { + return Q6_Vsf_vsub_VsfVsf(a, b); +} + +static inline HVX_Vector hvx_vec_mul_f32_f32(HVX_Vector a, HVX_Vector b) { + return Q6_Vsf_vmpy_VsfVsf(a, b); +} + #endif // __HVX_ARCH__ < 79 #endif /* HVX_BASE_H */ diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index 088434a63..db277a25e 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -573,6 +573,9 @@ static int execute_op(struct htp_ops_context * octx) { case HTP_OP_DIAG: return op_diag(octx); + case HTP_OP_SOLVE_TRI: + return op_solve_tri(octx); + case HTP_OP_INVALID: break; diff --git a/ggml/src/ggml-hexagon/htp/solve-tri-ops.c 
b/ggml/src/ggml-hexagon/htp/solve-tri-ops.c
new file mode 100644
index 000000000..ae8e1a504
--- /dev/null
+++ b/ggml/src/ggml-hexagon/htp/solve-tri-ops.c
@@ -0,0 +1,267 @@
+#pragma clang diagnostic ignored "-Wunused-but-set-variable"
+
+#include <HAP_farf.h>
+#include <HAP_perf.h>
+#include <stdint.h>
+
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+#include "htp-ctx.h"
+#include "htp-ops.h"
+#include "hvx-types.h"
+#include "hvx-utils.h"
+
+struct htp_solve_tri_context {
+    struct htp_ops_context * octx;
+    uint32_t                 jobs_per_thread;
+    uint32_t                 total_jobs;
+    uint32_t                 k_chunks;
+    uint32_t                 col_block;
+};
+
+static inline void solve_tri_row_scalar(const float * A_row,
+                                        const float * B_row,
+                                        float *       X,
+                                        uint32_t      row,
+                                        uint32_t      k,
+                                        uint32_t      col0,
+                                        uint32_t      coln,
+                                        float         inv_diag) {
+    for (uint32_t col = col0; col < col0 + coln; ++col) {
+        float sum = 0.0f;
+        for (uint32_t t = 0; t < row; ++t) {
+            sum += A_row[t] * X[t * k + col];
+        }
+        X[row * k + col] = (B_row[col] - sum) * inv_diag;
+    }
+}
+
+static inline HVX_Vector hvx_load_partial_f32(const float * src, uint32_t n) {
+    HVX_Vector     v    = *((const HVX_UVector *) src);
+    HVX_VectorPred mask = Q6_Q_vsetq2_R(n * sizeof(float));
+    return Q6_V_vmux_QVV(mask, v, Q6_V_vzero());
+}
+
+static inline void solve_tri_row_hvx(const float * A_row,
+                                     const float * B_row,
+                                     float *       X,
+                                     uint32_t      row,
+                                     uint32_t      k,
+                                     uint32_t      col0,
+                                     uint32_t      coln,
+                                     float         inv_diag) {
+    const bool full = (coln == VLEN_FP32);
+
+    HVX_Vector sum_v = Q6_V_vzero();
+    for (uint32_t t = 0; t < row; ++t) {
+        const float   a         = A_row[t];
+        const float * x_row_col = X + t * k + col0;
+
+        HVX_Vector x_v = full ? *((const HVX_UVector *) x_row_col) : hvx_load_partial_f32(x_row_col, coln);
+        HVX_Vector a_v = hvx_vec_splat_f32(a);
+        sum_v          = hvx_vec_add_f32_f32(sum_v, hvx_vec_mul_f32_f32(x_v, a_v));
+    }
+
+    const float * b_row_col = B_row + col0;
+    float *       x_out_col = X + row * k + col0;
+
+    HVX_Vector b_v        = full ? *((const HVX_UVector *) b_row_col) : hvx_load_partial_f32(b_row_col, coln);
+    HVX_Vector inv_diag_v = hvx_vec_splat_f32(inv_diag);
+
+    HVX_Vector out_v = hvx_vec_mul_f32_f32(hvx_vec_sub_f32_f32(b_v, sum_v), inv_diag_v);
+    hvx_vec_store_u((void *) x_out_col, coln * sizeof(float), out_v);
+}
+
+// Batch-level thread: each job is one full batch.
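+// Forward substitution: X[row][col] = (B[row][col] - sum_{t<row} A[row][t] * X[t][col]) / A[row][row].
+// The HVX path covers full VLEN_FP32-wide column blocks; the scalar path handles narrow tails.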
+static void solve_tri_batch_thread_f32(unsigned int nth, unsigned int ith, void * data) { + struct htp_solve_tri_context * sctx = (struct htp_solve_tri_context *) data; + struct htp_ops_context * octx = sctx->octx; + + const struct htp_tensor * src0 = octx->src[0]; // A + const struct htp_tensor * src1 = octx->src[1]; // B + const struct htp_tensor * dst = octx->dst; // X + + const uint32_t n = src0->ne[0]; + const uint32_t k = src1->ne[0]; + + const uint32_t ne02 = src0->ne[2]; + + const uint32_t col_block = VLEN_FP32; + const uint32_t k_full = (k / col_block) * col_block; + + const uint32_t start_batch = sctx->jobs_per_thread * ith; + const uint32_t end_batch = MIN(start_batch + sctx->jobs_per_thread, sctx->total_jobs); + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + for (uint32_t batch = start_batch; batch < end_batch; ++batch) { + const uint32_t i03 = batch / ne02; + const uint32_t i02 = batch - i03 * ne02; + + const float * A_batch = + (const float *) ((const uint8_t *) (uintptr_t) src0->data + i02 * src0->nb[2] + i03 * src0->nb[3]); + const float * B_batch = + (const float *) ((const uint8_t *) (uintptr_t) src1->data + i02 * src1->nb[2] + i03 * src1->nb[3]); + float * X_batch = (float *) ((uint8_t *) (uintptr_t) dst->data + i02 * dst->nb[2] + i03 * dst->nb[3]); + + for (uint32_t row = 0; row < n; ++row) { + const float diag = A_batch[row * n + row]; + const float inv_diag = 1.0f / diag; + const float * A_row = A_batch + row * n; + const float * B_row = B_batch + row * k; + + uint32_t col0 = 0; + for (; col0 < k_full; col0 += col_block) { + solve_tri_row_hvx(A_row, B_row, X_batch, row, k, col0, col_block, inv_diag); + } + + if (col0 < k) { + const uint32_t coln = k - col0; + if (coln >= 8) { + solve_tri_row_hvx(A_row, B_row, X_batch, row, k, col0, coln, inv_diag); + } else { + solve_tri_row_scalar(A_row, B_row, X_batch, row, k, col0, coln, inv_diag); + } + } + } + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "solve-tri-batch %d/%d: A=(%ux%u) B=(%ux%u) batch %u:%u usec %u\n", + ith, nth, n, n, k, n, start_batch, end_batch, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +// Chunk-level thread: each job is one (batch, col_chunk) pair. 
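+// Column chunks are independent: X[row][col] depends only on X[t][col] for t < row in the
+// same column, so jobs for different (batch, col_chunk) pairs never read each other's output.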
+static void solve_tri_chunk_thread_f32(unsigned int nth, unsigned int ith, void * data) { + struct htp_solve_tri_context * sctx = (struct htp_solve_tri_context *) data; + struct htp_ops_context * octx = sctx->octx; + + const struct htp_tensor * src0 = octx->src[0]; // A + const struct htp_tensor * src1 = octx->src[1]; // B + const struct htp_tensor * dst = octx->dst; // X + + const uint32_t n = src0->ne[0]; + const uint32_t k = src1->ne[0]; + + const uint32_t ne02 = src0->ne[2]; + + const uint32_t start_job = sctx->jobs_per_thread * ith; + const uint32_t end_job = MIN(start_job + sctx->jobs_per_thread, sctx->total_jobs); + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + for (uint32_t job = start_job; job < end_job; ++job) { + const uint32_t batch = job / sctx->k_chunks; + const uint32_t chunk = job - batch * sctx->k_chunks; + + const uint32_t i03 = batch / ne02; + const uint32_t i02 = batch - i03 * ne02; + + const uint32_t col0 = chunk * sctx->col_block; + const uint32_t coln = MIN(sctx->col_block, k - col0); + + const float * A_batch = + (const float *) ((const uint8_t *) (uintptr_t) src0->data + i02 * src0->nb[2] + i03 * src0->nb[3]); + const float * B_batch = + (const float *) ((const uint8_t *) (uintptr_t) src1->data + i02 * src1->nb[2] + i03 * src1->nb[3]); + float * X_batch = (float *) ((uint8_t *) (uintptr_t) dst->data + i02 * dst->nb[2] + i03 * dst->nb[3]); + + const bool use_hvx = (coln >= 8); + + for (uint32_t row = 0; row < n; ++row) { + const float diag = A_batch[row * n + row]; + const float inv_diag = 1.0f / diag; + + const float * A_row = A_batch + row * n; + const float * B_row = B_batch + row * k; + + if (use_hvx) { + solve_tri_row_hvx(A_row, B_row, X_batch, row, k, col0, coln, inv_diag); + } else { + solve_tri_row_scalar(A_row, B_row, X_batch, row, k, col0, coln, inv_diag); + } + } + } + + t2 = HAP_perf_get_qtimer_count(); + + FARF(HIGH, "solve-tri-chunk %d/%d: A=(%ux%u) B=(%ux%u) job %u:%u usec %u\n", + ith, nth, n, n, k, n, start_job, end_job, + (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); +} + +int op_solve_tri(struct htp_ops_context * octx) { + const struct htp_tensor * src0 = octx->src[0]; // A + const struct htp_tensor * src1 = octx->src[1]; // B + const struct htp_tensor * dst = octx->dst; // X + + if (src0->type != HTP_TYPE_F32 || src1->type != HTP_TYPE_F32 || dst->type != HTP_TYPE_F32) { + return HTP_STATUS_NO_SUPPORT; + } + + // left=true, lower=true, uni=false only + if (src0->ne[0] != src0->ne[1]) { + return HTP_STATUS_INVAL_PARAMS; + } + if (src0->ne[1] != src1->ne[1]) { + return HTP_STATUS_INVAL_PARAMS; + } + if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) { + return HTP_STATUS_INVAL_PARAMS; + } + if (dst->ne[0] != src1->ne[0] || dst->ne[1] != src1->ne[1] || dst->ne[2] != src1->ne[2] || + dst->ne[3] != src1->ne[3]) { + return HTP_STATUS_INVAL_PARAMS; + } + + if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) { + return HTP_STATUS_OK; + } + + const uint32_t k = src1->ne[0]; + + const uint32_t col_block = VLEN_FP32; + const uint32_t k_chunks = (k + col_block - 1) / col_block; + const uint32_t total_batches = src0->ne[2] * src0->ne[3]; + const bool batched = total_batches >= (uint32_t) octx->n_threads; + + FARF(HIGH, "solve-tri: (%ux%ux%ux%u) x (%ux%ux%ux%u) -> (%ux%ux%ux%u) : batched %d\n", + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], batched); + + if (batched) { + // Batch-level parallelism + const uint32_t n_threads = 
MIN((uint32_t) octx->n_threads, total_batches); + + struct htp_solve_tri_context sctx = { + .octx = octx, + .jobs_per_thread = (total_batches + n_threads - 1) / n_threads, + .total_jobs = total_batches, + .k_chunks = k_chunks, + .col_block = col_block, + }; + + worker_pool_run_func(octx->ctx->worker_pool, solve_tri_batch_thread_f32, &sctx, n_threads); + } else { + // Chunk-level parallelism + const uint32_t total_jobs = total_batches * k_chunks; + const uint32_t n_threads = MIN((uint32_t) octx->n_threads, MAX(total_jobs, 1)); + + struct htp_solve_tri_context sctx = { + .octx = octx, + .jobs_per_thread = (total_jobs + n_threads - 1) / n_threads, + .total_jobs = total_jobs, + .k_chunks = k_chunks, + .col_block = col_block, + }; + + worker_pool_run_func(octx->ctx->worker_pool, solve_tri_chunk_thread_f32, &sctx, n_threads); + } + + return HTP_STATUS_OK; +} From 793d0a7931f737f592fe080164c4f00006d92ea0 Mon Sep 17 00:00:00 2001 From: Yes You Can Have Your Own <188969017+yychyo@users.noreply.github.com> Date: Fri, 24 Apr 2026 09:28:44 +0300 Subject: [PATCH 25/35] server: rename debug tags to match --cache-idle-slots naming (#22292) --- tools/server/server-context.cpp | 4 ++-- tools/server/tests/unit/test_kv_keep_only_active.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 67a92755b..ef2a4fddc 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -719,7 +719,7 @@ private: return; } SLT_INF(slot, "%s", "saving idle slot to prompt cache\n"); - SLT_DBG(slot, "%s", "__TEST_TAG_CLEAR_IDLE_SLOT__\n"); + SLT_DBG(slot, "%s", "__TEST_TAG_CACHE_IDLE_SLOT__\n"); slot.prompt_save(*prompt_cache); slot.prompt_clear(false); prompt_cache->update(); @@ -996,7 +996,7 @@ private: params_base.cache_idle_slots = false; } else { SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__); - SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n"); + SRV_DBG("%s", "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__\n"); } } diff --git a/tools/server/tests/unit/test_kv_keep_only_active.py b/tools/server/tests/unit/test_kv_keep_only_active.py index f4b08b5dd..44c05fab0 100644 --- a/tools/server/tests/unit/test_kv_keep_only_active.py +++ b/tools/server/tests/unit/test_kv_keep_only_active.py @@ -48,7 +48,7 @@ def test_clear_and_restore(): log = LogReader(server.log_path) # verify feature is enabled - assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" in log.drain() + assert "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__" in log.drain() res = server.make_request("POST", "/completion", data={ "prompt": LONG_PROMPT, @@ -59,7 +59,7 @@ def test_clear_and_restore(): original_prompt_n = res.body["timings"]["prompt_n"] # Slot 0 is the only slot with KV — should NOT be cleared - assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain() + assert "__TEST_TAG_CACHE_IDLE_SLOT__" not in log.drain() # Launching slot 1 clears idle slot 0 res = server.make_request("POST", "/completion", data={ @@ -68,7 +68,7 @@ def test_clear_and_restore(): "cache_prompt": True, }) assert res.status_code == 200 - assert "__TEST_TAG_CLEAR_IDLE_SLOT__" in log.drain() + assert "__TEST_TAG_CACHE_IDLE_SLOT__" in log.drain() # Re-send same prompt — should restore from cache-ram res = server.make_request("POST", "/completion", data={ @@ -86,7 +86,7 @@ def test_clear_and_restore(): "cache_prompt": True, }) assert res.status_code == 200 - assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain() + assert 
"__TEST_TAG_CACHE_IDLE_SLOT__" not in log.drain() def test_disabled_with_flag(): @@ -96,7 +96,7 @@ def test_disabled_with_flag(): log = LogReader(server.log_path) # Feature should not be enabled - assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" not in log.drain() + assert "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__" not in log.drain() res = server.make_request("POST", "/completion", data={ "prompt": LONG_PROMPT, @@ -112,4 +112,4 @@ def test_disabled_with_flag(): "cache_prompt": True, }) assert res.status_code == 200 - assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain() + assert "__TEST_TAG_CACHE_IDLE_SLOT__" not in log.drain() From ffdd983fb83ff3ca5e972188b30bcf8d039d3283 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Apr 2026 10:17:37 +0300 Subject: [PATCH 26/35] server : fix swa-full logic (#22288) --- tools/server/server-context.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index ef2a4fddc..08ff1e362 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -675,6 +675,10 @@ private: int32_t n_ctx; // total context for all clients / slots + // set to llama_model_n_swa(model) + // if swa_full is enabled, this is set to 0 to simulate a non-SWA model + int32_t n_swa; + // slots / clients std::vector slots; @@ -854,6 +858,8 @@ private: } } + n_swa = params_base.swa_full ? 0 : llama_model_n_swa(model); + // Necessary similarity of prompt for slot selection slot_prompt_similarity = params_base.slot_prompt_similarity; @@ -2415,9 +2421,6 @@ private: llama_pos pos_next = slot.prompt.tokens.pos_next(n_past); - // note: when n_swa == 0, the model does not use SWA - const auto n_swa = std::max(0, llama_model_n_swa(model)); - // the largest pos_min required for a checkpoint to be useful const auto pos_min_thold = std::max(0, pos_next - n_swa); @@ -2589,10 +2592,10 @@ private: // make a checkpoint of the parts of the memory that cannot be rolled back. 
// checkpoints are created only if: // - the model does not support partial sequence removal - // - the model uses SWA and we are not using `swa_full` + // - the model uses SWA (and we are not using `swa_full`) do_checkpoint = do_checkpoint && ( (slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) || - (llama_model_n_swa(model) > 0 && !params_base.swa_full)); + (n_swa > 0)); bool has_mtmd = false; From 017f090442555c7c2c72394e25cc72cba705b819 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Apr 2026 11:01:46 +0300 Subject: [PATCH 27/35] jinja : remove unused header (#22310) --- common/jinja/caps.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/common/jinja/caps.cpp b/common/jinja/caps.cpp index ec207a53e..ead864763 100644 --- a/common/jinja/caps.cpp +++ b/common/jinja/caps.cpp @@ -1,4 +1,3 @@ -#include "log.h" #include "value.h" #include "runtime.h" #include "caps.h" From e583f3b4f5dabf9a6e987772d4e4daf121858707 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Apr 2026 11:02:00 +0300 Subject: [PATCH 28/35] ggml : minor coding style (#22308) --- ggml/src/ggml.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index eda041f45..54d3eae3e 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -7656,7 +7656,7 @@ size_t ggml_quantize_chunk( int64_t nrows, int64_t n_per_row, const float * imatrix) { - const int64_t n = (int64_t) nrows * n_per_row; + const int64_t n = nrows * n_per_row; if (ggml_quantize_requires_imatrix(type)) { GGML_ASSERT(imatrix != NULL); @@ -7673,21 +7673,21 @@ size_t ggml_quantize_chunk( size_t result = 0; switch (type) { - case GGML_TYPE_Q1_0: result = quantize_q1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_MXFP4: result = quantize_mxfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_NVFP4: result = quantize_nvfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_TQ1_0: result = quantize_tq1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; - case GGML_TYPE_TQ2_0: 
result = quantize_tq2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q1_0: result = quantize_q1_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q4_0: result = quantize_q4_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q4_1: result = quantize_q4_1 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q5_0: result = quantize_q5_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q5_1: result = quantize_q5_1 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q8_0: result = quantize_q8_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_MXFP4: result = quantize_mxfp4 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_NVFP4: result = quantize_nvfp4 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q2_K: result = quantize_q2_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q3_K: result = quantize_q3_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q4_K: result = quantize_q4_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q5_K: result = quantize_q5_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q6_K: result = quantize_q6_K (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_TQ1_0: result = quantize_tq1_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_TQ2_0: result = quantize_tq2_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; @@ -7752,9 +7752,9 @@ struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) { } bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) { - if (p0->n_threads != p1->n_threads ) return false; - if (p0->prio != p1->prio ) return false; - if (p0->poll != p1->poll ) return false; - if (p0->strict_cpu != p1->strict_cpu ) return false; + if (p0->n_threads != p1->n_threads ) return false; + if (p0->prio != p1->prio ) return false; + if (p0->poll != p1->poll ) return false; + if (p0->strict_cpu != p1->strict_cpu ) return false; return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; } From dc80c5252a6d49301726e5987cc1fa006b69d93e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 24 Apr 2026 12:36:02 +0200 Subject: [PATCH 29/35] common : fix jinja warnings with clang 21 (#22313) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- common/jinja/runtime.h | 
16 +++++++++++----- common/jinja/value.cpp | 22 +++++++++++++--------- common/jinja/value.h | 39 +++++++++++++++++++++------------------ 3 files changed, 45 insertions(+), 32 deletions(-) diff --git a/common/jinja/runtime.h b/common/jinja/runtime.h index 3ca5f1754..b6f4a6ab4 100644 --- a/common/jinja/runtime.h +++ b/common/jinja/runtime.h @@ -106,10 +106,16 @@ struct statement { size_t pos; // position in source, for debugging virtual ~statement() = default; virtual std::string type() const { return "Statement"; } + // execute_impl must be overridden by derived classes - virtual value execute_impl(context &) { throw std::runtime_error("cannot exec " + type()); } + virtual value execute_impl(context &) { throw_exec_error(); } // execute is the public method to execute a statement with error handling value execute(context &); + +private: + [[noreturn]] void throw_exec_error() const { + throw std::runtime_error("cannot exec " + type()); + } }; // Type Checking Utilities @@ -143,7 +149,7 @@ struct program : public statement { program() = default; explicit program(statements && body) : body(std::move(body)) {} std::string type() const override { return "Program"; } - value execute_impl(context &) override { + [[noreturn]] value execute_impl(context &) override { throw std::runtime_error("Cannot execute program directly, use jinja::runtime instead"); } }; @@ -195,7 +201,7 @@ struct break_statement : public statement { } }; - value execute_impl(context &) override { + [[noreturn]] value execute_impl(context &) override { throw break_statement::signal(); } }; @@ -209,7 +215,7 @@ struct continue_statement : public statement { } }; - value execute_impl(context &) override { + [[noreturn]] value execute_impl(context &) override { throw continue_statement::signal(); } }; @@ -509,7 +515,7 @@ struct slice_expression : public expression { chk_type(this->step_expr); } std::string type() const override { return "SliceExpression"; } - value execute_impl(context &) override { + [[noreturn]] value execute_impl(context &) override { throw std::runtime_error("must be handled by MemberExpression"); } }; diff --git a/common/jinja/value.cpp b/common/jinja/value.cpp index 8e86a715f..0b79098cd 100644 --- a/common/jinja/value.cpp +++ b/common/jinja/value.cpp @@ -590,6 +590,10 @@ static bool string_endswith(const std::string & str, const std::string & suffix) return str.compare(str.length() - suffix.length(), suffix.length(), suffix) == 0; } +[[noreturn]] static value string_join_not_implemented(const func_args &) { + throw not_implemented_exception("String join builtin not implemented"); +} + const func_builtins & value_string_t::get_builtins() const { static const func_builtins builtins = { {"default", default_value}, @@ -851,9 +855,7 @@ const func_builtins & value_string_t::get_builtins() const { res->val_str.mark_input_based_on(val_input->as_string()); return res; }}, - {"join", [](const func_args &) -> value { - throw not_implemented_exception("String join builtin not implemented"); - }}, + {"join", string_join_not_implemented}, }; return builtins; } @@ -884,6 +886,9 @@ const func_builtins & value_bool_t::get_builtins() const { return builtins; } +[[noreturn]] static value array_unique_not_implemented(const func_args &) { + throw not_implemented_exception("Array unique builtin not implemented"); +} const func_builtins & value_array_t::get_builtins() const { static const func_builtins builtins = { @@ -1084,13 +1089,14 @@ const func_builtins & value_array_t::get_builtins() const { std::reverse(arr.begin(), 
arr.end());
             return is_val(val) ? mk_val(std::move(arr)) : mk_val(std::move(arr));
         }},
-        {"unique", [](const func_args &) -> value {
-            throw not_implemented_exception("Array unique builtin not implemented");
-        }},
+        {"unique", array_unique_not_implemented},
     };
     return builtins;
 }
 
+[[noreturn]] static value object_join_not_implemented(const func_args &) {
+    throw not_implemented_exception("object join not implemented");
+}
 
 const func_builtins & value_object_t::get_builtins() const {
     if (!has_builtins) {
@@ -1183,9 +1189,7 @@ const func_builtins & value_object_t::get_builtins() const {
             });
             return result;
         }},
-        {"join", [](const func_args &) -> value {
-            throw not_implemented_exception("object join not implemented");
-        }},
+        {"join", object_join_not_implemented},
     };
     return builtins;
 }
diff --git a/common/jinja/value.h b/common/jinja/value.h
index 7d164588a..5cf85e4f5 100644
--- a/common/jinja/value.h
+++ b/common/jinja/value.h
@@ -129,27 +129,25 @@ struct value_t {
     // Note: only for debugging and error reporting purposes
     virtual std::string type() const { return ""; }
 
-    virtual int64_t as_int() const { throw std::runtime_error(type() + " is not an int value"); }
-    virtual double as_float() const { throw std::runtime_error(type() + " is not a float value"); }
-    virtual string as_string() const { throw std::runtime_error(type() + " is not a string value"); }
-    virtual bool as_bool() const { throw std::runtime_error(type() + " is not a bool value"); }
-    virtual const std::vector<value> & as_array() const { throw std::runtime_error(type() + " is not an array value"); }
-    virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw std::runtime_error(type() + " is not an object value"); }
-    virtual value invoke(const func_args &) const { throw std::runtime_error(type() + " is not a function value"); }
+    virtual int64_t as_int() const { throw_type_error("is not an int value"); }
+    virtual double as_float() const { throw_type_error("is not a float value"); }
+    virtual string as_string() const { throw_type_error("is not a string value"); }
+    virtual bool as_bool() const { throw_type_error("is not a bool value"); }
+    virtual const std::vector<value> & as_array() const { throw_type_error("is not an array value"); }
+    virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw_type_error("is not an object value"); }
+    virtual value invoke(const func_args &) const { throw_type_error("is not a function value"); }
 
     virtual bool is_none() const { return false; }
     virtual bool is_undefined() const { return false; }
 
-    virtual const func_builtins & get_builtins() const {
-        throw std::runtime_error("No builtins available for type " + type());
-    }
+    virtual const func_builtins & get_builtins() const { throw_type_error("has no builtins"); }
 
-    virtual bool has_key(const value &) { throw std::runtime_error(type() + " is not an object value"); }
-    virtual void insert(const value & /* key */, const value & /* val */) { throw std::runtime_error(type() + " is not an object value"); }
-    virtual value & at(const value & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
-    virtual value & at(const value & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
-    virtual value & at(const std::string & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
-    virtual value & at(const std::string & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
-    virtual value & at(int64_t /* idx */,
value & /* default_val */) { throw std::runtime_error(type() + " is not an array value"); } - virtual value & at(int64_t /* idx */) { throw std::runtime_error(type() + " is not an array value"); } + virtual bool has_key(const value &) { throw_type_error("is not an object value"); } + virtual void insert(const value & /* key */, const value & /* val */) { throw_type_error("is not an object value"); } + virtual value & at(const value & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); } + virtual value & at(const value & /* key */) { throw_type_error("is not an object value"); } + virtual value & at(const std::string & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); } + virtual value & at(const std::string & /* key */) { throw_type_error("is not an object value"); } + virtual value & at(int64_t /* idx */, value & /* default_val */) { throw_type_error("is not an array value"); } + virtual value & at(int64_t /* idx */) { throw_type_error("is not an array value"); } virtual bool is_numeric() const { return false; } virtual bool is_hashable() const { return false; } @@ -163,6 +161,11 @@ struct value_t { // Note: only for debugging purposes virtual std::string as_repr() const { return as_string().str(); } +private: + [[noreturn]] void throw_type_error(const char* expected) const { + throw std::runtime_error(type() + " " + expected); + } + protected: virtual bool equivalent(const value_t &) const = 0; virtual bool nonequal(const value_t & other) const { return !equivalent(other); } From 15fa3c493bfcd040b5f4dcb29e1c998a0846de16 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Apr 2026 13:56:03 +0300 Subject: [PATCH 30/35] metal : print GPU description (#22318) --- ggml/src/ggml-metal/ggml-metal-device.m | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index f17f7e2e0..27b78c5e6 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -814,7 +814,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) { } // print MTL GPU family: - GGML_LOG_INFO("%s: GPU name: %s\n", __func__, dev->props.name); + GGML_LOG_INFO("%s: GPU name: %s (%s)\n", __func__, dev->props.name, dev->props.desc); // determine max supported GPU family // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf From f65bc34c688f9ab68c312b5ce0c0885cca94cf1d Mon Sep 17 00:00:00 2001 From: Mengsheng Wu Date: Sat, 25 Apr 2026 00:21:33 +0800 Subject: [PATCH 31/35] hexagon: use DIRID 13 in libggml-htp.inf for modern InfVerif (#22306) --- ggml/src/ggml-hexagon/libggml-htp.inf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-hexagon/libggml-htp.inf b/ggml/src/ggml-hexagon/libggml-htp.inf index 360d8b122..39cefcdda 100644 --- a/ggml/src/ggml-hexagon/libggml-htp.inf +++ b/ggml/src/ggml-hexagon/libggml-htp.inf @@ -8,7 +8,7 @@ CatalogFile = libggml-htp.cat PnpLockDown = 1 [DestinationDirs] -Drivers_Dir = 6 +Drivers_Dir = 13 [SourceDisksNames] 1 = %DiskId% From 13d36cf89178354d9aa6732e5930d89d64caf718 Mon Sep 17 00:00:00 2001 From: Zheyuan Chen Date: Fri, 24 Apr 2026 10:39:09 -0700 Subject: [PATCH 32/35] ggml-webgpu: enable FLASH_ATTN_EXT on browser without subgroup matrix (#22199) * ggml-webgpu: add tile flash attention fallback * ggml-webgpu: add new fields and discard usage of mnk for tile version * ggml-webgpu: modify the vec path to discard the mnk parameter * 
ggml-webgpu: enable flash attention vec and tile version for browser
* ggml-webgpu: staging KV for flash attention tile version
* formatting
* turn on subgroup uniformity check
* remove Q_TILE as it is always 1 for vec path
* keep row_max and exp_sum in local registers
* make different bindings with same underlying buffer to have the same usage flags
* move path selection into the shader library and have the host consume a single flash-attn decision object.
* turn off skip_validation and address buffer overlapping when nwg==1
* formatting
* merge binding when kv overlap

---
 .../ggml-webgpu/ggml-webgpu-shader-lib.hpp    | 326 +++++++++--------
 ggml/src/ggml-webgpu/ggml-webgpu.cpp          | 199 +++++++----
 .../ggml-webgpu/wgsl-shaders/flash_attn.wgsl  |  47 ++-
 .../wgsl-shaders/flash_attn_tile.wgsl         | 330 ++++++++++++++++++
 .../wgsl-shaders/flash_attn_vec_blk.wgsl      |   2 +-
 .../wgsl-shaders/flash_attn_vec_split.wgsl    | 313 ++++++++--------
 6 files changed, 817 insertions(+), 400 deletions(-)
 create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl

diff --git a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
index 449eae808..e492c2123 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
@@ -436,19 +436,27 @@ struct ggml_webgpu_unary_pipeline_key_hash {
 
 /** FlashAttention */
 
+enum ggml_webgpu_flash_attn_path : uint32_t {
+    GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX = 0u,
+    GGML_WEBGPU_FLASH_ATTN_PATH_TILE            = 1u,
+    GGML_WEBGPU_FLASH_ATTN_PATH_VEC             = 2u,
+};
+
 struct ggml_webgpu_flash_attn_pipeline_key {
     ggml_type kv_type;
     uint32_t  head_dim_qk;
     uint32_t  head_dim_v;
     bool      kv_direct;
+    bool      kv_overlap;
     bool      has_mask;
     bool      has_sinks;
     bool      uses_logit_softcap;
+    uint32_t  path;
 
     bool operator==(const ggml_webgpu_flash_attn_pipeline_key & other) const {
         return kv_type == other.kv_type && head_dim_qk == other.head_dim_qk && head_dim_v == other.head_dim_v &&
-               kv_direct == other.kv_direct && has_mask == other.has_mask && has_sinks == other.has_sinks &&
-               uses_logit_softcap == other.uses_logit_softcap;
+               kv_direct == other.kv_direct && kv_overlap == other.kv_overlap && has_mask == other.has_mask &&
+               has_sinks == other.has_sinks && uses_logit_softcap == other.uses_logit_softcap && path == other.path;
     }
 };
 
@@ -459,39 +467,70 @@ struct ggml_webgpu_flash_attn_pipeline_key_hash {
         ggml_webgpu_hash_combine(seed, key.head_dim_qk);
         ggml_webgpu_hash_combine(seed, key.head_dim_v);
         ggml_webgpu_hash_combine(seed, key.kv_direct);
+        ggml_webgpu_hash_combine(seed, key.kv_overlap);
         ggml_webgpu_hash_combine(seed, key.has_mask);
         ggml_webgpu_hash_combine(seed, key.has_sinks);
         ggml_webgpu_hash_combine(seed, key.uses_logit_softcap);
+        ggml_webgpu_hash_combine(seed, key.path);
         return seed;
     }
 };
 
 struct ggml_webgpu_flash_attn_decisions {
-    uint32_t q_tile = 0;
-    uint32_t kv_tile = 0;
-    uint32_t wg_size = 0;
+    uint32_t path      = GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX;
+    uint32_t q_tile    = 0;
+    uint32_t kv_tile   = 0;
+    uint32_t wg_size   = 0;
+    bool     kv_direct = false;
 };
 
-struct ggml_webgpu_flash_attn_vec_decisions {
-    uint32_t kv_tile = 0;
-    uint32_t wg_size = 0;
-};
+inline constexpr uint32_t GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH = 4u;
+inline constexpr uint32_t GGML_WEBGPU_FLASH_ATTN_TILE_Q_TILE       = 4u;
+
+inline uint32_t ggml_webgpu_flash_attn_pick_vec_ne(const ggml_webgpu_flash_attn_pipeline_key & key) {
+    if (key.path != GGML_WEBGPU_FLASH_ATTN_PATH_VEC || key.kv_type != GGML_TYPE_F16 ||
+        key.head_dim_qk !=
key.head_dim_v) {
+        return 1u;
+    }
+
+    switch (key.head_dim_qk) {
+        case 64:
+        case 192:
+        case 576:
+            return 2u;
+        case 96:
+            return 4u;
+        default:
+            return 1u;
+    }
+}
 
 inline ggml_webgpu_flash_attn_pipeline_key ggml_webgpu_flash_attn_make_pipeline_key(
-    const ggml_webgpu_shader_lib_context & context) {
+    const ggml_webgpu_shader_lib_context & context,
+    uint32_t                               path) {
     const bool has_mask  = context.src3 != nullptr;
     const bool has_sinks = context.src4 != nullptr;
 
-    const bool kv_direct = (context.src1->type == GGML_TYPE_F16) && (context.src0->ne[0] % context.sg_mat_k == 0) &&
-                           (context.src1->ne[1] % GGML_WEBGPU_KV_SEQ_PAD == 0);
+    bool kv_direct = false;
+    if (path != GGML_WEBGPU_FLASH_ATTN_PATH_TILE) {
+        uint32_t kv_direct_align = GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH;
+        if (path == GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX) {
+            kv_direct_align = context.sg_mat_k;
+        }
+        kv_direct = (context.src1->type == GGML_TYPE_F16) &&
+                    (context.src0->ne[0] % std::max(1u, kv_direct_align) == 0) &&
+                    (context.src1->ne[1] % GGML_WEBGPU_KV_SEQ_PAD == 0);
+    }
 
     ggml_webgpu_flash_attn_pipeline_key key = {};
     key.kv_type            = context.src1->type;
     key.head_dim_qk        = (uint32_t) context.src0->ne[0];
     key.head_dim_v         = (uint32_t) context.src2->ne[0];
     key.kv_direct          = kv_direct;
+    key.kv_overlap         = context.src_overlap;
     key.has_mask           = has_mask;
     key.has_sinks          = has_sinks;
     key.uses_logit_softcap = ggml_get_op_params_f32(context.dst, 2) != 0.0f;
+    key.path               = path;
     return key;
 }
 
@@ -554,8 +593,16 @@ inline size_t ggml_webgpu_flash_attn_wg_mem_bytes(uint32_t q_tile,
 inline uint32_t ggml_webgpu_flash_attn_max_kv_tile(const ggml_webgpu_shader_lib_context &      context,
                                                    const ggml_webgpu_flash_attn_pipeline_key & key) {
-    const size_t limit_bytes = context.wg_mem_limit_bytes;
-    const size_t q_tile      = context.sg_mat_m;
+    const size_t limit_bytes    = context.wg_mem_limit_bytes;
+    uint32_t     q_tile         = context.sg_mat_m;
+    uint32_t     kv_granularity = context.sg_mat_n;
+    if (key.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE) {
+        q_tile         = GGML_WEBGPU_FLASH_ATTN_TILE_Q_TILE;
+        kv_granularity = std::max(1u, context.max_subgroup_size);
+    } else if (key.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
+        q_tile         = 1u;
+        kv_granularity = 8u;
+    }
     const size_t base_q_bytes = (key.head_dim_qk + key.head_dim_v) * q_tile * GGML_WEBGPU_F16_SIZE_BYTES +
                                 2 * q_tile * GGML_WEBGPU_F32_SIZE_BYTES;
     size_t bytes_per_kv = 0;
@@ -568,23 +615,90 @@ inline uint32_t ggml_webgpu_flash_attn_max_kv_tile(const ggml_webgpu_shader_lib_
     bytes_per_kv += q_tile;
     bytes_per_kv *= GGML_WEBGPU_F16_SIZE_BYTES;
     const uint32_t max_kv_tile = (limit_bytes - base_q_bytes) / bytes_per_kv;
-    return (max_kv_tile / context.sg_mat_n) * context.sg_mat_n;
+    return (max_kv_tile / kv_granularity) * kv_granularity;
 }
 
-inline uint32_t ggml_webgpu_flash_attn_vec_get_kv_tile(const ggml_webgpu_shader_lib_context & context) {
-    const ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context);
-    const uint32_t min_kv_tile = ggml_webgpu_flash_attn_max_kv_tile(context, key);
-    uint32_t kv_tile = std::max(context.sg_mat_n, std::min(32u, min_kv_tile));
-    kv_tile = (kv_tile / context.sg_mat_n) * context.sg_mat_n;
+inline ggml_webgpu_flash_attn_decisions ggml_webgpu_flash_attn_get_decisions(
+    const ggml_webgpu_shader_lib_context & context,
+    size_t                                 storage_offset_alignment) {
+    ggml_webgpu_flash_attn_decisions decisions = {};
+    const size_t alignment = std::max<size_t>(1u, storage_offset_alignment);
+    const auto * K = context.src1;
+    const auto * V = context.src2;
+    GGML_ASSERT(K != nullptr);
+    GGML_ASSERT(V != nullptr);
+
+    const auto flash_attn_tensor_offset = [](const ggml_tensor * tensor) -> size_t {
+        constexpr uintptr_t ptr_base_addr = 0x1000u;
+        const ggml_tensor * base = tensor->view_src != nullptr ? tensor->view_src : tensor;
+        return reinterpret_cast<uintptr_t>(base->data) - ptr_base_addr + tensor->view_offs;
+    };
+
+    const uint32_t k_offset_elems =
+        (uint32_t) ((flash_attn_tensor_offset(K) & (alignment - 1)) / ggml_type_size(K->type));
+    const uint32_t v_offset_elems =
+        (uint32_t) ((flash_attn_tensor_offset(V) & (alignment - 1)) / ggml_type_size(V->type));
+    const bool f16_vec4_aligned = (k_offset_elems % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0u) &&
+                                  (v_offset_elems % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0u);
+    const bool kv_vec_type_supported =
+        K->type == GGML_TYPE_F16 || K->type == GGML_TYPE_Q4_0 || K->type == GGML_TYPE_Q8_0;
+    const bool use_vec  = context.supports_subgroups && (context.src0->ne[1] < 20) && (context.src0->ne[0] % 32 == 0) &&
+                          (context.src2->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) &&
+                          kv_vec_type_supported && (K->type != GGML_TYPE_F16 || f16_vec4_aligned) &&
+                          (context.src2->type == K->type);
+    const bool use_tile = context.supports_subgroups && !context.supports_subgroup_matrix && K->type == GGML_TYPE_F16 &&
+                          V->type == GGML_TYPE_F16 && f16_vec4_aligned &&
+                          (context.src0->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) &&
+                          (context.src2->ne[0] % GGML_WEBGPU_FLASH_ATTN_TILE_KV_VEC_WIDTH == 0) && !use_vec;
+
+    decisions.path = use_vec  ? GGML_WEBGPU_FLASH_ATTN_PATH_VEC :
+                     use_tile ? GGML_WEBGPU_FLASH_ATTN_PATH_TILE :
+                                GGML_WEBGPU_FLASH_ATTN_PATH_SUBGROUP_MATRIX;
+
+    const ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context, decisions.path);
+    decisions.kv_direct = key.kv_direct;
+
+    if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) {
+        const uint32_t min_kv_tile = ggml_webgpu_flash_attn_max_kv_tile(context, key);
+        decisions.q_tile  = 1u;
+        decisions.kv_tile = std::max(8u, std::min(32u, min_kv_tile));
+        decisions.kv_tile = (decisions.kv_tile / 8u) * 8u;
+        decisions.wg_size = std::max(1u, std::min(32u, context.max_subgroup_size));
+        if (decisions.kv_direct) {
+            decisions.kv_tile = std::min(decisions.kv_tile, GGML_WEBGPU_KV_SEQ_PAD);
+            while (GGML_WEBGPU_KV_SEQ_PAD % decisions.kv_tile != 0) {
+                decisions.kv_tile -= 8u;
+            }
+        }
+        return decisions;
+    }
+
+    decisions.q_tile =
+        decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE ? GGML_WEBGPU_FLASH_ATTN_TILE_Q_TILE : context.sg_mat_m;
+    decisions.kv_tile = decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE ?
+                            std::min(64u, ggml_webgpu_flash_attn_max_kv_tile(context, key)) :
+                            std::min(ggml_webgpu_flash_attn_max_kv_tile(context, key),
+                                     context.sg_mat_n * GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES);
+    decisions.wg_size = decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE ?
+ GGML_WEBGPU_FLASH_ATTN_PREFERRED_WG_SIZE : + std::max(context.max_subgroup_size, GGML_WEBGPU_FLASH_ATTN_PREFERRED_WG_SIZE); + + if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE) { + const uint32_t tile_kv_granularity = std::max(1u, context.max_subgroup_size); + decisions.kv_tile = + std::max(tile_kv_granularity, (decisions.kv_tile / tile_kv_granularity) * tile_kv_granularity); + } + + if (decisions.kv_direct) { + GGML_ASSERT(decisions.kv_tile <= GGML_WEBGPU_KV_SEQ_PAD); + while (GGML_WEBGPU_KV_SEQ_PAD % decisions.kv_tile != 0) { + decisions.kv_tile -= decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE ? + std::max(1u, context.max_subgroup_size) : + context.sg_mat_n; } } - return kv_tile; + return decisions; } /** Matrix Multiplication **/ @@ -821,8 +935,6 @@ class ggml_webgpu_shader_lib { repeat_pipelines; // type std::unordered_map flash_attn_pipelines; - std::unordered_map - flash_attn_vec_pipelines; std::unordered_map @@ -2044,14 +2156,19 @@ class ggml_webgpu_shader_lib { return repeat_pipelines[key]; } - webgpu_pipeline get_flash_attn_pipeline(const ggml_webgpu_shader_lib_context & context) { - const ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context); - auto it = flash_attn_pipelines.find(key); + webgpu_pipeline get_flash_attn_pipeline(const ggml_webgpu_shader_lib_context & context, + size_t storage_offset_alignment) { + const ggml_webgpu_flash_attn_decisions decisions = + ggml_webgpu_flash_attn_get_decisions(context, storage_offset_alignment); + ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context, decisions.path); + auto it = flash_attn_pipelines.find(key); if (it != flash_attn_pipelines.end()) { return it->second; } std::vector defines; - std::string variant = "flash_attn"; + std::string variant = decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC ? "flash_attn_vec" : + decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE ? 
"flash_attn_tile" : + "flash_attn"; switch (key.kv_type) { case GGML_TYPE_F32: @@ -2073,7 +2190,12 @@ class ggml_webgpu_shader_lib { if (key.has_mask) { defines.push_back("MASK"); - variant += "_mask"; + if (key.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) { + defines.push_back("BLK"); + variant += "_mask_blk"; + } else { + variant += "_mask"; + } } if (key.has_sinks) { defines.push_back("SINKS"); @@ -2087,6 +2209,10 @@ class ggml_webgpu_shader_lib { defines.push_back("KV_DIRECT"); variant += "_kvdirect"; } + if (key.kv_overlap) { + defines.push_back("KV_OVERLAP"); + variant += "_kv_overlap"; + } defines.push_back(std::string("HEAD_DIM_QK=") + std::to_string(key.head_dim_qk)); variant += std::string("_hsqk") + std::to_string(key.head_dim_qk); @@ -2094,129 +2220,37 @@ class ggml_webgpu_shader_lib { defines.push_back(std::string("HEAD_DIM_V=") + std::to_string(key.head_dim_v)); variant += std::string("_hsv") + std::to_string(key.head_dim_v); - defines.push_back(std::string("SG_MAT_M=") + std::to_string(context.sg_mat_m)); - defines.push_back(std::string("SG_MAT_N=") + std::to_string(context.sg_mat_n)); - defines.push_back(std::string("SG_MAT_K=") + std::to_string(context.sg_mat_k)); - - auto decisions = std::make_shared(); - decisions->q_tile = context.sg_mat_m; - - const uint32_t min_kv_tile = ggml_webgpu_flash_attn_max_kv_tile(context, key); - uint32_t kv_tile = std::min(min_kv_tile, context.sg_mat_n * GGML_WEBGPU_FLASH_ATTN_PREFERRED_KV_SG_TILES); - - if (key.kv_direct) { - kv_tile = std::min(kv_tile, GGML_WEBGPU_KV_SEQ_PAD); - while (GGML_WEBGPU_KV_SEQ_PAD % kv_tile != 0) { - kv_tile -= context.sg_mat_n; - } + const char * shader_src = wgsl_flash_attn; + if (key.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) { + defines.push_back("KV_GRANULARITY=8"); + defines.push_back(std::string("VEC_NE=") + std::to_string(ggml_webgpu_flash_attn_pick_vec_ne(key)) + "u"); + shader_src = wgsl_flash_attn_vec_split; + } else if (key.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE) { + shader_src = wgsl_flash_attn_tile; + defines.push_back("MAX_SUBGROUP_SIZE=" + std::to_string(context.max_subgroup_size)); + defines.push_back("KV_STAGE_STRIDE=" + std::to_string(std::max(key.head_dim_qk, key.head_dim_v))); + variant += "_tile"; + } else { + defines.push_back(std::string("SG_MAT_M=") + std::to_string(context.sg_mat_m)); + defines.push_back(std::string("SG_MAT_N=") + std::to_string(context.sg_mat_n)); + defines.push_back(std::string("SG_MAT_K=") + std::to_string(context.sg_mat_k)); } - decisions->kv_tile = kv_tile; - decisions->wg_size = std::max(context.max_subgroup_size, GGML_WEBGPU_FLASH_ATTN_PREFERRED_WG_SIZE); - - defines.push_back(std::string("Q_TILE=") + std::to_string(decisions->q_tile)); - defines.push_back(std::string("KV_TILE=") + std::to_string(decisions->kv_tile)); - defines.push_back(std::string("WG_SIZE=") + std::to_string(decisions->wg_size)); + auto pipeline_decisions = std::make_shared(decisions); + defines.push_back(std::string("Q_TILE=") + std::to_string(decisions.q_tile)); + defines.push_back(std::string("KV_TILE=") + std::to_string(decisions.kv_tile)); + defines.push_back(std::string("WG_SIZE=") + std::to_string(decisions.wg_size)); webgpu_pipeline pipeline = - ggml_webgpu_create_pipeline(device, preprocessor.preprocess(wgsl_flash_attn, defines), variant); - pipeline.context = decisions; + ggml_webgpu_create_pipeline(device, preprocessor.preprocess(shader_src, defines), variant); + pipeline.context = pipeline_decisions; flash_attn_pipelines[key] = pipeline; return flash_attn_pipelines[key]; } - 
webgpu_pipeline get_flash_attn_vec_pipeline(const ggml_webgpu_shader_lib_context & context) { - const ggml_webgpu_flash_attn_pipeline_key key = ggml_webgpu_flash_attn_make_pipeline_key(context); - auto it = flash_attn_vec_pipelines.find(key); - if (it != flash_attn_vec_pipelines.end()) { - return it->second; - } - - std::vector defines; - std::string variant = "flash_attn_vec"; - - switch (key.kv_type) { - case GGML_TYPE_F32: - defines.push_back("KV_F32"); - break; - case GGML_TYPE_F16: - defines.push_back("KV_F16"); - break; - case GGML_TYPE_Q4_0: - defines.push_back("KV_Q4_0"); - break; - case GGML_TYPE_Q8_0: - defines.push_back("KV_Q8_0"); - break; - default: - GGML_ABORT("Unsupported KV type for flash attention shader"); - } - variant += std::string("_") + ggml_type_name(key.kv_type); - - if (key.has_mask) { - defines.push_back("MASK"); - defines.push_back("BLK"); - variant += "_mask_blk"; - } - if (key.has_sinks) { - defines.push_back("SINKS"); - variant += "_sinks"; - } - if (key.uses_logit_softcap) { - defines.push_back("LOGIT_SOFTCAP"); - variant += "_lgsc"; - } - if (key.kv_direct) { - defines.push_back("KV_DIRECT"); - variant += "_kvdirect"; - } - - defines.push_back(std::string("HEAD_DIM_QK=") + std::to_string(key.head_dim_qk)); - variant += std::string("_hsqk") + std::to_string(key.head_dim_qk); - - defines.push_back(std::string("HEAD_DIM_V=") + std::to_string(key.head_dim_v)); - variant += std::string("_hsv") + std::to_string(key.head_dim_v); - - defines.push_back(std::string("SG_MAT_M=") + std::to_string(context.sg_mat_m)); - defines.push_back(std::string("SG_MAT_N=") + std::to_string(context.sg_mat_n)); - defines.push_back(std::string("SG_MAT_K=") + std::to_string(context.sg_mat_k)); - defines.push_back("Q_TILE=1"); - - auto decisions = std::make_shared(); - decisions->kv_tile = ggml_webgpu_flash_attn_vec_get_kv_tile(context); - decisions->wg_size = std::max(1u, std::min(32u, context.max_subgroup_size)); - uint32_t vec_ne = 1u; - - // Keep conservative defaults unless this is the f16 vec-split shape family. 
- if (key.kv_type == GGML_TYPE_F16 && key.head_dim_qk == key.head_dim_v) { - switch (key.head_dim_qk) { - case 64: - case 192: - case 576: - vec_ne = 2u; - break; - case 96: - vec_ne = 4u; - break; - default: - break; - } - } - - defines.push_back(std::string("KV_TILE=") + std::to_string(decisions->kv_tile)); - defines.push_back(std::string("WG_SIZE=") + std::to_string(decisions->wg_size)); - defines.push_back(std::string("VEC_NE=") + std::to_string(vec_ne) + "u"); - - webgpu_pipeline pipeline = - ggml_webgpu_create_pipeline(device, preprocessor.preprocess(wgsl_flash_attn_vec_split, defines), variant); - pipeline.context = decisions; - flash_attn_vec_pipelines[key] = pipeline; - return flash_attn_vec_pipelines[key]; - } - - webgpu_pipeline get_flash_attn_blk_pipeline(const ggml_webgpu_shader_lib_context & context) { + webgpu_pipeline get_flash_attn_blk_pipeline(const ggml_webgpu_shader_lib_context & context, uint32_t kv_tile) { ggml_webgpu_flash_attn_blk_pipeline_key key = {}; - key.kv_tile = ggml_webgpu_flash_attn_vec_get_kv_tile(context); + key.kv_tile = kv_tile; auto it = flash_attn_blk_pipelines.find(key); if (it != flash_attn_blk_pipelines.end()) { return it->second; diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index acc486cfd..7ed6fdd16 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -389,23 +389,6 @@ static size_t ggml_webgpu_tensor_misalignment(webgpu_context & ctx, const ggml_t return offset & (ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment - 1); } -static bool ggml_webgpu_flash_attn_use_vec(webgpu_global_context & global_ctx, - const ggml_tensor * Q, - const ggml_tensor * K, - const ggml_tensor * V) { - const size_t alignment = global_ctx->capabilities.limits.minStorageBufferOffsetAlignment; - const uint32_t k_offset_elems = - (uint32_t) ((ggml_webgpu_tensor_offset(K) & (alignment - 1)) / ggml_type_size(K->type)); - const uint32_t v_offset_elems = - (uint32_t) ((ggml_webgpu_tensor_offset(V) & (alignment - 1)) / ggml_type_size(V->type)); - const bool f16_vec4_aligned = (k_offset_elems % 4u == 0u) && (v_offset_elems % 4u == 0u); - const bool kv_vec_type_supported = - K->type == GGML_TYPE_F16 || K->type == GGML_TYPE_Q4_0 || K->type == GGML_TYPE_Q8_0; - - return (Q->ne[1] < 20) && (Q->ne[0] % 32 == 0) && (V->ne[0] % 4 == 0) && kv_vec_type_supported && - (K->type != GGML_TYPE_F16 || f16_vec4_aligned) && (V->type == K->type); -} - static size_t ggml_webgpu_tensor_align_offset(webgpu_context & ctx, const ggml_tensor * t) { size_t offset = ggml_webgpu_tensor_offset(t); return offset & ~(ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment - 1); @@ -1567,7 +1550,6 @@ static webgpu_encoded_op ggml_webgpu_mul_mat_id(webgpu_context & ctx, return ggml_backend_webgpu_build_multi(ctx, dispatches); } -#ifndef __EMSCRIPTEN__ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx, ggml_tensor * Q, ggml_tensor * K, @@ -1585,13 +1567,29 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx, float m0 = powf(2.0f, -(max_bias) / n_head_log2); float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - const int has_mask = (mask != nullptr); - const int has_sinks = (sinks != nullptr); + const int has_mask = (mask != nullptr); + const int has_sinks = (sinks != nullptr); + const bool kv_overlap = ggml_webgpu_tensor_overlap(K, V) && K->type == V->type; + + uint32_t offset_k = (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, K) / 
ggml_type_size(K->type)); + uint32_t offset_v = (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, V) / ggml_type_size(V->type)); + size_t kv_bind_offset = 0; + size_t kv_bind_size = 0; + if (kv_overlap) { + const size_t k_bind_offset = ggml_webgpu_tensor_align_offset(ctx, K); + const size_t v_bind_offset = ggml_webgpu_tensor_align_offset(ctx, V); + const size_t k_bind_end = k_bind_offset + ggml_webgpu_tensor_binding_size(ctx, K); + const size_t v_bind_end = v_bind_offset + ggml_webgpu_tensor_binding_size(ctx, V); + kv_bind_offset = std::min(k_bind_offset, v_bind_offset); + kv_bind_size = std::max(k_bind_end, v_bind_end) - kv_bind_offset; + offset_k = (uint32_t) ((ggml_webgpu_tensor_offset(K) - kv_bind_offset) / ggml_type_size(K->type)); + offset_v = (uint32_t) ((ggml_webgpu_tensor_offset(V) - kv_bind_offset) / ggml_type_size(V->type)); + } std::vector params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, Q) / ggml_type_size(Q->type)), - (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, K) / ggml_type_size(K->type)), - (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, V) / ggml_type_size(V->type)), + offset_k, + offset_v, has_mask ? (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, mask) / ggml_type_size(mask->type)) : 0, has_sinks ? (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, sinks) / ggml_type_size(sinks->type)) : 0, (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), @@ -1619,10 +1617,15 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx, }; std::vector entries = { ggml_webgpu_make_tensor_bind_group_entry(ctx, 0, Q), - ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, K), - ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, V), }; - uint32_t binding_index = 3; + if (kv_overlap) { + entries.push_back( + ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(K), kv_bind_offset, kv_bind_size)); + } else { + entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 1, K)); + entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, 2, V)); + } + uint32_t binding_index = kv_overlap ? 2u : 3u; if (has_mask) { entries.push_back(ggml_webgpu_make_tensor_bind_group_entry(ctx, binding_index++, mask)); } @@ -1638,25 +1641,25 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx, shader_lib_ctx.src3 = mask; shader_lib_ctx.src4 = sinks; shader_lib_ctx.dst = dst; + shader_lib_ctx.src_overlap = kv_overlap; + shader_lib_ctx.supports_subgroups = ctx->global_ctx->capabilities.supports_subgroups; + shader_lib_ctx.supports_subgroup_matrix = ctx->global_ctx->capabilities.supports_subgroup_matrix; shader_lib_ctx.max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup; shader_lib_ctx.wg_mem_limit_bytes = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize; shader_lib_ctx.sg_mat_m = ctx->global_ctx->capabilities.sg_mat_m; shader_lib_ctx.sg_mat_n = ctx->global_ctx->capabilities.sg_mat_n; shader_lib_ctx.sg_mat_k = ctx->global_ctx->capabilities.sg_mat_k; shader_lib_ctx.max_subgroup_size = ctx->global_ctx->capabilities.max_subgroup_size; - const bool use_vec = ggml_webgpu_flash_attn_use_vec(ctx->global_ctx, Q, K, V); - webgpu_pipeline pipeline = use_vec ? 
ctx->shader_lib->get_flash_attn_vec_pipeline(shader_lib_ctx) : - ctx->shader_lib->get_flash_attn_pipeline(shader_lib_ctx); + webgpu_pipeline pipeline = ctx->shader_lib->get_flash_attn_pipeline( + shader_lib_ctx, ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment); + auto * decisions = static_cast(pipeline.context.get()); - if (!use_vec) { - auto * decisions = static_cast(pipeline.context.get()); + if (decisions->path != GGML_WEBGPU_FLASH_ATTN_PATH_VEC) { uint32_t wg_per_head = CEIL_DIV(Q->ne[1], decisions->q_tile); uint32_t wg_x = wg_per_head * Q->ne[2] * Q->ne[3]; // wg per head * number of heads * number of batches return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x); } - auto * decisions = static_cast(pipeline.context.get()); - wgpu::Buffer blk_buf = {}; uint64_t blk_size_bytes = 0; uint32_t blk_nblk0 = 0; @@ -1695,10 +1698,12 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx, tmp_bind_size = tmp_size_bytes; scratch_offset = ROUNDUP_POW2(scratch_offset + tmp_size_bytes, align_bytes); } else { - // nwg==1 writes final dst directly in vec-split; keep tmp binding valid without extra allocation. + // nwg==1 writes final dst directly in vec-split; bind tmp to a tiny non-overlapping scratch region. + tmp_size_bytes = WEBGPU_STORAGE_BUF_BINDING_MULT; tmp_buf = ggml_webgpu_tensor_buf(dst); - tmp_bind_offset = ggml_webgpu_tensor_align_offset(ctx, dst); - tmp_bind_size = ggml_webgpu_tensor_binding_size(ctx, dst); + tmp_bind_offset = scratch_offset; + tmp_bind_size = tmp_size_bytes; + scratch_offset = ROUNDUP_POW2(scratch_offset + tmp_size_bytes, align_bytes); } webgpu_pipeline blk_pipeline; @@ -1713,7 +1718,7 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx, const uint64_t blk_elems = (uint64_t) blk_nblk0 * blk_nblk1 * blk_batch_count; blk_size_bytes = ROUNDUP_POW2(blk_elems * sizeof(uint32_t), WEBGPU_STORAGE_BUF_BINDING_MULT); const ggml_webgpu_shader_lib_context blk_shader_ctx = shader_lib_ctx; - blk_pipeline = ctx->shader_lib->get_flash_attn_blk_pipeline(blk_shader_ctx); + blk_pipeline = ctx->shader_lib->get_flash_attn_blk_pipeline(blk_shader_ctx, decisions->kv_tile); blk_params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, mask) / ggml_type_size(mask->type)), // offset_mask @@ -1745,12 +1750,19 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx, std::vector split_entries = { ggml_webgpu_make_bind_group_entry(0, ggml_webgpu_tensor_buf(Q), ggml_webgpu_tensor_align_offset(ctx, Q), ggml_webgpu_tensor_binding_size(ctx, Q)), - ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(K), ggml_webgpu_tensor_align_offset(ctx, K), - ggml_webgpu_tensor_binding_size(ctx, K)), - ggml_webgpu_make_bind_group_entry(2, ggml_webgpu_tensor_buf(V), ggml_webgpu_tensor_align_offset(ctx, V), - ggml_webgpu_tensor_binding_size(ctx, V)), }; - uint32_t split_binding_index = 3; + if (kv_overlap) { + split_entries.push_back( + ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(K), kv_bind_offset, kv_bind_size)); + } else { + split_entries.push_back(ggml_webgpu_make_bind_group_entry(1, ggml_webgpu_tensor_buf(K), + ggml_webgpu_tensor_align_offset(ctx, K), + ggml_webgpu_tensor_binding_size(ctx, K))); + split_entries.push_back(ggml_webgpu_make_bind_group_entry(2, ggml_webgpu_tensor_buf(V), + ggml_webgpu_tensor_align_offset(ctx, V), + ggml_webgpu_tensor_binding_size(ctx, V))); + } + uint32_t split_binding_index = kv_overlap ? 
2u : 3u; if (has_mask) { split_entries.push_back(ggml_webgpu_make_bind_group_entry(split_binding_index++, ggml_webgpu_tensor_buf(mask), ggml_webgpu_tensor_align_offset(ctx, mask), @@ -1820,7 +1832,6 @@ static webgpu_encoded_op ggml_webgpu_flash_attn(webgpu_context & ctx, return ggml_backend_webgpu_build_multi(ctx, dispatches); } -#endif // __EMSCRIPTEN__ static webgpu_encoded_op ggml_webgpu_unary_op(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) { bool is_unary = dst->op == GGML_OP_UNARY; @@ -2710,11 +2721,7 @@ static std::optional ggml_webgpu_encode(webgpu_context ctx, case GGML_OP_MUL_MAT_ID: return ggml_webgpu_mul_mat_id(ctx, src0, src1, src2, node); case GGML_OP_FLASH_ATTN_EXT: -#ifndef __EMSCRIPTEN__ return ggml_webgpu_flash_attn(ctx, src0, src1, src2, node->src[3], node->src[4], node); -#else - return std::nullopt; -#endif case GGML_OP_ADD: case GGML_OP_SUB: case GGML_OP_MUL: @@ -3257,13 +3264,19 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer ctx->webgpu_global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup; shader_lib_ctx.wg_mem_limit_bytes = ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize; + shader_lib_ctx.supports_subgroups = ctx->webgpu_global_ctx->capabilities.supports_subgroups; + shader_lib_ctx.supports_subgroup_matrix = + ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix; shader_lib_ctx.sg_mat_m = ctx->webgpu_global_ctx->capabilities.sg_mat_m; shader_lib_ctx.sg_mat_n = ctx->webgpu_global_ctx->capabilities.sg_mat_n; shader_lib_ctx.sg_mat_k = ctx->webgpu_global_ctx->capabilities.sg_mat_k; shader_lib_ctx.max_subgroup_size = ctx->webgpu_global_ctx->capabilities.max_subgroup_size; - if (ggml_webgpu_flash_attn_use_vec(ctx->webgpu_global_ctx, Q, K, V)) { - const uint32_t kv_tile = ggml_webgpu_flash_attn_vec_get_kv_tile(shader_lib_ctx); + const ggml_webgpu_flash_attn_decisions decisions = ggml_webgpu_flash_attn_get_decisions( + shader_lib_ctx, ctx->webgpu_global_ctx->capabilities.limits.minStorageBufferOffsetAlignment); + + if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) { + const uint32_t kv_tile = decisions.kv_tile; const uint32_t vec_nwg_cap = std::max( 1u, std::min(32u, ctx->webgpu_global_ctx->capabilities.max_subgroup_size)); @@ -3283,6 +3296,8 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer const size_t tmp_size_bytes = ROUNDUP_POW2( (tmp_data_elems + tmp_stats_elems) * sizeof(float), WEBGPU_STORAGE_BUF_BINDING_MULT); res += tmp_size_bytes + align; + } else { + res += WEBGPU_STORAGE_BUF_BINDING_MULT + align; } if (mask != nullptr) { const uint32_t blk_nblk0 = CEIL_DIV((uint32_t) K->ne[1], kv_tile); @@ -3431,12 +3446,12 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) { ctx->webgpu_global_ctx->capabilities.supports_subgroups = ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::Subgroups); + bool valid_subgroup_matrix_config = false; #ifndef __EMSCRIPTEN__ // Accept f16 subgroup matrix configurations (square or non-square). // NVIDIA GPUs typically report square configs (e.g. 16x16x16), // while Intel Xe2 GPUs report non-square configs (e.g. 8x16x16). // The shaders are already parameterized to handle any M/N/K dimensions. 
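For context, the acceptance test inside the loop below only needs to pin both
component types to f16; the M/N/K shape is recorded as reported. A hedged sketch
of such a check, with field names assumed from Dawn's experimental
subgroup-matrix API rather than quoted from this patch:

    // Accept any f16 multiply-accumulate shape the adapter reports,
    // square or not.
    bool accept_f16_config(const wgpu::SubgroupMatrixConfig & config) {
        return config.componentType == wgpu::SubgroupMatrixComponentType::F16 &&
               config.resultComponentType == wgpu::SubgroupMatrixComponentType::F16;
    }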
- bool valid_subgroup_matrix_config = false; if (ctx->webgpu_global_ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) { for (size_t i = 0; i < subgroup_matrix_configs.configCount; i++) { const wgpu::SubgroupMatrixConfig config = subgroup_matrix_configs.configs[i]; @@ -3450,8 +3465,8 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) { } } } - ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix = valid_subgroup_matrix_config; #endif + ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix = valid_subgroup_matrix_config; // For subgroup matrix code to be the most efficient, we would like the subgroup size to be consistent and accurate. // Unfortunately, that is not possible, so we use the maximum subgroup size reported by the adapter. @@ -3499,12 +3514,12 @@ static bool create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) { // Enable Dawn-specific toggles to increase native performance // TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these, // only for native performance? - const char * const deviceEnabledToggles[] = { "skip_validation", "disable_robustness", "disable_workgroup_init", - "disable_polyfills_on_integer_div_and_mod" }; - const char * const deviceDisabledToggles[] = { "timestamp_quantization" }; + const char * const deviceEnabledToggles[] = { "disable_robustness", "disable_workgroup_init", + "disable_polyfills_on_integer_div_and_mod" }; + const char * const deviceDisabledToggles[] = { "timestamp_quantization" }; wgpu::DawnTogglesDescriptor deviceTogglesDesc; deviceTogglesDesc.enabledToggles = deviceEnabledToggles; - deviceTogglesDesc.enabledToggleCount = 4; + deviceTogglesDesc.enabledToggleCount = 3; deviceTogglesDesc.disabledToggles = deviceDisabledToggles; deviceTogglesDesc.disabledToggleCount = 1; @@ -3782,33 +3797,63 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const break; case GGML_OP_FLASH_ATTN_EXT: { -#ifndef __EMSCRIPTEN__ - if (!ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix) { - break; - } - // Head dimensions must be divisible by subgroup matrix dimensions - if (src0->ne[0] % ctx->webgpu_global_ctx->capabilities.sg_mat_k != 0 || - src2->ne[0] % ctx->webgpu_global_ctx->capabilities.sg_mat_n != 0) { - break; - } - // Head dimensions must fit in workgroup memory with minimum tile sizes - size_t limit_bytes = ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize; - const bool has_mask = op->src[3] != nullptr; - const bool kv_direct = src1->type == GGML_TYPE_F16 && - (src0->ne[0] % ctx->webgpu_global_ctx->capabilities.sg_mat_k) == 0 && - (src1->ne[1] % GGML_WEBGPU_KV_SEQ_PAD) == 0; - const size_t min_bytes = ggml_webgpu_flash_attn_wg_mem_bytes( - ctx->webgpu_global_ctx->capabilities.sg_mat_m, ctx->webgpu_global_ctx->capabilities.sg_mat_n, - (uint32_t) src0->ne[0], (uint32_t) src2->ne[0], has_mask, kv_direct); - if (min_bytes > limit_bytes) { - break; - } - supports_op = src0->type == GGML_TYPE_F32 && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_Q4_0 || src1->type == GGML_TYPE_Q8_0) && src2->type == src1->type && op->type == GGML_TYPE_F32; -#endif + if (!supports_op) { + break; + } + ggml_webgpu_shader_lib_context shader_lib_ctx = {}; + shader_lib_ctx.src0 = src0; + shader_lib_ctx.src1 = src1; + shader_lib_ctx.src2 = src2; + shader_lib_ctx.src3 = op->src[3]; + shader_lib_ctx.src4 = op->src[4]; + shader_lib_ctx.dst = const_cast(op); + 
shader_lib_ctx.supports_subgroups = ctx->webgpu_global_ctx->capabilities.supports_subgroups; + shader_lib_ctx.supports_subgroup_matrix = ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix; + shader_lib_ctx.wg_mem_limit_bytes = + ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize; + shader_lib_ctx.sg_mat_m = ctx->webgpu_global_ctx->capabilities.sg_mat_m; + shader_lib_ctx.sg_mat_n = ctx->webgpu_global_ctx->capabilities.sg_mat_n; + shader_lib_ctx.sg_mat_k = ctx->webgpu_global_ctx->capabilities.sg_mat_k; + shader_lib_ctx.max_subgroup_size = ctx->webgpu_global_ctx->capabilities.max_subgroup_size; + + const ggml_webgpu_flash_attn_decisions decisions = ggml_webgpu_flash_attn_get_decisions( + shader_lib_ctx, ctx->webgpu_global_ctx->capabilities.limits.minStorageBufferOffsetAlignment); + const size_t limit_bytes = ctx->webgpu_global_ctx->capabilities.limits.maxComputeWorkgroupStorageSize; + const bool has_mask = op->src[3] != nullptr; + if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_VEC) { + const size_t min_bytes = + ggml_webgpu_flash_attn_wg_mem_bytes(decisions.q_tile, decisions.kv_tile, (uint32_t) src0->ne[0], + (uint32_t) src2->ne[0], has_mask, decisions.kv_direct); + if (min_bytes > limit_bytes) { + supports_op = false; + } + break; + } + + if (decisions.path == GGML_WEBGPU_FLASH_ATTN_PATH_TILE) { + const size_t min_bytes = + ggml_webgpu_flash_attn_wg_mem_bytes(decisions.q_tile, decisions.kv_tile, (uint32_t) src0->ne[0], + (uint32_t) src2->ne[0], has_mask, decisions.kv_direct); + if (min_bytes > limit_bytes) { + supports_op = false; + } + break; + } + + if (!ctx->webgpu_global_ctx->capabilities.supports_subgroup_matrix) { + supports_op = false; + break; + } + const size_t min_bytes = + ggml_webgpu_flash_attn_wg_mem_bytes(decisions.q_tile, decisions.kv_tile, (uint32_t) src0->ne[0], + (uint32_t) src2->ne[0], has_mask, decisions.kv_direct); + if (min_bytes > limit_bytes) { + supports_op = false; + } break; } case GGML_OP_RMS_NORM: diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl index aa2d2e54d..6d5d69fb8 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl @@ -138,25 +138,54 @@ struct Params { }; @group(0) @binding(0) var Q: array; +#ifdef KV_OVERLAP +@group(0) @binding(1) var K: array; +#define V K +#else @group(0) @binding(1) var K: array; @group(0) @binding(2) var V: array; +#endif #if defined(MASK) && defined(SINKS) -@group(0) @binding(3) var mask: array; -@group(0) @binding(4) var sinks: array; -#define DST_BINDING 5 -#define PARAMS_BINDING 6 -#elif defined(MASK) -@group(0) @binding(3) var mask: array; -#define DST_BINDING 4 -#define PARAMS_BINDING 5 -#elif defined(SINKS) +#ifdef KV_OVERLAP +@group(0) @binding(2) var mask: array; @group(0) @binding(3) var sinks: array; #define DST_BINDING 4 #define PARAMS_BINDING 5 #else +@group(0) @binding(3) var mask: array; +@group(0) @binding(4) var sinks: array; +#define DST_BINDING 5 +#define PARAMS_BINDING 6 +#endif +#elif defined(MASK) +#ifdef KV_OVERLAP +@group(0) @binding(2) var mask: array; #define DST_BINDING 3 #define PARAMS_BINDING 4 +#else +@group(0) @binding(3) var mask: array; +#define DST_BINDING 4 +#define PARAMS_BINDING 5 +#endif +#elif defined(SINKS) +#ifdef KV_OVERLAP +@group(0) @binding(2) var sinks: array; +#define DST_BINDING 3 +#define PARAMS_BINDING 4 +#else +@group(0) @binding(3) var sinks: array; +#define DST_BINDING 4 +#define PARAMS_BINDING 5 +#endif +#else 
+#ifdef KV_OVERLAP +#define DST_BINDING 2 +#define PARAMS_BINDING 3 +#else +#define DST_BINDING 3 +#define PARAMS_BINDING 4 +#endif #endif @group(0) @binding(DST_BINDING) var dst: array>; diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl new file mode 100644 index 000000000..37ea23b80 --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl @@ -0,0 +1,330 @@ +enable f16; +enable subgroups; + +#define HEAD_DIM_QK 64 +#define HEAD_DIM_V 64 +#define KV_STAGE_STRIDE 64 +#define Q_TILE 4 +#define KV_TILE 64 +#define WG_SIZE 128 + +struct Params { + offset_q: u32, + offset_k: u32, + offset_v: u32, + offset_mask: u32, + offset_sinks: u32, + offset_dst: u32, + + n_heads: u32, + seq_len_q: u32, + seq_len_kv: u32, + + stride_q1: u32, + stride_q2: u32, + stride_q3: u32, + stride_k1: u32, + stride_k2: u32, + stride_k3: u32, + stride_v1: u32, + stride_v2: u32, + stride_v3: u32, + stride_mask3: u32, + + q_per_kv: u32, + + scale: f32, + max_bias: f32, + logit_softcap: f32, + n_head_log2: f32, + m0: f32, + m1: f32, +}; + +@group(0) @binding(0) var Q: array; +#ifdef KV_OVERLAP +@group(0) @binding(1) var K: array>; +#define V K +#else +@group(0) @binding(1) var K: array>; +@group(0) @binding(2) var V: array>; +#endif + +#if defined(MASK) && defined(SINKS) +#ifdef KV_OVERLAP +@group(0) @binding(2) var mask: array; +@group(0) @binding(3) var sinks: array; +#define DST_BINDING 4 +#define PARAMS_BINDING 5 +#else +@group(0) @binding(3) var mask: array; +@group(0) @binding(4) var sinks: array; +#define DST_BINDING 5 +#define PARAMS_BINDING 6 +#endif +#elif defined(MASK) +#ifdef KV_OVERLAP +@group(0) @binding(2) var mask: array; +#define DST_BINDING 3 +#define PARAMS_BINDING 4 +#else +@group(0) @binding(3) var mask: array; +#define DST_BINDING 4 +#define PARAMS_BINDING 5 +#endif +#elif defined(SINKS) +#ifdef KV_OVERLAP +@group(0) @binding(2) var sinks: array; +#define DST_BINDING 3 +#define PARAMS_BINDING 4 +#else +@group(0) @binding(3) var sinks: array; +#define DST_BINDING 4 +#define PARAMS_BINDING 5 +#endif +#else +#ifdef KV_OVERLAP +#define DST_BINDING 2 +#define PARAMS_BINDING 3 +#else +#define DST_BINDING 3 +#define PARAMS_BINDING 4 +#endif +#endif + +@group(0) @binding(DST_BINDING) var dst: array>; +@group(0) @binding(PARAMS_BINDING) var params: Params; + +const FLOAT_MIN: f32 = -1.0e9; +const Q_CHUNKS: u32 = HEAD_DIM_QK / 4u; +const V_CHUNKS: u32 = HEAD_DIM_V / 4u; +const SCORE_REGS_PER_LANE: u32 = (KV_TILE + MAX_SUBGROUP_SIZE - 1u) / MAX_SUBGROUP_SIZE; +const OUT_REGS_PER_LANE: u32 = (V_CHUNKS + MAX_SUBGROUP_SIZE - 1u) / MAX_SUBGROUP_SIZE; + +var q_shmem: array; +var kv_shmem: array; +var p_shmem: array; + +@compute @workgroup_size(WG_SIZE) +fn main(@builtin(workgroup_id) wg_id: vec3, + @builtin(local_invocation_id) local_id: vec3, + @builtin(subgroup_id) subgroup_id: u32, + @builtin(subgroup_size) subgroup_size: u32, + @builtin(num_subgroups) num_subgroups: u32, + @builtin(subgroup_invocation_id) sg_inv_id: u32) { + if (subgroup_size == 0u || num_subgroups < Q_TILE) { + return; + } + + let wg_per_head = (params.seq_len_q + Q_TILE - 1u) / Q_TILE; + let wg_per_batch = wg_per_head * params.n_heads; + + let dst2_stride = HEAD_DIM_V * params.n_heads; + let dst3_stride = dst2_stride * params.seq_len_q; + + let batch_idx = wg_id.x / wg_per_batch; + let q_batch_offset = params.offset_q + batch_idx * params.stride_q3; + let k_batch_offset = params.offset_k + batch_idx * params.stride_k3; + let v_batch_offset = 
params.offset_v + batch_idx * params.stride_v3; + let dst_batch_offset = params.offset_dst + batch_idx * dst3_stride; + let wg_in_batch = wg_id.x % wg_per_batch; + + let head_idx = wg_in_batch / wg_per_head; + let q_head_offset = q_batch_offset + head_idx * params.stride_q2; + let k_head_idx = head_idx / params.q_per_kv; + let v_head_offset = v_batch_offset + k_head_idx * params.stride_v2; + let k_head_offset = k_batch_offset + k_head_idx * params.stride_k2; + + let wg_in_head = wg_in_batch % wg_per_head; + let q_row_start = wg_in_head * Q_TILE; + let global_q_row = q_row_start + subgroup_id; + let row_active = subgroup_id < Q_TILE && global_q_row < params.seq_len_q; + +#ifdef MASK + let mask_global_offset = params.offset_mask + batch_idx * params.stride_mask3 + q_row_start * params.seq_len_kv; +#endif + + let dst_global_offset = dst_batch_offset + q_row_start * dst2_stride + head_idx * HEAD_DIM_V; + + let head = f32(head_idx); + let slope = select(1.0, + select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0), + pow(params.m0, head + 1.0), + head < params.n_head_log2), + params.max_bias > 0.0); + + for (var elem_idx = local_id.x; elem_idx < Q_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE) { + let q_tile_row = elem_idx / HEAD_DIM_QK; + let q_col = elem_idx % HEAD_DIM_QK; + let head_q_row = q_row_start + q_tile_row; + let global_q_row_offset = q_head_offset + head_q_row * params.stride_q1; + q_shmem[elem_idx] = f16(select( + 0.0, + Q[global_q_row_offset + q_col] * params.scale, + head_q_row < params.seq_len_q)); + } + + workgroupBarrier(); + + var row_max = FLOAT_MIN; + var exp_sum = 0.0; + var out_regs: array, OUT_REGS_PER_LANE>; + for (var reg_idx = 0u; reg_idx < OUT_REGS_PER_LANE; reg_idx += 1u) { + out_regs[reg_idx] = vec4(0.0); + } + + let q_base = subgroup_id * HEAD_DIM_QK; + let subgroup_p_offset = subgroup_id * KV_TILE; + + for (var kv_tile = 0u; kv_tile < params.seq_len_kv; kv_tile += KV_TILE) { + let kv_count = min(KV_TILE, params.seq_len_kv - kv_tile); + let score_slots = min(SCORE_REGS_PER_LANE, (kv_count + subgroup_size - 1u) / subgroup_size); + let out_slots = min(OUT_REGS_PER_LANE, (V_CHUNKS + subgroup_size - 1u) / subgroup_size); + var local_scores: array; + for (var slot = 0u; slot < SCORE_REGS_PER_LANE; slot += 1u) { + local_scores[slot] = FLOAT_MIN; + } + + for (var vec_idx_local = local_id.x; vec_idx_local < kv_count * Q_CHUNKS; vec_idx_local += WG_SIZE) { + let kv_local = vec_idx_local / Q_CHUNKS; + let chunk = vec_idx_local % Q_CHUNKS; + let global_k_row = kv_tile + kv_local; + let k_vec_index = (k_head_offset + global_k_row * params.stride_k1 + chunk * 4u) >> 2u; + let k4 = K[k_vec_index]; + let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u; + kv_shmem[kv_off + 0u] = k4.x; + kv_shmem[kv_off + 1u] = k4.y; + kv_shmem[kv_off + 2u] = k4.z; + kv_shmem[kv_off + 3u] = k4.w; + } + + workgroupBarrier(); + + var local_max = FLOAT_MIN; + if (row_active) { + for (var slot = 0u; slot < score_slots; slot += 1u) { + let kv_local = sg_inv_id + slot * subgroup_size; + if (kv_local >= kv_count) { + continue; + } + + let global_k_row = kv_tile + kv_local; + var dot_val = 0.0; + for (var chunk = 0u; chunk < Q_CHUNKS; chunk += 1u) { + let q_off = q_base + chunk * 4u; + let qv = vec4( + f32(q_shmem[q_off + 0u]), + f32(q_shmem[q_off + 1u]), + f32(q_shmem[q_off + 2u]), + f32(q_shmem[q_off + 3u])); + let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u; + let kv = vec4( + f32(kv_shmem[kv_off + 0u]), + f32(kv_shmem[kv_off + 1u]), + f32(kv_shmem[kv_off + 2u]), + f32(kv_shmem[kv_off + 3u])); 
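+                        // Both operands are widened to f32 before the dot() below, so the
+                        // running dot_val keeps f32 precision even though Q and K are
+                        // staged in shared memory as f16.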
+ dot_val += dot(qv, kv); + } +#ifdef LOGIT_SOFTCAP + dot_val = params.logit_softcap * tanh(dot_val); +#endif +#ifdef MASK + let mask_idx = mask_global_offset + subgroup_id * params.seq_len_kv + global_k_row; + dot_val += slope * f32(mask[mask_idx]); +#endif + local_scores[slot] = dot_val; + local_max = max(local_max, dot_val); + } + } + + let tile_max = subgroupMax(local_max); + let new_max = max(row_max, tile_max); + let cur_exp = exp(row_max - new_max); + exp_sum *= cur_exp; + for (var reg_idx = 0u; reg_idx < OUT_REGS_PER_LANE; reg_idx += 1u) { + out_regs[reg_idx] *= cur_exp; + } + + var local_sum = 0.0; + for (var slot = 0u; slot < score_slots; slot += 1u) { + let kv_local = sg_inv_id + slot * subgroup_size; + if (row_active && kv_local < kv_count) { + let p = exp(local_scores[slot] - new_max); + p_shmem[subgroup_p_offset + kv_local] = p; + local_sum += p; + } + } + + workgroupBarrier(); + + for (var vec_idx_local = local_id.x; vec_idx_local < kv_count * V_CHUNKS; vec_idx_local += WG_SIZE) { + let kv_local = vec_idx_local / V_CHUNKS; + let chunk = vec_idx_local % V_CHUNKS; + let global_v_row = kv_tile + kv_local; + let v_vec_index = (v_head_offset + global_v_row * params.stride_v1 + chunk * 4u) >> 2u; + let v4 = V[v_vec_index]; + let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u; + kv_shmem[kv_off + 0u] = v4.x; + kv_shmem[kv_off + 1u] = v4.y; + kv_shmem[kv_off + 2u] = v4.z; + kv_shmem[kv_off + 3u] = v4.w; + } + + workgroupBarrier(); + + let tile_sum = subgroupAdd(local_sum); + exp_sum += tile_sum; + row_max = new_max; + + if (row_active) { + for (var reg_idx = 0u; reg_idx < out_slots; reg_idx += 1u) { + let chunk = sg_inv_id + reg_idx * subgroup_size; + if (chunk >= V_CHUNKS) { + continue; + } + + var acc = out_regs[reg_idx]; + for (var kv_local = 0u; kv_local < kv_count; kv_local += 1u) { + let p = p_shmem[subgroup_p_offset + kv_local]; + let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u; + let v4 = vec4( + f32(kv_shmem[kv_off + 0u]), + f32(kv_shmem[kv_off + 1u]), + f32(kv_shmem[kv_off + 2u]), + f32(kv_shmem[kv_off + 3u])); + acc += p * v4; + } + out_regs[reg_idx] = acc; + } + } + + workgroupBarrier(); + } + +#ifdef SINKS + if (row_active) { + let sink_score = sinks[params.offset_sinks + head_idx]; + let sink_max = max(row_max, sink_score); + let sink_scale = exp(row_max - sink_max); + for (var reg_idx = 0u; reg_idx < OUT_REGS_PER_LANE; reg_idx += 1u) { + out_regs[reg_idx] *= sink_scale; + } + exp_sum = exp_sum * sink_scale + exp(sink_score - sink_max); + row_max = sink_max; + } +#endif + + if (row_active) { + let inv_exp_sum = select(0.0, 1.0 / exp_sum, exp_sum != 0.0); + let row_base = dst_global_offset + subgroup_id * dst2_stride; + let out_slots = min(OUT_REGS_PER_LANE, (V_CHUNKS + subgroup_size - 1u) / subgroup_size); + for (var reg_idx = 0u; reg_idx < out_slots; reg_idx += 1u) { + let chunk = sg_inv_id + reg_idx * subgroup_size; + if (chunk >= V_CHUNKS) { + continue; + } + let dst_vec_index = (row_base + chunk * 4u) >> 2u; + dst[dst_vec_index] = out_regs[reg_idx] * inv_exp_sum; + } + } +} diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl index 61107c6a9..b4f7c16c3 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl @@ -15,7 +15,7 @@ struct Params { nblk1: u32, }; -@group(0) @binding(0) var mask: array; +@group(0) @binding(0) var mask: array; @group(0) @binding(1) var blk: array; @group(0) @binding(2) var params: 
Params; diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl index a52575871..b1e234784 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl @@ -1,8 +1,6 @@ -diagnostic(off, chromium.subgroup_matrix_uniformity); diagnostic(off, subgroup_uniformity); enable f16; enable subgroups; -enable chromium_experimental_subgroup_matrix; #ifdef KV_F32 #define KV_TYPE f32 @@ -13,19 +11,14 @@ enable chromium_experimental_subgroup_matrix; #define HEAD_DIM_QK 64 #define HEAD_DIM_V 64 - -#define SG_MAT_M 8 -#define SG_MAT_N 8 -#define SG_MAT_K 8 - -#define Q_TILE SG_MAT_M +#define KV_GRANULARITY 8 #define KV_TILE 16 #define WG_SIZE 64 #ifndef VEC_NE #define VEC_NE 4u #endif -#define KV_BLOCKS (KV_TILE / SG_MAT_N) +#define KV_BLOCKS (KV_TILE / KV_GRANULARITY) #define BLOCK_SIZE 32 #define BLOCKS_K ((HEAD_DIM_QK + BLOCK_SIZE - 1) / BLOCK_SIZE) @@ -97,6 +90,14 @@ struct Params { }; @group(0) @binding(0) var Q: array; +#ifdef KV_OVERLAP +#if defined(KV_Q4_0) || defined(KV_Q8_0) +@group(0) @binding(1) var K: array; +#else +@group(0) @binding(1) var K: array>; +#endif +#define V K +#else #if defined(KV_Q4_0) || defined(KV_Q8_0) @group(0) @binding(1) var K: array; #else @@ -107,7 +108,22 @@ struct Params { #else @group(0) @binding(2) var V: array>; #endif +#endif #if defined(MASK) && defined(SINKS) +#ifdef KV_OVERLAP +@group(0) @binding(2) var mask: array; +@group(0) @binding(3) var sinks: array; +#ifdef BLK +#define BLK_BINDING 4 +#define TMP_BINDING 5 +#define DST_BINDING 6 +#define PARAMS_BINDING 7 +#else +#define TMP_BINDING 4 +#define DST_BINDING 5 +#define PARAMS_BINDING 6 +#endif +#else @group(0) @binding(3) var mask: array; @group(0) @binding(4) var sinks: array; #ifdef BLK @@ -120,7 +136,21 @@ struct Params { #define DST_BINDING 6 #define PARAMS_BINDING 7 #endif +#endif #elif defined(MASK) +#ifdef KV_OVERLAP +@group(0) @binding(2) var mask: array; +#ifdef BLK +#define BLK_BINDING 3 +#define TMP_BINDING 4 +#define DST_BINDING 5 +#define PARAMS_BINDING 6 +#else +#define TMP_BINDING 3 +#define DST_BINDING 4 +#define PARAMS_BINDING 5 +#endif +#else @group(0) @binding(3) var mask: array; #ifdef BLK #define BLK_BINDING 4 @@ -132,16 +162,30 @@ struct Params { #define DST_BINDING 5 #define PARAMS_BINDING 6 #endif +#endif #elif defined(SINKS) +#ifdef KV_OVERLAP +@group(0) @binding(2) var sinks: array; +#define TMP_BINDING 3 +#define DST_BINDING 4 +#define PARAMS_BINDING 5 +#else @group(0) @binding(3) var sinks: array; #define TMP_BINDING 4 #define DST_BINDING 5 #define PARAMS_BINDING 6 +#endif +#else +#ifdef KV_OVERLAP +#define TMP_BINDING 2 +#define DST_BINDING 3 +#define PARAMS_BINDING 4 #else #define TMP_BINDING 3 #define DST_BINDING 4 #define PARAMS_BINDING 5 #endif +#endif #ifdef BLK @group(0) @binding(BLK_BINDING) var blk: array; @@ -153,7 +197,7 @@ struct Params { // Just a very small float value. 
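// It stands in for negative infinity in the online softmax below: padded or
// masked-out KV positions are given FLOAT_MIN as their score, so their
// exp(score - row_max) term underflows to zero and they drop out of the sum.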
const FLOAT_MIN: f32 = -1.0e9; -var q_shmem: array; +var q_shmem: array; #ifndef KV_DIRECT const kv_shmem_size = KV_TILE * max(HEAD_DIM_QK, HEAD_DIM_V); @@ -161,31 +205,27 @@ const kv_shmem_size = KV_TILE * max(HEAD_DIM_QK, HEAD_DIM_V); var kv_shmem: array; #endif -var o_shmem: array; +var o_shmem: array; #ifdef MASK // storage for mask values -var mask_shmem: array; +var mask_shmem: array; #endif // note that we reuse the same storage for both since we only need one at a time -var inter_shmem: array; +var inter_shmem: array; // Storage for row max and exp sum during online softmax -var row_max_shmem: array; -var exp_sum_shmem: array; -var blk_state_wg: u32; - -fn calc_softmax_term(kv_idx: u32, q_tile_row: u32, slope: f32, has_bias: bool, apply_mask: bool) -> f32 { +fn calc_softmax_term(kv_idx: u32, slope: f32, has_bias: bool, apply_mask: bool) -> f32 { var v = select(FLOAT_MIN, - f32(inter_shmem[kv_idx + q_tile_row * KV_TILE]) * params.scale, + f32(inter_shmem[kv_idx]) * params.scale, kv_idx < KV_TILE); #ifdef LOGIT_SOFTCAP v = params.logit_softcap * tanh(v); #endif #ifdef MASK if (apply_mask) { - var mask_val = select(0.0,f32(mask_shmem[q_tile_row * KV_TILE + kv_idx]), kv_idx < KV_TILE); + var mask_val = select(0.0, f32(mask_shmem[kv_idx]), kv_idx < KV_TILE); v += select(mask_val, slope * mask_val, has_bias); } #endif @@ -199,19 +239,17 @@ fn main(@builtin(workgroup_id) wg_id: vec3, @builtin(subgroup_size) subgroup_size: u32, @builtin(num_subgroups) num_subgroups: u32, @builtin(subgroup_invocation_id) sg_inv_id: u32) { + // Vec path processes exactly one query row per workgroup, so subgroup 0 can + // keep the running softmax state in private storage. + var row_max = FLOAT_MIN; + var exp_sum = 0.0; - // initialize row max for online softmax - for (var i = local_id.x; i < Q_TILE; i += WG_SIZE) { - row_max_shmem[i] = FLOAT_MIN; - exp_sum_shmem[i] = 0.0; - } - - for (var i = local_id.x; i < Q_TILE * HEAD_DIM_V; i += WG_SIZE) { + for (var i = local_id.x; i < HEAD_DIM_V; i += WG_SIZE) { o_shmem[i] = 0.0; } // workgroups per head/batch - let wg_per_head = (params.seq_len_q + Q_TILE - 1u) / Q_TILE; + let wg_per_head = params.seq_len_q; let wg_per_batch = wg_per_head * params.n_heads; let dst2_stride = HEAD_DIM_V * params.n_heads; @@ -235,9 +273,9 @@ fn main(@builtin(workgroup_id) wg_id: vec3, let k_head_offset = k_batch_offset + k_head_idx * params.stride_k2; let v_head_offset = v_batch_offset + v_head_idx * params.stride_v2; - // starting Q row for this workgroup + // Vec path handles one Q row per workgroup. 
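The private (row_max, exp_sum) pair introduced above is the standard online
softmax: whenever a new KV tile raises the running max, the accumulated sum and
output are rescaled by exp(old_max - new_max) before the tile's own terms are
added. A worked scalar example with invented scores:

    scores arrive in two tiles: {1.0, 3.0}, then {5.0}
    tile 1:  max = 3,  sum = e^(1-3) + e^(3-3)           = 1.1353
    tile 2:  new max = 5,  rescale: sum *= e^(3-5)       = 0.1536
             add e^(5-5) = 1.0                 ->  sum   = 1.1536
    identical to softmax over {1, 3, 5} computed in a single pass

The output accumulator o_shmem receives the same exp(old_max - new_max) scale,
which is what keeps O = P * V consistent across tiles.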
let wg_in_head = wg_in_batch % wg_per_head; - let q_row_start = wg_in_head * Q_TILE; + let q_row_start = wg_in_head; #ifdef MASK // mask offset @@ -248,21 +286,18 @@ fn main(@builtin(workgroup_id) wg_id: vec3, let has_bias = params.max_bias > 0.0; let slope = select(1.0, select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0), pow(params.m0, head + 1.0), head < params.n_head_log2), has_bias); - // load q tile into shared memory - for (var elem_idx = local_id.x; elem_idx < Q_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE) { - let q_row = elem_idx / HEAD_DIM_QK; - let q_col = elem_idx % HEAD_DIM_QK; - let head_q_row = q_row_start + q_row; - let global_q_row_offset = q_head_offset + head_q_row * params.stride_q1; + // load the single Q row into shared memory + for (var elem_idx = local_id.x; elem_idx < HEAD_DIM_QK; elem_idx += WG_SIZE) { + let global_q_row_offset = q_head_offset + q_row_start * params.stride_q1; q_shmem[elem_idx] = f16(select( 0.0, - Q[global_q_row_offset + q_col], - head_q_row < params.seq_len_q && q_col < HEAD_DIM_QK)); + Q[global_q_row_offset + elem_idx], + q_row_start < params.seq_len_q)); } for (var kv_tile = iwg * KV_TILE; kv_tile < params.seq_len_kv; kv_tile += KV_TILE * params.nwg) { #ifdef BLK - let q_blk = q_row_start / Q_TILE; + let q_blk = q_row_start; let kv_blk = kv_tile / KV_TILE; let blk_batch = select(0u, batch_idx, params.stride_mask3 > 0u); let blk_idx = params.blk_base + (blk_batch * params.blk_nblk1 + q_blk) * params.blk_nblk0 + kv_blk; @@ -270,13 +305,9 @@ fn main(@builtin(workgroup_id) wg_id: vec3, #else let blk_state_local = 1u; #endif - if (local_id.x == 0u) { - blk_state_wg = blk_state_local; - } - workgroupBarrier(); - let blk_state = blk_state_wg; + let blk_state = blk_state_local; let skip_tile = blk_state == 0u; - for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) { + for (var elem_idx = local_id.x; elem_idx < KV_TILE; elem_idx += WG_SIZE) { inter_shmem[elem_idx] = f16(0.0); } @@ -360,20 +391,14 @@ fn main(@builtin(workgroup_id) wg_id: vec3, let num_of_threads = subgroup_size / VEC_NE; let tx = sg_inv_id % num_of_threads; let ty = sg_inv_id / num_of_threads; - for (var q_tile_row = subgroup_id; q_tile_row < Q_TILE; q_tile_row += num_subgroups) { - let global_q_row = q_row_start + q_tile_row; - if (global_q_row >= params.seq_len_q) { - continue; - } - let local_q_row_offset = q_tile_row * HEAD_DIM_QK; - + if (subgroup_id == 0u && q_row_start < params.seq_len_q) { for (var kv_base : u32 = 0u; kv_base < KV_TILE; kv_base += VEC_NE) { let kv_idx = kv_base + ty; var partial_sum: f32 = 0.0; let kv_valid = kv_idx < KV_TILE && (kv_tile + kv_idx) < params.seq_len_kv; if (kv_valid) { for (var i = tx; i < (HEAD_DIM_QK / 4u); i += num_of_threads) { - let q_off = local_q_row_offset + i * 4u; + let q_off = i * 4u; let qv = vec4( f32(q_shmem[q_off + 0u]), @@ -410,8 +435,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3, let sum_bcast = subgroupShuffle(sum, num_of_threads * ty); if (tx == 0u && kv_valid) { - let dst_idx = q_tile_row * KV_TILE + kv_idx; - inter_shmem[dst_idx] = f16(sum_bcast); + inter_shmem[kv_idx] = f16(sum_bcast); } } } @@ -422,13 +446,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3, let apply_mask = !skip_tile && (blk_state != 2u); if (apply_mask) { // load mask tile into shared memory for this KV block - for (var elem_idx = local_id.x; elem_idx < Q_TILE * KV_TILE; elem_idx += WG_SIZE) { - let mask_row = elem_idx / KV_TILE; - let mask_col = elem_idx % KV_TILE; - let global_q_row = q_row_start + mask_row; - let 
global_k_col = kv_tile + mask_col; - let mask_in_bounds = global_q_row < params.seq_len_q && global_k_col < params.seq_len_kv; - let mask_idx = mask_global_offset + mask_row * params.seq_len_kv + global_k_col; + for (var elem_idx = local_id.x; elem_idx < KV_TILE; elem_idx += WG_SIZE) { + let global_k_col = kv_tile + elem_idx; + let mask_in_bounds = q_row_start < params.seq_len_q && global_k_col < params.seq_len_kv; + let mask_idx = mask_global_offset + global_k_col; mask_shmem[elem_idx] = select(0.0, mask[mask_idx], mask_in_bounds); } } @@ -439,50 +460,40 @@ fn main(@builtin(workgroup_id) wg_id: vec3, workgroupBarrier(); // online softmax - if (!skip_tile) { - for (var q_tile_row = subgroup_id; q_tile_row < Q_TILE; q_tile_row += num_subgroups) { - let global_q_row = q_row_start + q_tile_row; - if (global_q_row >= params.seq_len_q) { - break; - } + if (!skip_tile && subgroup_id == 0u && q_row_start < params.seq_len_q) { + var prev_max = row_max; + var final_max = prev_max; + // pass 1: compute final max across the full KV tile in chunks + for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) { + let kv_idx = kv_offset + sg_inv_id; + let kv_valid = kv_tile + kv_idx < params.seq_len_kv && kv_idx < KV_TILE; + let softmax_term = select(FLOAT_MIN, + calc_softmax_term(kv_idx, slope, has_bias, apply_mask), + kv_valid); + final_max = subgroupMax(max(final_max, softmax_term)); + } - var prev_max = row_max_shmem[q_tile_row]; - var final_max = prev_max; - // pass 1: compute final max across the full KV tile in chunks - for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) { - let kv_idx = kv_offset + sg_inv_id; - let kv_valid = kv_tile + kv_idx < params.seq_len_kv && kv_idx < KV_TILE; - let softmax_term = select(FLOAT_MIN, - calc_softmax_term(kv_idx, q_tile_row, slope, has_bias, apply_mask), - kv_valid); - final_max = subgroupMax(max(final_max, softmax_term)); + var total_exp_term: f32 = 0.0; + // pass 2: compute exp sum and write P using final_max + for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) { + let kv_idx = kv_offset + sg_inv_id; + let softmax_term = calc_softmax_term(kv_idx, slope, has_bias, apply_mask); + let cur_p = select(0.0, + exp(softmax_term - final_max), + kv_tile + kv_idx < params.seq_len_kv && kv_idx < KV_TILE); + total_exp_term += subgroupAdd(cur_p); + if (kv_idx < KV_TILE) { + inter_shmem[kv_idx] = f16(cur_p); } + } - var total_exp_term: f32 = 0.0; - // pass 2: compute exp sum and write P using final_max - for (var kv_offset = 0u; kv_offset < KV_TILE; kv_offset += subgroup_size) { - let kv_idx = kv_offset + sg_inv_id; - let softmax_term = calc_softmax_term(kv_idx, q_tile_row, slope, has_bias, apply_mask); - let cur_p = select(0.0, - exp(softmax_term - final_max), - kv_tile + kv_idx < params.seq_len_kv && kv_idx < KV_TILE); - total_exp_term += subgroupAdd(cur_p); - if (kv_idx < KV_TILE) { - inter_shmem[kv_idx + q_tile_row * KV_TILE] = f16(cur_p); - } - } + let cur_exp = exp(prev_max - final_max); - let cur_exp = exp(prev_max - final_max); + row_max = final_max; + exp_sum = exp_sum * cur_exp + total_exp_term; - if (sg_inv_id == 0) { - row_max_shmem[q_tile_row] = final_max; - exp_sum_shmem[q_tile_row] = exp_sum_shmem[q_tile_row] * cur_exp + total_exp_term; - } - - for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) { - let idx = q_tile_row * HEAD_DIM_V + elem_idx; - o_shmem[idx] = f16(f32(o_shmem[idx]) * cur_exp); - } + for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += 
subgroup_size) { + o_shmem[elem_idx] = f16(f32(o_shmem[elem_idx]) * cur_exp); } } @@ -562,15 +573,13 @@ fn main(@builtin(workgroup_id) wg_id: vec3, workgroupBarrier(); if (!skip_tile) { - // we have P (Q_TILE x KV_TILE) in inter_shmem and V (KV_TILE x head_dim_v) in kv_shmem + // we have P (KV_TILE) in inter_shmem and V (KV_TILE x head_dim_v) in kv_shmem // we want to compute O += P * V across the full KV tile let ne_threads : u32 = VEC_NE; let nl_threads = max(1u, subgroup_size / ne_threads); let tx_pv = sg_inv_id % nl_threads; let ty_pv = sg_inv_id / nl_threads; - for (var q_tile_row = subgroup_id; - q_tile_row < Q_TILE; - q_tile_row += num_subgroups) { + if (subgroup_id == 0u && q_row_start < params.seq_len_q) { for (var vec_col = tx_pv; vec_col < (HEAD_DIM_V / 4u); vec_col += nl_threads) { var lo = vec4(0.0, 0.0, 0.0, 0.0); for (var cc = 0u; cc < KV_TILE / ne_threads; cc += 1u) { @@ -580,7 +589,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3, continue; } - let p = f32(inter_shmem[kv_idx + q_tile_row * KV_TILE]); + let p = f32(inter_shmem[kv_idx]); #ifdef KV_DIRECT let v_idx = v_head_offset + v_row * params.stride_v1 + vec_col * 4u; let v4 = vec4(V[v_idx >> 2u]); @@ -621,11 +630,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3, if (ty_pv == 0u) { let elem_base = vec_col * 4u; - let o_base_idx = q_tile_row * HEAD_DIM_V + elem_base; - o_shmem[o_base_idx + 0u] = f16(f32(o_shmem[o_base_idx + 0u]) + lo_x); - o_shmem[o_base_idx + 1u] = f16(f32(o_shmem[o_base_idx + 1u]) + lo_y); - o_shmem[o_base_idx + 2u] = f16(f32(o_shmem[o_base_idx + 2u]) + lo_z); - o_shmem[o_base_idx + 3u] = f16(f32(o_shmem[o_base_idx + 3u]) + lo_w); + o_shmem[elem_base + 0u] = f16(f32(o_shmem[elem_base + 0u]) + lo_x); + o_shmem[elem_base + 1u] = f16(f32(o_shmem[elem_base + 1u]) + lo_y); + o_shmem[elem_base + 2u] = f16(f32(o_shmem[elem_base + 2u]) + lo_z); + o_shmem[elem_base + 3u] = f16(f32(o_shmem[elem_base + 3u]) + lo_w); } } } @@ -637,70 +645,46 @@ fn main(@builtin(workgroup_id) wg_id: vec3, #ifdef SINKS // Sinks are global terms and must be applied exactly once across split workgroups. 
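// Folding the sink in reuses the same rescale step as a KV tile: the per-head
// sink value acts as one extra score, a new max is taken, the accumulated
// output and exp_sum are scaled by exp(old_max - new_max), and
// exp(sink - new_max) joins the sum. Gating on iwg == 0 is what keeps the sink
// from being counted params.nwg times when the KV range is split.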
- if (iwg == 0u) { - for (var q_tile_row = subgroup_id; - q_tile_row < Q_TILE; - q_tile_row += num_subgroups) { - let global_q_row = q_row_start + q_tile_row; - if (global_q_row >= params.seq_len_q) { - break; - } + if (iwg == 0u && subgroup_id == 0u && q_row_start < params.seq_len_q) { + var prev_max = row_max; - var prev_max = row_max_shmem[q_tile_row]; + // for non-sink threads, exp(FLOAT_MIN) effectively zeroes out their contribution to the sum + let sink_val = select(FLOAT_MIN, sinks[params.offset_sinks + head_idx], sg_inv_id == 0u); + let new_max = subgroupMax(max(prev_max, sink_val)); + let max_exp = exp(prev_max - new_max); + let sink_exp = exp(sink_val - new_max); - // for non-sink threads, exp(FLOAT_MIN) effectively zeroes out their contribution to the sum - let sink_val = select(FLOAT_MIN, sinks[params.offset_sinks + head_idx], sg_inv_id == 0); - let new_max = subgroupMax(max(prev_max, sink_val)); - let max_exp = exp(prev_max - new_max); - let sink_exp = exp(sink_val - new_max); + let sink_exp_sum = subgroupAdd(sink_exp); - let sink_exp_sum = subgroupAdd(sink_exp); + row_max = new_max; + exp_sum = exp_sum * max_exp + sink_exp_sum; - if (sg_inv_id == 0) { - row_max_shmem[q_tile_row] = new_max; - exp_sum_shmem[q_tile_row] = exp_sum_shmem[q_tile_row] * max_exp + sink_exp_sum; - } - - for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) { - let idx = q_tile_row * HEAD_DIM_V + elem_idx; - o_shmem[idx] = f16(f32(o_shmem[idx]) * max_exp); - } + for (var elem_idx = sg_inv_id; elem_idx < HEAD_DIM_V; elem_idx += subgroup_size) { + o_shmem[elem_idx] = f16(f32(o_shmem[elem_idx]) * max_exp); } - workgroupBarrier(); } + workgroupBarrier(); #endif let rows_per_batch = params.n_heads * params.seq_len_q; - for (var q_tile_row = subgroup_id; - q_tile_row < Q_TILE; - q_tile_row += num_subgroups) { - - let global_q_row = q_row_start + q_tile_row; - if (global_q_row >= params.seq_len_q) { break; } - + if (subgroup_id == 0u && q_row_start < params.seq_len_q) { if (params.nwg == 1u) { - let exp_sum = exp_sum_shmem[q_tile_row]; let scale = select(0.0, 1.0 / exp_sum, exp_sum != 0.0); - let row_base: u32 = - params.offset_dst + batch_idx * dst3_stride + global_q_row * dst2_stride + head_idx * HEAD_DIM_V; + let row_base: u32 = params.offset_dst + batch_idx * dst3_stride + q_row_start * dst2_stride + + head_idx * HEAD_DIM_V; for (var elem_base = sg_inv_id * 4u; elem_base < HEAD_DIM_V; elem_base += subgroup_size * 4u) { - let i0 = q_tile_row * HEAD_DIM_V + (elem_base + 0u); - let i1 = q_tile_row * HEAD_DIM_V + (elem_base + 1u); - let i2 = q_tile_row * HEAD_DIM_V + (elem_base + 2u); - let i3 = q_tile_row * HEAD_DIM_V + (elem_base + 3u); - let v = vec4( - f32(o_shmem[i0]) * scale, - f32(o_shmem[i1]) * scale, - f32(o_shmem[i2]) * scale, - f32(o_shmem[i3]) * scale + f32(o_shmem[elem_base + 0u]) * scale, + f32(o_shmem[elem_base + 1u]) * scale, + f32(o_shmem[elem_base + 2u]) * scale, + f32(o_shmem[elem_base + 3u]) * scale ); let dst_vec_index: u32 = (row_base + elem_base) >> 2u; dst[dst_vec_index] = v; } } else { - let rid = batch_idx * rows_per_batch + head_idx * params.seq_len_q + global_q_row; + let rid = batch_idx * rows_per_batch + head_idx * params.seq_len_q + q_row_start; let tmp_row_data_base = params.tmp_data_base + rid * (HEAD_DIM_V * params.nwg) + iwg * HEAD_DIM_V; let tmp_row_stats_base = params.tmp_stats_base + rid * (2u * params.nwg) + 2u * iwg; @@ -708,21 +692,16 @@ fn main(@builtin(workgroup_id) wg_id: vec3, elem_base < HEAD_DIM_V; elem_base += subgroup_size * 4u) { - 
let i0 = q_tile_row * HEAD_DIM_V + (elem_base + 0u); - let i1 = q_tile_row * HEAD_DIM_V + (elem_base + 1u); - let i2 = q_tile_row * HEAD_DIM_V + (elem_base + 2u); - let i3 = q_tile_row * HEAD_DIM_V + (elem_base + 3u); - let tbase = tmp_row_data_base + elem_base; - tmp[tbase + 0u] = f32(o_shmem[i0]); - tmp[tbase + 1u] = f32(o_shmem[i1]); - tmp[tbase + 2u] = f32(o_shmem[i2]); - tmp[tbase + 3u] = f32(o_shmem[i3]); + tmp[tbase + 0u] = f32(o_shmem[elem_base + 0u]); + tmp[tbase + 1u] = f32(o_shmem[elem_base + 1u]); + tmp[tbase + 2u] = f32(o_shmem[elem_base + 2u]); + tmp[tbase + 3u] = f32(o_shmem[elem_base + 3u]); } if (sg_inv_id == 0u) { - tmp[tmp_row_stats_base + 0u] = exp_sum_shmem[q_tile_row]; - tmp[tmp_row_stats_base + 1u] = row_max_shmem[q_tile_row]; + tmp[tmp_row_stats_base + 0u] = exp_sum; + tmp[tmp_row_stats_base + 1u] = row_max; } } } From a702f39597c9fa6d8e4178c16fdb3e4367b7cf38 Mon Sep 17 00:00:00 2001 From: Shreya Jain Date: Fri, 24 Apr 2026 12:21:36 -0700 Subject: [PATCH 33/35] CI Snapdragon: Switch ubuntu-latest to ubuntu-slim runner (#22303) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * switch ubuntu-latest to ubuntu-slim * Fix the path for upload so CI doesn't fail * Update .github/workflows/build-and-test-snapdragon.yml Co-authored-by: Sigbjørn Skjæret * Use -slim image for key check and consistent naming for artifact dir Signed-off-by: Max Krasnyansky * Remove check-secret extra job * move QDC key check for Run QDC jobs step specifically * add a step before to check the secret for qdc jobs --------- Signed-off-by: Max Krasnyansky Co-authored-by: Max Krasnyansky Co-authored-by: Sigbjørn Skjæret --- .../workflows/build-and-test-snapdragon.yml | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/.github/workflows/build-and-test-snapdragon.yml b/.github/workflows/build-and-test-snapdragon.yml index 7eb204ea2..deed8e808 100644 --- a/.github/workflows/build-and-test-snapdragon.yml +++ b/.github/workflows/build-and-test-snapdragon.yml @@ -49,28 +49,19 @@ jobs: cp docs/backend/snapdragon/CMakeUserPresets.json . 
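          # the presets file copied above is expected to define the
          # arm64-android-snapdragon-release preset consumed below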
cmake --preset arm64-android-snapdragon-release -B build cmake --build build - cmake --install build --prefix pkg-adb/llama.cpp + cmake --install build --prefix pkg-snapdragon/llama.cpp - name: Upload Llama.CPP Snapdragon Android Build Artifact if: ${{ always() && steps.build_llama_cpp_snapdragon_android.outcome == 'success' }} uses: actions/upload-artifact@v6 with: name: llama-cpp-android-arm64-snapdragon - path: pkg-adb/llama.cpp - - check-secret: - runs-on: ubuntu-latest - outputs: - has-key: ${{ steps.check.outputs.has-key }} - steps: - - id: check - run: echo "has-key=${{ secrets.QDC_API_KEY != '' }}" >> "$GITHUB_OUTPUT" + path: pkg-snapdragon/llama.cpp test-snapdragon-qdc: name: Test on QDC Android Device (${{ matrix.device }}) - needs: [android-ndk-snapdragon, check-secret] - if: needs.check-secret.outputs.has-key == 'true' - runs-on: ubuntu-latest + needs: [android-ndk-snapdragon] + runs-on: ubuntu-slim strategy: fail-fast: false matrix: @@ -81,10 +72,10 @@ jobs: uses: actions/checkout@v6 - name: Download build artifact - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v7 with: name: llama-cpp-android-arm64-snapdragon - path: pkg-snapdragon/ + path: pkg-snapdragon/llama.cpp - name: Set up Python uses: actions/setup-python@v5 @@ -92,13 +83,25 @@ jobs: python-version: '3.x' cache: pip + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y curl unzip + - name: Install QDC SDK wheel run: | curl -fSL -o qdc_sdk.zip https://softwarecenter.qualcomm.com/api/download/software/tools/Qualcomm_Device_Cloud_SDK/All/0.2.3/qualcomm_device_cloud_sdk-0.2.3.zip unzip qdc_sdk.zip -d qdc_sdk pip install qdc_sdk/qualcomm_device_cloud_sdk-0.2.3-py3-none-any.whl + - name: Check QDC API key + id: check_secret + env: + QDC_API_KEY: ${{ secrets.QDC_API_KEY }} + run: echo "has-qdc-key=${{ env.QDC_API_KEY != '' }}" >> "$GITHUB_OUTPUT" + - name: Run QDC tests (${{ matrix.device }}) + if: steps.check_secret.outputs.has-qdc-key == 'true' run: | python scripts/snapdragon/qdc/run_qdc_jobs.py \ --test all \ From 361fe72acb7b9bd79059cc177cbeda99b35b5db9 Mon Sep 17 00:00:00 2001 From: Trivikram Reddy <127072883+trivikram-reddy1@users.noreply.github.com> Date: Fri, 24 Apr 2026 15:55:17 -0500 Subject: [PATCH 34/35] Hexagon: Bump HMX Frequency to Max Corner (#22334) * hexagon: bump HMX freq to max corner * hex-mm: fix error in log msg --- ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c | 2 +- ggml/src/ggml-hexagon/htp/main.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c index dbca8220f..05e3c6c2b 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c @@ -1683,7 +1683,7 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict __fp16 *vtcm_scales = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256); assert((size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base) <= vtcm_budget); - FARF(HIGH, "hmx-mm: m=%d k=%d n=%d wtype=%d block M=%zu N=%zu K=%zu vtcm=%zu/%zu", __func__, m, k, n, weight_type, + FARF(HIGH, "hmx-mm: m=%d k=%d n=%d wtype=%d block M=%zu N=%zu K=%zu vtcm=%zu/%zu", m, k, n, weight_type, M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE, (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget); // initialize eye tile (32x32 identity matrix) diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index db277a25e..62942f638 100644 --- 
a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -101,6 +101,24 @@ AEEResult htp_iface_open(const char * uri, remote_handle64 * handle) { } } + { + // Set HMX clock + HAP_power_request_t request; + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_HMX_v2; + request.hmx_v2.set_clock = TRUE; + request.hmx_v2.target_corner = HAP_DCVS_EXP_VCORNER_MAX; + request.hmx_v2.min_corner = HAP_DCVS_EXP_VCORNER_MAX; + request.hmx_v2.max_corner = HAP_DCVS_EXP_VCORNER_MAX; + request.hmx_v2.perf_mode = HAP_CLK_PERF_HIGH; + FARF(ALWAYS, "Setting HMX clock\n"); + err = HAP_power_set((void *) &ctx, &request); + if (err != AEE_SUCCESS) { + FARF(ERROR, "Error setting HMX clock."); + return err; + } + } + return AEE_SUCCESS; } From 0adede866ddb2e31992b3792eaea31d18ed89acf Mon Sep 17 00:00:00 2001 From: "Piotr Wilkin (ilintar)" Date: Fri, 24 Apr 2026 23:19:55 +0200 Subject: [PATCH 35/35] parser: fix structured output bug (#22302) * fix very stupid structured output bug * Things just cannot be too easy. --- scripts/server-test-structured.py | 78 +++++++++++++++++++++++++++---- tools/server/server-common.cpp | 4 +- 2 files changed, 72 insertions(+), 10 deletions(-) diff --git a/scripts/server-test-structured.py b/scripts/server-test-structured.py index 98ff473b9..da217fc46 100755 --- a/scripts/server-test-structured.py +++ b/scripts/server-test-structured.py @@ -3,8 +3,12 @@ Test structured output capability via chat completions endpoint. Each test case contains: - - response_format: OpenAI-compatible response_format specification - (json_schema only — llama.cpp does not support json_object) + - response_format: OpenAI-compatible response_format specification. + Both "json_schema" and "json_object" are accepted; with + "json_object" a schema can be supplied via extra_body. + - extra_body (optional): dict of extra top-level request fields merged into + the request payload (mirrors the OpenAI SDK's extra_body + feature; llama.cpp reads a top-level "json_schema" here). 
- messages: initial conversation messages - tools (optional): tool definitions (for mixed tool + structured tests) - mock_tool_responses (optional): dict mapping tool_name -> callable(arguments) -> str (JSON) @@ -81,11 +85,14 @@ def print_info(msg): _print(f"{DIM}{msg}{RESET}") -def print_schema_note(label, rf): +def print_schema_note(label, rf, extra_body=None): kind = rf.get("type", "?") name = "" if kind == "json_schema": name = rf.get("json_schema", {}).get("name", "") + elif kind == "json_object" and extra_body and "json_schema" in extra_body: + extra_schema = extra_body["json_schema"] or {} + name = extra_schema.get("title") or "extra_body.json_schema" _print(f"{DIM}{MAGENTA} ⟐ response_format [{label}]: {kind}" f"{(' / ' + name) if name else ''}{RESET}") @@ -95,17 +102,20 @@ def print_schema_note(label, rf): # --------------------------------------------------------------------------- -def chat_completion(url, messages, tools=None, response_format=None, stream=False): +def chat_completion(url, messages, tools=None, response_format=None, stream=False, + extra_body=None): payload = { "messages": messages, "stream": stream, - "max_tokens": 4096, + "max_tokens": 8192, } if tools: payload["tools"] = tools payload["tool_choice"] = "auto" if response_format is not None: payload["response_format"] = response_format + if extra_body: + payload.update(extra_body) try: response = requests.post(url, json=payload, stream=stream) @@ -180,7 +190,7 @@ def chat_completion(url, messages, tools=None, response_format=None, stream=Fals def run_tool_loop( url, messages, tools, mock_tool_responses, stream, response_format=None, - max_turns=6, + extra_body=None, max_turns=6, ): """ Drive the tool-call loop. If response_format is provided it is applied to @@ -191,7 +201,8 @@ def run_tool_loop( for _ in range(max_turns): result = chat_completion( - url, msgs, tools=tools, response_format=response_format, stream=stream + url, msgs, tools=tools, response_format=response_format, stream=stream, + extra_body=extra_body, ) if result is None: return all_tool_calls, msgs, None @@ -274,7 +285,8 @@ def run_test(url, test_case, stream): print_header(f"{name} [{mode}] ({apply_stage})") response_format = test_case["response_format"] - print_schema_note(apply_stage, response_format) + extra_body = test_case.get("extra_body") + print_schema_note(apply_stage, response_format, extra_body) tools = test_case.get("tools") mocks = test_case.get("mock_tool_responses") or {} @@ -290,6 +302,7 @@ def run_test(url, test_case, stream): mock_tool_responses=mocks, stream=stream, response_format=response_format, + extra_body=extra_body, ) elif apply_stage == "after_tools": # Phase 1: plain tool loop, no response_format applied yet. @@ -314,7 +327,8 @@ def run_test(url, test_case, stream): # model focuses on producing the schema-constrained answer. 
_print(f"\n{DIM}{MAGENTA} ⟐ follow-up turn with response_format applied{RESET}") result = chat_completion( - url, msgs, tools=None, response_format=response_format, stream=stream + url, msgs, tools=None, response_format=response_format, stream=stream, + extra_body=extra_body, ) final_content = result["content"] if result else None else: @@ -481,6 +495,51 @@ def _validate_sentiment(parsed): return True, f"sentiment={parsed['sentiment']} conf={conf} kws={kws}" +# ---- Test: json_object + extra_body.json_schema (always) ---- +# +# Exercises the llama.cpp-specific path where the OpenAI SDK would send +# response_format={"type": "json_object"} and tunnel the schema through +# extra_body.json_schema (which becomes a top-level "json_schema" field on +# the request body). + +_PRODUCT_JSON_OBJECT_SCHEMA = { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://example.com/product.schema.json", + "title": "Product", + "description": "A product in the catalog", + "type": "object", +} + +PRODUCT_JSON_OBJECT_TEST_CASE = { + "name": "json_object response_format with extra_body json_schema", + "response_format": {"type": "json_object"}, + "extra_body": {"json_schema": _PRODUCT_JSON_OBJECT_SCHEMA}, + "apply_stage": "always", + "messages": [ + { + "role": "system", + "content": ( + "Extract structured data from the provided text according to the " + "JSON schema. Return only valid JSON matching the schema exactly." + ), + }, + { + "role": "user", + "content": "Product: Wireless Headphones, ID: 101, In Stock: Yes", + }, + ], + "validate": lambda parsed, tcs, raw: _validate_product_json_object(parsed), +} + + +def _validate_product_json_object(parsed): + if not isinstance(parsed, dict): + return False, f"expected JSON object, got {type(parsed).__name__}: {parsed!r}" + if not parsed: + return False, f"expected non-empty object, got {parsed!r}" + return True, f"product object with {len(parsed)} field(s): {sorted(parsed.keys())}" + + # ---- Test 3: Nested recipe schema (always) ---- _RECIPE_SCHEMA = { @@ -915,6 +974,7 @@ def _validate_country_report(parsed, tcs): ALL_TEST_CASES = [ BOOK_TEST_CASE, SENTIMENT_TEST_CASE, + PRODUCT_JSON_OBJECT_TEST_CASE, RECIPE_TEST_CASE, SHOP_COMPARISON_TEST_CASE, COUNTRY_REPORT_TEST_CASE, diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index ad8834e31..21c843c0d 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -947,7 +947,9 @@ json oaicompat_chat_params_parse( json response_format = json_value(body, "response_format", json::object()); std::string response_type = json_value(response_format, "type", std::string()); if (response_type == "json_object") { - json_schema = json_value(response_format, "schema", json::object()); + if (response_format.contains("schema") || json_schema.empty()) { + json_schema = json_value(response_format, "schema", json::object()); + } } else if (response_type == "json_schema") { auto schema_wrapper = json_value(response_format, "json_schema", json::object()); json_schema = json_value(schema_wrapper, "schema", json::object());
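
The server-side change above fixes the schema precedence for "json_object"
requests: previously that branch unconditionally overwrote json_schema with
response_format["schema"] (an empty object when the key is absent), which
clobbered any schema supplied as a top-level "json_schema" field on the
request body, exactly the value that the new PRODUCT_JSON_OBJECT_TEST_CASE
tunnels through extra_body. After the fix, response_format["schema"] only
wins when it is actually present, or when no top-level schema was given at
all. Below is a minimal standalone sketch of the resulting resolution order,
assuming nlohmann::json and re-declaring a simplified json_value helper for
illustration (the server uses its own template helper of the same name):

    #include <iostream>
    #include <string>
    #include <nlohmann/json.hpp>

    using json = nlohmann::ordered_json;

    // simplified stand-in for llama.cpp's json_value helper
    static json json_value(const json & body, const std::string & key, const json & def) {
        return (body.contains(key) && !body.at(key).is_null()) ? body.at(key) : def;
    }

    static json resolve_schema(const json & body) {
        // top-level "json_schema", e.g. tunneled via the OpenAI SDK's extra_body
        json json_schema     = json_value(body, "json_schema", json::object());
        json response_format = json_value(body, "response_format", json::object());
        std::string response_type =
            json_value(response_format, "type", json("")).get<std::string>();
        if (response_type == "json_object") {
            // post-fix rule: only overwrite when an explicit schema is present,
            // or when no top-level schema was supplied at all
            if (response_format.contains("schema") || json_schema.empty()) {
                json_schema = json_value(response_format, "schema", json::object());
            }
        } else if (response_type == "json_schema") {
            json wrapper = json_value(response_format, "json_schema", json::object());
            json_schema  = json_value(wrapper, "schema", json::object());
        }
        return json_schema;
    }

    int main() {
        json body = {
            { "response_format", { { "type", "json_object" } } },
            { "json_schema",     { { "title", "Product" }, { "type", "object" } } },
        };
        // pre-fix this printed {}; post-fix it keeps the tunneled Product schema
        std::cout << resolve_schema(body).dump(2) << std::endl;
    }

With a bare {"type": "json_object"} and no top-level schema, json_schema
stays empty and the behavior is unchanged: output is constrained to be any
valid JSON object.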