From 9c92e96a64fe0f03f5f3e5ab720a151941da1de5 Mon Sep 17 00:00:00 2001
From: fairydreaming <166155368+fairydreaming@users.noreply.github.com>
Date: Fri, 22 May 2026 11:55:29 +0200
Subject: [PATCH 01/17] cmake : build router app only during standalone builds
 (#23521)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 56eb608ea..4f7f4eca6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -108,7 +108,7 @@ option(LLAMA_BUILD_TESTS     "llama: build tests"
 option(LLAMA_BUILD_TOOLS     "llama: build tools"                                                                ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES  "llama: build examples"                                                             ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER    "llama: build server example"                                                       ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_APP       "llama: build the unified binary"                                                   ON)
+option(LLAMA_BUILD_APP       "llama: build the unified binary"                                                   ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_UI        "llama: build the embedded Web UI for server"                                       ON)
 option(LLAMA_USE_PREBUILT_UI "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" ON)
 

From 99d4026b116605ed8e1f3ab179b3c63bc4637195 Mon Sep 17 00:00:00 2001
From: Sachin Sharma <sachin@zettabolt.com>
Date: Fri, 22 May 2026 16:46:55 +0530
Subject: [PATCH 02/17] ggml-zendnn : add Q8_0 quantization support (#23414)

* ggml-zendnn : add Q8_0 quantization support

* ggml-zendnn : sync with latest ZenDNN

* ggml-zendnn : address review comments for Q8_0
---
 ggml/src/ggml-zendnn/CMakeLists.txt  |  2 +-
 ggml/src/ggml-zendnn/ggml-zendnn.cpp | 56 ++++++++++++++++++++++------
 2 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/ggml/src/ggml-zendnn/CMakeLists.txt b/ggml/src/ggml-zendnn/CMakeLists.txt
index f1e4f991f..e4ba9cfbd 100644
--- a/ggml/src/ggml-zendnn/CMakeLists.txt
+++ b/ggml/src/ggml-zendnn/CMakeLists.txt
@@ -28,7 +28,7 @@ if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF")
     ExternalProject_Add(
         zendnn
         GIT_REPOSITORY https://github.com/amd/ZenDNN.git
-        GIT_TAG ac9e580d9434b7b98985f2627a7ebfb5eba4bb0d    # ZenDNN-2026-WW17
+        GIT_TAG 253b94ce0d7e9284c265fefb485714944caff9d3    # ZenDNN-2026-WW19
         PREFIX      ${ZENDNN_PREFIX}
         SOURCE_DIR  ${ZENDNN_SOURCE_DIR}
         BINARY_DIR  ${ZENDNN_BUILD_DIR}
diff --git a/ggml/src/ggml-zendnn/ggml-zendnn.cpp b/ggml/src/ggml-zendnn/ggml-zendnn.cpp
index 6a83bb6b1..6051d0820 100644
--- a/ggml/src/ggml-zendnn/ggml-zendnn.cpp
+++ b/ggml/src/ggml-zendnn/ggml-zendnn.cpp
@@ -2,6 +2,10 @@
 
 #include "ggml-backend-impl.h"
 #include "ggml-impl.h"
+
+#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+
 #include "zendnnl.hpp"
 
 #include <cstring>
@@ -19,6 +23,8 @@ zendnnl::common::data_type_t ggml_to_zendnn_type() {
         return zendnnl::common::data_type_t::f32;
     } else if constexpr (std::is_same_v<T, ggml_bf16_t>) {
         return zendnnl::common::data_type_t::bf16;
+    } else if constexpr (std::is_same_v<T, block_q8_0>) {
+        return zendnnl::common::data_type_t::s8;
     } else {
         return zendnnl::common::data_type_t::none;
     }
@@ -48,6 +54,17 @@ static bool ggml_zendnn_matmul(ggml_backend_zendnn_context * ctx, int64_t m, int
     params.num_threads = ctx->n_threads;
 
     zendnnl::lowoha::matmul::matmul_batch_params_t batch_params;
+
+    if constexpr (std::is_same_v<TA, block_q8_0>) {
+        params.dtypes.compute = zendnnl::common::data_type_t::s8;
+        const int64_t num_groups = k / QK8_0;
+        params.dynamic_quant = true;
+        params.quant_params.src_scale.buff = nullptr;
+        params.quant_params.src_scale.dt   = zendnnl::common::data_type_t::bf16;
+        params.quant_params.src_scale.dims = {n, num_groups};
+        params.packing.pack_format_b = 1;
+    }
+
     zendnnl::error_handling::status_t status = zendnnl::lowoha::matmul::matmul_direct(
         'r', false, true,   // row-major, don't transpose B, transpose A (because it's column-major)
         n,                  // M: rows of B and C
@@ -108,6 +125,14 @@ static bool ggml_zendnn_sgemm(ggml_backend_zendnn_context * ctx, int64_t m, int6
                     (const ggml_bf16_t *)B, ldb,
                     (float *)C, ldc);
             return false;
+        case GGML_TYPE_Q8_0:
+            if (Btype != GGML_TYPE_F32 || Ctype != GGML_TYPE_F32)
+                return false;
+            return ggml_zendnn_matmul<block_q8_0, float, float>(
+                ctx, m, n, k,
+                (const block_q8_0 *)A, lda,
+                (const float *)B, ldb,
+                (float *)C, ldc);
         default:
             return false; // unsupported type
     }
@@ -145,7 +170,9 @@ static void ggml_zendnn_compute_forward_mul_mat(
     const int64_t r3 = ne13/ne03;
 
     void * work_data = ctx->work_data.get();
-    if (src1->type != vec_dot_type) {
+
+    // ZenDNN requires FP32 for dynamic quantization, so conversion is skipped
+    if (src1->type != vec_dot_type && src0->type != GGML_TYPE_Q8_0) {
         const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
         const size_t nbw2 = nbw1 * ne11;
         const size_t nbw3 = nbw2 * ne12;
@@ -171,7 +198,7 @@ static void ggml_zendnn_compute_forward_mul_mat(
 
     for (int64_t i13 = 0; i13 < ne13; i13++) {
         for (int64_t i12 = 0; i12 < ne12; i12++) {
-            const void* wdata = src1->type == vec_dot_type ? src1->data : work_data;
+            const void* wdata = (src1->type == vec_dot_type || src0->type == GGML_TYPE_Q8_0) ? src1->data : work_data;
             const size_t row_size = ggml_row_size(vec_dot_type, ne10);
             if (!ggml_zendnn_sgemm(ctx,
                                   ne01,     // m
@@ -184,7 +211,7 @@ static void ggml_zendnn_compute_forward_mul_mat(
                                   static_cast<char *>(dst->data) + i12*nb2 + i13*nb3,
                                   ne01,     // ldc
                                   src0->type,
-                                  vec_dot_type,
+                                  src0->type == GGML_TYPE_Q8_0 ? GGML_TYPE_F32 : vec_dot_type,
                                   dst->type))
                 GGML_ABORT("%s: ZenDNN sgemm failed\n", __func__);
         }
@@ -261,10 +288,15 @@ static void ggml_zendnn_compute_forward_mul_mat_id(
     const size_t nbw1 = row_size;
     const size_t nbw2 = nbw1 * ne11;
     const size_t nbw3 = nbw2 * ne12;
-    const size_t src1_conv_size = (src1->type != vec_dot_type) ? ne13 * nbw3 : 0;
+    const size_t src1_conv_size = (src1->type != vec_dot_type && src0->type != GGML_TYPE_Q8_0) ? ne13 * nbw3 : 0;
+
+    // For Q8_0, src1 is always F32; the gather buffer must hold F32 rows (ne10*4 bytes),
+    // not Q8_0-encoded rows (row_size ≈ ne10/32*34 bytes) — they differ by ~4x.
+    const size_t f32_row_size = (size_t)ne10 * sizeof(float);
+    const size_t gather_row_size = (src0->type == GGML_TYPE_Q8_0) ? f32_row_size : row_size;
 
     // size for MoE gather/scatter buffers
-    const size_t wdata_cur_size = max_rows * row_size;
+    const size_t wdata_cur_size = max_rows * gather_row_size;
     const size_t dst_cur_size = max_rows * ggml_row_size(dst->type, ne01);
 
     // allocate single buffer for all needs
@@ -279,7 +311,8 @@ static void ggml_zendnn_compute_forward_mul_mat_id(
     char * wdata_cur = work_data + src1_conv_size;
     char * dst_cur = wdata_cur + wdata_cur_size;
 
-    if (src1->type != vec_dot_type) {
+    // ZenDNN requires FP32 for dynamic quantization, so conversion is skipped
+    if (src1->type != vec_dot_type && src0->type != GGML_TYPE_Q8_0) {
         GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
         #pragma omp parallel for collapse(3) num_threads(ctx->n_threads) schedule(static)
@@ -294,7 +327,7 @@ static void ggml_zendnn_compute_forward_mul_mat_id(
         }
     }
 
-    const void * wdata = src1->type == vec_dot_type ? src1->data : work_data;
+    const void * wdata = (src1->type == vec_dot_type || src0->type == GGML_TYPE_Q8_0) ? src1->data : work_data;
 
     // process each expert with gather -> gemm -> scatter pattern
     for (int64_t cur_a = 0; cur_a < n_as; ++cur_a) {
@@ -315,9 +348,9 @@ static void ggml_zendnn_compute_forward_mul_mat_id(
             const int64_t i12 = row_mapping.i2;
 
             std::memcpy(
-                wdata_cur + ir1 * row_size,
-                (const char *) wdata + (i11 + i12*ne11) * row_size,
-                row_size
+                wdata_cur + ir1 * gather_row_size,
+                (const char *) wdata + (i11 + i12*ne11) * gather_row_size,
+                gather_row_size
             );
         }
 
@@ -333,7 +366,7 @@ static void ggml_zendnn_compute_forward_mul_mat_id(
                               dst_cur,
                               ne01,       // ldc
                               src0->type,
-                              vec_dot_type,
+                              src0->type == GGML_TYPE_Q8_0 ? GGML_TYPE_F32 : vec_dot_type,
                               dst->type)) {
             GGML_ABORT("%s: ZenDNN sgemm failed\n", __func__);
         }
@@ -577,6 +610,7 @@ static bool ggml_backend_zendnn_device_supports_op(ggml_backend_dev_t dev, const
             switch (weights->type) {
                 case GGML_TYPE_F32:
                 case GGML_TYPE_BF16:
+                case GGML_TYPE_Q8_0:
                     return true;
                 default:
                     return false;

From 95feeab52e41ceaf71e87b2dd01895f6d8815b60 Mon Sep 17 00:00:00 2001
From: Jesus Talavera <145992175+jesus-talavera-ibm@users.noreply.github.com>
Date: Fri, 22 May 2026 14:35:46 +0200
Subject: [PATCH 03/17] docs: Update documentation with Granite 4.0/4.1
 (#23404)

---
 docs/autoparser.md       | 1 +
 docs/function-calling.md | 1 +
 2 files changed, 2 insertions(+)

diff --git a/docs/autoparser.md b/docs/autoparser.md
index adc4d43ed..da2f1a3a0 100644
--- a/docs/autoparser.md
+++ b/docs/autoparser.md
@@ -489,6 +489,7 @@ The following templates have active tests in `tests/test-chat.cpp`:
 | Qwen-QwQ-32B | Reasoning | Forced-open thinking |
 | NousResearch Hermes 2 Pro | JSON_NATIVE | `<tool_call>` wrapper |
 | IBM Granite 3.3 | JSON_NATIVE | `<think></think>` + `<response></response>` |
+| IBM Granite 4.0 | JSON_NATIVE | `<tool_call>` wrapper (same template used by 4.1) |
 | ByteDance Seed-OSS | TAG_WITH_TAGGED | Custom `<seed:think>` and `<seed:tool_call>` tags |
 | Qwen3-Coder | TAG_WITH_TAGGED | XML-style tool format |
 | DeepSeek V3.1 | JSON_NATIVE | Forced thinking mode |
diff --git a/docs/function-calling.md b/docs/function-calling.md
index 9ede914c0..850b59ce7 100644
--- a/docs/function-calling.md
+++ b/docs/function-calling.md
@@ -291,6 +291,7 @@ Here are some models known to work (w/ chat template override when needed):
 llama-server --jinja -fa -hf bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M
 llama-server --jinja -fa -hf bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q6_K_L
 llama-server --jinja -fa -hf bartowski/Llama-3.3-70B-Instruct-GGUF:Q4_K_M
+llama-server --jinja -fa -hf ibm-granite/granite-4.1-3b-GGUF:Q4_K_M
 
 # Native support for DeepSeek R1 works best w/ our template override (official template is buggy, although we do work around it)
 

From 8cc67efcd4834a46b18a0cf32c9b1c99762daeac Mon Sep 17 00:00:00 2001
From: Katostrofik <georgiopapairo@gmail.com>
Date: Fri, 22 May 2026 08:48:24 -0400
Subject: [PATCH 04/17] SYCL: add BF16 to DMMV kernel path (~4x tg speedup on
 Intel Arc) (#21580)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* SYCL: add BF16 to DMMV kernel path for ~4x token generation speedup

BF16 models had no dedicated token generation kernel — they fell through
to the generic full-GEMM path, resulting in ~14% memory bandwidth
utilization on Intel Arc GPUs. This adds BF16 support to the DMMV
(dequantize mul-mat-vec) path, matching the existing F16 implementation.

Fixes #20478

* SYCL: fix BF16 DMMV out-of-bounds when ncols % 64 != 0

The qk=1 kernel (used for F16 and BF16) iterates with stride
2*GGML_SYCL_DMMV_X (= 64 on Intel targets where WARP_SIZE=16). When
ncols is a multiple of DMMV_X (32) but not of 2*DMMV_X (64), the last
warp iteration accesses elements at col >= ncols, producing NaN for the
final row and wrong values for interior rows.

Fix: tighten can_use_dequantize_mul_mat_vec to require ne[0] %
(2*DMMV_X) == 0 for F16/BF16 types, and update the ASSERT in the BF16
launcher to match. Quantized types use block-structured kernels with
different access patterns and keep the existing DMMV_X check.

Verified: test-backend-ops MUL_MAT passes 913/913 on Intel Arc Pro B70.
Previously failing: m=128/129 n=1 k=1056 cases (NaN and ERR > 0.0005).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 ggml/src/ggml-sycl/dmmv.cpp      | 47 +++++++++++++++++++++++++++++++-
 ggml/src/ggml-sycl/ggml-sycl.cpp |  8 +++++-
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp
index 5577bf73b..4ae431a96 100644
--- a/ggml/src/ggml-sycl/dmmv.cpp
+++ b/ggml/src/ggml-sycl/dmmv.cpp
@@ -3,6 +3,13 @@
 #include "dequantize.hpp"
 #include "presets.hpp"
 
+#if defined(__INTEL_LLVM_COMPILER)
+    #if __has_include(<sycl/ext/oneapi/bfloat16.hpp>)
+        #include <sycl/ext/oneapi/bfloat16.hpp>
+        #define GGML_SYCL_DMMV_HAS_BF16
+    #endif
+#endif
+
 static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
     const sycl::half *x = (const sycl::half *)vx;
 
@@ -11,6 +18,16 @@ static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat
     v.y() = x[ib + iqs + 1];
 }
 
+#ifdef GGML_SYCL_DMMV_HAS_BF16
+static void convert_bf16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
+    const sycl::ext::oneapi::bfloat16 *x = (const sycl::ext::oneapi::bfloat16 *)vx;
+
+    // automatic bfloat16 -> float type cast if dfloat == float
+    v.x() = x[ib + iqs + 0];
+    v.y() = x[ib + iqs + 1];
+}
+#endif
+
 static void convert_f32(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){
     const float * x = (const float *) vx;
 
@@ -217,6 +234,28 @@ static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
     }
 }
 
+#ifdef GGML_SYCL_DMMV_HAS_BF16
+static void convert_mul_mat_vec_bf16_sycl(const void *vx, const dfloat *y,
+                                          float *dst, const int ncols,
+                                          const int nrows,
+                                          dpct::queue_ptr stream) {
+    // The qk=1 kernel iterates with stride 2*GGML_SYCL_DMMV_X, so ncols must be a
+    // multiple of that — not just GGML_SYCL_DMMV_X — to avoid out-of-bounds reads.
+    GGML_ASSERT(ncols % (2*GGML_SYCL_DMMV_X) == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    {
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                dequantize_mul_mat_vec<1, 1, convert_bf16>(vx, y, dst, ncols,
+                                                           nrows, item_ct1);
+            });
+    }
+}
+#endif
+
 /*
 DPCT1110:4: The total declared local variable size in device function
 dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
@@ -1497,7 +1536,8 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
     bool src1_convert_f16 =
         src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
         src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
-        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16 ||
+        src0->type == GGML_TYPE_BF16;
 
     if (src1_convert_f16) {
         scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
@@ -1565,6 +1605,11 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
         case GGML_TYPE_F16:
             convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
             break;
+#ifdef GGML_SYCL_DMMV_HAS_BF16
+        case GGML_TYPE_BF16:
+            convert_mul_mat_vec_bf16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+#endif
         default:
             printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
             GGML_ABORT("fatal error");
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 2ea47f715..bba37a6f8 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -3455,6 +3455,7 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
         case GGML_TYPE_Q5_K:
         case GGML_TYPE_Q6_K:
         case GGML_TYPE_F16:
+        case GGML_TYPE_BF16:
             return true;
         default:
             return false;
@@ -3818,8 +3819,13 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor *
 
 
 static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    // The F16/BF16 qk=1 kernel iterates with stride 2*DMMV_X, requiring ne[0] to be
+    // a multiple of 2*DMMV_X. Quantized types use block-structured kernels that only
+    // need ne[0] % DMMV_X == 0.
+    const int64_t dmmv_x_required = (src0->type == GGML_TYPE_BF16 || src0->type == GGML_TYPE_F16) ?
+                                    2*GGML_SYCL_DMMV_X : GGML_SYCL_DMMV_X;
     return ggml_sycl_supports_dmmv(src0->type) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
-           src0->ne[0] % GGML_SYCL_DMMV_X == 0 && src1->ne[1] == 1;
+           src0->ne[0] % dmmv_x_required == 0 && src1->ne[1] == 1;
 }
 
 static bool can_use_mul_mat_vec_q(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

From 56f16f235c4a6ffd0cd316e1d4b5dcfbf2dcb7a4 Mon Sep 17 00:00:00 2001
From: karavayev <192749314+karavayev@users.noreply.github.com>
Date: Fri, 22 May 2026 08:48:56 -0400
Subject: [PATCH 05/17] SYCL : gated_delta_net K>1 (#23174)

* sycl_gated_delta_net K>1

* editor_config
---
 ggml/src/ggml-sycl/gated_delta_net.cpp | 91 +++++++++++++++++++-------
 1 file changed, 66 insertions(+), 25 deletions(-)

diff --git a/ggml/src/ggml-sycl/gated_delta_net.cpp b/ggml/src/ggml-sycl/gated_delta_net.cpp
index ebc587524..9c2449aba 100644
--- a/ggml/src/ggml-sycl/gated_delta_net.cpp
+++ b/ggml/src/ggml-sycl/gated_delta_net.cpp
@@ -6,7 +6,7 @@
 #include <cmath>
 
 
-template <int S_v, bool KDA>
+template <int S_v, bool KDA, bool keep_rs_t>
 void gated_delta_net_sycl(const float *     q,
                           const float *     k,
                           const float *     v,
@@ -28,7 +28,8 @@ void gated_delta_net_sycl(const float *     q,
                           int64_t           sb3,
                           const sycl::uint3 neqk1_magic,
                           const sycl::uint3 rq3_magic,
-                          float             scale) {
+                          float             scale,
+                          int               K) {
     auto           item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>();
     const uint32_t h_idx    = item_ct1.get_group(2);
     const uint32_t sequence = item_ct1.get_group(1);
@@ -43,9 +44,13 @@ void gated_delta_net_sycl(const float *     q,
     float *       attn_data        = dst;
     float *       state            = dst + attn_score_elems;
 
-    const int64_t state_offset = (sequence * H + h_idx) * S_v * S_v;
-    state += state_offset;
-    curr_state += state_offset;
+    // input state layout (D, K, n_seqs) — seq stride is K * D = K * H * S_v * S_v.
+    // output state layout (per-slot D * n_seqs) — same per-(seq,head) offset as before.
+    const int64_t state_in_offset      = sequence * K * H * S_v * S_v + h_idx * S_v * S_v;
+    const int64_t state_out_offset     = (sequence * H + h_idx) * S_v * S_v;
+    const int64_t state_size_per_token = S_v * S_v * H * n_seqs; // per-slot stride in output
+    state += state_out_offset;
+    curr_state += state_in_offset + col * S_v;
     attn_data += (sequence * n_tokens * H + h_idx) * S_v;
 
     constexpr int warp_size = ggml_sycl_get_physical_warp_size() < S_v ? ggml_sycl_get_physical_warp_size() : S_v;
@@ -55,9 +60,13 @@ void gated_delta_net_sycl(const float *     q,
 #pragma unroll
     for (int r = 0; r < rows_per_lane; r++) {
         const int i = r * warp_size + lane;
-        s_shard[r]  = curr_state[col * S_v + i];
+        s_shard[r]  = curr_state[i];
     }
 
+    // slot mapping: target_slot = t - shift. When n_tokens < K only the last n_tokens slots
+    // are written; earlier slots are left untouched (caller-owned).
+    const int shift = (int) n_tokens - K;
+
     for (int t = 0; t < n_tokens; t++) {
         const float * q_t = q + iq3 * sq3 + t * sq2 + iq1 * sq1;
         const float * k_t = k + iq3 * sq3 + t * sq2 + iq1 * sq1;
@@ -131,17 +140,32 @@ void gated_delta_net_sycl(const float *     q,
         }
 
         attn_data += S_v * H;
-    }
+
 
     // Write state back to global memory
+        if constexpr (keep_rs_t) {
+            const int target_slot = t - shift;
+            if (target_slot >= 0 && target_slot < K) {
+                float * curr_state = (dst + attn_score_elems) + target_slot * state_size_per_token + state_out_offset;
 #pragma unroll
-    for (int r = 0; r < rows_per_lane; r++) {
-        const int i          = r * warp_size + lane;
-        state[col * S_v + i] = s_shard[r];
+                for (int r = 0; r < rows_per_lane; r++) {
+                    const int i = r * warp_size + lane;
+                    curr_state[col * S_v + i] = s_shard[r];
+                }
+            }
+        }
+    }
+
+    if constexpr (!keep_rs_t) {
+#pragma unroll
+        for (int r = 0; r < rows_per_lane; r++) {
+            const int i          = r * warp_size + lane;
+            state[col * S_v + i] = s_shard[r];
+        }
     }
 }
 
-template <bool KDA>
+template <bool KDA, bool keep_rs_t>
 static void launch_gated_delta_net(const float *   q_d,
                                    const float *   k_d,
                                    const float *   v_d,
@@ -165,6 +189,7 @@ static void launch_gated_delta_net(const float *   q_d,
                                    int64_t         neqk1,
                                    int64_t         rq3,
                                    float           scale,
+                                   int             K,
                                    dpct::queue_ptr stream) {
     //TODO: Add chunked kernel for even faster pre-fill
     const int warp_size = ggml_sycl_info().devices[ggml_sycl_get_device()].warp_size;
@@ -182,9 +207,9 @@ static void launch_gated_delta_net(const float *   q_d,
                 constexpr int sv = 16;
                 stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims),
                                      [=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                         gated_delta_net_sycl<sv, KDA>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens,
+                                         gated_delta_net_sycl<sv, KDA, keep_rs_t>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens,
                                                                        n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2,
-                                                                       sb3, neqk1_magic, rq3_magic, scale);
+                                                                       sb3, neqk1_magic, rq3_magic, scale, K);
                                      });
             }
             break;
@@ -193,9 +218,9 @@ static void launch_gated_delta_net(const float *   q_d,
                 constexpr int sv = 32;
                 stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims),
                                      [=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                         gated_delta_net_sycl<sv, KDA>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens,
+                                         gated_delta_net_sycl<sv, KDA, keep_rs_t>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens,
                                                                        n_seqs, sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2,
-                                                                       sb3, neqk1_magic, rq3_magic, scale);
+                                                                       sb3, neqk1_magic, rq3_magic, scale, K);
                                      });
             }
             break;
@@ -204,9 +229,9 @@ static void launch_gated_delta_net(const float *   q_d,
                 constexpr int sv = 64;
                 stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims),
                                         [=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                            gated_delta_net_sycl<sv, KDA>(
+                                            gated_delta_net_sycl<sv, KDA, keep_rs_t>(
                                                 q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2,
-                                                sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
+                                                sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
                                         });
             }
             break;
@@ -216,9 +241,9 @@ static void launch_gated_delta_net(const float *   q_d,
                 constexpr int sv = 128;
                 stream->parallel_for(sycl::nd_range<3>(grid_dims * block_dims, block_dims),
                                         [=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                                            gated_delta_net_sycl<sv, KDA>(
+                                            gated_delta_net_sycl<sv, KDA, keep_rs_t>(
                                                 q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs, sq1, sq2,
-                                                sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
+                                                sq3, sv1, sv2, sv3, sb1, sb2, sb3, neqk1_magic, rq3_magic, scale, K);
                                         });
             }
             break;
@@ -290,14 +315,30 @@ void ggml_sycl_op_gated_delta_net(ggml_backend_sycl_context & ctx, ggml_tensor *
 
     dpct::queue_ptr stream = ctx.stream();
 
+    // state is 3D (S_v*S_v*H, K, n_seqs); K is the snapshot slot count.
+    const int K = (int) src_state->ne[1];
+    const bool keep_rs = K > 1;
+
     if (kda) {
-        launch_gated_delta_net<true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
-            S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
-            sb1, sb2, sb3, neqk1, rq3, scale, stream);
+        if (keep_rs) {
+            launch_gated_delta_net<true, true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
+                S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
+                sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
+        } else {
+            launch_gated_delta_net<true, false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
+                S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
+                sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
+        }
     } else {
-        launch_gated_delta_net<false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
-            S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
-            sb1, sb2, sb3, neqk1, rq3, scale, stream);
+        if (keep_rs) {
+            launch_gated_delta_net<false, true>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
+                S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
+                sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
+        } else {
+            launch_gated_delta_net<false, false>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d,
+                S_v, H, n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
+                sb1, sb2, sb3, neqk1, rq3, scale, K, stream);
+        }
     }
 }
 

From bcfd1989e9a90af74669d94057ff2468682c3f4a Mon Sep 17 00:00:00 2001
From: Alexey Kopytko <alexey@kopytko.com>
Date: Fri, 22 May 2026 21:49:45 +0900
Subject: [PATCH 06/17] sycl : Level Zero detection in ggml_sycl_init (#23097)

* [SYCL] Centralize Level Zero detection in ggml_sycl_init

* use the same wording

* get back the warning
---
 ggml/src/ggml-sycl/common.hpp    |  2 ++
 ggml/src/ggml-sycl/ggml-sycl.cpp | 26 ++++++++------------------
 2 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 96bc1c98b..6d1953821 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -238,6 +238,8 @@ struct ggml_sycl_device_info {
     std::array<float, GGML_SYCL_MAX_DEVICES> default_tensor_split = {};
 
     int max_work_group_sizes[GGML_SYCL_MAX_DEVICES] = {0};
+
+    bool ext_oneapi_level_zero = true; // sycl::backend::ext_oneapi_level_zero used by all enumerated GPU devices
 };
 
 const ggml_sycl_device_info & ggml_sycl_info();
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index bba37a6f8..46795f436 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -98,7 +98,7 @@ static ggml_sycl_device_info ggml_sycl_init() {
     for (int i = 0; i < info.device_count; ++i) {
         info.devices[i].vmm = 0;
         dpct::device_info prop;
-        sycl::device device = dpct::dev_mgr::instance().get_device(i);
+        auto & device = dpct::dev_mgr::instance().get_device(i);
 
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
             prop, device)));
@@ -117,6 +117,12 @@ static ggml_sycl_device_info ggml_sycl_init() {
         info.devices[i].max_wg_per_cu = info.max_work_group_sizes[i] / prop.get_max_compute_units();
         info.devices[i].hw_info = get_device_hw_info(&device);
 
+        // Only check GPU devices; CPU devices use OpenCL and would otherwise
+        // disable Level Zero for the GPUs on systems without ONEAPI_DEVICE_SELECTOR set.
+        if (device.is_gpu() && device.default_queue().get_backend() != sycl::backend::ext_oneapi_level_zero) {
+            GGML_LOG_WARN("SYCL GPU device %d does not use Level Zero backend, disabling Level Zero memory API\n", i);
+            info.ext_oneapi_level_zero = false;
+        }
     }
 
     for (int id = 0; id < info.device_count; ++id) {
@@ -230,26 +236,10 @@ static void ggml_check_sycl() try {
         g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0);
         g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
 #ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO
-        g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1);
+        g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", ggml_sycl_info().ext_oneapi_level_zero);
 #else
         g_ggml_sycl_enable_level_zero = 0;
 #endif
-        if (g_ggml_sycl_enable_level_zero) {
-            // Verify all GPU devices use the Level Zero backend before enabling L0 APIs.
-            // Only check GPU devices; CPU devices use OpenCL and would otherwise
-            // disable Level Zero for the GPUs on systems without ONEAPI_DEVICE_SELECTOR set.
-            for (unsigned int i = 0; i < dpct::dev_mgr::instance().device_count(); i++) {
-                auto & q = dpct::dev_mgr::instance().get_device(i).default_queue();
-                if (!q.get_device().is_gpu()) {
-                    continue;
-                }
-                if (q.get_backend() != sycl::backend::ext_oneapi_level_zero) {
-                    GGML_LOG_WARN("SYCL GPU device %d does not use Level Zero backend, disabling Level Zero memory API\n", i);
-                    g_ggml_sycl_enable_level_zero = 0;
-                    break;
-                }
-            }
-        }
 
 #ifdef SYCL_FLASH_ATTN
         g_ggml_sycl_enable_flash_attention = get_sycl_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1);

From cc9e331213b6a9cb186aabe01a4ec6a61419dd80 Mon Sep 17 00:00:00 2001
From: Alexey Kopytko <alexey@kopytko.com>
Date: Fri, 22 May 2026 21:50:17 +0900
Subject: [PATCH 07/17] SYCL: improve MoE prefill throughput (#23142)

- change `k_copy_src1_to_contiguous` so that uses a precomputed contiguous mapping where all rows "owned" by an expert are in one slice with a know starts and ends
- switch the `O(n_as * n_routed_rows)` contraption to a counting sort-based procedure with `O(n_as + n_routed_rows)` complexity
---
 ggml/src/ggml-sycl/ggml-sycl.cpp | 197 +++++++++++++++++--------------
 1 file changed, 106 insertions(+), 91 deletions(-)

diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 46795f436..b3fbb6211 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -3919,35 +3919,17 @@ struct mmid_row_mapping {
 
 __dpct_inline__ static void k_copy_src1_to_contiguous(
     const char *__restrict__ src1_original, char *__restrict__ src1_contiguous,
-    int *__restrict__ cur_src1_row, mmid_row_mapping *__restrict__ row_mapping,
-    const char *__restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0,
+    const mmid_row_mapping *__restrict__ row_mapping,
     int64_t ne11, int64_t ne10, size_t nb11, size_t nb12,
-    const sycl::nd_item<3> &item_ct1, int &src1_row) {
-    int32_t iid1 = item_ct1.get_group(2);
-    int32_t id = item_ct1.get_group(1);
+    const sycl::nd_item<3> &item_ct1) {
+    const int32_t src1_row = item_ct1.get_group(2);
 
-    const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0);
-
-    if (row_id_i != i02) {
-        return;
-    }
+    const int32_t iid1 = row_mapping[src1_row].i2;
+    const int32_t id   = row_mapping[src1_row].i1;
 
     const int64_t i11 = id % ne11;
     const int64_t i12 = iid1;
 
-    if (item_ct1.get_local_id(2) == 0) {
-        src1_row =
-            dpct::atomic_fetch_add<sycl::access::address_space::generic_space>(
-                cur_src1_row, 1);
-        row_mapping[src1_row] = {id, iid1};
-    }
-    /*
-    DPCT1065:194: Consider replacing sycl::nd_item::barrier() with
-    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
-    performance if there is no access to global memory.
-    */
-    item_ct1.barrier();
-
     const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12);
     float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11);
 
@@ -4022,6 +4004,47 @@ static bool ggml_sycl_mul_mat_id_mmvq_fused(
         src1_row_stride, stream);
 }
 
+// counting sort of the routed rows by expert id (row_id_i, as chosen by the router):
+// builds a projection of a memory layout where each expert's slice is contiguous
+static void mmid_counting_sort_rows(
+        const ggml_tensor * ids, const char * ids_host,
+        int64_t n_ids, int64_t n_as, int64_t n_routed_rows,
+        std::vector<int64_t> & expert_counts,
+        std::vector<int64_t> & expert_row_offsets,
+        std::vector<mmid_row_mapping> & routed_row_src) {
+
+    // frequencies: how many routed rows each expert "owns"
+    expert_counts.assign(n_as, 0);
+    for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+        for (int64_t id = 0; id < n_ids; id++) {
+            const int32_t row_id_i = *(const int32_t *) (ids_host + iid1*ids->nb[1] + id*ids->nb[0]);
+            GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
+            expert_counts[row_id_i]++;
+        }
+    }
+
+    // where each expert's slice starts (row indices) and the previous ends
+    expert_row_offsets.assign(n_as + 1, 0);
+    for (int64_t i02 = 0; i02 < n_as; i02++) {
+        expert_row_offsets[i02 + 1] = expert_row_offsets[i02] + expert_counts[i02];
+    }
+
+    std::vector<int64_t> expert_row_next = expert_row_offsets;
+    routed_row_src.resize(n_routed_rows);
+    for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+        for (int64_t id = 0; id < n_ids; id++) {
+            const int32_t row_id_i = *(const int32_t *) (ids_host + iid1*ids->nb[1] + id*ids->nb[0]);
+            GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
+
+            // find and validate the next free row for a given expert (row_id_i)
+            const int64_t routed_row = expert_row_next[row_id_i]++;
+            GGML_ASSERT(routed_row >= expert_row_offsets[row_id_i]);
+            GGML_ASSERT(routed_row < expert_row_offsets[row_id_i + 1]);
+            routed_row_src[routed_row] = {(int32_t) id, (int32_t) iid1};
+        }
+    }
+}
+
 static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
                                  ggml_tensor *dst) try {
     scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/3);
@@ -4100,99 +4123,91 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx,
         src1_row.data = src1_contiguous.get();
         dst_row.data  =  dst_contiguous.get();
 
+        // how many "owned" routed rows to pass to each expert
+        std::vector<int64_t> expert_row_counts;
+        // where each expert's slice starts and the previous ends (row indices, right-exclusive)
+        std::vector<int64_t> expert_row_offsets;
+        // the sources (slot/token pairs) of contiguous rows to guide k_copy_src1_to_contiguous
+        std::vector<mmid_row_mapping> routed_row_src;
+
+        mmid_counting_sort_rows(ids, ids_host.data(), n_ids, n_as, n_routed_rows,
+                                expert_row_counts, expert_row_offsets, routed_row_src);
+
+        ggml_sycl_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), n_routed_rows);
+        SYCL_CHECK(CHECK_TRY_ERROR(
+                stream->memcpy(dev_row_mapping.get(), routed_row_src.data(), n_routed_rows*sizeof(mmid_row_mapping))));
+
+        const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
+        assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
+
+        {
+            sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
+            sycl::range<3> grid_dims(1, 1, n_routed_rows);
+            stream->submit([&](sycl::handler &cgh) {
+                char *__restrict src1_contiguous_get =
+                    src1_contiguous.get();
+                mmid_row_mapping *__restrict dev_row_mapping_get =
+                    dev_row_mapping.get();
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(grid_dims * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        k_copy_src1_to_contiguous(
+                            src1_original, src1_contiguous_get,
+                            dev_row_mapping_get,
+                            ne11, ne10, nb11, nb12,
+                            item_ct1);
+                    });
+            });
+        }
+
         for (int64_t i02 = 0; i02 < n_as; i02++) {
-            int64_t num_src1_rows = 0;
-            for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
-                for (int64_t id = 0; id < n_ids; id++) {
-                    const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
-
-                    GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
-
-                    if (row_id_i != i02) {
-                        continue;
-                    }
-
-                    num_src1_rows++;
-                }
-            }
+            const int64_t num_src1_rows = expert_row_counts[i02];
 
             if (num_src1_rows == 0) {
                 continue;
             }
 
-
-            ggml_sycl_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
-            ggml_sycl_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
-            SYCL_CHECK(CHECK_TRY_ERROR(
-                stream->memset(dev_cur_src1_row.get(), 0, sizeof(int))));
-
-            const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device];
-            assert(max_work_group_size % (WARP_SIZE * WARP_SIZE) == 0);
-
-            {
-                sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne10, max_work_group_size));
-                sycl::range<3> grid_dims(1, n_ids, ids->ne[1]);
-                stream->submit([&](sycl::handler &cgh) {
-                    sycl::local_accessor<int, 0> src1_row_acc(cgh);
-
-                    char *__restrict src1_contiguous_get =
-                        src1_contiguous.get();
-                    int *__restrict dev_cur_src1_row_get =
-                        dev_cur_src1_row.get();
-                    mmid_row_mapping *__restrict dev_row_mapping_get =
-                        dev_row_mapping.get();
-                    size_t ids_nb_ct6 = ids->nb[1];
-                    size_t ids_nb_ct7 = ids->nb[0];
-
-                    cgh.parallel_for(
-                        sycl::nd_range<3>(grid_dims * block_dims, block_dims),
-                        [=](sycl::nd_item<3> item_ct1) {
-                            k_copy_src1_to_contiguous(
-                                src1_original, src1_contiguous_get,
-                                dev_cur_src1_row_get,
-                                dev_row_mapping_get, ids_dev, i02,
-                                ids_nb_ct6, ids_nb_ct7, ne11, ne10, nb11, nb12,
-                                item_ct1, src1_row_acc);
-                        });
-                });
-            }
+            const int64_t expert_row_offset = expert_row_offsets[i02];
 
             src0_row.data = src0_original + i02*nb02;
 
             GGML_ASSERT(nb11 == sizeof(float)*ne10);
             GGML_ASSERT(nb1 == sizeof(float)*ne0);
+            src1_row.data = src1_contiguous.get() + expert_row_offset*nb11;
             src1_row.ne[1] = num_src1_rows;
 
             src1_row.nb[1] = nb11;
             src1_row.nb[2] = num_src1_rows*nb11;
             src1_row.nb[3] = num_src1_rows*nb11;
 
+            dst_row.data = dst_contiguous.get() + expert_row_offset*nb1;
             dst_row.ne[1] = num_src1_rows;
             dst_row.nb[1] = nb1;
             dst_row.nb[2] = num_src1_rows*nb1;
             dst_row.nb[3] = num_src1_rows*nb1;
 
             ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+        }
 
-            {
-                sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size));
-                sycl::range<3> grid_dims(1, 1, num_src1_rows);
-                stream->submit([&](sycl::handler &cgh) {
-                    const char *__restrict dst_contiguous_get =
-                        dst_contiguous.get();
-                    const mmid_row_mapping *__restrict dev_row_mapping_get =
-                        dev_row_mapping.get();
+        {
+            sycl::range<3> block_dims(1, 1, std::min((unsigned int)ne0, max_work_group_size));
+            sycl::range<3> grid_dims(1, 1, n_routed_rows);
+            stream->submit([&](sycl::handler &cgh) {
+                const char *__restrict dst_contiguous_get =
+                    dst_contiguous.get();
+                const mmid_row_mapping *__restrict dev_row_mapping_get =
+                    dev_row_mapping.get();
 
-                    cgh.parallel_for(
-                        sycl::nd_range<3>(grid_dims * block_dims, block_dims),
-                        [=](sycl::nd_item<3> item_ct1) {
-                            k_copy_dst_from_contiguous(dst_original,
-                                                       dst_contiguous_get,
-                                                       dev_row_mapping_get,
-                                                       ne0, nb1, nb2, item_ct1);
-                        });
-                });
-            }
+                cgh.parallel_for(
+                    sycl::nd_range<3>(grid_dims * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        k_copy_dst_from_contiguous(dst_original,
+                                                   dst_contiguous_get,
+                                                   dev_row_mapping_get,
+                                                   ne0, nb1, nb2, item_ct1);
+                    });
+            });
         }
     }
 }

From ef570f63087b6a5a2930210a13f87990e8113927 Mon Sep 17 00:00:00 2001
From: fairydreaming <166155368+fairydreaming@users.noreply.github.com>
Date: Fri, 22 May 2026 14:50:44 +0200
Subject: [PATCH 08/17] perplexity : fix integer overflow (#23496)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
---
 tools/perplexity/perplexity.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perplexity/perplexity.cpp b/tools/perplexity/perplexity.cpp
index f66576eb4..ea9de83e1 100644
--- a/tools/perplexity/perplexity.cpp
+++ b/tools/perplexity/perplexity.cpp
@@ -524,7 +524,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
         logits_stream.write((const char *)&n_chunk, sizeof(n_chunk));
         logits_stream.write((const char *)tokens.data(), n_chunk*n_ctx*sizeof(tokens[0]));
         const int nv = 2*((n_vocab + 1)/2) + 4;
-        log_probs.resize(n_ctx * nv);
+        log_probs.resize(size_t(n_ctx) * nv);
     }
 
     // We get the logits for all the tokens in the context window (params.n_ctx)

From 1acee6bf8939948f9bcbf4b14034e4b475f06069 Mon Sep 17 00:00:00 2001
From: Aldehir Rojas <hello@alde.dev>
Date: Fri, 22 May 2026 11:58:15 -0400
Subject: [PATCH 09/17] server: only parse empty msg if continuing an assistant
 msg (#23506)

---
 common/chat.h                | 1 +
 tools/server/server-task.cpp | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/common/chat.h b/common/chat.h
index 8ace3e6ba..b29c627e6 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -219,6 +219,7 @@ struct common_chat_parser_params {
     bool                    reasoning_in_content = false;
     std::string             generation_prompt;
     bool                    parse_tool_calls     = true;
+    bool                    is_continuation      = false;
     bool                    echo                 = false;  // Include assistant prefilled msg in output
     bool                    debug                = false;  // Enable debug output for PEG parser
     common_peg_arena        parser               = {};
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index d45513dbe..abc00c82b 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -149,7 +149,7 @@ task_result_state::task_result_state(const common_chat_parser_params & chat_pars
     , oai_resp_id("resp_" + random_string())
     , oai_resp_reasoning_id("rs_" + random_string())
     , oai_resp_message_id("msg_" + random_string()) {
-    if (!chat_parser_params.echo) {
+    if (chat_parser_params.is_continuation && !chat_parser_params.echo) {
         // initialize chat_msg to avoid emitting a delta containing the assistant prefill
         chat_msg = common_chat_parse("", true, chat_parser_params);
     }
@@ -432,6 +432,10 @@ task_params server_task::params_from_json_cmpl(
         if (data.contains("chat_parser")) {
             params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());
         }
+        if (data.contains("continue_final_message")) {
+            auto continuation = common_chat_continuation_parse(data.at("continue_final_message"));
+            params.chat_parser_params.is_continuation = continuation != COMMON_CHAT_CONTINUATION_NONE;
+        }
         params.chat_parser_params.echo = json_value(data, "echo", false);
     }
 

From 0f3cb3fc8b4c22a89711457e84d8358a2f480832 Mon Sep 17 00:00:00 2001
From: Shawn Gu <shawngu@qti.qualcomm.com>
Date: Fri, 22 May 2026 17:08:41 -0700
Subject: [PATCH 10/17] opencl: generalize Adreno MoE kernels on M (#23449)

---
 ggml/src/ggml-opencl/ggml-opencl.cpp          | 18 +++---
 ggml/src/ggml-opencl/kernels/cvt.cl           | 64 +++++++++++++++++++
 .../kernels/gemm_moe_mxfp4_f32_ns.cl          |  6 +-
 .../kernels/gemm_moe_q4_0_f32_ns.cl           |  6 +-
 .../kernels/gemm_moe_q4_1_f32_ns.cl           |  6 +-
 .../kernels/gemm_moe_q4_k_f32_ns.cl           |  6 +-
 .../kernels/gemm_moe_q5_0_f32_ns.cl           |  6 +-
 .../kernels/gemm_moe_q5_1_f32_ns.cl           |  6 +-
 .../kernels/gemm_moe_q5_k_f32_ns.cl           |  6 +-
 .../kernels/gemm_moe_q6_k_f32_ns.cl           |  6 +-
 .../kernels/gemv_moe_mxfp4_f32_ns.cl          |  4 ++
 .../kernels/gemv_moe_q4_0_f32_ns.cl           |  4 ++
 .../kernels/gemv_moe_q4_1_f32_ns.cl           |  4 ++
 .../kernels/gemv_moe_q4_k_f32_ns.cl           |  4 ++
 .../kernels/gemv_moe_q5_0_f32_ns.cl           |  4 ++
 .../kernels/gemv_moe_q5_1_f32_ns.cl           |  4 ++
 .../kernels/gemv_moe_q5_k_f32_ns.cl           |  4 ++
 .../kernels/gemv_moe_q6_k_f32_ns.cl           |  4 ++
 18 files changed, 145 insertions(+), 17 deletions(-)

diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index 5fc46f789..ea0b44fee 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -4693,7 +4693,7 @@ inline bool use_adreno_kernels(const ggml_backend_opencl_context *backend_ctx, c
 inline bool use_adreno_moe_kernels(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
     GGML_UNUSED(backend_ctx);
     int ne01 = tensor->ne[1];
-    return (((strstr(tensor->name, "ffn") != NULL) && (strstr(tensor->name, "exps") != NULL)) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 64 == 0);
+    return (((strstr(tensor->name, "ffn") != NULL) && (strstr(tensor->name, "exps") != NULL)) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 32 == 0);
 }
 
 inline bool enable_adreno_trans_weight(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
@@ -14297,7 +14297,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
@@ -14513,7 +14513,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
@@ -14689,7 +14689,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
@@ -14865,7 +14865,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
@@ -15118,7 +15118,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
@@ -15291,7 +15291,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
@@ -15469,7 +15469,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
@@ -15644,7 +15644,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
                     CL_CHECK(status);
 
                     // set thread grid
-                    global_size[0] = static_cast<size_t>(ne01);
+                    global_size[0] = static_cast<size_t>(((ne01 + 63) / 64) * 64);
                     global_size[1] = 4;
                     global_size[2] = static_cast<size_t>(ne20);
                     local_size[1] = 4;
diff --git a/ggml/src/ggml-opencl/kernels/cvt.cl b/ggml/src/ggml-opencl/kernels/cvt.cl
index 312366984..c25eabdd7 100644
--- a/ggml/src/ggml-opencl/kernels/cvt.cl
+++ b/ggml/src/ggml-opencl/kernels/cvt.cl
@@ -220,6 +220,10 @@ kernel void kernel_convert_block_q4_0_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK4_0;
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -263,6 +267,10 @@ kernel void kernel_restore_block_q4_0_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK4_0;
     uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint src_d_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -401,6 +409,10 @@ kernel void kernel_convert_block_q4_1_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK4_1;
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -446,6 +458,10 @@ kernel void kernel_restore_block_q4_1_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK4_1;
     uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint src_dm_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -491,6 +507,10 @@ kernel void kernel_convert_block_q5_0_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK5_0;
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -536,6 +556,10 @@ kernel void kernel_restore_block_q5_0_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK5_0;
     uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -583,6 +607,10 @@ kernel void kernel_convert_block_q5_1_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK5_1;
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -630,6 +658,10 @@ kernel void kernel_restore_block_q5_1_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK5_1;
     uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -679,6 +711,10 @@ kernel void kernel_convert_block_q4_k_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK_K;
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint dst_blk_offset = i01 + i00 * ne01     + i02 * ne00_blk * ne01;
@@ -732,6 +768,10 @@ kernel void kernel_restore_block_q4_k_trans4_ns(
     uint i01 = get_global_id(0);  // row index
     uint i02 = get_global_id(2);  // batch index
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK_K;
 
     uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -784,6 +824,10 @@ kernel void kernel_convert_block_q5_k_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK_K;
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint dst_blk_offset = i01 + i00 * ne01     + i02 * ne00_blk * ne01;
@@ -850,6 +894,10 @@ kernel void kernel_restore_block_q5_k_trans4_ns(
     uint i01 = get_global_id(0);  // row index
     uint i02 = get_global_id(2);  // batch index
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK_K;
 
     uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -916,6 +964,10 @@ kernel void kernel_convert_block_q6_k_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK_K;
 
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
@@ -993,6 +1045,10 @@ kernel void kernel_restore_block_q6_k_trans4_ns(
     uint i01 = get_global_id(0);  // row index
     uint i02 = get_global_id(2);  // batch index
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK_K;
 
     uint src_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -1147,6 +1203,10 @@ kernel void kernel_convert_block_mxfp4_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK_MXFP4;
     uint src_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint dst_blk_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
@@ -1190,6 +1250,10 @@ kernel void kernel_restore_block_mxfp4_trans4_ns(
     uint i01 = get_global_id(0);
     uint i02 = get_global_id(2);
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint ne00_blk = ne00 / QK_MXFP4;
     uint dst_blk_offset = i00 + i01 * ne00_blk + i02 * ne00_blk * ne01;
     uint src_d_offset = i01 + i00 * ne01 + i02 * ne00_blk * ne01;
diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl
index e404f392b..02cdbdd9f 100644
--- a/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32_ns.cl
@@ -163,7 +163,7 @@ kernel void kernel_gemm_moe_mxfp4_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -248,6 +248,10 @@ kernel void kernel_gemm_moe_mxfp4_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load poster router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl
index 02290c17e..d403ed0ca 100644
--- a/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_0_f32_ns.cl
@@ -115,7 +115,7 @@ kernel void kernel_gemm_moe_q4_0_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -198,6 +198,10 @@ kernel void kernel_gemm_moe_q4_0_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load poster router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl
index e2574ae01..b2bddf3f7 100644
--- a/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_1_f32_ns.cl
@@ -116,7 +116,7 @@ kernel void kernel_gemm_moe_q4_1_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -200,6 +200,10 @@ kernel void kernel_gemm_moe_q4_1_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load poster router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl
index 9d24aff6a..ab8228d18 100644
--- a/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q4_k_f32_ns.cl
@@ -133,7 +133,7 @@ kernel void kernel_gemm_moe_q4_k_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -225,6 +225,10 @@ kernel void kernel_gemm_moe_q4_k_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load post router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl
index 3524cb1bd..d1a35d58b 100644
--- a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_0_f32_ns.cl
@@ -116,7 +116,7 @@ kernel void kernel_gemm_moe_q5_0_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -202,6 +202,10 @@ kernel void kernel_gemm_moe_q5_0_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load poster router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl
index 5fc2a5232..90d345ecf 100644
--- a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_1_f32_ns.cl
@@ -117,7 +117,7 @@ kernel void kernel_gemm_moe_q5_1_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -204,6 +204,10 @@ kernel void kernel_gemm_moe_q5_1_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load poster router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl
index 808a0c7db..13c26f6f3 100644
--- a/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q5_k_f32_ns.cl
@@ -134,7 +134,7 @@ kernel void kernel_gemm_moe_q5_k_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -230,6 +230,10 @@ kernel void kernel_gemm_moe_q5_k_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load post router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl
index a040335ad..85ccebec7 100644
--- a/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemm_moe_q6_k_f32_ns.cl
@@ -117,7 +117,7 @@ kernel void kernel_gemm_moe_q6_k_f32_ns(
     uint block_id_n = get_global_id(2); // n_tile
 
     // Boundary check
-    if (((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) || (block_id_n >= total_tiles[0])) {
+    if (block_id_n >= total_tiles[0]) {
         return;
     }
 
@@ -209,6 +209,10 @@ kernel void kernel_gemm_moe_q6_k_f32_ns(
         dotx16_reduce8(reg_a, shared_b, reg_c.hi, 16);
     }
 
+    if ((get_global_id(0) + block_id_m * TILESIZE_M) >= ne01) {
+        return;
+    }
+
     // Load post router and share in LM
     __local uint out_idx[TILESIZE_N];
 
diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl
index e4b44c1a5..75129e20c 100644
--- a/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32_ns.cl
@@ -82,6 +82,10 @@ __kernel void kernel_gemv_moe_mxfp4_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];
diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl
index 6f4d3f532..2d28db63e 100644
--- a/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_0_f32_ns.cl
@@ -37,6 +37,10 @@ __kernel void kernel_gemv_moe_q4_0_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];
diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl
index 3739a2157..b98bdc0f1 100644
--- a/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_1_f32_ns.cl
@@ -38,6 +38,10 @@ __kernel void kernel_gemv_moe_q4_1_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];
diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl
index 13d79f252..12464e982 100644
--- a/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q4_k_f32_ns.cl
@@ -54,6 +54,10 @@ __kernel void kernel_gemv_moe_q4_k_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];
diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl
index 938054cf9..b43613638 100644
--- a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_0_f32_ns.cl
@@ -38,6 +38,10 @@ __kernel void kernel_gemv_moe_q5_0_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];
diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl
index f33a4ef27..7a666006e 100644
--- a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_1_f32_ns.cl
@@ -39,6 +39,10 @@ __kernel void kernel_gemv_moe_q5_1_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];
diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl
index f128d4434..7d868d7ab 100644
--- a/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q5_k_f32_ns.cl
@@ -55,6 +55,10 @@ __kernel void kernel_gemv_moe_q5_k_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];
diff --git a/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl b/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl
index 526e609dc..c166bad5b 100644
--- a/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl
+++ b/ggml/src/ggml-opencl/kernels/gemv_moe_q6_k_f32_ns.cl
@@ -38,6 +38,10 @@ __kernel void kernel_gemv_moe_q6_k_f32_ns(
     uint sgid = get_local_id(1);
     uint slid = get_sub_group_local_id();
 
+    if (i01 >= ne01) {
+        return;
+    }
+
     uint i11 = i20 % ne11;
 
     uint expert_id = src2[i20];

From 95405ac65f8902a94015378a9f2e9619e3aa839c Mon Sep 17 00:00:00 2001
From: Jeff Bolz <jbolz@nvidia.com>
Date: Sat, 23 May 2026 02:44:46 -0500
Subject: [PATCH 11/17] vulkan: fix windows find_package of SPIRV-Headers
 (#23215)

* vulkan: fix windows find_package of SPIRV-Headers

* not windows-only
---
 ggml/src/ggml-vulkan/CMakeLists.txt | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt
index 6dbcea065..65785ae45 100644
--- a/ggml/src/ggml-vulkan/CMakeLists.txt
+++ b/ggml/src/ggml-vulkan/CMakeLists.txt
@@ -8,7 +8,10 @@ endif()
 
 find_package(Vulkan COMPONENTS glslc REQUIRED)
 
-find_package(SPIRV-Headers REQUIRED)
+if (DEFINED ENV{VULKAN_SDK})
+    list(APPEND CMAKE_PREFIX_PATH "$ENV{VULKAN_SDK}")
+endif()
+find_package(SPIRV-Headers CONFIG REQUIRED)
 
 if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
     # Parallel build object files

From a49747633055f2bea6c70e45baa19913750ea0bc Mon Sep 17 00:00:00 2001
From: dskwe <dskwelmcy@163.com>
Date: Sat, 23 May 2026 18:49:24 +0800
Subject: [PATCH 12/17] ggml : Check the right iface method before using the
 fallback 2d get (#23514)

---
 ggml/src/ggml-backend.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 5c0e5b1b9..87615921c 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -306,7 +306,7 @@ void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_
     GGML_ASSERT(tensor);
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
 
-    if (n_copies <= 1 || backend->iface.set_tensor_2d_async == NULL) {
+    if (n_copies <= 1 || backend->iface.get_tensor_2d_async == NULL) {
         for (size_t i = 0; i < n_copies; i++) {
             ggml_backend_tensor_get_async(backend, tensor, (char *) data + i*stride_data, offset + i*stride_tensor, size);
         }
@@ -317,7 +317,7 @@ void ggml_backend_tensor_get_2d_async(ggml_backend_t backend, const struct ggml_
     }
 
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(offset + (n_copies-1)*stride_tensor + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
     backend->iface.get_tensor_2d_async(backend, tensor, data, offset, size, n_copies, stride_tensor, stride_data);
 }
 

From b0df4c0cfd2cda10738056771714a5290dc95454 Mon Sep 17 00:00:00 2001
From: Michael Wand <michael@baybridgeaquarium.com>
Date: Sat, 23 May 2026 07:30:31 -0400
Subject: [PATCH 13/17] model : add NVFP4 MTP scale tensors (#23563)

* Add NVFP4 MTP scale tensors

* Link Qwen3.5 MTP tensors

* Aligned nullptr
---
 src/llama-model.cpp      | 12 ++++++++++++
 src/llama-model.h        | 16 ++++++++++------
 src/models/qwen35.cpp    |  5 +++--
 src/models/qwen35moe.cpp |  5 +++--
 4 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 8bf20a716..0d21b2a53 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1334,6 +1334,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
             if (!layer.ssm_beta_s && layer.ssm_beta) {
                 layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
             }
+            if (!layer.nextn.eh_proj_s && layer.nextn.eh_proj) {
+                layer.nextn.eh_proj_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.nextn.shared_head_head_s && layer.nextn.shared_head_head) {
+                layer.nextn.shared_head_head_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
 
             // input scales
             if (!layer.wq_in_s && layer.wq) {
@@ -1393,6 +1399,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) {
             if (!layer.ssm_beta_in_s && layer.ssm_beta) {
                 layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
             }
+            if (!layer.nextn.eh_proj_in_s && layer.nextn.eh_proj) {
+                layer.nextn.eh_proj_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
+            if (!layer.nextn.shared_head_head_in_s && layer.nextn.shared_head_head) {
+                layer.nextn.shared_head_head_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "input_scale", i), {1}, TENSOR_NOT_REQUIRED);
+            }
         }
         // output scales
         if (output && output->type == GGML_TYPE_NVFP4) {
diff --git a/src/llama-model.h b/src/llama-model.h
index 01c87a752..398a0aa72 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -202,12 +202,16 @@ struct llama_layer_shortconv {
 };
 
 struct llama_layer_nextn {
-    struct ggml_tensor * eh_proj          = nullptr;
-    struct ggml_tensor * embed_tokens     = nullptr;
-    struct ggml_tensor * enorm            = nullptr;
-    struct ggml_tensor * hnorm            = nullptr;
-    struct ggml_tensor * shared_head_head = nullptr;
-    struct ggml_tensor * shared_head_norm = nullptr;
+    struct ggml_tensor * eh_proj               = nullptr;
+    struct ggml_tensor * eh_proj_s             = nullptr;
+    struct ggml_tensor * eh_proj_in_s          = nullptr;
+    struct ggml_tensor * embed_tokens          = nullptr;
+    struct ggml_tensor * enorm                 = nullptr;
+    struct ggml_tensor * hnorm                 = nullptr;
+    struct ggml_tensor * shared_head_head      = nullptr;
+    struct ggml_tensor * shared_head_head_s    = nullptr;
+    struct ggml_tensor * shared_head_head_in_s = nullptr;
+    struct ggml_tensor * shared_head_norm      = nullptr;
 };
 
 struct llama_layer {
diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp
index adeb0c26e..04ecc18fc 100644
--- a/src/models/qwen35.cpp
+++ b/src/models/qwen35.cpp
@@ -538,7 +538,7 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
     ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0);
     cb(concat, "mtp_concat", il);
 
-    ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat);
+    ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s);
     cb(cur, "mtp_eh_proj", il);
 
     ggml_tensor * inpSA = cur;
@@ -626,8 +626,9 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
     cb(cur, "mtp_shared_head_norm", -1);
 
     ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output;
+    ggml_tensor * head_s = layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : model.output_s;
     GGML_ASSERT(head_w && "QWEN35 MTP: missing LM head (nextn.shared_head_head or model.output)");
-    cur = build_lora_mm(head_w, cur);
+    cur = build_lora_mm(head_w, cur, head_s);
     cb(cur, "result_output", -1);
 
     res->t_logits = cur;
diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp
index e4512116d..dc24f6ed5 100644
--- a/src/models/qwen35moe.cpp
+++ b/src/models/qwen35moe.cpp
@@ -602,7 +602,7 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
     ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0);
     cb(concat, "mtp_concat", il);
 
-    ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat);
+    ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s);
     cb(cur, "mtp_eh_proj", il);
 
     ggml_tensor * inpSA = cur;
@@ -722,8 +722,9 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
     cb(cur, "mtp_shared_head_norm", -1);
 
     ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output;
+    ggml_tensor * head_s = layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : model.output_s;
     GGML_ASSERT(head_w && "QWEN35MOE MTP: missing LM head (nextn.shared_head_head or model.output)");
-    cur = build_lora_mm(head_w, cur);
+    cur = build_lora_mm(head_w, cur, head_s);
     cb(cur, "result_output", -1);
 
     res->t_logits = cur;

From c0c7e147e7efa6c5858754b47259ba4880f8a906 Mon Sep 17 00:00:00 2001
From: Aditya Singh <60082699+adityasingh2400@users.noreply.github.com>
Date: Sat, 23 May 2026 09:24:39 -0700
Subject: [PATCH 14/17] requirements : bump torch to 2.11.0 (#23503)

* requirements: relax torch~=2.6.0 to torch>=2.6.0 for convert_hf_to_gguf

The ~=2.6.0 operator resolves to >=2.6.0, <2.7.0, which fails on
PyPI for platform/CPython combinations where 2.6.x is not present.
The accompanying comment already says 'PyTorch 2.6.0 or later', so
the looser >=2.6.0 matches the documented intent and unblocks
pip install -r requirements/requirements-convert_hf_to_gguf.txt.

Fixes #23408

* requirements: bump torch floor to 2.11.0 per maintainer

* requirements: pin torch to ==2.11.0 per project policy

* requirements: pin mtmd torch and torchvision to 2.11.0/0.26.0 per project policy

* requirements: suppress check_requirements pin warning on mtmd

The check_requirements script flags '==' on lines in files matched by
*/**/requirements*.txt. Append the documented suppression comment to the
pinned torch and torchvision lines (and to the s390x platform marker lines)
so the check passes while keeping the pins required by project policy.

* ty: silence Tensor/Module union check on model[0].auto_model

With torch 2.11.0 stubs, nn.Sequential.__getitem__ now returns
Tensor | Module rather than Module, so model[0].auto_model fails ty
on the SentenceTransformer code path. The runtime behavior is
unchanged because SentenceTransformer always wraps a Module at
index 0. Adding a targeted unresolved-attribute ignore keeps the
type-check green without altering behavior. A follow-up issue
tracks typing the variable explicitly.
---
 .../scripts/embedding/run-original-model.py           |  2 +-
 requirements/requirements-convert_hf_to_gguf.txt      |  4 ++--
 tools/mtmd/requirements.txt                           | 11 +++++++++--
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/examples/model-conversion/scripts/embedding/run-original-model.py b/examples/model-conversion/scripts/embedding/run-original-model.py
index 614c1a86b..001d58896 100755
--- a/examples/model-conversion/scripts/embedding/run-original-model.py
+++ b/examples/model-conversion/scripts/embedding/run-original-model.py
@@ -64,7 +64,7 @@ def load_model_and_tokenizer(model_path, use_sentence_transformers=False, device
         print("Using SentenceTransformer to apply all numbered layers")
         model = SentenceTransformer(model_path)
         tokenizer = model.tokenizer
-        config = model[0].auto_model.config
+        config = model[0].auto_model.config  # ty: ignore[unresolved-attribute]
     else:
         tokenizer = AutoTokenizer.from_pretrained(model_path)
         config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
diff --git a/requirements/requirements-convert_hf_to_gguf.txt b/requirements/requirements-convert_hf_to_gguf.txt
index 122b4788d..f80fdc1f6 100644
--- a/requirements/requirements-convert_hf_to_gguf.txt
+++ b/requirements/requirements-convert_hf_to_gguf.txt
@@ -1,8 +1,8 @@
 -r ./requirements-convert_legacy_llama.txt
 --extra-index-url https://download.pytorch.org/whl/cpu
 
-## Embedding Gemma requires PyTorch 2.6.0 or later
-torch~=2.6.0; platform_machine != "s390x"
+## Embedding Gemma requires PyTorch 2.6.0 or later, bumped to 2.11.0 for compatibility
+torch==2.11.0; platform_machine != "s390x"
 
 # torch s390x packages can only be found from nightly builds
 --extra-index-url https://download.pytorch.org/whl/nightly
diff --git a/tools/mtmd/requirements.txt b/tools/mtmd/requirements.txt
index 0a1f4e864..f26d8e912 100644
--- a/tools/mtmd/requirements.txt
+++ b/tools/mtmd/requirements.txt
@@ -1,5 +1,12 @@
 -r ../../requirements/requirements-convert_legacy_llama.txt
 --extra-index-url https://download.pytorch.org/whl/cpu
 pillow~=11.3.0
-torch~=2.6.0
-torchvision~=0.21.0
+
+## Embedding Gemma requires PyTorch 2.6.0 or later, bumped to 2.11.0 for compatibility
+torch==2.11.0; platform_machine != "s390x" # check_requirements: ignore "=="
+torchvision==0.26.0; platform_machine != "s390x" # check_requirements: ignore "=="
+
+# torch s390x packages can only be found from nightly builds
+--extra-index-url https://download.pytorch.org/whl/nightly
+torch>=0.0.0.dev0; platform_machine == "s390x" # check_requirements: ignore "=="
+torchvision>=0.0.0.dev0; platform_machine == "s390x" # check_requirements: ignore "=="

From b22ff4b7b43b6d0d91636f85692ff216cb7cb607 Mon Sep 17 00:00:00 2001
From: Aldehir Rojas <hello@alde.dev>
Date: Sat, 23 May 2026 17:08:22 -0400
Subject: [PATCH 15/17] cmake/ui : refactor the build (#23352)

---
 .github/workflows/release.yml                 |  19 +
 .github/workflows/server.yml                  |  14 +-
 .github/workflows/ui-build.yml                |   4 +-
 .github/workflows/ui-publish.yml              |   6 +-
 CMakeLists.txt                                |  10 -
 common/common.h                               |   6 +-
 scripts/ui-assets.cmake                       | 342 ++++++++++++++++++
 scripts/ui-download.cmake                     | 223 ------------
 scripts/xxd.cmake                             |  16 -
 tools/server/server-http.cpp                  |  51 +--
 tools/ui/CMakeLists.txt                       | 216 +++++------
 tools/ui/embed.cpp                            | 144 ++++++++
 .../ui/scripts/vite-plugin-llama-cpp-build.ts |   2 +-
 tools/ui/sources.cmake                        |  15 +
 tools/ui/svelte.config.js                     |   8 +-
 tools/ui/ui.cpp                               |   7 -
 tools/ui/ui.h                                 |  17 -
 17 files changed, 653 insertions(+), 447 deletions(-)
 create mode 100644 scripts/ui-assets.cmake
 delete mode 100644 scripts/ui-download.cmake
 delete mode 100644 scripts/xxd.cmake
 create mode 100644 tools/ui/embed.cpp
 create mode 100644 tools/ui/sources.cmake
 delete mode 100644 tools/ui/ui.cpp
 delete mode 100644 tools/ui/ui.h

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index e7cbac35f..54ca22d24 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -1234,6 +1234,9 @@ jobs:
           path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
           name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz
 
+  ui-build:
+    uses: ./.github/workflows/ui-build.yml
+
   release:
     if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
 
@@ -1259,6 +1262,7 @@ jobs:
       - macOS-cpu
       - ios-xcode-build
       - openEuler-cann
+      - ui-build
 
     outputs:
       tag_name: ${{ steps.tag.outputs.name }}
@@ -1318,6 +1322,18 @@ jobs:
           mv -v artifact/*.zip release
           mv -v artifact/*.tar.gz release
 
+      - name: Download UI build
+        id: download_ui
+        uses: actions/download-artifact@v7
+        with:
+          name: ui-build
+          path: ./ui-dist
+
+      - name: Package UI
+        id: package_ui
+        run: |
+          tar -czvf release/llama-${{ steps.tag.outputs.name }}-ui.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./ui-dist .
+
       - name: Create release
         id: create_release
         uses: ggml-org/action-create-release@v1
@@ -1367,6 +1383,9 @@ jobs:
             - [openEuler aarch64 (310p)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-310p-openEuler-aarch64.tar.gz)
             - [openEuler aarch64 (910b, ACL Graph)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-910b-openEuler-aarch64-aclgraph.tar.gz)
 
+            **UI:**
+            - [UI](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-ui.tar.gz)
+
       - name: Upload release
         id: upload_release
         uses: actions/github-script@v8
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 7b9c5a3a3..5e28c6d9e 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -54,8 +54,13 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  ui-build:
+    name: Build Web UI
+    uses: ./.github/workflows/ui-build.yml
+
   server:
     runs-on: ubuntu-latest
+    needs: ui-build
 
     name: server (${{ matrix.wf_name }})
     strategy:
@@ -93,12 +98,11 @@ jobs:
           fetch-depth: 0
           ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
 
-      - name: Setup Node.js
-        uses: actions/setup-node@v6
+      - name: Download built UI
+        uses: actions/download-artifact@v7
         with:
-          node-version: "24"
-          cache: "npm"
-          cache-dependency-path: "tools/ui/package-lock.json"
+          name: ui-build
+          path: tools/ui/dist
 
       - name: Build
         id: cmake_build
diff --git a/.github/workflows/ui-build.yml b/.github/workflows/ui-build.yml
index 511c96fb6..38fba13ad 100644
--- a/.github/workflows/ui-build.yml
+++ b/.github/workflows/ui-build.yml
@@ -31,7 +31,7 @@ jobs:
 
       - name: Generate checksums
         run: |
-          cd build/tools/ui/dist
+          cd tools/ui/dist
           for f in *; do
             sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt
           done
@@ -40,5 +40,5 @@ jobs:
         uses: actions/upload-artifact@v6
         with:
           name: ui-build
-          path: build/tools/ui/dist/
+          path: tools/ui/dist/
           retention-days: 1
diff --git a/.github/workflows/ui-publish.yml b/.github/workflows/ui-publish.yml
index 0df358557..8a0d99193 100644
--- a/.github/workflows/ui-publish.yml
+++ b/.github/workflows/ui-publish.yml
@@ -38,7 +38,7 @@ jobs:
         uses: actions/download-artifact@v7
         with:
           name: ui-build
-          path: build/tools/ui/dist/
+          path: tools/ui/dist/
 
       - name: Install Hugging Face Hub CLI
         run: pip install -U huggingface_hub
@@ -49,12 +49,12 @@ jobs:
       - name: Sync built files to Hugging Face bucket (version tag)
         run: |
           # Upload the built files to the Hugging Face bucket under the release version
-          hf buckets sync build/tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet
+          hf buckets sync tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/${{ inputs.version_tag }} --delete --quiet
 
       - name: Sync built files to Hugging Face bucket (latest)
         run: |
           # Also upload to the 'latest' directory for fallback downloads
-          hf buckets sync build/tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet
+          hf buckets sync tools/ui/dist hf://buckets/ggml-org/${{ env.HF_BUCKET_NAME }}/latest --delete --quiet
 
       - name: Verify upload
         run: |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4f7f4eca6..edd0ea1de 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -112,16 +112,6 @@ option(LLAMA_BUILD_APP       "llama: build the unified binary"
 option(LLAMA_BUILD_UI        "llama: build the embedded Web UI for server"                                       ON)
 option(LLAMA_USE_PREBUILT_UI "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" ON)
 
-# Backward compat: when old var is set but new one isn't, forward the value
-if(DEFINED LLAMA_BUILD_WEBUI)
-    set(LLAMA_BUILD_UI ${LLAMA_BUILD_WEBUI})
-    message(DEPRECATION "LLAMA_BUILD_WEBUI is deprecated, use LLAMA_BUILD_UI instead")
-endif()
-if(DEFINED LLAMA_USE_PREBUILT_WEBUI)
-    set(LLAMA_USE_PREBUILT_UI ${LLAMA_USE_PREBUILT_WEBUI})
-    message(DEPRECATION "LLAMA_USE_PREBUILT_WEBUI is deprecated, use LLAMA_USE_PREBUILT_UI instead")
-endif()
-
 option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
 option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
 
diff --git a/common/common.h b/common/common.h
index dec90456a..b0ad7b2ea 100644
--- a/common/common.h
+++ b/common/common.h
@@ -617,11 +617,7 @@ struct common_params {
     std::map<std::string, std::string> default_template_kwargs;
 
     // UI configs
-#ifdef LLAMA_UI_DEFAULT_ENABLED
-    bool ui = LLAMA_UI_DEFAULT_ENABLED != 0;
-#else
-    bool ui = true; // default to enabled when not set
-#endif
+    bool ui = true;
 
     // Deprecated: use ui, ui_mcp_proxy, ui_config_json instead
     bool webui = ui;
diff --git a/scripts/ui-assets.cmake b/scripts/ui-assets.cmake
new file mode 100644
index 000000000..ae7a1cc26
--- /dev/null
+++ b/scripts/ui-assets.cmake
@@ -0,0 +1,342 @@
+# Provision UI assets and generate ui.cpp/ui.h.
+#
+# Asset provisioning priority:
+#   1. Pre-built assets in SRC_DIST_DIR (manually built by user)
+#   2. If BUILD_UI=ON: npm build
+#   3. If above did not produce assets and HF_ENABLED=ON: HF Bucket download
+
+cmake_minimum_required(VERSION 3.16)
+
+set(UI_SOURCE_DIR     "" CACHE STRING "UI source directory (to run npm build)")
+set(UI_BINARY_DIR     "" CACHE STRING "UI binary directory (to store generated files)")
+set(LLAMA_SOURCE_DIR  "" CACHE STRING "Project source root (to resolve version from git)")
+set(HF_BUCKET         "" CACHE STRING "Hugging Face bucket name")
+set(HF_VERSION        "" CACHE STRING "Version to download (empty = resolve from git)")
+set(HF_ENABLED        "" CACHE STRING "Whether to allow HF Bucket download (ON/OFF)")
+set(BUILD_UI          "" CACHE STRING "Build UI via npm (ON/OFF)")
+set(LLAMA_UI_EMBED    "" CACHE STRING "Path to llama-ui-embed helper")
+
+set(ASSETS
+    bundle.css
+    bundle.js
+    index.html
+    loading.html
+)
+
+set(DIST_DIR     "${UI_BINARY_DIR}/dist")
+set(SRC_DIST_DIR "${UI_SOURCE_DIR}/dist")
+set(STAMP_FILE   "${UI_BINARY_DIR}/.ui-stamp")
+set(UI_CPP       "${UI_BINARY_DIR}/ui.cpp")
+set(UI_H         "${UI_BINARY_DIR}/ui.h")
+
+function(assets_present out_var)
+    set(present TRUE)
+    foreach(asset ${ASSETS})
+        if(NOT EXISTS "${DIST_DIR}/${asset}")
+            set(present FALSE)
+            break()
+        endif()
+    endforeach()
+    set(${out_var} ${present} PARENT_SCOPE)
+endfunction()
+
+function(copy_src_dist out_var)
+    set(${out_var} FALSE PARENT_SCOPE)
+
+    foreach(asset ${ASSETS})
+        if(NOT EXISTS "${SRC_DIST_DIR}/${asset}")
+            return()
+        endif()
+    endforeach()
+
+    file(MAKE_DIRECTORY "${DIST_DIR}")
+    message(STATUS "UI: using pre-built assets from ${SRC_DIST_DIR}")
+    foreach(asset ${ASSETS})
+        execute_process(
+            COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                "${SRC_DIST_DIR}/${asset}" "${DIST_DIR}/${asset}"
+        )
+    endforeach()
+    set(${out_var} TRUE PARENT_SCOPE)
+endfunction()
+
+function(npm_build_should_skip out_var)
+    set(${out_var} FALSE PARENT_SCOPE)
+
+    assets_present(present)
+    if(NOT present)
+        return()
+    endif()
+
+    if(EXISTS "${STAMP_FILE}")
+        return()
+    endif()
+
+    if(NOT EXISTS "${UI_SOURCE_DIR}/sources.cmake")
+        return()
+    endif()
+    include("${UI_SOURCE_DIR}/sources.cmake")
+
+    set(globs "")
+    foreach(g ${UI_SOURCE_GLOBS})
+        list(APPEND globs "${UI_SOURCE_DIR}/${g}")
+    endforeach()
+    file(GLOB_RECURSE sources ${globs})
+    foreach(f ${UI_SOURCE_FILES})
+        list(APPEND sources "${UI_SOURCE_DIR}/${f}")
+    endforeach()
+
+    file(TIMESTAMP "${DIST_DIR}/index.html" out_ts)
+
+    foreach(s ${sources})
+        if(NOT EXISTS "${s}")
+            continue()
+        endif()
+        file(TIMESTAMP "${s}" s_ts)
+        if(s_ts STRGREATER out_ts)
+            return()
+        endif()
+    endforeach()
+
+    set(${out_var} TRUE PARENT_SCOPE)
+endfunction()
+
+function(npm_build out_var)
+    set(${out_var} FALSE PARENT_SCOPE)
+
+    if(NOT EXISTS "${UI_SOURCE_DIR}/package.json")
+        message(STATUS "UI: ${UI_SOURCE_DIR}/package.json not found, skipping npm")
+        return()
+    endif()
+
+    npm_build_should_skip(skip)
+    if(skip)
+        message(STATUS "UI: npm output up-to-date, skipping build")
+        set(${out_var} TRUE PARENT_SCOPE)
+        return()
+    endif()
+
+    if(CMAKE_HOST_WIN32)
+        find_program(NPM_EXECUTABLE NAMES npm.cmd npm.bat npm)
+    else()
+        find_program(NPM_EXECUTABLE npm)
+    endif()
+    if(NOT NPM_EXECUTABLE)
+        message(STATUS "UI: npm not found, skipping npm build")
+        return()
+    endif()
+
+    if(NOT EXISTS "${UI_SOURCE_DIR}/node_modules")
+        message(STATUS "UI: running npm install (first time)")
+        execute_process(
+            COMMAND ${NPM_EXECUTABLE} install
+            WORKING_DIRECTORY "${UI_SOURCE_DIR}"
+            RESULT_VARIABLE rc
+            ERROR_VARIABLE  err
+        )
+        if(NOT rc EQUAL 0)
+            message(STATUS "UI: npm install failed (${rc})")
+            message(STATUS "  stderr: ${err}")
+            return()
+        endif()
+    endif()
+
+    file(MAKE_DIRECTORY "${DIST_DIR}")
+
+    message(STATUS "UI: running npm run build, output -> ${DIST_DIR}")
+    execute_process(
+        COMMAND ${CMAKE_COMMAND} -E env "LLAMA_UI_OUT_DIR=${DIST_DIR}"
+                ${NPM_EXECUTABLE} run build
+        WORKING_DIRECTORY "${UI_SOURCE_DIR}"
+        RESULT_VARIABLE rc
+        ERROR_VARIABLE  err
+    )
+    if(NOT rc EQUAL 0)
+        message(STATUS "UI: npm run build failed (${rc})")
+        message(STATUS "  stderr: ${err}")
+        return()
+    endif()
+
+    assets_present(present)
+    if(NOT present)
+        message(STATUS "UI: npm build finished but assets missing in ${DIST_DIR}")
+        return()
+    endif()
+
+    message(STATUS "UI: npm build succeeded")
+    file(REMOVE "${STAMP_FILE}")
+    set(${out_var} TRUE PARENT_SCOPE)
+endfunction()
+
+function(resolve_version out_var)
+    if(NOT "${HF_VERSION}" STREQUAL "")
+        set(${out_var} "${HF_VERSION}" PARENT_SCOPE)
+        return()
+    endif()
+
+    if(EXISTS "${LLAMA_SOURCE_DIR}/cmake/build-info.cmake")
+        include("${LLAMA_SOURCE_DIR}/cmake/build-info.cmake")
+        if(NOT "${BUILD_NUMBER}" STREQUAL "" AND NOT BUILD_NUMBER EQUAL 0)
+            set(${out_var} "b${BUILD_NUMBER}" PARENT_SCOPE)
+            return()
+        endif()
+    endif()
+
+    set(${out_var} "" PARENT_SCOPE)
+endfunction()
+
+function(hf_download version out_var out_resolved)
+    set(${out_var}      FALSE PARENT_SCOPE)
+    set(${out_resolved} ""    PARENT_SCOPE)
+
+    file(MAKE_DIRECTORY "${DIST_DIR}")
+
+    set(candidates "")
+    if(NOT "${version}" STREQUAL "")
+        list(APPEND candidates "${version}")
+    endif()
+    list(APPEND candidates "latest")
+
+    foreach(resolved ${candidates})
+        set(base "https://huggingface.co/buckets/ggml-org/${HF_BUCKET}/resolve/${resolved}")
+
+        message(STATUS "UI: downloading from ${resolved}: ${base}")
+
+        set(ok TRUE)
+        foreach(asset ${ASSETS})
+            file(DOWNLOAD "${base}/${asset}?download=true" "${DIST_DIR}/${asset}"
+                STATUS status TIMEOUT 60
+            )
+            list(GET status 0 rc)
+            if(NOT rc EQUAL 0)
+                list(GET status 1 errmsg)
+                message(STATUS "UI: download ${asset} from ${resolved} failed: ${errmsg}")
+                set(ok FALSE)
+                break()
+            endif()
+            message(STATUS "UI: downloaded ${asset}")
+        endforeach()
+
+        if(NOT ok)
+            continue()
+        endif()
+
+        # Best-effort checksum verification
+        file(DOWNLOAD "${base}/checksums.txt?download=true" "${DIST_DIR}/checksums.txt"
+            STATUS cs_status TIMEOUT 30
+        )
+        list(GET cs_status 0 cs_rc)
+        if(cs_rc EQUAL 0)
+            message(STATUS "UI: verifying checksums")
+            file(STRINGS "${DIST_DIR}/checksums.txt" cs_lines)
+            foreach(asset ${ASSETS})
+                file(SHA256 "${DIST_DIR}/${asset}" h)
+                string(TOLOWER "${h}" h)
+                string(REGEX MATCH "${h}[ \t]+${asset}" m "${cs_lines}")
+                if(NOT m)
+                    message(WARNING "UI: checksum verification failed for ${asset}")
+                    set(ok FALSE)
+                    break()
+                endif()
+            endforeach()
+            if(ok)
+                message(STATUS "UI: all checksums verified")
+            endif()
+        endif()
+
+        if(ok)
+            set(${out_var}      TRUE         PARENT_SCOPE)
+            set(${out_resolved} "${resolved}" PARENT_SCOPE)
+            return()
+        endif()
+    endforeach()
+endfunction()
+
+function(emit_files)
+    assets_present(present)
+
+    set(args "${UI_CPP}" "${UI_H}")
+    if(present)
+        foreach(asset ${ASSETS})
+            list(APPEND args "${asset}" "${DIST_DIR}/${asset}")
+        endforeach()
+    endif()
+
+    execute_process(
+        COMMAND "${LLAMA_UI_EMBED}" ${args}
+        RESULT_VARIABLE rc
+    )
+    if(NOT rc EQUAL 0)
+        message(FATAL_ERROR "UI: llama-ui-embed failed (${rc})")
+    endif()
+endfunction()
+
+# ---------------------------------------------------------------------------
+# 1. Priority 1: pre-built assets supplied in tools/ui/dist
+# ---------------------------------------------------------------------------
+copy_src_dist(SRC_OK)
+if(SRC_OK)
+    emit_files()
+    return()
+endif()
+
+# ---------------------------------------------------------------------------
+# 2. Priority 2: npm build (if BUILD_UI=ON)
+# ---------------------------------------------------------------------------
+set(provisioned FALSE)
+
+if(BUILD_UI)
+    npm_build(NPM_OK)
+    if(NPM_OK)
+        set(provisioned TRUE)
+    endif()
+endif()
+
+# ---------------------------------------------------------------------------
+# 3. Priority 3: HF Bucket download (if npm did not produce assets and HF_ENABLED=ON)
+# ---------------------------------------------------------------------------
+if(NOT provisioned AND HF_ENABLED)
+    resolve_version(VERSION)
+
+    set(stamp_ok FALSE)
+    if(EXISTS "${STAMP_FILE}" AND NOT "${VERSION}" STREQUAL "")
+        file(READ "${STAMP_FILE}" stamped)
+        string(STRIP "${stamped}" stamped)
+        if("${stamped}" STREQUAL "${VERSION}")
+            set(stamp_ok TRUE)
+        endif()
+    endif()
+
+    assets_present(have_assets)
+    if(stamp_ok AND have_assets)
+        message(STATUS "UI: HF stamp '${stamped}' matches version, skipping HF fetch")
+        set(provisioned TRUE)
+    else()
+        hf_download("${VERSION}" HF_OK HF_RESOLVED)
+        if(HF_OK)
+            file(WRITE "${STAMP_FILE}" "${HF_RESOLVED}")
+            message(STATUS "UI: HF download succeeded, stamp updated (${HF_RESOLVED})")
+            set(provisioned TRUE)
+        else()
+            message(STATUS "UI: HF download failed")
+        endif()
+    endif()
+endif()
+
+# ---------------------------------------------------------------------------
+# 4. Fallback: warn about stale or missing assets, then emit whatever we have
+# ---------------------------------------------------------------------------
+if(NOT provisioned)
+    assets_present(have_assets)
+    if(have_assets)
+        message(WARNING "UI: provisioning failed; embedding stale assets from ${DIST_DIR}")
+    else()
+        message(WARNING "UI: no assets available - building without an embedded UI. "
+                        "In a disconnected environment, download the pre-built UI "
+                        "from a llama.cpp release at "
+                        "https://github.com/ggml-org/llama.cpp/releases and "
+                        "extract to tools/ui/dist.")
+    endif()
+endif()
+
+emit_files()
diff --git a/scripts/ui-download.cmake b/scripts/ui-download.cmake
deleted file mode 100644
index 65143642a..000000000
--- a/scripts/ui-download.cmake
+++ /dev/null
@@ -1,223 +0,0 @@
-# Download UI assets from Hugging Face Bucket at build time
-# Usage: cmake -DPUBLIC_DIR=... -DHF_BUCKET=... -DHF_VERSION=... -DASSETS="a;b;c" -P scripts/ui-download.cmake
-#
-# Asset provisioning priority:
-#   1. Pre-built assets already in PUBLIC_DIR (cached from a previous run)
-#   2. Local npm build (if NPM_DIR is provided and has package.json)
-#   3. Hugging Face Bucket download (version-specific, then 'latest' fallback)
-
-cmake_minimum_required(VERSION 3.16)
-
-set(PUBLIC_DIR   "" CACHE STRING "Directory to store/download assets")
-set(HF_BUCKET    "" CACHE STRING "Hugging Face bucket name")
-set(HF_VERSION   "" CACHE STRING "Version to download (empty = resolve from git)")
-set(ASSETS       "" CACHE STRING "Plus-separated list of asset filenames (+)")
-set(STAMP_FILE   "" CACHE STRING "Stamp file to create on success (optional)")
-set(SOURCE_DIR   "" CACHE STRING "Project source root (to resolve version from git)")
-set(NPM_DIR      "" CACHE STRING "UI source directory (to run npm build)")
-set(HF_ENABLED   "" CACHE STRING "Whether to allow HF Bucket download (ON/OFF)")
-
-# ---------------------------------------------------------------------------
-# 1. Resolve version from git if not provided at configure time
-# ---------------------------------------------------------------------------
-set(RESOLVED_VERSION "${HF_VERSION}")
-if("${RESOLVED_VERSION}" STREQUAL "" AND NOT "${SOURCE_DIR}" STREQUAL "")
-    if(EXISTS "${SOURCE_DIR}/cmake/build-info.cmake")
-        include("${SOURCE_DIR}/cmake/build-info.cmake")
-        if(NOT "${BUILD_NUMBER}" STREQUAL "" AND NOT BUILD_NUMBER EQUAL 0)
-            set(RESOLVED_VERSION "b${BUILD_NUMBER}")
-            message(STATUS "UI: resolved version from git: ${RESOLVED_VERSION}")
-        endif()
-    endif()
-endif()
-
-# Convert + back to CMake list (+ is used as separator instead of ; to
-# avoid platform-specific escaping issues when passing via -D arguments)
-string(REGEX REPLACE "\\+" ";" ASSETS "${ASSETS}")
-
-# ---------------------------------------------------------------------------
-# 2. Check stamp freshness — re-download if resolved version changed
-# ---------------------------------------------------------------------------
-set(FORCE_REBUILD FALSE)
-if(NOT "${STAMP_FILE}" STREQUAL "" AND EXISTS "${STAMP_FILE}")
-    file(READ "${STAMP_FILE}" STAMPED_VERSION)
-    string(STRIP "${STAMPED_VERSION}" STAMPED_VERSION)
-    if(NOT "${STAMPED_VERSION}" STREQUAL "${RESOLVED_VERSION}")
-        message(STATUS "UI: version changed (${STAMPED_VERSION} -> ${RESOLVED_VERSION}), re-building")
-        set(FORCE_REBUILD TRUE)
-    endif()
-endif()
-
-# ---------------------------------------------------------------------------
-# 3. Check if assets already exist (cached from a previous run)
-# ---------------------------------------------------------------------------
-set(ALL_EXISTS TRUE)
-foreach(asset ${ASSETS})
-    if(NOT EXISTS "${PUBLIC_DIR}/${asset}")
-        set(ALL_EXISTS FALSE)
-        break()
-    endif()
-endforeach()
-
-if(ALL_EXISTS AND NOT FORCE_REBUILD)
-    message(STATUS "UI: all assets already exist in ${PUBLIC_DIR}, skipping")
-    return()
-endif()
-
-file(MAKE_DIRECTORY "${PUBLIC_DIR}")
-
-# ---------------------------------------------------------------------------
-# 4. Priority 2: build from source via npm (fast path for developers)
-# ---------------------------------------------------------------------------
-set(PROVISION_SUCCESS FALSE)
-
-if(NOT PROVISION_SUCCESS AND NOT "${NPM_DIR}" STREQUAL "")
-    if(EXISTS "${NPM_DIR}/package.json")
-        # Check if npm is available before attempting npm build
-        find_program(NPM_EXECUTABLE npm)
-        if(NPM_EXECUTABLE)
-            message(STATUS "UI: building from source in ${NPM_DIR}")
-
-            # Run npm install if node_modules is missing
-            if(NOT EXISTS "${NPM_DIR}/node_modules")
-                message(STATUS "UI: running npm install (first time)")
-                execute_process(
-                    COMMAND ${NPM_EXECUTABLE} install
-                    WORKING_DIRECTORY "${NPM_DIR}"
-                    RESULT_VARIABLE NPM_INSTALL_RESULT
-                    OUTPUT_VARIABLE NPM_OUT
-                    ERROR_VARIABLE  NPM_ERR
-                )
-                if(NOT NPM_INSTALL_RESULT EQUAL 0)
-                    message(STATUS "UI: npm install failed (${NPM_INSTALL_RESULT}), falling back to download")
-                    message(STATUS "  stderr: ${NPM_ERR}")
-                endif()
-            endif()
-
-            # Run the build
-            execute_process(
-                COMMAND ${NPM_EXECUTABLE} run build
-                WORKING_DIRECTORY "${NPM_DIR}"
-                RESULT_VARIABLE NPM_BUILD_RESULT
-                OUTPUT_VARIABLE NPM_OUT
-                ERROR_VARIABLE  NPM_ERR
-            )
-
-            if(NPM_BUILD_RESULT EQUAL 0)
-                # Verify that the expected assets were produced
-                set(ALL_BUILT TRUE)
-                foreach(asset ${ASSETS})
-                    if(NOT EXISTS "${PUBLIC_DIR}/${asset}")
-                        set(ALL_BUILT FALSE)
-                        break()
-                    endif()
-                endforeach()
-
-                if(ALL_BUILT)
-                    message(STATUS "UI: local npm build succeeded")
-                    set(PROVISION_SUCCESS TRUE)
-                else()
-                    message(STATUS "UI: npm build completed but assets missing from ${PUBLIC_DIR}, falling back to download")
-                endif()
-            else()
-                message(STATUS "UI: npm build failed (${NPM_BUILD_RESULT}), falling back to download")
-                message(STATUS "  stderr: ${NPM_ERR}")
-            endif()
-        else()
-            message(STATUS "UI: npm not found, skipping npm build and trying HF Bucket download")
-        endif()
-    else()
-        message(STATUS "UI: NPM_DIR (${NPM_DIR}) has no package.json, skipping npm build")
-    endif()
-endif()
-
-# ---------------------------------------------------------------------------
-# 5. Priority 3: download from Hugging Face Bucket (if enabled)
-# ---------------------------------------------------------------------------
-if(NOT PROVISION_SUCCESS AND HF_ENABLED)
-    # Build list of URLs to try — version-specific first, then 'latest'
-    set(URL_ENTRIES "")
-    if(NOT "${RESOLVED_VERSION}" STREQUAL "")
-        list(APPEND URL_ENTRIES
-            "version:https://huggingface.co/buckets/ggml-org/${HF_BUCKET}/resolve/${RESOLVED_VERSION}")
-    endif()
-    list(APPEND URL_ENTRIES
-        "latest:https://huggingface.co/buckets/ggml-org/${HF_BUCKET}/resolve/latest")
-
-    foreach(entry ${URL_ENTRIES})
-        string(REGEX REPLACE "^([^:]+):.*$" "\\1" url_label "${entry}")
-        string(REGEX REPLACE "^[^:]+:(.*)$" "\\1" base_url "${entry}")
-
-        message(STATUS "UI: downloading assets from ${url_label}: ${base_url}")
-
-        # Download each asset
-        set(ALL_OK TRUE)
-        foreach(asset ${ASSETS})
-            set(download_url "${base_url}/${asset}?download=true")
-            set(download_path "${PUBLIC_DIR}/${asset}")
-            file(DOWNLOAD "${download_url}" "${download_path}"
-                STATUS download_status TIMEOUT 60
-            )
-            list(GET download_status 0 download_result)
-            if(NOT download_result EQUAL 0)
-                list(GET download_status 1 error_message)
-                message(STATUS "UI: failed to download ${asset} from ${url_label}: ${error_message}")
-                set(ALL_OK FALSE)
-                break()
-            endif()
-            message(STATUS "UI: downloaded ${asset}")
-        endforeach()
-
-        if(NOT ALL_OK)
-            continue()
-        endif()
-
-        # Verify checksums if the server provides them
-        file(DOWNLOAD "${base_url}/checksums.txt?download=true"
-            "${PUBLIC_DIR}/checksums.txt"
-            STATUS checksum_status TIMEOUT 30
-        )
-        list(GET checksum_status 0 checksum_result)
-        if(checksum_result EQUAL 0)
-            message(STATUS "UI: verifying checksums...")
-            file(STRINGS "${PUBLIC_DIR}/checksums.txt" CHECKSUMS_CONTENT)
-            foreach(asset ${ASSETS})
-                set(download_path "${PUBLIC_DIR}/${asset}")
-                file(SHA256 "${download_path}" asset_hash)
-                string(TOLOWER "${asset_hash}" EXPECTED_HASH_LOWER)
-                string(REGEX MATCH "${EXPECTED_HASH_LOWER}[ \\t]+${asset}" CHECKSUM_LINE "${CHECKSUMS_CONTENT}")
-                if(NOT CHECKSUM_LINE)
-                    message(WARNING "UI: checksum verification failed for ${asset}")
-                    set(ALL_OK FALSE)
-                    break()
-                endif()
-            endforeach()
-            if(ALL_OK)
-                message(STATUS "UI: all checksums verified")
-            endif()
-        endif()
-
-        if(ALL_OK)
-            set(PROVISION_SUCCESS TRUE)
-            break()
-        endif()
-    endforeach()
-
-    if(PROVISION_SUCCESS)
-        message(STATUS "UI: provisioning complete")
-    else()
-        message(WARNING "UI: failed to download assets from HF Bucket (${HF_BUCKET})")
-    endif()
-endif()
-
-# ---------------------------------------------------------------------------
-# 6. Write stamp file on success (stores resolved version for freshness check)
-# ---------------------------------------------------------------------------
-if(PROVISION_SUCCESS)
-    if(NOT "${STAMP_FILE}" STREQUAL "")
-        file(WRITE "${STAMP_FILE}" "${RESOLVED_VERSION}")
-    endif()
-else()
-    message(WARNING "UI: no source available. Neither local build (${NPM_DIR}) nor HF Bucket download succeeded.")
-    message(WARNING "UI: building server without embedded UI. Set LLAMA_BUILD_UI=OFF to suppress this warning.")
-endif()
diff --git a/scripts/xxd.cmake b/scripts/xxd.cmake
deleted file mode 100644
index 73f6cfff7..000000000
--- a/scripts/xxd.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMake equivalent of `xxd -i ${INPUT} ${OUTPUT}`
-# Usage: cmake -DINPUT=build/tools/ui/dist/index.html -DOUTPUT=build/tools/ui/dist/index.html.hpp -P scripts/xxd.cmake
-
-SET(INPUT "" CACHE STRING "Input File")
-SET(OUTPUT "" CACHE STRING "Output File")
-
-get_filename_component(filename "${INPUT}" NAME)
-string(REGEX REPLACE "\\.|-" "_" name "${filename}")
-
-file(READ "${INPUT}" hex_data HEX)
-string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," hex_sequence "${hex_data}")
-
-string(LENGTH ${hex_data} hex_len)
-math(EXPR len "${hex_len} / 2")
-
-file(WRITE "${OUTPUT}" "unsigned char ${name}[] = {${hex_sequence}};\nunsigned int ${name}_len = ${len};\n")
diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp
index 9d008fc94..9c025952d 100644
--- a/tools/server/server-http.cpp
+++ b/tools/server/server-http.cpp
@@ -231,16 +231,19 @@ bool server_http_context::init(const common_params & params) {
     };
 
     auto middleware_server_state = [this](const httplib::Request & req, httplib::Response & res) {
-        (void)req; // suppress unused parameter warning when LLAMA_BUILD_UI is not defined
         bool ready = is_ready.load();
         if (!ready) {
-#if defined(LLAMA_BUILD_UI)
+#if defined(LLAMA_UI_HAS_ASSETS)
             auto tmp = string_split<std::string>(req.path, '.');
             if (req.path == "/" || (tmp.size() > 0 && tmp.back() == "html")) {
-                res.status = 503;
-                res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
-                return false;
+                if (const llama_ui_asset * a = llama_ui_find_asset("loading.html")) {
+                    res.status = 503;
+                    res.set_content(reinterpret_cast<const char*>(a->data), a->size, "text/html; charset=utf-8");
+                    return false;
+                }
             }
+#else
+            (void)req;
 #endif
             // no endpoints are allowed to be accessed when the server is not ready
             // this is to prevent any data races or inconsistent states
@@ -312,23 +315,27 @@ bool server_http_context::init(const common_params & params) {
                 return 1;
             }
         } else {
-#if defined(LLAMA_BUILD_UI)
-            // using embedded static index.html
-            srv->Get(params.api_prefix + "/", [](const httplib::Request & /*req*/, httplib::Response & res) {
-                // COEP and COOP headers, required by pyodide (python interpreter)
-                res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
-                res.set_header("Cross-Origin-Opener-Policy", "same-origin");
-                res.set_content(reinterpret_cast<const char*>(index_html), index_html_len, "text/html; charset=utf-8");
-                return false;
-            });
-            srv->Get(params.api_prefix + "/bundle.js", [](const httplib::Request & /*req*/, httplib::Response & res) {
-                res.set_content(reinterpret_cast<const char*>(bundle_js), bundle_js_len, "application/javascript; charset=utf-8");
-                return false;
-            });
-            srv->Get(params.api_prefix + "/bundle.css", [](const httplib::Request & /*req*/, httplib::Response & res) {
-                res.set_content(reinterpret_cast<const char*>(bundle_css), bundle_css_len, "text/css; charset=utf-8");
-                return false;
-            });
+#if defined(LLAMA_UI_HAS_ASSETS)
+            auto serve_asset = [](const std::string & name, const char * mime, bool with_isolation_headers) {
+                return [name, mime, with_isolation_headers](const httplib::Request & /*req*/, httplib::Response & res) {
+                    const llama_ui_asset * a = llama_ui_find_asset(name.c_str());
+                    if (!a) {
+                        res.status = 404;
+                        return false;
+                    }
+                    if (with_isolation_headers) {
+                        // COEP and COOP headers, required by pyodide (python interpreter)
+                        res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
+                        res.set_header("Cross-Origin-Opener-Policy", "same-origin");
+                    }
+                    res.set_content(reinterpret_cast<const char*>(a->data), a->size, mime);
+                    return false;
+                };
+            };
+
+            srv->Get(params.api_prefix + "/",           serve_asset("index.html", "text/html; charset=utf-8",              true));
+            srv->Get(params.api_prefix + "/bundle.js",  serve_asset("bundle.js",  "application/javascript; charset=utf-8", false));
+            srv->Get(params.api_prefix + "/bundle.css", serve_asset("bundle.css", "text/css; charset=utf-8",               false));
 #endif
         }
     }
diff --git a/tools/ui/CMakeLists.txt b/tools/ui/CMakeLists.txt
index 383940cb6..d4cf35802 100644
--- a/tools/ui/CMakeLists.txt
+++ b/tools/ui/CMakeLists.txt
@@ -1,150 +1,98 @@
 set(TARGET llama-ui)
 
-# Deprecated: use LLAMA_UI_HF_BUCKET instead
-set(LLAMA_WEBUI_HF_BUCKET "llama-ui" CACHE STRING "Hugging Face bucket name for prebuilt webui assets (deprecated: use LLAMA_UI_HF_BUCKET)")
 set(LLAMA_UI_HF_BUCKET "llama-ui" CACHE STRING "Hugging Face bucket name for prebuilt UI assets")
 
 # Backward compat: forward old var to new one
-if(DEFINED LLAMA_WEBUI_HF_BUCKET AND NOT DEFINED LLAMA_UI_HF_BUCKET)
+if(DEFINED LLAMA_BUILD_WEBUI)
+    set(LLAMA_BUILD_UI ${LLAMA_BUILD_WEBUI})
+    message(DEPRECATION "LLAMA_BUILD_WEBUI is deprecated, use LLAMA_BUILD_UI instead")
+endif()
+if(DEFINED LLAMA_USE_PREBUILT_WEBUI)
+    set(LLAMA_USE_PREBUILT_UI ${LLAMA_USE_PREBUILT_WEBUI})
+    message(DEPRECATION "LLAMA_USE_PREBUILT_WEBUI is deprecated, use LLAMA_USE_PREBUILT_UI instead")
+endif()
+if(DEFINED LLAMA_WEBUI_HF_BUCKET)
     set(LLAMA_UI_HF_BUCKET ${LLAMA_WEBUI_HF_BUCKET})
-elseif(DEFINED LLAMA_WEBUI_HF_BUCKET AND NOT "${LLAMA_WEBUI_HF_BUCKET}" STREQUAL "${LLAMA_UI_HF_BUCKET}")
     message(DEPRECATION "LLAMA_WEBUI_HF_BUCKET is deprecated, use LLAMA_UI_HF_BUCKET instead")
 endif()
 
-set(TARGET_SRCS "")
-set(UI_COMPILE_DEFS "")
-
-if(LLAMA_BUILD_UI)
-    set(PUBLIC_ASSETS
-        index.html
-        bundle.js
-        bundle.css
-        loading.html
-    )
-
-    # Determine source of UI assets (priority: local > HF Bucket)
-    set(UI_SOURCE "")
-    set(UI_SOURCE_DIR "")
-
-    # Priority 1: Check for local build output
-    set(LOCAL_UI_DIR "${PROJECT_SOURCE_DIR}/build/tools/ui/dist")
-
-    # Verify all required assets exist before declaring local source valid
-    set(ALL_ASSETS_PRESENT TRUE)
-    foreach(asset ${PUBLIC_ASSETS})
-        if(NOT EXISTS "${LOCAL_UI_DIR}/${asset}")
-            set(ALL_ASSETS_PRESENT FALSE)
-            break()
-        endif()
-    endforeach()
-
-    if(ALL_ASSETS_PRESENT)
-        set(UI_SOURCE "local")
-        set(UI_SOURCE_DIR "${LOCAL_UI_DIR}")
-        message(STATUS "UI: using local build from ${UI_SOURCE_DIR}")
-    endif()
-
-    # Priority 2: Build-time asset provisioning (npm build → HF Bucket fallback)
-    if(NOT UI_SOURCE_DIR)
-        # Environment variable takes precedence (e.g., from CI workflows)
-        # Deprecated: use HF_UI_VERSION instead
-        if(DEFINED ENV{HF_WEBUI_VERSION})
-            set(HF_UI_VERSION "$ENV{HF_WEBUI_VERSION}")
-            message(DEPRECATION "HF_WEBUI_VERSION env var is deprecated, use HF_UI_VERSION instead")
-            if(NOT HF_UI_VERSION MATCHES "^[A-Za-z0-9._-]+$")
-                message(FATAL_ERROR "UI: invalid HF_WEBUI_VERSION='${HF_UI_VERSION}' - must match ^[A-Za-z0-9._-]+$")
-            endif()
-        elseif(DEFINED ENV{HF_UI_VERSION})
-            set(HF_UI_VERSION "$ENV{HF_UI_VERSION}")
-            if(NOT HF_UI_VERSION MATCHES "^[A-Za-z0-9._-]+$")
-                message(FATAL_ERROR "UI: invalid HF_UI_VERSION='${HF_UI_VERSION}' - must match ^[A-Za-z0-9._-]+$")
-            endif()
-        elseif(DEFINED LLAMA_BUILD_NUMBER)
-            set(HF_UI_VERSION "b${LLAMA_BUILD_NUMBER}")
-            message(STATUS "UI: derived HF_UI_VERSION=b${LLAMA_BUILD_NUMBER}")
-        else()
-            set(HF_UI_VERSION "")
-            message(STATUS "UI: version not specified (will use HF 'latest')")
-        endif()
-
-        if("${HF_UI_VERSION}" STREQUAL "")
-            set(UI_VERSION_TAG "provisioned")
-        else()
-            set(UI_VERSION_TAG "${HF_UI_VERSION}")
-        endif()
-        set(UI_STAMP "${CMAKE_CURRENT_BINARY_DIR}/.ui-${UI_VERSION_TAG}.stamp")
-
-        string(REPLACE ";" "+" PUBLIC_ASSETS_JOINED "${PUBLIC_ASSETS}")
-
-        add_custom_command(
-            OUTPUT ${UI_STAMP}
-            COMMAND ${CMAKE_COMMAND}
-                "-DSOURCE_DIR=${PROJECT_SOURCE_DIR}"
-                "-DPUBLIC_DIR=${PROJECT_SOURCE_DIR}/build/tools/ui/dist"
-                "-DHF_BUCKET=${LLAMA_UI_HF_BUCKET}"
-                "-DHF_VERSION=${HF_UI_VERSION}"
-                "-DHF_ENABLED=${LLAMA_USE_PREBUILT_UI}"
-                "-DASSETS=${PUBLIC_ASSETS_JOINED}"
-                "-DSTAMP_FILE=${UI_STAMP}"
-                "-DNPM_DIR=${PROJECT_SOURCE_DIR}/tools/ui"
-                -P ${PROJECT_SOURCE_DIR}/scripts/ui-download.cmake
-            COMMENT "Building/provisioning UI assets (npm build -> HF Bucket fallback)"
-        )
-
-        set(UI_SOURCE "provisioned")
-        set(UI_SOURCE_DIR "${PROJECT_SOURCE_DIR}/build/tools/ui/dist")
-    endif()
-
-    # Process assets from the determined source
-    if(UI_SOURCE_DIR)
-        foreach(asset ${PUBLIC_ASSETS})
-            set(input "${UI_SOURCE_DIR}/${asset}")
-            set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
-            list(APPEND TARGET_SRCS ${output})
-
-            if(UI_SOURCE STREQUAL "local")
-                if(NOT EXISTS "${input}")
-                    message(FATAL_ERROR "UI asset not found: ${input}")
-                endif()
-                set(dependency "${input}")
-            else()
-                set(dependency "${UI_STAMP}")
-            endif()
-
-            add_custom_command(
-                DEPENDS ${dependency}
-                OUTPUT "${output}"
-                COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
-            )
-            set_source_files_properties(${output} PROPERTIES GENERATED TRUE)
-        endforeach()
-
-        list(APPEND UI_COMPILE_DEFS
-            LLAMA_BUILD_UI
-            LLAMA_UI_DEFAULT_ENABLED=1
-        )
-        message(STATUS "UI: embedded with source: ${UI_SOURCE}")
-    else()
-        message(WARNING "UI: no source available. Neither local build (build/tools/ui/dist/) nor HF Bucket download succeeded.")
-        message(WARNING "UI: building server without embedded UI. Set LLAMA_BUILD_UI=OFF to suppress this warning.")
-        list(APPEND UI_COMPILE_DEFS LLAMA_UI_DEFAULT_ENABLED=0)
-    endif()
+# Resolve HF asset version: explicit env var > derived from build number > unset
+if(DEFINED ENV{HF_WEBUI_VERSION})
+    set(HF_UI_VERSION "$ENV{HF_WEBUI_VERSION}")
+    message(DEPRECATION "HF_WEBUI_VERSION env var is deprecated, use HF_UI_VERSION instead")
+elseif(DEFINED ENV{HF_UI_VERSION})
+    set(HF_UI_VERSION "$ENV{HF_UI_VERSION}")
+elseif(DEFINED LLAMA_BUILD_NUMBER)
+    set(HF_UI_VERSION "b${LLAMA_BUILD_NUMBER}")
 else()
-    list(APPEND UI_COMPILE_DEFS LLAMA_UI_DEFAULT_ENABLED=0)
+    set(HF_UI_VERSION "")
 endif()
 
-# Build the static library
-add_library(${TARGET} STATIC ui.cpp)
+if(NOT "${HF_UI_VERSION}" STREQUAL "" AND NOT HF_UI_VERSION MATCHES "^[A-Za-z0-9._-]+$")
+    message(FATAL_ERROR "UI: invalid HF_UI_VERSION='${HF_UI_VERSION}' - must match ^[A-Za-z0-9._-]+$")
+endif()
 
-target_include_directories(${TARGET} PUBLIC
-    ${CMAKE_CURRENT_SOURCE_DIR}
-    ${CMAKE_CURRENT_BINARY_DIR}
+set(UI_CPP "${CMAKE_CURRENT_BINARY_DIR}/ui.cpp")
+set(UI_H   "${CMAKE_CURRENT_BINARY_DIR}/ui.h")
+
+if(CMAKE_CROSSCOMPILING)
+    find_program(HOST_CXX_COMPILER NAMES g++ clang++ NO_CMAKE_FIND_ROOT_PATH)
+    if(NOT HOST_CXX_COMPILER)
+        message(FATAL_ERROR "UI: no host C++ compiler (g++/clang++) found to build llama-ui-embed; set -DHOST_CXX_COMPILER=<path>")
+    endif()
+    message(STATUS "UI: building llama-ui-embed with host compiler ${HOST_CXX_COMPILER}")
+
+    if(CMAKE_HOST_WIN32)
+        set(LLAMA_UI_EMBED_EXE "${CMAKE_CURRENT_BINARY_DIR}/llama-ui-embed.exe")
+    else()
+        set(LLAMA_UI_EMBED_EXE "${CMAKE_CURRENT_BINARY_DIR}/llama-ui-embed")
+    endif()
+
+    add_custom_command(
+        OUTPUT  "${LLAMA_UI_EMBED_EXE}"
+        COMMAND "${HOST_CXX_COMPILER}" -O2 -std=c++17
+                -o "${LLAMA_UI_EMBED_EXE}" "${CMAKE_CURRENT_SOURCE_DIR}/embed.cpp"
+        DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/embed.cpp"
+        COMMENT "Building llama-ui-embed (host)"
+        VERBATIM
+    )
+    add_custom_target(llama-ui-embed DEPENDS "${LLAMA_UI_EMBED_EXE}")
+else()
+    add_executable(llama-ui-embed embed.cpp)
+    target_compile_features(llama-ui-embed PRIVATE cxx_std_17)
+    set_target_properties(llama-ui-embed PROPERTIES
+        RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
+    )
+    set(LLAMA_UI_EMBED_EXE "$<TARGET_FILE:llama-ui-embed>")
+endif()
+
+# Run the provisioning script every build so source changes in tools/ui/ are
+# always picked up. The script uses copy_if_different for ui.cpp/ui.h, so the
+# library only recompiles when contents actually change.
+add_custom_target(llama-ui-assets ALL
+    BYPRODUCTS ${UI_CPP} ${UI_H}
+    COMMAND ${CMAKE_COMMAND}
+        "-DUI_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR}"
+        "-DUI_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR}"
+        "-DLLAMA_SOURCE_DIR=${PROJECT_SOURCE_DIR}"
+        "-DHF_BUCKET=${LLAMA_UI_HF_BUCKET}"
+        "-DHF_VERSION=${HF_UI_VERSION}"
+        "-DHF_ENABLED=${LLAMA_USE_PREBUILT_UI}"
+        "-DBUILD_UI=${LLAMA_BUILD_UI}"
+        "-DLLAMA_UI_EMBED=${LLAMA_UI_EMBED_EXE}"
+        -P "${PROJECT_SOURCE_DIR}/scripts/ui-assets.cmake"
+    COMMENT "Provisioning UI assets"
+    VERBATIM
 )
 
-target_compile_definitions(${TARGET} PUBLIC ${UI_COMPILE_DEFS})
+add_dependencies(llama-ui-assets llama-ui-embed)
 
-if(TARGET_SRCS)
-    # List generated .hpp files as sources so CMake tracks them as build dependencies
-    target_sources(${TARGET} PRIVATE ${TARGET_SRCS})
-    set_source_files_properties(${TARGET_SRCS} PROPERTIES HEADER_FILE_ONLY TRUE)
-endif()
+set_source_files_properties(${UI_CPP} ${UI_H} PROPERTIES GENERATED TRUE)
+
+add_library(${TARGET} STATIC ${UI_CPP} ${UI_H})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+add_dependencies(${TARGET} llama-ui-assets)
+
+target_include_directories(${TARGET} PUBLIC
+    ${CMAKE_CURRENT_BINARY_DIR}
+)
diff --git a/tools/ui/embed.cpp b/tools/ui/embed.cpp
new file mode 100644
index 000000000..41227868e
--- /dev/null
+++ b/tools/ui/embed.cpp
@@ -0,0 +1,144 @@
+// llama-ui-embed: generate ui.cpp / ui.h that embed UI assets as C arrays.
+//
+// Usage:
+//   llama-ui-embed <out_cpp> <out_h> [<asset_name> <asset_path>]...
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <fstream>
+#include <string>
+#include <vector>
+
+static bool read_file(const std::string & path, std::vector<unsigned char> & out) {
+    std::ifstream f(path, std::ios::binary | std::ios::ate);
+    if (!f) {
+        fprintf(stderr, "embed: cannot open %s\n", path.c_str());
+        return false;
+    }
+    const auto sz = f.tellg();
+    if (sz < 0) {
+        return false;
+    }
+    f.seekg(0);
+    out.resize(static_cast<size_t>(sz));
+    if (sz > 0 && !f.read(reinterpret_cast<char *>(out.data()), sz)) {
+        return false;
+    }
+    return true;
+}
+
+static void append_bytes_hex(std::string & out, const std::vector<unsigned char> & bytes) {
+    static const char hex[] = "0123456789abcdef";
+    out.reserve(out.size() + bytes.size() * 5);
+    for (unsigned char b : bytes) {
+        out += '0';
+        out += 'x';
+        out += hex[b >> 4];
+        out += hex[b & 0xf];
+        out += ',';
+    }
+}
+
+static bool write_if_different(const std::string & path, const std::string & content) {
+    std::ifstream f(path, std::ios::binary | std::ios::ate);
+    if (f) {
+        const auto sz = f.tellg();
+        if (sz >= 0 && static_cast<size_t>(sz) == content.size()) {
+            std::string existing(static_cast<size_t>(sz), '\0');
+            f.seekg(0);
+            if (sz == 0 || f.read(existing.data(), sz)) {
+                if (existing == content) {
+                    return true;
+                }
+            }
+        }
+    }
+
+    std::ofstream out(path, std::ios::binary | std::ios::trunc);
+    if (!out) {
+        fprintf(stderr, "embed: cannot write %s\n", path.c_str());
+        return false;
+    }
+    if (!content.empty()) {
+        out.write(content.data(), static_cast<std::streamsize>(content.size()));
+    }
+    return out.good();
+}
+
+static std::string fmt(const char * pattern, ...) {
+    char tmp[512];
+    va_list ap;
+    va_start(ap, pattern);
+    const int n = vsnprintf(tmp, sizeof(tmp), pattern, ap);
+    va_end(ap);
+    return (n > 0) ? std::string(tmp, static_cast<size_t>(n)) : std::string();
+}
+
+int main(int argc, char ** argv) {
+    if (argc < 3 || ((argc - 3) % 2) != 0) {
+        fprintf(stderr, "usage: %s <out_cpp> <out_h> [<name> <path>]...\n", argv[0]);
+        return 1;
+    }
+
+    const std::string out_cpp = argv[1];
+    const std::string out_h   = argv[2];
+    const int n_assets = (argc - 3) / 2;
+
+    std::string h;
+    h += "#pragma once\n\n#include <stddef.h>\n\n";
+    if (n_assets > 0) {
+        h += "#define LLAMA_UI_HAS_ASSETS 1\n\n";
+    }
+    h +=
+        "struct llama_ui_asset {\n"
+        "    const char *          name;\n"
+        "    const unsigned char * data;\n"
+        "    size_t                size;\n"
+        "};\n\n"
+        "const llama_ui_asset * llama_ui_find_asset(const char * name);\n";
+
+    std::string cpp;
+    cpp += "#include \"ui.h\"\n\n#include <string.h>\n\n";
+
+    if (n_assets > 0) {
+        for (int i = 0; i < n_assets; i++) {
+            const char * path = argv[3 + i * 2 + 1];
+            std::vector<unsigned char> bytes;
+            if (!read_file(path, bytes)) {
+                return 1;
+            }
+            cpp += fmt("static const unsigned char asset_%d_data[] = {", i);
+            append_bytes_hex(cpp, bytes);
+            cpp += fmt("};\nstatic const size_t        asset_%d_size = %lu;\n\n",
+                       i, static_cast<unsigned long>(bytes.size()));
+        }
+
+        cpp += "static const llama_ui_asset g_assets[] = {\n";
+        for (int i = 0; i < n_assets; i++) {
+            const char * name = argv[3 + i * 2];
+            cpp += fmt("    { \"%s\", asset_%d_data, asset_%d_size },\n", name, i, i);
+        }
+        cpp += "};\n\n";
+
+        cpp +=
+            "const llama_ui_asset * llama_ui_find_asset(const char * name) {\n"
+            "    for (const auto & a : g_assets) {\n"
+            "        if (strcmp(a.name, name) == 0) {\n"
+            "            return &a;\n"
+            "        }\n"
+            "    }\n"
+            "    return nullptr;\n"
+            "}\n";
+    } else {
+        cpp +=
+            "const llama_ui_asset * llama_ui_find_asset(const char *) {\n"
+            "    return nullptr;\n"
+            "}\n";
+    }
+
+    bool ok = true;
+    ok = write_if_different(out_h,   h)   && ok;
+    ok = write_if_different(out_cpp, cpp) && ok;
+    return ok ? 0 : 1;
+}
diff --git a/tools/ui/scripts/vite-plugin-llama-cpp-build.ts b/tools/ui/scripts/vite-plugin-llama-cpp-build.ts
index ddf6fa1e5..01c714a24 100644
--- a/tools/ui/scripts/vite-plugin-llama-cpp-build.ts
+++ b/tools/ui/scripts/vite-plugin-llama-cpp-build.ts
@@ -19,7 +19,7 @@ const GUIDE_FOR_FRONTEND = `
 -->
 `.trim();
 
-const OUTPUT_DIR = '../../build/tools/ui/dist';
+const OUTPUT_DIR = process.env.LLAMA_UI_OUT_DIR ?? './dist';
 
 export function llamaCppBuildPlugin(): Plugin {
 	return {
diff --git a/tools/ui/sources.cmake b/tools/ui/sources.cmake
new file mode 100644
index 000000000..de9dbf78b
--- /dev/null
+++ b/tools/ui/sources.cmake
@@ -0,0 +1,15 @@
+# Inputs used to decide whether the npm build output is up-to-date.
+
+set(UI_SOURCE_GLOBS
+    src/*
+    static/*
+)
+
+set(UI_SOURCE_FILES
+    package.json
+    package-lock.json
+    vite.config.ts
+    svelte.config.js
+    tsconfig.json
+    scripts/vite-plugin-llama-cpp-build.ts
+)
diff --git a/tools/ui/svelte.config.js b/tools/ui/svelte.config.js
index 4b14065da..1a7355b2d 100644
--- a/tools/ui/svelte.config.js
+++ b/tools/ui/svelte.config.js
@@ -2,6 +2,10 @@ import { mdsvex } from 'mdsvex';
 import adapter from '@sveltejs/adapter-static';
 import { vitePreprocess } from '@sveltejs/vite-plugin-svelte';
 
+// CMake sets LLAMA_UI_OUT_DIR to the staging dir under the build tree; manual
+// `npm run build` runs without the env var default to ./dist.
+const outDir = process.env.LLAMA_UI_OUT_DIR ?? './dist';
+
 /** @type {import('@sveltejs/kit').Config} */
 const config = {
 	// Consult https://svelte.dev/docs/kit/integrations
@@ -14,8 +18,8 @@ const config = {
 		},
 		router: { type: 'hash' },
 		adapter: adapter({
-			pages: '../../build/tools/ui/dist',
-			assets: '../../build/tools/ui/dist',
+			pages: outDir,
+			assets: outDir,
 			fallback: 'index.html',
 			precompress: false,
 			strict: true
diff --git a/tools/ui/ui.cpp b/tools/ui/ui.cpp
deleted file mode 100644
index d02a62c2c..000000000
--- a/tools/ui/ui.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifdef LLAMA_BUILD_UI
-// auto generated files (see README.md for details)
-#include "index.html.hpp"
-#include "bundle.js.hpp"
-#include "bundle.css.hpp"
-#include "loading.html.hpp"
-#endif
diff --git a/tools/ui/ui.h b/tools/ui/ui.h
deleted file mode 100644
index 6f775ea3a..000000000
--- a/tools/ui/ui.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#pragma once
-
-// TODO @ngxson : refactor, wrap these in a function
-
-#ifdef LLAMA_BUILD_UI
-extern unsigned char index_html[];
-extern unsigned int  index_html_len;
-
-extern unsigned char bundle_js[];
-extern unsigned int  bundle_js_len;
-
-extern unsigned char bundle_css[];
-extern unsigned int  bundle_css_len;
-
-extern unsigned char loading_html[];
-extern unsigned int  loading_html_len;
-#endif

From cec51c7a7d3d290b40a9980bfa55fb447b82f679 Mon Sep 17 00:00:00 2001
From: Aparna M P <aparmp@qti.qualcomm.com>
Date: Sun, 24 May 2026 08:26:41 +0530
Subject: [PATCH 16/17] snapdragon: update windows toolchain to use hsdk
 v6.6.0.0 (#23552)

---
 docs/backend/snapdragon/CMakeUserPresets.json |  4 ++--
 docs/backend/snapdragon/README.md             |  4 ++--
 docs/backend/snapdragon/windows.md            | 12 ++++++------
 scripts/snapdragon/windows/setup-build.ps1    |  6 +++---
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/docs/backend/snapdragon/CMakeUserPresets.json b/docs/backend/snapdragon/CMakeUserPresets.json
index d2629fc4d..d37100764 100644
--- a/docs/backend/snapdragon/CMakeUserPresets.json
+++ b/docs/backend/snapdragon/CMakeUserPresets.json
@@ -33,8 +33,8 @@
         "name": "arm64-windows-snapdragon",
         "inherits": [ "base", "arm64-windows-llvm" ],
         "cacheVariables": {
-            "CMAKE_C_FLAGS":   "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
-            "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16 -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
+            "CMAKE_C_FLAGS":   "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
+            "CMAKE_CXX_FLAGS": "-march=armv8.7a+fp16+dotprod+i8mm -fvectorize -ffp-model=fast -flto -D_GNU_SOURCE",
             "CMAKE_C_FLAGS_RELEASE":          "-O3 -DNDEBUG",
             "CMAKE_CXX_FLAGS_RELEASE":        "-O3 -DNDEBUG",
             "CMAKE_C_FLAGS_RELWITHDEBINFO":   "-O3 -DNDEBUG -g",
diff --git a/docs/backend/snapdragon/README.md b/docs/backend/snapdragon/README.md
index f5bb3d11c..a90f7da30 100644
--- a/docs/backend/snapdragon/README.md
+++ b/docs/backend/snapdragon/README.md
@@ -24,7 +24,7 @@ Native Windows 11 arm64 builds has the following tools dependencies:
   - UCRT and Driver Kit
 - LLVM core libraries and Clang compiler (winget)
 - CMake, Git, Python (winget)
-- Hexagon SDK Community Edition 6.4 or later (see windows.md)
+- Hexagon SDK Community Edition 6.6 or later (see windows.md)
 - OpenCL SDK 2.3 or later (see windows.md)
 
 Note: The rest of the **Windows** build process assumes that you're running natively in Powershell.
@@ -45,7 +45,7 @@ Preset CMake variables:
   GGML_HEXAGON="ON"
   GGML_OPENCL="ON"
   GGML_OPENMP="OFF"
-  HEXAGON_SDK_ROOT="/opt/hexagon/6.4.0.2"
+  HEXAGON_SDK_ROOT="/opt/hexagon/6.6.0.0"
 ...
 -- Including OpenCL backend
 -- Including Hexagon backend
diff --git a/docs/backend/snapdragon/windows.md b/docs/backend/snapdragon/windows.md
index 6307e1b69..aa731413c 100644
--- a/docs/backend/snapdragon/windows.md
+++ b/docs/backend/snapdragon/windows.md
@@ -28,15 +28,15 @@ c:\Qualcomm\OpenCL_SDK\2.3.2
 
 Either use the trimmed down version (optimized for CI) from
 
-    https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz
+    https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.6.0.0/hexagon-sdk-v6.6.0.0-arm64-wos.tar.xz
 
 Or download the complete official version from
 
-    https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.4.0.2
+    https://softwarecenter.qualcomm.com/catalog/item/Hexagon_SDK?version=6.6.0.0
 
 Unzip/untar the archive into
 ```
-c:\Qualcomm\Hexagon_SDK\6.4.0.2
+c:\Qualcomm\Hexagon_SDK\6.6.0.0
 ```
 
 ## Install the latest Adreno GPU driver
@@ -123,10 +123,10 @@ The overall Hexagon backend build procedure for Windows on Snapdragon is the sam
 However, additional settings are required for generating and signing HTP Ops libraries.
 ```
 > $env:OPENCL_SDK_ROOT="C:\Qualcomm\OpenCL_SDK\2.3.2"
-> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2"
-> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.4.0.2\tools\HEXAGON_Tools\19.0.04"
+> $env:HEXAGON_SDK_ROOT="C:\Qualcomm\Hexagon_SDK\6.6.0.0"
+> $env:HEXAGON_TOOLS_ROOT="C:\Qualcomm\Hexagon_SDK\6.6.0.0\tools\HEXAGON_Tools\19.0.07"
 > $env:HEXAGON_HTP_CERT="c:\Users\MyUsers\Certs\ggml-htp-v1.pfx"
-> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0\arm64"
+> $env:WINDOWS_SDK_BIN="C:\Program Files (x86)\Windows Kits\10\bin\10.0.26100.0"
 
 > cmake --preset arm64-windows-snapdragon-release -B build-wos
 ...
diff --git a/scripts/snapdragon/windows/setup-build.ps1 b/scripts/snapdragon/windows/setup-build.ps1
index 0f3244cc9..d8ef24d44 100644
--- a/scripts/snapdragon/windows/setup-build.ps1
+++ b/scripts/snapdragon/windows/setup-build.ps1
@@ -7,10 +7,10 @@ $ErrorActionPreference = "Stop"
 $BaseDir = "C:\Qualcomm"
 
 # SDK 1: Hexagon
-$HexagonUrl     = "https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.4.0.2/hexagon-sdk-v6.4.0.2-arm64-wos.tar.xz"
+$HexagonUrl     = "https://github.com/snapdragon-toolchain/hexagon-sdk/releases/download/v6.6.0.0/hexagon-sdk-v6.6.0.0-arm64-wos.tar.xz"
 $HexagonParent  = Join-Path $BaseDir "Hexagon_SDK"
-$HexagonSdkVersion   = "6.4.0.2"
-$HexagonToolsVersion = "19.0.04"
+$HexagonSdkVersion   = "6.6.0.0"
+$HexagonToolsVersion = "19.0.07"
 $HexagonSdkTarget    = Join-Path $HexagonParent $HexagonSdkVersion
 $HexagonToolsTarget  = Join-Path $HexagonSdkTarget "\tools\HEXAGON_Tools\$HexagonToolsVersion"
 

From 1c0f6db54542a8c152d8f9403415d68b5da61264 Mon Sep 17 00:00:00 2001
From: Yiwei Shao <44545837+njsyw1997@users.noreply.github.com>
Date: Sat, 23 May 2026 19:56:59 -0700
Subject: [PATCH 17/17] hexagon: apply repl optimization in flash attn softmax
 as #22993 (#23455)

---
 ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
index 4a4ff0b33..9e1b778b0 100644
--- a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c
@@ -852,9 +852,10 @@ static void fa_softmax_thread(unsigned int n, unsigned int i, void * data) {
             v_s_rowmax1 = hvx_vec_reduce_max_f16(v_s_rowmax1);
 
             // Splat m_prev[r], m_prev[r+1] from the per-row accumulator.
-            // vror brings the target lane to lane 0, then extract + re-splat.
-            HVX_Vector v_m_prev0 = hvx_vec_splat_f16(hvx_vec_get_f16(Q6_V_vror_VR(m_prev_v, r_vec_off * 2)));
-            HVX_Vector v_m_prev1 = hvx_vec_splat_f16(hvx_vec_get_f16(Q6_V_vror_VR(m_prev_v, (r_vec_off + 1) * 2)));
+            // vror brings the target lane to lane 0, then vdelta replicates it
+            // across all lanes — stays in the vector domain (no store/reload).
+            HVX_Vector v_m_prev0 = hvx_vec_repl_f16(Q6_V_vror_VR(m_prev_v, r_vec_off * 2));
+            HVX_Vector v_m_prev1 = hvx_vec_repl_f16(Q6_V_vror_VR(m_prev_v, (r_vec_off + 1) * 2));
 
             // HVX max — both operands are splats, so result is splat of m_new.
             HVX_Vector v_dup_m0 = Q6_Vhf_vmax_VhfVhf(v_m_prev0, v_s_rowmax0);