From c9021714e843dd02670f18bd826be076bc58502c Mon Sep 17 00:00:00 2001
From: Pascal <admin@serveurperso.com>
Date: Thu, 21 May 2026 10:09:19 +0200
Subject: [PATCH 01/12] server: re-inject subcommand when router spawns
 children under unified binary (#23442)

---
 app/llama.cpp                  | 9 +++++++++
 tools/server/server-models.cpp | 8 ++++++++
 2 files changed, 17 insertions(+)
diff --git a/app/llama.cpp b/app/llama.cpp
index e149975d2..b0b86fd47 100644
--- a/app/llama.cpp
+++ b/app/llama.cpp
@@ -1,6 +1,7 @@
 #include "build-info.h"
 
 #include <cstdio>
+#include <cstdlib>
 #include <string>
 #include <vector>
 
@@ -77,6 +78,14 @@ int main(int argc, char ** argv) {
 
     for (const auto & cmd : cmds) {
         if (matches(arg, cmd)) {
+
+            // router spawns children through this same binary, it needs the
+            // subcommand to relaunch as 'llama serve' and not bare options
+#ifdef _WIN32
+            _putenv_s("LLAMA_APP_CMD", cmd.name);
+#else
+            setenv("LLAMA_APP_CMD", cmd.name, 1);
+#endif
             return cmd.func(argc - 1, argv + 1);
         }
     }
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index ccf42320f..47b6c2a4e 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -14,6 +14,7 @@
 #include <mutex>
 #include <condition_variable>
 #include <cstring>
+#include <cstdlib>
 #include <atomic>
 #include <chrono>
 #include <queue>
@@ -159,6 +160,13 @@ void server_model_meta::update_args(common_preset_context & ctx_preset, std::str
     // TODO: maybe validate preset before rendering ?
     // render args
     args = preset.to_args(bin_path);
+
+    // unified binary dispatches by subcommand, re-inject it right after the
+    // binary path so the child starts as 'llama serve ...' not 'llama ...'
+    const char * app_cmd = std::getenv("LLAMA_APP_CMD");
+    if (app_cmd != nullptr && app_cmd[0] != '\0' && !bin_path.empty()) {
+        args.insert(args.begin() + 1, app_cmd);
+    }
 }
 
 void server_model_meta::update_caps() {

From 52fb93a2bd6b12673b9f4f225e61968e70443b11 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Thu, 21 May 2026 16:11:11 +0800
Subject: [PATCH 02/12] server : free draft/MTP resources on sleep to fix VRAM
 leak (#23461)

The destroy() function in server_context_impl only cleaned up the main
model and context (via llama_init.reset()) but did not free the speculative
decoder (spec), draft context (ctx_dft), or draft model (model_dft).

For MTP (Multi-Token Prediction) models, ctx_dft holds GPU-allocated
resources (KV cache, compute buffers) that are not freed when entering
the sleeping state. On each sleep/resume cycle, new resources are
allocated without the old ones being freed, leading to a VRAM leak
that eventually crashes the server with out-of-memory errors.

Fix by explicitly resetting spec, ctx_dft, and model_dft in destroy()
before resetting llama_init, ensuring proper cleanup order to avoid
use-after-free.

ref: https://github.com/ggml-org/llama.cpp/issues/23395

Assisted-by: llama.cpp:local pi
---
 tools/server/server-context.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index f51731026..80d77b0c0 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -701,6 +701,10 @@ private:
     bool sleeping = false;
 
     void destroy() {
+        spec.reset();
+        ctx_dft.reset();
+        model_dft.reset();
+
         llama_init.reset();
 
         ctx_tgt = nullptr;

From a1a69f777a14bb8584ba0eb53505cd5ee888bd5e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 21 May 2026 13:34:08 +0300
Subject: [PATCH 03/12] metal : optimize concat kernel and fix set kernel
 threads (#23411)

* metal : fix GGML_OP_SET kernel threads

* tests : extend test_cpy to support different src/dst shapes

Extend test_cpy to support different source and destination tensor shapes
for CPY operations (reshaping), where the total number of elements must match.

- Renamed ne -> ne_src, added ne_dst parameter (default: use src shape)
- Added 50 new reshaping test cases covering 1D<->2D<->3D<->4D conversions
- Tests exercise 1024 boundary, small shapes, and large dimensionality changes
- Fixed dangling reference bug (storing & to temporary std::array)
- Updated all existing test calls with permute/transpose args for compatibility

Assisted-by: llama.cpp:local pi

* metal : optimize concat kernel with row batching for small widths

When ne0 < 256, batch multiple rows into a single threadgroup to improve
occupancy. This avoids underutilizing the GPU when processing narrow tensors.

- Dispatch nth = min(256, ne0) threads per group
- Calculate nrptg (rows per threadgroup) to fill up to 256 threads
- Update kernel index calculation to handle the row batching
- Add boundary check for i1 >= ne1

Assisted-by: llama.cpp:local pi

* tests : clean-up

* tests : refactor CPY shape tests to use dimension permutations

Replace 75 hardcoded test cases with a loop over permutations of
{3, 5, 7, 32} (total elements: 3360). Each src permutation is tested
against canonical sorted and reverse dst, skipping identical shapes.
Covers F32, F16, and Q4_0 (when both src and dst ne0 == 32).

Assisted-by: llama.cpp:local pi
---
 ggml/src/ggml-metal/ggml-metal-ops.cpp |  19 +++-
 ggml/src/ggml-metal/ggml-metal.metal   |   6 +-
 tests/test-backend-ops.cpp             | 130 ++++++++++++++++++-------
 3 files changed, 113 insertions(+), 42 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index 8506000b6..206af227a 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -564,9 +564,20 @@ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) {
     ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
     ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         3);
 
-    const int nth = std::min(1024, ne0);
+    int nth = std::min(256, ne0);
 
-    ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1);
+    // when rows are small, we can batch them together in a single threadgroup
+    int nrptg = 1;
+    if (nth < 256) {
+        nrptg = std::min((256 + nth - 1) / nth, ne1);
+        if (nrptg * nth > 256) {
+            nrptg = 256 / nth;
+        }
+    }
+
+    const int nw0 = (ne1 + nrptg - 1) / nrptg;
+
+    ggml_metal_encoder_dispatch_threadgroups(enc, nw0, ne2, ne3, nth, nrptg, 1);
 
     return 1;
 }
@@ -1786,7 +1797,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) {
         nk0 = ne10/ggml_blck_size(op->type);
     }
 
-    int nth = std::min<int>(nk0, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+    int nth = std::min<int>(nk0*ne11, 256);
 
     // when rows are small, we can batch them together in a single threadgroup
     int nrptg = 1;
@@ -1797,7 +1808,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) {
             nrptg = (nth + nk0 - 1)/nk0;
             nth   = nk0;
 
-            if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+            if (nrptg*nth > 256) {
                 nrptg--;
             }
         }
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index 4cf9dbea9..e772664ba 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -7486,7 +7486,11 @@ kernel void kernel_concat(
 
     const int i3 = tgpig.z;
     const int i2 = tgpig.y;
-    const int i1 = tgpig.x;
+    const int i1 = ntg.y == 1 ? tgpig.x : tgpig.x*ntg.y + tpitg.y;
+
+    if (i1 >= args.ne1) {
+        return;
+    }
 
     int o[4] = {0, 0, 0, 0};
     o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03));
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 43343b679..303f5a40d 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2866,15 +2866,24 @@ struct test_set : public test_case {
 struct test_cpy : public test_case {
     const ggml_type type_src;
     const ggml_type type_dst;
-    const std::array<int64_t, 4> ne;
+    const std::array<int64_t, 4> ne_src;
+    const std::array<int64_t, 4> ne_dst;
     const std::array<int64_t, 4> permute_src;
     const std::array<int64_t, 4> permute_dst;
     bool _src_use_permute;
     bool _dst_use_permute;
     bool _src_transpose;
+    bool _use_dst_shape;
 
     std::string vars() override {
-        return VARS_TO_STR6(type_src, type_dst, ne, permute_src, permute_dst, _src_transpose);
+        if (_use_dst_shape) {
+            return VARS_TO_STR7(type_src, type_dst, ne_src, ne_dst, permute_src, permute_dst, _src_transpose);
+        }
+        return VARS_TO_STR6(type_src, type_dst, ne_src, permute_src, permute_dst, _src_transpose);
+    }
+
+    int64_t total_elements() const {
+        return ne_src[0] * ne_src[1] * ne_src[2] * ne_src[3];
     }
 
     double max_nmse_err() override {
@@ -2899,7 +2908,7 @@ struct test_cpy : public test_case {
                 err_estimate /= 8.0f;
             }
             err_estimate *= err_estimate;
-            err_estimate /= (150.0f*150.0f*0.25f)*float(ne[0] * ne[1] * ne[2] * ne[3]);
+            err_estimate /= (150.0f*150.0f*0.25f)*float(total_elements());
             return err_estimate;
         }
         return 1e-6;
@@ -2910,17 +2919,19 @@ struct test_cpy : public test_case {
     }
 
     test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1},
+            std::array<int64_t, 4> ne_src = {10, 10, 10, 1},
+            std::array<int64_t, 4> ne_dst = {-1, -1, -1, -1},
             std::array<int64_t, 4> permute_src = {0, 0, 0, 0},
             std::array<int64_t, 4> permute_dst = {0, 0, 0, 0},
             bool transpose_src = false)
-        : type_src(type_src), type_dst(type_dst), ne(ne), permute_src(permute_src), permute_dst(permute_dst),
+        : type_src(type_src), type_dst(type_dst), ne_src(ne_src), ne_dst(ne_dst), permute_src(permute_src), permute_dst(permute_dst),
           _src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0),
           _dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0),
-          _src_transpose(transpose_src){}
+          _src_transpose(transpose_src),
+          _use_dst_shape(ne_dst[0] >= 0 && ne_dst[1] >= 0 && ne_dst[2] >= 0 && ne_dst[3] >= 0){}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
+        ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne_src.data());
         ggml_set_param(src);
         ggml_set_name(src, "src");
 
@@ -2934,7 +2945,8 @@ struct test_cpy : public test_case {
             ggml_set_name(src, "src_transposed");
         }
 
-        ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
+        std::array<int64_t, 4> dst_ne = _use_dst_shape ? ne_dst : std::array<int64_t, 4>{src->ne[0], src->ne[1], src->ne[2], src->ne[3]};
+        ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne.data());
         ggml_set_name(dst, "dst");
 
         if (_dst_use_permute) {
@@ -8040,42 +8052,72 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
 
         for (int k = 1; k < 4; ++k) {
             test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}));
-            test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 2, 1, 3}));
-            test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 3, 1, 2}, {0, 2, 1, 3}));
+            test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3}));
+            test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {-1,-1,-1,-1}, {0, 3, 1, 2}, {0, 2, 1, 3}));
         }
     }
 
     for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) {
         for (ggml_type type_dst : all_types) {
             test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
-            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); // cpy by rows
         }
     }
     for (ggml_type type_src : all_types) {
         for (ggml_type type_dst : {GGML_TYPE_F32}) {
             test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4}));
-            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); // cpy by rows
         }
     }
     for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) {
         for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) {
-            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous
+            test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3})); // cpy not-contiguous
         }
     }
     test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}, {1, 0, 2, 3}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3}));
     test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}, {1, 0, 2, 3}));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 3}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 3}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
+
+    // CPY - different src/dst shapes (reshaping via CPY)
+    // Use permutations of {3, 5, 7, 32}. Total elements: 3*5*7*32 = 3360.
+    // Each src permutation is tested against canonical sorted and reverse dst (skip self).
+    {
+        std::array<int64_t, 4> dims = {3, 5, 7, 32};
+        std::sort(dims.begin(), dims.end());
+        std::array<int64_t, 4> canonical = dims;
+        std::array<int64_t, 4> reversed  = {32, 7, 5, 3};
+        for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+            std::array<int64_t, 4> cur = dims;
+            do {
+                if (cur != canonical) {
+                    test_cases.emplace_back(new test_cpy(type, type, cur, canonical));
+                }
+                if (cur != reversed) {
+                    test_cases.emplace_back(new test_cpy(type, type, cur, reversed));
+                }
+                if (cur[0] == 32 && type == GGML_TYPE_F32) {
+                    if (canonical[0] == 32) {
+                        test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0, cur, canonical));
+                    }
+                    if (reversed[0] == 32) {
+                        test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0, cur, reversed));
+                    }
+                }
+                std::next_permutation(cur.begin(), cur.end());
+            } while (cur != canonical);
+        }
+    }
 
     for (ggml_type type_dst : { GGML_TYPE_F32, GGML_TYPE_I32, GGML_TYPE_F16, GGML_TYPE_BF16 }) {
         for (bool use_view_slice : { true, false }) {
@@ -8830,9 +8872,24 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {256, 16, 2, 3}, 1));
     test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {128, 16, 2, 3}, 2));
     test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {64, 16, 2, 3}, 3));
+
     test_cases.emplace_back(new test_pad());
     test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {33, 17, 2, 1}, 4, 3, true)); // circular
     test_cases.emplace_back(new test_pad_ext());
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 1, 1, 1}, 1, 0, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 2, 1, 1}, 1, 0, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 16, 1, 1}, 0, 1, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1023, 1, 1, 1}, 1, 0, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1023, 8, 1, 1}, 1, 0, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1025, 1, 1, 1}, 1, 0, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1025, 8, 1, 1}, 1, 0, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2048, 1, 1, 1}, 1, 0, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2048, 4, 1, 1}, 1, 0, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2049, 1, 1, 1}, 1, 0, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 1, 1, 1}, 100, 0, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 1, 1, 1}, 0, 100, false));
+    test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 100, 1, 1}, 50, 50, false));
+
     test_cases.emplace_back(new test_pad_reflect_1d());
     test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1}));
     test_cases.emplace_back(new test_roll());
@@ -9132,22 +9189,21 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1}));
 
     test_cases.emplace_back(new test_cpy(GGML_TYPE_F32,  GGML_TYPE_F16,  {512, 3072, 1, 1}));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32,  GGML_TYPE_F32,  {8192, 512, 2, 1}, {0, 2, 1, 3}));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32,  GGML_TYPE_F32,  {3072, 512, 2, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32,  GGML_TYPE_F32,  {8192, 512, 2, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32,  GGML_TYPE_F32,  {3072, 512, 2, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3}));
     test_cases.emplace_back(new test_cpy(GGML_TYPE_F32,  GGML_TYPE_Q4_0, {8192, 512, 2, 1}));
     test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_F32,  {8192, 512, 2, 1}));
 
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
-
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
-    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0}));
 
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
+    test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
 
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f));
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {12888, 256, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f));

From b65bb4baaeae712975e09a92e1d37d3842ea5da7 Mon Sep 17 00:00:00 2001
From: ScrewTSW <TheScrewCollab@gmail.com>
Date: Thu, 21 May 2026 13:29:13 +0200
Subject: [PATCH 04/12] server: expose prompt token counts in /slots endpoint
 (#23454)

Add n_prompt_tokens, n_prompt_tokens_processed, and n_prompt_tokens_cache
to the /slots JSON response. These fields are already tracked internally
but were not exposed, making it impossible for clients to monitor prompt
evaluation progress during processing.
---
 tools/server/server-context.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 80d77b0c0..b939e3b75 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -506,6 +506,9 @@ struct server_slot {
 
         if (ptask) {
             res["id_task"] = ptask->id;
+            res["n_prompt_tokens"]           = (int32_t) prompt.tokens.size();
+            res["n_prompt_tokens_processed"] = n_prompt_tokens_processed;
+            res["n_prompt_tokens_cache"]     = n_prompt_tokens_cache;
             res["params"] = ptask->params.to_json(only_metrics);
             res["next_token"] = {
                 {

From 40d5358d3c730b81729ba81cd5c44ed596d02510 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 21 May 2026 14:41:50 +0300
Subject: [PATCH 05/12] tests : move save-load-state from examples to tests
 (#23336)

* tests : move save-load-state from examples to tests

- Move examples/save-load-state/ to tests/test-save-load-state.cpp
- Remove subdirectory reference from examples/CMakeLists.txt
- Add test to tests/CMakeLists.txt as a model test
- Remove CODEOWNERS entry for removed example directory

Assisted-by: llama.cpp:local pi

* cont : update ci
---
 CODEOWNERS                                                | 1 -
 ci/run.sh                                                 | 8 ++++----
 examples/CMakeLists.txt                                   | 1 -
 examples/save-load-state/CMakeLists.txt                   | 5 -----
 tests/CMakeLists.txt                                      | 4 ++++
 .../save-load-state.cpp => tests/test-save-load-state.cpp | 0
 6 files changed, 8 insertions(+), 11 deletions(-)
 delete mode 100644 examples/save-load-state/CMakeLists.txt
 rename examples/save-load-state/save-load-state.cpp => tests/test-save-load-state.cpp (100%)

diff --git a/CODEOWNERS b/CODEOWNERS
index f92fe98ce..4b9d90177 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -49,7 +49,6 @@
 /examples/parallel/                     @ggerganov
 /examples/passkey/                      @ggerganov
 /examples/retrieval/                    @ggerganov
-/examples/save-load-state/              @ggerganov
 /examples/speculative-simple/           @ggerganov
 /examples/speculative/                  @ggerganov
 /ggml/cmake/                            @ggerganov
diff --git a/ci/run.sh b/ci/run.sh
index a8cbd3371..b096dc23b 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -461,10 +461,10 @@ function gg_run_qwen3_0_6b {
 
     (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
 
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on  --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off                ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on                 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on  --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off                ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on                 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
 
     function check_ppl {
         qnt="$1"
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index a29dc707c..39f802d25 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -27,7 +27,6 @@ else()
     add_subdirectory(parallel)
     add_subdirectory(passkey)
     add_subdirectory(retrieval)
-    add_subdirectory(save-load-state)
     add_subdirectory(simple)
     add_subdirectory(simple-chat)
     add_subdirectory(speculative)
diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt
deleted file mode 100644
index 78024672e..000000000
--- a/examples/save-load-state/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET llama-save-load-state)
-add_executable(${TARGET} save-load-state.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 0fdbd39c9..33ae3b303 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -255,6 +255,10 @@ set_tests_properties(test-state-restore-fragmented PROPERTIES FIXTURES_REQUIRED
 llama_build_and_test(test-recurrent-state-rollback.cpp LABEL "model" ARGS -m "${MODEL_DEST}")
 set_tests_properties(test-recurrent-state-rollback PROPERTIES FIXTURES_REQUIRED test-download-model)
 
+# Test state save/load functionality
+llama_build_and_test(test-save-load-state.cpp LABEL "model" ARGS -m "${MODEL_DEST}")
+set_tests_properties(test-save-load-state PROPERTIES FIXTURES_REQUIRED test-download-model)
+
 if (NOT GGML_BACKEND_DL)
     # these tests use the backends directly and cannot be built with dynamic loading
     llama_build_and_test(test-barrier.cpp)
diff --git a/examples/save-load-state/save-load-state.cpp b/tests/test-save-load-state.cpp
similarity index 100%
rename from examples/save-load-state/save-load-state.cpp
rename to tests/test-save-load-state.cpp

From 5306f4b3b54a0e261e83b1d2961a97685e898871 Mon Sep 17 00:00:00 2001
From: Chen Yuan <constantchen525@gmail.com>
Date: Thu, 21 May 2026 10:58:49 -0400
Subject: [PATCH 06/12] fix(flash-attn): replace f32 with kv_type and q_type
 (#23372)

---
 .../wgsl-shaders/flash_attn_tile.wgsl         | 44 +++++++++----------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl
index ae8036b9a..4133f0ab5 100644
--- a/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl
@@ -122,9 +122,9 @@ const V_CHUNKS: u32 = HEAD_DIM_V / 4u;
 const SCORE_REGS_PER_LANE: u32 = (KV_TILE + MIN_SUBGROUP_SIZE - 1u) / MIN_SUBGROUP_SIZE;
 const OUT_REGS_PER_LANE: u32 = (V_CHUNKS + MIN_SUBGROUP_SIZE - 1u) / MIN_SUBGROUP_SIZE;
 
-var<workgroup> q_shmem: array<f32, Q_TILE * HEAD_DIM_QK>;
-var<workgroup> kv_shmem: array<f32, KV_TILE * KV_STAGE_STRIDE>;
-var<workgroup> p_shmem: array<f32, Q_TILE * KV_TILE>;
+var<workgroup> q_shmem: array<Q_TYPE, Q_TILE * HEAD_DIM_QK>;
+var<workgroup> kv_shmem: array<KV_TYPE, KV_TILE * KV_STAGE_STRIDE>;
+var<workgroup> p_shmem: array<KV_TYPE, Q_TILE * KV_TILE>;
 
 @compute @workgroup_size(WG_SIZE)
 fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
@@ -169,10 +169,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
 
     let head = f32(head_idx);
     let slope = select(1.0,
-                       select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0),
-                              pow(params.m0, head + 1.0),
-                              head < params.n_head_log2),
-                       params.max_bias > 0.0);
+        select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0),
+                pow(params.m0, head + 1.0),
+                head < params.n_head_log2),
+        params.max_bias > 0.0);
 
     for (var elem_idx = local_id.x; elem_idx < Q_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE) {
         let q_tile_row = elem_idx / HEAD_DIM_QK;
@@ -181,7 +181,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
         let global_q_row_offset = q_head_offset + head_q_row * params.stride_q1;
         q_shmem[elem_idx] = select(
             0.0,
-            f32(Q[global_q_row_offset + q_col]) * params.scale,
+            Q_TYPE(Q[global_q_row_offset + q_col]) * params.scale,
             head_q_row < params.seq_len_q);
     }
 
@@ -213,10 +213,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
             let k_vec_index = (k_head_offset + global_k_row * params.stride_k1 + chunk * 4u) >> 2u;
             let k4 = K[k_vec_index];
             let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
-            kv_shmem[kv_off + 0u] = f32(k4.x);
-            kv_shmem[kv_off + 1u] = f32(k4.y);
-            kv_shmem[kv_off + 2u] = f32(k4.z);
-            kv_shmem[kv_off + 3u] = f32(k4.w);
+            kv_shmem[kv_off + 0u] = KV_TYPE(k4.x);
+            kv_shmem[kv_off + 1u] = KV_TYPE(k4.y);
+            kv_shmem[kv_off + 2u] = KV_TYPE(k4.z);
+            kv_shmem[kv_off + 3u] = KV_TYPE(k4.w);
         }
 
         workgroupBarrier();
@@ -233,18 +233,18 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
                 var dot_val = 0.0;
                 for (var chunk = 0u; chunk < Q_CHUNKS; chunk += 1u) {
                     let q_off = q_base + chunk * 4u;
-                    let qv = vec4<f32>(
+                    let qv = vec4<Q_TYPE>(
                         q_shmem[q_off + 0u],
                         q_shmem[q_off + 1u],
                         q_shmem[q_off + 2u],
                         q_shmem[q_off + 3u]);
                     let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
-                    let kv = vec4<f32>(
+                    let kv = vec4<KV_TYPE>(
                         kv_shmem[kv_off + 0u],
                         kv_shmem[kv_off + 1u],
                         kv_shmem[kv_off + 2u],
                         kv_shmem[kv_off + 3u]);
-                    dot_val += dot(qv, kv);
+                    dot_val += dot(vec4<f32>(qv), vec4<f32>(kv));
                 }
 #ifdef LOGIT_SOFTCAP
                 dot_val = params.logit_softcap * tanh(dot_val);
@@ -271,7 +271,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
             let kv_local = sg_inv_id + slot * subgroup_size;
             if (row_active && kv_local < kv_count) {
                 let p = exp(local_scores[slot] - new_max);
-                p_shmem[subgroup_p_offset + kv_local] = p;
+                p_shmem[subgroup_p_offset + kv_local] = KV_TYPE(p);
                 local_sum += p;
             }
         }
@@ -285,10 +285,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
             let v_vec_index = (v_head_offset + global_v_row * params.stride_v1 + chunk * 4u) >> 2u;
             let v4 = V[v_vec_index];
             let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
-            kv_shmem[kv_off + 0u] = f32(v4.x);
-            kv_shmem[kv_off + 1u] = f32(v4.y);
-            kv_shmem[kv_off + 2u] = f32(v4.z);
-            kv_shmem[kv_off + 3u] = f32(v4.w);
+            kv_shmem[kv_off + 0u] = KV_TYPE(v4.x);
+            kv_shmem[kv_off + 1u] = KV_TYPE(v4.y);
+            kv_shmem[kv_off + 2u] = KV_TYPE(v4.z);
+            kv_shmem[kv_off + 3u] = KV_TYPE(v4.w);
         }
 
         workgroupBarrier();
@@ -308,12 +308,12 @@ fn main(@builtin(workgroup_id) wg_id: vec3<u32>,
                 for (var kv_local = 0u; kv_local < kv_count; kv_local += 1u) {
                     let p = p_shmem[subgroup_p_offset + kv_local];
                     let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u;
-                    let v4 = vec4<f32>(
+                    let v4 = vec4<KV_TYPE>(
                         kv_shmem[kv_off + 0u],
                         kv_shmem[kv_off + 1u],
                         kv_shmem[kv_off + 2u],
                         kv_shmem[kv_off + 3u]);
-                    acc += p * v4;
+                    acc += f32(p) * vec4<f32>(v4);
                 }
                 out_regs[reg_idx] = acc;
             }

From 47c0eda9d4980bdb3031f6affe98ccaf6e1e69ee Mon Sep 17 00:00:00 2001
From: Pascal <admin@serveurperso.com>
Date: Thu, 21 May 2026 19:39:42 +0200
Subject: [PATCH 07/12] vulkan: fuse snake activation (mul, sin, sqr, mul, add)
 (#22855)

* vulkan: fuse snake activation (mul, sin, sqr, mul, add)

Add snake.comp shader with F32 / F16 / BF16 pipelines and
ggml_vk_snake_dispatch_fused. The matcher recognizes the naive 5 op
decomposition emitted by audio decoders (BigVGAN, Vocos) for snake
activation y = x + sin(a*x)^2 * inv_b and rewrites it to a single
elementwise kernel.

test_snake_fuse from the CUDA PR now also compares CPU naive vs
Vulkan fused across F32 / F16 / BF16.

* vulkan: address jeffbolznv review for fused snake activation

Rename T / C to ne0 / ne1 in the shader and push constants to match
the standard naming convention used across the Vulkan backend.

Tighten ggml_vk_can_fuse_snake: require x and dst to be contiguous
(the shader uses idx = i0 + i1 * ne0) and require a / inv_b to be
tightly packed on the broadcast dim (the shader reads data_a[i1]).

* vulkan: tighten snake fusion type checks for all operands (address jeffbolznv review)

* vulkan: reject snake fusion when ne[2] or ne[3] > 1 (address jeffbolznv review)

* vulkan: address 0cc4m review for fused snake activation

snake.comp is renamed to follow the ggml DATA_A_* / A_TYPE convention.
A_TYPE now applies to the activation tensor data_a instead of the
broadcast multiplier, and the bindings become data_a (A_TYPE), data_b
(float), data_c (float) and data_d (D_TYPE). A header at the top of
the shader maps each buffer to its role in y = x + sin(b * x)^2 * c.

On the C++ side, ggml_vk_can_fuse_snake reuses the existing snake_pattern
constant instead of duplicating the op list, sin_node is extracted as a
named local alongside the other chain nodes, and the broadcast operands
a and inv_b are now required to be GGML_TYPE_F32 to match the hardcoded
float bindings on data_b and data_c (the previous a->type == x->type
would silently reject any future BF16 or F16 chain once the supports_op
gate for SIN / SQR is lifted). ggml_vk_snake_dispatch_fused gets an
explicit GGML_TYPE_F32 case and GGML_ABORT on default in place of the
silent f32 fallback, and a stale comment about data_a[i1] / data_inv_b[i1]
is refreshed to match the new binding names.
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp          | 136 +++++++++++++++++-
 .../src/ggml-vulkan/vulkan-shaders/snake.comp |  49 +++++++
 .../vulkan-shaders/vulkan-shaders-gen.cpp     |   4 +
 3 files changed, 187 insertions(+), 2 deletions(-)
 create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/snake.comp

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index d3fb19048..aa289220a 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -499,6 +499,12 @@ static constexpr std::initializer_list<ggml_op> topk_moe_late_softmax      { GGM
                                                                              GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
                                                                              GGML_OP_SOFT_MAX, GGML_OP_RESHAPE };
 
+// Snake activation: y = x + sin(a*x)^2 * inv_b. Used by the optimize_graph reorder
+// pass so it keeps the chain contiguous and by the dispatcher to detect the fusion.
+static constexpr std::initializer_list<ggml_op> snake_pattern              { GGML_OP_MUL,      GGML_OP_SIN,
+                                                                             GGML_OP_SQR,      GGML_OP_MUL,
+                                                                             GGML_OP_ADD };
+
 //node #978 (  SOFT_MAX):     ffn_moe_probs-15 (   0K) [Vulka         ] use=2:    ffn_moe_logits-15 (   0K) [Vulka         ]
 //node #979 (   RESHAPE): ffn_moe_probs-15 (re (   0K) [Vulka         ] use=1:     ffn_moe_probs-15 (   0K) [Vulka         ]
 //node #980 (   ARGSORT):   ffn_moe_argsort-15 (   0K) [Vulka         ] use=1:     ffn_moe_probs-15 (   0K) [Vulka         ]
@@ -846,6 +852,9 @@ struct vk_device_struct {
     vk_pipeline pipeline_im2col_3d_f32, pipeline_im2col_3d_f32_f16;
     vk_pipeline pipeline_timestep_embedding_f32;
     vk_pipeline pipeline_conv_transpose_1d_f32;
+    vk_pipeline pipeline_snake_f32;
+    vk_pipeline pipeline_snake_f16;
+    vk_pipeline pipeline_snake_bf16;
     vk_pipeline pipeline_pool2d_f32;
     vk_pipeline pipeline_rwkv_wkv6_f32;
     vk_pipeline pipeline_rwkv_wkv7_f32;
@@ -1475,6 +1484,11 @@ struct vk_op_conv_transpose_1d_push_constants {
     int32_t s0;
 };
 
+struct vk_op_snake_push_constants {
+    uint32_t ne0;
+    uint32_t ne1;
+};
+
 struct vk_op_pool2d_push_constants {
     uint32_t IW; uint32_t IH;
     uint32_t OW; uint32_t OH;
@@ -4845,6 +4859,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
     ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_snake_f32,  "snake_f32",  snake_f32_len,  snake_f32_data,  "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_snake_f16,  "snake_f16",  snake_f16_len,  snake_f16_data,  "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_snake_bf16, "snake_bf16", snake_bf16_len, snake_bf16_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
@@ -12110,6 +12128,45 @@ static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context&
     ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p));
 }
 
+// Dispatch the fused snake activation: y = x + sin^2(a * x) * inv_b.
+// Match the naive mul -> sin -> sqr -> mul -> add chain and run the
+// dedicated kernel directly. The pattern is validated by
+// ggml_vk_can_fuse_snake before this call.
+static void ggml_vk_snake_dispatch_fused(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) {
+    const ggml_tensor * mul0 = cgraph->nodes[node_idx + 0];
+    const ggml_tensor * sqr  = cgraph->nodes[node_idx + 2];
+    const ggml_tensor * mul1 = cgraph->nodes[node_idx + 3];
+    ggml_tensor *       add  = cgraph->nodes[node_idx + 4];
+
+    // x carries the full activation shape, a is the broadcast operand
+    const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1];
+    const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0];
+
+    // mul1 reads sqr and inv_b in either operand order
+    const ggml_tensor * inv_b = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0];
+
+    vk_pipeline pipeline = nullptr;
+    switch (x->type) {
+        case GGML_TYPE_F32:  pipeline = ctx->device->pipeline_snake_f32;  break;
+        case GGML_TYPE_F16:  pipeline = ctx->device->pipeline_snake_f16;  break;
+        case GGML_TYPE_BF16: pipeline = ctx->device->pipeline_snake_bf16; break;
+        default:             GGML_ABORT("unsupported type");
+    }
+    ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
+
+    vk_subbuffer x_buf     = ggml_vk_tensor_subbuffer(ctx, x);
+    vk_subbuffer a_buf     = ggml_vk_tensor_subbuffer(ctx, a);
+    vk_subbuffer inv_b_buf = ggml_vk_tensor_subbuffer(ctx, inv_b);
+    vk_subbuffer dst_buf   = ggml_vk_tensor_subbuffer(ctx, add);
+
+    vk_op_snake_push_constants pc{};
+    pc.ne0 = static_cast<uint32_t>(x->ne[0]);
+    pc.ne1 = static_cast<uint32_t>(x->ne[1]);
+
+    std::array<uint32_t, 3> elements = { pc.ne0, pc.ne1, 1 };
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { x_buf, a_buf, inv_b_buf, dst_buf }, pc, elements);
+}
+
 static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
     uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
     const int32_t k1 = dst->op_params[1];
@@ -13318,7 +13375,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
 
         break;
     case GGML_OP_MUL:
-        ggml_vk_mul(ctx, compute_ctx, src0, src1, node);
+        if (ctx->num_additional_fused_ops) {
+            ggml_vk_snake_dispatch_fused(ctx, compute_ctx, cgraph, node_idx);
+        } else {
+            ggml_vk_mul(ctx, compute_ctx, src0, src1, node);
+        }
 
         break;
     case GGML_OP_DIV:
@@ -14691,6 +14752,65 @@ static bool ggml_vk_can_fuse_rope_set_rows(ggml_backend_vk_context * ctx, const
     return true;
 }
 
+// Pattern check for the 5-op Snake fusion: mul -> sin -> sqr -> mul -> add.
+// Verifies the chain shape, the closure x_in_add == x_in_mul0, and that
+// the broadcast operands a and inv_b share a [1, C] layout.
+static bool ggml_vk_can_fuse_snake(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx) {
+    GGML_UNUSED(ctx);
+    if (!ggml_can_fuse(cgraph, node_idx, snake_pattern)) {
+        return false;
+    }
+
+    const ggml_tensor * mul0     = cgraph->nodes[node_idx + 0];
+    const ggml_tensor * sin_node = cgraph->nodes[node_idx + 1];
+    const ggml_tensor * sqr      = cgraph->nodes[node_idx + 2];
+    const ggml_tensor * mul1     = cgraph->nodes[node_idx + 3];
+    const ggml_tensor * add      = cgraph->nodes[node_idx + 4];
+
+    const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1];
+    const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0];
+
+    const ggml_tensor * inv_b    = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0];
+    const ggml_tensor * x_in_add = (add->src[0] == mul1) ? add->src[1] : add->src[0];
+
+    if (x_in_add != x) {
+        return false;
+    }
+    if (x->type != GGML_TYPE_F32 && x->type != GGML_TYPE_F16 && x->type != GGML_TYPE_BF16) {
+        return false;
+    }
+    // Shader bindings: data_a is A_TYPE so it follows x's precision, while
+    // data_b and data_c are hardcoded float, so the broadcast operands must
+    // be F32 regardless of x's type.
+    if (a->type     != GGML_TYPE_F32) return false;
+    if (inv_b->type != GGML_TYPE_F32) return false;
+    // Chain intermediates and output share x's precision (single A_TYPE / D_TYPE pipeline).
+    if (mul0->type     != x->type) return false;
+    if (sin_node->type != x->type) return false;
+    if (sqr->type      != x->type) return false;
+    if (mul1->type     != x->type) return false;
+    if (add->type      != x->type) return false;
+    if (!ggml_are_same_shape(a, inv_b)) {
+        return false;
+    }
+    if (a->ne[0] != 1 || a->ne[1] != x->ne[1]) {
+        return false;
+    }
+    // Dispatch is 2D over (ne0, ne1), so x and add must be 2D and a / inv_b
+    // must collapse to [1, C, 1, 1]. Higher dims are not handled by the shader.
+    if (x->ne[2]     != 1 || x->ne[3]     != 1) return false;
+    if (add->ne[2]   != 1 || add->ne[3]   != 1) return false;
+    if (a->ne[2]     != 1 || a->ne[3]     != 1) return false;
+    if (inv_b->ne[2] != 1 || inv_b->ne[3] != 1) return false;
+    // Shader uses idx = i0 + i1 * ne0 and reads data_b[i1] / data_c[i1],
+    // so every operand must be contiguous.
+    if (!ggml_is_contiguous(x) || !ggml_is_contiguous(add) ||
+        !ggml_is_contiguous(a) || !ggml_is_contiguous(inv_b)) {
+        return false;
+    }
+    return true;
+}
+
 // Check whether the tensors overlap in memory.
 // Fusions can potentially overwrite src tensors in ways that are not prevented
 // by ggml-alloc. If the fusion src is being applied in a way that's elementwise
@@ -14998,6 +15118,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
                 op_srcs_fused_elementwise[0] = false;
                 op_srcs_fused_elementwise[1] = false;
                 op_srcs_fused_elementwise[2] = false;
+            } else if (ggml_vk_can_fuse_snake(ctx, cgraph, i)) {
+                ctx->num_additional_fused_ops = 4;
+                fusion_string = "SNAKE";
+                // elementwise=true: snake.comp is safe under exact aliasing because each
+                // thread reads data_x[idx] into a register before writing data_d[idx]
+                // with a data dependency on that register. The overlap check still
+                // rejects partial overlaps (different base or size).
+                std::fill_n(op_srcs_fused_elementwise, 5, true);
             } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) &&
                        ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) &&
                        ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) {
@@ -15288,6 +15416,9 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
         if (keep_pattern(topk_moe_late_softmax)) {
             continue;
         }
+        if (keep_pattern(snake_pattern)) {
+            continue;
+        }
 
         // First, grab the next unused node.
         current_set.push_back(first_unused);
@@ -15310,7 +15441,8 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph *
             if (match_pattern(topk_moe_early_softmax_norm, j) ||
                 match_pattern(topk_moe_sigmoid_norm_bias, j) ||
                 match_pattern(topk_moe_early_softmax, j) ||
-                match_pattern(topk_moe_late_softmax, j)) {
+                match_pattern(topk_moe_late_softmax, j) ||
+                match_pattern(snake_pattern, j)) {
                 continue;
             }
             bool ok = true;
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp b/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp
new file mode 100644
index 000000000..8585538cb
--- /dev/null
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp
@@ -0,0 +1,49 @@
+#version 450
+
+#include "types.glsl"
+
+// Fused snake activation: y = x + sin(b * x)^2 * c
+//   data_a [ne0, ne1]   per element activation x    (A_TYPE)
+//   data_b [1,   ne1]   per channel multiplier      (float)
+//   data_c [1,   ne1]   per channel inverse scale   (float, precomputed as 1 / freq)
+//   data_d [ne0, ne1]   output y                    (D_TYPE)
+layout (binding = 0) readonly  buffer A {A_TYPE data_a[];};
+layout (binding = 1) readonly  buffer B {float  data_b[];};
+layout (binding = 2) readonly  buffer C {float  data_c[];};
+layout (binding = 3) writeonly buffer D {D_TYPE data_d[];};
+
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
+
+layout (push_constant) uniform parameter {
+    uint32_t ne0;
+    uint32_t ne1;
+} p;
+
+// Load A_TYPE to float
+float load_val(uint32_t idx) {
+#if defined(DATA_A_BF16)
+    return bf16_to_fp32(uint32_t(data_a[idx]));
+#else
+    return float(data_a[idx]);
+#endif
+}
+
+// Store float as D_TYPE
+void store_val(uint32_t idx, float v) {
+#if defined(DATA_D_BF16)
+    data_d[idx] = D_TYPE(fp32_to_bf16(v));
+#else
+    data_d[idx] = D_TYPE(v);
+#endif
+}
+
+void main() {
+    const uint32_t i0 = gl_GlobalInvocationID.x;
+    const uint32_t i1 = gl_GlobalInvocationID.y;
+    if (i0 >= p.ne0 || i1 >= p.ne1) return;
+
+    const uint32_t idx = i0 + i1 * p.ne0;
+    const float xi = load_val(idx);
+    const float s  = sin(data_b[i1] * xi);
+    store_val(idx, xi + s * s * data_c[i1]);
+}
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
index e3a9d61a5..a1d735150 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -952,6 +952,10 @@ void process_shaders() {
 
     string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"},  {"B_TYPE", "float"}, {"D_TYPE", "float"}});
 
+    string_to_spv("snake_f32",  "snake.comp", {{"DATA_A_F32", "1"},  {"A_TYPE", "float"},     {"D_TYPE", "float"}});
+    string_to_spv("snake_f16",  "snake.comp", {{"DATA_A_F16", "1"},  {"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("snake_bf16", "snake.comp", {{"DATA_A_BF16", "1"}, {"DATA_D_BF16", "1"}, {"A_TYPE", "uint16_t"},  {"D_TYPE", "uint16_t"}});
+
     string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
 
     string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}}));

From ee7c30578a3bf3d8244cefe83b3f44c15df7768c Mon Sep 17 00:00:00 2001
From: Reese Levine <reeselevine1@gmail.com>
Date: Thu, 21 May 2026 11:00:27 -0700
Subject: [PATCH 08/12] Update WebGPU support and add link to blog/demo
 (#23483)

---
 README.md     | 3 ++-
 docs/build.md | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 71327e514..dbe2c363a 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,7 @@ LLM inference in C/C++
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
 - Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
 - Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
+- WebGPU support is now available in the browser, see a blog/demo introducing it [here](https://reeselevine.github.io/llamas-on-the-web/).
 
 ----
 
@@ -290,7 +291,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [CANN](docs/build.md#cann) | Ascend NPU |
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
 | [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE |
-| [WebGPU [In Progress]](docs/build.md#webgpu) | All |
+| [WebGPU](docs/build.md#webgpu) | All |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |
 | [Hexagon [In Progress]](docs/backend/snapdragon/README.md) | Snapdragon |
 | [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR |
diff --git a/docs/build.md b/docs/build.md
index a18479b33..7beafbf5f 100644
--- a/docs/build.md
+++ b/docs/build.md
@@ -735,7 +735,7 @@ ninja
 
 To read documentation for how to build on Android, [click here](./android.md)
 
-## WebGPU [In Progress]
+## WebGPU
 
 The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `18eb229`.
 

From bb28c1fe246b72276ee1d00ce89306be7b865766 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 21 May 2026 21:13:59 +0300
Subject: [PATCH 09/12] cmake : remove STATIC from impl libraries, enable
 LLAMA_BUILD_APP by default (#23462)

* cmake : remove STATIC from impl libraries, allow BUILD_SHARED_LIBS control

Remove explicit STATIC from all -impl libraries (server, cli, completion, bench,
batched-bench, fit-params, quantize, perplexity) so BUILD_SHARED_LIBS controls
shared vs static linkage.

Add WINDOWS_EXPORT_ALL_SYMBOLS ON for proper DLL export on Windows.

Assisted-by: llama.cpp:local pi

* cmake : enable LLAMA_BUILD_APP by default

Assisted-by: llama.cpp:local pi

* ci : disable app in build-cmake-pkg.yml
---
 .github/workflows/build-cmake-pkg.yml | 2 +-
 CMakeLists.txt                        | 2 +-
 tools/batched-bench/CMakeLists.txt    | 3 ++-
 tools/cli/CMakeLists.txt              | 3 ++-
 tools/completion/CMakeLists.txt       | 3 ++-
 tools/fit-params/CMakeLists.txt       | 3 ++-
 tools/llama-bench/CMakeLists.txt      | 3 ++-
 tools/perplexity/CMakeLists.txt       | 3 ++-
 tools/quantize/CMakeLists.txt         | 3 ++-
 tools/server/CMakeLists.txt           | 3 ++-
 10 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/build-cmake-pkg.yml b/.github/workflows/build-cmake-pkg.yml
index 84cf8ddf4..6bbfd9988 100644
--- a/.github/workflows/build-cmake-pkg.yml
+++ b/.github/workflows/build-cmake-pkg.yml
@@ -21,7 +21,7 @@ jobs:
           PREFIX="$(pwd)"/inst
           cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
                 -DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
-                -DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release
+                -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_APP=OFF -DCMAKE_BUILD_TYPE=Release
           cmake --build build --config Release
           cmake --install build --prefix "$PREFIX" --config Release
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7ed6432b2..56eb608ea 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -108,7 +108,7 @@ option(LLAMA_BUILD_TESTS     "llama: build tests"
 option(LLAMA_BUILD_TOOLS     "llama: build tools"                                                                ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES  "llama: build examples"                                                             ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER    "llama: build server example"                                                       ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_APP       "llama: build the unified binary"                                                   OFF)
+option(LLAMA_BUILD_APP       "llama: build the unified binary"                                                   ON)
 option(LLAMA_BUILD_UI        "llama: build the embedded Web UI for server"                                       ON)
 option(LLAMA_USE_PREBUILT_UI "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" ON)
 
diff --git a/tools/batched-bench/CMakeLists.txt b/tools/batched-bench/CMakeLists.txt
index 1769c2136..42b50972f 100644
--- a/tools/batched-bench/CMakeLists.txt
+++ b/tools/batched-bench/CMakeLists.txt
@@ -2,7 +2,8 @@
 
 set(TARGET llama-batched-bench-impl)
 
-add_library(${TARGET} STATIC batched-bench.cpp)
+add_library(${TARGET} batched-bench.cpp)
+set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 
 target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/tools/cli/CMakeLists.txt b/tools/cli/CMakeLists.txt
index 32b660508..aa44e586c 100644
--- a/tools/cli/CMakeLists.txt
+++ b/tools/cli/CMakeLists.txt
@@ -2,7 +2,8 @@
 
 set(TARGET llama-cli-impl)
 
-add_library(${TARGET} STATIC cli.cpp)
+add_library(${TARGET} cli.cpp)
+set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 
 target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ../server)
 target_link_libraries(${TARGET} PUBLIC server-context llama-common ${CMAKE_THREAD_LIBS_INIT})
diff --git a/tools/completion/CMakeLists.txt b/tools/completion/CMakeLists.txt
index 687bdfda6..a485bf0a3 100644
--- a/tools/completion/CMakeLists.txt
+++ b/tools/completion/CMakeLists.txt
@@ -2,7 +2,8 @@
 
 set(TARGET llama-completion-impl)
 
-add_library(${TARGET} STATIC completion.cpp)
+add_library(${TARGET} completion.cpp)
+set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 
 target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/tools/fit-params/CMakeLists.txt b/tools/fit-params/CMakeLists.txt
index 207caf2ce..799c2d747 100644
--- a/tools/fit-params/CMakeLists.txt
+++ b/tools/fit-params/CMakeLists.txt
@@ -2,7 +2,8 @@
 
 set(TARGET llama-fit-params-impl)
 
-add_library(${TARGET} STATIC fit-params.cpp)
+add_library(${TARGET} fit-params.cpp)
+set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 
 target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/tools/llama-bench/CMakeLists.txt b/tools/llama-bench/CMakeLists.txt
index ed419a740..2b71faa5f 100644
--- a/tools/llama-bench/CMakeLists.txt
+++ b/tools/llama-bench/CMakeLists.txt
@@ -2,7 +2,8 @@
 
 set(TARGET llama-bench-impl)
 
-add_library(${TARGET} STATIC llama-bench.cpp)
+add_library(${TARGET} llama-bench.cpp)
+set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 
 target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/tools/perplexity/CMakeLists.txt b/tools/perplexity/CMakeLists.txt
index 44061d0a5..b03d61a41 100644
--- a/tools/perplexity/CMakeLists.txt
+++ b/tools/perplexity/CMakeLists.txt
@@ -2,7 +2,8 @@
 
 set(TARGET llama-perplexity-impl)
 
-add_library(${TARGET} STATIC perplexity.cpp)
+add_library(${TARGET} perplexity.cpp)
+set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 
 target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt
index e76f7d811..5ef4e4e8a 100644
--- a/tools/quantize/CMakeLists.txt
+++ b/tools/quantize/CMakeLists.txt
@@ -2,7 +2,8 @@
 
 set(TARGET llama-quantize-impl)
 
-add_library(${TARGET} STATIC quantize.cpp)
+add_library(${TARGET} quantize.cpp)
+set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 
 target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt
index bf75b199b..d87d1a5a5 100644
--- a/tools/server/CMakeLists.txt
+++ b/tools/server/CMakeLists.txt
@@ -31,13 +31,14 @@ target_link_libraries(${TARGET} PUBLIC llama-common mtmd ${CMAKE_THREAD_LIBS_INI
 
 set(TARGET llama-server-impl)
 
-add_library(${TARGET} STATIC
+add_library(${TARGET}
     server.cpp
     server-http.cpp
     server-http.h
     server-models.cpp
     server-models.h
 )
+set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 
 target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_include_directories(${TARGET} PRIVATE ../mtmd ${CMAKE_SOURCE_DIR})

From 4f0e43da6f8f6e9390d88409610098ec2d2dc5c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Thu, 21 May 2026 23:35:29 +0200
Subject: [PATCH 10/12] CUDA: fix PDL CC check for JIT compilation (#23471)

---
 ggml/src/ggml-cuda/common.cuh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 9c73fe7e6..e54ecb293 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1561,7 +1561,8 @@ static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_ke
         return env == nullptr || std::atoi(env) != 0;
     }();
 
-    if (env_pdl_enabled && ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= GGML_CUDA_CC_HOPPER) {
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    if (env_pdl_enabled && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_HOPPER) {
         auto pdl_cfg = ggml_cuda_pdl_config(launch_params);
 
         CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, kernel, std::forward<Args>(args)... ));

From bbce619adb409880fb6db850a1c5a5f36a4dc7b1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 22 May 2026 11:46:26 +0300
Subject: [PATCH 11/12] cmake : add install() for impl libraries + fix apple
 builds (#23511)

* pi : update

* ci : fix ios build

* ci : fix andoroid

* ci : fix apple builds

* cmake : add install() for impl libraries

Add install(TARGETS <target> LIBRARY) for all -impl libraries that were
changed from STATIC to shared (controlled by BUILD_SHARED_LIBS) in
commit bb28c1fe2. Without this, cmake --install fails to copy the shared
libraries, causing runtime errors like:

  llama-server: error while loading shared libraries: libllama-server-impl.so

Ref: https://github.com/ggml-org/llama.cpp/issues/23494#issuecomment-4512912515

Assisted-by: llama.cpp:local pi

* ci : fix xcframework build
---
 .github/workflows/build-apple.yml           |  5 +++++
 .github/workflows/build-cmake-pkg.yml       | 11 ++++++++---
 .github/workflows/release.yml               |  1 +
 .pi/gg/SYSTEM.md                            |  5 +++--
 build-xcframework.sh                        |  2 ++
 examples/llama.android/lib/build.gradle.kts |  1 +
 tools/batched-bench/CMakeLists.txt          |  4 ++++
 tools/cli/CMakeLists.txt                    |  4 ++++
 tools/completion/CMakeLists.txt             |  4 ++++
 tools/fit-params/CMakeLists.txt             |  4 ++++
 tools/llama-bench/CMakeLists.txt            |  4 ++++
 tools/perplexity/CMakeLists.txt             |  4 ++++
 tools/quantize/CMakeLists.txt               |  4 ++++
 tools/server/CMakeLists.txt                 |  4 ++++
 14 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/build-apple.yml b/.github/workflows/build-apple.yml
index b99e61466..a5b7ef56e 100644
--- a/.github/workflows/build-apple.yml
+++ b/.github/workflows/build-apple.yml
@@ -59,6 +59,7 @@ jobs:
           cmake -B build -G Xcode \
             -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_BUILD_APP=OFF \
             -DLLAMA_BUILD_COMMON=OFF \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TOOLS=OFF \
@@ -89,6 +90,7 @@ jobs:
             -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_OPENSSL=OFF \
+            -DLLAMA_BUILD_APP=OFF \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TOOLS=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
@@ -138,6 +140,7 @@ jobs:
             -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_BUILD_COMMON=OFF \
+            -DLLAMA_BUILD_APP=OFF \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TOOLS=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
@@ -163,6 +166,7 @@ jobs:
             -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_BUILD_COMMON=OFF \
+            -DLLAMA_BUILD_APP=OFF \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TOOLS=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
@@ -206,6 +210,7 @@ jobs:
             -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_OPENSSL=OFF \
+            -DLLAMA_BUILD_APP=OFF \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TOOLS=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
diff --git a/.github/workflows/build-cmake-pkg.yml b/.github/workflows/build-cmake-pkg.yml
index 6bbfd9988..b36ac5b8e 100644
--- a/.github/workflows/build-cmake-pkg.yml
+++ b/.github/workflows/build-cmake-pkg.yml
@@ -19,9 +19,14 @@ jobs:
       - name: Build
         run: |
           PREFIX="$(pwd)"/inst
-          cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \
-                -DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \
-                -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_APP=OFF -DCMAKE_BUILD_TYPE=Release
+          cmake -S . -B build \
+                -DCMAKE_PREFIX_PATH="$PREFIX" \
+                -DLLAMA_OPENSSL=OFF \
+                -DLLAMA_BUILD_TESTS=OFF \
+                -DLLAMA_BUILD_TOOLS=OFF \
+                -DLLAMA_BUILD_EXAMPLES=OFF \
+                -DLLAMA_BUILD_APP=OFF \
+                -DCMAKE_BUILD_TYPE=Release
           cmake --build build --config Release
           cmake --install build --prefix "$PREFIX" --config Release
 
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 1880c155c..e7cbac35f 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -1108,6 +1108,7 @@ jobs:
             -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_OPENSSL=OFF \
+            -DLLAMA_BUILD_APP=OFF \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TOOLS=OFF \
             -DLLAMA_BUILD_TESTS=OFF \
diff --git a/.pi/gg/SYSTEM.md b/.pi/gg/SYSTEM.md
index b7597a4c3..06d97ae78 100644
--- a/.pi/gg/SYSTEM.md
+++ b/.pi/gg/SYSTEM.md
@@ -1,7 +1,7 @@
 You are a coding agent. Here are some very important rules that you must follow:
 
 General:
-- By very precise and concise when writing code, comments, explanations, etc.
+- Be very precise and concise when writing code, comments, explanations, etc.
 - PR and commit titles format: `<module> : <title>`. Lookup recents for examples
 - Don't try to build or run the code unless you are explicitly asked to do so
 - Use the `gh` CLI tool when querying PRs, issues, or other GitHub resources
@@ -16,7 +16,8 @@ Pull requests (PRs):
 - New branch names are prefixed with "gg/"
 - Before opening a pull request, ask the user to confirm the description
 - When creating a pull request, look for the repository's PR template and follow it
-- For the AI usage disclosure section, write "YES. llama.cpp + pi"
+- For the AI usage disclosure section, write "YES. llama.cpp + pi + [MODEL]"
+- Ask the user to tell you what model was used and write it in place of [MODEL]
 - Always create the pull requests in draft mode
 
 Commits:
diff --git a/build-xcframework.sh b/build-xcframework.sh
index c25a1ef28..d287d72fb 100755
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -7,6 +7,7 @@ VISIONOS_MIN_OS_VERSION=1.0
 TVOS_MIN_OS_VERSION=16.4
 
 BUILD_SHARED_LIBS=OFF
+LLAMA_BUILD_APP=OFF
 LLAMA_BUILD_EXAMPLES=OFF
 LLAMA_BUILD_TOOLS=OFF
 LLAMA_BUILD_TESTS=OFF
@@ -31,6 +32,7 @@ COMMON_CMAKE_ARGS=(
     -DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO
     -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
     -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}
+    -DLLAMA_BUILD_APP=${LLAMA_BUILD_APP}
     -DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES}
     -DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS}
     -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS}
diff --git a/examples/llama.android/lib/build.gradle.kts b/examples/llama.android/lib/build.gradle.kts
index 9b290d6d4..ae95f41a8 100644
--- a/examples/llama.android/lib/build.gradle.kts
+++ b/examples/llama.android/lib/build.gradle.kts
@@ -25,6 +25,7 @@ android {
                 arguments += "-DCMAKE_VERBOSE_MAKEFILE=ON"
 
                 arguments += "-DBUILD_SHARED_LIBS=ON"
+                arguments += "-DLLAMA_BUILD_APP=OFF"
                 arguments += "-DLLAMA_BUILD_COMMON=ON"
                 arguments += "-DLLAMA_OPENSSL=OFF"
 
diff --git a/tools/batched-bench/CMakeLists.txt b/tools/batched-bench/CMakeLists.txt
index 42b50972f..f6ed257f5 100644
--- a/tools/batched-bench/CMakeLists.txt
+++ b/tools/batched-bench/CMakeLists.txt
@@ -8,6 +8,10 @@ set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} LIBRARY)
+endif()
+
 # llama-batched-bench executable
 
 set(TARGET llama-batched-bench)
diff --git a/tools/cli/CMakeLists.txt b/tools/cli/CMakeLists.txt
index aa44e586c..a3e635719 100644
--- a/tools/cli/CMakeLists.txt
+++ b/tools/cli/CMakeLists.txt
@@ -8,6 +8,10 @@ set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ../server)
 target_link_libraries(${TARGET} PUBLIC server-context llama-common ${CMAKE_THREAD_LIBS_INIT})
 
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} LIBRARY)
+endif()
+
 # llama-cli executable
 
 set(TARGET llama-cli)
diff --git a/tools/completion/CMakeLists.txt b/tools/completion/CMakeLists.txt
index a485bf0a3..a310251ef 100644
--- a/tools/completion/CMakeLists.txt
+++ b/tools/completion/CMakeLists.txt
@@ -8,6 +8,10 @@ set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} LIBRARY)
+endif()
+
 # llama-completion executable
 
 set(TARGET llama-completion)
diff --git a/tools/fit-params/CMakeLists.txt b/tools/fit-params/CMakeLists.txt
index 799c2d747..8acdaef37 100644
--- a/tools/fit-params/CMakeLists.txt
+++ b/tools/fit-params/CMakeLists.txt
@@ -8,6 +8,10 @@ set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} LIBRARY)
+endif()
+
 # llama-fit-params executable
 
 set(TARGET llama-fit-params)
diff --git a/tools/llama-bench/CMakeLists.txt b/tools/llama-bench/CMakeLists.txt
index 2b71faa5f..b1c35ee88 100644
--- a/tools/llama-bench/CMakeLists.txt
+++ b/tools/llama-bench/CMakeLists.txt
@@ -8,6 +8,10 @@ set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} LIBRARY)
+endif()
+
 # llama-bench executable
 
 set(TARGET llama-bench)
diff --git a/tools/perplexity/CMakeLists.txt b/tools/perplexity/CMakeLists.txt
index b03d61a41..0eee9acd4 100644
--- a/tools/perplexity/CMakeLists.txt
+++ b/tools/perplexity/CMakeLists.txt
@@ -8,6 +8,10 @@ set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} LIBRARY)
+endif()
+
 # llama-perplexity executable
 
 set(TARGET llama-perplexity)
diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt
index 5ef4e4e8a..eead4c859 100644
--- a/tools/quantize/CMakeLists.txt
+++ b/tools/quantize/CMakeLists.txt
@@ -8,6 +8,10 @@ set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT})
 
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} LIBRARY)
+endif()
+
 # llama-quantize executable
 
 set(TARGET llama-quantize)
diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt
index d87d1a5a5..7d427431d 100644
--- a/tools/server/CMakeLists.txt
+++ b/tools/server/CMakeLists.txt
@@ -44,6 +44,10 @@ target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 target_include_directories(${TARGET} PRIVATE ../mtmd ${CMAKE_SOURCE_DIR})
 target_link_libraries(${TARGET} PUBLIC server-context llama-ui cpp-httplib ${CMAKE_THREAD_LIBS_INIT})
 
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} LIBRARY)
+endif()
+
 # llama-server executable
 
 set(TARGET llama-server)

From afcda09d154a285cd366135f98ffc1d357f7ddbd Mon Sep 17 00:00:00 2001
From: Kashif Rasul <kashif.rasul@gmail.com>
Date: Fri, 22 May 2026 11:17:31 +0200
Subject: [PATCH 12/12] vocab : fix HybridDNA tokenizer (#23466)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* vocab : mark hybriddna k-mers to avoid BPE token collisions

* improved loop

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
---
 conversion/base.py  |  5 +++++
 src/llama-vocab.cpp | 50 +++++++++++++++++++++++++++------------------
 2 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/conversion/base.py b/conversion/base.py
index 8e12af6c5..d8f050ed3 100644
--- a/conversion/base.py
+++ b/conversion/base.py
@@ -1617,6 +1617,11 @@ class TextModel(ModelBase):
         assert max(tokenizer.vocab.values()) < vocab_size  # ty: ignore[unresolved-attribute]
 
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}  # ty: ignore[unresolved-attribute]
+        # k-mers can share text with a base-vocab BPE token (e.g. CCCCCC) and get
+        # dropped by get_vocab(); a reserved marker suffix (U+E000) keeps each
+        # k-mer's own id (llama.cpp strips it on detokenization)
+        for kmer in tokenizer.kmers:  # ty: ignore[unresolved-attribute]
+            reverse_vocab[tokenizer.dna_token_to_id[kmer]] = kmer + "\ue000"  # ty: ignore[unresolved-attribute]
         added_vocab = tokenizer.get_added_vocab()  # ty: ignore[unresolved-attribute]
         added_tokens_decoder = tokenizer.added_tokens_decoder  # ty: ignore[unresolved-attribute]
 
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index acf832d05..a5cf148b2 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1581,6 +1581,11 @@ private:
     const llm_tokenizer_plamo2 & tokenizer;
 };
 
+// reserved suffix (U+E000) that keeps DNA k-mers distinct from identical
+// base-vocab BPE tokens (e.g. CCCCCC) in token_to_id; erased from id_to_token
+// text at load
+static const std::string dna_kmer_marker = "\xee\x80\x80";
+
 struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session {
     llm_tokenizer_hybriddna_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {}
 
@@ -1636,34 +1641,22 @@ private:
                 c = char(c - 32);
             }
         }
-        auto is_valid_kmer = [](const std::string & s) {
-            for (char c : s) {
-                if (c != 'A' && c != 'C' && c != 'G' && c != 'T') {
-                    return false;
-                }
-            }
-            return true;
+
+        // k-mers carry the reserved marker suffix; a non-ACGT k-mer simply
+        // isn't in the vocab and falls back to <oov>
+        auto kmer_token = [&](const std::string & kmer) {
+            const auto tok = vocab.text_to_token(kmer + dna_kmer_marker);
+            return tok != LLAMA_TOKEN_NULL ? tok : oov_id;
         };
 
         size_t i = 0;
         for (; i + k <= seq.size(); i += k) {
-            const std::string kmer = seq.substr(i, k);
-            if (is_valid_kmer(kmer)) {
-                const auto tok = vocab.text_to_token(kmer);
-                output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id);
-            } else {
-                output.push_back(oov_id);
-            }
+            output.push_back(kmer_token(seq.substr(i, k)));
         }
         if (i < seq.size()) {
             std::string kmer = seq.substr(i);
             kmer.append(k - kmer.size(), 'A');
-            if (is_valid_kmer(kmer)) {
-                const auto tok = vocab.text_to_token(kmer);
-                output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id);
-            } else {
-                output.push_back(oov_id);
-            }
+            output.push_back(kmer_token(kmer));
         }
     }
 
@@ -2357,6 +2350,23 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
     }
     GGML_ASSERT(id_to_token.size() == token_to_id.size());
 
+    // hybriddna: the marker suffix kept k-mer ids distinct in token_to_id; erase
+    // it from id_to_token so the k-mers detokenize to the bare DNA sequence. The
+    // k-mers are the block right after <oov>, so only scan from there.
+    if (tokenizer_model == "hybriddna") {
+        const auto idx = token_to_id.find("<oov>");
+        if (idx != token_to_id.end()) {
+            auto it = id_to_token.begin() + idx->second + 1;
+            for (; it != id_to_token.end(); ++it) {
+                std::string & text = it->text;
+                if (text.size() > dna_kmer_marker.size()
+                        && text.compare(text.size() - dna_kmer_marker.size(), dna_kmer_marker.size(), dna_kmer_marker) == 0) {
+                    text.erase(text.size() - dna_kmer_marker.size());
+                }
+            }
+        }
+    }
+
     init_tokenizer(type);
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'