From c9021714e843dd02670f18bd826be076bc58502c Mon Sep 17 00:00:00 2001 From: Pascal Date: Thu, 21 May 2026 10:09:19 +0200 Subject: [PATCH 01/12] server: re-inject subcommand when router spawns children under unified binary (#23442) --- app/llama.cpp | 9 +++++++++ tools/server/server-models.cpp | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/app/llama.cpp b/app/llama.cpp index e149975d2..b0b86fd47 100644 --- a/app/llama.cpp +++ b/app/llama.cpp @@ -1,6 +1,7 @@ #include "build-info.h" #include +#include #include #include @@ -77,6 +78,14 @@ int main(int argc, char ** argv) { for (const auto & cmd : cmds) { if (matches(arg, cmd)) { + + // router spawns children through this same binary, it needs the + // subcommand to relaunch as 'llama serve' and not bare options +#ifdef _WIN32 + _putenv_s("LLAMA_APP_CMD", cmd.name); +#else + setenv("LLAMA_APP_CMD", cmd.name, 1); +#endif return cmd.func(argc - 1, argv + 1); } } diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index ccf42320f..47b6c2a4e 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -159,6 +160,13 @@ void server_model_meta::update_args(common_preset_context & ctx_preset, std::str // TODO: maybe validate preset before rendering ? // render args args = preset.to_args(bin_path); + + // unified binary dispatches by subcommand, re-inject it right after the + // binary path so the child starts as 'llama serve ...' not 'llama ...' + const char * app_cmd = std::getenv("LLAMA_APP_CMD"); + if (app_cmd != nullptr && app_cmd[0] != '\0' && !bin_path.empty()) { + args.insert(args.begin() + 1, app_cmd); + } } void server_model_meta::update_caps() { From 52fb93a2bd6b12673b9f4f225e61968e70443b11 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 21 May 2026 16:11:11 +0800 Subject: [PATCH 02/12] server : free draft/MTP resources on sleep to fix VRAM leak (#23461) The destroy() function in server_context_impl only cleaned up the main model and context (via llama_init.reset()) but did not free the speculative decoder (spec), draft context (ctx_dft), or draft model (model_dft). For MTP (Multi-Token Prediction) models, ctx_dft holds GPU-allocated resources (KV cache, compute buffers) that are not freed when entering the sleeping state. On each sleep/resume cycle, new resources are allocated without the old ones being freed, leading to a VRAM leak that eventually crashes the server with out-of-memory errors. Fix by explicitly resetting spec, ctx_dft, and model_dft in destroy() before resetting llama_init, ensuring proper cleanup order to avoid use-after-free. ref: https://github.com/ggml-org/llama.cpp/issues/23395 Assisted-by: llama.cpp:local pi --- tools/server/server-context.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index f51731026..80d77b0c0 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -701,6 +701,10 @@ private: bool sleeping = false; void destroy() { + spec.reset(); + ctx_dft.reset(); + model_dft.reset(); + llama_init.reset(); ctx_tgt = nullptr; From a1a69f777a14bb8584ba0eb53505cd5ee888bd5e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 21 May 2026 13:34:08 +0300 Subject: [PATCH 03/12] metal : optimize concat kernel and fix set kernel threads (#23411) * metal : fix GGML_OP_SET kernel threads * tests : extend test_cpy to support different src/dst shapes Extend test_cpy to support different source and destination tensor shapes for CPY operations (reshaping), where the total number of elements must match. - Renamed ne -> ne_src, added ne_dst parameter (default: use src shape) - Added 50 new reshaping test cases covering 1D<->2D<->3D<->4D conversions - Tests exercise 1024 boundary, small shapes, and large dimensionality changes - Fixed dangling reference bug (storing & to temporary std::array) - Updated all existing test calls with permute/transpose args for compatibility Assisted-by: llama.cpp:local pi * metal : optimize concat kernel with row batching for small widths When ne0 < 256, batch multiple rows into a single threadgroup to improve occupancy. This avoids underutilizing the GPU when processing narrow tensors. - Dispatch nth = min(256, ne0) threads per group - Calculate nrptg (rows per threadgroup) to fill up to 256 threads - Update kernel index calculation to handle the row batching - Add boundary check for i1 >= ne1 Assisted-by: llama.cpp:local pi * tests : clean-up * tests : refactor CPY shape tests to use dimension permutations Replace 75 hardcoded test cases with a loop over permutations of {3, 5, 7, 32} (total elements: 3360). Each src permutation is tested against canonical sorted and reverse dst, skipping identical shapes. Covers F32, F16, and Q4_0 (when both src and dst ne0 == 32). Assisted-by: llama.cpp:local pi --- ggml/src/ggml-metal/ggml-metal-ops.cpp | 19 +++- ggml/src/ggml-metal/ggml-metal.metal | 6 +- tests/test-backend-ops.cpp | 130 ++++++++++++++++++------- 3 files changed, 113 insertions(+), 42 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index 8506000b6..206af227a 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -564,9 +564,20 @@ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) { ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2); ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3); - const int nth = std::min(1024, ne0); + int nth = std::min(256, ne0); - ggml_metal_encoder_dispatch_threadgroups(enc, ne1, ne2, ne3, nth, 1, 1); + // when rows are small, we can batch them together in a single threadgroup + int nrptg = 1; + if (nth < 256) { + nrptg = std::min((256 + nth - 1) / nth, ne1); + if (nrptg * nth > 256) { + nrptg = 256 / nth; + } + } + + const int nw0 = (ne1 + nrptg - 1) / nrptg; + + ggml_metal_encoder_dispatch_threadgroups(enc, nw0, ne2, ne3, nth, nrptg, 1); return 1; } @@ -1786,7 +1797,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) { nk0 = ne10/ggml_blck_size(op->type); } - int nth = std::min(nk0, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); + int nth = std::min(nk0*ne11, 256); // when rows are small, we can batch them together in a single threadgroup int nrptg = 1; @@ -1797,7 +1808,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) { nrptg = (nth + nk0 - 1)/nk0; nth = nk0; - if (nrptg*nth > ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) { + if (nrptg*nth > 256) { nrptg--; } } diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 4cf9dbea9..e772664ba 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -7486,7 +7486,11 @@ kernel void kernel_concat( const int i3 = tgpig.z; const int i2 = tgpig.y; - const int i1 = tgpig.x; + const int i1 = ntg.y == 1 ? tgpig.x : tgpig.x*ntg.y + tpitg.y; + + if (i1 >= args.ne1) { + return; + } int o[4] = {0, 0, 0, 0}; o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03)); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 43343b679..303f5a40d 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2866,15 +2866,24 @@ struct test_set : public test_case { struct test_cpy : public test_case { const ggml_type type_src; const ggml_type type_dst; - const std::array ne; + const std::array ne_src; + const std::array ne_dst; const std::array permute_src; const std::array permute_dst; bool _src_use_permute; bool _dst_use_permute; bool _src_transpose; + bool _use_dst_shape; std::string vars() override { - return VARS_TO_STR6(type_src, type_dst, ne, permute_src, permute_dst, _src_transpose); + if (_use_dst_shape) { + return VARS_TO_STR7(type_src, type_dst, ne_src, ne_dst, permute_src, permute_dst, _src_transpose); + } + return VARS_TO_STR6(type_src, type_dst, ne_src, permute_src, permute_dst, _src_transpose); + } + + int64_t total_elements() const { + return ne_src[0] * ne_src[1] * ne_src[2] * ne_src[3]; } double max_nmse_err() override { @@ -2899,7 +2908,7 @@ struct test_cpy : public test_case { err_estimate /= 8.0f; } err_estimate *= err_estimate; - err_estimate /= (150.0f*150.0f*0.25f)*float(ne[0] * ne[1] * ne[2] * ne[3]); + err_estimate /= (150.0f*150.0f*0.25f)*float(total_elements()); return err_estimate; } return 1e-6; @@ -2910,17 +2919,19 @@ struct test_cpy : public test_case { } test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32, - std::array ne = {10, 10, 10, 1}, + std::array ne_src = {10, 10, 10, 1}, + std::array ne_dst = {-1, -1, -1, -1}, std::array permute_src = {0, 0, 0, 0}, std::array permute_dst = {0, 0, 0, 0}, bool transpose_src = false) - : type_src(type_src), type_dst(type_dst), ne(ne), permute_src(permute_src), permute_dst(permute_dst), + : type_src(type_src), type_dst(type_dst), ne_src(ne_src), ne_dst(ne_dst), permute_src(permute_src), permute_dst(permute_dst), _src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0), _dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0), - _src_transpose(transpose_src){} + _src_transpose(transpose_src), + _use_dst_shape(ne_dst[0] >= 0 && ne_dst[1] >= 0 && ne_dst[2] >= 0 && ne_dst[3] >= 0){} ggml_tensor * build_graph(ggml_context * ctx) override { - ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data()); + ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne_src.data()); ggml_set_param(src); ggml_set_name(src, "src"); @@ -2934,7 +2945,8 @@ struct test_cpy : public test_case { ggml_set_name(src, "src_transposed"); } - ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, src->ne); + std::array dst_ne = _use_dst_shape ? ne_dst : std::array{src->ne[0], src->ne[1], src->ne[2], src->ne[3]}; + ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne.data()); ggml_set_name(dst, "dst"); if (_dst_use_permute) { @@ -8040,42 +8052,72 @@ static std::vector> make_test_cases_eval() { for (int k = 1; k < 4; ++k) { test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4})); - test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {0, 3, 1, 2}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(type, type, {k*nk, 2, 3, 4}, {-1,-1,-1,-1}, {0, 3, 1, 2}, {0, 2, 1, 3})); } } for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) { for (ggml_type type_dst : all_types) { test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); - test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); // cpy by rows } } for (ggml_type type_src : all_types) { for (ggml_type type_dst : {GGML_TYPE_F32}) { test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); - test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {0, 2, 1, 3})); // cpy by rows } } for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) { - test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3})); // cpy not-contiguous } } test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}, {1, 0, 2, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_I32, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3})); test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}, {1, 0, 2, 3})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 3}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_F32, {256, 2, 3, 4}, {-1,-1,-1,-1}, {1, 0, 2, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 3, 3}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 3, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0})); + + // CPY - different src/dst shapes (reshaping via CPY) + // Use permutations of {3, 5, 7, 32}. Total elements: 3*5*7*32 = 3360. + // Each src permutation is tested against canonical sorted and reverse dst (skip self). + { + std::array dims = {3, 5, 7, 32}; + std::sort(dims.begin(), dims.end()); + std::array canonical = dims; + std::array reversed = {32, 7, 5, 3}; + for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) { + std::array cur = dims; + do { + if (cur != canonical) { + test_cases.emplace_back(new test_cpy(type, type, cur, canonical)); + } + if (cur != reversed) { + test_cases.emplace_back(new test_cpy(type, type, cur, reversed)); + } + if (cur[0] == 32 && type == GGML_TYPE_F32) { + if (canonical[0] == 32) { + test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0, cur, canonical)); + } + if (reversed[0] == 32) { + test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_Q4_0, cur, reversed)); + } + } + std::next_permutation(cur.begin(), cur.end()); + } while (cur != canonical); + } + } for (ggml_type type_dst : { GGML_TYPE_F32, GGML_TYPE_I32, GGML_TYPE_F16, GGML_TYPE_BF16 }) { for (bool use_view_slice : { true, false }) { @@ -8830,9 +8872,24 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {256, 16, 2, 3}, 1)); test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {128, 16, 2, 3}, 2)); test_cases.emplace_back(new test_acc(GGML_TYPE_F32, {256, 17, 2, 3}, {64, 16, 2, 3}, 3)); + test_cases.emplace_back(new test_pad()); test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {33, 17, 2, 1}, 4, 3, true)); // circular test_cases.emplace_back(new test_pad_ext()); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 2, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1024, 16, 1, 1}, 0, 1, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1023, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1023, 8, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1025, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {1025, 8, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2048, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2048, 4, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {2049, 1, 1, 1}, 1, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 1, 1, 1}, 100, 0, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 1, 1, 1}, 0, 100, false)); + test_cases.emplace_back(new test_pad(GGML_TYPE_F32, {100, 100, 1, 1}, 50, 50, false)); + test_cases.emplace_back(new test_pad_reflect_1d()); test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1})); test_cases.emplace_back(new test_roll()); @@ -9132,22 +9189,21 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {-1,-1,-1,-1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_Q4_0, {8192, 512, 2, 1})); test_cases.emplace_back(new test_cpy(GGML_TYPE_Q4_0, GGML_TYPE_F32, {8192, 512, 2, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {1, 0, 2, 3}, {0, 0, 0, 0})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {1, 0, 2, 3}, {0, 0, 0, 0})); - - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); - test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768*1024, 256, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); + test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {768, 1024, 256, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {12888, 256, 5, 1}, false, false, GGML_TYPE_F32, {1, 1}, 1.0f, 0.0f)); From b65bb4baaeae712975e09a92e1d37d3842ea5da7 Mon Sep 17 00:00:00 2001 From: ScrewTSW Date: Thu, 21 May 2026 13:29:13 +0200 Subject: [PATCH 04/12] server: expose prompt token counts in /slots endpoint (#23454) Add n_prompt_tokens, n_prompt_tokens_processed, and n_prompt_tokens_cache to the /slots JSON response. These fields are already tracked internally but were not exposed, making it impossible for clients to monitor prompt evaluation progress during processing. --- tools/server/server-context.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 80d77b0c0..b939e3b75 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -506,6 +506,9 @@ struct server_slot { if (ptask) { res["id_task"] = ptask->id; + res["n_prompt_tokens"] = (int32_t) prompt.tokens.size(); + res["n_prompt_tokens_processed"] = n_prompt_tokens_processed; + res["n_prompt_tokens_cache"] = n_prompt_tokens_cache; res["params"] = ptask->params.to_json(only_metrics); res["next_token"] = { { From 40d5358d3c730b81729ba81cd5c44ed596d02510 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 21 May 2026 14:41:50 +0300 Subject: [PATCH 05/12] tests : move save-load-state from examples to tests (#23336) * tests : move save-load-state from examples to tests - Move examples/save-load-state/ to tests/test-save-load-state.cpp - Remove subdirectory reference from examples/CMakeLists.txt - Add test to tests/CMakeLists.txt as a model test - Remove CODEOWNERS entry for removed example directory Assisted-by: llama.cpp:local pi * cont : update ci --- CODEOWNERS | 1 - ci/run.sh | 8 ++++---- examples/CMakeLists.txt | 1 - examples/save-load-state/CMakeLists.txt | 5 ----- tests/CMakeLists.txt | 4 ++++ .../save-load-state.cpp => tests/test-save-load-state.cpp | 0 6 files changed, 8 insertions(+), 11 deletions(-) delete mode 100644 examples/save-load-state/CMakeLists.txt rename examples/save-load-state/save-load-state.cpp => tests/test-save-load-state.cpp (100%) diff --git a/CODEOWNERS b/CODEOWNERS index f92fe98ce..4b9d90177 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -49,7 +49,6 @@ /examples/parallel/ @ggerganov /examples/passkey/ @ggerganov /examples/retrieval/ @ggerganov -/examples/save-load-state/ @ggerganov /examples/speculative-simple/ @ggerganov /examples/speculative/ @ggerganov /ggml/cmake/ @ggerganov diff --git a/ci/run.sh b/ci/run.sh index a8cbd3371..b096dc23b 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -461,10 +461,10 @@ function gg_run_qwen3_0_6b { (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -ngl 99 -c 1024 -b 512 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log - (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa off --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 10 -c 1024 -fa on --no-op-offload) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa off ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/test-save-load-state --model ${model_q4_0} -ngl 99 -c 1024 -fa on ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a29dc707c..39f802d25 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -27,7 +27,6 @@ else() add_subdirectory(parallel) add_subdirectory(passkey) add_subdirectory(retrieval) - add_subdirectory(save-load-state) add_subdirectory(simple) add_subdirectory(simple-chat) add_subdirectory(speculative) diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt deleted file mode 100644 index 78024672e..000000000 --- a/examples/save-load-state/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-save-load-state) -add_executable(${TARGET} save-load-state.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama-common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0fdbd39c9..33ae3b303 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -255,6 +255,10 @@ set_tests_properties(test-state-restore-fragmented PROPERTIES FIXTURES_REQUIRED llama_build_and_test(test-recurrent-state-rollback.cpp LABEL "model" ARGS -m "${MODEL_DEST}") set_tests_properties(test-recurrent-state-rollback PROPERTIES FIXTURES_REQUIRED test-download-model) +# Test state save/load functionality +llama_build_and_test(test-save-load-state.cpp LABEL "model" ARGS -m "${MODEL_DEST}") +set_tests_properties(test-save-load-state PROPERTIES FIXTURES_REQUIRED test-download-model) + if (NOT GGML_BACKEND_DL) # these tests use the backends directly and cannot be built with dynamic loading llama_build_and_test(test-barrier.cpp) diff --git a/examples/save-load-state/save-load-state.cpp b/tests/test-save-load-state.cpp similarity index 100% rename from examples/save-load-state/save-load-state.cpp rename to tests/test-save-load-state.cpp From 5306f4b3b54a0e261e83b1d2961a97685e898871 Mon Sep 17 00:00:00 2001 From: Chen Yuan Date: Thu, 21 May 2026 10:58:49 -0400 Subject: [PATCH 06/12] fix(flash-attn): replace f32 with kv_type and q_type (#23372) --- .../wgsl-shaders/flash_attn_tile.wgsl | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl index ae8036b9a..4133f0ab5 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_tile.wgsl @@ -122,9 +122,9 @@ const V_CHUNKS: u32 = HEAD_DIM_V / 4u; const SCORE_REGS_PER_LANE: u32 = (KV_TILE + MIN_SUBGROUP_SIZE - 1u) / MIN_SUBGROUP_SIZE; const OUT_REGS_PER_LANE: u32 = (V_CHUNKS + MIN_SUBGROUP_SIZE - 1u) / MIN_SUBGROUP_SIZE; -var q_shmem: array; -var kv_shmem: array; -var p_shmem: array; +var q_shmem: array; +var kv_shmem: array; +var p_shmem: array; @compute @workgroup_size(WG_SIZE) fn main(@builtin(workgroup_id) wg_id: vec3, @@ -169,10 +169,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3, let head = f32(head_idx); let slope = select(1.0, - select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0), - pow(params.m0, head + 1.0), - head < params.n_head_log2), - params.max_bias > 0.0); + select(pow(params.m1, 2.0 * (head - params.n_head_log2) + 1.0), + pow(params.m0, head + 1.0), + head < params.n_head_log2), + params.max_bias > 0.0); for (var elem_idx = local_id.x; elem_idx < Q_TILE * HEAD_DIM_QK; elem_idx += WG_SIZE) { let q_tile_row = elem_idx / HEAD_DIM_QK; @@ -181,7 +181,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3, let global_q_row_offset = q_head_offset + head_q_row * params.stride_q1; q_shmem[elem_idx] = select( 0.0, - f32(Q[global_q_row_offset + q_col]) * params.scale, + Q_TYPE(Q[global_q_row_offset + q_col]) * params.scale, head_q_row < params.seq_len_q); } @@ -213,10 +213,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3, let k_vec_index = (k_head_offset + global_k_row * params.stride_k1 + chunk * 4u) >> 2u; let k4 = K[k_vec_index]; let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u; - kv_shmem[kv_off + 0u] = f32(k4.x); - kv_shmem[kv_off + 1u] = f32(k4.y); - kv_shmem[kv_off + 2u] = f32(k4.z); - kv_shmem[kv_off + 3u] = f32(k4.w); + kv_shmem[kv_off + 0u] = KV_TYPE(k4.x); + kv_shmem[kv_off + 1u] = KV_TYPE(k4.y); + kv_shmem[kv_off + 2u] = KV_TYPE(k4.z); + kv_shmem[kv_off + 3u] = KV_TYPE(k4.w); } workgroupBarrier(); @@ -233,18 +233,18 @@ fn main(@builtin(workgroup_id) wg_id: vec3, var dot_val = 0.0; for (var chunk = 0u; chunk < Q_CHUNKS; chunk += 1u) { let q_off = q_base + chunk * 4u; - let qv = vec4( + let qv = vec4( q_shmem[q_off + 0u], q_shmem[q_off + 1u], q_shmem[q_off + 2u], q_shmem[q_off + 3u]); let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u; - let kv = vec4( + let kv = vec4( kv_shmem[kv_off + 0u], kv_shmem[kv_off + 1u], kv_shmem[kv_off + 2u], kv_shmem[kv_off + 3u]); - dot_val += dot(qv, kv); + dot_val += dot(vec4(qv), vec4(kv)); } #ifdef LOGIT_SOFTCAP dot_val = params.logit_softcap * tanh(dot_val); @@ -271,7 +271,7 @@ fn main(@builtin(workgroup_id) wg_id: vec3, let kv_local = sg_inv_id + slot * subgroup_size; if (row_active && kv_local < kv_count) { let p = exp(local_scores[slot] - new_max); - p_shmem[subgroup_p_offset + kv_local] = p; + p_shmem[subgroup_p_offset + kv_local] = KV_TYPE(p); local_sum += p; } } @@ -285,10 +285,10 @@ fn main(@builtin(workgroup_id) wg_id: vec3, let v_vec_index = (v_head_offset + global_v_row * params.stride_v1 + chunk * 4u) >> 2u; let v4 = V[v_vec_index]; let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u; - kv_shmem[kv_off + 0u] = f32(v4.x); - kv_shmem[kv_off + 1u] = f32(v4.y); - kv_shmem[kv_off + 2u] = f32(v4.z); - kv_shmem[kv_off + 3u] = f32(v4.w); + kv_shmem[kv_off + 0u] = KV_TYPE(v4.x); + kv_shmem[kv_off + 1u] = KV_TYPE(v4.y); + kv_shmem[kv_off + 2u] = KV_TYPE(v4.z); + kv_shmem[kv_off + 3u] = KV_TYPE(v4.w); } workgroupBarrier(); @@ -308,12 +308,12 @@ fn main(@builtin(workgroup_id) wg_id: vec3, for (var kv_local = 0u; kv_local < kv_count; kv_local += 1u) { let p = p_shmem[subgroup_p_offset + kv_local]; let kv_off = kv_local * KV_STAGE_STRIDE + chunk * 4u; - let v4 = vec4( + let v4 = vec4( kv_shmem[kv_off + 0u], kv_shmem[kv_off + 1u], kv_shmem[kv_off + 2u], kv_shmem[kv_off + 3u]); - acc += p * v4; + acc += f32(p) * vec4(v4); } out_regs[reg_idx] = acc; } From 47c0eda9d4980bdb3031f6affe98ccaf6e1e69ee Mon Sep 17 00:00:00 2001 From: Pascal Date: Thu, 21 May 2026 19:39:42 +0200 Subject: [PATCH 07/12] vulkan: fuse snake activation (mul, sin, sqr, mul, add) (#22855) * vulkan: fuse snake activation (mul, sin, sqr, mul, add) Add snake.comp shader with F32 / F16 / BF16 pipelines and ggml_vk_snake_dispatch_fused. The matcher recognizes the naive 5 op decomposition emitted by audio decoders (BigVGAN, Vocos) for snake activation y = x + sin(a*x)^2 * inv_b and rewrites it to a single elementwise kernel. test_snake_fuse from the CUDA PR now also compares CPU naive vs Vulkan fused across F32 / F16 / BF16. * vulkan: address jeffbolznv review for fused snake activation Rename T / C to ne0 / ne1 in the shader and push constants to match the standard naming convention used across the Vulkan backend. Tighten ggml_vk_can_fuse_snake: require x and dst to be contiguous (the shader uses idx = i0 + i1 * ne0) and require a / inv_b to be tightly packed on the broadcast dim (the shader reads data_a[i1]). * vulkan: tighten snake fusion type checks for all operands (address jeffbolznv review) * vulkan: reject snake fusion when ne[2] or ne[3] > 1 (address jeffbolznv review) * vulkan: address 0cc4m review for fused snake activation snake.comp is renamed to follow the ggml DATA_A_* / A_TYPE convention. A_TYPE now applies to the activation tensor data_a instead of the broadcast multiplier, and the bindings become data_a (A_TYPE), data_b (float), data_c (float) and data_d (D_TYPE). A header at the top of the shader maps each buffer to its role in y = x + sin(b * x)^2 * c. On the C++ side, ggml_vk_can_fuse_snake reuses the existing snake_pattern constant instead of duplicating the op list, sin_node is extracted as a named local alongside the other chain nodes, and the broadcast operands a and inv_b are now required to be GGML_TYPE_F32 to match the hardcoded float bindings on data_b and data_c (the previous a->type == x->type would silently reject any future BF16 or F16 chain once the supports_op gate for SIN / SQR is lifted). ggml_vk_snake_dispatch_fused gets an explicit GGML_TYPE_F32 case and GGML_ABORT on default in place of the silent f32 fallback, and a stale comment about data_a[i1] / data_inv_b[i1] is refreshed to match the new binding names. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 136 +++++++++++++++++- .../src/ggml-vulkan/vulkan-shaders/snake.comp | 49 +++++++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 4 + 3 files changed, 187 insertions(+), 2 deletions(-) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/snake.comp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index d3fb19048..aa289220a 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -499,6 +499,12 @@ static constexpr std::initializer_list topk_moe_late_softmax { GGM GGML_OP_GET_ROWS, GGML_OP_RESHAPE, GGML_OP_SOFT_MAX, GGML_OP_RESHAPE }; +// Snake activation: y = x + sin(a*x)^2 * inv_b. Used by the optimize_graph reorder +// pass so it keeps the chain contiguous and by the dispatcher to detect the fusion. +static constexpr std::initializer_list snake_pattern { GGML_OP_MUL, GGML_OP_SIN, + GGML_OP_SQR, GGML_OP_MUL, + GGML_OP_ADD }; + //node #978 ( SOFT_MAX): ffn_moe_probs-15 ( 0K) [Vulka ] use=2: ffn_moe_logits-15 ( 0K) [Vulka ] //node #979 ( RESHAPE): ffn_moe_probs-15 (re ( 0K) [Vulka ] use=1: ffn_moe_probs-15 ( 0K) [Vulka ] //node #980 ( ARGSORT): ffn_moe_argsort-15 ( 0K) [Vulka ] use=1: ffn_moe_probs-15 ( 0K) [Vulka ] @@ -846,6 +852,9 @@ struct vk_device_struct { vk_pipeline pipeline_im2col_3d_f32, pipeline_im2col_3d_f32_f16; vk_pipeline pipeline_timestep_embedding_f32; vk_pipeline pipeline_conv_transpose_1d_f32; + vk_pipeline pipeline_snake_f32; + vk_pipeline pipeline_snake_f16; + vk_pipeline pipeline_snake_bf16; vk_pipeline pipeline_pool2d_f32; vk_pipeline pipeline_rwkv_wkv6_f32; vk_pipeline pipeline_rwkv_wkv7_f32; @@ -1475,6 +1484,11 @@ struct vk_op_conv_transpose_1d_push_constants { int32_t s0; }; +struct vk_op_snake_push_constants { + uint32_t ne0; + uint32_t ne1; +}; + struct vk_op_pool2d_push_constants { uint32_t IW; uint32_t IH; uint32_t OW; uint32_t OH; @@ -4845,6 +4859,10 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_snake_f32, "snake_f32", snake_f32_len, snake_f32_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_snake_f16, "snake_f16", snake_f16_len, snake_f16_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_snake_bf16, "snake_bf16", snake_bf16_len, snake_bf16_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); @@ -12110,6 +12128,45 @@ static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p)); } +// Dispatch the fused snake activation: y = x + sin^2(a * x) * inv_b. +// Match the naive mul -> sin -> sqr -> mul -> add chain and run the +// dedicated kernel directly. The pattern is validated by +// ggml_vk_can_fuse_snake before this call. +static void ggml_vk_snake_dispatch_fused(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_cgraph * cgraph, int node_idx) { + const ggml_tensor * mul0 = cgraph->nodes[node_idx + 0]; + const ggml_tensor * sqr = cgraph->nodes[node_idx + 2]; + const ggml_tensor * mul1 = cgraph->nodes[node_idx + 3]; + ggml_tensor * add = cgraph->nodes[node_idx + 4]; + + // x carries the full activation shape, a is the broadcast operand + const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1]; + const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0]; + + // mul1 reads sqr and inv_b in either operand order + const ggml_tensor * inv_b = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0]; + + vk_pipeline pipeline = nullptr; + switch (x->type) { + case GGML_TYPE_F32: pipeline = ctx->device->pipeline_snake_f32; break; + case GGML_TYPE_F16: pipeline = ctx->device->pipeline_snake_f16; break; + case GGML_TYPE_BF16: pipeline = ctx->device->pipeline_snake_bf16; break; + default: GGML_ABORT("unsupported type"); + } + ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1); + + vk_subbuffer x_buf = ggml_vk_tensor_subbuffer(ctx, x); + vk_subbuffer a_buf = ggml_vk_tensor_subbuffer(ctx, a); + vk_subbuffer inv_b_buf = ggml_vk_tensor_subbuffer(ctx, inv_b); + vk_subbuffer dst_buf = ggml_vk_tensor_subbuffer(ctx, add); + + vk_op_snake_push_constants pc{}; + pc.ne0 = static_cast(x->ne[0]); + pc.ne1 = static_cast(x->ne[1]); + + std::array elements = { pc.ne0, pc.ne1, 1 }; + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { x_buf, a_buf, inv_b_buf, dst_buf }, pc, elements); +} + static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { uint32_t op = static_cast(dst->op_params[0]); const int32_t k1 = dst->op_params[1]; @@ -13318,7 +13375,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr break; case GGML_OP_MUL: - ggml_vk_mul(ctx, compute_ctx, src0, src1, node); + if (ctx->num_additional_fused_ops) { + ggml_vk_snake_dispatch_fused(ctx, compute_ctx, cgraph, node_idx); + } else { + ggml_vk_mul(ctx, compute_ctx, src0, src1, node); + } break; case GGML_OP_DIV: @@ -14691,6 +14752,65 @@ static bool ggml_vk_can_fuse_rope_set_rows(ggml_backend_vk_context * ctx, const return true; } +// Pattern check for the 5-op Snake fusion: mul -> sin -> sqr -> mul -> add. +// Verifies the chain shape, the closure x_in_add == x_in_mul0, and that +// the broadcast operands a and inv_b share a [1, C] layout. +static bool ggml_vk_can_fuse_snake(ggml_backend_vk_context * ctx, const struct ggml_cgraph * cgraph, int node_idx) { + GGML_UNUSED(ctx); + if (!ggml_can_fuse(cgraph, node_idx, snake_pattern)) { + return false; + } + + const ggml_tensor * mul0 = cgraph->nodes[node_idx + 0]; + const ggml_tensor * sin_node = cgraph->nodes[node_idx + 1]; + const ggml_tensor * sqr = cgraph->nodes[node_idx + 2]; + const ggml_tensor * mul1 = cgraph->nodes[node_idx + 3]; + const ggml_tensor * add = cgraph->nodes[node_idx + 4]; + + const ggml_tensor * x = ggml_are_same_shape(mul0, mul0->src[0]) ? mul0->src[0] : mul0->src[1]; + const ggml_tensor * a = (x == mul0->src[0]) ? mul0->src[1] : mul0->src[0]; + + const ggml_tensor * inv_b = (mul1->src[0] == sqr) ? mul1->src[1] : mul1->src[0]; + const ggml_tensor * x_in_add = (add->src[0] == mul1) ? add->src[1] : add->src[0]; + + if (x_in_add != x) { + return false; + } + if (x->type != GGML_TYPE_F32 && x->type != GGML_TYPE_F16 && x->type != GGML_TYPE_BF16) { + return false; + } + // Shader bindings: data_a is A_TYPE so it follows x's precision, while + // data_b and data_c are hardcoded float, so the broadcast operands must + // be F32 regardless of x's type. + if (a->type != GGML_TYPE_F32) return false; + if (inv_b->type != GGML_TYPE_F32) return false; + // Chain intermediates and output share x's precision (single A_TYPE / D_TYPE pipeline). + if (mul0->type != x->type) return false; + if (sin_node->type != x->type) return false; + if (sqr->type != x->type) return false; + if (mul1->type != x->type) return false; + if (add->type != x->type) return false; + if (!ggml_are_same_shape(a, inv_b)) { + return false; + } + if (a->ne[0] != 1 || a->ne[1] != x->ne[1]) { + return false; + } + // Dispatch is 2D over (ne0, ne1), so x and add must be 2D and a / inv_b + // must collapse to [1, C, 1, 1]. Higher dims are not handled by the shader. + if (x->ne[2] != 1 || x->ne[3] != 1) return false; + if (add->ne[2] != 1 || add->ne[3] != 1) return false; + if (a->ne[2] != 1 || a->ne[3] != 1) return false; + if (inv_b->ne[2] != 1 || inv_b->ne[3] != 1) return false; + // Shader uses idx = i0 + i1 * ne0 and reads data_b[i1] / data_c[i1], + // so every operand must be contiguous. + if (!ggml_is_contiguous(x) || !ggml_is_contiguous(add) || + !ggml_is_contiguous(a) || !ggml_is_contiguous(inv_b)) { + return false; + } + return true; +} + // Check whether the tensors overlap in memory. // Fusions can potentially overwrite src tensors in ways that are not prevented // by ggml-alloc. If the fusion src is being applied in a way that's elementwise @@ -14998,6 +15118,14 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg op_srcs_fused_elementwise[0] = false; op_srcs_fused_elementwise[1] = false; op_srcs_fused_elementwise[2] = false; + } else if (ggml_vk_can_fuse_snake(ctx, cgraph, i)) { + ctx->num_additional_fused_ops = 4; + fusion_string = "SNAKE"; + // elementwise=true: snake.comp is safe under exact aliasing because each + // thread reads data_x[idx] into a register before writing data_d[idx] + // with a data dependency on that register. The overlap check still + // rejects partial overlaps (different base or size). + std::fill_n(op_srcs_fused_elementwise, 5, true); } else if (ggml_can_fuse_subgraph(cgraph, i, topk_moe_early_softmax_norm, { i + 3, i + 9 }) && ggml_check_edges(cgraph, i, topk_moe_early_softmax_norm_edges) && ggml_vk_can_fuse_topk_moe(ctx, cgraph, i, TOPK_MOE_EARLY_SOFTMAX_NORM)) { @@ -15288,6 +15416,9 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * if (keep_pattern(topk_moe_late_softmax)) { continue; } + if (keep_pattern(snake_pattern)) { + continue; + } // First, grab the next unused node. current_set.push_back(first_unused); @@ -15310,7 +15441,8 @@ static void ggml_vk_graph_optimize(ggml_backend_t backend, struct ggml_cgraph * if (match_pattern(topk_moe_early_softmax_norm, j) || match_pattern(topk_moe_sigmoid_norm_bias, j) || match_pattern(topk_moe_early_softmax, j) || - match_pattern(topk_moe_late_softmax, j)) { + match_pattern(topk_moe_late_softmax, j) || + match_pattern(snake_pattern, j)) { continue; } bool ok = true; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp b/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp new file mode 100644 index 000000000..8585538cb --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/snake.comp @@ -0,0 +1,49 @@ +#version 450 + +#include "types.glsl" + +// Fused snake activation: y = x + sin(b * x)^2 * c +// data_a [ne0, ne1] per element activation x (A_TYPE) +// data_b [1, ne1] per channel multiplier (float) +// data_c [1, ne1] per channel inverse scale (float, precomputed as 1 / freq) +// data_d [ne0, ne1] output y (D_TYPE) +layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; +layout (binding = 1) readonly buffer B {float data_b[];}; +layout (binding = 2) readonly buffer C {float data_c[];}; +layout (binding = 3) writeonly buffer D {D_TYPE data_d[];}; + +layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; + +layout (push_constant) uniform parameter { + uint32_t ne0; + uint32_t ne1; +} p; + +// Load A_TYPE to float +float load_val(uint32_t idx) { +#if defined(DATA_A_BF16) + return bf16_to_fp32(uint32_t(data_a[idx])); +#else + return float(data_a[idx]); +#endif +} + +// Store float as D_TYPE +void store_val(uint32_t idx, float v) { +#if defined(DATA_D_BF16) + data_d[idx] = D_TYPE(fp32_to_bf16(v)); +#else + data_d[idx] = D_TYPE(v); +#endif +} + +void main() { + const uint32_t i0 = gl_GlobalInvocationID.x; + const uint32_t i1 = gl_GlobalInvocationID.y; + if (i0 >= p.ne0 || i1 >= p.ne1) return; + + const uint32_t idx = i0 + i1 * p.ne0; + const float xi = load_val(idx); + const float s = sin(data_b[i1] * xi); + store_val(idx, xi + s * s * data_c[i1]); +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index e3a9d61a5..a1d735150 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -952,6 +952,10 @@ void process_shaders() { string_to_spv("conv_transpose_1d_f32", "conv_transpose_1d.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("snake_f32", "snake.comp", {{"DATA_A_F32", "1"}, {"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("snake_f16", "snake.comp", {{"DATA_A_F16", "1"}, {"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); + string_to_spv("snake_bf16", "snake.comp", {{"DATA_A_BF16", "1"}, {"DATA_D_BF16", "1"}, {"A_TYPE", "uint16_t"}, {"D_TYPE", "uint16_t"}}); + string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}})); From ee7c30578a3bf3d8244cefe83b3f44c15df7768c Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Thu, 21 May 2026 11:00:27 -0700 Subject: [PATCH 08/12] Update WebGPU support and add link to blog/demo (#23483) --- README.md | 3 ++- docs/build.md | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 71327e514..dbe2c363a 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ LLM inference in C/C++ - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim - Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669 - Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) +- WebGPU support is now available in the browser, see a blog/demo introducing it [here](https://reeselevine.github.io/llamas-on-the-web/). ---- @@ -290,7 +291,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo | [CANN](docs/build.md#cann) | Ascend NPU | | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU | | [IBM zDNN](docs/backend/zDNN.md) | IBM Z & LinuxONE | -| [WebGPU [In Progress]](docs/build.md#webgpu) | All | +| [WebGPU](docs/build.md#webgpu) | All | | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All | | [Hexagon [In Progress]](docs/backend/snapdragon/README.md) | Snapdragon | | [VirtGPU](docs/backend/VirtGPU.md) | VirtGPU APIR | diff --git a/docs/build.md b/docs/build.md index a18479b33..7beafbf5f 100644 --- a/docs/build.md +++ b/docs/build.md @@ -735,7 +735,7 @@ ninja To read documentation for how to build on Android, [click here](./android.md) -## WebGPU [In Progress] +## WebGPU The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `18eb229`. From bb28c1fe246b72276ee1d00ce89306be7b865766 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 21 May 2026 21:13:59 +0300 Subject: [PATCH 09/12] cmake : remove STATIC from impl libraries, enable LLAMA_BUILD_APP by default (#23462) * cmake : remove STATIC from impl libraries, allow BUILD_SHARED_LIBS control Remove explicit STATIC from all -impl libraries (server, cli, completion, bench, batched-bench, fit-params, quantize, perplexity) so BUILD_SHARED_LIBS controls shared vs static linkage. Add WINDOWS_EXPORT_ALL_SYMBOLS ON for proper DLL export on Windows. Assisted-by: llama.cpp:local pi * cmake : enable LLAMA_BUILD_APP by default Assisted-by: llama.cpp:local pi * ci : disable app in build-cmake-pkg.yml --- .github/workflows/build-cmake-pkg.yml | 2 +- CMakeLists.txt | 2 +- tools/batched-bench/CMakeLists.txt | 3 ++- tools/cli/CMakeLists.txt | 3 ++- tools/completion/CMakeLists.txt | 3 ++- tools/fit-params/CMakeLists.txt | 3 ++- tools/llama-bench/CMakeLists.txt | 3 ++- tools/perplexity/CMakeLists.txt | 3 ++- tools/quantize/CMakeLists.txt | 3 ++- tools/server/CMakeLists.txt | 3 ++- 10 files changed, 18 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-cmake-pkg.yml b/.github/workflows/build-cmake-pkg.yml index 84cf8ddf4..6bbfd9988 100644 --- a/.github/workflows/build-cmake-pkg.yml +++ b/.github/workflows/build-cmake-pkg.yml @@ -21,7 +21,7 @@ jobs: PREFIX="$(pwd)"/inst cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \ -DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=Release + -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_APP=OFF -DCMAKE_BUILD_TYPE=Release cmake --build build --config Release cmake --install build --prefix "$PREFIX" --config Release diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ed6432b2..56eb608ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -108,7 +108,7 @@ option(LLAMA_BUILD_TESTS "llama: build tests" option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE}) -option(LLAMA_BUILD_APP "llama: build the unified binary" OFF) +option(LLAMA_BUILD_APP "llama: build the unified binary" ON) option(LLAMA_BUILD_UI "llama: build the embedded Web UI for server" ON) option(LLAMA_USE_PREBUILT_UI "llama: use prebuilt UI from HF Bucket when available (requires LLAMA_BUILD_UI=ON)" ON) diff --git a/tools/batched-bench/CMakeLists.txt b/tools/batched-bench/CMakeLists.txt index 1769c2136..42b50972f 100644 --- a/tools/batched-bench/CMakeLists.txt +++ b/tools/batched-bench/CMakeLists.txt @@ -2,7 +2,8 @@ set(TARGET llama-batched-bench-impl) -add_library(${TARGET} STATIC batched-bench.cpp) +add_library(${TARGET} batched-bench.cpp) +set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/tools/cli/CMakeLists.txt b/tools/cli/CMakeLists.txt index 32b660508..aa44e586c 100644 --- a/tools/cli/CMakeLists.txt +++ b/tools/cli/CMakeLists.txt @@ -2,7 +2,8 @@ set(TARGET llama-cli-impl) -add_library(${TARGET} STATIC cli.cpp) +add_library(${TARGET} cli.cpp) +set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ../server) target_link_libraries(${TARGET} PUBLIC server-context llama-common ${CMAKE_THREAD_LIBS_INIT}) diff --git a/tools/completion/CMakeLists.txt b/tools/completion/CMakeLists.txt index 687bdfda6..a485bf0a3 100644 --- a/tools/completion/CMakeLists.txt +++ b/tools/completion/CMakeLists.txt @@ -2,7 +2,8 @@ set(TARGET llama-completion-impl) -add_library(${TARGET} STATIC completion.cpp) +add_library(${TARGET} completion.cpp) +set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/tools/fit-params/CMakeLists.txt b/tools/fit-params/CMakeLists.txt index 207caf2ce..799c2d747 100644 --- a/tools/fit-params/CMakeLists.txt +++ b/tools/fit-params/CMakeLists.txt @@ -2,7 +2,8 @@ set(TARGET llama-fit-params-impl) -add_library(${TARGET} STATIC fit-params.cpp) +add_library(${TARGET} fit-params.cpp) +set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/tools/llama-bench/CMakeLists.txt b/tools/llama-bench/CMakeLists.txt index ed419a740..2b71faa5f 100644 --- a/tools/llama-bench/CMakeLists.txt +++ b/tools/llama-bench/CMakeLists.txt @@ -2,7 +2,8 @@ set(TARGET llama-bench-impl) -add_library(${TARGET} STATIC llama-bench.cpp) +add_library(${TARGET} llama-bench.cpp) +set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/tools/perplexity/CMakeLists.txt b/tools/perplexity/CMakeLists.txt index 44061d0a5..b03d61a41 100644 --- a/tools/perplexity/CMakeLists.txt +++ b/tools/perplexity/CMakeLists.txt @@ -2,7 +2,8 @@ set(TARGET llama-perplexity-impl) -add_library(${TARGET} STATIC perplexity.cpp) +add_library(${TARGET} perplexity.cpp) +set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt index e76f7d811..5ef4e4e8a 100644 --- a/tools/quantize/CMakeLists.txt +++ b/tools/quantize/CMakeLists.txt @@ -2,7 +2,8 @@ set(TARGET llama-quantize-impl) -add_library(${TARGET} STATIC quantize.cpp) +add_library(${TARGET} quantize.cpp) +set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT}) diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt index bf75b199b..d87d1a5a5 100644 --- a/tools/server/CMakeLists.txt +++ b/tools/server/CMakeLists.txt @@ -31,13 +31,14 @@ target_link_libraries(${TARGET} PUBLIC llama-common mtmd ${CMAKE_THREAD_LIBS_INI set(TARGET llama-server-impl) -add_library(${TARGET} STATIC +add_library(${TARGET} server.cpp server-http.cpp server-http.h server-models.cpp server-models.h ) +set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_include_directories(${TARGET} PRIVATE ../mtmd ${CMAKE_SOURCE_DIR}) From 4f0e43da6f8f6e9390d88409610098ec2d2dc5c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 21 May 2026 23:35:29 +0200 Subject: [PATCH 10/12] CUDA: fix PDL CC check for JIT compilation (#23471) --- ggml/src/ggml-cuda/common.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 9c73fe7e6..e54ecb293 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1561,7 +1561,8 @@ static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_ke return env == nullptr || std::atoi(env) != 0; }(); - if (env_pdl_enabled && ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= GGML_CUDA_CC_HOPPER) { + const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; + if (env_pdl_enabled && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_HOPPER) { auto pdl_cfg = ggml_cuda_pdl_config(launch_params); CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, kernel, std::forward(args)... )); From bbce619adb409880fb6db850a1c5a5f36a4dc7b1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 22 May 2026 11:46:26 +0300 Subject: [PATCH 11/12] cmake : add install() for impl libraries + fix apple builds (#23511) * pi : update * ci : fix ios build * ci : fix andoroid * ci : fix apple builds * cmake : add install() for impl libraries Add install(TARGETS LIBRARY) for all -impl libraries that were changed from STATIC to shared (controlled by BUILD_SHARED_LIBS) in commit bb28c1fe2. Without this, cmake --install fails to copy the shared libraries, causing runtime errors like: llama-server: error while loading shared libraries: libllama-server-impl.so Ref: https://github.com/ggml-org/llama.cpp/issues/23494#issuecomment-4512912515 Assisted-by: llama.cpp:local pi * ci : fix xcframework build --- .github/workflows/build-apple.yml | 5 +++++ .github/workflows/build-cmake-pkg.yml | 11 ++++++++--- .github/workflows/release.yml | 1 + .pi/gg/SYSTEM.md | 5 +++-- build-xcframework.sh | 2 ++ examples/llama.android/lib/build.gradle.kts | 1 + tools/batched-bench/CMakeLists.txt | 4 ++++ tools/cli/CMakeLists.txt | 4 ++++ tools/completion/CMakeLists.txt | 4 ++++ tools/fit-params/CMakeLists.txt | 4 ++++ tools/llama-bench/CMakeLists.txt | 4 ++++ tools/perplexity/CMakeLists.txt | 4 ++++ tools/quantize/CMakeLists.txt | 4 ++++ tools/server/CMakeLists.txt | 4 ++++ 14 files changed, 52 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-apple.yml b/.github/workflows/build-apple.yml index b99e61466..a5b7ef56e 100644 --- a/.github/workflows/build-apple.yml +++ b/.github/workflows/build-apple.yml @@ -59,6 +59,7 @@ jobs: cmake -B build -G Xcode \ -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ + -DLLAMA_BUILD_APP=OFF \ -DLLAMA_BUILD_COMMON=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TOOLS=OFF \ @@ -89,6 +90,7 @@ jobs: -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ -DLLAMA_OPENSSL=OFF \ + -DLLAMA_BUILD_APP=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TOOLS=OFF \ -DLLAMA_BUILD_TESTS=OFF \ @@ -138,6 +140,7 @@ jobs: -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_COMMON=OFF \ + -DLLAMA_BUILD_APP=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TOOLS=OFF \ -DLLAMA_BUILD_TESTS=OFF \ @@ -163,6 +166,7 @@ jobs: -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ -DLLAMA_BUILD_COMMON=OFF \ + -DLLAMA_BUILD_APP=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TOOLS=OFF \ -DLLAMA_BUILD_TESTS=OFF \ @@ -206,6 +210,7 @@ jobs: -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ -DLLAMA_OPENSSL=OFF \ + -DLLAMA_BUILD_APP=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TOOLS=OFF \ -DLLAMA_BUILD_TESTS=OFF \ diff --git a/.github/workflows/build-cmake-pkg.yml b/.github/workflows/build-cmake-pkg.yml index 6bbfd9988..b36ac5b8e 100644 --- a/.github/workflows/build-cmake-pkg.yml +++ b/.github/workflows/build-cmake-pkg.yml @@ -19,9 +19,14 @@ jobs: - name: Build run: | PREFIX="$(pwd)"/inst - cmake -S . -B build -DCMAKE_PREFIX_PATH="$PREFIX" \ - -DLLAMA_OPENSSL=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=OFF \ - -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_APP=OFF -DCMAKE_BUILD_TYPE=Release + cmake -S . -B build \ + -DCMAKE_PREFIX_PATH="$PREFIX" \ + -DLLAMA_OPENSSL=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_TOOLS=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_APP=OFF \ + -DCMAKE_BUILD_TYPE=Release cmake --build build --config Release cmake --install build --prefix "$PREFIX" --config Release diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 1880c155c..e7cbac35f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1108,6 +1108,7 @@ jobs: -DGGML_METAL_USE_BF16=ON \ -DGGML_METAL_EMBED_LIBRARY=ON \ -DLLAMA_OPENSSL=OFF \ + -DLLAMA_BUILD_APP=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_TOOLS=OFF \ -DLLAMA_BUILD_TESTS=OFF \ diff --git a/.pi/gg/SYSTEM.md b/.pi/gg/SYSTEM.md index b7597a4c3..06d97ae78 100644 --- a/.pi/gg/SYSTEM.md +++ b/.pi/gg/SYSTEM.md @@ -1,7 +1,7 @@ You are a coding agent. Here are some very important rules that you must follow: General: -- By very precise and concise when writing code, comments, explanations, etc. +- Be very precise and concise when writing code, comments, explanations, etc. - PR and commit titles format: ` : `. Lookup recents for examples - Don't try to build or run the code unless you are explicitly asked to do so - Use the `gh` CLI tool when querying PRs, issues, or other GitHub resources @@ -16,7 +16,8 @@ Pull requests (PRs): - New branch names are prefixed with "gg/" - Before opening a pull request, ask the user to confirm the description - When creating a pull request, look for the repository's PR template and follow it -- For the AI usage disclosure section, write "YES. llama.cpp + pi" +- For the AI usage disclosure section, write "YES. llama.cpp + pi + [MODEL]" +- Ask the user to tell you what model was used and write it in place of [MODEL] - Always create the pull requests in draft mode Commits: diff --git a/build-xcframework.sh b/build-xcframework.sh index c25a1ef28..d287d72fb 100755 --- a/build-xcframework.sh +++ b/build-xcframework.sh @@ -7,6 +7,7 @@ VISIONOS_MIN_OS_VERSION=1.0 TVOS_MIN_OS_VERSION=16.4 BUILD_SHARED_LIBS=OFF +LLAMA_BUILD_APP=OFF LLAMA_BUILD_EXAMPLES=OFF LLAMA_BUILD_TOOLS=OFF LLAMA_BUILD_TESTS=OFF @@ -31,6 +32,7 @@ COMMON_CMAKE_ARGS=( -DCMAKE_XCODE_ATTRIBUTE_STRIP_INSTALLED_PRODUCT=NO -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} + -DLLAMA_BUILD_APP=${LLAMA_BUILD_APP} -DLLAMA_BUILD_EXAMPLES=${LLAMA_BUILD_EXAMPLES} -DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS} -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS} diff --git a/examples/llama.android/lib/build.gradle.kts b/examples/llama.android/lib/build.gradle.kts index 9b290d6d4..ae95f41a8 100644 --- a/examples/llama.android/lib/build.gradle.kts +++ b/examples/llama.android/lib/build.gradle.kts @@ -25,6 +25,7 @@ android { arguments += "-DCMAKE_VERBOSE_MAKEFILE=ON" arguments += "-DBUILD_SHARED_LIBS=ON" + arguments += "-DLLAMA_BUILD_APP=OFF" arguments += "-DLLAMA_BUILD_COMMON=ON" arguments += "-DLLAMA_OPENSSL=OFF" diff --git a/tools/batched-bench/CMakeLists.txt b/tools/batched-bench/CMakeLists.txt index 42b50972f..f6ed257f5 100644 --- a/tools/batched-bench/CMakeLists.txt +++ b/tools/batched-bench/CMakeLists.txt @@ -8,6 +8,10 @@ set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT}) +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} LIBRARY) +endif() + # llama-batched-bench executable set(TARGET llama-batched-bench) diff --git a/tools/cli/CMakeLists.txt b/tools/cli/CMakeLists.txt index aa44e586c..a3e635719 100644 --- a/tools/cli/CMakeLists.txt +++ b/tools/cli/CMakeLists.txt @@ -8,6 +8,10 @@ set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ../server) target_link_libraries(${TARGET} PUBLIC server-context llama-common ${CMAKE_THREAD_LIBS_INIT}) +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} LIBRARY) +endif() + # llama-cli executable set(TARGET llama-cli) diff --git a/tools/completion/CMakeLists.txt b/tools/completion/CMakeLists.txt index a485bf0a3..a310251ef 100644 --- a/tools/completion/CMakeLists.txt +++ b/tools/completion/CMakeLists.txt @@ -8,6 +8,10 @@ set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT}) +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} LIBRARY) +endif() + # llama-completion executable set(TARGET llama-completion) diff --git a/tools/fit-params/CMakeLists.txt b/tools/fit-params/CMakeLists.txt index 799c2d747..8acdaef37 100644 --- a/tools/fit-params/CMakeLists.txt +++ b/tools/fit-params/CMakeLists.txt @@ -8,6 +8,10 @@ set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT}) +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} LIBRARY) +endif() + # llama-fit-params executable set(TARGET llama-fit-params) diff --git a/tools/llama-bench/CMakeLists.txt b/tools/llama-bench/CMakeLists.txt index 2b71faa5f..b1c35ee88 100644 --- a/tools/llama-bench/CMakeLists.txt +++ b/tools/llama-bench/CMakeLists.txt @@ -8,6 +8,10 @@ set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT}) +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} LIBRARY) +endif() + # llama-bench executable set(TARGET llama-bench) diff --git a/tools/perplexity/CMakeLists.txt b/tools/perplexity/CMakeLists.txt index b03d61a41..0eee9acd4 100644 --- a/tools/perplexity/CMakeLists.txt +++ b/tools/perplexity/CMakeLists.txt @@ -8,6 +8,10 @@ set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT}) +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} LIBRARY) +endif() + # llama-perplexity executable set(TARGET llama-perplexity) diff --git a/tools/quantize/CMakeLists.txt b/tools/quantize/CMakeLists.txt index 5ef4e4e8a..eead4c859 100644 --- a/tools/quantize/CMakeLists.txt +++ b/tools/quantize/CMakeLists.txt @@ -8,6 +8,10 @@ set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_link_libraries(${TARGET} PUBLIC llama-common llama ${CMAKE_THREAD_LIBS_INIT}) +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} LIBRARY) +endif() + # llama-quantize executable set(TARGET llama-quantize) diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt index d87d1a5a5..7d427431d 100644 --- a/tools/server/CMakeLists.txt +++ b/tools/server/CMakeLists.txt @@ -44,6 +44,10 @@ target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) target_include_directories(${TARGET} PRIVATE ../mtmd ${CMAKE_SOURCE_DIR}) target_link_libraries(${TARGET} PUBLIC server-context llama-ui cpp-httplib ${CMAKE_THREAD_LIBS_INIT}) +if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} LIBRARY) +endif() + # llama-server executable set(TARGET llama-server) From afcda09d154a285cd366135f98ffc1d357f7ddbd Mon Sep 17 00:00:00 2001 From: Kashif Rasul <kashif.rasul@gmail.com> Date: Fri, 22 May 2026 11:17:31 +0200 Subject: [PATCH 12/12] vocab : fix HybridDNA tokenizer (#23466) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * vocab : mark hybriddna k-mers to avoid BPE token collisions * improved loop --------- Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com> --- conversion/base.py | 5 +++++ src/llama-vocab.cpp | 50 +++++++++++++++++++++++++++------------------ 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/conversion/base.py b/conversion/base.py index 8e12af6c5..d8f050ed3 100644 --- a/conversion/base.py +++ b/conversion/base.py @@ -1617,6 +1617,11 @@ class TextModel(ModelBase): assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute] reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute] + # k-mers can share text with a base-vocab BPE token (e.g. CCCCCC) and get + # dropped by get_vocab(); a reserved marker suffix (U+E000) keeps each + # k-mer's own id (llama.cpp strips it on detokenization) + for kmer in tokenizer.kmers: # ty: ignore[unresolved-attribute] + reverse_vocab[tokenizer.dna_token_to_id[kmer]] = kmer + "\ue000" # ty: ignore[unresolved-attribute] added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute] added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute] diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index acf832d05..a5cf148b2 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1581,6 +1581,11 @@ private: const llm_tokenizer_plamo2 & tokenizer; }; +// reserved suffix (U+E000) that keeps DNA k-mers distinct from identical +// base-vocab BPE tokens (e.g. CCCCCC) in token_to_id; erased from id_to_token +// text at load +static const std::string dna_kmer_marker = "\xee\x80\x80"; + struct llm_tokenizer_hybriddna_session : llm_tokenizer_bpe_session { llm_tokenizer_hybriddna_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {} @@ -1636,34 +1641,22 @@ private: c = char(c - 32); } } - auto is_valid_kmer = [](const std::string & s) { - for (char c : s) { - if (c != 'A' && c != 'C' && c != 'G' && c != 'T') { - return false; - } - } - return true; + + // k-mers carry the reserved marker suffix; a non-ACGT k-mer simply + // isn't in the vocab and falls back to <oov> + auto kmer_token = [&](const std::string & kmer) { + const auto tok = vocab.text_to_token(kmer + dna_kmer_marker); + return tok != LLAMA_TOKEN_NULL ? tok : oov_id; }; size_t i = 0; for (; i + k <= seq.size(); i += k) { - const std::string kmer = seq.substr(i, k); - if (is_valid_kmer(kmer)) { - const auto tok = vocab.text_to_token(kmer); - output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id); - } else { - output.push_back(oov_id); - } + output.push_back(kmer_token(seq.substr(i, k))); } if (i < seq.size()) { std::string kmer = seq.substr(i); kmer.append(k - kmer.size(), 'A'); - if (is_valid_kmer(kmer)) { - const auto tok = vocab.text_to_token(kmer); - output.push_back(tok != LLAMA_TOKEN_NULL ? tok : oov_id); - } else { - output.push_back(oov_id); - } + output.push_back(kmer_token(kmer)); } } @@ -2357,6 +2350,23 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } GGML_ASSERT(id_to_token.size() == token_to_id.size()); + // hybriddna: the marker suffix kept k-mer ids distinct in token_to_id; erase + // it from id_to_token so the k-mers detokenize to the bare DNA sequence. The + // k-mers are the block right after <oov>, so only scan from there. + if (tokenizer_model == "hybriddna") { + const auto idx = token_to_id.find("<oov>"); + if (idx != token_to_id.end()) { + auto it = id_to_token.begin() + idx->second + 1; + for (; it != id_to_token.end(); ++it) { + std::string & text = it->text; + if (text.size() > dna_kmer_marker.size() + && text.compare(text.size() - dna_kmer_marker.size(), dna_kmer_marker.size(), dna_kmer_marker) == 0) { + text.erase(text.size() - dna_kmer_marker.size()); + } + } + } + } + init_tokenizer(type); // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'