From b44890df2e4fad0ece1d5366dcbc8bedae23b658 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 21 May 2025 13:09:21 +0300 Subject: [PATCH 01/18] model : disable SWA for Phi models (#13676) * model : disable SWA for Phi models ggml-ci * model : update warning message * model : print warning only if n_swa > 0 * model : fix typo --- src/llama-graph.cpp | 7 +++-- src/llama-model.cpp | 65 +++++++++++++++++++-------------------------- 2 files changed, 30 insertions(+), 42 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index e745b41e3..13e36d161 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1236,8 +1236,7 @@ llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() auto inp = std::make_unique(hparams, cparams, kv_self); { - GGML_ASSERT(hparams.n_swa_pattern == 1 && "Use llama_kv_cache_unified_iswa for SWA"); - GGML_ASSERT(hparams.n_swa == 0 && "Use llama_kv_cache_unified_iswa for SWA"); + GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified_iswa for SWA"); const auto n_kv = kv_self->get_n(); @@ -1312,8 +1311,8 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; } - if (hparams.n_swa_pattern > 1) { - GGML_ASSERT(hparams.n_swa > 0 && "Use llama_kv_cache_unified for non-SWA"); + { + GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified for non-SWA"); const auto n_kv = kv_self->get_kv_swa()->get_n(); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 383972f94..7c135e981 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -853,43 +853,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } - // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931 - if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) { - // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct - LLAMA_LOG_WARN("%s: assuming n_swa = 2047 for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct\n", __func__); + const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); - hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; - - hparams.n_swa = 2047; - } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) { - // default value for Phi-3-mini-128k-instruct - LLAMA_LOG_WARN("%s: assuming no SWA for Phi-3-mini-128k-instruct\n", __func__); + if (found_swa && hparams.n_swa > 0) { + LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n", + __func__, "https://github.com/ggml-org/llama.cpp/pull/13676"); + // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern` hparams.swa_type = LLAMA_SWA_TYPE_NONE; - hparams.n_swa = hparams.n_ctx_train; - hparams.n_swa_pattern = 1; - } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) { - // default value for Phi-3-medium-128k-instruct - LLAMA_LOG_WARN("%s: assuming no SWA for Phi-3-medium-128k-instruct\n", __func__); - - hparams.swa_type = LLAMA_SWA_TYPE_NONE; - - hparams.n_swa = hparams.n_ctx_train; - hparams.n_swa_pattern = 1; - } - - bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); - if (!found_swa && hparams.n_swa == 0) { - throw std::runtime_error("invalid value for sliding_window"); - } - - if 
(hparams.n_swa > hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: unexpected n_swa: %d >= %d, disabling SWA\n", __func__, hparams.n_swa, hparams.n_ctx_train); - - hparams.swa_type = LLAMA_SWA_TYPE_NONE; - - hparams.n_swa = hparams.n_ctx_train; + hparams.n_swa = 0; hparams.n_swa_pattern = 1; } } break; @@ -7368,8 +7341,9 @@ struct llm_build_phi2 : public llm_graph_context { } }; -struct llm_build_phi3_iswa : public llm_graph_context { - llm_build_phi3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { +template +struct llm_build_phi3 : public llm_graph_context { + llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -7383,7 +7357,14 @@ struct llm_build_phi3_iswa : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified_iswa(); + using inp_attn_type = std::conditional_t; + inp_attn_type * inp_attn = nullptr; + + if constexpr (iswa) { + inp_attn = build_attn_inp_kv_unified_iswa(); + } else { + inp_attn = build_attn_inp_kv_unified(); + } for (int il = 0; il < n_layer; ++il) { auto * residual = inpL; @@ -13232,7 +13213,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx); - if (hparams.n_swa > 0) { + if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { + GGML_ASSERT(hparams.n_swa_pattern != 1); + res = new llama_kv_cache_unified_iswa( *this, params.type_k, @@ -13245,6 +13228,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, cparams.n_batch, padding); } else { + GGML_ASSERT(hparams.n_swa_pattern == 1); + res = new llama_kv_cache_unified( *this, nullptr, @@ -13353,7 +13338,11 @@ llm_graph_result_ptr llama_model::build_graph( case LLM_ARCH_PHI3: case LLM_ARCH_PHIMOE: { - llm = std::make_unique(*this, params, gf); + if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { + llm = std::make_unique> (*this, params, gf); + } else { + llm = std::make_unique>(*this, params, gf); + } } break; case LLM_ARCH_PLAMO: { From 797f2ac0625b22edeff03cc30e0f988da6b6b068 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 21 May 2025 15:11:13 +0300 Subject: [PATCH 02/18] kv-cache : simplify the interface (#13660) * kv-cache : simplify the interface ggml-ci * context : revert llama_batch_allocr position change ggml-ci --- examples/simple-chat/simple-chat.cpp | 4 +- include/llama.h | 6 +- src/llama-batch.cpp | 4 +- src/llama-context.cpp | 39 +++++++++- src/llama-kv-cache.cpp | 111 ++++++--------------------- src/llama-kv-cache.h | 51 ++++-------- src/llama-model.cpp | 6 +- tools/run/run.cpp | 4 +- tools/server/server.cpp | 17 ---- 9 files changed, 89 insertions(+), 153 deletions(-) diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index 84f415973..6608d4bea 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -98,7 +98,7 @@ int main(int argc, char ** argv) { auto generate = [&](const std::string & prompt) { std::string response; - const bool is_first = llama_kv_self_used_cells(ctx) == 0; + const bool is_first = llama_kv_self_seq_pos_max(ctx, 0) == 0; // tokenize the prompt const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); @@ -113,7 
+113,7 @@ int main(int argc, char ** argv) { while (true) { // check if we have enough space in the context to evaluate this batch int n_ctx = llama_n_ctx(ctx); - int n_ctx_used = llama_kv_self_used_cells(ctx); + int n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0); if (n_ctx_used + batch.n_tokens > n_ctx) { printf("\033[0m\n"); fprintf(stderr, "context size exceeded\n"); diff --git a/include/llama.h b/include/llama.h index 6b4fc5d11..52cd7a5a0 100644 --- a/include/llama.h +++ b/include/llama.h @@ -610,10 +610,12 @@ extern "C" { // Returns the number of tokens in the KV cache (slow, use only for debug) // If a KV cell has multiple sequences assigned to it, it will be counted multiple times - LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx); + DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx), + "Use llama_kv_self_seq_pos_max() instead"); // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) - LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx); + DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx), + "Use llama_kv_self_seq_pos_max() instead"); // Clear the KV cache - both cell info is erased and KV data is zeroed LLAMA_API void llama_kv_self_clear( diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index a88b2fe30..b98e3256c 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -1,5 +1,6 @@ #include "llama-batch.h" +#include #include #include @@ -281,9 +282,10 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0 batch = in_batch; GGML_ASSERT(batch.n_tokens > 0); if (!batch.pos) { + assert(p0 >= 0); pos.resize(batch.n_tokens); for (int32_t i = 0; i < batch.n_tokens; i++) { - pos[i] = i + p0; + pos[i] = p0 + i; } batch.pos = pos.data(); } diff --git a/src/llama-context.cpp b/src/llama-context.cpp index bba3ee0b5..85b4324b6 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -857,11 +857,17 @@ int llama_context::decode(llama_batch & inp_batch) { return -1; } + if (!inp_batch.pos) { + if (inp_batch.seq_id) { + LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__); + return -1; + } + } + llama_kv_cache * kv_self = static_cast(memory.get()); // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because get_pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->get_pos_max() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : kv_self->seq_pos_max(0) + 1); const llama_batch & batch = batch_allocr.batch; @@ -2292,22 +2298,47 @@ int32_t llama_apply_adapter_cvec( // kv cache // +// deprecated int32_t llama_kv_self_n_tokens(const llama_context * ctx) { const auto * kv = ctx->get_kv_self(); if (!kv) { return 0; } - return kv->get_n_tokens(); + int32_t res = 0; + + for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) { + const llama_pos p0 = kv->seq_pos_min(s); + const llama_pos p1 = kv->seq_pos_max(s); + + if (p0 >= 0) { + res += (p1 - p0) + 1; + } + } + + return res; } +// deprecated +// note: this is the same as above - will be removed anyway, so it's ok int32_t llama_kv_self_used_cells(const llama_context * ctx) { const auto * kv = ctx->get_kv_self(); if (!kv) { return 0; } - return kv->get_used_cells(); + int32_t res = 0; + + for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) { + const llama_pos p0 = kv->seq_pos_min(s); + const llama_pos p1 = kv->seq_pos_max(s); + + if (p0 >= 0) { + res += (p1 - p0) + 1; + } + } + + return res; } void llama_kv_self_clear(llama_context * ctx) { diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 77b2c0dbf..a2624d715 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -30,13 +30,14 @@ llama_kv_cache_unified::llama_kv_cache_unified( bool v_trans, bool offload, uint32_t kv_size, - uint32_t padding, + uint32_t n_seq_max, + uint32_t n_pad, uint32_t n_swa, - llama_swa_type swa_type) : model(model), hparams(model.hparams), v_trans(v_trans), padding(padding), n_swa(n_swa), swa_type(swa_type) { - GGML_ASSERT(kv_size % padding == 0 && "kv_size must be a multiple of padding"); + llama_swa_type swa_type) : + model(model), hparams(model.hparams), v_trans(v_trans), + n_seq_max(n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) { - this->type_k = type_k; - this->type_v = type_v; + GGML_ASSERT(kv_size % n_pad == 0); // create a context for each buffer type std::map ctx_map; @@ -129,8 +130,8 @@ llama_kv_cache_unified::llama_kv_cache_unified( const size_t memory_size_k = size_k_bytes(); const size_t memory_size_v = size_v_bytes(); - LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6d cells, %3d layers), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, - (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), + LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } @@ -442,7 +443,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { void llama_kv_cache_unified::defrag_sched(float thold) { // - do not defrag small contexts (i.e. < 2048 tokens) // - count the padding towards the number of used tokens - const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + padding)/n)) : 0.0f; + const float fragmentation = n >= 2048 ? 
std::max(0.0f, 1.0f - (float(used + n_pad)/n)) : 0.0f; // queue defragmentation for next llama_kv_cache_update if (fragmentation > thold) { @@ -558,7 +559,7 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) { // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - n = std::min(size, std::max(padding, GGML_PAD(cell_max(), padding))); + n = std::min(size, std::max(n_pad, GGML_PAD(cell_max(), n_pad))); #ifdef FIND_SLOT_DEBUG LLAMA_LOG_WARN("end: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa); @@ -567,20 +568,6 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) { return true; } -int32_t llama_kv_cache_unified::get_n_tokens() const { - int32_t result = 0; - - for (uint32_t i = 0; i < size; i++) { - result += cells[i].seq_id.size(); - } - - return result; -} - -int32_t llama_kv_cache_unified::get_used_cells() const { - return used; -} - bool llama_kv_cache_unified::get_can_shift() const { return true; } @@ -802,16 +789,6 @@ void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama } } -llama_pos llama_kv_cache_unified::get_pos_max() const { - llama_pos pos_max = -1; - - for (const auto & cell : cells) { - pos_max = std::max(pos_max, cell.pos); - } - - return pos_max; -} - size_t llama_kv_cache_unified::total_size() const { size_t size = 0; @@ -1501,11 +1478,8 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell llama_seq_id seq_id; io.read_to(&seq_id, sizeof(seq_id)); - // TODO: llama_kv_cache_unified should have a notion of max sequences - //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { - if (seq_id < 0) { - //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); - LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id); + if (seq_id < 0 || (uint32_t) seq_id >= n_seq_max) { + LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, n_seq_max); return false; } @@ -1655,17 +1629,17 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa( ggml_type type_v, bool v_trans, bool offload, - uint32_t kv_size, bool swa_full, + uint32_t kv_size, uint32_t n_seq_max, uint32_t n_batch, - uint32_t padding) : hparams(model.hparams) { + uint32_t n_pad) : hparams(model.hparams) { llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); }; llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); }; const uint32_t size_base = kv_size; - uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, padding)); + uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, n_pad)); // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size and disable pruning if (swa_full) { @@ -1680,14 +1654,14 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa( kv_base = std::make_unique( model, std::move(filter_base), type_k, type_v, - v_trans, offload, size_base, padding, + v_trans, offload, size_base, n_seq_max, n_pad, 0, LLAMA_SWA_TYPE_NONE); LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa); kv_swa = std::make_unique( model, std::move(filter_swa), type_k, type_v, - v_trans, offload, 
size_swa, padding, + v_trans, offload, size_swa, n_seq_max, n_pad, hparams.n_swa, hparams.swa_type); } @@ -1810,18 +1784,6 @@ bool llama_kv_cache_unified_iswa::find_slot(const llama_ubatch & batch) { return res; } -int32_t llama_kv_cache_unified_iswa::get_n_tokens() const { - return kv_base->get_n_tokens(); -} - -int32_t llama_kv_cache_unified_iswa::get_used_cells() const { - return kv_base->get_used_cells(); -} - -llama_pos llama_kv_cache_unified_iswa::get_pos_max() const { - return kv_base->get_pos_max(); -} - bool llama_kv_cache_unified_iswa::get_can_shift() const { return kv_base->get_size() == kv_swa->get_size(); } @@ -1853,19 +1815,17 @@ llama_kv_cache_recurrent::llama_kv_cache_recurrent( ggml_type type_k, ggml_type type_v, bool offload, - uint32_t kv_size) : hparams(model.hparams) { + uint32_t kv_size, + uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) { const int32_t n_layer = hparams.n_layer; - LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d\n", - __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer); + LLAMA_LOG_INFO("%s: kv_size = %u, n_seq_max = %u, type_k = '%s', type_v = '%s', n_layer = %d\n", + __func__, kv_size, n_seq_max, ggml_type_name(type_k), ggml_type_name(type_v), n_layer); head = 0; size = kv_size; used = 0; - this->type_k = type_k; - this->type_v = type_v; - cells.clear(); cells.resize(kv_size); @@ -2203,8 +2163,8 @@ void llama_kv_cache_recurrent::commit() { pending.ranges.clear(); } -bool llama_kv_cache_recurrent::update(llama_context & lctx) { - GGML_UNUSED(lctx); +bool llama_kv_cache_recurrent::update(llama_context & ctx) { + GGML_UNUSED(ctx); return false; } @@ -2265,7 +2225,7 @@ bool llama_kv_cache_recurrent::find_slot( if (seq_id < 0 || (uint32_t) seq_id >= size) { // too big seq_id // TODO: would it be possible to resize the cache instead? 
- LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size); + LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%u Try using a bigger --parallel value\n", __func__, seq_id, n_seq_max); return false; } if (j > 0) { @@ -2408,29 +2368,6 @@ bool llama_kv_cache_recurrent::find_slot( return n >= n_seqs; } -int32_t llama_kv_cache_recurrent::get_n_tokens() const { - int32_t result = 0; - - for (uint32_t i = 0; i < size; i++) { - result += cells[i].seq_id.size(); - } - - return result; -} - -int32_t llama_kv_cache_recurrent::get_used_cells() const { - return used; -} - -llama_pos llama_kv_cache_recurrent::get_pos_max() const { - llama_pos pos_max = -1; - for (const auto & cell : cells) { - pos_max = std::max(pos_max, cell.pos); - } - - return pos_max; -} - bool llama_kv_cache_recurrent::get_can_shift() const { return false; } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index bd0485bc6..191a1090a 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -55,10 +55,7 @@ struct llama_kv_cache : public llama_memory_i { // ============================================================================================================= // getters - virtual int32_t get_n_tokens() const = 0; - virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache - virtual llama_pos get_pos_max() const = 0; - virtual bool get_can_shift() const = 0; + virtual bool get_can_shift() const = 0; bool get_can_edit() const override { return get_can_shift(); } @@ -108,7 +105,8 @@ public: bool v_trans, bool offload, uint32_t kv_size, - uint32_t padding, + uint32_t n_seq_max, + uint32_t n_pad, uint32_t n_swa, llama_swa_type swa_type); @@ -150,12 +148,6 @@ public: // to the first cell of the slot. 
bool find_slot(const llama_ubatch & batch) override; - int32_t get_n_tokens() const override; - int32_t get_used_cells() const override; - - // TODO: better data structures to reduce the cost of this operation - llama_pos get_pos_max() const override; - bool get_can_shift() const override; // state write/load @@ -228,16 +220,15 @@ private: // computed before each graph build uint32_t n = 0; - // required padding - uint32_t padding = 1; + const uint32_t n_seq_max = 1; - ggml_type type_k = GGML_TYPE_F16; - ggml_type type_v = GGML_TYPE_F16; + // required padding + const uint32_t n_pad = 1; // SWA - uint32_t n_swa = 0; + const uint32_t n_swa = 0; - llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; + const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; std::vector ctxs; std::vector bufs; @@ -317,11 +308,11 @@ public: ggml_type type_v, bool v_trans, bool offload, - uint32_t kv_size, bool swa_full, + uint32_t kv_size, uint32_t n_seq_max, uint32_t n_batch, - uint32_t padding); + uint32_t n_pad); ~llama_kv_cache_unified_iswa() = default; @@ -358,12 +349,6 @@ public: bool find_slot(const llama_ubatch & batch) override; - int32_t get_n_tokens() const override; - int32_t get_used_cells() const override; - - // TODO: better data structures to reduce the cost of this operation - llama_pos get_pos_max() const override; - bool get_can_shift() const override; // state write/load @@ -432,7 +417,8 @@ public: ggml_type type_k, ggml_type type_v, bool offload, - uint32_t kv_size); + uint32_t kv_size, + uint32_t n_seq_max); ~llama_kv_cache_recurrent() = default; @@ -444,7 +430,7 @@ public: bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; - void seq_keep(llama_seq_id seq_id) override; + void seq_keep(llama_seq_id seq_id) override; void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override; void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; @@ -458,7 +444,7 @@ public: void restore() override; void commit() override; - bool update(llama_context & lctx) override; + bool update(llama_context & ctx) override; void defrag_sched(float thold) override; @@ -469,12 +455,6 @@ public: bool find_slot(const llama_ubatch & batch) override; - int32_t get_n_tokens() const override; - int32_t get_used_cells() const override; - - // TODO: better data structures to reduce the cost of this operation - llama_pos get_pos_max() const override; - bool get_can_shift() const override; // TODO: temporary methods - they are not really const as they do const_cast<>, fix this @@ -514,8 +494,7 @@ private: std::vector ranges; } pending; - ggml_type type_k = GGML_TYPE_F16; - ggml_type type_v = GGML_TYPE_F16; + const uint32_t n_seq_max = 1; std::vector ctxs; std::vector bufs; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 7c135e981..82557ea05 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -13203,7 +13203,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, GGML_TYPE_F32, GGML_TYPE_F32, cparams.offload_kqv, - std::max((uint32_t) 1, cparams.n_seq_max)); + std::max((uint32_t) 1, cparams.n_seq_max), + cparams.n_seq_max); } break; default: { @@ -13222,8 +13223,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, params.type_v, !cparams.flash_attn, cparams.offload_kqv, - cparams.n_ctx, params.swa_full, + cparams.n_ctx, cparams.n_seq_max, cparams.n_batch, padding); @@ -13238,6 +13239,7 
@@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, !cparams.flash_attn, cparams.offload_kqv, cparams.n_ctx, + cparams.n_seq_max, padding, hparams.n_swa, hparams.swa_type); diff --git a/tools/run/run.cpp b/tools/run/run.cpp index a189ae7fa..702c307d8 100644 --- a/tools/run/run.cpp +++ b/tools/run/run.cpp @@ -936,7 +936,7 @@ static int apply_chat_template(const struct common_chat_templates * tmpls, Llama // Function to tokenize the prompt static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt, std::vector & prompt_tokens, const LlamaData & llama_data) { - const bool is_first = llama_kv_self_used_cells(llama_data.context.get()) == 0; + const bool is_first = llama_kv_self_seq_pos_max(llama_data.context.get(), 0) == 0; const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); prompt_tokens.resize(n_prompt_tokens); @@ -952,7 +952,7 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt // Check if we have enough space in the context to evaluate this batch static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) { const int n_ctx = llama_n_ctx(ctx.get()); - const int n_ctx_used = llama_kv_self_used_cells(ctx.get()); + const int n_ctx_used = llama_kv_self_seq_pos_max(ctx.get(), 0); if (n_ctx_used + batch.n_tokens > n_ctx) { printf(LOG_COL_DEFAULT "\n"); printe("context size exceeded\n"); diff --git a/tools/server/server.cpp b/tools/server/server.cpp index f8b7ff062..3b1305e1a 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -1137,9 +1137,6 @@ struct server_task_result_metrics : server_task_result { int n_tasks_deferred; int64_t t_start; - int32_t kv_cache_tokens_count; - int32_t kv_cache_used_cells; - // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields uint64_t n_prompt_tokens_processed_total = 0; uint64_t t_prompt_processing_total = 0; @@ -1179,9 +1176,6 @@ struct server_task_result_metrics : server_task_result { { "n_decode_total", n_decode_total }, { "n_busy_slots_total", n_busy_slots_total }, - { "kv_cache_tokens_count", kv_cache_tokens_count }, - { "kv_cache_used_cells", kv_cache_used_cells }, - { "slots", slots_data }, }; } @@ -2771,9 +2765,6 @@ struct server_context { res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); res->t_start = metrics.t_start; - res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx); - res->kv_cache_used_cells = llama_kv_self_used_cells(ctx); - res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; res->t_prompt_processing_total = metrics.t_prompt_processing_total; res->n_tokens_predicted_total = metrics.n_tokens_predicted_total; @@ -3883,14 +3874,6 @@ int main(int argc, char ** argv) { {"name", "predicted_tokens_seconds"}, {"help", "Average generation throughput in tokens/s."}, {"value", res_metrics->n_tokens_predicted ? 1.e3 / res_metrics->t_tokens_generation * res_metrics->n_tokens_predicted : 0.} - },{ - {"name", "kv_cache_usage_ratio"}, - {"help", "KV-cache usage. 1 means 100 percent usage."}, - {"value", 1. 
* res_metrics->kv_cache_used_cells / params.n_ctx} - },{ - {"name", "kv_cache_tokens"}, - {"help", "KV-cache tokens."}, - {"value", (uint64_t) res_metrics->kv_cache_tokens_count} },{ {"name", "requests_processing"}, {"help", "Number of requests processing."}, From 42158ae2e8ead667a83f07247321ce85f32ace66 Mon Sep 17 00:00:00 2001 From: Dorin-Andrei Geman Date: Wed, 21 May 2025 16:07:57 +0300 Subject: [PATCH 03/18] server : fix first message identification (#13634) * server : fix first message identification When using the OpenAI SDK (https://github.com/openai/openai-node/blob/master/src/lib/ChatCompletionStream.ts#L623-L626) we noticed that the expected assistant role is missing in the first streaming message. Fix this by correctly checking for the first message. Co-authored-by: Piotr Stankiewicz Signed-off-by: Dorin Geman * server : Fix checks for first role message for stream=True Co-authored-by: Piotr Stankiewicz Signed-off-by: Dorin Geman --------- Signed-off-by: Dorin Geman Co-authored-by: Piotr Stankiewicz --- tools/server/server.cpp | 18 +++++- .../server/tests/unit/test_chat_completion.py | 56 ++++++++++++------- 2 files changed, 53 insertions(+), 21 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 3b1305e1a..d48cf46e4 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -951,7 +951,7 @@ struct server_task_result_cmpl_partial : server_task_result { } json to_json_oaicompat_chat() { - bool first = n_decoded == 0; + bool first = n_decoded == 1; std::time_t t = std::time(0); json choices; @@ -962,15 +962,18 @@ struct server_task_result_cmpl_partial : server_task_result { {"delta", json{{"role", "assistant"}}}}}); } else { // We have to send this as two updates to conform to openai behavior + // initial_ret is the role message for stream=True json initial_ret = json{{"choices", json::array({json{ {"finish_reason", nullptr}, {"index", 0}, {"delta", json{ - {"role", "assistant"} + {"role", "assistant"}, + {"content", ""} }}}})}, {"created", t}, {"id", oaicompat_cmpl_id}, {"model", oaicompat_model}, + {"system_fingerprint", build_info}, {"object", "chat.completion.chunk"}}; json second_ret = json{ @@ -982,8 +985,19 @@ struct server_task_result_cmpl_partial : server_task_result { {"created", t}, {"id", oaicompat_cmpl_id}, {"model", oaicompat_model}, + {"system_fingerprint", build_info}, {"object", "chat.completion.chunk"}}; + if (prob_output.probs.size() > 0) { + second_ret["choices"][0]["logprobs"] = json{ + {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)}, + }; + } + + if (timings.prompt_n >= 0) { + second_ret.push_back({"timings", timings.to_json()}); + } + return std::vector({initial_ret, second_ret}); } } else { diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py index 491cb3a5d..bab5d005d 100644 --- a/tools/server/tests/unit/test_chat_completion.py +++ b/tools/server/tests/unit/test_chat_completion.py @@ -71,8 +71,14 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte }) content = "" last_cmpl_id = None - for data in res: + for i, data in enumerate(res): choice = data["choices"][0] + if i == 0: + # Check first role message for stream=True + assert choice["delta"]["content"] == "" + assert choice["delta"]["role"] == "assistant" + else: + assert "role" not in choice["delta"] assert data["system_fingerprint"].startswith("b") assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in 
the future if last_cmpl_id is None: @@ -242,12 +248,18 @@ def test_chat_completion_with_timings_per_token(): "stream": True, "timings_per_token": True, }) - for data in res: - assert "timings" in data - assert "prompt_per_second" in data["timings"] - assert "predicted_per_second" in data["timings"] - assert "predicted_n" in data["timings"] - assert data["timings"]["predicted_n"] <= 10 + for i, data in enumerate(res): + if i == 0: + # Check first role message for stream=True + assert data["choices"][0]["delta"]["content"] == "" + assert data["choices"][0]["delta"]["role"] == "assistant" + else: + assert "role" not in data["choices"][0]["delta"] + assert "timings" in data + assert "prompt_per_second" in data["timings"] + assert "predicted_per_second" in data["timings"] + assert "predicted_n" in data["timings"] + assert data["timings"]["predicted_n"] <= 10 def test_logprobs(): @@ -295,17 +307,23 @@ def test_logprobs_stream(): ) output_text = '' aggregated_text = '' - for data in res: + for i, data in enumerate(res): choice = data.choices[0] - if choice.finish_reason is None: - if choice.delta.content: - output_text += choice.delta.content - assert choice.logprobs is not None - assert choice.logprobs.content is not None - for token in choice.logprobs.content: - aggregated_text += token.token - assert token.logprob <= 0.0 - assert token.bytes is not None - assert token.top_logprobs is not None - assert len(token.top_logprobs) > 0 + if i == 0: + # Check first role message for stream=True + assert choice.delta.content == "" + assert choice.delta.role == "assistant" + else: + assert choice.delta.role is None + if choice.finish_reason is None: + if choice.delta.content: + output_text += choice.delta.content + assert choice.logprobs is not None + assert choice.logprobs.content is not None + for token in choice.logprobs.content: + aggregated_text += token.token + assert token.logprob <= 0.0 + assert token.bytes is not None + assert token.top_logprobs is not None + assert len(token.top_logprobs) > 0 assert aggregated_text == output_text From 0d5c74216170ef97e5e7511563837263f2d1a496 Mon Sep 17 00:00:00 2001 From: Robin Davidsson <40024429+R-Dson@users.noreply.github.com> Date: Wed, 21 May 2025 15:15:27 +0200 Subject: [PATCH 04/18] server : Add the endpoints /api/tags and /api/chat (#13659) * Add the endpoints /api/tags and /api/chat Add the endpoints /api/tags and /api/chat, and improved the model metadata response * Remove trailing whitespaces * Removed code that is not needed for copilot to work. 
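
As an illustration of the two new routes, here is a minimal client sketch in Python. It assumes a llama-server instance already listening on http://localhost:8080 and uses the third-party requests package; only the endpoint paths, the "models" array, and the per-model "name" field are taken from this patch, while the response handling follows from /api/chat being routed to the existing OpenAI-compatible chat-completions handler.

    import requests

    base = "http://localhost:8080"  # assumed server address; adjust to your deployment

    # /api/tags is served by the same handler as /models and /v1/models,
    # so Ollama-style clients can use it to list the loaded model
    tags = requests.get(f"{base}/api/tags").json()
    print([m["name"] for m in tags["models"]])

    # /api/chat reuses the chat-completions handler and therefore accepts
    # OpenAI-style messages and returns an OpenAI-style response
    resp = requests.post(f"{base}/api/chat", json={
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": False,
    })
    print(resp.json()["choices"][0]["message"]["content"])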
--- tools/server/server.cpp | 42 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index d48cf46e4..087665e41 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3707,6 +3707,7 @@ int main(int argc, char ** argv) { "/health", "/models", "/v1/models", + "/api/tags" }; // If API key is not set, skip validation @@ -3745,7 +3746,7 @@ int main(int argc, char ** argv) { if (req.path == "/" || tmp.back() == "html") { res.set_content(reinterpret_cast(loading_html), loading_html_len, "text/html; charset=utf-8"); res.status = 503; - } else if (req.path == "/models" || req.path == "/v1/models") { + } else if (req.path == "/models" || req.path == "/v1/models" || req.path == "/api/tags") { // allow the models endpoint to be accessed during loading return true; } else { @@ -4083,6 +4084,19 @@ int main(int argc, char ** argv) { { "llama.context_length", ctx_server.slots.back().n_ctx, }, } }, + {"modelfile", ""}, + {"parameters", ""}, + {"template", common_chat_templates_source(ctx_server.chat_templates.get())}, + {"details", { + {"parent_model", ""}, + {"format", "gguf"}, + {"family", ""}, + {"families", {""}}, + {"parameter_size", ""}, + {"quantization_level", ""} + }}, + {"model_info", ""}, + {"capabilities", {"completion"}} }; res_ok(res, data); @@ -4408,6 +4422,28 @@ int main(int argc, char ** argv) { } json models = { + {"models", { + { + {"name", params.model_alias.empty() ? params.model.path : params.model_alias}, + {"model", params.model_alias.empty() ? params.model.path : params.model_alias}, + {"modified_at", ""}, + {"size", ""}, + {"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash + {"type", "model"}, + {"description", ""}, + {"tags", {""}}, + {"capabilities", {"completion"}}, + {"parameters", ""}, + {"details", { + {"parent_model", ""}, + {"format", "gguf"}, + {"family", ""}, + {"families", {""}}, + {"parameter_size", ""}, + {"quantization_level", ""} + }} + } + }}, {"object", "list"}, {"data", { { @@ -4417,7 +4453,7 @@ int main(int argc, char ** argv) { {"owned_by", "llamacpp"}, {"meta", model_meta}, }, - }} + }} }; res_ok(res, models); @@ -4745,11 +4781,13 @@ int main(int argc, char ** argv) { svr->Post("/api/show", handle_api_show); svr->Get ("/models", handle_models); // public endpoint (no API key check) svr->Get ("/v1/models", handle_models); // public endpoint (no API key check) + svr->Get ("/api/tags", handle_models); // ollama specific endpoint. 
public endpoint (no API key check) svr->Post("/completion", handle_completions); // legacy svr->Post("/completions", handle_completions); svr->Post("/v1/completions", handle_completions_oai); svr->Post("/chat/completions", handle_chat_completions); svr->Post("/v1/chat/completions", handle_chat_completions); + svr->Post("/api/chat", handle_chat_completions); // ollama specific endpoint svr->Post("/infill", handle_infill); svr->Post("/embedding", handle_embeddings); // legacy svr->Post("/embeddings", handle_embeddings); From cf4cb59e64d72a1b4c781f71a74de5756a4e2376 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Wed, 21 May 2025 16:26:33 +0200 Subject: [PATCH 05/18] ggml : add ggml_gelu_erf() (#13667) * ggml : add ggml_gelu_na (not approximated) * fix naming order * rename na --> erf * apply review suggesions * revert naming order --- ggml/include/ggml.h | 13 +++- ggml/src/ggml-cpu/ggml-cpu.c | 1 + ggml/src/ggml-cpu/ops.cpp | 107 +++++++++++++++++++++++++++ ggml/src/ggml-cpu/vec.h | 16 ++++ ggml/src/ggml-metal/ggml-metal.m | 24 ++++++ ggml/src/ggml-metal/ggml-metal.metal | 37 +++++++++ ggml/src/ggml.c | 17 ++++- 7 files changed, 213 insertions(+), 2 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index e91dedf14..c81ff03fe 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -528,14 +528,15 @@ extern "C" { GGML_UNARY_OP_STEP, GGML_UNARY_OP_TANH, GGML_UNARY_OP_ELU, - GGML_UNARY_OP_RELU, GGML_UNARY_OP_SIGMOID, GGML_UNARY_OP_GELU, + GGML_UNARY_OP_GELU_ERF, GGML_UNARY_OP_GELU_QUICK, GGML_UNARY_OP_SILU, GGML_UNARY_OP_HARDSWISH, GGML_UNARY_OP_HARDSIGMOID, GGML_UNARY_OP_EXP, + GGML_UNARY_OP_RELU, GGML_UNARY_OP_COUNT, }; @@ -1024,6 +1025,16 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + // GELU using erf (error function) when possible + // some backends may fallback to approximation based on Abramowitz and Stegun formula + GGML_API struct ggml_tensor * ggml_gelu_erf( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_gelu_erf_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_gelu_quick( struct ggml_context * ctx, struct ggml_tensor * a); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 133b50606..46f75ad97 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2202,6 +2202,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_SILU: { diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 955fec59a..26501b711 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -2691,6 +2691,109 @@ static void ggml_compute_forward_gelu( } } +// ggml_compute_forward_gelu_erf + +static void ggml_compute_forward_gelu_erf_f32( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_gelu_erf_f32(nc, + (float *) ((char *) dst->data + i1*( 
dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + GGML_UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_gelu_erf_f16( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_gelu_erf_f16(nc, + (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])), + (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float v = GGML_FP16_TO_FP32(x); + GGML_UNUSED(v); + assert(!isnan(v)); + assert(!isinf(v)); + } +#endif + } +} + +static void ggml_compute_forward_gelu_erf( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_gelu_erf_f32(params, dst); + } break; + case GGML_TYPE_F16: + { + ggml_compute_forward_gelu_erf_f16(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_gelu_quick static void ggml_compute_forward_gelu_quick_f32( @@ -7749,6 +7852,10 @@ void ggml_compute_forward_unary( { ggml_compute_forward_gelu(params, dst); } break; + case GGML_UNARY_OP_GELU_ERF: + { + ggml_compute_forward_gelu_erf(params, dst); + } break; case GGML_UNARY_OP_GELU_QUICK: { ggml_compute_forward_gelu_quick(params, dst); diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index 23cbb3051..c77349ebe 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -428,6 +428,7 @@ inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp static const float GELU_COEF_A = 0.044715f; static const float GELU_QUICK_COEF = -1.702f; static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; +static const float SQRT_2_INV = 0.70710678118654752440084436210484f; inline static float ggml_gelu_f32(float x) { return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); @@ -440,6 +441,14 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp } } +inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { + for (int i = 0; i < n; ++i) { + float xi = GGML_FP16_TO_FP32(x[i]); + float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV)); + y[i] = GGML_FP32_TO_FP16(res); + } +} + #ifdef GGML_GELU_FP16 inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { uint16_t t; @@ -463,6 +472,13 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { } #endif +inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) { + for (int i = 0; i < n; ++i) { + float xi = x[i]; + y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV)); + } +} + inline static float ggml_gelu_quick_f32(float x) { return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x))); } diff --git 
a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 85dbbcd5d..f78e7eee5 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -149,6 +149,8 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_SIGMOID, GGML_METAL_KERNEL_TYPE_GELU, GGML_METAL_KERNEL_TYPE_GELU_4, + GGML_METAL_KERNEL_TYPE_GELU_ERF, + GGML_METAL_KERNEL_TYPE_GELU_ERF_4, GGML_METAL_KERNEL_TYPE_GELU_QUICK, GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, GGML_METAL_KERNEL_TYPE_SILU, @@ -1103,6 +1105,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIGMOID, sigmoid, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_4, gelu_4, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_ERF, gelu_erf, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_ERF_4, gelu_erf_4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, gelu_quick_4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true); @@ -1613,6 +1617,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_SIGMOID: case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_ELU: @@ -2251,6 +2256,25 @@ static bool ggml_metal_encode_node( [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; + case GGML_UNARY_OP_GELU_ERF: + { + int64_t n = ggml_nelements(dst); + + id pipeline = nil; + + if (n % 4 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_ERF_4].pipeline; + n /= 4; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_ERF].pipeline; + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; case GGML_UNARY_OP_GELU_QUICK: { int64_t n = ggml_nelements(dst); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index f18473dcb..59899550e 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -856,6 +856,7 @@ kernel void kernel_tanh( constant float GELU_COEF_A = 0.044715f; constant float GELU_QUICK_COEF = -1.702f; constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; +constant float SQRT_2_INV = 0.70710678118654752440084436210484f; kernel void kernel_gelu( device const float * src0, @@ -897,6 +898,42 @@ kernel void kernel_gelu_quick_4( dst[tpig] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x))); } +// based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation +// ref: https://www.johndcook.com/blog/python_erf/ +constant float p_erf = 0.3275911f; +constant float a1_erf = 0.254829592f; +constant float a2_erf = -0.284496736f; +constant float a3_erf = 1.421413741f; +constant float a4_erf = -1.453152027f; +constant float a5_erf = 1.061405429f; + +template +T erf_approx(T x) { + T sign_x = sign(x); + x = fabs(x); + T t = 1.0f / (1.0f + p_erf * x); + T y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x); + return sign_x * y; +} + +kernel void kernel_gelu_erf( + device const float * src0, + device 
float * dst, + uint tpig[[thread_position_in_grid]]) { + device const float & x = src0[tpig]; + + dst[tpig] = 0.5f*x*(1.0f+erf_approx(x*SQRT_2_INV)); +} + +kernel void kernel_gelu_erf_4( + device const float4 * src0, + device float4 * dst, + uint tpig[[thread_position_in_grid]]) { + device const float4 & x = src0[tpig]; + + dst[tpig] = 0.5f*x*(1.0f+erf_approx(x*SQRT_2_INV)); +} + kernel void kernel_silu( device const float * src0, device float * dst, diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index d48adb9af..57d3e39ad 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1099,9 +1099,10 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = { "HARDSWISH", "HARDSIGMOID", "EXP", + "GELU_ERF", }; -static_assert(GGML_UNARY_OP_COUNT == 14, "GGML_UNARY_OP_COUNT != 14"); +static_assert(GGML_UNARY_OP_COUNT == 15, "GGML_UNARY_OP_COUNT != 15"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); @@ -2501,6 +2502,20 @@ struct ggml_tensor * ggml_gelu_inplace( return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU); } +// ggml_gelu_erf + +struct ggml_tensor * ggml_gelu_erf( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_ERF); +} + +struct ggml_tensor * ggml_gelu_erf_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_ERF); +} + // ggml_gelu_quick struct ggml_tensor * ggml_gelu_quick( From eb0f5c28d37126baa756117d5bdaadc62e03344e Mon Sep 17 00:00:00 2001 From: Emmanuel Ferdman Date: Wed, 21 May 2025 17:33:54 +0300 Subject: [PATCH 06/18] gguf-py : display the invalid gguf type (#13687) Signed-off-by: Emmanuel Ferdman --- gguf-py/gguf/gguf_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index 5991cdb76..d87e8f723 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -251,7 +251,7 @@ class GGUFReader: offs += curr_size return offs - orig_offs, aparts, data_idxs, types # We can't deal with this one. 
- raise ValueError('Unknown/unhandled field type {gtype}') + raise ValueError(f'Unknown/unhandled field type {gtype}') def _get_tensor_info_field(self, orig_offs: int) -> ReaderField: offs = orig_offs From 2aa777d86d3f7bb80b93e226f1c25e47825f6a83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Wed, 21 May 2025 16:57:38 +0200 Subject: [PATCH 07/18] examples : switch retrieval to llama_encode (#13685) * switch retrieval to llama_encode * enable --no-warmup for retrieval --- common/arg.cpp | 2 +- examples/retrieval/retrieval.cpp | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index b1754f30f..997f732cc 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1678,7 +1678,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.warmup = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"--spm-infill"}, string_format( diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 0efe20d4b..e3d0c9542 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -81,14 +81,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke } } -static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { +static void batch_encode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { // clear previous kv_cache values (irrelevant for embeddings) llama_kv_self_clear(ctx); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); - if (llama_decode(ctx, batch) < 0) { - LOG_ERR("%s : failed to decode\n", __func__); + if (llama_encode(ctx, batch) < 0) { + LOG_ERR("%s : failed to encode\n", __func__); } for (int i = 0; i < batch.n_tokens; i++) { @@ -233,7 +233,7 @@ int main(int argc, char ** argv) { // encode if at capacity if (batch.n_tokens + n_toks > n_batch) { float * out = emb + p * n_embd; - batch_decode(ctx, batch, out, s, n_embd); + batch_encode(ctx, batch, out, s, n_embd); common_batch_clear(batch); p += s; s = 0; @@ -246,7 +246,7 @@ int main(int argc, char ** argv) { // final batch float * out = emb + p * n_embd; - batch_decode(ctx, batch, out, s, n_embd); + batch_encode(ctx, batch, out, s, n_embd); // save embeddings to chunks for (int i = 0; i < n_chunks; i++) { @@ -267,7 +267,7 @@ int main(int argc, char ** argv) { batch_add_seq(query_batch, query_tokens, 0); std::vector query_emb(n_embd, 0); - batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd); + batch_encode(ctx, query_batch, query_emb.data(), 1, n_embd); common_batch_clear(query_batch); From c76532e7ba128bb097bf6836bf0f5592e1b56b76 Mon Sep 17 00:00:00 2001 From: antichristHater <142441588+antichristHater@users.noreply.github.com> Date: Wed, 21 May 2025 19:40:35 +0300 Subject: [PATCH 08/18] convert : add qwen2vl support for unsloth merges (#13686) --- convert_hf_to_gguf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 15e019a10..e88076ccc 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2645,7 +2645,7 @@ class Qwen2Model(TextModel): yield from super().modify_tensors(data_torch, name, bid) 
-@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") +@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") class Qwen2VLModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2VL @@ -2669,7 +2669,7 @@ class Qwen2VLModel(TextModel): return [(self.map_tensor_name(name), data_torch)] -@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") +@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") class Qwen2VLVisionModel(VisionModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) From 5fbfe384d4659f81c47a477eb8ee97692c7ffef9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 21 May 2025 19:46:56 +0300 Subject: [PATCH 09/18] server : improve error reporting (#13680) --- tools/server/server.cpp | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 087665e41..7424da523 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3366,14 +3366,29 @@ struct server_context { metrics.on_decoded(slots); if (ret != 0) { - if (n_batch == 1 || ret < 0) { - // if you get here, it means the KV cache is full - try increasing it via the context size - SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); - for (auto & slot : slots) { - slot.release(); - send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size."); + { + std::string err; + + if (n_batch == 1 && ret == 1) { + err = "Context size has been exceeded."; + } + + if (ret == -1) { + err = "Invalid input batch."; + } + + if (ret < -1) { + err = "Compute error."; + } + + if (!err.empty()) { + SRV_ERR("%s, i = %d, n_batch = %d, ret = %d\n", err.c_str(), i, n_batch, ret); + for (auto & slot : slots) { + slot.release(); + send_error(slot, err); + } + break; } - break; // break loop of n_batch } // retry with half the batch size to try to find a free slot in the KV cache From 8e186ef0e764c7a620e402d1f76ebad60bf31c49 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 21 May 2025 20:00:49 +0300 Subject: [PATCH 10/18] hparams : support models for which all layers use SWA (#13682) ggml-ci --- src/llama-hparams.cpp | 2 +- src/llama-hparams.h | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 90dfe7a7f..4f84e56b3 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -72,7 +72,7 @@ uint32_t llama_hparams::n_embd_v_s() const { bool llama_hparams::is_swa(uint32_t il) const { if (il < n_layer) { - return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1); + return n_swa_pattern == 0 || (il % n_swa_pattern < (n_swa_pattern - 1)); } GGML_ABORT("fatal error"); diff --git a/src/llama-hparams.h b/src/llama-hparams.h index f865cbaea..5222eedcf 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -104,7 +104,18 @@ struct llama_hparams { llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; uint32_t n_swa = 0; // the size of the sliding window (0 - no SWA) - uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention + uint32_t n_swa_pattern = 1; // this value n means that every nth layer is dense (i.e. 
non-SWA) + // by default n == 1, all layers are dense + // note that if n_swa_pattern == 0, all layers are SWA + // example: n_swa_pattern = 3 + // il == 0: swa + // il == 1: swa + // il == 2: dense + // il == 3: swa + // il == 4: swa + // il == 5: dense + // il == 6: swa + // etc ... // for State Space Models uint32_t ssm_d_conv = 0; From d643bb2c798df9c2cd61067d2692b1cd417df402 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Wed, 21 May 2025 13:09:57 -0700 Subject: [PATCH 11/18] releases : build CPU backend separately (windows) (#13642) --- .github/workflows/release.yml | 267 ++++++++++++++++++---------------- 1 file changed, 142 insertions(+), 125 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ed827bf70..494ea5294 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,4 +1,4 @@ -name: Create Release +name: Release on: workflow_dispatch: # allows manual triggering @@ -227,6 +227,66 @@ jobs: path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip name: llama-bin-ubuntu-vulkan-x64.zip + windows-cpu: + runs-on: windows-latest + + strategy: + matrix: + include: + - arch: 'x64' + - arch: 'arm64' + + steps: + - name: Clone + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: windows-latest-cmake-cpu-${{ matrix.arch }} + variant: ccache + evict-old-files: 1d + + - name: Install Ninja + run: | + choco install ninja + + - name: libCURL + id: get_libcurl + uses: ./.github/actions/windows-setup-curl + with: + architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }} + + - name: Build + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} + run: | + cmake -S . -B build -G "Ninja Multi-Config" ` + -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ` + -DGGML_NATIVE=OFF ` + -DGGML_BACKEND_DL=ON ` + -DGGML_CPU_ALL_VARIANTS=ON ` + -DGGML_OPENMP=OFF ` + -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" ` + ${{ env.CMAKE_ARGS }} + cmake --build build --config Release + + - name: Pack artifacts + id: pack_artifacts + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} + run: | + Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\ + 7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-bin-win-cpu-${{ matrix.arch }}.zip + name: llama-bin-win-cpu-${{ matrix.arch }}.zip + windows: runs-on: windows-latest @@ -237,52 +297,30 @@ jobs: strategy: matrix: include: - - build: 'cpu-x64' + - backend: 'vulkan' arch: 'x64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF' - #- build: 'openblas-x64' - # arch: 'x64' - # defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - - build: 'vulkan-x64' - arch: 'x64' - defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON' - - build: 'cpu-arm64' - arch: 'arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF' - 
- build: 'opencl-adreno-arm64' + defines: '-DGGML_VULKAN=ON' + target: 'ggml-vulkan' + - backend: 'opencl-adreno' arch: 'arm64' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' + target: 'ggml-opencl' steps: - name: Clone id: checkout uses: actions/checkout@v4 - with: - fetch-depth: 0 - name: ccache uses: hendrikmuhs/ccache-action@v1.2.16 with: - key: windows-latest-cmake-${{ matrix.build }} + key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }} variant: ccache evict-old-files: 1d - - name: Download OpenBLAS - id: get_openblas - if: ${{ matrix.build == 'openblas-x64' }} - run: | - curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip" - curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE" - mkdir $env:RUNNER_TEMP/openblas - tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas - $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath) - $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim())) - $lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe') - & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll - - name: Install Vulkan SDK id: get_vulkan - if: ${{ matrix.build == 'vulkan-x64' }} + if: ${{ matrix.backend == 'vulkan' }} run: | curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe" & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install @@ -296,7 +334,7 @@ jobs: - name: Install OpenCL Headers and Libs id: install_opencl - if: ${{ matrix.build == 'opencl-adreno-arm64' }} + if: ${{ matrix.backend == 'opencl-adreno' && matrix.arch == 'arm64' }} run: | git clone https://github.com/KhronosGroup/OpenCL-Headers cd OpenCL-Headers @@ -314,46 +352,22 @@ jobs: -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" cmake --build build-arm64-release --target install --config release - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - with: - architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }} - - name: Build id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | - cmake -S . -B build ${{ matrix.defines }} ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" ` - ${{ env.CMAKE_ARGS }} - cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} - - - name: Add libopenblas.dll - id: add_libopenblas_dll - if: ${{ matrix.build == 'openblas-x64' }} - run: | - cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll - cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name + cmake -S . 
-B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF + cmake --build build --config Release --target ${{ matrix.target }} - name: Pack artifacts id: pack_artifacts - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | - Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\ - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\* + 7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll - name: Upload artifacts uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip - name: llama-bin-win-${{ matrix.build }}.zip + path: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip + name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip windows-cuda: runs-on: windows-2019 @@ -366,8 +380,6 @@ jobs: - name: Clone id: checkout uses: actions/checkout@v4 - with: - fetch-depth: 0 - name: Install ccache uses: hendrikmuhs/ccache-action@v1.2.16 @@ -386,45 +398,30 @@ jobs: run: | choco install ninja - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - name: Build id: cmake_build shell: cmd - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" cmake -S . -B build -G "Ninja Multi-Config" ^ - -DGGML_NATIVE=OFF ^ -DGGML_BACKEND_DL=ON ^ - -DGGML_CPU_ALL_VARIANTS=ON ^ + -DGGML_NATIVE=OFF ^ + -DGGML_CPU=OFF ^ -DGGML_CUDA=ON ^ - -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^ - ${{ env.CMAKE_ARGS }} + -DLLAMA_CURL=OFF set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 - cmake --build build --config Release -j %NINJA_JOBS% -t ggml - cmake --build build --config Release - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name + cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda - name: Pack artifacts id: pack_artifacts - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | - cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip .\build\bin\Release\* + 7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll - name: Upload artifacts uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip - name: llama-bin-win-cuda${{ matrix.cuda }}-x64.zip + path: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip + name: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip - name: Copy and pack Cuda runtime run: | @@ -432,13 +429,13 @@ jobs: $dst='.\build\bin\cudart\' robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll - 7z a cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip $dst\* + 7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\* - name: Upload Cuda runtime uses: actions/upload-artifact@v4 with: - path: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip - name: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip + path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip + name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip windows-sycl: runs-on: windows-latest @@ -455,8 +452,6 @@ jobs: - name: Clone id: checkout 
uses: actions/checkout@v4 - with: - fetch-depth: 0 - name: ccache uses: hendrikmuhs/ccache-action@v1.2.16 @@ -469,15 +464,18 @@ jobs: run: | scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args - - name: Build id: cmake_build - run: examples/sycl/win-build-sycl.bat - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name + shell: cmd + run: | + call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force + cmake -G "Ninja" -B build ^ + -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^ + -DGGML_CPU=OFF -DGGML_SYCL=ON ^ + -DLLAMA_CURL=OFF + cmake --build build --target ggml-sycl -j - name: Build the release package id: pack_artifacts @@ -502,12 +500,12 @@ jobs: cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin echo "cp oneAPI running time dll files to ./build/bin done" - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* + 7z a llama-bin-win-sycl-x64.zip ./build/bin/* - name: Upload the release package uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip + path: llama-bin-win-sycl-x64.zip name: llama-bin-win-sycl-x64.zip windows-hip: @@ -521,8 +519,6 @@ jobs: - name: Clone id: checkout uses: actions/checkout@v4 - with: - fetch-depth: 0 - name: Clone rocWMMA repository id: clone_rocwmma @@ -532,7 +528,7 @@ jobs: - name: ccache uses: hendrikmuhs/ccache-action@v1.2.16 with: - key: windows-latest-cmake-hip-release + key: windows-latest-cmake-hip-${{ matrix.gpu_target }}-x64 evict-old-files: 1d - name: Install @@ -550,14 +546,8 @@ jobs: run: | & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - name: Build id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" @@ -569,31 +559,23 @@ jobs: -DAMDGPU_TARGETS=${{ matrix.gpu_target }} ` -DGGML_HIP_ROCWMMA_FATTN=ON ` -DGGML_HIP=ON ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" ` - ${{ env.CMAKE_ARGS }} - cmake --build build -j ${env:NUMBER_OF_PROCESSORS} + -DLLAMA_CURL=OFF + cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS} md "build\bin\rocblas\library\" cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name - - name: Pack artifacts id: pack_artifacts - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | - cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\* + 7z a llama-bin-win-hip-${{ matrix.gpu_target }}-x64.zip .\build\bin\* - name: Upload artifacts uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip - name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip + path: llama-bin-win-hip-${{ matrix.gpu_target }}-x64.zip + name: llama-bin-win-hip-${{ matrix.gpu_target }}-x64.zip ios-xcode-build: runs-on: 
macos-latest @@ -655,14 +637,16 @@ jobs: runs-on: ubuntu-latest needs: - - ubuntu-22-cpu - - ubuntu-22-vulkan - windows + - windows-cpu - windows-cuda - windows-sycl - windows-hip + - ubuntu-22-cpu + - ubuntu-22-vulkan - macOS-arm64 - macOS-x64 + - ios-xcode-build steps: - name: Clone @@ -680,10 +664,43 @@ jobs: uses: actions/download-artifact@v4 with: path: ./artifact + merge-multiple: true - name: Move artifacts id: move_artifacts - run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release + run: | + mkdir -p release + + echo "Adding CPU backend files to existing zips..." + for arch in x64 arm64; do + cpu_zip="artifact/llama-bin-win-cpu-${arch}.zip" + temp_dir=$(mktemp -d) + echo "Extracting CPU backend for $arch..." + unzip "$cpu_zip" -d "$temp_dir" + + echo "Adding CPU files to $arch zips..." + for target_zip in artifact/llama-bin-win-*-${arch}.zip; do + if [[ "$target_zip" == "$cpu_zip" ]]; then + continue + fi + echo "Adding CPU backend to $(basename "$target_zip")" + realpath_target_zip=$(realpath "$target_zip") + (cd "$temp_dir" && zip -r "$realpath_target_zip" .) + done + + rm -rf "$temp_dir" + done + + echo "Renaming and moving zips to release..." + for zip_file in artifact/llama-bin-win-*.zip; do + base_name=$(basename "$zip_file" .zip) + zip_name="llama-${{ steps.tag.outputs.name }}-${base_name#llama-}.zip" + echo "Moving $zip_file to release/$zip_name" + mv "$zip_file" "release/$zip_name" + done + + echo "Moving other artifacts..." + mv -v artifact/*.zip release - name: Create release id: create_release @@ -702,7 +719,7 @@ jobs: const path = require('path'); const fs = require('fs'); const release_id = '${{ steps.create_release.outputs.id }}'; - for (let file of await fs.readdirSync('./artifact/release')) { + for (let file of await fs.readdirSync('./release')) { if (path.extname(file) === '.zip') { console.log('uploadReleaseAsset', file); await github.repos.uploadReleaseAsset({ @@ -710,7 +727,7 @@ jobs: repo: context.repo.repo, release_id: release_id, name: file, - data: await fs.readFileSync(`./artifact/release/${file}`) + data: await fs.readFileSync(`./release/${file}`) }); } } From edbf42edfdabb9cea72ae12137570cf48f5d8076 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henry=20Linjam=C3=A4ki?= Date: Wed, 21 May 2025 23:21:17 +0300 Subject: [PATCH 12/18] opencl: fix couple crashes (#12795) * opencl: fix couple crashes * fix kernel launches failed on devices which do not support non-uniform work-groups. When non-uniform work-groups are not supported, set `local_work_size` to NULL (= let driver choose the work-group sizes). This patch does not cover everything - just the cases tested by test-backend-ops. * fix sub-buffer creation failed due to `cl_buffer_region::origin` not being aligned to `CL_DEVICE_MEM_BASE_ADDR_ALIGN`. 
* OpenCL: query non-uniform WG sizes only on OpenCL 3.0+ --- ggml/src/ggml-opencl/ggml-opencl.cpp | 102 +++++++++++++++++++++------ 1 file changed, 79 insertions(+), 23 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 586946048..3b8313761 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -74,6 +74,7 @@ struct ggml_cl_version { cl_uint minor = 0; }; + struct ggml_cl_compiler_version { ADRENO_CL_COMPILER_TYPE type; int major = -1; @@ -91,6 +92,14 @@ struct ggml_cl_compiler_version { } }; +static size_t align_to(size_t value, size_t to_alignment) { + GGML_ASSERT(to_alignment && "Invalid alignment (must be non-zero)"); + GGML_ASSERT((to_alignment & (to_alignment - 1)) == 0 && "to_alignment must be power-of-two"); + + return ((value + to_alignment - 1) / to_alignment) * to_alignment; +} + + // Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes. static ggml_cl_version parse_cl_version(std::string_view str) { size_t major_str_begin = 0; @@ -248,6 +257,8 @@ struct ggml_backend_opencl_context { int adreno_wave_size; + cl_bool non_uniform_workgroups; + cl_context context; cl_command_queue queue; @@ -1397,6 +1408,15 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n", svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false"); + if (opencl_c_version.major >= 3) { + CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool), + &backend_ctx->non_uniform_workgroups, 0)); + } else { + GGML_ASSERT(opencl_c_version.major == 2); + // Non-uniform workgroup sizes is mandatory feature in v2.x. + backend_ctx->non_uniform_workgroups = true; + } + // Print out configurations #ifdef GGML_OPENCL_SOA_Q GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n"); @@ -2058,15 +2078,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, // The original tensor memory is divided into scales and quants, i.e., // we first store scales, then quants. // Create subbuffer for scales. - region.origin = extra_orig->offset + tensor->view_offs + offset; + region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment); region.size = size_d; extra->d = clCreateSubBuffer( extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); CL_CHECK(err); + auto previous_origin = region.origin; // Create subbuffer for quants. - region.origin = extra_orig->offset + tensor->view_offs + offset + size_d; + region.origin = align_to(previous_origin + size_d, backend_ctx->alignment); region.size = size_q; extra->q = clCreateSubBuffer( extra_orig->data_device, CL_MEM_READ_WRITE, @@ -2942,14 +2963,19 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; + size_t * local_work_size_ptr = local_work_size; + if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. 
+ } + #ifdef GGML_OPENCL_PROFILING cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); #endif } else { unsigned int nth = MIN(64, ne0); @@ -3077,14 +3103,19 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; + size_t * local_work_size_ptr = local_work_size; + if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. + } + #ifdef GGML_OPENCL_PROFILING cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); #endif } else { unsigned int nth = MIN(64, ne0); @@ -3233,14 +3264,19 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; + size_t * local_work_size_ptr = local_work_size; + if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. + } + #ifdef GGML_OPENCL_PROFILING cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); #endif } @@ -3273,14 +3309,19 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; + size_t * local_work_size_ptr = local_work_size; + if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. 
+ } + #ifdef GGML_OPENCL_PROFILING cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); #endif } @@ -3320,14 +3361,19 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; + size_t * local_work_size_ptr = local_work_size; + if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. + } + #ifdef GGML_OPENCL_PROFILING cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); #endif } @@ -4230,14 +4276,19 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; + size_t * local_work_size_ptr = local_work_size; + if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. + } + #ifdef GGML_OPENCL_PROFILING cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); #endif } @@ -4418,14 +4469,19 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02}; size_t local_work_size[] = {64, 1, 1}; + size_t * local_work_size_ptr = local_work_size; + if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. 
+ } + #ifdef GGML_OPENCL_PROFILING cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); #endif } } From a4e8912dfd4604be1e39bc86ba4c0b02969967ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henry=20Linjam=C3=A4ki?= Date: Thu, 22 May 2025 02:21:45 +0300 Subject: [PATCH 13/18] opencl: Add support for multiple devices (#12622) * opencl: Add support for multiple devices ... but limited to one platform. A platform with a GPU will be preferred. Additionally: * Filter out devices that lack capabilities needed by the backend implementation (half support, OpenCL 2.0+, etc). * Make ggml_backend_opencl_reg() thread-safe. * fixup: fix an error in sync_with_other_backends ... when there is only one OpenCL device available. --- ggml/src/ggml-opencl/ggml-opencl.cpp | 370 +++++++++++++++++---------- 1 file changed, 237 insertions(+), 133 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 3b8313761..d5412069e 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #undef MIN #undef MAX @@ -230,13 +231,25 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive return { type, major, minor, patch }; } +struct ggml_backend_opencl_context; + // backend device context struct ggml_backend_opencl_device_context { cl_platform_id platform; std::string platform_name; - cl_device_id device; - std::string device_name; + cl_device_id device; + std::string device_name; + cl_device_type device_type; + std::string device_version; + + // Initialized by ggml_cl2_init(). + ggml_backend_opencl_context * backend_ctx = nullptr; + + // Initialized by ggml_backend_opencl_device_get_buffer_type() + ggml_backend_buffer_type buffer_type; + + cl_context context = nullptr; }; // backend context @@ -355,15 +368,8 @@ struct ggml_backend_opencl_context { #endif // GGML_OPENCL_USE_ADRENO_KERNELS }; -static ggml_backend_device g_ggml_backend_opencl_device; -static ggml_backend_opencl_device_context g_ggml_ctx_dev_main { - /*.platform =*/ nullptr, - /*.platform_nane =*/ "", - /*.device =*/ nullptr, - /*.device_name =*/ "", -}; - -static int ggml_backend_opencl_n_devices = 0; +// All registered devices with a default device in the front. 
+static std::vector g_ggml_backend_opencl_devices; // Profiling #ifdef GGML_OPENCL_PROFILING @@ -1118,25 +1124,19 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("\n"); } -static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { - static bool initialized = false; - static ggml_backend_opencl_context *backend_ctx = nullptr; +// XXX static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { +// XXX static bool initialized = false; +// XXX static ggml_backend_opencl_context *backend_ctx = nullptr; - if (initialized) { - return backend_ctx; - } +static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev); - ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *)dev->context; - GGML_ASSERT(dev_ctx); - GGML_ASSERT(dev_ctx->platform == nullptr); - GGML_ASSERT(dev_ctx->device == nullptr); - GGML_ASSERT(backend_ctx == nullptr); +namespace /* anonymous */ { +extern struct ggml_backend_device_i ggml_backend_opencl_device_i; +} - initialized = true; - backend_ctx = new ggml_backend_opencl_context(); - backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; - - cl_int err; +// Look for available and suitable devices. +static std::vector ggml_opencl_probe_devices(ggml_backend_reg * reg) { + std::vector found_devices; #ifdef GGML_OPENCL_PROFILING GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n"); @@ -1169,11 +1169,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { struct cl_device devices[NDEV]; unsigned n_devices = 0; struct cl_device * default_device = NULL; + unsigned default_platform_number = 0; cl_platform_id platform_ids[NPLAT]; if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) { GGML_LOG_ERROR("ggml_opencl: plaform IDs not available.\n"); - return backend_ctx; + return found_devices; } for (unsigned i = 0; i < n_platforms; i++) { @@ -1208,19 +1209,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { } if (default_device == NULL && p->default_device != NULL) { - default_device = p->default_device; + default_device = p->default_device; + default_platform_number = i; } } if (n_devices == 0) { GGML_LOG_ERROR("ggml_opencl: could find any OpenCL devices.\n"); - return backend_ctx; + return found_devices; } - char * user_platform_string = getenv("GGML_OPENCL_PLATFORM"); - char * user_device_string = getenv("GGML_OPENCL_DEVICE"); - int user_platform_number = -1; - int user_device_number = -1; + char * user_platform_string = getenv("GGML_OPENCL_PLATFORM"); + char * user_device_string = getenv("GGML_OPENCL_DEVICE"); + int user_platform_number = -1; + int user_device_number = -1; + cl_device * candidate_devices = nullptr; + unsigned n_candidate_devices = 0; unsigned n; if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) { @@ -1235,12 +1239,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { GGML_LOG_ERROR("ggml_opencl: invalid device number %d\n", user_device_number); exit(1); } - default_device = &platform->devices[user_device_number]; + default_device = &platform->devices[user_device_number]; + candidate_devices = platform->devices; + n_candidate_devices = platform->n_devices; } else { - - struct cl_device * selected_devices = devices; - unsigned n_selected_devices = n_devices; - + // Choose a platform by matching a substring. 
if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) { for (unsigned i = 0; i < n_platforms; i++) { struct cl_platform * p = &platforms[i]; @@ -1255,20 +1258,20 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { exit(1); } } - if (user_platform_number != -1) { - struct cl_platform * p = &platforms[user_platform_number]; - selected_devices = p->devices; - n_selected_devices = p->n_devices; - default_device = p->default_device; - if (n_selected_devices == 0) { - GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name); - exit(1); - } + + int platform_idx = user_platform_number != -1 ? user_platform_number : default_platform_number; + struct cl_platform * p = &platforms[platform_idx]; + candidate_devices = p->devices; + n_candidate_devices = p->n_devices; + default_device = p->default_device; + if (n_candidate_devices == 0) { + GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name); + exit(1); } if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) { - for (unsigned i = 0; i < n_selected_devices; i++) { - struct cl_device * d = &selected_devices[i]; + for (unsigned i = 0; i < n_candidate_devices; i++) { + struct cl_device * d = &candidate_devices[i]; if (strstr(d->name, user_device_string) != NULL) { user_device_number = d->number; break; @@ -1280,71 +1283,145 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { } } if (user_device_number != -1) { - selected_devices = &devices[user_device_number]; - n_selected_devices = 1; - default_device = &selected_devices[0]; + candidate_devices = &devices[user_device_number]; + n_candidate_devices = 1; + default_device = &candidate_devices[0]; } - GGML_ASSERT(n_selected_devices > 0); + GGML_ASSERT(n_candidate_devices > 0); if (default_device == NULL) { - default_device = &selected_devices[0]; + default_device = &candidate_devices[0]; } } - GGML_LOG_INFO("ggml_opencl: selecting platform: '%s'\n", default_device->platform->name); - GGML_LOG_INFO("ggml_opencl: selecting device: '%s (%s)'\n", default_device->name, default_device->version); - if (default_device->type != CL_DEVICE_TYPE_GPU) { - GGML_LOG_WARN("ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name); + GGML_ASSERT(n_candidate_devices != 0 && candidate_devices); + + // Put the default device in front. 
+ for (unsigned i = 1; i < n_candidate_devices; i++) { + if (&candidate_devices[i] == default_device) { + std::swap(candidate_devices[0], candidate_devices[i]); + default_device = &candidate_devices[0]; + break; + } } - dev_ctx->platform = default_device->platform->id; - dev_ctx->device = default_device->id; - backend_ctx->device = default_device->id; + GGML_LOG_INFO("ggml_opencl: selected platform: '%s'\n", default_device->platform->name); - if (strstr(default_device->name, "Adreno") || - strstr(default_device->name, "Qualcomm") || - strstr(default_device->version, "Adreno")) { + std::vector device_ids; + for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) { + device_ids.push_back(dev->id); + } + + cl_int err; + cl_context shared_context; + cl_context_properties properties[] = { (intptr_t) CL_CONTEXT_PLATFORM, (intptr_t) default_device->platform->id, 0 }; + + CL_CHECK( + (shared_context = clCreateContext(properties, device_ids.size(), device_ids.data(), NULL, NULL, &err), err)); + + for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) { + GGML_LOG_INFO("\nggml_opencl: device: '%s (%s)'\n", dev->name, dev->version); + + auto dev_ctx = std::unique_ptr(new ggml_backend_opencl_device_context{ + /*.platform =*/dev->platform->id, + /*.platform_nane =*/dev->platform->name, + /*.device =*/dev->id, + /*.device_name =*/dev->name, + /*.device_type =*/dev->type, + /*.device_version =*/dev->version, + /*.backend_ctx =*/nullptr, + /*.buffer_type =*/{}, + /*.context =*/shared_context, + }); + + found_devices.push_back(ggml_backend_device{ + /* .iface = */ ggml_backend_opencl_device_i, + /* .reg = */ reg, + /* .context = */ dev_ctx.get(), + }); + + if (!ggml_cl2_init(&found_devices.back())) { + found_devices.pop_back(); + GGML_LOG_INFO("ggml_opencl: drop unsupported device.\n"); + continue; + } + + dev_ctx.release(); + } + + if (found_devices.size()) { + auto * dev_ctx = static_cast(found_devices.front().context); + GGML_LOG_INFO("ggml_opencl: default device: '%s (%s)'\n", dev_ctx->device_name.c_str(), + dev_ctx->device_version.c_str()); + + if (dev_ctx->device_type != CL_DEVICE_TYPE_GPU) { + GGML_LOG_WARN("ggml_opencl: warning, the default device is not a GPU: '%s'.\n", + dev_ctx->device_name.c_str()); + } + } + + return found_devices; +} + +// Initialize device if it is supported (returns nullptr if it is not). 
+static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { + GGML_ASSERT(dev); + GGML_ASSERT(dev->context); + + ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context; + GGML_ASSERT(dev_ctx->platform); + GGML_ASSERT(dev_ctx->device); + + if (dev_ctx->backend_ctx) { + return dev_ctx->backend_ctx; + } + + auto backend_ctx = std::make_unique(); + backend_ctx->device = dev_ctx->device; + backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; + + if (strstr(dev_ctx->device_name.c_str(), "Adreno") || + strstr(dev_ctx->device_name.c_str(), "Qualcomm") || + strstr(dev_ctx->device_version.c_str(), "Adreno")) { backend_ctx->gpu_family = GPU_FAMILY::ADRENO; // Usually device version contains the detailed device name - backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->version); + backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str()); if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) { - backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name); + backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str()); } // Use wave size of 64 for all Adreno GPUs. backend_ctx->adreno_wave_size = 64; - } else if (strstr(default_device->name, "Intel")) { + } else if (strstr(dev_ctx->device_name.c_str(), "Intel")) { backend_ctx->gpu_family = GPU_FAMILY::INTEL; } else { - GGML_LOG_ERROR("Unsupported GPU: %s\n", default_device->name); + GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str()); backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; - return backend_ctx; + return nullptr; } #ifdef GGML_OPENCL_USE_ADRENO_KERNELS if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) { GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; " "run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n"); - return backend_ctx; + return nullptr; } #endif // Populate backend device name - dev_ctx->platform_name = default_device->platform->name; - dev_ctx->device_name = default_device->name; - backend_ctx->device_name = default_device->name; + backend_ctx->device_name = dev_ctx->device_name; // A local ref of cl_device_id for convenience cl_device_id device = backend_ctx->device; - ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id); + ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform); // Check device OpenCL version, OpenCL 2.0 or above is required ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device); if (opencl_c_version.major < 2) { GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n"); - return backend_ctx; + return nullptr; } // Check driver version @@ -1375,7 +1452,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { // fp16 is required if (!backend_ctx->fp16_support) { GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n"); - return backend_ctx; + return nullptr; } // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes @@ -1384,7 +1461,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { strstr(ext_buffer, "cl_intel_subgroups") == NULL) { GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) " "(note that subgroups is an optional feature in OpenCL 3.0)\n"); - return backend_ctx; + return nullptr; } cl_uint base_align_in_bits; @@ -1426,14 +1503,10 @@ static 
ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n"); #endif // GGML_OPENCL_USE_ADRENO_KERNELS - cl_context_properties properties[] = { - (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)dev_ctx->platform, 0 - }; - - CL_CHECK((backend_ctx->context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err)); + cl_int err; // A local ref of cl_context for convenience - cl_context context = backend_ctx->context; + cl_context context = backend_ctx->context = dev_ctx->context; //CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err), // (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err : @@ -1446,7 +1519,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err)); // Load kernels - load_cl_kernels(backend_ctx, opencl_c_version); + load_cl_kernels(backend_ctx.get(), opencl_c_version); #ifdef GGML_OPENCL_USE_ADRENO_KERNELS // Allocate intermediate buffers and images @@ -1476,10 +1549,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err)); #endif // GGML_OPENCL_USE_ADRENO_KERNELS - // For now we support a single devices - ggml_backend_opencl_n_devices = 1; - - return backend_ctx; + dev_ctx->backend_ctx = backend_ctx.release(); + return dev_ctx->backend_ctx; } static void ggml_cl2_free(void) { @@ -1684,10 +1755,46 @@ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) { GGML_UNUSED(backend); } +// Syncronizes the 'backend_ctx's device with others so that commands +// enqueued to it won't start until commands in the other devices have +// completed. +static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) { + if (g_ggml_backend_opencl_devices.size() < 2) + return; // No other devices to synchronize with. + + std::vector events; + events.reserve(g_ggml_backend_opencl_devices.size()); + + for (ggml_backend_device & backend_dev : g_ggml_backend_opencl_devices) { + auto * other_backend_ctx = ggml_cl2_init(&backend_dev); + if (backend_ctx != other_backend_ctx) { + cl_event ev; + CL_CHECK(clEnqueueMarkerWithWaitList(other_backend_ctx->queue, 0, nullptr, &ev)); + CL_CHECK(clFlush(other_backend_ctx->queue)); + events.push_back(ev); + } + } + + CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, events.size(), events.data(), nullptr)); + for (auto ev : events) { + CL_CHECK(clReleaseEvent(ev)); + } +} + +static void sync_with_other_backends(ggml_backend_t backend) { + auto * backend_ctx = static_cast(backend->context); + sync_with_other_backends(backend_ctx); +} + static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; + // NOTE: this may oversynchronize by synchronizing with + // backends/devices which don't compute 'cgraph's + // dependencies. 
+ sync_with_other_backends(backend); + if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } @@ -2292,8 +2399,8 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, cl_context context = backend_ctx->context; cl_command_queue queue = backend_ctx->queue; - // Make sure all previously submitted commands are finished. - CL_CHECK(clFinish(queue)); + // Make sure all previously submitted commands in other devices are finished. + sync_with_other_backends(backend_ctx); #ifdef GGML_OPENCL_SOA_Q // In end-to-end runs, get_tensor is usually used to get back the logits, @@ -2397,13 +2504,8 @@ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_b } static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) { - // FIXME: not thread safe, device may not be initialized yet - static cl_uint alignment = -1; - if (alignment == (cl_uint)-1) { - ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device); - alignment = backend_ctx->alignment; - } - return alignment; + ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device); + return backend_ctx->alignment; } static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) { @@ -2430,16 +2532,6 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = { /* .is_host = */ NULL, }; -ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() { - static ggml_backend_buffer_type buffer_type = { - /* .iface = */ ggml_backend_opencl_buffer_type_interface, - /* .device = */ &g_ggml_backend_opencl_device, - /* .context = */ nullptr, - }; - - return &buffer_type; -} - // // backend device // @@ -2497,9 +2589,15 @@ static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, co } static ggml_backend_buffer_type_t ggml_backend_opencl_device_get_buffer_type(ggml_backend_dev_t dev) { - return ggml_backend_opencl_buffer_type(); + auto * dev_ctx = static_cast(dev->context); - GGML_UNUSED(dev); + dev_ctx->buffer_type = ggml_backend_buffer_type{ + /* .iface = */ ggml_backend_opencl_buffer_type_interface, + /* .device = */ dev, + /* .context = */ nullptr, + }; + + return &dev_ctx->buffer_type; } static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { @@ -2515,12 +2613,21 @@ static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const } static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return buft->iface.get_name == ggml_backend_opencl_buffer_type_get_name; + // Check 'dev' and 'buffer_type' are not objects belonging to this backend. + if (dev->iface.get_name != ggml_backend_opencl_device_get_name || + buft->iface.get_name != ggml_backend_opencl_buffer_type_get_name) { + return false; + } - GGML_UNUSED(dev); + // Check cl_context is the same. clEnqueue* commands may not use + // buffers from another cl_context. 
+ ggml_backend_opencl_context * backend_ctx0 = ggml_cl2_init(dev); + ggml_backend_opencl_context * backend_ctx1 = ggml_cl2_init(buft->device); + return backend_ctx0->context == backend_ctx1->context; } -static struct ggml_backend_device_i ggml_backend_opencl_device_i = { +namespace /* anonymous */ { +struct ggml_backend_device_i ggml_backend_opencl_device_i = { /* .get_name = */ ggml_backend_opencl_device_get_name, /* .get_description = */ ggml_backend_opencl_device_get_description, /* .get_memory = */ ggml_backend_opencl_device_get_memory, @@ -2537,6 +2644,7 @@ static struct ggml_backend_device_i ggml_backend_opencl_device_i = { /* .event_free = */ NULL, /* .event_synchronize = */ NULL, }; +} // Backend registry @@ -2547,15 +2655,15 @@ static const char * ggml_backend_opencl_reg_get_name(ggml_backend_reg_t reg) { } static size_t ggml_backend_opencl_reg_device_count(ggml_backend_reg_t reg) { - return ggml_backend_opencl_n_devices; + return g_ggml_backend_opencl_devices.size(); GGML_UNUSED(reg); } static ggml_backend_dev_t ggml_backend_opencl_reg_device_get(ggml_backend_reg_t reg, size_t index) { - GGML_ASSERT(index == 0); + GGML_ASSERT(index < ggml_backend_opencl_reg_device_count(reg)); - return &g_ggml_backend_opencl_device; + return &g_ggml_backend_opencl_devices[index]; GGML_UNUSED(reg); GGML_UNUSED(index); @@ -2569,27 +2677,23 @@ static struct ggml_backend_reg_i ggml_backend_opencl_reg_i = { }; ggml_backend_reg_t ggml_backend_opencl_reg(void) { - // TODO: make this thread-safe somehow? + static std::mutex mutex; static ggml_backend_reg reg; static bool initialized = false; + std::lock_guard lock(mutex); - if (!initialized) { - reg = ggml_backend_reg { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_opencl_reg_i, - /* .context = */ NULL, - }; - - g_ggml_backend_opencl_device = ggml_backend_device { - /* .iface = */ ggml_backend_opencl_device_i, - /* .reg = */ ®, - /* .context = */ &g_ggml_ctx_dev_main, - }; - - ggml_cl2_init(&g_ggml_backend_opencl_device); - - initialized = true; + if (initialized) { + return ® } + initialized = true; + + g_ggml_backend_opencl_devices = ggml_opencl_probe_devices(®); + + reg = ggml_backend_reg{ + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_opencl_reg_i, + /* .context = */ NULL, + }; return ® } From 6b56a64690a318fcabcd7739ac7e314d44785ea8 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Thu, 22 May 2025 09:24:09 +0100 Subject: [PATCH 14/18] SYCL: Avoid using with SYCL-Graph for unsupported nodes (#13587) Currently on a CUDA backend to SYCL when running `GGML_SYCL_DISABLE_GRAPH=0 ./bin/test-backend-ops -b SYCL0` there are two operations that throw an exception from the blocking waits during queue recording. * `-o CONCAT` : Use of blocking waits on a queue that's being recorded https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-sycl/concat.cpp#L185-L187 * `-o MUL_MAT_ID`: Blocking wait on a recording queue for a copy to host memory https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-sycl/ggml-sycl.cpp#L3072-L3074 We've noticed that `ggml-cuda.cu` has the [check_node_graph_compatibility_and_refresh_copy_ops](https://github.com/ggml-org/llama.cpp/blob/39e73ae0d69f882d7e29cecc6dd8f5052fca6731/ggml/src/ggml-cuda/ggml-cuda.cu#L2458-L2458) method for checking if a graph can be used, even if enabled. 
I've taken a similar approach in this PR by adding a method to `ggml-sycl.cpp` for checking if a graph can be used for the operations even if a user has asked for it to be enabled. --- ggml/src/ggml-sycl/ggml-sycl.cpp | 34 +++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index d05919781..c2eb618e8 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3809,11 +3809,43 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * syc } } +#ifdef GGML_SYCL_GRAPH +static bool check_graph_compatibility(ggml_cgraph * cgraph) { + if (ggml_sycl_info().device_count > 1) { + // A sycl_ex::command_graph object can only be created for a single device + GGML_LOG_INFO("%s: disabling SYCL graphs due to multiple devices\n", __func__); + return false; + } + + for (int i = 0; i < cgraph->n_nodes; i++) { + const ggml_op node_op = cgraph->nodes[i]->op; + switch (node_op) { + default: + break; + case GGML_OP_CONCAT: + // ggml_sycl_op_concat() does a blocking host wait after memcpy operations, + // but wait() can't be called on the events returned by a queue recording + // to a graph. + [[fallthrough]]; + case GGML_OP_MUL_MAT_ID: + // ggml_sycl_mul_mat_id() does a blocking host wait on the sycl queue after + // submitting a memcpy operation, but wait() can't be called on a queue that + // is recording to a graph. + GGML_LOG_INFO("%s: disabling SYCL graphs due to unsupported node type %s\n", __func__, + ggml_op_name(node_op)); + return false; + } + } + return true; +} +#endif + static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { auto * sycl_ctx = static_cast(backend->context); #ifdef GGML_SYCL_GRAPH - if (!g_ggml_sycl_disable_graph) { + bool use_sycl_graph = !g_ggml_sycl_disable_graph && check_graph_compatibility(cgraph); + if (use_sycl_graph) { const bool graph_support = dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_limited_graph); if (!graph_support) { GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device); From d394a9aedc50a13b7f6373416f7c1ccabfe79c32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Scipione?= Date: Thu, 22 May 2025 13:54:43 +0200 Subject: [PATCH 15/18] sycl : Remove waits from function calls (#13702) * removes the waits in async memcpy functions --- ggml/src/ggml-sycl/ggml-sycl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index c2eb618e8..271f54e57 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3740,7 +3740,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend, GGML_ASSERT(buf->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0); SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy( - data, (const char *)tensor->data + offset, size).wait())); + data, (const char *)tensor->data + offset, size))); } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ @@ -3760,7 +3760,7 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend, */ const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0); SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy( - dst->data, src->data, ggml_nbytes(dst)).wait())); + dst->data, src->data, 
ggml_nbytes(dst)))); return true; } From 5be24af73d77ea23d399726f1d2a01a70ee86331 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Thu, 22 May 2025 14:25:05 +0200 Subject: [PATCH 16/18] gguf-py : correct charsmap parameter typing (#13701) --- gguf-py/gguf/gguf_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index ff50d3de3..1dcce18f7 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -896,7 +896,7 @@ class GGUFWriter: def add_remove_extra_whitespaces(self, value: bool) -> None: self.add_bool(Keys.Tokenizer.REMOVE_EXTRA_WS, value) - def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None: + def add_precompiled_charsmap(self, charsmap: bytes) -> None: self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap) def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None: From cc74d5be990e37f201591fd868a92e64abdbf902 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 22 May 2025 16:33:39 +0300 Subject: [PATCH 17/18] server : pad small embedding batches (#13692) ggml-ci --- tools/server/server.cpp | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 7424da523..1a08e30d2 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3341,6 +3341,37 @@ struct server_context { common_set_adapter_lora(ctx, slot_batched->lora); } + const bool do_encode = (params_base.embedding || params_base.reranking); + + // pad the batch so that batch.n_tokens >= n_slots + // TODO: temporary workaround for https://github.com/ggml-org/llama.cpp/issues/13689 + if (do_encode) { + const int n_slots = slots.size(); + + if (batch.n_tokens < n_slots) { + std::set seq_ids; + for (int j = 0; j < batch.n_tokens; ++j) { + seq_ids.insert(batch.seq_id[j][0]); + } + + // find unused sequence id + llama_seq_id seq_id = -1; + for (int i = 0; i < n_slots; ++i) { + if (seq_ids.find(i) == seq_ids.end()) { + seq_id = i; + } + } + + const int n_add = n_slots - batch.n_tokens; + + SRV_WRN("adding %d dummy tokens to the batch, seq_id = %d\n", n_add, seq_id); + + for (int j = 0; j < n_add; ++j) { + common_batch_add(batch, 0, j, { seq_id }, false); + } + } + } + // process the created batch of tokens for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); @@ -3357,7 +3388,7 @@ struct server_context { int ret = 0; - if (params_base.embedding || params_base.reranking) { + if (do_encode) { ret = llama_encode(ctx, batch_view); } else { ret = llama_decode(ctx, batch_view); From ab86335760ebb441574eb47f886fa1ee302e2131 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 23 May 2025 02:31:29 +0800 Subject: [PATCH 18/18] common: Include torch package for s390x (#13699) * common: update requirements.txt to include pytorch nightly for s390x Signed-off-by: Aaron Teo * common: fix torch installation via pip for s390x Signed-off-by: Aaron Teo --------- Signed-off-by: Aaron Teo --- requirements/requirements-convert_hf_to_gguf.txt | 6 +++++- requirements/requirements-convert_hf_to_gguf_update.txt | 6 +++++- requirements/requirements-convert_lora_to_gguf.txt | 2 ++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/requirements/requirements-convert_hf_to_gguf.txt b/requirements/requirements-convert_hf_to_gguf.txt index 8cb9c354f..431c596c1 100644 --- 
a/requirements/requirements-convert_hf_to_gguf.txt +++ b/requirements/requirements-convert_hf_to_gguf.txt @@ -1,3 +1,7 @@ -r ./requirements-convert_legacy_llama.txt --extra-index-url https://download.pytorch.org/whl/cpu -torch~=2.2.1 +torch~=2.2.1; platform_machine != "s390x" + +# torch s390x packages can only be found from nightly builds +--extra-index-url https://download.pytorch.org/whl/nightly +torch>=0.0.0.dev0; platform_machine == "s390x" diff --git a/requirements/requirements-convert_hf_to_gguf_update.txt b/requirements/requirements-convert_hf_to_gguf_update.txt index 8cb9c354f..431c596c1 100644 --- a/requirements/requirements-convert_hf_to_gguf_update.txt +++ b/requirements/requirements-convert_hf_to_gguf_update.txt @@ -1,3 +1,7 @@ -r ./requirements-convert_legacy_llama.txt --extra-index-url https://download.pytorch.org/whl/cpu -torch~=2.2.1 +torch~=2.2.1; platform_machine != "s390x" + +# torch s390x packages can only be found from nightly builds +--extra-index-url https://download.pytorch.org/whl/nightly +torch>=0.0.0.dev0; platform_machine == "s390x" diff --git a/requirements/requirements-convert_lora_to_gguf.txt b/requirements/requirements-convert_lora_to_gguf.txt index 5758076c4..d091d5648 100644 --- a/requirements/requirements-convert_lora_to_gguf.txt +++ b/requirements/requirements-convert_lora_to_gguf.txt @@ -1,2 +1,4 @@ -r ./requirements-convert_hf_to_gguf.txt --extra-index-url https://download.pytorch.org/whl/cpu +# torch s390x packages can only be found from nightly builds +--extra-index-url https://download.pytorch.org/whl/nightly