From b44890df2e4fad0ece1d5366dcbc8bedae23b658 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 21 May 2025 13:09:21 +0300 Subject: [PATCH 01/18] model : disable SWA for Phi models (#13676) * model : disable SWA for Phi models ggml-ci * model : update warning message * model : print warning only if n_swa > 0 * model : fix typo --- src/llama-graph.cpp | 7 +++-- src/llama-model.cpp | 65 +++++++++++++++++++-------------------------- 2 files changed, 30 insertions(+), 42 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index e745b41e3..13e36d161 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1236,8 +1236,7 @@ llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified() auto inp = std::make_unique(hparams, cparams, kv_self); { - GGML_ASSERT(hparams.n_swa_pattern == 1 && "Use llama_kv_cache_unified_iswa for SWA"); - GGML_ASSERT(hparams.n_swa == 0 && "Use llama_kv_cache_unified_iswa for SWA"); + GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified_iswa for SWA"); const auto n_kv = kv_self->get_n(); @@ -1312,8 +1311,8 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask; } - if (hparams.n_swa_pattern > 1) { - GGML_ASSERT(hparams.n_swa > 0 && "Use llama_kv_cache_unified for non-SWA"); + { + GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_unified for non-SWA"); const auto n_kv = kv_self->get_kv_swa()->get_n(); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 383972f94..7c135e981 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -853,43 +853,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } - // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931 - if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) { - // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct - LLAMA_LOG_WARN("%s: assuming n_swa = 2047 for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct\n", __func__); + const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); - hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; - - hparams.n_swa = 2047; - } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) { - // default value for Phi-3-mini-128k-instruct - LLAMA_LOG_WARN("%s: assuming no SWA for Phi-3-mini-128k-instruct\n", __func__); + if (found_swa && hparams.n_swa > 0) { + LLAMA_LOG_WARN("%s: Phi SWA is currently disabled - results might be suboptimal for some models (see %s)\n", + __func__, "https://github.com/ggml-org/llama.cpp/pull/13676"); + // TODO: fix conversion scripts to correctly populate `n_swa` and `n_swa_pattern` hparams.swa_type = LLAMA_SWA_TYPE_NONE; - hparams.n_swa = hparams.n_ctx_train; - hparams.n_swa_pattern = 1; - } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) { - // default value for Phi-3-medium-128k-instruct - LLAMA_LOG_WARN("%s: assuming no SWA for Phi-3-medium-128k-instruct\n", __func__); - - hparams.swa_type = LLAMA_SWA_TYPE_NONE; - - hparams.n_swa = hparams.n_ctx_train; - hparams.n_swa_pattern = 1; - } - - bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); - if (!found_swa && hparams.n_swa == 0) { - throw std::runtime_error("invalid value for sliding_window"); - } - - if 
(hparams.n_swa > hparams.n_ctx_train) { - LLAMA_LOG_WARN("%s: unexpected n_swa: %d >= %d, disabling SWA\n", __func__, hparams.n_swa, hparams.n_ctx_train); - - hparams.swa_type = LLAMA_SWA_TYPE_NONE; - - hparams.n_swa = hparams.n_ctx_train; + hparams.n_swa = 0; hparams.n_swa_pattern = 1; } } break; @@ -7368,8 +7341,9 @@ struct llm_build_phi2 : public llm_graph_context { } }; -struct llm_build_phi3_iswa : public llm_graph_context { - llm_build_phi3_iswa(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { +template +struct llm_build_phi3 : public llm_graph_context { + llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); @@ -7383,7 +7357,14 @@ struct llm_build_phi3_iswa : public llm_graph_context { // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified_iswa(); + using inp_attn_type = std::conditional_t; + inp_attn_type * inp_attn = nullptr; + + if constexpr (iswa) { + inp_attn = build_attn_inp_kv_unified_iswa(); + } else { + inp_attn = build_attn_inp_kv_unified(); + } for (int il = 0; il < n_layer; ++il) { auto * residual = inpL; @@ -13232,7 +13213,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx); - if (hparams.n_swa > 0) { + if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { + GGML_ASSERT(hparams.n_swa_pattern != 1); + res = new llama_kv_cache_unified_iswa( *this, params.type_k, @@ -13245,6 +13228,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, cparams.n_batch, padding); } else { + GGML_ASSERT(hparams.n_swa_pattern == 1); + res = new llama_kv_cache_unified( *this, nullptr, @@ -13353,7 +13338,11 @@ llm_graph_result_ptr llama_model::build_graph( case LLM_ARCH_PHI3: case LLM_ARCH_PHIMOE: { - llm = std::make_unique(*this, params, gf); + if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { + llm = std::make_unique> (*this, params, gf); + } else { + llm = std::make_unique>(*this, params, gf); + } } break; case LLM_ARCH_PLAMO: { From 797f2ac0625b22edeff03cc30e0f988da6b6b068 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 21 May 2025 15:11:13 +0300 Subject: [PATCH 02/18] kv-cache : simplify the interface (#13660) * kv-cache : simplify the interface ggml-ci * context : revert llama_batch_allocr position change ggml-ci --- examples/simple-chat/simple-chat.cpp | 4 +- include/llama.h | 6 +- src/llama-batch.cpp | 4 +- src/llama-context.cpp | 39 +++++++++- src/llama-kv-cache.cpp | 111 ++++++--------------------- src/llama-kv-cache.h | 51 ++++-------- src/llama-model.cpp | 6 +- tools/run/run.cpp | 4 +- tools/server/server.cpp | 17 ---- 9 files changed, 89 insertions(+), 153 deletions(-) diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index 84f415973..6608d4bea 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -98,7 +98,7 @@ int main(int argc, char ** argv) { auto generate = [&](const std::string & prompt) { std::string response; - const bool is_first = llama_kv_self_used_cells(ctx) == 0; + const bool is_first = llama_kv_self_seq_pos_max(ctx, 0) == 0; // tokenize the prompt const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); @@ -113,7 
+113,7 @@ int main(int argc, char ** argv) { while (true) { // check if we have enough space in the context to evaluate this batch int n_ctx = llama_n_ctx(ctx); - int n_ctx_used = llama_kv_self_used_cells(ctx); + int n_ctx_used = llama_kv_self_seq_pos_max(ctx, 0); if (n_ctx_used + batch.n_tokens > n_ctx) { printf("\033[0m\n"); fprintf(stderr, "context size exceeded\n"); diff --git a/include/llama.h b/include/llama.h index 6b4fc5d11..52cd7a5a0 100644 --- a/include/llama.h +++ b/include/llama.h @@ -610,10 +610,12 @@ extern "C" { // Returns the number of tokens in the KV cache (slow, use only for debug) // If a KV cell has multiple sequences assigned to it, it will be counted multiple times - LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx); + DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx), + "Use llama_kv_self_seq_pos_max() instead"); // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) - LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx); + DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx), + "Use llama_kv_self_seq_pos_max() instead"); // Clear the KV cache - both cell info is erased and KV data is zeroed LLAMA_API void llama_kv_self_clear( diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index a88b2fe30..b98e3256c 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -1,5 +1,6 @@ #include "llama-batch.h" +#include #include #include @@ -281,9 +282,10 @@ llama_batch_allocr::llama_batch_allocr(struct llama_batch in_batch, llama_pos p0 batch = in_batch; GGML_ASSERT(batch.n_tokens > 0); if (!batch.pos) { + assert(p0 >= 0); pos.resize(batch.n_tokens); for (int32_t i = 0; i < batch.n_tokens; i++) { - pos[i] = i + p0; + pos[i] = p0 + i; } batch.pos = pos.data(); } diff --git a/src/llama-context.cpp b/src/llama-context.cpp index bba3ee0b5..85b4324b6 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -857,11 +857,17 @@ int llama_context::decode(llama_batch & inp_batch) { return -1; } + if (!inp_batch.pos) { + if (inp_batch.seq_id) { + LLAMA_LOG_ERROR("%s: pos == NULL, but seq_id != NULL\n", __func__); + return -1; + } + } + llama_kv_cache * kv_self = static_cast(memory.get()); // temporary allocate memory for the input batch if needed - // TODO: this is incorrect for multiple sequences because get_pos_max() is the maximum across all sequences - llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->get_pos_max() + 1); + llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? 
-1 : kv_self->seq_pos_max(0) + 1); const llama_batch & batch = batch_allocr.batch; @@ -2292,22 +2298,47 @@ int32_t llama_apply_adapter_cvec( // kv cache // +// deprecated int32_t llama_kv_self_n_tokens(const llama_context * ctx) { const auto * kv = ctx->get_kv_self(); if (!kv) { return 0; } - return kv->get_n_tokens(); + int32_t res = 0; + + for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) { + const llama_pos p0 = kv->seq_pos_min(s); + const llama_pos p1 = kv->seq_pos_max(s); + + if (p0 >= 0) { + res += (p1 - p0) + 1; + } + } + + return res; } +// deprecated +// note: this is the same as above - will be removed anyway, so it's ok int32_t llama_kv_self_used_cells(const llama_context * ctx) { const auto * kv = ctx->get_kv_self(); if (!kv) { return 0; } - return kv->get_used_cells(); + int32_t res = 0; + + for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) { + const llama_pos p0 = kv->seq_pos_min(s); + const llama_pos p1 = kv->seq_pos_max(s); + + if (p0 >= 0) { + res += (p1 - p0) + 1; + } + } + + return res; } void llama_kv_self_clear(llama_context * ctx) { diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 77b2c0dbf..a2624d715 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -30,13 +30,14 @@ llama_kv_cache_unified::llama_kv_cache_unified( bool v_trans, bool offload, uint32_t kv_size, - uint32_t padding, + uint32_t n_seq_max, + uint32_t n_pad, uint32_t n_swa, - llama_swa_type swa_type) : model(model), hparams(model.hparams), v_trans(v_trans), padding(padding), n_swa(n_swa), swa_type(swa_type) { - GGML_ASSERT(kv_size % padding == 0 && "kv_size must be a multiple of padding"); + llama_swa_type swa_type) : + model(model), hparams(model.hparams), v_trans(v_trans), + n_seq_max(n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) { - this->type_k = type_k; - this->type_v = type_v; + GGML_ASSERT(kv_size % n_pad == 0); // create a context for each buffer type std::map ctx_map; @@ -129,8 +130,8 @@ llama_kv_cache_unified::llama_kv_cache_unified( const size_t memory_size_k = size_k_bytes(); const size_t memory_size_v = size_v_bytes(); - LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6d cells, %3d layers), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, - (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), + LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } @@ -442,7 +443,7 @@ bool llama_kv_cache_unified::update(llama_context & lctx) { void llama_kv_cache_unified::defrag_sched(float thold) { // - do not defrag small contexts (i.e. < 2048 tokens) // - count the padding towards the number of used tokens - const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - (float(used + padding)/n)) : 0.0f; + const float fragmentation = n >= 2048 ? 
std::max(0.0f, 1.0f - (float(used + n_pad)/n)) : 0.0f; // queue defragmentation for next llama_kv_cache_update if (fragmentation > thold) { @@ -558,7 +559,7 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) { // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - n = std::min(size, std::max(padding, GGML_PAD(cell_max(), padding))); + n = std::min(size, std::max(n_pad, GGML_PAD(cell_max(), n_pad))); #ifdef FIND_SLOT_DEBUG LLAMA_LOG_WARN("end: n = %5d, used = %5d, head = %5d, n_swa = %5d\n", n, used, head, n_swa); @@ -567,20 +568,6 @@ bool llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch) { return true; } -int32_t llama_kv_cache_unified::get_n_tokens() const { - int32_t result = 0; - - for (uint32_t i = 0; i < size; i++) { - result += cells[i].seq_id.size(); - } - - return result; -} - -int32_t llama_kv_cache_unified::get_used_cells() const { - return used; -} - bool llama_kv_cache_unified::get_can_shift() const { return true; } @@ -802,16 +789,6 @@ void llama_kv_cache_unified::set_input_pos_bucket(ggml_tensor * dst, const llama } } -llama_pos llama_kv_cache_unified::get_pos_max() const { - llama_pos pos_max = -1; - - for (const auto & cell : cells) { - pos_max = std::max(pos_max, cell.pos); - } - - return pos_max; -} - size_t llama_kv_cache_unified::total_size() const { size_t size = 0; @@ -1501,11 +1478,8 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell llama_seq_id seq_id; io.read_to(&seq_id, sizeof(seq_id)); - // TODO: llama_kv_cache_unified should have a notion of max sequences - //if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) { - if (seq_id < 0) { - //LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx)); - LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, inf)\n", __func__, seq_id); + if (seq_id < 0 || (uint32_t) seq_id >= n_seq_max) { + LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, n_seq_max); return false; } @@ -1655,17 +1629,17 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa( ggml_type type_v, bool v_trans, bool offload, - uint32_t kv_size, bool swa_full, + uint32_t kv_size, uint32_t n_seq_max, uint32_t n_batch, - uint32_t padding) : hparams(model.hparams) { + uint32_t n_pad) : hparams(model.hparams) { llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); }; llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); }; const uint32_t size_base = kv_size; - uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, padding)); + uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_batch, n_pad)); // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size and disable pruning if (swa_full) { @@ -1680,14 +1654,14 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa( kv_base = std::make_unique( model, std::move(filter_base), type_k, type_v, - v_trans, offload, size_base, padding, + v_trans, offload, size_base, n_seq_max, n_pad, 0, LLAMA_SWA_TYPE_NONE); LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa); kv_swa = std::make_unique( model, std::move(filter_swa), type_k, type_v, - v_trans, offload, 
size_swa, padding, + v_trans, offload, size_swa, n_seq_max, n_pad, hparams.n_swa, hparams.swa_type); } @@ -1810,18 +1784,6 @@ bool llama_kv_cache_unified_iswa::find_slot(const llama_ubatch & batch) { return res; } -int32_t llama_kv_cache_unified_iswa::get_n_tokens() const { - return kv_base->get_n_tokens(); -} - -int32_t llama_kv_cache_unified_iswa::get_used_cells() const { - return kv_base->get_used_cells(); -} - -llama_pos llama_kv_cache_unified_iswa::get_pos_max() const { - return kv_base->get_pos_max(); -} - bool llama_kv_cache_unified_iswa::get_can_shift() const { return kv_base->get_size() == kv_swa->get_size(); } @@ -1853,19 +1815,17 @@ llama_kv_cache_recurrent::llama_kv_cache_recurrent( ggml_type type_k, ggml_type type_v, bool offload, - uint32_t kv_size) : hparams(model.hparams) { + uint32_t kv_size, + uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) { const int32_t n_layer = hparams.n_layer; - LLAMA_LOG_INFO("%s: kv_size = %d, type_k = '%s', type_v = '%s', n_layer = %d\n", - __func__, kv_size, ggml_type_name(type_k), ggml_type_name(type_v), n_layer); + LLAMA_LOG_INFO("%s: kv_size = %u, n_seq_max = %u, type_k = '%s', type_v = '%s', n_layer = %d\n", + __func__, kv_size, n_seq_max, ggml_type_name(type_k), ggml_type_name(type_v), n_layer); head = 0; size = kv_size; used = 0; - this->type_k = type_k; - this->type_v = type_v; - cells.clear(); cells.resize(kv_size); @@ -2203,8 +2163,8 @@ void llama_kv_cache_recurrent::commit() { pending.ranges.clear(); } -bool llama_kv_cache_recurrent::update(llama_context & lctx) { - GGML_UNUSED(lctx); +bool llama_kv_cache_recurrent::update(llama_context & ctx) { + GGML_UNUSED(ctx); return false; } @@ -2265,7 +2225,7 @@ bool llama_kv_cache_recurrent::find_slot( if (seq_id < 0 || (uint32_t) seq_id >= size) { // too big seq_id // TODO: would it be possible to resize the cache instead? 
- LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size); + LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%u Try using a bigger --parallel value\n", __func__, seq_id, n_seq_max); return false; } if (j > 0) { @@ -2408,29 +2368,6 @@ bool llama_kv_cache_recurrent::find_slot( return n >= n_seqs; } -int32_t llama_kv_cache_recurrent::get_n_tokens() const { - int32_t result = 0; - - for (uint32_t i = 0; i < size; i++) { - result += cells[i].seq_id.size(); - } - - return result; -} - -int32_t llama_kv_cache_recurrent::get_used_cells() const { - return used; -} - -llama_pos llama_kv_cache_recurrent::get_pos_max() const { - llama_pos pos_max = -1; - for (const auto & cell : cells) { - pos_max = std::max(pos_max, cell.pos); - } - - return pos_max; -} - bool llama_kv_cache_recurrent::get_can_shift() const { return false; } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index bd0485bc6..191a1090a 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -55,10 +55,7 @@ struct llama_kv_cache : public llama_memory_i { // ============================================================================================================= // getters - virtual int32_t get_n_tokens() const = 0; - virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache - virtual llama_pos get_pos_max() const = 0; - virtual bool get_can_shift() const = 0; + virtual bool get_can_shift() const = 0; bool get_can_edit() const override { return get_can_shift(); } @@ -108,7 +105,8 @@ public: bool v_trans, bool offload, uint32_t kv_size, - uint32_t padding, + uint32_t n_seq_max, + uint32_t n_pad, uint32_t n_swa, llama_swa_type swa_type); @@ -150,12 +148,6 @@ public: // to the first cell of the slot. 
bool find_slot(const llama_ubatch & batch) override; - int32_t get_n_tokens() const override; - int32_t get_used_cells() const override; - - // TODO: better data structures to reduce the cost of this operation - llama_pos get_pos_max() const override; - bool get_can_shift() const override; // state write/load @@ -228,16 +220,15 @@ private: // computed before each graph build uint32_t n = 0; - // required padding - uint32_t padding = 1; + const uint32_t n_seq_max = 1; - ggml_type type_k = GGML_TYPE_F16; - ggml_type type_v = GGML_TYPE_F16; + // required padding + const uint32_t n_pad = 1; // SWA - uint32_t n_swa = 0; + const uint32_t n_swa = 0; - llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; + const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; std::vector ctxs; std::vector bufs; @@ -317,11 +308,11 @@ public: ggml_type type_v, bool v_trans, bool offload, - uint32_t kv_size, bool swa_full, + uint32_t kv_size, uint32_t n_seq_max, uint32_t n_batch, - uint32_t padding); + uint32_t n_pad); ~llama_kv_cache_unified_iswa() = default; @@ -358,12 +349,6 @@ public: bool find_slot(const llama_ubatch & batch) override; - int32_t get_n_tokens() const override; - int32_t get_used_cells() const override; - - // TODO: better data structures to reduce the cost of this operation - llama_pos get_pos_max() const override; - bool get_can_shift() const override; // state write/load @@ -432,7 +417,8 @@ public: ggml_type type_k, ggml_type type_v, bool offload, - uint32_t kv_size); + uint32_t kv_size, + uint32_t n_seq_max); ~llama_kv_cache_recurrent() = default; @@ -444,7 +430,7 @@ public: bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; - void seq_keep(llama_seq_id seq_id) override; + void seq_keep(llama_seq_id seq_id) override; void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override; void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override; @@ -458,7 +444,7 @@ public: void restore() override; void commit() override; - bool update(llama_context & lctx) override; + bool update(llama_context & ctx) override; void defrag_sched(float thold) override; @@ -469,12 +455,6 @@ public: bool find_slot(const llama_ubatch & batch) override; - int32_t get_n_tokens() const override; - int32_t get_used_cells() const override; - - // TODO: better data structures to reduce the cost of this operation - llama_pos get_pos_max() const override; - bool get_can_shift() const override; // TODO: temporary methods - they are not really const as they do const_cast<>, fix this @@ -514,8 +494,7 @@ private: std::vector ranges; } pending; - ggml_type type_k = GGML_TYPE_F16; - ggml_type type_v = GGML_TYPE_F16; + const uint32_t n_seq_max = 1; std::vector ctxs; std::vector bufs; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 7c135e981..82557ea05 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -13203,7 +13203,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, GGML_TYPE_F32, GGML_TYPE_F32, cparams.offload_kqv, - std::max((uint32_t) 1, cparams.n_seq_max)); + std::max((uint32_t) 1, cparams.n_seq_max), + cparams.n_seq_max); } break; default: { @@ -13222,8 +13223,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, params.type_v, !cparams.flash_attn, cparams.offload_kqv, - cparams.n_ctx, params.swa_full, + cparams.n_ctx, cparams.n_seq_max, cparams.n_batch, padding); @@ -13238,6 +13239,7 
@@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, !cparams.flash_attn, cparams.offload_kqv, cparams.n_ctx, + cparams.n_seq_max, padding, hparams.n_swa, hparams.swa_type); diff --git a/tools/run/run.cpp b/tools/run/run.cpp index a189ae7fa..702c307d8 100644 --- a/tools/run/run.cpp +++ b/tools/run/run.cpp @@ -936,7 +936,7 @@ static int apply_chat_template(const struct common_chat_templates * tmpls, Llama // Function to tokenize the prompt static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt, std::vector & prompt_tokens, const LlamaData & llama_data) { - const bool is_first = llama_kv_self_used_cells(llama_data.context.get()) == 0; + const bool is_first = llama_kv_self_seq_pos_max(llama_data.context.get(), 0) == 0; const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true); prompt_tokens.resize(n_prompt_tokens); @@ -952,7 +952,7 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt // Check if we have enough space in the context to evaluate this batch static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) { const int n_ctx = llama_n_ctx(ctx.get()); - const int n_ctx_used = llama_kv_self_used_cells(ctx.get()); + const int n_ctx_used = llama_kv_self_seq_pos_max(ctx.get(), 0); if (n_ctx_used + batch.n_tokens > n_ctx) { printf(LOG_COL_DEFAULT "\n"); printe("context size exceeded\n"); diff --git a/tools/server/server.cpp b/tools/server/server.cpp index f8b7ff062..3b1305e1a 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -1137,9 +1137,6 @@ struct server_task_result_metrics : server_task_result { int n_tasks_deferred; int64_t t_start; - int32_t kv_cache_tokens_count; - int32_t kv_cache_used_cells; - // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields uint64_t n_prompt_tokens_processed_total = 0; uint64_t t_prompt_processing_total = 0; @@ -1179,9 +1176,6 @@ struct server_task_result_metrics : server_task_result { { "n_decode_total", n_decode_total }, { "n_busy_slots_total", n_busy_slots_total }, - { "kv_cache_tokens_count", kv_cache_tokens_count }, - { "kv_cache_used_cells", kv_cache_used_cells }, - { "slots", slots_data }, }; } @@ -2771,9 +2765,6 @@ struct server_context { res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); res->t_start = metrics.t_start; - res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx); - res->kv_cache_used_cells = llama_kv_self_used_cells(ctx); - res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; res->t_prompt_processing_total = metrics.t_prompt_processing_total; res->n_tokens_predicted_total = metrics.n_tokens_predicted_total; @@ -3883,14 +3874,6 @@ int main(int argc, char ** argv) { {"name", "predicted_tokens_seconds"}, {"help", "Average generation throughput in tokens/s."}, {"value", res_metrics->n_tokens_predicted ? 1.e3 / res_metrics->t_tokens_generation * res_metrics->n_tokens_predicted : 0.} - },{ - {"name", "kv_cache_usage_ratio"}, - {"help", "KV-cache usage. 1 means 100 percent usage."}, - {"value", 1. 
* res_metrics->kv_cache_used_cells / params.n_ctx} - },{ - {"name", "kv_cache_tokens"}, - {"help", "KV-cache tokens."}, - {"value", (uint64_t) res_metrics->kv_cache_tokens_count} },{ {"name", "requests_processing"}, {"help", "Number of requests processing."}, From 42158ae2e8ead667a83f07247321ce85f32ace66 Mon Sep 17 00:00:00 2001 From: Dorin-Andrei Geman Date: Wed, 21 May 2025 16:07:57 +0300 Subject: [PATCH 03/18] server : fix first message identification (#13634) * server : fix first message identification When using the OpenAI SDK (https://github.com/openai/openai-node/blob/master/src/lib/ChatCompletionStream.ts#L623-L626) we noticed that the expected assistant role is missing in the first streaming message. Fix this by correctly checking for the first message. Co-authored-by: Piotr Stankiewicz Signed-off-by: Dorin Geman * server : Fix checks for first role message for stream=True Co-authored-by: Piotr Stankiewicz Signed-off-by: Dorin Geman --------- Signed-off-by: Dorin Geman Co-authored-by: Piotr Stankiewicz --- tools/server/server.cpp | 18 +++++- .../server/tests/unit/test_chat_completion.py | 56 ++++++++++++------- 2 files changed, 53 insertions(+), 21 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 3b1305e1a..d48cf46e4 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -951,7 +951,7 @@ struct server_task_result_cmpl_partial : server_task_result { } json to_json_oaicompat_chat() { - bool first = n_decoded == 0; + bool first = n_decoded == 1; std::time_t t = std::time(0); json choices; @@ -962,15 +962,18 @@ struct server_task_result_cmpl_partial : server_task_result { {"delta", json{{"role", "assistant"}}}}}); } else { // We have to send this as two updates to conform to openai behavior + // initial_ret is the role message for stream=True json initial_ret = json{{"choices", json::array({json{ {"finish_reason", nullptr}, {"index", 0}, {"delta", json{ - {"role", "assistant"} + {"role", "assistant"}, + {"content", ""} }}}})}, {"created", t}, {"id", oaicompat_cmpl_id}, {"model", oaicompat_model}, + {"system_fingerprint", build_info}, {"object", "chat.completion.chunk"}}; json second_ret = json{ @@ -982,8 +985,19 @@ struct server_task_result_cmpl_partial : server_task_result { {"created", t}, {"id", oaicompat_cmpl_id}, {"model", oaicompat_model}, + {"system_fingerprint", build_info}, {"object", "chat.completion.chunk"}}; + if (prob_output.probs.size() > 0) { + second_ret["choices"][0]["logprobs"] = json{ + {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)}, + }; + } + + if (timings.prompt_n >= 0) { + second_ret.push_back({"timings", timings.to_json()}); + } + return std::vector({initial_ret, second_ret}); } } else { diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py index 491cb3a5d..bab5d005d 100644 --- a/tools/server/tests/unit/test_chat_completion.py +++ b/tools/server/tests/unit/test_chat_completion.py @@ -71,8 +71,14 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte }) content = "" last_cmpl_id = None - for data in res: + for i, data in enumerate(res): choice = data["choices"][0] + if i == 0: + # Check first role message for stream=True + assert choice["delta"]["content"] == "" + assert choice["delta"]["role"] == "assistant" + else: + assert "role" not in choice["delta"] assert data["system_fingerprint"].startswith("b") assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in 
the future if last_cmpl_id is None: @@ -242,12 +248,18 @@ def test_chat_completion_with_timings_per_token(): "stream": True, "timings_per_token": True, }) - for data in res: - assert "timings" in data - assert "prompt_per_second" in data["timings"] - assert "predicted_per_second" in data["timings"] - assert "predicted_n" in data["timings"] - assert data["timings"]["predicted_n"] <= 10 + for i, data in enumerate(res): + if i == 0: + # Check first role message for stream=True + assert data["choices"][0]["delta"]["content"] == "" + assert data["choices"][0]["delta"]["role"] == "assistant" + else: + assert "role" not in data["choices"][0]["delta"] + assert "timings" in data + assert "prompt_per_second" in data["timings"] + assert "predicted_per_second" in data["timings"] + assert "predicted_n" in data["timings"] + assert data["timings"]["predicted_n"] <= 10 def test_logprobs(): @@ -295,17 +307,23 @@ def test_logprobs_stream(): ) output_text = '' aggregated_text = '' - for data in res: + for i, data in enumerate(res): choice = data.choices[0] - if choice.finish_reason is None: - if choice.delta.content: - output_text += choice.delta.content - assert choice.logprobs is not None - assert choice.logprobs.content is not None - for token in choice.logprobs.content: - aggregated_text += token.token - assert token.logprob <= 0.0 - assert token.bytes is not None - assert token.top_logprobs is not None - assert len(token.top_logprobs) > 0 + if i == 0: + # Check first role message for stream=True + assert choice.delta.content == "" + assert choice.delta.role == "assistant" + else: + assert choice.delta.role is None + if choice.finish_reason is None: + if choice.delta.content: + output_text += choice.delta.content + assert choice.logprobs is not None + assert choice.logprobs.content is not None + for token in choice.logprobs.content: + aggregated_text += token.token + assert token.logprob <= 0.0 + assert token.bytes is not None + assert token.top_logprobs is not None + assert len(token.top_logprobs) > 0 assert aggregated_text == output_text From 0d5c74216170ef97e5e7511563837263f2d1a496 Mon Sep 17 00:00:00 2001 From: Robin Davidsson <40024429+R-Dson@users.noreply.github.com> Date: Wed, 21 May 2025 15:15:27 +0200 Subject: [PATCH 04/18] server : Add the endpoints /api/tags and /api/chat (#13659) * Add the endpoints /api/tags and /api/chat Add the endpoints /api/tags and /api/chat, and improved the model metadata response * Remove trailing whitespaces * Removed code that is not needed for copilot to work. 
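
As an illustration of the two new routes, here is a minimal client sketch in Python. It assumes a llama-server instance already listening on http://localhost:8080 and uses the third-party requests package; only the endpoint paths, the "models" array, and the per-model "name" field are taken from this patch, while the response handling follows from /api/chat being routed to the existing OpenAI-compatible chat-completions handler.

    import requests

    base = "http://localhost:8080"  # assumed server address; adjust to your deployment

    # /api/tags is served by the same handler as /models and /v1/models,
    # so Ollama-style clients can use it to list the loaded model
    tags = requests.get(f"{base}/api/tags").json()
    print([m["name"] for m in tags["models"]])

    # /api/chat reuses the chat-completions handler and therefore accepts
    # OpenAI-style messages and returns an OpenAI-style response
    resp = requests.post(f"{base}/api/chat", json={
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": False,
    })
    print(resp.json()["choices"][0]["message"]["content"])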
--- tools/server/server.cpp | 42 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index d48cf46e4..087665e41 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3707,6 +3707,7 @@ int main(int argc, char ** argv) { "/health", "/models", "/v1/models", + "/api/tags" }; // If API key is not set, skip validation @@ -3745,7 +3746,7 @@ int main(int argc, char ** argv) { if (req.path == "/" || tmp.back() == "html") { res.set_content(reinterpret_cast(loading_html), loading_html_len, "text/html; charset=utf-8"); res.status = 503; - } else if (req.path == "/models" || req.path == "/v1/models") { + } else if (req.path == "/models" || req.path == "/v1/models" || req.path == "/api/tags") { // allow the models endpoint to be accessed during loading return true; } else { @@ -4083,6 +4084,19 @@ int main(int argc, char ** argv) { { "llama.context_length", ctx_server.slots.back().n_ctx, }, } }, + {"modelfile", ""}, + {"parameters", ""}, + {"template", common_chat_templates_source(ctx_server.chat_templates.get())}, + {"details", { + {"parent_model", ""}, + {"format", "gguf"}, + {"family", ""}, + {"families", {""}}, + {"parameter_size", ""}, + {"quantization_level", ""} + }}, + {"model_info", ""}, + {"capabilities", {"completion"}} }; res_ok(res, data); @@ -4408,6 +4422,28 @@ int main(int argc, char ** argv) { } json models = { + {"models", { + { + {"name", params.model_alias.empty() ? params.model.path : params.model_alias}, + {"model", params.model_alias.empty() ? params.model.path : params.model_alias}, + {"modified_at", ""}, + {"size", ""}, + {"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash + {"type", "model"}, + {"description", ""}, + {"tags", {""}}, + {"capabilities", {"completion"}}, + {"parameters", ""}, + {"details", { + {"parent_model", ""}, + {"format", "gguf"}, + {"family", ""}, + {"families", {""}}, + {"parameter_size", ""}, + {"quantization_level", ""} + }} + } + }}, {"object", "list"}, {"data", { { @@ -4417,7 +4453,7 @@ int main(int argc, char ** argv) { {"owned_by", "llamacpp"}, {"meta", model_meta}, }, - }} + }} }; res_ok(res, models); @@ -4745,11 +4781,13 @@ int main(int argc, char ** argv) { svr->Post("/api/show", handle_api_show); svr->Get ("/models", handle_models); // public endpoint (no API key check) svr->Get ("/v1/models", handle_models); // public endpoint (no API key check) + svr->Get ("/api/tags", handle_models); // ollama specific endpoint. 
public endpoint (no API key check) svr->Post("/completion", handle_completions); // legacy svr->Post("/completions", handle_completions); svr->Post("/v1/completions", handle_completions_oai); svr->Post("/chat/completions", handle_chat_completions); svr->Post("/v1/chat/completions", handle_chat_completions); + svr->Post("/api/chat", handle_chat_completions); // ollama specific endpoint svr->Post("/infill", handle_infill); svr->Post("/embedding", handle_embeddings); // legacy svr->Post("/embeddings", handle_embeddings); From cf4cb59e64d72a1b4c781f71a74de5756a4e2376 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Wed, 21 May 2025 16:26:33 +0200 Subject: [PATCH 05/18] ggml : add ggml_gelu_erf() (#13667) * ggml : add ggml_gelu_na (not approximated) * fix naming order * rename na --> erf * apply review suggesions * revert naming order --- ggml/include/ggml.h | 13 +++- ggml/src/ggml-cpu/ggml-cpu.c | 1 + ggml/src/ggml-cpu/ops.cpp | 107 +++++++++++++++++++++++++++ ggml/src/ggml-cpu/vec.h | 16 ++++ ggml/src/ggml-metal/ggml-metal.m | 24 ++++++ ggml/src/ggml-metal/ggml-metal.metal | 37 +++++++++ ggml/src/ggml.c | 17 ++++- 7 files changed, 213 insertions(+), 2 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index e91dedf14..c81ff03fe 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -528,14 +528,15 @@ extern "C" { GGML_UNARY_OP_STEP, GGML_UNARY_OP_TANH, GGML_UNARY_OP_ELU, - GGML_UNARY_OP_RELU, GGML_UNARY_OP_SIGMOID, GGML_UNARY_OP_GELU, + GGML_UNARY_OP_GELU_ERF, GGML_UNARY_OP_GELU_QUICK, GGML_UNARY_OP_SILU, GGML_UNARY_OP_HARDSWISH, GGML_UNARY_OP_HARDSIGMOID, GGML_UNARY_OP_EXP, + GGML_UNARY_OP_RELU, GGML_UNARY_OP_COUNT, }; @@ -1024,6 +1025,16 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + // GELU using erf (error function) when possible + // some backends may fallback to approximation based on Abramowitz and Stegun formula + GGML_API struct ggml_tensor * ggml_gelu_erf( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_gelu_erf_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_gelu_quick( struct ggml_context * ctx, struct ggml_tensor * a); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 133b50606..46f75ad97 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2202,6 +2202,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_SILU: { diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 955fec59a..26501b711 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -2691,6 +2691,109 @@ static void ggml_compute_forward_gelu( } } +// ggml_compute_forward_gelu_erf + +static void ggml_compute_forward_gelu_erf_f32( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_gelu_erf_f32(nc, + (float *) ((char *) dst->data + i1*( 
dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + GGML_UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_gelu_erf_f16( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + assert(ggml_is_contiguous_1(src0)); + assert(ggml_is_contiguous_1(dst)); + assert(ggml_are_same_shape(src0, dst)); + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_gelu_erf_f16(nc, + (ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])), + (ggml_fp16_t *) ((char *) src0->data + i1*(src0->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const ggml_fp16_t x = ((ggml_fp16_t *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float v = GGML_FP16_TO_FP32(x); + GGML_UNUSED(v); + assert(!isnan(v)); + assert(!isinf(v)); + } +#endif + } +} + +static void ggml_compute_forward_gelu_erf( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * src0 = dst->src[0]; + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_gelu_erf_f32(params, dst); + } break; + case GGML_TYPE_F16: + { + ggml_compute_forward_gelu_erf_f16(params, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } +} + // ggml_compute_forward_gelu_quick static void ggml_compute_forward_gelu_quick_f32( @@ -7749,6 +7852,10 @@ void ggml_compute_forward_unary( { ggml_compute_forward_gelu(params, dst); } break; + case GGML_UNARY_OP_GELU_ERF: + { + ggml_compute_forward_gelu_erf(params, dst); + } break; case GGML_UNARY_OP_GELU_QUICK: { ggml_compute_forward_gelu_quick(params, dst); diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index 23cbb3051..c77349ebe 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -428,6 +428,7 @@ inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp static const float GELU_COEF_A = 0.044715f; static const float GELU_QUICK_COEF = -1.702f; static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; +static const float SQRT_2_INV = 0.70710678118654752440084436210484f; inline static float ggml_gelu_f32(float x) { return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); @@ -440,6 +441,14 @@ inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp } } +inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { + for (int i = 0; i < n; ++i) { + float xi = GGML_FP16_TO_FP32(x[i]); + float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV)); + y[i] = GGML_FP32_TO_FP16(res); + } +} + #ifdef GGML_GELU_FP16 inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { uint16_t t; @@ -463,6 +472,13 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { } #endif +inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) { + for (int i = 0; i < n; ++i) { + float xi = x[i]; + y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV)); + } +} + inline static float ggml_gelu_quick_f32(float x) { return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x))); } diff --git 
a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 85dbbcd5d..f78e7eee5 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -149,6 +149,8 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_SIGMOID, GGML_METAL_KERNEL_TYPE_GELU, GGML_METAL_KERNEL_TYPE_GELU_4, + GGML_METAL_KERNEL_TYPE_GELU_ERF, + GGML_METAL_KERNEL_TYPE_GELU_ERF_4, GGML_METAL_KERNEL_TYPE_GELU_QUICK, GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, GGML_METAL_KERNEL_TYPE_SILU, @@ -1103,6 +1105,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIGMOID, sigmoid, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_4, gelu_4, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_ERF, gelu_erf, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_ERF_4, gelu_erf_4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, gelu_quick_4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true); @@ -1613,6 +1617,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_SIGMOID: case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_ELU: @@ -2251,6 +2256,25 @@ static bool ggml_metal_encode_node( [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; + case GGML_UNARY_OP_GELU_ERF: + { + int64_t n = ggml_nelements(dst); + + id pipeline = nil; + + if (n % 4 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_ERF_4].pipeline; + n /= 4; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_ERF].pipeline; + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; case GGML_UNARY_OP_GELU_QUICK: { int64_t n = ggml_nelements(dst); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index f18473dcb..59899550e 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -856,6 +856,7 @@ kernel void kernel_tanh( constant float GELU_COEF_A = 0.044715f; constant float GELU_QUICK_COEF = -1.702f; constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; +constant float SQRT_2_INV = 0.70710678118654752440084436210484f; kernel void kernel_gelu( device const float * src0, @@ -897,6 +898,42 @@ kernel void kernel_gelu_quick_4( dst[tpig] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x))); } +// based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation +// ref: https://www.johndcook.com/blog/python_erf/ +constant float p_erf = 0.3275911f; +constant float a1_erf = 0.254829592f; +constant float a2_erf = -0.284496736f; +constant float a3_erf = 1.421413741f; +constant float a4_erf = -1.453152027f; +constant float a5_erf = 1.061405429f; + +template +T erf_approx(T x) { + T sign_x = sign(x); + x = fabs(x); + T t = 1.0f / (1.0f + p_erf * x); + T y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x); + return sign_x * y; +} + +kernel void kernel_gelu_erf( + device const float * src0, + device 
float * dst, + uint tpig[[thread_position_in_grid]]) { + device const float & x = src0[tpig]; + + dst[tpig] = 0.5f*x*(1.0f+erf_approx(x*SQRT_2_INV)); +} + +kernel void kernel_gelu_erf_4( + device const float4 * src0, + device float4 * dst, + uint tpig[[thread_position_in_grid]]) { + device const float4 & x = src0[tpig]; + + dst[tpig] = 0.5f*x*(1.0f+erf_approx(x*SQRT_2_INV)); +} + kernel void kernel_silu( device const float * src0, device float * dst, diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index d48adb9af..57d3e39ad 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1099,9 +1099,10 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = { "HARDSWISH", "HARDSIGMOID", "EXP", + "GELU_ERF", }; -static_assert(GGML_UNARY_OP_COUNT == 14, "GGML_UNARY_OP_COUNT != 14"); +static_assert(GGML_UNARY_OP_COUNT == 15, "GGML_UNARY_OP_COUNT != 15"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); @@ -2501,6 +2502,20 @@ struct ggml_tensor * ggml_gelu_inplace( return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU); } +// ggml_gelu_erf + +struct ggml_tensor * ggml_gelu_erf( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary(ctx, a, GGML_UNARY_OP_GELU_ERF); +} + +struct ggml_tensor * ggml_gelu_erf_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_GELU_ERF); +} + // ggml_gelu_quick struct ggml_tensor * ggml_gelu_quick( From eb0f5c28d37126baa756117d5bdaadc62e03344e Mon Sep 17 00:00:00 2001 From: Emmanuel Ferdman Date: Wed, 21 May 2025 17:33:54 +0300 Subject: [PATCH 06/18] gguf-py : display the invalid gguf type (#13687) Signed-off-by: Emmanuel Ferdman --- gguf-py/gguf/gguf_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index 5991cdb76..d87e8f723 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -251,7 +251,7 @@ class GGUFReader: offs += curr_size return offs - orig_offs, aparts, data_idxs, types # We can't deal with this one. 
- raise ValueError('Unknown/unhandled field type {gtype}') + raise ValueError(f'Unknown/unhandled field type {gtype}') def _get_tensor_info_field(self, orig_offs: int) -> ReaderField: offs = orig_offs From 2aa777d86d3f7bb80b93e226f1c25e47825f6a83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Wed, 21 May 2025 16:57:38 +0200 Subject: [PATCH 07/18] examples : switch retrieval to llama_encode (#13685) * switch retrieval to llama_encode * enable --no-warmup for retrieval --- common/arg.cpp | 2 +- examples/retrieval/retrieval.cpp | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index b1754f30f..997f732cc 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1678,7 +1678,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.warmup = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"--spm-infill"}, string_format( diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 0efe20d4b..e3d0c9542 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -81,14 +81,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke } } -static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { +static void batch_encode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { // clear previous kv_cache values (irrelevant for embeddings) llama_kv_self_clear(ctx); // run model LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); - if (llama_decode(ctx, batch) < 0) { - LOG_ERR("%s : failed to decode\n", __func__); + if (llama_encode(ctx, batch) < 0) { + LOG_ERR("%s : failed to encode\n", __func__); } for (int i = 0; i < batch.n_tokens; i++) { @@ -233,7 +233,7 @@ int main(int argc, char ** argv) { // encode if at capacity if (batch.n_tokens + n_toks > n_batch) { float * out = emb + p * n_embd; - batch_decode(ctx, batch, out, s, n_embd); + batch_encode(ctx, batch, out, s, n_embd); common_batch_clear(batch); p += s; s = 0; @@ -246,7 +246,7 @@ int main(int argc, char ** argv) { // final batch float * out = emb + p * n_embd; - batch_decode(ctx, batch, out, s, n_embd); + batch_encode(ctx, batch, out, s, n_embd); // save embeddings to chunks for (int i = 0; i < n_chunks; i++) { @@ -267,7 +267,7 @@ int main(int argc, char ** argv) { batch_add_seq(query_batch, query_tokens, 0); std::vector query_emb(n_embd, 0); - batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd); + batch_encode(ctx, query_batch, query_emb.data(), 1, n_embd); common_batch_clear(query_batch); From c76532e7ba128bb097bf6836bf0f5592e1b56b76 Mon Sep 17 00:00:00 2001 From: antichristHater <142441588+antichristHater@users.noreply.github.com> Date: Wed, 21 May 2025 19:40:35 +0300 Subject: [PATCH 08/18] convert : add qwen2vl support for unsloth merges (#13686) --- convert_hf_to_gguf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 15e019a10..e88076ccc 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2645,7 +2645,7 @@ class Qwen2Model(TextModel): yield from super().modify_tensors(data_torch, name, bid) 
-@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") +@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") class Qwen2VLModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2VL @@ -2669,7 +2669,7 @@ class Qwen2VLModel(TextModel): return [(self.map_tensor_name(name), data_torch)] -@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") +@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") class Qwen2VLVisionModel(VisionModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) From 5fbfe384d4659f81c47a477eb8ee97692c7ffef9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 21 May 2025 19:46:56 +0300 Subject: [PATCH 09/18] server : improve error reporting (#13680) --- tools/server/server.cpp | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 087665e41..7424da523 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3366,14 +3366,29 @@ struct server_context { metrics.on_decoded(slots); if (ret != 0) { - if (n_batch == 1 || ret < 0) { - // if you get here, it means the KV cache is full - try increasing it via the context size - SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); - for (auto & slot : slots) { - slot.release(); - send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size."); + { + std::string err; + + if (n_batch == 1 && ret == 1) { + err = "Context size has been exceeded."; + } + + if (ret == -1) { + err = "Invalid input batch."; + } + + if (ret < -1) { + err = "Compute error."; + } + + if (!err.empty()) { + SRV_ERR("%s, i = %d, n_batch = %d, ret = %d\n", err.c_str(), i, n_batch, ret); + for (auto & slot : slots) { + slot.release(); + send_error(slot, err); + } + break; } - break; // break loop of n_batch } // retry with half the batch size to try to find a free slot in the KV cache From 8e186ef0e764c7a620e402d1f76ebad60bf31c49 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 21 May 2025 20:00:49 +0300 Subject: [PATCH 10/18] hparams : support models for which all layers use SWA (#13682) ggml-ci --- src/llama-hparams.cpp | 2 +- src/llama-hparams.h | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 90dfe7a7f..4f84e56b3 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -72,7 +72,7 @@ uint32_t llama_hparams::n_embd_v_s() const { bool llama_hparams::is_swa(uint32_t il) const { if (il < n_layer) { - return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1); + return n_swa_pattern == 0 || (il % n_swa_pattern < (n_swa_pattern - 1)); } GGML_ABORT("fatal error"); diff --git a/src/llama-hparams.h b/src/llama-hparams.h index f865cbaea..5222eedcf 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -104,7 +104,18 @@ struct llama_hparams { llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; uint32_t n_swa = 0; // the size of the sliding window (0 - no SWA) - uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention + uint32_t n_swa_pattern = 1; // this value n means that every nth layer is dense (i.e. 
non-SWA) + // by default n == 1, all layers are dense + // note that if n_swa_pattern == 0, all layers are SWA + // example: n_swa_pattern = 3 + // il == 0: swa + // il == 1: swa + // il == 2: dense + // il == 3: swa + // il == 4: swa + // il == 5: dense + // il == 6: swa + // etc ... // for State Space Models uint32_t ssm_d_conv = 0; From d643bb2c798df9c2cd61067d2692b1cd417df402 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Wed, 21 May 2025 13:09:57 -0700 Subject: [PATCH 11/18] releases : build CPU backend separately (windows) (#13642) --- .github/workflows/release.yml | 267 ++++++++++++++++++---------------- 1 file changed, 142 insertions(+), 125 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ed827bf70..494ea5294 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,4 +1,4 @@ -name: Create Release +name: Release on: workflow_dispatch: # allows manual triggering @@ -227,6 +227,66 @@ jobs: path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip name: llama-bin-ubuntu-vulkan-x64.zip + windows-cpu: + runs-on: windows-latest + + strategy: + matrix: + include: + - arch: 'x64' + - arch: 'arm64' + + steps: + - name: Clone + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: ccache + uses: hendrikmuhs/ccache-action@v1.2.16 + with: + key: windows-latest-cmake-cpu-${{ matrix.arch }} + variant: ccache + evict-old-files: 1d + + - name: Install Ninja + run: | + choco install ninja + + - name: libCURL + id: get_libcurl + uses: ./.github/actions/windows-setup-curl + with: + architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }} + + - name: Build + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} + run: | + cmake -S . -B build -G "Ninja Multi-Config" ` + -D CMAKE_TOOLCHAIN_FILE=cmake/${{ matrix.arch }}-windows-llvm.cmake ` + -DGGML_NATIVE=OFF ` + -DGGML_BACKEND_DL=ON ` + -DGGML_CPU_ALL_VARIANTS=ON ` + -DGGML_OPENMP=OFF ` + -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" ` + ${{ env.CMAKE_ARGS }} + cmake --build build --config Release + + - name: Pack artifacts + id: pack_artifacts + env: + CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} + run: | + Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\ + 7z a llama-bin-win-cpu-${{ matrix.arch }}.zip .\build\bin\Release\* + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + path: llama-bin-win-cpu-${{ matrix.arch }}.zip + name: llama-bin-win-cpu-${{ matrix.arch }}.zip + windows: runs-on: windows-latest @@ -237,52 +297,30 @@ jobs: strategy: matrix: include: - - build: 'cpu-x64' + - backend: 'vulkan' arch: 'x64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF' - #- build: 'openblas-x64' - # arch: 'x64' - # defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - - build: 'vulkan-x64' - arch: 'x64' - defines: '-DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON' - - build: 'cpu-arm64' - arch: 'arm64' - defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF' - 
- build: 'opencl-adreno-arm64' + defines: '-DGGML_VULKAN=ON' + target: 'ggml-vulkan' + - backend: 'opencl-adreno' arch: 'arm64' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' + target: 'ggml-opencl' steps: - name: Clone id: checkout uses: actions/checkout@v4 - with: - fetch-depth: 0 - name: ccache uses: hendrikmuhs/ccache-action@v1.2.16 with: - key: windows-latest-cmake-${{ matrix.build }} + key: windows-latest-cmake-${{ matrix.backend }}-${{ matrix.arch }} variant: ccache evict-old-files: 1d - - name: Download OpenBLAS - id: get_openblas - if: ${{ matrix.build == 'openblas-x64' }} - run: | - curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip" - curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE" - mkdir $env:RUNNER_TEMP/openblas - tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas - $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath) - $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim())) - $lib = $(join-path $msvc 'bin\Hostx64\x64\lib.exe') - & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll - - name: Install Vulkan SDK id: get_vulkan - if: ${{ matrix.build == 'vulkan-x64' }} + if: ${{ matrix.backend == 'vulkan' }} run: | curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe" & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install @@ -296,7 +334,7 @@ jobs: - name: Install OpenCL Headers and Libs id: install_opencl - if: ${{ matrix.build == 'opencl-adreno-arm64' }} + if: ${{ matrix.backend == 'opencl-adreno' && matrix.arch == 'arm64' }} run: | git clone https://github.com/KhronosGroup/OpenCL-Headers cd OpenCL-Headers @@ -314,46 +352,22 @@ jobs: -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" cmake --build build-arm64-release --target install --config release - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - with: - architecture: ${{ matrix.arch == 'x64' && 'win64' || 'win64a' }} - - name: Build id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | - cmake -S . -B build ${{ matrix.defines }} ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" ` - ${{ env.CMAKE_ARGS }} - cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} - - - name: Add libopenblas.dll - id: add_libopenblas_dll - if: ${{ matrix.build == 'openblas-x64' }} - run: | - cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll - cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name + cmake -S . 
-B build ${{ matrix.defines }} -DGGML_NATIVE=OFF -DGGML_CPU=OFF -DGGML_BACKEND_DL=ON -DLLAMA_CURL=OFF + cmake --build build --config Release --target ${{ matrix.target }} - name: Pack artifacts id: pack_artifacts - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | - Copy-Item $env:CURL_PATH\bin\libcurl-${{ matrix.arch }}.dll .\build\bin\Release\ - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\* + 7z a llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip .\build\bin\Release\${{ matrix.target }}.dll - name: Upload artifacts uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip - name: llama-bin-win-${{ matrix.build }}.zip + path: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip + name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip windows-cuda: runs-on: windows-2019 @@ -366,8 +380,6 @@ jobs: - name: Clone id: checkout uses: actions/checkout@v4 - with: - fetch-depth: 0 - name: Install ccache uses: hendrikmuhs/ccache-action@v1.2.16 @@ -386,45 +398,30 @@ jobs: run: | choco install ninja - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - name: Build id: cmake_build shell: cmd - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" cmake -S . -B build -G "Ninja Multi-Config" ^ - -DGGML_NATIVE=OFF ^ -DGGML_BACKEND_DL=ON ^ - -DGGML_CPU_ALL_VARIANTS=ON ^ + -DGGML_NATIVE=OFF ^ + -DGGML_CPU=OFF ^ -DGGML_CUDA=ON ^ - -DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include" ^ - ${{ env.CMAKE_ARGS }} + -DLLAMA_CURL=OFF set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 - cmake --build build --config Release -j %NINJA_JOBS% -t ggml - cmake --build build --config Release - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name + cmake --build build --config Release -j %NINJA_JOBS% --target ggml-cuda - name: Pack artifacts id: pack_artifacts - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | - cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip .\build\bin\Release\* + 7z a llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip .\build\bin\Release\ggml-cuda.dll - name: Upload artifacts uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip - name: llama-bin-win-cuda${{ matrix.cuda }}-x64.zip + path: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip + name: llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip - name: Copy and pack Cuda runtime run: | @@ -432,13 +429,13 @@ jobs: $dst='.\build\bin\cudart\' robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll - 7z a cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip $dst\* + 7z a cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip $dst\* - name: Upload Cuda runtime uses: actions/upload-artifact@v4 with: - path: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip - name: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip + path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip + name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip windows-sycl: runs-on: windows-latest @@ -455,8 +452,6 @@ jobs: - name: Clone id: checkout 
uses: actions/checkout@v4 - with: - fetch-depth: 0 - name: ccache uses: hendrikmuhs/ccache-action@v1.2.16 @@ -469,15 +464,18 @@ jobs: run: | scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL - # TODO: add libcurl support ; we will also need to modify win-build-sycl.bat to accept user-specified args - - name: Build id: cmake_build - run: examples/sycl/win-build-sycl.bat - - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name + shell: cmd + run: | + call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force + cmake -G "Ninja" -B build ^ + -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^ + -DGGML_CPU=OFF -DGGML_SYCL=ON ^ + -DLLAMA_CURL=OFF + cmake --build build --target ggml-sycl -j - name: Build the release package id: pack_artifacts @@ -502,12 +500,12 @@ jobs: cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin echo "cp oneAPI running time dll files to ./build/bin done" - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* + 7z a llama-bin-win-sycl-x64.zip ./build/bin/* - name: Upload the release package uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip + path: llama-bin-win-sycl-x64.zip name: llama-bin-win-sycl-x64.zip windows-hip: @@ -521,8 +519,6 @@ jobs: - name: Clone id: checkout uses: actions/checkout@v4 - with: - fetch-depth: 0 - name: Clone rocWMMA repository id: clone_rocwmma @@ -532,7 +528,7 @@ jobs: - name: ccache uses: hendrikmuhs/ccache-action@v1.2.16 with: - key: windows-latest-cmake-hip-release + key: windows-latest-cmake-hip-${{ matrix.gpu_target }}-x64 evict-old-files: 1d - name: Install @@ -550,14 +546,8 @@ jobs: run: | & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version - - name: libCURL - id: get_libcurl - uses: ./.github/actions/windows-setup-curl - - name: Build id: cmake_build - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" @@ -569,31 +559,23 @@ jobs: -DAMDGPU_TARGETS=${{ matrix.gpu_target }} ` -DGGML_HIP_ROCWMMA_FATTN=ON ` -DGGML_HIP=ON ` - -DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include" ` - ${{ env.CMAKE_ARGS }} - cmake --build build -j ${env:NUMBER_OF_PROCESSORS} + -DLLAMA_CURL=OFF + cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS} md "build\bin\rocblas\library\" cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" - - name: Determine tag name - id: tag - uses: ./.github/actions/get-tag-name - - name: Pack artifacts id: pack_artifacts - env: - CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }} run: | - cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\libcurl-x64.dll - 7z a llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip .\build\bin\* + 7z a llama-bin-win-hip-${{ matrix.gpu_target }}-x64.zip .\build\bin\* - name: Upload artifacts uses: actions/upload-artifact@v4 with: - path: llama-${{ steps.tag.outputs.name }}-bin-win-hip-x64-${{ matrix.gpu_target }}.zip - name: llama-bin-win-hip-x64-${{ matrix.gpu_target }}.zip + path: llama-bin-win-hip-${{ matrix.gpu_target }}-x64.zip + name: llama-bin-win-hip-${{ matrix.gpu_target }}-x64.zip ios-xcode-build: runs-on: 
macos-latest @@ -655,14 +637,16 @@ jobs: runs-on: ubuntu-latest needs: - - ubuntu-22-cpu - - ubuntu-22-vulkan - windows + - windows-cpu - windows-cuda - windows-sycl - windows-hip + - ubuntu-22-cpu + - ubuntu-22-vulkan - macOS-arm64 - macOS-x64 + - ios-xcode-build steps: - name: Clone @@ -680,10 +664,43 @@ jobs: uses: actions/download-artifact@v4 with: path: ./artifact + merge-multiple: true - name: Move artifacts id: move_artifacts - run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release + run: | + mkdir -p release + + echo "Adding CPU backend files to existing zips..." + for arch in x64 arm64; do + cpu_zip="artifact/llama-bin-win-cpu-${arch}.zip" + temp_dir=$(mktemp -d) + echo "Extracting CPU backend for $arch..." + unzip "$cpu_zip" -d "$temp_dir" + + echo "Adding CPU files to $arch zips..." + for target_zip in artifact/llama-bin-win-*-${arch}.zip; do + if [[ "$target_zip" == "$cpu_zip" ]]; then + continue + fi + echo "Adding CPU backend to $(basename "$target_zip")" + realpath_target_zip=$(realpath "$target_zip") + (cd "$temp_dir" && zip -r "$realpath_target_zip" .) + done + + rm -rf "$temp_dir" + done + + echo "Renaming and moving zips to release..." + for zip_file in artifact/llama-bin-win-*.zip; do + base_name=$(basename "$zip_file" .zip) + zip_name="llama-${{ steps.tag.outputs.name }}-${base_name#llama-}.zip" + echo "Moving $zip_file to release/$zip_name" + mv "$zip_file" "release/$zip_name" + done + + echo "Moving other artifacts..." + mv -v artifact/*.zip release - name: Create release id: create_release @@ -702,7 +719,7 @@ jobs: const path = require('path'); const fs = require('fs'); const release_id = '${{ steps.create_release.outputs.id }}'; - for (let file of await fs.readdirSync('./artifact/release')) { + for (let file of await fs.readdirSync('./release')) { if (path.extname(file) === '.zip') { console.log('uploadReleaseAsset', file); await github.repos.uploadReleaseAsset({ @@ -710,7 +727,7 @@ jobs: repo: context.repo.repo, release_id: release_id, name: file, - data: await fs.readFileSync(`./artifact/release/${file}`) + data: await fs.readFileSync(`./release/${file}`) }); } } From edbf42edfdabb9cea72ae12137570cf48f5d8076 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henry=20Linjam=C3=A4ki?= Date: Wed, 21 May 2025 23:21:17 +0300 Subject: [PATCH 12/18] opencl: fix couple crashes (#12795) * opencl: fix couple crashes * fix kernel launches failed on devices which do not support non-uniform work-groups. When non-uniform work-groups are not supported, set `local_work_size` to NULL (= let driver choose the work-group sizes). This patch does not cover everything - just the cases tested by test-backend-ops. * fix sub-buffer creation failed due to `cl_buffer_region::origin` not being aligned to `CL_DEVICE_MEM_BASE_ADDR_ALIGN`. 
* OpenCL: query non-uniform WG sizes only on OpenCL 3.0+ --- ggml/src/ggml-opencl/ggml-opencl.cpp | 102 +++++++++++++++++++++------ 1 file changed, 79 insertions(+), 23 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 586946048..3b8313761 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -74,6 +74,7 @@ struct ggml_cl_version { cl_uint minor = 0; }; + struct ggml_cl_compiler_version { ADRENO_CL_COMPILER_TYPE type; int major = -1; @@ -91,6 +92,14 @@ struct ggml_cl_compiler_version { } }; +static size_t align_to(size_t value, size_t to_alignment) { + GGML_ASSERT(to_alignment && "Invalid alignment (must be non-zero)"); + GGML_ASSERT((to_alignment & (to_alignment - 1)) == 0 && "to_alignment must be power-of-two"); + + return ((value + to_alignment - 1) / to_alignment) * to_alignment; +} + + // Parses a version string of form "XX.YY ". On an error returns ggml_cl_version with all zeroes. static ggml_cl_version parse_cl_version(std::string_view str) { size_t major_str_begin = 0; @@ -248,6 +257,8 @@ struct ggml_backend_opencl_context { int adreno_wave_size; + cl_bool non_uniform_workgroups; + cl_context context; cl_command_queue queue; @@ -1397,6 +1408,15 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n", svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false"); + if (opencl_c_version.major >= 3) { + CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool), + &backend_ctx->non_uniform_workgroups, 0)); + } else { + GGML_ASSERT(opencl_c_version.major == 2); + // Non-uniform workgroup sizes is mandatory feature in v2.x. + backend_ctx->non_uniform_workgroups = true; + } + // Print out configurations #ifdef GGML_OPENCL_SOA_Q GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n"); @@ -2058,15 +2078,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, // The original tensor memory is divided into scales and quants, i.e., // we first store scales, then quants. // Create subbuffer for scales. - region.origin = extra_orig->offset + tensor->view_offs + offset; + region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment); region.size = size_d; extra->d = clCreateSubBuffer( extra_orig->data_device, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); CL_CHECK(err); + auto previous_origin = region.origin; // Create subbuffer for quants. - region.origin = extra_orig->offset + tensor->view_offs + offset + size_d; + region.origin = align_to(previous_origin + size_d, backend_ctx->alignment); region.size = size_q; extra->q = clCreateSubBuffer( extra_orig->data_device, CL_MEM_READ_WRITE, @@ -2942,14 +2963,19 @@ static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; + size_t * local_work_size_ptr = local_work_size; + if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. 
+ } + #ifdef GGML_OPENCL_PROFILING cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); #endif } else { unsigned int nth = MIN(64, ne0); @@ -3077,14 +3103,19 @@ static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; + size_t * local_work_size_ptr = local_work_size; + if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. + } + #ifdef GGML_OPENCL_PROFILING cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); #endif } else { unsigned int nth = MIN(64, ne0); @@ -3233,14 +3264,19 @@ static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; + size_t * local_work_size_ptr = local_work_size; + if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. + } + #ifdef GGML_OPENCL_PROFILING cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); #endif } @@ -3273,14 +3309,19 @@ static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; + size_t * local_work_size_ptr = local_work_size; + if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. 
+ } + #ifdef GGML_OPENCL_PROFILING cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); #endif } @@ -3320,14 +3361,19 @@ static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, cons size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; + size_t * local_work_size_ptr = local_work_size; + if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. + } + #ifdef GGML_OPENCL_PROFILING cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); #endif } @@ -4230,14 +4276,19 @@ static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, cons size_t global_work_size[] = {(size_t)n, 1, 1}; size_t local_work_size[] = {64, 1, 1}; + size_t * local_work_size_ptr = local_work_size; + if (n % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. + } + #ifdef GGML_OPENCL_PROFILING cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); #endif } @@ -4418,14 +4469,19 @@ static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * sr size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02}; size_t local_work_size[] = {64, 1, 1}; + size_t * local_work_size_ptr = local_work_size; + if (ne00 % 64 != 0 && !backend_ctx->non_uniform_workgroups) { + local_work_size_ptr = nullptr; // Let driver choose the work-group sizes. 
+ } + #ifdef GGML_OPENCL_PROFILING cl_event evt; - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, &evt)); g_profiling_info.emplace_back(); - populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); + populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size_ptr, dst); #else - CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size_ptr, 0, NULL, NULL)); #endif } } From a4e8912dfd4604be1e39bc86ba4c0b02969967ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henry=20Linjam=C3=A4ki?= Date: Thu, 22 May 2025 02:21:45 +0300 Subject: [PATCH 13/18] opencl: Add support for multiple devices (#12622) * opencl: Add support for multiple devices ... but limited to one platform. A platform with a GPU will be preferred. Additionally: * Filter out devices that lack capabilities needed by the backend implementation (half support, OpenCL 2.0+, etc). * Make ggml_backend_opencl_reg() thread-safe. * fixup: fix an error in sync_with_other_backends ... when there is only one OpenCL device available. --- ggml/src/ggml-opencl/ggml-opencl.cpp | 370 +++++++++++++++++---------- 1 file changed, 237 insertions(+), 133 deletions(-) diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index 3b8313761..d5412069e 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #undef MIN #undef MAX @@ -230,13 +231,25 @@ static ggml_cl_compiler_version get_adreno_cl_compiler_version(const char *drive return { type, major, minor, patch }; } +struct ggml_backend_opencl_context; + // backend device context struct ggml_backend_opencl_device_context { cl_platform_id platform; std::string platform_name; - cl_device_id device; - std::string device_name; + cl_device_id device; + std::string device_name; + cl_device_type device_type; + std::string device_version; + + // Initialized by ggml_cl2_init(). + ggml_backend_opencl_context * backend_ctx = nullptr; + + // Initialized by ggml_backend_opencl_device_get_buffer_type() + ggml_backend_buffer_type buffer_type; + + cl_context context = nullptr; }; // backend context @@ -355,15 +368,8 @@ struct ggml_backend_opencl_context { #endif // GGML_OPENCL_USE_ADRENO_KERNELS }; -static ggml_backend_device g_ggml_backend_opencl_device; -static ggml_backend_opencl_device_context g_ggml_ctx_dev_main { - /*.platform =*/ nullptr, - /*.platform_nane =*/ "", - /*.device =*/ nullptr, - /*.device_name =*/ "", -}; - -static int ggml_backend_opencl_n_devices = 0; +// All registered devices with a default device in the front. 
+static std::vector g_ggml_backend_opencl_devices; // Profiling #ifdef GGML_OPENCL_PROFILING @@ -1118,25 +1124,19 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve GGML_LOG_CONT("\n"); } -static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { - static bool initialized = false; - static ggml_backend_opencl_context *backend_ctx = nullptr; +// XXX static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { +// XXX static bool initialized = false; +// XXX static ggml_backend_opencl_context *backend_ctx = nullptr; - if (initialized) { - return backend_ctx; - } +static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev); - ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *)dev->context; - GGML_ASSERT(dev_ctx); - GGML_ASSERT(dev_ctx->platform == nullptr); - GGML_ASSERT(dev_ctx->device == nullptr); - GGML_ASSERT(backend_ctx == nullptr); +namespace /* anonymous */ { +extern struct ggml_backend_device_i ggml_backend_opencl_device_i; +} - initialized = true; - backend_ctx = new ggml_backend_opencl_context(); - backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; - - cl_int err; +// Look for available and suitable devices. +static std::vector ggml_opencl_probe_devices(ggml_backend_reg * reg) { + std::vector found_devices; #ifdef GGML_OPENCL_PROFILING GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n"); @@ -1169,11 +1169,12 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { struct cl_device devices[NDEV]; unsigned n_devices = 0; struct cl_device * default_device = NULL; + unsigned default_platform_number = 0; cl_platform_id platform_ids[NPLAT]; if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) { GGML_LOG_ERROR("ggml_opencl: plaform IDs not available.\n"); - return backend_ctx; + return found_devices; } for (unsigned i = 0; i < n_platforms; i++) { @@ -1208,19 +1209,22 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { } if (default_device == NULL && p->default_device != NULL) { - default_device = p->default_device; + default_device = p->default_device; + default_platform_number = i; } } if (n_devices == 0) { GGML_LOG_ERROR("ggml_opencl: could find any OpenCL devices.\n"); - return backend_ctx; + return found_devices; } - char * user_platform_string = getenv("GGML_OPENCL_PLATFORM"); - char * user_device_string = getenv("GGML_OPENCL_DEVICE"); - int user_platform_number = -1; - int user_device_number = -1; + char * user_platform_string = getenv("GGML_OPENCL_PLATFORM"); + char * user_device_string = getenv("GGML_OPENCL_DEVICE"); + int user_platform_number = -1; + int user_device_number = -1; + cl_device * candidate_devices = nullptr; + unsigned n_candidate_devices = 0; unsigned n; if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) { @@ -1235,12 +1239,11 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { GGML_LOG_ERROR("ggml_opencl: invalid device number %d\n", user_device_number); exit(1); } - default_device = &platform->devices[user_device_number]; + default_device = &platform->devices[user_device_number]; + candidate_devices = platform->devices; + n_candidate_devices = platform->n_devices; } else { - - struct cl_device * selected_devices = devices; - unsigned n_selected_devices = n_devices; - + // Choose a platform by matching a substring. 
if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) { for (unsigned i = 0; i < n_platforms; i++) { struct cl_platform * p = &platforms[i]; @@ -1255,20 +1258,20 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { exit(1); } } - if (user_platform_number != -1) { - struct cl_platform * p = &platforms[user_platform_number]; - selected_devices = p->devices; - n_selected_devices = p->n_devices; - default_device = p->default_device; - if (n_selected_devices == 0) { - GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name); - exit(1); - } + + int platform_idx = user_platform_number != -1 ? user_platform_number : default_platform_number; + struct cl_platform * p = &platforms[platform_idx]; + candidate_devices = p->devices; + n_candidate_devices = p->n_devices; + default_device = p->default_device; + if (n_candidate_devices == 0) { + GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name); + exit(1); } if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) { - for (unsigned i = 0; i < n_selected_devices; i++) { - struct cl_device * d = &selected_devices[i]; + for (unsigned i = 0; i < n_candidate_devices; i++) { + struct cl_device * d = &candidate_devices[i]; if (strstr(d->name, user_device_string) != NULL) { user_device_number = d->number; break; @@ -1280,71 +1283,145 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { } } if (user_device_number != -1) { - selected_devices = &devices[user_device_number]; - n_selected_devices = 1; - default_device = &selected_devices[0]; + candidate_devices = &devices[user_device_number]; + n_candidate_devices = 1; + default_device = &candidate_devices[0]; } - GGML_ASSERT(n_selected_devices > 0); + GGML_ASSERT(n_candidate_devices > 0); if (default_device == NULL) { - default_device = &selected_devices[0]; + default_device = &candidate_devices[0]; } } - GGML_LOG_INFO("ggml_opencl: selecting platform: '%s'\n", default_device->platform->name); - GGML_LOG_INFO("ggml_opencl: selecting device: '%s (%s)'\n", default_device->name, default_device->version); - if (default_device->type != CL_DEVICE_TYPE_GPU) { - GGML_LOG_WARN("ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name); + GGML_ASSERT(n_candidate_devices != 0 && candidate_devices); + + // Put the default device in front. 
+ for (unsigned i = 1; i < n_candidate_devices; i++) { + if (&candidate_devices[i] == default_device) { + std::swap(candidate_devices[0], candidate_devices[i]); + default_device = &candidate_devices[0]; + break; + } } - dev_ctx->platform = default_device->platform->id; - dev_ctx->device = default_device->id; - backend_ctx->device = default_device->id; + GGML_LOG_INFO("ggml_opencl: selected platform: '%s'\n", default_device->platform->name); - if (strstr(default_device->name, "Adreno") || - strstr(default_device->name, "Qualcomm") || - strstr(default_device->version, "Adreno")) { + std::vector device_ids; + for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) { + device_ids.push_back(dev->id); + } + + cl_int err; + cl_context shared_context; + cl_context_properties properties[] = { (intptr_t) CL_CONTEXT_PLATFORM, (intptr_t) default_device->platform->id, 0 }; + + CL_CHECK( + (shared_context = clCreateContext(properties, device_ids.size(), device_ids.data(), NULL, NULL, &err), err)); + + for (auto dev = candidate_devices, dev_end = candidate_devices + n_candidate_devices; dev != dev_end; dev++) { + GGML_LOG_INFO("\nggml_opencl: device: '%s (%s)'\n", dev->name, dev->version); + + auto dev_ctx = std::unique_ptr(new ggml_backend_opencl_device_context{ + /*.platform =*/dev->platform->id, + /*.platform_nane =*/dev->platform->name, + /*.device =*/dev->id, + /*.device_name =*/dev->name, + /*.device_type =*/dev->type, + /*.device_version =*/dev->version, + /*.backend_ctx =*/nullptr, + /*.buffer_type =*/{}, + /*.context =*/shared_context, + }); + + found_devices.push_back(ggml_backend_device{ + /* .iface = */ ggml_backend_opencl_device_i, + /* .reg = */ reg, + /* .context = */ dev_ctx.get(), + }); + + if (!ggml_cl2_init(&found_devices.back())) { + found_devices.pop_back(); + GGML_LOG_INFO("ggml_opencl: drop unsupported device.\n"); + continue; + } + + dev_ctx.release(); + } + + if (found_devices.size()) { + auto * dev_ctx = static_cast(found_devices.front().context); + GGML_LOG_INFO("ggml_opencl: default device: '%s (%s)'\n", dev_ctx->device_name.c_str(), + dev_ctx->device_version.c_str()); + + if (dev_ctx->device_type != CL_DEVICE_TYPE_GPU) { + GGML_LOG_WARN("ggml_opencl: warning, the default device is not a GPU: '%s'.\n", + dev_ctx->device_name.c_str()); + } + } + + return found_devices; +} + +// Initialize device if it is supported (returns nullptr if it is not). 
+static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { + GGML_ASSERT(dev); + GGML_ASSERT(dev->context); + + ggml_backend_opencl_device_context * dev_ctx = (ggml_backend_opencl_device_context *) dev->context; + GGML_ASSERT(dev_ctx->platform); + GGML_ASSERT(dev_ctx->device); + + if (dev_ctx->backend_ctx) { + return dev_ctx->backend_ctx; + } + + auto backend_ctx = std::make_unique(); + backend_ctx->device = dev_ctx->device; + backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; + + if (strstr(dev_ctx->device_name.c_str(), "Adreno") || + strstr(dev_ctx->device_name.c_str(), "Qualcomm") || + strstr(dev_ctx->device_version.c_str(), "Adreno")) { backend_ctx->gpu_family = GPU_FAMILY::ADRENO; // Usually device version contains the detailed device name - backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->version); + backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_version.c_str()); if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::ADRENO_UNKNOWN) { - backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name); + backend_ctx->adreno_gen = get_adreno_gpu_gen(dev_ctx->device_name.c_str()); } // Use wave size of 64 for all Adreno GPUs. backend_ctx->adreno_wave_size = 64; - } else if (strstr(default_device->name, "Intel")) { + } else if (strstr(dev_ctx->device_name.c_str(), "Intel")) { backend_ctx->gpu_family = GPU_FAMILY::INTEL; } else { - GGML_LOG_ERROR("Unsupported GPU: %s\n", default_device->name); + GGML_LOG_ERROR("Unsupported GPU: %s\n", dev_ctx->device_name.c_str()); backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; - return backend_ctx; + return nullptr; } #ifdef GGML_OPENCL_USE_ADRENO_KERNELS if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) { GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; " "run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n"); - return backend_ctx; + return nullptr; } #endif // Populate backend device name - dev_ctx->platform_name = default_device->platform->name; - dev_ctx->device_name = default_device->name; - backend_ctx->device_name = default_device->name; + backend_ctx->device_name = dev_ctx->device_name; // A local ref of cl_device_id for convenience cl_device_id device = backend_ctx->device; - ggml_cl_version platform_version = get_opencl_platform_version(default_device->platform->id); + ggml_cl_version platform_version = get_opencl_platform_version(dev_ctx->platform); // Check device OpenCL version, OpenCL 2.0 or above is required ggml_cl_version opencl_c_version = get_opencl_c_version(platform_version, device); if (opencl_c_version.major < 2) { GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n"); - return backend_ctx; + return nullptr; } // Check driver version @@ -1375,7 +1452,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { // fp16 is required if (!backend_ctx->fp16_support) { GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n"); - return backend_ctx; + return nullptr; } // If OpenCL 3.0 is supported, then check for cl_khr_subgroups, which becomes @@ -1384,7 +1461,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { strstr(ext_buffer, "cl_intel_subgroups") == NULL) { GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) " "(note that subgroups is an optional feature in OpenCL 3.0)\n"); - return backend_ctx; + return nullptr; } cl_uint base_align_in_bits; @@ -1426,14 +1503,10 @@ static 
ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n"); #endif // GGML_OPENCL_USE_ADRENO_KERNELS - cl_context_properties properties[] = { - (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)dev_ctx->platform, 0 - }; - - CL_CHECK((backend_ctx->context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err)); + cl_int err; // A local ref of cl_context for convenience - cl_context context = backend_ctx->context; + cl_context context = backend_ctx->context = dev_ctx->context; //CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err), // (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err : @@ -1446,7 +1519,7 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err)); // Load kernels - load_cl_kernels(backend_ctx, opencl_c_version); + load_cl_kernels(backend_ctx.get(), opencl_c_version); #ifdef GGML_OPENCL_USE_ADRENO_KERNELS // Allocate intermediate buffers and images @@ -1476,10 +1549,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err)); #endif // GGML_OPENCL_USE_ADRENO_KERNELS - // For now we support a single devices - ggml_backend_opencl_n_devices = 1; - - return backend_ctx; + dev_ctx->backend_ctx = backend_ctx.release(); + return dev_ctx->backend_ctx; } static void ggml_cl2_free(void) { @@ -1684,10 +1755,46 @@ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) { GGML_UNUSED(backend); } +// Syncronizes the 'backend_ctx's device with others so that commands +// enqueued to it won't start until commands in the other devices have +// completed. +static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) { + if (g_ggml_backend_opencl_devices.size() < 2) + return; // No other devices to synchronize with. + + std::vector events; + events.reserve(g_ggml_backend_opencl_devices.size()); + + for (ggml_backend_device & backend_dev : g_ggml_backend_opencl_devices) { + auto * other_backend_ctx = ggml_cl2_init(&backend_dev); + if (backend_ctx != other_backend_ctx) { + cl_event ev; + CL_CHECK(clEnqueueMarkerWithWaitList(other_backend_ctx->queue, 0, nullptr, &ev)); + CL_CHECK(clFlush(other_backend_ctx->queue)); + events.push_back(ev); + } + } + + CL_CHECK(clEnqueueBarrierWithWaitList(backend_ctx->queue, events.size(), events.data(), nullptr)); + for (auto ev : events) { + CL_CHECK(clReleaseEvent(ev)); + } +} + +static void sync_with_other_backends(ggml_backend_t backend) { + auto * backend_ctx = static_cast(backend->context); + sync_with_other_backends(backend_ctx); +} + static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; + // NOTE: this may oversynchronize by synchronizing with + // backends/devices which don't compute 'cgraph's + // dependencies. 
+ sync_with_other_backends(backend); + if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } @@ -2292,8 +2399,8 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, cl_context context = backend_ctx->context; cl_command_queue queue = backend_ctx->queue; - // Make sure all previously submitted commands are finished. - CL_CHECK(clFinish(queue)); + // Make sure all previously submitted commands in other devices are finished. + sync_with_other_backends(backend_ctx); #ifdef GGML_OPENCL_SOA_Q // In end-to-end runs, get_tensor is usually used to get back the logits, @@ -2397,13 +2504,8 @@ static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_b } static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) { - // FIXME: not thread safe, device may not be initialized yet - static cl_uint alignment = -1; - if (alignment == (cl_uint)-1) { - ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device); - alignment = backend_ctx->alignment; - } - return alignment; + ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device); + return backend_ctx->alignment; } static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) { @@ -2430,16 +2532,6 @@ static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = { /* .is_host = */ NULL, }; -ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() { - static ggml_backend_buffer_type buffer_type = { - /* .iface = */ ggml_backend_opencl_buffer_type_interface, - /* .device = */ &g_ggml_backend_opencl_device, - /* .context = */ nullptr, - }; - - return &buffer_type; -} - // // backend device // @@ -2497,9 +2589,15 @@ static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, co } static ggml_backend_buffer_type_t ggml_backend_opencl_device_get_buffer_type(ggml_backend_dev_t dev) { - return ggml_backend_opencl_buffer_type(); + auto * dev_ctx = static_cast(dev->context); - GGML_UNUSED(dev); + dev_ctx->buffer_type = ggml_backend_buffer_type{ + /* .iface = */ ggml_backend_opencl_buffer_type_interface, + /* .device = */ dev, + /* .context = */ nullptr, + }; + + return &dev_ctx->buffer_type; } static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { @@ -2515,12 +2613,21 @@ static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const } static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return buft->iface.get_name == ggml_backend_opencl_buffer_type_get_name; + // Check 'dev' and 'buffer_type' are not objects belonging to this backend. + if (dev->iface.get_name != ggml_backend_opencl_device_get_name || + buft->iface.get_name != ggml_backend_opencl_buffer_type_get_name) { + return false; + } - GGML_UNUSED(dev); + // Check cl_context is the same. clEnqueue* commands may not use + // buffers from another cl_context. 
+ ggml_backend_opencl_context * backend_ctx0 = ggml_cl2_init(dev); + ggml_backend_opencl_context * backend_ctx1 = ggml_cl2_init(buft->device); + return backend_ctx0->context == backend_ctx1->context; } -static struct ggml_backend_device_i ggml_backend_opencl_device_i = { +namespace /* anonymous */ { +struct ggml_backend_device_i ggml_backend_opencl_device_i = { /* .get_name = */ ggml_backend_opencl_device_get_name, /* .get_description = */ ggml_backend_opencl_device_get_description, /* .get_memory = */ ggml_backend_opencl_device_get_memory, @@ -2537,6 +2644,7 @@ static struct ggml_backend_device_i ggml_backend_opencl_device_i = { /* .event_free = */ NULL, /* .event_synchronize = */ NULL, }; +} // Backend registry @@ -2547,15 +2655,15 @@ static const char * ggml_backend_opencl_reg_get_name(ggml_backend_reg_t reg) { } static size_t ggml_backend_opencl_reg_device_count(ggml_backend_reg_t reg) { - return ggml_backend_opencl_n_devices; + return g_ggml_backend_opencl_devices.size(); GGML_UNUSED(reg); } static ggml_backend_dev_t ggml_backend_opencl_reg_device_get(ggml_backend_reg_t reg, size_t index) { - GGML_ASSERT(index == 0); + GGML_ASSERT(index < ggml_backend_opencl_reg_device_count(reg)); - return &g_ggml_backend_opencl_device; + return &g_ggml_backend_opencl_devices[index]; GGML_UNUSED(reg); GGML_UNUSED(index); @@ -2569,27 +2677,23 @@ static struct ggml_backend_reg_i ggml_backend_opencl_reg_i = { }; ggml_backend_reg_t ggml_backend_opencl_reg(void) { - // TODO: make this thread-safe somehow? + static std::mutex mutex; static ggml_backend_reg reg; static bool initialized = false; + std::lock_guard lock(mutex); - if (!initialized) { - reg = ggml_backend_reg { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_opencl_reg_i, - /* .context = */ NULL, - }; - - g_ggml_backend_opencl_device = ggml_backend_device { - /* .iface = */ ggml_backend_opencl_device_i, - /* .reg = */ ®, - /* .context = */ &g_ggml_ctx_dev_main, - }; - - ggml_cl2_init(&g_ggml_backend_opencl_device); - - initialized = true; + if (initialized) { + return ® } + initialized = true; + + g_ggml_backend_opencl_devices = ggml_opencl_probe_devices(®); + + reg = ggml_backend_reg{ + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_opencl_reg_i, + /* .context = */ NULL, + }; return ® } From 6b56a64690a318fcabcd7739ac7e314d44785ea8 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Thu, 22 May 2025 09:24:09 +0100 Subject: [PATCH 14/18] SYCL: Avoid using with SYCL-Graph for unsupported nodes (#13587) Currently on a CUDA backend to SYCL when running `GGML_SYCL_DISABLE_GRAPH=0 ./bin/test-backend-ops -b SYCL0` there are two operations that throw an exception from the blocking waits during queue recording. * `-o CONCAT` : Use of blocking waits on a queue that's being recorded https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-sycl/concat.cpp#L185-L187 * `-o MUL_MAT_ID`: Blocking wait on a recording queue for a copy to host memory https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-sycl/ggml-sycl.cpp#L3072-L3074 We've noticed that `ggml-cuda.cu` has the [check_node_graph_compatibility_and_refresh_copy_ops](https://github.com/ggml-org/llama.cpp/blob/39e73ae0d69f882d7e29cecc6dd8f5052fca6731/ggml/src/ggml-cuda/ggml-cuda.cu#L2458-L2458) method for checking if a graph can be used, even if enabled. 
I've taken a similar approach in this PR by adding a method to `ggml-sycl.cpp` for checking if a graph can be used for the operations even if a user has asked for it to be enabled. --- ggml/src/ggml-sycl/ggml-sycl.cpp | 34 +++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index d05919781..c2eb618e8 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3809,11 +3809,43 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * syc } } +#ifdef GGML_SYCL_GRAPH +static bool check_graph_compatibility(ggml_cgraph * cgraph) { + if (ggml_sycl_info().device_count > 1) { + // A sycl_ex::command_graph object can only be created for a single device + GGML_LOG_INFO("%s: disabling SYCL graphs due to multiple devices\n", __func__); + return false; + } + + for (int i = 0; i < cgraph->n_nodes; i++) { + const ggml_op node_op = cgraph->nodes[i]->op; + switch (node_op) { + default: + break; + case GGML_OP_CONCAT: + // ggml_sycl_op_concat() does a blocking host wait after memcpy operations, + // but wait() can't be called on the events returned by a queue recording + // to a graph. + [[fallthrough]]; + case GGML_OP_MUL_MAT_ID: + // ggml_sycl_mul_mat_id() does a blocking host wait on the sycl queue after + // submitting a memcpy operation, but wait() can't be called on a queue that + // is recording to a graph. + GGML_LOG_INFO("%s: disabling SYCL graphs due to unsupported node type %s\n", __func__, + ggml_op_name(node_op)); + return false; + } + } + return true; +} +#endif + static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { auto * sycl_ctx = static_cast(backend->context); #ifdef GGML_SYCL_GRAPH - if (!g_ggml_sycl_disable_graph) { + bool use_sycl_graph = !g_ggml_sycl_disable_graph && check_graph_compatibility(cgraph); + if (use_sycl_graph) { const bool graph_support = dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_limited_graph); if (!graph_support) { GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device); From d394a9aedc50a13b7f6373416f7c1ccabfe79c32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Scipione?= Date: Thu, 22 May 2025 13:54:43 +0200 Subject: [PATCH 15/18] sycl : Remove waits from function calls (#13702) * removes the waits in async memcpy functions --- ggml/src/ggml-sycl/ggml-sycl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index c2eb618e8..271f54e57 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3740,7 +3740,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend, GGML_ASSERT(buf->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0); SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy( - data, (const char *)tensor->data + offset, size).wait())); + data, (const char *)tensor->data + offset, size))); } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ @@ -3760,7 +3760,7 @@ static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend, */ const queue_ptr stream = sycl_ctx->stream(sycl_ctx->device, 0); SYCL_CHECK(CHECK_TRY_ERROR((stream)->memcpy( - dst->data, src->data, ggml_nbytes(dst)).wait())); + dst->data, src->data, 
ggml_nbytes(dst)))); return true; } From 5be24af73d77ea23d399726f1d2a01a70ee86331 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Thu, 22 May 2025 14:25:05 +0200 Subject: [PATCH 16/18] gguf-py : correct charsmap parameter typing (#13701) --- gguf-py/gguf/gguf_writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index ff50d3de3..1dcce18f7 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -896,7 +896,7 @@ class GGUFWriter: def add_remove_extra_whitespaces(self, value: bool) -> None: self.add_bool(Keys.Tokenizer.REMOVE_EXTRA_WS, value) - def add_precompiled_charsmap(self, charsmap: Sequence[bytes]) -> None: + def add_precompiled_charsmap(self, charsmap: bytes) -> None: self.add_array(Keys.Tokenizer.PRECOMPILED_CHARSMAP, charsmap) def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None: From cc74d5be990e37f201591fd868a92e64abdbf902 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 22 May 2025 16:33:39 +0300 Subject: [PATCH 17/18] server : pad small embedding batches (#13692) ggml-ci --- tools/server/server.cpp | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 7424da523..1a08e30d2 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3341,6 +3341,37 @@ struct server_context { common_set_adapter_lora(ctx, slot_batched->lora); } + const bool do_encode = (params_base.embedding || params_base.reranking); + + // pad the batch so that batch.n_tokens >= n_slots + // TODO: temporary workaround for https://github.com/ggml-org/llama.cpp/issues/13689 + if (do_encode) { + const int n_slots = slots.size(); + + if (batch.n_tokens < n_slots) { + std::set seq_ids; + for (int j = 0; j < batch.n_tokens; ++j) { + seq_ids.insert(batch.seq_id[j][0]); + } + + // find unused sequence id + llama_seq_id seq_id = -1; + for (int i = 0; i < n_slots; ++i) { + if (seq_ids.find(i) == seq_ids.end()) { + seq_id = i; + } + } + + const int n_add = n_slots - batch.n_tokens; + + SRV_WRN("adding %d dummy tokens to the batch, seq_id = %d\n", n_add, seq_id); + + for (int j = 0; j < n_add; ++j) { + common_batch_add(batch, 0, j, { seq_id }, false); + } + } + } + // process the created batch of tokens for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); @@ -3357,7 +3388,7 @@ struct server_context { int ret = 0; - if (params_base.embedding || params_base.reranking) { + if (do_encode) { ret = llama_encode(ctx, batch_view); } else { ret = llama_decode(ctx, batch_view); From ab86335760ebb441574eb47f886fa1ee302e2131 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 23 May 2025 02:31:29 +0800 Subject: [PATCH 18/18] common: Include torch package for s390x (#13699) * common: update requirements.txt to include pytorch nightly for s390x Signed-off-by: Aaron Teo * common: fix torch installation via pip for s390x Signed-off-by: Aaron Teo --------- Signed-off-by: Aaron Teo --- requirements/requirements-convert_hf_to_gguf.txt | 6 +++++- requirements/requirements-convert_hf_to_gguf_update.txt | 6 +++++- requirements/requirements-convert_lora_to_gguf.txt | 2 ++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/requirements/requirements-convert_hf_to_gguf.txt b/requirements/requirements-convert_hf_to_gguf.txt index 8cb9c354f..431c596c1 100644 --- 
a/requirements/requirements-convert_hf_to_gguf.txt +++ b/requirements/requirements-convert_hf_to_gguf.txt @@ -1,3 +1,7 @@ -r ./requirements-convert_legacy_llama.txt --extra-index-url https://download.pytorch.org/whl/cpu -torch~=2.2.1 +torch~=2.2.1; platform_machine != "s390x" + +# torch s390x packages can only be found from nightly builds +--extra-index-url https://download.pytorch.org/whl/nightly +torch>=0.0.0.dev0; platform_machine == "s390x" diff --git a/requirements/requirements-convert_hf_to_gguf_update.txt b/requirements/requirements-convert_hf_to_gguf_update.txt index 8cb9c354f..431c596c1 100644 --- a/requirements/requirements-convert_hf_to_gguf_update.txt +++ b/requirements/requirements-convert_hf_to_gguf_update.txt @@ -1,3 +1,7 @@ -r ./requirements-convert_legacy_llama.txt --extra-index-url https://download.pytorch.org/whl/cpu -torch~=2.2.1 +torch~=2.2.1; platform_machine != "s390x" + +# torch s390x packages can only be found from nightly builds +--extra-index-url https://download.pytorch.org/whl/nightly +torch>=0.0.0.dev0; platform_machine == "s390x" diff --git a/requirements/requirements-convert_lora_to_gguf.txt b/requirements/requirements-convert_lora_to_gguf.txt index 5758076c4..d091d5648 100644 --- a/requirements/requirements-convert_lora_to_gguf.txt +++ b/requirements/requirements-convert_lora_to_gguf.txt @@ -1,2 +1,4 @@ -r ./requirements-convert_hf_to_gguf.txt --extra-index-url https://download.pytorch.org/whl/cpu +# torch s390x packages can only be found from nightly builds +--extra-index-url https://download.pytorch.org/whl/nightly