From 66287bdaaca3042bd4df5fc8f7bd4d58bfcd208c Mon Sep 17 00:00:00 2001 From: Maximilian Werk Date: Thu, 26 Feb 2026 12:14:09 +0100 Subject: [PATCH 01/15] model : add Jina Embeddings v5 Nano (partial EuroBERT) support (#19826) * WIP: Add EuroBERT support with autoformatting changes This commit includes: - EuroBERT model implementation for GGUF conversion - C++ backend support for EuroBERT architecture - Unintended autoformatting changes to Python files Saving before reverting formatting-only changes. * feat: add back eos assert when not last token pooling * feat: removed duplicated code and cleanup * feat: removed not working architectures and unnecessary check * fix: typo * fix: dynamic pooling config * feat: added an example model for eurobert * feat: proper llama-vocab implementation for jina-v5 * fix: removed unnecessary comments --- convert_hf_to_gguf.py | 29 +++++++++++ convert_hf_to_gguf_update.py | 1 + gguf-py/gguf/constants.py | 15 ++++++ src/CMakeLists.txt | 1 + src/llama-arch.cpp | 15 ++++++ src/llama-arch.h | 1 + src/llama-model.cpp | 39 +++++++++++++++ src/llama-vocab.cpp | 3 +- src/models/eurobert.cpp | 97 ++++++++++++++++++++++++++++++++++++ src/models/models.h | 4 ++ tests/test-tokenizer-0.sh | 9 +++- tools/imatrix/imatrix.cpp | 4 +- 12 files changed, 214 insertions(+), 4 deletions(-) create mode 100644 src/models/eurobert.cpp diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index e03810959..5709cb0e3 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1148,6 +1148,9 @@ class TextModel(ModelBase): if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6": # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de res = "jina-v2-de" + if chkhsh == "a023e9fdc5a11f034d3ef515b92350e56fb2af1f66c6b6811a4444ea9bf8763d": + # ref: https://huggingface.co/jinaai/jina-embeddings-v5-text-nano + res = "jina-v5-nano" if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct res = "smaug-bpe" @@ -6125,6 +6128,32 @@ class NeoBert(BertModel): yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("EuroBertModel", "JinaEmbeddingsV5Model") +class EuroBertModel(TextModel): + model_arch = gguf.MODEL_ARCH.EUROBERT + + def set_vocab(self): + self.gguf_writer.add_add_bos_token(False) + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # EuroBert is bidirectional (encoder) + self.gguf_writer.add_causal_attention(False) + + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + self._try_set_pooling_type() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Strip "model." prefix from tensor names + if name.startswith("model."): + name = name[6:] + + yield from super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") class XLMRobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 53a73759e..b31ddcca7 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -107,6 +107,7 @@ models = [ {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM! {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", }, {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, + {"name": "jina-v5-nano", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v5-text-nano", }, {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", }, {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", }, {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", }, diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 689acdc65..e9ef97d55 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -379,6 +379,7 @@ class MODEL_ARCH(IntEnum): NEO_BERT = auto() JINA_BERT_V2 = auto() JINA_BERT_V3 = auto() + EUROBERT = auto() BLOOM = auto() STABLELM = auto() QWEN = auto() @@ -820,6 +821,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.NEO_BERT: "neo-bert", MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", MODEL_ARCH.JINA_BERT_V3: "jina-bert-v3", + MODEL_ARCH.EUROBERT: "eurobert", MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.STABLELM: "stablelm", MODEL_ARCH.QWEN: "qwen", @@ -1587,6 +1589,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_UP, MODEL_TENSOR.LAYER_OUT_NORM, ], + MODEL_ARCH.EUROBERT: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_DOWN, + ], MODEL_ARCH.MPT: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2a661a1fe..283823fa9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -62,6 +62,7 @@ add_library(llama models/dream.cpp models/ernie4-5-moe.cpp models/ernie4-5.cpp + models/eurobert.cpp models/exaone-moe.cpp models/exaone.cpp models/exaone4.cpp diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 39ebb9db0..106ea133a 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -26,6 +26,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_NEO_BERT, "neo-bert" }, { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" }, { LLM_ARCH_JINA_BERT_V3, "jina-bert-v3" }, + { LLM_ARCH_EUROBERT, "eurobert" }, { LLM_ARCH_BLOOM, "bloom" }, { LLM_ARCH_STABLELM, "stablelm" }, { LLM_ARCH_QWEN, "qwen" }, @@ -819,6 +820,20 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_CLS, LLM_TENSOR_CLS_OUT, }; + case LLM_ARCH_EUROBERT: + return { + LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_OUTPUT_NORM, + LLM_TENSOR_ATTN_NORM, + LLM_TENSOR_ATTN_Q, + LLM_TENSOR_ATTN_K, + LLM_TENSOR_ATTN_V, + LLM_TENSOR_ATTN_OUT, + LLM_TENSOR_FFN_NORM, + LLM_TENSOR_FFN_GATE, + LLM_TENSOR_FFN_UP, + LLM_TENSOR_FFN_DOWN, + }; case LLM_ARCH_MODERN_BERT: return { LLM_TENSOR_TOKEN_EMBD, diff --git a/src/llama-arch.h b/src/llama-arch.h index 11daa1413..42a6ea38f 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -30,6 +30,7 @@ enum llm_arch { LLM_ARCH_NEO_BERT, LLM_ARCH_JINA_BERT_V2, LLM_ARCH_JINA_BERT_V3, + LLM_ARCH_EUROBERT, LLM_ARCH_BLOOM, LLM_ARCH_STABLELM, LLM_ARCH_QWEN, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 52408c525..c8ef1b5e7 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -979,6 +979,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { type = LLM_TYPE_250M; } } break; + case LLM_ARCH_EUROBERT: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + + if (hparams.n_layer == 12) { + type = LLM_TYPE_SMALL; // 0.2B + } + } break; case LLM_ARCH_BLOOM: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -3570,6 +3580,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); } } break; + case LLM_ARCH_EUROBERT: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + } + } break; case LLM_ARCH_JINA_BERT_V2: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings @@ -8181,6 +8214,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT_MOE: case LLM_ARCH_NEO_BERT: + case LLM_ARCH_EUROBERT: case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_MODERN_BERT: case LLM_ARCH_GEMMA_EMBEDDING: @@ -8378,6 +8412,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_EUROBERT: + { + llm = std::make_unique(*this, params); + } break; case LLM_ARCH_BLOOM: { llm = std::make_unique(*this, params); @@ -9004,6 +9042,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_MODERN_BERT: case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_NOMIC_BERT_MOE: + case LLM_ARCH_EUROBERT: case LLM_ARCH_STABLELM: case LLM_ARCH_BITNET: case LLM_ARCH_QWEN: diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 9c118eab7..194eed238 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1890,7 +1890,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "falcon-h1" || tokenizer_pre == "pixtral" || tokenizer_pre == "midm-2.0" || - tokenizer_pre == "lfm2") { + tokenizer_pre == "lfm2" || + tokenizer_pre == "jina-v5-nano") { pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3; ignore_merges = true; add_bos = true; diff --git a/src/models/eurobert.cpp b/src/models/eurobert.cpp new file mode 100644 index 000000000..86e3176ed --- /dev/null +++ b/src/models/eurobert.cpp @@ -0,0 +1,97 @@ +#include "models.h" + +llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + ggml_tensor * cur; + ggml_tensor * inpL; + ggml_tensor * inp_pos = build_inp_pos(); + + inpL = build_inp_embd(model.tok_embd); + cb(inpL, "inp_embd", -1); + + auto * inp_attn = build_attn_inp_no_cache(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * cur = inpL; + + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + + { + ggml_tensor * Qcur; + ggml_tensor * Kcur; + ggml_tensor * Vcur; + + Qcur = build_lora_mm(model.layers[il].wq, cur); + Kcur = build_lora_mm(model.layers[il].wk, cur); + Vcur = build_lora_mm(model.layers[il].wv, cur); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + cb(cur, "kqv_out", il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + cur = ggml_add(ctx0, cur, inpL); + + ggml_tensor * ffn_inp = cur; + cb(ffn_inp, "ffn_inp", il); + + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + inpL = cur; + } + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_embd", -1); + res->t_embd = cur; + + ggml_build_forward_expand(gf, cur); +} diff --git a/src/models/models.h b/src/models/models.h index 10f8b5892..0712d03d8 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -424,6 +424,10 @@ struct llm_build_neo_bert : public llm_graph_context { llm_build_neo_bert(const llama_model & model, const llm_graph_params & params); }; +struct llm_build_eurobert : public llm_graph_context { + llm_build_eurobert(const llama_model & model, const llm_graph_params & params); +}; + template struct llm_build_olmo2 : public llm_graph_context { llm_build_olmo2(const llama_model & model, const llm_graph_params & params); diff --git a/tests/test-tokenizer-0.sh b/tests/test-tokenizer-0.sh index 7ef009dc9..7024b00af 100755 --- a/tests/test-tokenizer-0.sh +++ b/tests/test-tokenizer-0.sh @@ -13,7 +13,12 @@ fi name=$1 input=$2 -make -j tests/test-tokenizer-0 +# Build using CMake if binary doesn't exist +if [ ! -f ./build/bin/test-tokenizer-0 ]; then + printf "Building test-tokenizer-0 with CMake...\n" + cmake -B build -DLLAMA_BUILD_TESTS=ON + cmake --build build --target test-tokenizer-0 -j +fi printf "Testing %s on %s ...\n" $name $input @@ -23,7 +28,7 @@ printf "Tokenizing using (py) Python AutoTokenizer ...\n" python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1 printf "Tokenizing using (cpp) llama.cpp ...\n" -./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1 +./build/bin/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1 cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in" cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in" diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp index 669de55dd..e025c114b 100644 --- a/tools/imatrix/imatrix.cpp +++ b/tools/imatrix/imatrix.cpp @@ -912,7 +912,9 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params, c const bool add_bos = llama_vocab_get_add_bos(vocab); - GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); + if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_LAST) { + GGML_ASSERT(!llama_vocab_get_add_eos(vocab)); + } auto tim1 = std::chrono::high_resolution_clock::now(); LOG_INF("%s: tokenizing the input ..\n", __func__); From 9b62913b40b94c63db239b770e232bd48534d480 Mon Sep 17 00:00:00 2001 From: Eric Zhang <34133756+EZForever@users.noreply.github.com> Date: Thu, 26 Feb 2026 19:28:09 +0800 Subject: [PATCH 02/15] jinja : correct default size for string slices (#19913) --- common/jinja/runtime.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/jinja/runtime.cpp b/common/jinja/runtime.cpp index c93e182a7..5757c76b7 100644 --- a/common/jinja/runtime.cpp +++ b/common/jinja/runtime.cpp @@ -721,6 +721,8 @@ value member_expression::execute_impl(context & ctx) { int64_t arr_size = 0; if (is_val(object)) { arr_size = object->as_array().size(); + } else if (is_val(object)) { + arr_size = object->as_string().length(); } if (is_stmt(this->property)) { From efba35a860954e78e72177b0d6b0c88b239d8cb1 Mon Sep 17 00:00:00 2001 From: drrros <52050875+drrros@users.noreply.github.com> Date: Thu, 26 Feb 2026 14:32:31 +0300 Subject: [PATCH 03/15] server: fix load-on-startup not respected in ini file (#19897) Co-authored-by: Roman Marchenko --- tools/server/server-models.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index efb22da5c..e16254779 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -291,7 +291,9 @@ void server_models::load_models() { for (const auto & [name, inst] : mapping) { std::string val; if (inst.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val)) { - models_to_load.push_back(name); + if (common_arg_utils::is_truthy(val)) { + models_to_load.push_back(name); + } } } if ((int)models_to_load.size() > base_params.models_max) { From ffaafde16ffebd2853467f0dd833625a726ce08e Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 26 Feb 2026 13:00:57 +0100 Subject: [PATCH 04/15] ggml-virtgpu: improve the reliability of the code (#19846) * ggml-virtgpu-backend: validate the consistency of the received objects This patch adds consistency checks in the ggml-virtgpu-backend (running on the host side) to ensure that the data received from the guest is consistent (valid pointers, valid sizes and offsets). * ggml-virtgpu-backend: add fallback/skips for optional ggml backend methods ``` 1. bck->iface.synchronize(bck) 2. buft->iface.get_alloc_size(buft, op) 3. buft->iface.get_max_size(buft) ``` these three methods are optional in the GGML interface. `get_max_size` was already properly defaulted, but `backend sychronize` and `butf get_max_size` would have segfaulted the backend if not implemented. * ggml-virtgpu-backend: fix log format missing argument * ggml-virtgpu-backend: improve the abort message * ggml-virtgpu-backend: more safety checks * ggml-virtgpu-backend: new error code * ggml-virtgpu-backend: initialize all the error codes * ggml-virtgpu: add a missing comment generated by the code generator * ggml-virtgpu: add the '[virtgpu]' prefix to the device/buffer names * ggml-virtgpu: apir_device_buffer_from_ptr: improve the error message * ggml-virtgpu: shared: make it match the latest api_remoting.h of Virglrenderer APIR (still unmerged) * ggml-virtgpu: update the code generator to have dispatch_command_name in a host/guest shared file * ggml-virtgpu: REMOTE_CALL: fail if the backend returns an error * docs/backend/VirtGPU.md: indicate that the RAM+VRAM size is limed to 64 GB with libkrun * ggml-virtgpu: turn off clang-format header ordering for some of the files Compilation breaks when ordered alphabetically. * ggml-virtgpu: clang-format * ggml-virtgpu/backend/shared/api_remoting: better comments for the APIR return codes --- docs/backend/VirtGPU.md | 4 +- .../backend/backend-dispatched-backend.cpp | 43 +++++++++- .../backend-dispatched-buffer-type.cpp | 14 ++- .../backend/backend-dispatched-buffer.cpp | 48 +++++++++++ .../backend/backend-dispatched.cpp | 13 ++- .../backend/backend-dispatched.gen.h | 58 ------------- .../ggml-virtgpu/backend/backend-dispatched.h | 2 + .../ggml-virtgpu/backend/backend-virgl-apir.h | 2 +- ggml/src/ggml-virtgpu/backend/backend.cpp | 38 ++++----- .../backend/shared/api_remoting.h | 21 +++-- .../backend/shared/apir_backend.gen.h | 58 +++++++++++++ .../backend/shared/apir_backend.h | 6 +- .../src/ggml-virtgpu/backend/shared/apir_cs.h | 20 ++--- .../backend/shared/apir_cs_ggml.h | 27 ++++-- .../ggml-virtgpu/backend/shared/apir_cs_rpc.h | 4 + .../ggml-virtgpu/ggml-backend-buffer-type.cpp | 6 +- ggml/src/ggml-virtgpu/ggml-backend-device.cpp | 7 +- ggml/src/ggml-virtgpu/ggml-backend-reg.cpp | 56 ++++++++---- ggml/src/ggml-virtgpu/ggml-backend.cpp | 2 +- ggml/src/ggml-virtgpu/ggml-remoting.h | 2 +- ggml/src/ggml-virtgpu/include/apir_hw.h | 6 +- ggml/src/ggml-virtgpu/regenerate_remoting.py | 47 +++++----- .../ggml-virtgpu/virtgpu-forward-backend.cpp | 6 +- .../virtgpu-forward-buffer-type.cpp | 8 +- .../ggml-virtgpu/virtgpu-forward-buffer.cpp | 12 +-- .../ggml-virtgpu/virtgpu-forward-device.cpp | 4 +- ggml/src/ggml-virtgpu/virtgpu-forward-impl.h | 47 +++++----- ggml/src/ggml-virtgpu/virtgpu-forward.gen.h | 1 + ggml/src/ggml-virtgpu/virtgpu.cpp | 85 ++++++++----------- ggml/src/ggml-virtgpu/virtgpu.h | 8 +- 30 files changed, 398 insertions(+), 257 deletions(-) diff --git a/docs/backend/VirtGPU.md b/docs/backend/VirtGPU.md index c81468da1..72f376dea 100644 --- a/docs/backend/VirtGPU.md +++ b/docs/backend/VirtGPU.md @@ -152,7 +152,9 @@ Commands and data are serialized using a custom binary protocol with: - **VM-specific**: Only works in virtual machines with virtio-gpu support - **Host dependency**: Requires properly configured host-side backend - **Latency**: Small overhead from VM escaping for each operation - +- **Shared-memory size**: with the `libkrun` hypervisor, the RAM + VRAM + addressable memory is limited to 64 GB. So the maximum GPU memory + will be `64GB - RAM`, regardless of the hardware VRAM size. * This work is pending upstream changes in the VirglRenderer project. diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp index cc879e51d..03a037f1c 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched-backend.cpp @@ -7,9 +7,21 @@ #include +static uint32_t validate_graph_operation(size_t cgraph_size, uint32_t shmem_res_id, const char * operation) { + if (cgraph_size == 0) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Zero-size computation graph\n", operation); + return 1; + } + + // place-holder: validate that the size of shmem_res_id is <= cgraph_size + // need to add another method in the Virgl->APIR callback interface + GGML_UNUSED(shmem_res_id); + + return 0; // Valid +} + uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { GGML_UNUSED(ctx); - GGML_UNUSED(enc); static bool async_backend_initialized = false; static bool async_backend; @@ -34,10 +46,26 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, v size_t cgraph_size; apir_decode_size_t(dec, &cgraph_size); + if (validate_graph_operation(cgraph_size, shmem_res_id, __func__) != 0) { + apir_decoder_set_fatal(dec); + return 1; + } + apir_decoder secondary_dec = apir_new_decoder((const char *) shmem_data, cgraph_size); ggml_cgraph * cgraph = apir_decode_ggml_cgraph(&secondary_dec, cgraph_size); + if (!cgraph || apir_decoder_get_fatal(&secondary_dec)) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Failed to deserialize computation graph\n", __func__); + return 1; + } + + if (cgraph->n_nodes < 0 || cgraph->n_leafs < 0) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid negative node/leaf count: nodes=%d leafs=%d\n", __func__, + cgraph->n_nodes, cgraph->n_leafs); + return 1; + } + ggml_status status; #if APIR_BACKEND_CHECK_SUPPORTS_OP == 1 for (int idx = 0; idx < cgraph->n_nodes; idx++) { @@ -45,7 +73,8 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, v if (dev->iface.supports_op(dev, op)) { continue; } - GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Graph node %d (%s) not supported by the backend\n", idx, ggml_op_desc(op)); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Graph node %d (%s) not supported by the backend\n", __func__, idx, + ggml_op_desc(op)); status = GGML_STATUS_ABORTED; apir_encode_ggml_status(enc, &status); @@ -53,9 +82,17 @@ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, v return 0; } #endif + + // Check if backend is properly initialized + if (!bck) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Backend not initialized (bck is null)\n", __func__); + + return 1; + } + status = bck->iface.graph_compute(bck, cgraph); - if (async_backend) { + if (async_backend && bck->iface.synchronize) { bck->iface.synchronize(bck); } diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp index d55eec276..c66dbaa9e 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer-type.cpp @@ -85,7 +85,19 @@ uint32_t backend_buffer_type_get_alloc_size(apir_encoder * enc, apir_decoder * d const ggml_tensor * op = apir_decode_ggml_tensor_inplace(dec); - size_t value = buft->iface.get_alloc_size(buft, op); + // Check for decode error + if (op == nullptr) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Failed to decode tensor\n", __func__); + apir_decoder_set_fatal(dec); + return 1; + } + + size_t value; + if (buft->iface.get_alloc_size) { + value = buft->iface.get_alloc_size(buft, op); + } else { + value = ggml_nbytes(op); // Default fallback + } apir_encode_size_t(enc, &value); diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp index 8cc063ff0..3ade8d99b 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched-buffer.cpp @@ -6,11 +6,26 @@ #include +static uint32_t validate_buffer_operation(size_t offset, size_t size, const char * operation) { + // Only check for critical integer overflow - no arbitrary size limits + if (offset > SIZE_MAX - size) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Integer overflow in offset+size: %zu + %zu\n", operation, offset, size); + return 1; + } + + return 0; // Valid +} + uint32_t backend_buffer_get_base(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx) { GGML_UNUSED(ctx); ggml_backend_buffer_t buffer; buffer = apir_decode_ggml_buffer(dec); + if (!buffer || apir_decoder_get_fatal(dec)) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__); + return 1; + } + uintptr_t base = (uintptr_t) buffer->iface.get_base(buffer); apir_encode_uintptr_t(enc, &base); @@ -24,6 +39,11 @@ uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl ggml_backend_buffer_t buffer; buffer = apir_decode_ggml_buffer(dec); + if (!buffer || apir_decoder_get_fatal(dec)) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__); + return 1; + } + ggml_tensor * tensor; // safe to remove the const qualifier here tensor = (ggml_tensor *) (uintptr_t) apir_decode_ggml_tensor(dec); @@ -37,6 +57,10 @@ uint32_t backend_buffer_set_tensor(apir_encoder * enc, apir_decoder * dec, virgl size_t size; apir_decode_size_t(dec, &size); + if (validate_buffer_operation(offset, size, __func__) != 0) { + return 1; + } + void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id); if (!shmem_data) { @@ -56,6 +80,11 @@ uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl ggml_backend_buffer_t buffer; buffer = apir_decode_ggml_buffer(dec); + if (!buffer || apir_decoder_get_fatal(dec)) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__); + return 1; + } + const ggml_tensor * tensor; // safe to remove the const qualifier here tensor = apir_decode_ggml_tensor(dec); @@ -69,6 +98,10 @@ uint32_t backend_buffer_get_tensor(apir_encoder * enc, apir_decoder * dec, virgl size_t size; apir_decode_size_t(dec, &size); + if (validate_buffer_operation(offset, size, __func__) != 0) { + return 1; + } + void * shmem_data = ctx->iface->get_shmem_ptr(ctx->ctx_id, shmem_res_id); if (!shmem_data) { GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Couldn't get the shmem addr from virgl\n", __func__); @@ -86,6 +119,11 @@ uint32_t backend_buffer_cpy_tensor(apir_encoder * enc, apir_decoder * dec, virgl ggml_backend_buffer_t buffer; buffer = apir_decode_ggml_buffer(dec); + if (!buffer || apir_decoder_get_fatal(dec)) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__); + return 1; + } + const ggml_tensor * src; // safe to remove the const qualifier here src = apir_decode_ggml_tensor(dec); @@ -105,6 +143,11 @@ uint32_t backend_buffer_clear(apir_encoder * enc, apir_decoder * dec, virgl_apir ggml_backend_buffer_t buffer; buffer = apir_decode_ggml_buffer(dec); + if (!buffer || apir_decoder_get_fatal(dec)) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__); + return 1; + } + uint8_t value; apir_decode_uint8_t(dec, &value); @@ -120,6 +163,11 @@ uint32_t backend_buffer_free_buffer(apir_encoder * enc, apir_decoder * dec, virg ggml_backend_buffer_t buffer; buffer = apir_decode_ggml_buffer(dec); + if (!buffer || apir_decoder_get_fatal(dec)) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Invalid buffer handle from guest\n", __func__); + return 1; + } + if (!apir_untrack_backend_buffer(buffer)) { GGML_LOG_WARN(GGML_VIRTGPU_BCK "%s: unknown buffer %p\n", __func__, (void *) buffer); return 1; diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp b/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp index 64152eef0..c80e4aabe 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched.cpp @@ -1,6 +1,6 @@ #include "backend-dispatched.h" -#include "backend-virgl-apir.h" +#include "backend-virgl-apir.h" #include "ggml-backend-impl.h" #include "ggml-backend.h" #include "ggml-impl.h" @@ -28,19 +28,24 @@ uint32_t backend_dispatch_initialize(void * ggml_backend_reg_fct_p) { return APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED; } - if (!reg->iface.get_device_count(reg)) { - GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed: no device found\n", __func__); + size_t device_count = reg->iface.get_device_count(reg); + if (!device_count) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: no device found\n", __func__); return APIR_BACKEND_INITIALIZE_NO_DEVICE; } dev = reg->iface.get_device(reg, 0); if (!dev) { - GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed: no device received\n", __func__); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: failed to get device\n", __func__); return APIR_BACKEND_INITIALIZE_NO_DEVICE; } bck = dev->iface.init_backend(dev, NULL); + if (!bck) { + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: backend initialization failed\n", __func__); + return APIR_BACKEND_INITIALIZE_BACKEND_INIT_FAILED; + } return APIR_BACKEND_INITIALIZE_SUCCESS; } diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h b/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h index 481d7f315..3dc334e4c 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched.gen.h @@ -32,64 +32,6 @@ uint32_t backend_buffer_free_buffer(apir_encoder * enc, apir_decoder * dec, virg /* backend */ uint32_t backend_backend_graph_compute(apir_encoder * enc, apir_decoder * dec, virgl_apir_context * ctx); -static inline const char * backend_dispatch_command_name(ApirBackendCommandType type) { - switch (type) { - /* device */ - case APIR_COMMAND_TYPE_DEVICE_GET_DEVICE_COUNT: - return "backend_device_get_device_count"; - case APIR_COMMAND_TYPE_DEVICE_GET_COUNT: - return "backend_device_get_count"; - case APIR_COMMAND_TYPE_DEVICE_GET_NAME: - return "backend_device_get_name"; - case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION: - return "backend_device_get_description"; - case APIR_COMMAND_TYPE_DEVICE_GET_TYPE: - return "backend_device_get_type"; - case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY: - return "backend_device_get_memory"; - case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP: - return "backend_device_supports_op"; - case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE: - return "backend_device_get_buffer_type"; - case APIR_COMMAND_TYPE_DEVICE_GET_PROPS: - return "backend_device_get_props"; - case APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR: - return "backend_device_buffer_from_ptr"; - /* buffer-type */ - case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME: - return "backend_buffer_type_get_name"; - case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT: - return "backend_buffer_type_get_alignment"; - case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE: - return "backend_buffer_type_get_max_size"; - case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST: - return "backend_buffer_type_is_host (DEPRECATED)"; - case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER: - return "backend_buffer_type_alloc_buffer"; - case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE: - return "backend_buffer_type_get_alloc_size"; - /* buffer */ - case APIR_COMMAND_TYPE_BUFFER_GET_BASE: - return "backend_buffer_get_base"; - case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR: - return "backend_buffer_set_tensor"; - case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR: - return "backend_buffer_get_tensor"; - case APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR: - return "backend_buffer_cpy_tensor"; - case APIR_COMMAND_TYPE_BUFFER_CLEAR: - return "backend_buffer_clear"; - case APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER: - return "backend_buffer_free_buffer"; - /* backend */ - case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE: - return "backend_backend_graph_compute"; - - default: - return "unknown"; - } -} - extern "C" { static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = { diff --git a/ggml/src/ggml-virtgpu/backend/backend-dispatched.h b/ggml/src/ggml-virtgpu/backend/backend-dispatched.h index 10311631d..740ee9e3f 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-dispatched.h +++ b/ggml/src/ggml-virtgpu/backend/backend-dispatched.h @@ -1,5 +1,6 @@ #pragma once +// clang-format off #include #include @@ -10,6 +11,7 @@ #include "shared/apir_backend.h" #include "shared/apir_cs.h" #include "shared/apir_cs_ggml.h" +// clang-format on #define GGML_VIRTGPU_BCK "ggml-virtgpu-backend: " diff --git a/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h b/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h index 44b347f85..c65a01cdf 100644 --- a/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h +++ b/ggml/src/ggml-virtgpu/backend/backend-virgl-apir.h @@ -19,7 +19,7 @@ struct virgl_apir_callbacks { }; extern "C" { -ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks *virgl_cbs); +ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks * virgl_cbs); void apir_backend_deinit(uint32_t virgl_ctx_id); uint32_t apir_backend_dispatcher(uint32_t virgl_ctx_id, virgl_apir_callbacks * virgl_cbs, diff --git a/ggml/src/ggml-virtgpu/backend/backend.cpp b/ggml/src/ggml-virtgpu/backend/backend.cpp index d93414a07..535a05f3e 100644 --- a/ggml/src/ggml-virtgpu/backend/backend.cpp +++ b/ggml/src/ggml-virtgpu/backend/backend.cpp @@ -1,6 +1,5 @@ #include "backend-dispatched.h" #include "backend-virgl-apir.h" - #include "shared/api_remoting.h" #include "shared/apir_backend.h" #include "shared/apir_cs.h" @@ -17,10 +16,10 @@ #define GGML_DEFAULT_BACKEND_REG "ggml_backend_init" static void * backend_library_handle = NULL; -static FILE * apir_logfile = NULL; +static FILE * apir_logfile = NULL; static void log_to_file_callback(enum ggml_log_level level, const char * text, void * user_data) { - FILE * logfile = (FILE *)user_data; + FILE * logfile = (FILE *) user_data; fprintf(logfile, "[%d] %s", level, text); fflush(logfile); } @@ -48,9 +47,9 @@ void apir_backend_deinit(uint32_t virgl_ctx_id) { } #define APIR_GGML_LIBRARY_PATH_KEY "ggml.library.path" -#define APIR_GGML_LIBRARY_REG_KEY "ggml.library.reg" +#define APIR_GGML_LIBRARY_REG_KEY "ggml.library.reg" -ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks *virgl_cbs) { +ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct virgl_apir_callbacks * virgl_cbs) { const char * dlsym_error; const char * apir_log_to_file = getenv(APIR_LLAMA_CPP_LOG_TO_FILE_ENV); @@ -63,15 +62,13 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct } } - const char * library_name = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_PATH_KEY); + const char * library_name = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_PATH_KEY); const char * virgl_library_reg = virgl_cbs->get_config(virgl_ctx_id, APIR_GGML_LIBRARY_REG_KEY); - const char * library_reg = virgl_library_reg ? virgl_library_reg : GGML_DEFAULT_BACKEND_REG; + const char * library_reg = virgl_library_reg ? virgl_library_reg : GGML_DEFAULT_BACKEND_REG; if (!library_name) { - GGML_LOG_ERROR(GGML_VIRTGPU_BCK - "%s: cannot open the GGML library: env var '%s' not defined\n", - __func__, APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV); - + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: cannot open the GGML library: env var '%s' not defined\n", __func__, + APIR_LLAMA_CPP_GGML_LIBRARY_PATH_ENV); return APIR_LOAD_LIBRARY_ENV_VAR_MISSING; } @@ -79,16 +76,14 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct backend_library_handle = dlopen(library_name, RTLD_LAZY); if (!backend_library_handle) { - GGML_LOG_ERROR(GGML_VIRTGPU_BCK - "%s: cannot open the GGML library: %s\n", __func__, dlerror()); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: cannot open the GGML library: %s\n", __func__, dlerror()); return APIR_LOAD_LIBRARY_CANNOT_OPEN; } if (!library_reg) { - GGML_LOG_ERROR(GGML_VIRTGPU_BCK - "%s: cannot register the GGML library: env var '%s' not defined\n", - __func__, APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: cannot register the GGML library: env var '%s' not defined\n", __func__, + APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV); return APIR_LOAD_LIBRARY_ENV_VAR_MISSING; } @@ -96,11 +91,9 @@ ApirLoadLibraryReturnCode apir_backend_initialize(uint32_t virgl_ctx_id, struct void * ggml_backend_reg_fct = dlsym(backend_library_handle, library_reg); dlsym_error = dlerror(); if (dlsym_error) { - GGML_LOG_ERROR(GGML_VIRTGPU_BCK - "%s: cannot find the GGML backend registration symbol '%s' (from %s): %s\n", + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: cannot find the GGML backend registration symbol '%s' (from %s): %s\n", __func__, library_reg, APIR_LLAMA_CPP_GGML_LIBRARY_REG_ENV, dlsym_error); - return APIR_LOAD_LIBRARY_SYMBOL_MISSING; } @@ -132,13 +125,12 @@ uint32_t apir_backend_dispatcher(uint32_t virgl_ctx_id, virgl_apir_context ctx = { .ctx_id = virgl_ctx_id, - .iface = virgl_cbs, + .iface = virgl_cbs, }; if (cmd_type >= APIR_BACKEND_DISPATCH_TABLE_COUNT) { - GGML_LOG_ERROR(GGML_VIRTGPU_BCK - "%s: Received an invalid dispatch index (%d >= %d)\n", - __func__, cmd_type, APIR_BACKEND_DISPATCH_TABLE_COUNT); + GGML_LOG_ERROR(GGML_VIRTGPU_BCK "%s: Received an invalid dispatch index (%d >= %d)\n", __func__, cmd_type, + APIR_BACKEND_DISPATCH_TABLE_COUNT); return APIR_BACKEND_FORWARD_INDEX_INVALID; } diff --git a/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h b/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h index f19a5d12d..6bf97e8a3 100644 --- a/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h +++ b/ggml/src/ggml-virtgpu/backend/shared/api_remoting.h @@ -16,28 +16,32 @@ enum ApirCommandType { APIR_COMMAND_TYPE_LOADLIBRARY = 1, APIR_COMMAND_TYPE_FORWARD = 2, - APIR_COMMAND_TYPE_LENGTH = 3, + APIR_COMMAND_TYPE_LENGTH = 3, }; typedef uint64_t ApirCommandFlags; enum ApirLoadLibraryReturnCode { APIR_LOAD_LIBRARY_SUCCESS = 0, + // these error codes are returned by the Virglrenderer APIR component APIR_LOAD_LIBRARY_HYPERCALL_INITIALIZATION_ERROR = 1, APIR_LOAD_LIBRARY_ALREADY_LOADED = 2, APIR_LOAD_LIBRARY_ENV_VAR_MISSING = 3, APIR_LOAD_LIBRARY_CANNOT_OPEN = 4, APIR_LOAD_LIBRARY_SYMBOL_MISSING = 5, - APIR_LOAD_LIBRARY_INIT_BASE_INDEX = 6, // anything above this is a APIR backend library initialization return code + // any value greater than this is an APIR *backend library* initialization return code + APIR_LOAD_LIBRARY_INIT_BASE_INDEX = 6, }; enum ApirForwardReturnCode { - APIR_FORWARD_SUCCESS = 0, - APIR_FORWARD_NO_DISPATCH_FCT = 1, - APIR_FORWARD_TIMEOUT = 2, - - APIR_FORWARD_BASE_INDEX = 3, // anything above this is a APIR backend library forward return code -} ; + APIR_FORWARD_SUCCESS = 0, + // these error codes are returned by the Virglrenderer APIR component + APIR_FORWARD_NO_DISPATCH_FCT = 1, + APIR_FORWARD_TIMEOUT = 2, + APIR_FORWARD_FAILED_TO_SYNC_STREAMS = 3, + // any value greater than this index an APIR *backend library* forward return code + APIR_FORWARD_BASE_INDEX = 4, +}; __attribute__((unused)) static inline const char * apir_command_name(ApirCommandType type) { switch (type) { @@ -82,6 +86,7 @@ __attribute__((unused)) static const char * apir_forward_error(ApirForwardReturn APIR_FORWARD_ERROR(APIR_FORWARD_SUCCESS); APIR_FORWARD_ERROR(APIR_FORWARD_NO_DISPATCH_FCT); APIR_FORWARD_ERROR(APIR_FORWARD_TIMEOUT); + APIR_FORWARD_ERROR(APIR_FORWARD_FAILED_TO_SYNC_STREAMS); APIR_FORWARD_ERROR(APIR_FORWARD_BASE_INDEX); return "Unknown APIR_COMMAND_TYPE_FORWARD error"; diff --git a/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h b/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h index d214b6f2a..520ac9c72 100644 --- a/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h +++ b/ggml/src/ggml-virtgpu/backend/shared/apir_backend.gen.h @@ -34,3 +34,61 @@ typedef enum ApirBackendCommandType { // last command_type index + 1 APIR_BACKEND_DISPATCH_TABLE_COUNT = 23, } ApirBackendCommandType; + +static inline const char * apir_dispatch_command_name(ApirBackendCommandType type) { + switch (type) { + /* device */ + case APIR_COMMAND_TYPE_DEVICE_GET_DEVICE_COUNT: + return "device_get_device_count"; + case APIR_COMMAND_TYPE_DEVICE_GET_COUNT: + return "device_get_count"; + case APIR_COMMAND_TYPE_DEVICE_GET_NAME: + return "device_get_name"; + case APIR_COMMAND_TYPE_DEVICE_GET_DESCRIPTION: + return "device_get_description"; + case APIR_COMMAND_TYPE_DEVICE_GET_TYPE: + return "device_get_type"; + case APIR_COMMAND_TYPE_DEVICE_GET_MEMORY: + return "device_get_memory"; + case APIR_COMMAND_TYPE_DEVICE_SUPPORTS_OP: + return "device_supports_op"; + case APIR_COMMAND_TYPE_DEVICE_GET_BUFFER_TYPE: + return "device_get_buffer_type"; + case APIR_COMMAND_TYPE_DEVICE_GET_PROPS: + return "device_get_props"; + case APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR: + return "device_buffer_from_ptr"; + /* buffer-type */ + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_NAME: + return "buffer_type_get_name"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALIGNMENT: + return "buffer_type_get_alignment"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_MAX_SIZE: + return "buffer_type_get_max_size"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_IS_HOST: + return "buffer_type_is_host"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_ALLOC_BUFFER: + return "buffer_type_alloc_buffer"; + case APIR_COMMAND_TYPE_BUFFER_TYPE_GET_ALLOC_SIZE: + return "buffer_type_get_alloc_size"; + /* buffer */ + case APIR_COMMAND_TYPE_BUFFER_GET_BASE: + return "buffer_get_base"; + case APIR_COMMAND_TYPE_BUFFER_SET_TENSOR: + return "buffer_set_tensor"; + case APIR_COMMAND_TYPE_BUFFER_GET_TENSOR: + return "buffer_get_tensor"; + case APIR_COMMAND_TYPE_BUFFER_CPY_TENSOR: + return "buffer_cpy_tensor"; + case APIR_COMMAND_TYPE_BUFFER_CLEAR: + return "buffer_clear"; + case APIR_COMMAND_TYPE_BUFFER_FREE_BUFFER: + return "buffer_free_buffer"; + /* backend */ + case APIR_COMMAND_TYPE_BACKEND_GRAPH_COMPUTE: + return "backend_graph_compute"; + + default: + return "unknown"; + } +} diff --git a/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h b/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h index f3efa52c7..da1e21b5b 100644 --- a/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h +++ b/ggml/src/ggml-virtgpu/backend/shared/apir_backend.h @@ -14,7 +14,7 @@ #define APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED 6 #define APIR_BACKEND_INITIALIZE_ALREADY_INITED 7 #define APIR_BACKEND_INITIALIZE_NO_DEVICE 8 - +#define APIR_BACKEND_INITIALIZE_BACKEND_INIT_FAILED 9 // new entries here need to be added to the apir_backend_initialize_error function below @@ -39,6 +39,10 @@ static const char * apir_backend_initialize_error(int code) { APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_BACKEND_SYMBOLS); APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_MISSING_GGML_SYMBOLS); APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_FAILED); + APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_REG_FAILED); + APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_ALREADY_INITED); + APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_NO_DEVICE); + APIR_BACKEND_INITIALIZE_ERROR(APIR_BACKEND_INITIALIZE_BACKEND_INIT_FAILED); return "Unknown APIR_BACKEND_INITIALIZE error:/"; diff --git a/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h b/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h index 1bc3a5f68..64bf2ec96 100644 --- a/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h +++ b/ggml/src/ggml-virtgpu/backend/shared/apir_cs.h @@ -13,7 +13,6 @@ struct apir_encoder { const char * start; const char * end; bool fatal; - }; struct apir_decoder { @@ -28,8 +27,8 @@ struct apir_decoder { static apir_decoder apir_new_decoder(const char * ptr, size_t size) { apir_decoder dec = { - .cur = ptr, - .end = ptr + size, + .cur = ptr, + .end = ptr + size, .fatal = false, }; @@ -79,10 +78,7 @@ static inline bool apir_decoder_get_fatal(const apir_decoder * dec) { * encode peek */ -static inline bool apir_decoder_peek_internal(apir_decoder * dec, - size_t size, - void * val, - size_t val_size) { +static inline bool apir_decoder_peek_internal(apir_decoder * dec, size_t size, void * val, size_t val_size) { assert(val_size <= size); if (unlikely(size > (size_t) (dec->end - dec->cur))) { @@ -332,8 +328,7 @@ static inline void apir_decode_char_array(apir_decoder * dec, char * val, size_t static inline void * apir_decoder_alloc_array(size_t size, size_t count) { size_t alloc_size; if (unlikely(__builtin_mul_overflow(size, count, &alloc_size))) { - GGML_LOG_ERROR("%s: overflow in array allocation of %zu * %zu bytes\n", - __func__, size, count); + GGML_LOG_ERROR("%s: overflow in array allocation of %zu * %zu bytes\n", __func__, size, count); return NULL; } @@ -352,20 +347,19 @@ static inline void apir_decode_bool_t(apir_decoder * dec, bool * val) { /* apir_buffer_type_host_handle_t */ -static inline void apir_encode_apir_buffer_type_host_handle_t(apir_encoder * enc, +static inline void apir_encode_apir_buffer_type_host_handle_t(apir_encoder * enc, const apir_buffer_type_host_handle_t * val) { apir_encode(enc, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t)); } -static inline void apir_decode_apir_buffer_type_host_handle_t(apir_decoder * dec, +static inline void apir_decode_apir_buffer_type_host_handle_t(apir_decoder * dec, apir_buffer_type_host_handle_t * val) { apir_decode(dec, sizeof(apir_buffer_type_host_handle_t), val, sizeof(apir_buffer_type_host_handle_t)); } /* apir_buffer_host_handle_t */ -static inline void apir_encode_apir_buffer_host_handle_t(apir_encoder * enc, - const apir_buffer_host_handle_t * val) { +static inline void apir_encode_apir_buffer_host_handle_t(apir_encoder * enc, const apir_buffer_host_handle_t * val) { apir_encode(enc, sizeof(apir_buffer_host_handle_t), val, sizeof(apir_buffer_host_handle_t)); } diff --git a/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h b/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h index 289f4b77d..fabe3e401 100644 --- a/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h +++ b/ggml/src/ggml-virtgpu/backend/shared/apir_cs_ggml.h @@ -1,11 +1,10 @@ -#include "ggml-impl.h" #include "apir_cs.h" #include "apir_cs_rpc.h" +#include "ggml-impl.h" // ggml_buffer_to_apir_host_handle(ggml_backend_buffer_t buffer); -static inline void apir_encode_ggml_buffer_host_handle(apir_encoder * enc, - const apir_buffer_host_handle_t * handle); +static inline void apir_encode_ggml_buffer_host_handle(apir_encoder * enc, const apir_buffer_host_handle_t * handle); static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec); @@ -22,8 +21,7 @@ static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_inplace(apir_decoder return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size); } -static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_array_inplace(apir_decoder * dec, - uint32_t n_tensors) { +static inline apir_rpc_tensor * apir_decode_apir_rpc_tensor_array_inplace(apir_decoder * dec, uint32_t n_tensors) { size_t apir_rpc_tensor_size = sizeof(apir_rpc_tensor) * n_tensors; return (apir_rpc_tensor *) (uintptr_t) apir_decoder_use_inplace(dec, apir_rpc_tensor_size); @@ -45,9 +43,9 @@ static inline const ggml_tensor * apir_decode_ggml_tensor(apir_decoder * dec) { } ggml_init_params params{ - /*.mem_size =*/ ggml_tensor_overhead(), - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, + /*.mem_size =*/ggml_tensor_overhead(), + /*.mem_buffer =*/NULL, + /*.no_alloc =*/true, }; ggml_context * ctx = ggml_init(params); @@ -105,6 +103,19 @@ static inline ggml_backend_buffer_t apir_decode_ggml_buffer(apir_decoder * dec) apir_decoder_read(dec, buffer_ptr_size, &buffer, buffer_ptr_size); + // SECURITY: Validate buffer handle against tracked buffers to prevent + // guest VM from providing arbitrary host memory addresses + if (buffer) { + extern std::unordered_set backend_buffers; + if (backend_buffers.find(buffer) == backend_buffers.end()) { + GGML_LOG_WARN("ggml-virtgpu-backend: %s: Invalid buffer handle from guest: %p\n", __func__, + (void *) buffer); + // Set fatal flag to prevent further processing with invalid handle + apir_decoder_set_fatal(dec); + return NULL; + } + } + return buffer; } diff --git a/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h b/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h index f68179895..4cb2f047d 100644 --- a/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h +++ b/ggml/src/ggml-virtgpu/backend/shared/apir_cs_rpc.h @@ -1,3 +1,6 @@ +#pragma once + +// clang-format off #include "ggml.h" #include "ggml-backend-impl.h" @@ -5,6 +8,7 @@ #include #include #include +// clang-format on // ggml_tensor is serialized into apir_rpc_tensor struct apir_rpc_tensor { diff --git a/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp b/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp index c493a8e2a..8fa20ff43 100644 --- a/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp +++ b/ggml/src/ggml-virtgpu/ggml-backend-buffer-type.cpp @@ -34,6 +34,7 @@ static ggml_backend_buffer_t ggml_backend_remoting_buffer_type_alloc_buffer(ggml static const char * ggml_backend_remoting_buffer_type_get_name(ggml_backend_buffer_type_t buft) { virtgpu * gpu = BUFT_TO_GPU(buft); + // Return the prefixed name that was built once during initialization return gpu->cached_buffer_type.name; } @@ -53,9 +54,8 @@ static size_t ggml_backend_remoting_buffer_type_get_alloc_size(ggml_backend_buff const ggml_tensor * tensor) { virtgpu * gpu = BUFT_TO_GPU(buft); - if (tensor->buffer == NULL - || !tensor->buffer->context - || !buft->device->iface.supports_buft(buft->device, tensor->buffer->buft)) { + if (tensor->buffer == NULL || !tensor->buffer->context || + !buft->device->iface.supports_buft(buft->device, tensor->buffer->buft)) { return ggml_nbytes(tensor); } diff --git a/ggml/src/ggml-virtgpu/ggml-backend-device.cpp b/ggml/src/ggml-virtgpu/ggml-backend-device.cpp index c7d288105..ec8156bb8 100644 --- a/ggml/src/ggml-virtgpu/ggml-backend-device.cpp +++ b/ggml/src/ggml-virtgpu/ggml-backend-device.cpp @@ -3,6 +3,7 @@ static const char * ggml_backend_remoting_device_get_name(ggml_backend_dev_t dev) { virtgpu * gpu = DEV_TO_GPU(dev); + // Return the prefixed name that was built once during initialization return gpu->cached_device_info.name; } @@ -22,7 +23,7 @@ static enum ggml_backend_dev_type ggml_backend_remoting_device_get_type(ggml_bac static void ggml_backend_remoting_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { virtgpu * gpu = DEV_TO_GPU(dev); - *free = gpu->cached_device_info.memory_free; + *free = gpu->cached_device_info.memory_free; *total = gpu->cached_device_info.memory_total; } @@ -72,7 +73,7 @@ static void ggml_backend_remoting_device_get_props(ggml_backend_dev_t dev, ggml_ ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_backend_dev_t dev) { virtgpu * gpu = DEV_TO_GPU(dev); - static std::atomic initialized = false; + static std::atomic initialized = false; static ggml_backend_buffer_type buft; if (!initialized) { @@ -95,7 +96,7 @@ ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_type(ggml_bac static ggml_backend_buffer_type_t ggml_backend_remoting_device_get_buffer_from_ptr_type(ggml_backend_dev_t dev) { virtgpu * gpu = DEV_TO_GPU(dev); - static std::atomic initialized = false; + static std::atomic initialized = false; static ggml_backend_buffer_type buft; if (!initialized) { diff --git a/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp b/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp index 2d02cfec1..a4df5956a 100644 --- a/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp +++ b/ggml/src/ggml-virtgpu/ggml-backend-reg.cpp @@ -7,8 +7,8 @@ void ggml_virtgpu_cleanup(virtgpu * gpu); static virtgpu * apir_initialize() { - static virtgpu * gpu = NULL; - static std::atomic initialized = false; + static virtgpu * gpu = NULL; + static std::atomic initialized = false; if (initialized) { // fast track @@ -31,29 +31,53 @@ static virtgpu * apir_initialize() { } // Pre-fetch and cache all device information, it will not change - gpu->cached_device_info.description = apir_device_get_description(gpu); + gpu->cached_device_info.description = apir_device_get_description(gpu); if (!gpu->cached_device_info.description) { GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu device description", __func__); } - gpu->cached_device_info.name = apir_device_get_name(gpu); - if (!gpu->cached_device_info.name) { - GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu device name", __func__); - } gpu->cached_device_info.device_count = apir_device_get_count(gpu); gpu->cached_device_info.type = apir_device_get_type(gpu); - apir_device_get_memory(gpu, - &gpu->cached_device_info.memory_free, - &gpu->cached_device_info.memory_total); + { + // Get the remote name and create prefixed version + char * rmt_device_name = apir_device_get_name(gpu); + if (!rmt_device_name) { + GGML_ABORT(GGML_VIRTGPU "%s: failed to get the virtgpu device name", __func__); + } + + size_t device_name_len = strlen(rmt_device_name) + 11; // "[virtgpu] " + null terminator + gpu->cached_device_info.name = (char *) malloc(device_name_len); + if (!gpu->cached_device_info.name) { + free(rmt_device_name); + GGML_ABORT(GGML_VIRTGPU "%s: failed to allocate memory for prefixed device name", __func__); + } + snprintf(gpu->cached_device_info.name, device_name_len, "[virtgpu] %s", rmt_device_name); + free(rmt_device_name); + } + + apir_device_get_memory(gpu, &gpu->cached_device_info.memory_free, &gpu->cached_device_info.memory_total); apir_buffer_type_host_handle_t buft_host_handle = apir_device_get_buffer_type(gpu); gpu->cached_buffer_type.host_handle = buft_host_handle; - gpu->cached_buffer_type.name = apir_buffer_type_get_name(gpu, buft_host_handle); - if (!gpu->cached_buffer_type.name) { - GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu buffer type name", __func__); + { + // Get the remote name and create prefixed version + char * rmt_name = apir_buffer_type_get_name(gpu, buft_host_handle); + if (!rmt_name) { + GGML_ABORT(GGML_VIRTGPU "%s: failed to get the virtgpu buffer type name", __func__); + } + + size_t prefixed_len = strlen(rmt_name) + 11; // "[virtgpu] " + null terminator + gpu->cached_buffer_type.name = (char *) malloc(prefixed_len); + if (!gpu->cached_buffer_type.name) { + free(rmt_name); + GGML_ABORT(GGML_VIRTGPU "%s: failed to allocate memory for prefixed buffer type name", __func__); + } + snprintf(gpu->cached_buffer_type.name, prefixed_len, "[virtgpu] %s", rmt_name); + free(rmt_name); } - gpu->cached_buffer_type.alignment = apir_buffer_type_get_alignment(gpu, buft_host_handle); - gpu->cached_buffer_type.max_size = apir_buffer_type_get_max_size(gpu, buft_host_handle); + + gpu->cached_buffer_type.alignment = apir_buffer_type_get_alignment(gpu, buft_host_handle); + gpu->cached_buffer_type.max_size = apir_buffer_type_get_max_size(gpu, buft_host_handle); initialized = true; } @@ -98,7 +122,7 @@ static void ggml_backend_remoting_reg_init_devices(ggml_backend_reg_t reg) { static std::atomic initialized = false; if (initialized) { - return; // fast track + return; // fast track } { diff --git a/ggml/src/ggml-virtgpu/ggml-backend.cpp b/ggml/src/ggml-virtgpu/ggml-backend.cpp index 5cd6c0c06..a63ee2b9d 100644 --- a/ggml/src/ggml-virtgpu/ggml-backend.cpp +++ b/ggml/src/ggml-virtgpu/ggml-backend.cpp @@ -1,5 +1,5 @@ -#include "ggml-remoting.h" #include "../../include/ggml-virtgpu.h" +#include "ggml-remoting.h" static const char * ggml_backend_remoting_get_name(ggml_backend_t backend) { UNUSED(backend); diff --git a/ggml/src/ggml-virtgpu/ggml-remoting.h b/ggml/src/ggml-virtgpu/ggml-remoting.h index 087664086..4f70326be 100644 --- a/ggml/src/ggml-virtgpu/ggml-remoting.h +++ b/ggml/src/ggml-virtgpu/ggml-remoting.h @@ -9,7 +9,7 @@ #include #define GGML_VIRTGPU_NAME "ggml-virtgpu" -#define GGML_VIRTGPU "ggml-virtgpu: " +#define GGML_VIRTGPU "ggml-virtgpu: " // USE_ALWAYS_TRUE_SUPPORTS_OP: 1 is fast, 0 avoid micro-benchmark crashes diff --git a/ggml/src/ggml-virtgpu/include/apir_hw.h b/ggml/src/ggml-virtgpu/include/apir_hw.h index 33af045ca..7d6ea2265 100644 --- a/ggml/src/ggml-virtgpu/include/apir_hw.h +++ b/ggml/src/ggml-virtgpu/include/apir_hw.h @@ -3,7 +3,7 @@ #include struct virgl_renderer_capset_apir { - uint32_t apir_version; - uint32_t supports_blob_resources; - uint32_t reserved[4]; // For future expansion + uint32_t apir_version; + uint32_t supports_blob_resources; + uint32_t reserved[4]; // For future expansion }; diff --git a/ggml/src/ggml-virtgpu/regenerate_remoting.py b/ggml/src/ggml-virtgpu/regenerate_remoting.py index aeb48a408..dae75fd1c 100755 --- a/ggml/src/ggml-virtgpu/regenerate_remoting.py +++ b/ggml/src/ggml-virtgpu/regenerate_remoting.py @@ -145,8 +145,31 @@ class RemotingCodebaseGenerator: enum_lines.append(f" APIR_BACKEND_DISPATCH_TABLE_COUNT = {total_count},") enum_lines.append("} ApirBackendCommandType;") + # Generate function name mapping + func_lines = [] + func_lines.append("static inline const char * apir_dispatch_command_name(ApirBackendCommandType type) {") + func_lines.append(" switch (type) {") + + current_group = None + for func in functions: + # Add comment for new group + if func['group_name'] != current_group: + func_lines.append(f" /* {func['group_description']} */") + current_group = func['group_name'] + + # Generate clean function name without backend_ prefix + clean_name = f"{func['group_name']}_{func['function_name']}" + func_lines.append(f" case {func['enum_name']}:") + func_lines.append(f" return \"{clean_name}\";") + + func_lines.append("") + func_lines.append(" default:") + func_lines.append(" return \"unknown\";") + func_lines.append(" }") + func_lines.append("}") + # Full header template - header_content = NL.join(enum_lines) + "\n" + header_content = NL.join(enum_lines) + "\n\n" + NL.join(func_lines) + "\n" return header_content @@ -170,19 +193,6 @@ class RemotingCodebaseGenerator: decl_lines.append(f"{signature} {func['backend_function']}({params});") - # Switch cases - switch_lines = [] - current_group = None - - for func in functions: - if func['group_name'] != current_group: - switch_lines.append(f" /* {func['group_description']} */") - current_group = func['group_name'] - - deprecated = " (DEPRECATED)" if func['deprecated'] else "" - - switch_lines.append(f" case {func['enum_name']}: return \"{func['backend_function']}{deprecated}\";") - # Dispatch table table_lines = [] current_group = None @@ -201,15 +211,6 @@ class RemotingCodebaseGenerator: {NL.join(decl_lines)} -static inline const char *backend_dispatch_command_name(ApirBackendCommandType type) -{{ - switch (type) {{ -{NL.join(switch_lines)} - - default: return "unknown"; - }} -}} - extern "C" {{ static const backend_dispatch_t apir_backend_dispatch_table[APIR_BACKEND_DISPATCH_TABLE_COUNT] = {{ {NL.join(table_lines)} diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp index 07d9a6684..4593690c6 100644 --- a/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp +++ b/ggml/src/ggml-virtgpu/virtgpu-forward-backend.cpp @@ -17,8 +17,8 @@ ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) { size_t cgraph_size = apir_serialize_ggml_cgraph(cgraph, cgraph_data); virtgpu_shmem temp_shmem; // Local storage for large buffers - virtgpu_shmem * shmem = &temp_shmem; - bool using_shared_shmem = false; + virtgpu_shmem * shmem = &temp_shmem; + bool using_shared_shmem = false; if (cgraph_size <= gpu->data_shmem.mmap_size) { // Lock mutex before using shared data_shmem buffer @@ -26,7 +26,7 @@ ggml_status apir_backend_graph_compute(virtgpu * gpu, ggml_cgraph * cgraph) { GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__); } using_shared_shmem = true; - shmem = &gpu->data_shmem; + shmem = &gpu->data_shmem; } else if (virtgpu_shmem_create(gpu, cgraph_size, shmem)) { GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__); } diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp index cab74fd17..38f8ec945 100644 --- a/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp +++ b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer-type.cpp @@ -62,7 +62,9 @@ size_t apir_buffer_type_get_max_size(virtgpu * gpu, apir_buffer_type_host_handle return max_size; } -apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle, size_t size) { +apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, + apir_buffer_type_host_handle_t host_handle, + size_t size) { apir_encoder * encoder; apir_decoder * decoder; ApirForwardReturnCode ret; @@ -84,7 +86,9 @@ apir_buffer_context_t apir_buffer_type_alloc_buffer(virtgpu * gpu, apir_buffer_t return buffer_context; } -size_t apir_buffer_type_get_alloc_size(virtgpu * gpu, apir_buffer_type_host_handle_t host_handle, const ggml_tensor * op) { +size_t apir_buffer_type_get_alloc_size(virtgpu * gpu, + apir_buffer_type_host_handle_t host_handle, + const ggml_tensor * op) { apir_encoder * encoder; apir_decoder * decoder; ApirForwardReturnCode ret; diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp index 86eee358c..228284f4a 100644 --- a/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp +++ b/ggml/src/ggml-virtgpu/virtgpu-forward-buffer.cpp @@ -35,8 +35,8 @@ void apir_buffer_set_tensor(virtgpu * gpu, apir_encode_ggml_tensor(encoder, tensor); virtgpu_shmem temp_shmem; // Local storage for large buffers - virtgpu_shmem * shmem = &temp_shmem; - bool using_shared_shmem = false; + virtgpu_shmem * shmem = &temp_shmem; + bool using_shared_shmem = false; if (size <= gpu->data_shmem.mmap_size) { // Lock mutex before using shared data_shmem buffer @@ -44,7 +44,7 @@ void apir_buffer_set_tensor(virtgpu * gpu, GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__); } using_shared_shmem = true; - shmem = &gpu->data_shmem; + shmem = &gpu->data_shmem; } else if (virtgpu_shmem_create(gpu, size, shmem)) { GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__); @@ -86,8 +86,8 @@ void apir_buffer_get_tensor(virtgpu * gpu, apir_encode_ggml_tensor(encoder, tensor); virtgpu_shmem temp_shmem; // Local storage for large buffers - virtgpu_shmem * shmem = &temp_shmem; - bool using_shared_shmem = false; + virtgpu_shmem * shmem = &temp_shmem; + bool using_shared_shmem = false; if (size <= gpu->data_shmem.mmap_size) { // Lock mutex before using shared data_shmem buffer @@ -95,7 +95,7 @@ void apir_buffer_get_tensor(virtgpu * gpu, GGML_ABORT(GGML_VIRTGPU "%s: Failed to lock data_shmem mutex", __func__); } using_shared_shmem = true; - shmem = &gpu->data_shmem; + shmem = &gpu->data_shmem; } else if (virtgpu_shmem_create(gpu, size, shmem)) { GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate the guest-host shared buffer", __func__); diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp b/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp index 4b6b8f527..9f513c138 100644 --- a/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp +++ b/ggml/src/ggml-virtgpu/virtgpu-forward-device.cpp @@ -26,7 +26,7 @@ char * apir_device_get_name(virtgpu * gpu) { REMOTE_CALL(gpu, encoder, decoder, ret); const size_t string_size = apir_decode_array_size_unchecked(decoder); - char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size); + char * string = (char *) apir_decoder_alloc_array(sizeof(char), string_size); if (!string) { GGML_LOG_ERROR(GGML_VIRTGPU "%s: Could not allocate the device name buffer\n", __func__); return NULL; @@ -173,7 +173,7 @@ apir_buffer_context_t apir_device_buffer_from_ptr(virtgpu * gpu, size_t size, si REMOTE_CALL_PREPARE(gpu, encoder, APIR_COMMAND_TYPE_DEVICE_BUFFER_FROM_PTR); if (virtgpu_shmem_create(gpu, size, &buffer_context.shmem)) { - GGML_ABORT(GGML_VIRTGPU "Couldn't allocate the guest-host shared buffer"); + GGML_ABORT(GGML_VIRTGPU "%s: Couldn't allocate %ldb of guest-host shared buffer", __func__, size); } apir_encode_virtgpu_shmem_res_id(encoder, buffer_context.shmem.res_id); diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h b/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h index f23c75bb9..4d0b6e05c 100644 --- a/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h +++ b/ggml/src/ggml-virtgpu/virtgpu-forward-impl.h @@ -1,29 +1,36 @@ -#include "virtgpu.h" +#pragma once +// clang-format off +#include "virtgpu.h" #include "ggml-remoting.h" #include "backend/shared/apir_backend.h" #include "backend/shared/apir_cs_ggml.h" - #include "ggml-backend-impl.h" +// clang-format on -#define REMOTE_CALL_PREPARE(gpu_dev_name, encoder_name, apir_command_type__) \ - do { \ - int32_t forward_flag = (int32_t) apir_command_type__; \ - encoder_name = remote_call_prepare(gpu_dev_name, APIR_COMMAND_TYPE_FORWARD, forward_flag); \ - if (!encoder_name) { \ - GGML_ABORT(GGML_VIRTGPU "%s: failed to prepare the remote call encoder", __func__); \ - } \ +#define REMOTE_CALL_PREPARE(gpu_dev_name, encoder_name, apir_command_type__) \ + int32_t REMOTE_CALL_PREPARE_forward_flag = (int32_t) apir_command_type__; \ + const char * REMOTE_CALL_PREPARE_command_name = apir_dispatch_command_name(apir_command_type__); \ + do { \ + encoder_name = remote_call_prepare(gpu_dev_name, APIR_COMMAND_TYPE_FORWARD, REMOTE_CALL_PREPARE_forward_flag); \ + if (!encoder_name) { \ + GGML_ABORT(GGML_VIRTGPU "%s: failed to prepare the remote call encoder", __func__); \ + } \ } while (0) -#define REMOTE_CALL(gpu_dev_name, encoder_name, decoder_name, ret_name) \ - do { \ - ret_name = (ApirForwardReturnCode) remote_call(gpu_dev_name, encoder_name, &decoder_name, 0, NULL); \ - if (!decoder_name) { \ - GGML_ABORT(GGML_VIRTGPU "%s: failed to kick the remote call", __func__); \ - } \ - if (ret_name < APIR_FORWARD_BASE_INDEX) { \ - GGML_ABORT(GGML_VIRTGPU "%s: failed to forward the API call: %s: code %d", __func__, \ - apir_forward_error(ret_name), ret_name); \ - } \ - ret_name = (ApirForwardReturnCode) (ret_name - APIR_FORWARD_BASE_INDEX); \ +#define REMOTE_CALL(gpu_dev_name, encoder_name, decoder_name, ret_name) \ + do { \ + ret_name = (ApirForwardReturnCode) remote_call(gpu_dev_name, encoder_name, &decoder_name, 0, NULL); \ + if (!decoder_name) { \ + GGML_ABORT(GGML_VIRTGPU "%s: failed to kick the remote call", __func__); \ + } \ + if (ret_name < APIR_FORWARD_BASE_INDEX) { \ + GGML_ABORT(GGML_VIRTGPU "%s: failed to forward the API call: %s: code %d", __func__, \ + apir_forward_error(ret_name), ret_name); \ + } \ + ret_name = (ApirForwardReturnCode) (ret_name - APIR_FORWARD_BASE_INDEX); \ + if (ret_name != 0) { \ + GGML_ABORT(GGML_VIRTGPU "backend function '%s' failed (return code: %d)", \ + REMOTE_CALL_PREPARE_command_name, ret_name); \ + } \ } while (0) diff --git a/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h b/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h index fe4cae202..44b0ad1ff 100644 --- a/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h +++ b/ggml/src/ggml-virtgpu/virtgpu-forward.gen.h @@ -20,6 +20,7 @@ apir_buffer_context_t apir_device_buffer_from_ptr(struct virtgpu * gpu, char * apir_buffer_type_get_name(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle); size_t apir_buffer_type_get_alignment(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle); size_t apir_buffer_type_get_max_size(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle); +/* apir_buffer_type_is_host is deprecated. */ apir_buffer_context_t apir_buffer_type_alloc_buffer(struct virtgpu * gpu, apir_buffer_type_host_handle_t host_handle, size_t size); diff --git a/ggml/src/ggml-virtgpu/virtgpu.cpp b/ggml/src/ggml-virtgpu/virtgpu.cpp index 1e650dc65..a84a77399 100644 --- a/ggml/src/ggml-virtgpu/virtgpu.cpp +++ b/ggml/src/ggml-virtgpu/virtgpu.cpp @@ -53,9 +53,9 @@ static int virtgpu_handshake(virtgpu * gpu) { if (!decoder) { GGML_ABORT(GGML_VIRTGPU - "%s: failed to initiate the communication with the virglrenderer library. " - "Most likely, the wrong virglrenderer library was loaded in the hypervisor.", - __func__); + "%s: failed to initiate the communication with the virglrenderer library. " + "Most likely, the wrong virglrenderer library was loaded in the hypervisor.", + __func__); return 1; } @@ -65,8 +65,7 @@ static int virtgpu_handshake(virtgpu * gpu) { uint32_t host_minor; if (ret_magic != APIR_HANDSHAKE_MAGIC) { - GGML_ABORT(GGML_VIRTGPU - "%s: handshake with the virglrenderer failed (code=%d | %s)", __func__, ret_magic, + GGML_ABORT(GGML_VIRTGPU "%s: handshake with the virglrenderer failed (code=%d | %s)", __func__, ret_magic, apir_backend_initialize_error(ret_magic)); } else { apir_decode_uint32_t(decoder, &host_major); @@ -140,15 +139,13 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) { "Make sure virglrenderer is correctly configured by the hypervisor. (%s) ", __func__, apir_load_library_error(ret)); } else { - GGML_ABORT(GGML_VIRTGPU - "%s: virglrenderer could not load the API Remoting backend library. (%s - code %d)", __func__, - apir_load_library_error(ret), ret); + GGML_ABORT(GGML_VIRTGPU "%s: virglrenderer could not load the API Remoting backend library. (%s - code %d)", + __func__, apir_load_library_error(ret), ret); } return ret; } - GGML_LOG_INFO(GGML_VIRTGPU - "%s: virglrenderer successfully loaded the API Remoting backend library.\n", __func__); + GGML_LOG_INFO(GGML_VIRTGPU "%s: virglrenderer successfully loaded the API Remoting backend library.\n", __func__); ApirLoadLibraryReturnCode apir_ret = (ApirLoadLibraryReturnCode) (ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX); @@ -158,10 +155,11 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) { "Make sure virglrenderer is correctly configured by the hypervisor. (%s)", __func__, apir_load_library_error(apir_ret)); } else if (apir_ret == APIR_LOAD_LIBRARY_SYMBOL_MISSING) { - GGML_ABORT(GGML_VIRTGPU - "%s: the API Remoting backend library couldn't load the GGML backend library, some symbols are missing. " - "Make sure virglrenderer is correctly configured by the hypervisor. (%s)", - __func__, apir_load_library_error(apir_ret)); + GGML_ABORT( + GGML_VIRTGPU + "%s: the API Remoting backend library couldn't load the GGML backend library, some symbols are missing. " + "Make sure virglrenderer is correctly configured by the hypervisor. (%s)", + __func__, apir_load_library_error(apir_ret)); } else if (apir_ret < APIR_LOAD_LIBRARY_INIT_BASE_INDEX) { GGML_ABORT(GGML_VIRTGPU "%s: the API Remoting backend library couldn't load the GGML backend library: apir code=%d | %s)", @@ -169,8 +167,8 @@ static ApirLoadLibraryReturnCode virtgpu_load_library(virtgpu * gpu) { } else { uint32_t lib_ret = apir_ret - APIR_LOAD_LIBRARY_INIT_BASE_INDEX; GGML_ABORT(GGML_VIRTGPU - "%s: the API Remoting backend library initialize its backend library: apir code=%d)", __func__, - lib_ret); + "%s: the API Remoting backend library failed to initialize its backend library: apir code=%d)", + __func__, lib_ret); } return ret; } @@ -184,55 +182,49 @@ virtgpu * create_virtgpu() { // Initialize mutex to protect shared data_shmem buffer if (mtx_init(&gpu->data_shmem_mutex, mtx_plain) != thrd_success) { delete gpu; - GGML_ABORT(GGML_VIRTGPU - "%s: failed to initialize data_shmem mutex", __func__); + GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize data_shmem mutex", __func__); return NULL; } if (virtgpu_open(gpu) != APIR_SUCCESS) { - GGML_LOG_ERROR(GGML_VIRTGPU - "%s: failed to open the virtgpu device\n", __func__); + GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to open the virtgpu device\n", __func__); return NULL; } if (virtgpu_init_capset(gpu) != APIR_SUCCESS) { if (gpu->use_apir_capset) { GGML_ABORT(GGML_VIRTGPU - "%s: failed to initialize the virtgpu APIR capset. Make sure that the virglrenderer library supports it.", __func__); + "%s: failed to initialize the virtgpu APIR capset. Make sure that the virglrenderer library " + "supports it.", + __func__); } else { - GGML_ABORT(GGML_VIRTGPU - "%s: failed to initialize the virtgpu Venus capset", __func__); + GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the virtgpu Venus capset", __func__); } return NULL; } if (virtgpu_init_context(gpu) != APIR_SUCCESS) { - GGML_ABORT(GGML_VIRTGPU - "%s: failed to initialize the GPU context", __func__); + GGML_ABORT(GGML_VIRTGPU "%s: failed to initialize the GPU context", __func__); return NULL; } if (virtgpu_shmem_create(gpu, SHMEM_REPLY_SIZE, &gpu->reply_shmem)) { - GGML_ABORT(GGML_VIRTGPU - "%s: failed to create the shared reply memory pages", __func__); + GGML_ABORT(GGML_VIRTGPU "%s: failed to create the shared reply memory pages", __func__); return NULL; } if (virtgpu_shmem_create(gpu, SHMEM_DATA_SIZE, &gpu->data_shmem)) { - GGML_ABORT(GGML_VIRTGPU - "%s: failed to create the shared data memory pages", __func__); + GGML_ABORT(GGML_VIRTGPU "%s: failed to create the shared data memory pages", __func__); return NULL; } if (virtgpu_handshake(gpu)) { - GGML_ABORT(GGML_VIRTGPU - "%s: failed to handshake with the virglrenderer library", __func__); + GGML_ABORT(GGML_VIRTGPU "%s: failed to handshake with the virglrenderer library", __func__); return NULL; } if (virtgpu_load_library(gpu) != APIR_LOAD_LIBRARY_SUCCESS) { - GGML_ABORT(GGML_VIRTGPU - "%s: failed to load the backend library", __func__); + GGML_ABORT(GGML_VIRTGPU "%s: failed to load the backend library", __func__); return NULL; } @@ -243,8 +235,7 @@ static virt_gpu_result_t virtgpu_open(virtgpu * gpu) { drmDevicePtr devs[8]; int count = drmGetDevices2(0, devs, ARRAY_SIZE(devs)); if (count < 0) { - GGML_LOG_ERROR(GGML_VIRTGPU - "%s: failed to enumerate DRM devices\n", __func__); + GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to enumerate DRM devices\n", __func__); return APIR_ERROR_INITIALIZATION_FAILED; } @@ -266,19 +257,17 @@ static virt_gpu_result_t virtgpu_open_device(virtgpu * gpu, const drmDevicePtr d int fd = open(node_path, O_RDWR | O_CLOEXEC); if (fd < 0) { - GGML_ABORT(GGML_VIRTGPU - "%s: failed to open %s", __func__, node_path); + GGML_ABORT(GGML_VIRTGPU "%s: failed to open %s", __func__, node_path); return APIR_ERROR_INITIALIZATION_FAILED; } drmVersionPtr version = drmGetVersion(fd); if (!version || strcmp(version->name, "virtio_gpu") || version->version_major != 0) { if (version) { - GGML_LOG_ERROR(GGML_VIRTGPU - "%s: unknown DRM driver %s version %d\n", __func__, version->name, version->version_major); + GGML_LOG_ERROR(GGML_VIRTGPU "%s: unknown DRM driver %s version %d\n", __func__, version->name, + version->version_major); } else { - GGML_LOG_ERROR(GGML_VIRTGPU - "%s: failed to get DRM driver version\n", __func__); + GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to get DRM driver version\n", __func__); } if (version) { @@ -322,9 +311,8 @@ static virt_gpu_result_t virtgpu_init_capset(virtgpu * gpu) { virtgpu_ioctl_get_caps(gpu, gpu->capset.id, gpu->capset.version, &gpu->capset.data, sizeof(gpu->capset.data)); if (ret) { - GGML_LOG_ERROR(GGML_VIRTGPU - "%s: failed to get APIR v%d capset: %s\n", - __func__, gpu->capset.version, strerror(errno)); + GGML_LOG_ERROR(GGML_VIRTGPU "%s: failed to get APIR v%d capset: %s\n", __func__, gpu->capset.version, + strerror(errno)); return APIR_ERROR_INITIALIZATION_FAILED; } @@ -547,13 +535,10 @@ static void log_call_duration(long long call_duration_ns, const char * name) { double call_duration_s = (double) call_duration_ns / 1e9; // 1 second = 1e9 nanoseconds if (call_duration_s > 1) { - GGML_LOG_INFO(GGML_VIRTGPU - "waited %.2fs for the %s host reply...\n", call_duration_s, name); + GGML_LOG_INFO(GGML_VIRTGPU "waited %.2fs for the %s host reply...\n", call_duration_s, name); } else if (call_duration_ms > 1) { - GGML_LOG_INFO(GGML_VIRTGPU - "waited %.2fms for the %s host reply...\n", call_duration_ms, name); + GGML_LOG_INFO(GGML_VIRTGPU "waited %.2fms for the %s host reply...\n", call_duration_ms, name); } else { - GGML_LOG_INFO(GGML_VIRTGPU - "waited %lldns for the %s host reply...\n", call_duration_ns, name); + GGML_LOG_INFO(GGML_VIRTGPU "waited %lldns for the %s host reply...\n", call_duration_ns, name); } } diff --git a/ggml/src/ggml-virtgpu/virtgpu.h b/ggml/src/ggml-virtgpu/virtgpu.h index 68e0f3a37..f82d8fb50 100644 --- a/ggml/src/ggml-virtgpu/virtgpu.h +++ b/ggml/src/ggml-virtgpu/virtgpu.h @@ -1,5 +1,6 @@ #pragma once +// clang-format off #include "virtgpu-utils.h" #include "virtgpu-shm.h" #include "virtgpu-apir.h" @@ -23,20 +24,21 @@ #include "apir_hw.h" #include #include "venus_hw.h" +// clang-format on #ifndef VIRTGPU_DRM_CAPSET_APIR // Will be defined include/drm/virtgpu_drm.h when // https://gitlab.freedesktop.org/virgl/virglrenderer/-/merge_requests/1590/diffs // is merged -#define VIRTGPU_DRM_CAPSET_APIR 10 +# define VIRTGPU_DRM_CAPSET_APIR 10 #endif // Mesa/Virlgrenderer Venus internal. Only necessary during the // Venus->APIR transition in Virglrenderer #define VENUS_COMMAND_TYPE_LENGTH 331 -#ifndef VIRTGPU_DRM_CAPSET_VENUS // only available with Linux >= v6.16 -#define VIRTGPU_DRM_CAPSET_VENUS 4 +#ifndef VIRTGPU_DRM_CAPSET_VENUS // only available with Linux >= v6.16 +# define VIRTGPU_DRM_CAPSET_VENUS 4 #endif typedef uint32_t virgl_renderer_capset; From b68d75165ad37ba1256cc45a43ec4f51cf813c3e Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 26 Feb 2026 21:01:08 +0800 Subject: [PATCH 05/15] llama: Add option to merge gate and exp weights (#19139) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * llama: Add option to merge gate and exp weights * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * Update convert_hf_to_gguf.py Co-authored-by: Sigbjørn Skjæret * update constants.py * add gate_up for the all MoE models * convert: simplify merge tensor condition * update constants.py * reduce number of models, add create_tensor_gate_up helper --------- Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 41 ++++++++++++++++++-- gguf-py/gguf/constants.py | 5 +++ gguf-py/gguf/tensor_mapping.py | 4 ++ src/llama-arch.cpp | 5 +++ src/llama-arch.h | 1 + src/llama-graph.cpp | 70 ++++++++++++++++++++++++---------- src/llama-graph.h | 7 +++- src/llama-model.cpp | 18 ++++++--- src/llama-model.h | 18 +++++---- src/models/deepseek2.cpp | 4 +- src/models/qwen35moe.cpp | 3 +- src/models/qwen3next.cpp | 3 +- 12 files changed, 135 insertions(+), 44 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 5709cb0e3..095441739 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -116,7 +116,8 @@ class ModelBase: split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None, disable_mistral_community_chat_template: bool = False, - sentence_transformers_dense_modules: bool = False): + sentence_transformers_dense_modules: bool = False, + fuse_gate_up_exps: bool = False): if type(self) is ModelBase or \ type(self) is TextModel or \ type(self) is MmprojModel: @@ -135,6 +136,9 @@ class ModelBase: self.dry_run = dry_run self.remote_hf_model_id = remote_hf_model_id self.sentence_transformers_dense_modules = sentence_transformers_dense_modules + self.fuse_gate_up_exps = fuse_gate_up_exps + self._gate_exp_buffer: dict[int, Tensor] = {} + self._up_exp_buffer: dict[int, Tensor] = {} self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id) self.metadata_override = metadata_override @@ -512,8 +516,31 @@ class ModelBase: raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - return [(self.map_tensor_name(name), data_torch)] + new_name = self.map_tensor_name(name) + + # Handle gate/up expert tensor fusion if enabled + if self.fuse_gate_up_exps and bid is not None: + if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_GATE_EXP, bid): + self._gate_exp_buffer[bid] = data_torch + elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid): + self._up_exp_buffer[bid] = data_torch + + # Check if both gate and up are buffered for this layer + if bid in self._gate_exp_buffer and bid in self._up_exp_buffer: + gate_data = self._gate_exp_buffer.pop(bid) + up_data = self._up_exp_buffer.pop(bid) + # gate/up shape: (n_expert, n_ff, n_embd), concatenate to (n_expert, n_ff*2, n_embd) + fused_data = torch.cat([gate_data, up_data], dim=1) + fused_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_UP_EXP, bid) + logger.info(f"Fused gate_exps and up_exps for layer {bid}") + return [(fused_name, fused_data)] + + # If we buffered a gate/up tensor, wait for the other + if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_GATE_EXP, bid) or \ + self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.FFN_UP_EXP, bid): + return [] + + return [(new_name, data_torch)] def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool: del name, new_name, bid, n_dims # unused @@ -11942,6 +11969,11 @@ def parse_args() -> argparse.Namespace: "Default these modules are not included.") ) + parser.add_argument( + "--fuse-gate-up-exps", action="store_true", + help="Fuse gate_exps and up_exps tensors into a single gate_up_exps tensor for MoE models.", + ) + args = parser.parse_args() if not args.print_supported_models and args.model is None: parser.error("the following arguments are required: model") @@ -12079,7 +12111,8 @@ def main() -> None: split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, small_first_shard=args.no_tensor_first_split, remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template, - sentence_transformers_dense_modules=args.sentence_transformers_dense_modules + sentence_transformers_dense_modules=args.sentence_transformers_dense_modules, + fuse_gate_up_exps=args.fuse_gate_up_exps ) if args.vocab_only: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e9ef97d55..839c6e787 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -532,6 +532,7 @@ class MODEL_TENSOR(IntEnum): FFN_GATE_EXP = auto() FFN_DOWN_EXP = auto() FFN_UP_EXP = auto() + FFN_GATE_UP_EXP = auto() FFN_GATE_SHEXP = auto() FFN_DOWN_SHEXP = auto() FFN_UP_SHEXP = auto() @@ -980,6 +981,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", + MODEL_TENSOR.FFN_GATE_UP_EXP: "blk.{bid}.ffn_gate_up_exps", MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b", MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n @@ -1820,6 +1822,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_GATE_UP_EXP, MODEL_TENSOR.SSM_A, MODEL_TENSOR.SSM_CONV1D, MODEL_TENSOR.SSM_DT, @@ -1909,6 +1912,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_GATE_UP_EXP, MODEL_TENSOR.SSM_A, MODEL_TENSOR.SSM_CONV1D, MODEL_TENSOR.SSM_DT, @@ -2610,6 +2614,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_GATE_EXP, MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_UP_EXP, MODEL_TENSOR.FFN_GATE_SHEXP, MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_UP_SHEXP, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index fc468d077..e57561090 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -567,6 +567,10 @@ class TensorNameMap: "model.layers.{bid}.mlp.chunk_experts.gate_proj", # grovemoe ), + MODEL_TENSOR.FFN_GATE_UP_EXP: ( + "model.layers.{bid}.mlp.experts.gate_up_proj", + ), + # Feed-forward down MODEL_TENSOR.FFN_DOWN: ( "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 106ea133a..47e8d5278 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -349,6 +349,7 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_GATE_UP_EXPS, "blk.%d.ffn_gate_up_exps" }, { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, @@ -1004,6 +1005,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_GATE_EXPS, LLM_TENSOR_FFN_DOWN_EXPS, LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_UP_EXPS, LLM_TENSOR_FFN_GATE_INP_SHEXP, LLM_TENSOR_FFN_GATE_SHEXP, LLM_TENSOR_FFN_DOWN_SHEXP, @@ -1061,6 +1063,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_GATE_EXPS, LLM_TENSOR_FFN_DOWN_EXPS, LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_UP_EXPS, LLM_TENSOR_FFN_GATE_INP_SHEXP, LLM_TENSOR_FFN_GATE_SHEXP, LLM_TENSOR_FFN_DOWN_SHEXP, @@ -1601,6 +1604,7 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_GATE_EXPS, LLM_TENSOR_FFN_DOWN_EXPS, LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_UP_EXPS, LLM_TENSOR_FFN_GATE_INP_SHEXP, LLM_TENSOR_FFN_GATE_SHEXP, LLM_TENSOR_FFN_DOWN_SHEXP, @@ -2685,6 +2689,7 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, + {LLM_TENSOR_FFN_GATE_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_DOWN_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_GATE_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, {LLM_TENSOR_FFN_UP_CHEXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 42a6ea38f..6d1b1df31 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -373,6 +373,7 @@ enum llm_tensor { LLM_TENSOR_FFN_DOWN_EXPS, // merged experts LLM_TENSOR_FFN_GATE_EXPS, LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_GATE_UP_EXPS, LLM_TENSOR_FFN_DOWN_SHEXP, LLM_TENSOR_FFN_GATE_SHEXP, LLM_TENSOR_FFN_UP_SHEXP, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index dc58c0826..23a86ea29 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1165,7 +1165,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn( float w_scale, llama_expert_gating_func_type gating_op, int il, - ggml_tensor * probs_in) const { + ggml_tensor * probs_in, + ggml_tensor * gate_up_exps) const { return build_moe_ffn( cur, gate_inp, /* gate_inp_b */ nullptr, @@ -1181,7 +1182,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn( w_scale, gating_op, il, - probs_in + probs_in, + gate_up_exps ); } @@ -1204,7 +1206,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn( float w_scale, llama_expert_gating_func_type gating_op, int il, - ggml_tensor * probs_in) const { + ggml_tensor * probs_in, + ggml_tensor * gate_up_exps, + ggml_tensor * gate_up_exps_b) const { const int64_t n_embd = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN @@ -1343,26 +1347,48 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cb(cur, "ffn_moe_weighted", il); } - ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(up, "ffn_moe_up", il); - - if (up_exps_b) { - up = ggml_add_id(ctx0, up, up_exps_b, selected_experts); - cb(up, "ffn_moe_up_biased", il); - } - + ggml_tensor * up = nullptr; ggml_tensor * experts = nullptr; - if (gate_exps) { - cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + + if (gate_up_exps) { + // merged gate_up path: one mul_mat_id, then split into gate and up views + ggml_tensor * gate_up = build_lora_mm_id(gate_up_exps, cur, selected_experts); // [n_ff*2, n_expert_used, n_tokens] + cb(gate_up, "ffn_moe_gate_up", il); + + if (gate_up_exps_b) { + gate_up = ggml_add_id(ctx0, gate_up, gate_up_exps_b, selected_experts); + cb(gate_up, "ffn_moe_gate_up_biased", il); + } + + const int64_t n_ff = gate_up->ne[0] / 2; + cur = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], 0); cb(cur, "ffn_moe_gate", il); + up = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], n_ff * gate_up->nb[0]); + cb(up, "ffn_moe_up", il); } else { - cur = up; + // separate gate and up path + up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(up, "ffn_moe_up", il); + + if (up_exps_b) { + up = ggml_add_id(ctx0, up, up_exps_b, selected_experts); + cb(up, "ffn_moe_up_biased", il); + } + + if (gate_exps) { + cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(cur, "ffn_moe_gate", il); + } else { + cur = up; + } + + if (gate_exps_b) { + cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts); + cb(cur, "ffn_moe_gate_biased", il); + } } - if (gate_exps_b) { - cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts); - cb(cur, "ffn_moe_gate_biased", il); - } + const bool has_gate = gate_exps || gate_up_exps; switch (type_op) { case LLM_FFN_SILU: @@ -1385,7 +1411,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn( break; } } + } + if (has_gate) { cur = ggml_swiglu_split(ctx0, cur, up); cb(cur, "ffn_moe_swiglu", il); } else { @@ -1393,7 +1421,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cb(cur, "ffn_moe_silu", il); } break; case LLM_FFN_GELU: - if (gate_exps) { + if (has_gate) { cur = ggml_geglu_split(ctx0, cur, up); cb(cur, "ffn_moe_geglu", il); } else { @@ -1409,7 +1437,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cb(cur, "ffn_moe_swiglu_oai", il); } break; case LLM_FFN_RELU: - if (gate_exps) { + if (has_gate) { cur = ggml_reglu_split(ctx0, cur, up); cb(cur, "ffn_moe_reglu", il); } else { @@ -1417,7 +1445,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cb(cur, "ffn_moe_relu", il); } break; case LLM_FFN_RELU_SQR: - if (gate_exps) { + if (has_gate) { // TODO: add support for gated squared relu GGML_ABORT("fatal error: gated squared relu not implemented"); } else { diff --git a/src/llama-graph.h b/src/llama-graph.h index 22d11a838..e8f006977 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -814,7 +814,8 @@ struct llm_graph_context { float w_scale, llama_expert_gating_func_type gating_op, int il, - ggml_tensor * probs_in = nullptr) const; + ggml_tensor * probs_in = nullptr, + ggml_tensor * gate_up_exps = nullptr) const; ggml_tensor * build_moe_ffn( ggml_tensor * cur, @@ -835,7 +836,9 @@ struct llm_graph_context { float w_scale, llama_expert_gating_func_type gating_op, int il, - ggml_tensor * probs_in = nullptr) const; + ggml_tensor * probs_in = nullptr, + ggml_tensor * gate_up_exps = nullptr, + ggml_tensor * gate_up_exps_b = nullptr) const; // // inputs diff --git a/src/llama-model.cpp b/src/llama-model.cpp index c8ef1b5e7..dabf3b308 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2980,6 +2980,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // TODO: move to a separate function const auto tn = LLM_TN(arch); + + // helper: try merged gate_up_exps first, fall back to separate gate and up + auto create_tensor_gate_up_exps = [&](llama_layer & layer, int bid, int64_t n_embd_, int64_t n_ff_, int64_t n_expert_, int flags) { + layer.ffn_gate_up_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_UP_EXPS, "weight", bid), {n_embd_, n_ff_ * 2, n_expert_}, TENSOR_NOT_REQUIRED); + if (layer.ffn_gate_up_exps == nullptr) { + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", bid), {n_embd_, n_ff_, n_expert_}, flags); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", bid), {n_embd_, n_ff_, n_expert_}, flags); + } + }; switch (arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: @@ -5221,9 +5230,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } // MoE branch - layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0); layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0); - layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0); + create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0); // Shared expert branch layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0); @@ -7425,9 +7433,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0); - layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0); - layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); + create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0); // Shared experts layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), { n_embd }, 0); @@ -7491,9 +7498,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0); - layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, 0); - layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); + create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, 0); // Shared experts const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff; diff --git a/src/llama-model.h b/src/llama-model.h index 96e407a0b..d7c3e7d1c 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -280,14 +280,16 @@ struct llama_layer { struct ggml_tensor * ffn_up_enc = nullptr; // ff MoE - struct ggml_tensor * ffn_gate_inp = nullptr; - struct ggml_tensor * ffn_gate_exps = nullptr; - struct ggml_tensor * ffn_down_exps = nullptr; - struct ggml_tensor * ffn_up_exps = nullptr; - struct ggml_tensor * ffn_gate_inp_b = nullptr; - struct ggml_tensor * ffn_gate_exps_b = nullptr; - struct ggml_tensor * ffn_down_exps_b = nullptr; - struct ggml_tensor * ffn_up_exps_b = nullptr; + struct ggml_tensor * ffn_gate_inp = nullptr; + struct ggml_tensor * ffn_gate_exps = nullptr; + struct ggml_tensor * ffn_down_exps = nullptr; + struct ggml_tensor * ffn_up_exps = nullptr; + struct ggml_tensor * ffn_gate_up_exps = nullptr; + struct ggml_tensor * ffn_gate_inp_b = nullptr; + struct ggml_tensor * ffn_gate_exps_b = nullptr; + struct ggml_tensor * ffn_down_exps_b = nullptr; + struct ggml_tensor * ffn_up_exps_b = nullptr; + struct ggml_tensor * ffn_gate_up_exps_b = nullptr; // ff shared expert (shexp) struct ggml_tensor * ffn_gate_inp_shexp = nullptr; diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index b2c1f1606..b608396e5 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -218,7 +218,9 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr LLM_FFN_SILU, hparams.expert_weights_norm, hparams.expert_weights_scale, hparams.expert_weights_scale, (llama_expert_gating_func_type) hparams.expert_gating_func, - il); + il, + nullptr, + model.layers[il].ffn_gate_up_exps); cb(moe_out, "ffn_moe_out", il); // FFN shared expert diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index 77f18b5ae..22d708f20 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -380,7 +380,8 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_ffn(ggml_tensor * cur, const int model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, nullptr, n_expert, n_expert_used, LLM_FFN_SILU, - true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); + true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, + nullptr, model.layers[il].ffn_gate_up_exps); cb(moe_out, "ffn_moe_out", il); // Add shared experts if present - following Qwen3Next reference implementation diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index 9d3a68cfe..f2621200f 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -479,7 +479,8 @@ ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int model.layers[il].ffn_gate_exps, model.layers[il].ffn_down_exps, nullptr, n_expert, n_expert_used, LLM_FFN_SILU, - true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); + true, false, 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il, + nullptr, model.layers[il].ffn_gate_up_exps); cb(moe_out, "ffn_moe_out", il); // Add shared experts if present - following Qwen3Next reference implementation From 99bd67c9b29851f758c7d22caa8fc57fd5af3e4f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 26 Feb 2026 18:08:54 +0200 Subject: [PATCH 06/15] kv-cache : fix can_shift() check to take into account M-RoPE (#19928) --- src/llama-kv-cache.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index cb702b2a5..6b668ee9a 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -978,6 +978,9 @@ bool llama_kv_cache::get_can_shift() const { if (model.arch == LLM_ARCH_STEP35) { return false; } + if (hparams.n_pos_per_embd() > 1) { + return false; + } return true; } From 01cd448b8c3ebf30724675cc3d1e6cee74e8bdc4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 26 Feb 2026 18:20:16 +0200 Subject: [PATCH 07/15] server : fix ctx checkpoint restore logic (#19924) --- tools/server/server-context.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 73af81243..eba463e4d 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2363,7 +2363,7 @@ private: //printf("[DEBUG] `do_reset` was set to `true` after failing to restore a checkpoint"); } else { pos_next = std::min(pos_next, std::max(it->pos_min + 1, it->pos_max)); - n_past = slot.prompt.tokens.size_up_to_pos(pos_next); + n_past = std::min(slot.prompt.tokens.size_up_to_pos(pos_next), (size_t) it->n_tokens); SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n", it->pos_min, it->pos_max, it->n_tokens, (float) checkpoint_size / 1024 / 1024); } } From 37964f44f9fab37571b27cccd9f45d4a066e0817 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 26 Feb 2026 18:39:49 +0200 Subject: [PATCH 08/15] mtmd : fix padding of n_tokens (#19930) --- tools/mtmd/mtmd-helper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp index 902a4b456..c75f90730 100644 --- a/tools/mtmd/mtmd-helper.cpp +++ b/tools/mtmd/mtmd-helper.cpp @@ -248,7 +248,7 @@ int32_t mtmd_helper_decode_image_chunk( int32_t n_tokens = mtmd_input_chunk_get_n_tokens(chunk); int32_t i_batch = 0; - int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; + int32_t n_img_batches = (n_tokens + n_batch - 1) / n_batch; decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd); if (mtmd_decode_use_mrope(ctx)) { From 723c71064da0908c19683f8c344715fbf6d986fd Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Thu, 26 Feb 2026 19:11:04 +0100 Subject: [PATCH 09/15] vulkan: fix fp16 Flash Attention on Windows AMD RDNA2 and below (#19921) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 22 +++++++++---------- .../vulkan-shaders/flash_attn.comp | 9 +++++++- .../vulkan-shaders/flash_attn_base.glsl | 7 +++--- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index a1149e606..0fae68628 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -624,8 +624,6 @@ struct vk_device_struct { // floor(log2(maxComputeWorkGroupInvocations)) uint32_t max_workgroup_size_log2 {}; - bool flash_attention_fp16; - bool coopmat_support; bool coopmat_acc_f32_support {}; bool coopmat_acc_f16_support {}; @@ -2978,11 +2976,15 @@ static vk_fa_tuning_params get_fa_tuning_params(const vk_device& device, uint32_ } } -static vk_fa_pipeline_state get_fa_pipeline_state(const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool aligned, bool f32acc, +static vk_fa_pipeline_state get_fa_pipeline_state(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool aligned, bool f32acc, bool use_mask, bool use_mask_opt, bool use_logit_softcap) { + const bool old_amd_windows = device->vendor_id == VK_VENDOR_ID_AMD && device->driver_id == vk::DriverId::eAmdProprietary && + (device->architecture == AMD_GCN || device->architecture == AMD_RDNA1 || device->architecture == AMD_RDNA2); + uint32_t flags = (use_mask_opt ? 1 : 0) | (use_mask ? 2 : 0) | - (use_logit_softcap ? 4 : 0); + (use_logit_softcap ? 4 : 0) | + (old_amd_windows ? 8 : 0); const uint32_t subgroup_size = params.disable_subgroups ? 0 : params.subgroup_size; @@ -3384,7 +3386,7 @@ static void ggml_vk_load_shaders(vk_device& device) { } \ } - if (device->flash_attention_fp16) { + if (device->fp16) { CREATE_FA(GGML_TYPE_F32, f32, FA_SCALAR, ) CREATE_FA(GGML_TYPE_F16, f16, FA_SCALAR, ) CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, ) @@ -5423,10 +5425,6 @@ static vk_device ggml_vk_get_device(size_t idx) { device->mmvq_mode = 1; } - // Driver issues with older AMD GPUs on Windows, see https://github.com/ggml-org/llama.cpp/pull/19625#issuecomment-3940840613 - const bool is_amd_proprietary_gcn = device->vendor_id == VK_VENDOR_ID_AMD && device->architecture == AMD_GCN && device->driver_id == vk::DriverId::eAmdProprietary; - device->flash_attention_fp16 = device->fp16 && !is_amd_proprietary_gcn; - return device; } @@ -8567,7 +8565,7 @@ static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, con const uint32_t Br = params.block_rows; const uint32_t Bc = params.block_cols; - const uint32_t float_type_size = device->flash_attention_fp16 ? sizeof(ggml_fp16_t) : sizeof(float); + const uint32_t float_type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float); // tmpsh is overestimated slightly const uint32_t tmpsh = wg_size * sizeof(float); @@ -8690,7 +8688,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx uint32_t workgroups_y = (uint32_t)neq2; uint32_t workgroups_z = (uint32_t)neq3; - const bool f32acc = !ctx->device->flash_attention_fp16 || dst->op_params[3] == GGML_PREC_F32; + const bool f32acc = !ctx->device->fp16 || dst->op_params[3] == GGML_PREC_F32; // For scalar/coopmat1 FA, we can use the "large" size to accommodate qga. // For coopmat2 FA, we always use the small size (which is still pretty large for gqa). @@ -8745,7 +8743,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx // Only use mask opt when the mask is fairly large. This hasn't been tuned extensively. bool use_mask_opt = mask && nem1 >= 32 && nem0 * nem1 > 32768; - vk_fa_pipeline_state fa_pipeline_state = get_fa_pipeline_state(tuning_params, HSK, HSV, aligned, f32acc, + vk_fa_pipeline_state fa_pipeline_state = get_fa_pipeline_state(ctx->device, tuning_params, HSK, HSV, aligned, f32acc, mask != nullptr, use_mask_opt, logit_softcap != 0); vk_pipeline pipeline = nullptr; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp index 135ab1ad6..ec48f5b11 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp @@ -465,7 +465,14 @@ void main() { if (SubGroupSize > 0) { [[unroll]] for (uint s = D_split; s < SubGroupSize; s *= 2) { - Of[r][d] += subgroupShuffleXor(Of[r][d], s); + if (!OLD_AMD_WINDOWS) { + Of[r][d] += subgroupShuffleXor(Of[r][d], s); + } else { + // Something about f16vec4 subgroupShuffleXor is broken on AMD Windows RDNA2 and below. + // Shuffle full vec4 as workaround. + // See https://github.com/ggml-org/llama.cpp/issues/19881#issuecomment-3958643697 + Of[r][d] += FLOAT_TYPEV4(subgroupShuffleXor(vec4(Of[r][d]), s)); + } } if (row_split == 1) { barrier(); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl index d444542b5..172d38f03 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl @@ -14,9 +14,10 @@ layout (constant_id = 9) const uint32_t SHMEM_STAGING = 0; layout (constant_id = 10) const uint32_t Flags = 0; layout (constant_id = 11) const uint32_t LIMIT_OCCUPANCY_SHMEM = 0; -const bool USE_MASK_OPT = (Flags & 1) != 0; -const bool MASK_ENABLE = (Flags & 2) != 0; -const bool LOGIT_SOFTCAP = (Flags & 4) != 0; +const bool USE_MASK_OPT = (Flags & 1) != 0; +const bool MASK_ENABLE = (Flags & 2) != 0; +const bool LOGIT_SOFTCAP = (Flags & 4) != 0; +const bool OLD_AMD_WINDOWS = (Flags & 8) != 0; // Round up head sizes to a multiple of 16, for coopmat1/coopmat2 paths const uint32_t HSK_pad = (HSK + 15) & ~15; From 4e76d24f282e0fa591f1eb87ae6fd9174c6ed998 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Thu, 26 Feb 2026 21:39:11 +0100 Subject: [PATCH 10/15] ggml : fix AMX and add batched support (#19925) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit llama-perplexity -hf ggml-org/Qwen3-0.6B-GGUF:Q4_0 -f wikitext-2-raw/wiki.test.raw -c 2048 -b 2048 --chunks 2 before this commit: ``` perplexity: calculating perplexity over 2 chunks, n_ctx=2048, batch_size=2048, n_seq=1 perplexity: 2.31 seconds per pass - ETA 0.07 minutes [1]17.3868,[2]22.2199, Final estimate: PPL = 22.2199 +/- 1.59692 llama_perf_context_print: load time = 878.56 ms llama_perf_context_print: prompt eval time = 2037.82 ms / 4096 tokens ( 0.50 ms per token, 2009.99 tokens per second) llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) llama_perf_context_print: total time = 6403.17 ms / 4097 tokens llama_perf_context_print: graphs reused = 0 llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted | llama_memory_breakdown_print: | - Host | 845 = 318 + 224 + 302 | llama_memory_breakdown_print: | - CPU_REPACK | 288 = 288 + 0 + 0 | llama_memory_breakdown_print: | - AMX | 31 = 31 + 0 + 0 | ``` after this commit: ``` perplexity: calculating perplexity over 2 chunks, n_ctx=2048, batch_size=2048, n_seq=1 perplexity: 1.98 seconds per pass - ETA 0.05 minutes [1]17.2005,[2]21.8220, Final estimate: PPL = 21.8220 +/- 1.56485 llama_perf_context_print: load time = 719.23 ms llama_perf_context_print: prompt eval time = 1676.23 ms / 4096 tokens ( 0.41 ms per token, 2443.58 tokens per second) llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) llama_perf_context_print: total time = 4258.74 ms / 4097 tokens llama_perf_context_print: graphs reused = 0 llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted | llama_memory_breakdown_print: | - Host | 845 = 318 + 224 + 302 | llama_memory_breakdown_print: | - AMX | 319 = 319 + 0 + 0 | ``` (no more CPU_REPACK) after this commit, disabling amx: ``` perplexity: calculating perplexity over 2 chunks, n_ctx=2048, batch_size=2048, n_seq=1 perplexity: 2.34 seconds per pass - ETA 0.07 minutes [1]17.2005,[2]21.8220, Final estimate: PPL = 21.8220 +/- 1.56485 llama_perf_context_print: load time = 841.91 ms llama_perf_context_print: prompt eval time = 2057.28 ms / 4096 tokens ( 0.50 ms per token, 1990.98 tokens per second) llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) llama_perf_context_print: total time = 6454.51 ms / 4097 tokens llama_perf_context_print: graphs reused = 0 llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted | llama_memory_breakdown_print: | - Host | 845 = 318 + 224 + 302 | llama_memory_breakdown_print: | - CPU_REPACK | 319 = 319 + 0 + 0 | ``` => same perplexity. Signed-off-by: Adrien Gallouët --- ggml/src/ggml-cpu/amx/amx.cpp | 63 ++++++++----- ggml/src/ggml-cpu/amx/mmq.cpp | 162 +++++++++++++++++----------------- 2 files changed, 124 insertions(+), 101 deletions(-) diff --git a/ggml/src/ggml-cpu/amx/amx.cpp b/ggml/src/ggml-cpu/amx/amx.cpp index 895a57137..9baf3e025 100644 --- a/ggml/src/ggml-cpu/amx/amx.cpp +++ b/ggml/src/ggml-cpu/amx/amx.cpp @@ -141,27 +141,50 @@ static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_typ namespace ggml::cpu::amx { class extra_buffer_type : ggml::cpu::extra_buffer_type { bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override { - // handle only 2d gemm for now - auto is_contiguous_2d = [](const struct ggml_tensor * t) { - return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1; - }; - - if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && // src0 must be contiguous - is_contiguous_2d(op->src[1]) && // src1 must be contiguous - op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() && - op->src[0]->ne[0] % (TILE_K * 2 * 32) == 0 && // TODO: not sure if correct (https://github.com/ggml-org/llama.cpp/pull/16315) - op->ne[0] % (TILE_N * 2) == 0 && // out_features is 32x - (qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) { - // src1 must be host buffer - if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { - return false; - } - // src1 must be float32 - if (op->src[1]->type == GGML_TYPE_F32) { - return true; - } + if (op->op != GGML_OP_MUL_MAT) { + return false; } - return false; + auto * src0 = op->src[0]; + auto * src1 = op->src[1]; + + if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) { + return false; + } + if (!src0->buffer || src0->buffer->buft != ggml_backend_amx_buffer_type()) { + return false; + } + if (src1->buffer && !ggml_backend_buft_is_host(src1->buffer->buft)) { + return false; + } + if (op->ne[0] % (TILE_N * 2)) { + return false; + } + int alignment; + switch (src0->type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q8_0: + alignment = TILE_K; + break; + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_IQ4_XS: + alignment = 256; // QK_K + break; + case GGML_TYPE_F16: + alignment = 16; + break; + default: + return false; + } + if (src0->ne[0] % alignment) { + return false; + } + if (src1->type != GGML_TYPE_F32) { + return false; + } + return true; } ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override { diff --git a/ggml/src/ggml-cpu/amx/mmq.cpp b/ggml/src/ggml-cpu/amx/mmq.cpp index 47c61b881..b5aca7663 100644 --- a/ggml/src/ggml-cpu/amx/mmq.cpp +++ b/ggml/src/ggml-cpu/amx/mmq.cpp @@ -1,4 +1,3 @@ - #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Wpedantic" #pragma GCC diagnostic ignored "-Wunused-local-typedefs" @@ -202,35 +201,27 @@ struct tile_config_t{ // advanced-matrix-extensions-intrinsics-functions.html // -#define TC_CONFIG_TILE(i, r, cb) tc.rows[i] = r; tc.colsb[i] = cb -void ggml_tile_config_init(void) { - static thread_local bool is_first_time = true; +inline void ggml_tile_config_init(void) { + static thread_local bool done = false; - if (!is_first_time) { + if (done) { return; } - static thread_local tile_config_t tc; - tile_config_t current_tc; - _tile_storeconfig(¤t_tc); + alignas(64) tile_config_t tc = {}; + tc.palette_id = 1; + tc.start_row = 0; + tc.rows[0] = 8; tc.colsb[0] = 64; + tc.rows[1] = 8; tc.colsb[1] = 64; + tc.rows[2] = 16; tc.colsb[2] = 32; + tc.rows[3] = 16; tc.colsb[3] = 32; + tc.rows[4] = 16; tc.colsb[4] = 64; + tc.rows[5] = 16; tc.colsb[5] = 64; + tc.rows[6] = 16; tc.colsb[6] = 64; + tc.rows[7] = 16; tc.colsb[7] = 64; - // load only when config changes - if (tc.palette_id == 0 || (memcmp(¤t_tc.colsb, &tc.colsb, sizeof(uint16_t) * 8) != 0 && - memcmp(¤t_tc.rows, &tc.rows, sizeof(uint8_t) * 8) != 0)) { - tc.palette_id = 1; - tc.start_row = 0; - TC_CONFIG_TILE(TMM0, 8, 64); - TC_CONFIG_TILE(TMM1, 8, 64); - TC_CONFIG_TILE(TMM2, 16, 32); - TC_CONFIG_TILE(TMM3, 16, 32); - TC_CONFIG_TILE(TMM4, 16, 64); - TC_CONFIG_TILE(TMM5, 16, 64); - TC_CONFIG_TILE(TMM6, 16, 64); - TC_CONFIG_TILE(TMM7, 16, 64); - _tile_loadconfig(&tc); - } - - is_first_time = false; + _tile_loadconfig(&tc); + done = true; } // we need an extra 16 * 4B (TILE_N * int32_t) for each NB/KB block for compensation. @@ -268,33 +259,6 @@ int get_row_size(int K) { return row_size; } -// vectorized dtype conversion -inline float FP16_TO_FP32(ggml_half val) { - __m256i v = _mm256_setr_epi16( - val, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - __m512 o = _mm512_cvtph_ps(v); - return _mm512_cvtss_f32(o); -} - -inline __m512 FP16_TO_FP32_VEC(ggml_half val) { - __m256i v = _mm256_set1_epi16(val); - return _mm512_cvtph_ps(v); -} - -// horizontal reduce -inline float _mm512_reduce_max_ps(const __m512 x) { - __m512 v = x; - __m512 v1 = _mm512_shuffle_f32x4(v, v, 0x4E); - v = _mm512_max_ps(v, v1); - v1 = _mm512_shuffle_f32x4(v, v, 0xB1); - v = _mm512_max_ps(v, v1); - v1 = _mm512_shuffle_ps(v, v, 0x4E); - v = _mm512_max_ps(v, v1); - v1 = _mm512_shuffle_ps(v, v, 0xB1); - v = _mm512_max_ps(v, v1); - return _mm512_cvtss_f32(v); -} - // transpose utils #define SHUFFLE_EPI32(a, b, mask) \ _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), mask)) @@ -1370,9 +1334,9 @@ struct tinygemm_kernel_avx #define LAUNCH_TINYGEMM_KERNEL_AVX(MB_SIZE, NB_SIZE) \ tinygemm_kernel_avx::apply( \ - K, (const float *)src1->data + mb_start * K, \ - (const type *)src0->data + nb_start * K, \ - (float *)dst->data + mb_start * ldc + nb_start, ldc); + K, (const float *)src1->data + src1_offset + mb_start * K, \ + (const type *)src0->data + src0_offset + nb_start * K, \ + (float *)dst->data + dst_offset + mb_start * ldc + nb_start, ldc) // re-organize in the format {NB, KB, TILE_SIZE}: @@ -2019,11 +1983,11 @@ struct tinygemm_kernel_vnni::apply( \ - KB, (const char *)wdata + 0 * row_size_A, \ - (const char *)src0->data + PACKED_INDEX(nb * kTilesN, 0, KB, TILE_SIZE), \ - (float *) dst->data + 0 * N + nb_start, ldc) +#define LAUNCH_TINYGEMM_KERNEL_VNNI(NB_SIZE) \ + tinygemm_kernel_vnni::apply( \ + KB, wdata_batch, \ + (const char *)src0->data + src0_offset + PACKED_INDEX(nb * kTilesN, 0, KB, TILE_SIZE), \ + (float *) dst->data + dst_offset + nb_start, ldc) template ::value, int>::type = 0> @@ -2079,7 +2043,7 @@ void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const v _tile_stored(TMM5, Tile5(C_pre), TILE_N * sizeof(int32_t)); if (need_unpack) { - unpack_B(Tile1, B_blk0); + unpack_B(Tile1, B_blk1); _tile_loadd(TMM1, Tile1, TILE_N * VNNI_BLK); } else { _tile_loadd(TMM1, B_blk1, TILE_N * VNNI_BLK); @@ -2336,6 +2300,13 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d }); } +// ne2 is passed explicitly to help compiler optimize repeated calls +inline int64_t ggml_batch_offset(const ggml_tensor * t, int64_t batch_idx, int64_t ne2) { + const int64_t i2 = batch_idx % ne2; + const int64_t i3 = batch_idx / ne2; + return i3 * t->nb[3] + i2 * t->nb[2]; +} + size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst) { struct ggml_tensor * src0 = dst->src[0]; @@ -2348,12 +2319,13 @@ size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst) { const int M = dst->ne[1]; const int K = src0->ne[0]; + const int64_t n_batch = dst->ne[2] * dst->ne[3]; size_t desired_wsize = 0; GGML_DISPATCH_QTYPES(TYPE, [&] { const size_t row_size_A = K / blck_size * sizeof(vec_dot_type); - desired_wsize = M * row_size_A; + desired_wsize = n_batch * M * row_size_A; }); return desired_wsize; @@ -2365,7 +2337,7 @@ size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst) { // src1: input in shape of {M, K}, float32 // dst: output in shape of {M, N}, float32 // -// the function performs: dst = src1 @ src0.T +// the function performs: dst = src1 @ src0.T for each batch // void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_tensor * dst) { struct ggml_tensor * src0 = dst->src[0]; @@ -2382,17 +2354,26 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te const int K = src0->ne[0]; const int ldc = dst->nb[1] / dst->nb[0]; + const int64_t ne2 = dst->ne[2]; + const int64_t n_batch = ne2 * dst->ne[3]; + if (is_floating_type) { constexpr int BLOCK_M = 4; constexpr int BLOCK_N = 6; const int MB = div_up(M, BLOCK_M); const int NB = div_up(N, BLOCK_N); - parallel_for_ggml(params, MB * NB, [&](int begin, int end) { + parallel_for_ggml(params, n_batch * MB * NB, [&](int begin, int end) { GGML_DISPATCH_FLOATING_TYPES(TYPE, [&] { for (int i = begin; i < end; ++i) { - int mb = i / NB; - int nb = i % NB; + int batch_idx = i / (MB * NB); + int remaining = i % (MB * NB); + int mb = remaining / NB; + int nb = remaining % NB; + + int64_t src0_offset = ggml_batch_offset(src0, batch_idx, ne2); + int64_t src1_offset = ggml_batch_offset(src1, batch_idx, ne2); + int64_t dst_offset = ggml_batch_offset(dst, batch_idx, ne2); int mb_start = mb * BLOCK_M; int mb_size = std::min(BLOCK_M, M - mb_start); @@ -2424,10 +2405,10 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te void * wdata = params->wdata; //TODO: performance improvement: merge quant A - if (params->ith == 0) { + // if (params->ith == 0) { GGML_DISPATCH_QTYPES(TYPE, [&] { const size_t row_size_A = K / blck_size * sizeof(vec_dot_type); - const size_t desired_wsize = M * row_size_A; + const size_t desired_wsize = n_batch * M * row_size_A; if (params->wsize < desired_wsize) { GGML_ABORT("insufficient work space size"); } @@ -2436,12 +2417,19 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size); - const float * A_data = static_cast(src1->data); - for (int m = 0; m < M; ++m) { - from_float(A_data + m * K, (char *)wdata + m * row_size_A, K); - } + parallel_for_ggml(params, n_batch, [&](int begin, int end) { + for (int batch_idx = begin; batch_idx < end; ++batch_idx) { + int64_t src1_offset = ggml_batch_offset(src1, batch_idx, ne2); + const float * A_data = (const float *)((const char *)src1->data + src1_offset); + char * wdata_batch = (char *)wdata + batch_idx * M * row_size_A; + + for (int m = 0; m < M; ++m) { + from_float(A_data + m * K, wdata_batch + m * row_size_A, K); + } + } + }); }); - } + // } ggml_barrier(params->threadpool); @@ -2451,13 +2439,19 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te constexpr int BLOCK_N = TILE_N * kTilesN; const int NB = div_up(N, BLOCK_N); - parallel_for_ggml(params, NB, [&](int begin, int end) { + parallel_for_ggml(params, n_batch * NB, [&](int begin, int end) { GGML_DISPATCH_QTYPES(TYPE, [&] { const int KB = K / blck_size; const int TILE_SIZE = get_tile_size(); const int row_size_A = KB * sizeof(vec_dot_type); for (int i = begin; i < end; ++i) { - int nb = i; + int batch_idx = i / NB; + int nb = i % NB; + + int64_t src0_offset = ggml_batch_offset(src0, batch_idx, ne2); + int64_t dst_offset = ggml_batch_offset(dst, batch_idx, ne2); + const char * wdata_batch = (const char *)wdata + batch_idx * row_size_A; + int nb_start = nb * BLOCK_N; int nb_size = std::min(BLOCK_N, N - nb_start); // 32, 64, 96 @@ -2481,7 +2475,7 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te const int MB = div_up(M, BLOCK_M); const int NB = div_up(N, BLOCK_N); - parallel_for_ggml(params, MB * NB, [&](int begin, int end) { + parallel_for_ggml(params, n_batch * MB * NB, [&](int begin, int end) { // init tile config for each thread ggml_tile_config_init(); @@ -2491,8 +2485,14 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te const int row_size_A = KB * sizeof(vec_dot_type); for (int i = begin; i < end; ++i) { - int mb = i / NB; - int nb = i % NB; + int batch_idx = i / (MB * NB); + int remaining = i % (MB * NB); + int mb = remaining / NB; + int nb = remaining % NB; + + int64_t src0_offset = ggml_batch_offset(src0, batch_idx, ne2); + int64_t dst_offset = ggml_batch_offset(dst, batch_idx, ne2); + const char * wdata_batch = (const char *)wdata + batch_idx * M * row_size_A; int mb_start = mb * BLOCK_M; int mb_size = std::min(BLOCK_M, M - mb_start); @@ -2501,9 +2501,9 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te tinygemm_kernel_amx( mb_size, nb_size, KB, - (const char *)wdata + mb_start * row_size_A, - (const char *)src0->data + PACKED_INDEX(nb * 2, 0, KB, TILE_SIZE), - (float *) dst->data + mb_start * N + nb_start, ldc); + wdata_batch + mb_start * row_size_A, + (const char *)src0->data + src0_offset + PACKED_INDEX(nb * 2, 0, KB, TILE_SIZE), + (float *) dst->data + dst_offset + mb_start * N + nb_start, ldc); } }); }); From 88cf781f51ee0c35fcbe973d3eaad0cb932d65c3 Mon Sep 17 00:00:00 2001 From: Vishal Singh Date: Fri, 27 Feb 2026 06:13:41 +0530 Subject: [PATCH 11/15] ggml-zendnn: update code for latest ZenDNN API (#19923) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - adapt ggml-zendnn.cpp to the new lowoha::matmul interface - update the ZenDNN git tag in CMake to the latest release (ZenDNN‑2026‑WW08) - add static lib support in CMake --- docs/backend/ZenDNN.md | 75 +++++++--------------------- ggml/src/ggml-zendnn/CMakeLists.txt | 63 ++++++++++++----------- ggml/src/ggml-zendnn/ggml-zendnn.cpp | 6 +-- 3 files changed, 52 insertions(+), 92 deletions(-) diff --git a/docs/backend/ZenDNN.md b/docs/backend/ZenDNN.md index b57fd97b6..3b1f8242f 100644 --- a/docs/backend/ZenDNN.md +++ b/docs/backend/ZenDNN.md @@ -22,7 +22,7 @@ **Llama.cpp + ZenDNN** -The llama.cpp ZenDNN backend leverages AMD's optimized matrix multiplication primitives to accelerate inference on AMD CPUs. It utilizes ZenDNN's **LowOHA (Low Overhead Hardware Accelerated)** MatMul operator for efficient GEMM operations with minimal execution overhead, built-in weight caching, and direct access to backend libraries (AOCL BLIS, LibXSMM, OneDNN). +The llama.cpp ZenDNN backend leverages AMD's optimized matrix multiplication primitives to accelerate inference on AMD CPUs. It utilizes ZenDNN's **LowOHA (Low Overhead Hardware Accelerated)** MatMul operator for efficient GEMM operations with minimal execution overhead, built-in weight caching, and direct access to backend libraries (AOCL DLP, LibXSMM, OneDNN). For more information about ZenDNN, visit: https://www.amd.com/en/developer/zendnn.html @@ -32,7 +32,7 @@ For more information about ZenDNN, visit: https://www.amd.com/en/developer/zendn |:-------:|:-------:|:----------------------------------------------:| | Linux | Support | Ubuntu 20.04, 22.04, 24.04 | -For the latest list of supported operating systems, see the [ZenDNN Supported OS](https://github.com/amd/ZenDNN/blob/zendnnl/README.md#15-supported-os). +For the latest list of supported operating systems, see the [ZenDNN Supported OS](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/README.md#15-supported-os). ## Hardware @@ -44,9 +44,9 @@ ZenDNN is optimized for AMD EPYC™ processors and AMD Ryzen™ processors based | CPU Family | Status | Notes | |:-----------------------------:|:-------:|:----------------------------------:| -| AMD EPYC™ 9005 Series (Turin)| Support | 5th Gen - Zen 5 architecture | -| AMD EPYC™ 9004 Series (Genoa)| Support | 4th Gen - Zen 4 architecture | -| AMD EPYC™ 7003 Series (Milan)| Support | 3rd Gen - Zen 3 architecture | +| AMD EPYC™ 9005 Series (Turin) | Support | 5th Gen - Zen 5 architecture | +| AMD EPYC™ 9004 Series (Genoa) | Support | 4th Gen - Zen 4 architecture | +| AMD EPYC™ 7003 Series (Milan) | Support | 3rd Gen - Zen 3 architecture | | AMD Ryzen™ AI MAX (Strix Halo)| Support | High-performance mobile processors | *Notes:* @@ -61,7 +61,7 @@ The ZenDNN backend currently accelerates **matrix multiplication (MUL_MAT)** ope | Operation | Status | Notes | |:-------------|:-------:|:----------------------------------------------:| -| MUL_MAT | ✓ | Accelerated via ZenDNN LowOHA MatMul | +| MUL_MAT | Support | Accelerated via ZenDNN LowOHA MatMul | *Note:* Since only MUL_MAT is accelerated, models will benefit most from ZenDNN when matrix multiplications dominate the computational workload (which is typical for transformer-based LLMs). @@ -104,7 +104,6 @@ If you want to build ZenDNN yourself or use a specific version: # Clone ZenDNN repository git clone https://github.com/amd/ZenDNN.git cd ZenDNN -git checkout zendnnl # Build and install (requires CMake >= 3.25) mkdir build && cd build @@ -114,7 +113,7 @@ cmake --build . --target all Default installation path: `ZenDNN/build/install` -**For detailed build instructions**, refer to the [ZenDNN README](https://github.com/amd/ZenDNN/blob/zendnnl/README.md). +**For detailed build instructions**, refer to the [ZenDNN README](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/README.md). **Step 2: Build llama.cpp with custom ZenDNN path** @@ -146,8 +145,7 @@ Run llama.cpp server with ZenDNN acceleration: ```sh # Set optimal configuration -export OMP_NUM_THREADS=64 # Adjust to your CPU core count -export ZENDNNL_MATMUL_ALGO=2 # Blocked AOCL BLIS for best performance +export ZENDNNL_MATMUL_ALGO=1 # Blocked AOCL DLP algo for best performance # Start server ./build/bin/llama-server \ @@ -160,62 +158,26 @@ export ZENDNNL_MATMUL_ALGO=2 # Blocked AOCL BLIS for best performance Access the server at `http://localhost:8080`. **Performance tips**: -- Set `OMP_NUM_THREADS` to match your physical core count -- Use `ZENDNNL_MATMUL_ALGO=2` for optimal performance +- Use `ZENDNNL_MATMUL_ALGO=1` for optimal performance - For NUMA systems: `numactl --cpunodebind=0 --membind=0 ./build/bin/llama-server ...` ## Environment Variable -### Build Time +For environment variables related to ZenDNN, refer to the [ZenDNN Environment Variables Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/runtime_env.md). -| Name | Value | Function | -|--------------------|---------------------------------------|---------------------------------------------| -| GGML_ZENDNN | ON/OFF | Enable ZenDNN backend support | -| ZENDNN_ROOT | Path to ZenDNN installation | Set ZenDNN installation directory | -| GGML_OPENMP | ON/OFF (recommended: ON) | Enable OpenMP for multi-threading | +### Performance Optimization -### Runtime - -| Name | Value | Function | -|-------------------------|--------------------------|-------------------------------------------------------------------| -| OMP_NUM_THREADS | Number (e.g., 64) | Set number of OpenMP threads (recommended: physical core count) | -| ZENDNNL_MATMUL_ALGO | 0-5 | Select MatMul backend algorithm (see Performance Optimization) | -| ZENDNNL_PROFILE_LOG_LEVEL | 0-4 | Profiling log level (0=disabled, 4=verbose) | -| ZENDNNL_ENABLE_PROFILER | 0 or 1 | Enable detailed profiling (1=enabled) | -| ZENDNNL_API_LOG_LEVEL | 0-4 | API log level (0=disabled, 4=verbose) | - -**Example**: +ZenDNN's LowOHA MatMul supports multiple backend algorithms. For **best performance**, use the **Blocked AOCL DLP** algorithm: ```sh -export OMP_NUM_THREADS=64 -export ZENDNNL_MATMUL_ALGO=2 # Use Blocked AOCL BLIS for best performance -./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "Test" -n 100 +export ZENDNNL_MATMUL_ALGO=1 # Blocked AOCL DLP algo (recommended) ``` -## Performance Optimization - -### MatMul Algorithm Selection - -ZenDNN's LowOHA MatMul supports multiple backend algorithms. For **best performance**, use the **Blocked AOCL BLIS** algorithm: - -```sh -export ZENDNNL_MATMUL_ALGO=2 # Blocked AOCL BLIS (recommended) -``` - -**Available algorithms**: - -| Value | Algorithm | Description | -|:-----:|:-----------------------|:----------------------------------------------| -| 0 | Dynamic Dispatch | Automatic backend selection (default) | -| 1 | AOCL BLIS | AOCL BLIS backend | -| 2 | AOCL BLIS Blocked | **Blocked AOCL BLIS (recommended)** | -| 3 | OneDNN | OneDNN backend | -| 4 | OneDNN Blocked | Blocked OneDNN | -| 5 | LibXSMM | LibXSMM backend | +For more details on available algorithms, see the [ZenDNN MatMul Algorithm Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/runtime_env.md#algorithm-details). ### Profiling and Debugging -For detailed profiling and logging options, refer to the [ZenDNN Logging Documentation](https://github.com/amd/ZenDNN/blob/zendnnl/docs/logging.md). +For detailed profiling and logging options, refer to the [ZenDNN Logging Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/logging.md). ## Known Issues @@ -245,10 +207,9 @@ A: Currently, ZenDNN primarily supports FP32 and BF16 data types. Quantized mode A: Ensure: 1. You're using an AMD EPYC or Ryzen processor (Zen 2 or newer) -2. `OMP_NUM_THREADS` is set appropriately (physical core count) -3. `ZENDNNL_MATMUL_ALGO=2` is set for best performance (Blocked AOCL BLIS) -4. You're using a sufficiently large model (small models may not benefit as much) -5. Enable profiling to verify ZenDNN MatMul is being called +2. `ZENDNNL_MATMUL_ALGO=1` is set for best performance (Blocked AOCL DLP) +3. You're using a sufficiently large model (small models may not benefit as much) +4. Enable profiling to verify ZenDNN MatMul is being called ### **GitHub Contribution**: Please add the **[ZenDNN]** prefix/tag in issues/PRs titles to help the ZenDNN-team check/address them without delay. diff --git a/ggml/src/ggml-zendnn/CMakeLists.txt b/ggml/src/ggml-zendnn/CMakeLists.txt index f5cf6eedd..9bdb4e836 100644 --- a/ggml/src/ggml-zendnn/CMakeLists.txt +++ b/ggml/src/ggml-zendnn/CMakeLists.txt @@ -1,12 +1,19 @@ ggml_add_backend_library(ggml-zendnn ggml-zendnn.cpp) -# Get ZenDNN path if (NOT DEFINED ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "") set(ZENDNN_ROOT "$ENV{ZENDNN_ROOT}") endif() -# Check if path is still empty or OFF +if (BUILD_SHARED_LIBS) + set(ZENDNN_SHARED_LIB ON) + set(ZENDNN_ARCHIVE_LIB OFF) +else() + set(ZENDNN_SHARED_LIB OFF) + set(ZENDNN_ARCHIVE_LIB ON) +endif() + +# Download and build ZenDNN if not provided if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF") message(STATUS "ZENDNN_ROOT not set. Automatically downloading and building ZenDNN...") message(STATUS "This will take several minutes on first build...") @@ -21,7 +28,7 @@ if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF") ExternalProject_Add( zendnn GIT_REPOSITORY https://github.com/amd/ZenDNN.git - GIT_TAG 21ce8f7879c86bf3637f707fae6f29e0951db5fe + GIT_TAG a18adf8c605fb5f5e52cefd7eda08a7b18febbaf # ZenDNN-2026-WW08 PREFIX ${ZENDNN_PREFIX} SOURCE_DIR ${ZENDNN_SOURCE_DIR} BINARY_DIR ${ZENDNN_BUILD_DIR} @@ -32,7 +39,9 @@ if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF") -DZENDNNL_BUILD_DOXYGEN=OFF -DZENDNNL_BUILD_GTEST=OFF -DZENDNNL_BUILD_BENCHDNN=OFF - # Enable ALL matmul algorithm backends + -DZENDNNL_DEPENDS_FBGEMM=OFF + -DZENDNNL_LIB_BUILD_ARCHIVE=${ZENDNN_ARCHIVE_LIB} + -DZENDNNL_LIB_BUILD_SHARED=${ZENDNN_SHARED_LIB} -DZENDNNL_DEPENDS_AOCLDLP=ON -DZENDNNL_DEPENDS_ONEDNN=ON -DZENDNNL_DEPENDS_LIBXSMM=ON @@ -45,47 +54,37 @@ if (NOT ZENDNN_ROOT OR ZENDNN_ROOT STREQUAL "" OR ZENDNN_ROOT STREQUAL "OFF") LOG_INSTALL ON ) - # Add dependency so ZenDNN builds before our library add_dependencies(ggml-zendnn zendnn) - - # Set ZENDNN_ROOT to the installation directory set(ZENDNN_ROOT ${ZENDNN_INSTALL_DIR}) - message(STATUS "ZenDNN will be built to: ${ZENDNN_ROOT}") else() message(STATUS "Using custom ZenDNN installation at: ${ZENDNN_ROOT}") endif() -# ZenDNN headers + libs target_include_directories(ggml-zendnn PRIVATE ${ZENDNN_ROOT}/zendnnl/include - ${ZENDNN_ROOT}/deps/aocldlp/include - ${ZENDNN_ROOT}/deps/aoclutils/include ${ZENDNN_ROOT}/deps/json/include - ${ZENDNN_ROOT}/deps/libxsmm/include + ${ZENDNN_ROOT}/deps/aoclutils/include + ${ZENDNN_ROOT}/deps/aocldlp/include ${ZENDNN_ROOT}/deps/onednn/include -) + ${ZENDNN_ROOT}/deps/libxsmm/include) -target_link_directories(ggml-zendnn PRIVATE - ${ZENDNN_ROOT}/zendnnl/lib - ${ZENDNN_ROOT}/deps/aocldlp/lib - ${ZENDNN_ROOT}/deps/aoclutils/lib - ${ZENDNN_ROOT}/deps/libxsmm/lib - ${ZENDNN_ROOT}/deps/onednn/lib -) +if (ZENDNN_SHARED_LIB) + target_link_directories(ggml-zendnn PRIVATE ${ZENDNN_ROOT}/zendnnl/lib) + target_link_libraries(ggml-zendnn PRIVATE zendnnl) +elseif (ZENDNN_ARCHIVE_LIB) + target_link_libraries(ggml-zendnn PRIVATE + ${ZENDNN_ROOT}/zendnnl/lib/libzendnnl_archive.a + ${ZENDNN_ROOT}/deps/aoclutils/${CMAKE_INSTALL_LIBDIR}/libaoclutils.a + ${ZENDNN_ROOT}/deps/aoclutils/${CMAKE_INSTALL_LIBDIR}/libau_cpuid.a + ${ZENDNN_ROOT}/deps/aocldlp/lib/libaocl-dlp.a + ${ZENDNN_ROOT}/deps/onednn/${CMAKE_INSTALL_LIBDIR}/libdnnl.a + ${ZENDNN_ROOT}/deps/libxsmm/lib/libxsmm.a + ${ZENDNN_ROOT}/deps/libxsmm/lib/libxsmmext.a + ${ZENDNN_ROOT}/deps/libxsmm/lib/libxsmmnoblas.a) +endif() -target_link_libraries(ggml-zendnn PRIVATE - zendnnl_archive # ZenDNN main - aocl-dlp # AOCL libraries - aoclutils - au_cpuid - dnnl # OneDNN - xsmm # libxsmm small matrix math - xsmmext - xsmmnoblas - m - pthread -) +target_link_libraries(ggml-zendnn PRIVATE m pthread) if (GGML_OPENMP) target_link_libraries(ggml-zendnn PRIVATE OpenMP::OpenMP_CXX) diff --git a/ggml/src/ggml-zendnn/ggml-zendnn.cpp b/ggml/src/ggml-zendnn/ggml-zendnn.cpp index 551c15bb4..c87603040 100644 --- a/ggml/src/ggml-zendnn/ggml-zendnn.cpp +++ b/ggml/src/ggml-zendnn/ggml-zendnn.cpp @@ -41,13 +41,13 @@ static bool ggml_zendnn_matmul(ggml_backend_zendnn_context * ctx, int64_t m, int const TA * A, int64_t lda, const TB * B, int64_t ldb, TC * C, int64_t ldc) { - zendnnl::lowoha::lowoha_params params; + zendnnl::lowoha::matmul::matmul_params params; params.dtypes.src = ggml_to_zendnn_type(); params.dtypes.wei = ggml_to_zendnn_type(); params.dtypes.dst = ggml_to_zendnn_type(); params.num_threads = ctx->n_threads; - zendnnl::lowoha::status_t status = zendnnl::lowoha::matmul_direct( + zendnnl::error_handling::status_t status = zendnnl::lowoha::matmul::matmul_direct( 'r', false, true, // row-major, don't transpose B, transpose A (because it's column-major) n, // M: rows of B and C m, // N: cols of A^T and C @@ -63,7 +63,7 @@ static bool ggml_zendnn_matmul(ggml_backend_zendnn_context * ctx, int64_t m, int params // params ); - if (status != zendnnl::lowoha::status_t::success) { + if (status != zendnnl::error_handling::status_t::success) { GGML_LOG_ERROR("%s, ZenDNN matmul failed: status=%d\n", __func__, static_cast(status)); return false; } From c17dce4f5c725054e10cfafc883cbd24e8b6303b Mon Sep 17 00:00:00 2001 From: Neo Zhang Date: Fri, 27 Feb 2026 09:26:07 +0800 Subject: [PATCH 12/15] replace the magic nunber 768 by max work group size to support iGPU (#19920) Co-authored-by: Neo Zhang Jianyu --- ggml/src/ggml-sycl/add-id.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/add-id.cpp b/ggml/src/ggml-sycl/add-id.cpp index 00c073cf9..8929017a9 100644 --- a/ggml/src/ggml-sycl/add-id.cpp +++ b/ggml/src/ggml-sycl/add-id.cpp @@ -55,7 +55,11 @@ void ggml_sycl_add_id(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { const int32_t* src2_d = (const int32_t*)src2->data; float* dst_d = (float*)dst->data; - int threads = std::min((int)ne00, 768); // cols + const unsigned int max_work_group_size = ggml_sycl_info().max_work_group_sizes[ctx.device]; + assert(work_group_size % (WARP_SIZE * WARP_SIZE) == 0); + + int threads = std::min((unsigned int)ne00, max_work_group_size); // cols + ctx.stream()->parallel_for( sycl::nd_range<3>( sycl::range<3>(1, ne02, ne01) * sycl::range<3>(1, 1, threads), From a8b192b6ec4fd517fc966ef6fa02c01f8157bbee Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr Date: Fri, 27 Feb 2026 05:37:54 +0100 Subject: [PATCH 13/15] tests : enable test-chat out of tree build (#19558) The binary relies on model files that it tries to find. However, when configuring the build directory to be parallel to the source tree those heuristics fail. This sets the working directory for the test executable to be the source-tree which resolves this issue. --- tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 350bffc31..d98a090c3 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -152,7 +152,7 @@ if (NOT WIN32 OR NOT BUILD_SHARED_LIBS) llama_build_and_test(test-grammar-parser.cpp) llama_build_and_test(test-grammar-integration.cpp) llama_build_and_test(test-llama-grammar.cpp) - llama_build_and_test(test-chat.cpp) + llama_build_and_test(test-chat.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8 if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) From 2e7e6385230ede73135191e83c81cc9f138c2d65 Mon Sep 17 00:00:00 2001 From: Pascal Date: Fri, 27 Feb 2026 07:05:23 +0100 Subject: [PATCH 14/15] server : support multiple model aliases via comma-separated --alias (#19926) * server : support multiple model aliases via comma-separated --alias * server : update --alias description and regenerate docs * server : multiple model aliases and tags - address review feedback from ngxson - --alias accepts comma-separated values (std::set, no duplicates) - --tags for informational metadata (not used for routing) - aliases resolve transparently in router via get_meta/has_model - /v1/models exposes aliases and tags fields * regenerate docs * nits * server : use first alias as model_name for backward compat address review feedback from ngxson * server : add single-model test for aliases and tags --- common/arg.cpp | 21 +++++- common/common.h | 3 +- tools/cli/README.md | 10 +-- tools/completion/README.md | 10 +-- tools/server/README.md | 21 ++++-- tools/server/server-context.cpp | 14 +++- tools/server/server-context.h | 3 + tools/server/server-models.cpp | 99 ++++++++++++++++++++++++--- tools/server/server-models.h | 2 + tools/server/server.cpp | 2 +- tools/server/tests/unit/test_basic.py | 17 +++++ tools/server/tests/utils.py | 3 + 12 files changed, 173 insertions(+), 32 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 1e8885c9c..05f4a5244 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2520,11 +2520,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); add_opt(common_arg( {"-a", "--alias"}, "STRING", - "set alias for model name (to be used by REST API)", + "set model name aliases, comma-separated (to be used by API)", [](common_params & params, const std::string & value) { - params.model_alias = value; + for (auto & alias : string_split(value, ',')) { + alias = string_strip(alias); + if (!alias.empty()) { + params.model_alias.insert(alias); + } + } } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS")); + add_opt(common_arg( + {"--tags"}, "STRING", + "set model tags, comma-separated (informational, not used for routing)", + [](common_params & params, const std::string & value) { + for (auto & tag : string_split(value, ',')) { + tag = string_strip(tag); + if (!tag.empty()) { + params.model_tags.insert(tag); + } + } + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TAGS")); add_opt(common_arg( {"-m", "--model"}, "FNAME", ex == LLAMA_EXAMPLE_EXPORT_LORA diff --git a/common/common.h b/common/common.h index 1fa172865..c5a803757 100644 --- a/common/common.h +++ b/common/common.h @@ -410,7 +410,8 @@ struct common_params { struct common_params_model model; - std::string model_alias = ""; // model alias // NOLINT + std::set model_alias; // model aliases // NOLINT + std::set model_tags; // model tags (informational, not used for routing) // NOLINT std::string hf_token = ""; // HF token // NOLINT std::string prompt = ""; // NOLINT std::string system_prompt = ""; // NOLINT diff --git a/tools/cli/README.md b/tools/cli/README.md index 4a15cbad9..22d3fc87e 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -57,8 +57,8 @@ | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | -| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)
(env: LLAMA_ARG_DIO) | +| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -109,14 +109,14 @@ | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--temp N` | temperature (default: 0.80) | +| `--temp, --temperature N` | temperature (default: 0.80) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | | `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | | `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | -| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | | `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | | `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | -| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | +| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | | `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | diff --git a/tools/completion/README.md b/tools/completion/README.md index 3ca3e6845..bcc088765 100644 --- a/tools/completion/README.md +++ b/tools/completion/README.md @@ -140,8 +140,8 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | -| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)
(env: LLAMA_ARG_DIO) | +| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -192,14 +192,14 @@ llama-completion.exe -m models\gemma-1.1-7b-it.Q4_K_M.gguf --ignore-eos -n -1 | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--temp N` | temperature (default: 0.80) | +| `--temp, --temperature N` | temperature (default: 0.80) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | | `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | | `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | -| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | | `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | | `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | -| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | +| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | | `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | diff --git a/tools/server/README.md b/tools/server/README.md index 34b722a27..da16ddc75 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -74,8 +74,8 @@ For the full list of features, please refer to [server's changelog](https://gith | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--mmap, --no-mmap` | whether to memory-map model. Explicitly enabling mmap disables direct-io. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | -| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. Takes precedence over --mmap (default: enabled)
(env: LLAMA_ARG_DIO) | +| `--mmap, --no-mmap` | whether to memory-map model. (if mmap disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | +| `-dio, --direct-io, -ndio, --no-direct-io` | use DirectIO if available. (default: disabled)
(env: LLAMA_ARG_DIO) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device ` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -126,14 +126,14 @@ For the full list of features, please refer to [server's changelog](https://gith | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--temp N` | temperature (default: 0.80) | +| `--temp, --temperature N` | temperature (default: 0.80) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled)
(env: LLAMA_ARG_TOP_K) | | `--top-p N` | top-p sampling (default: 0.95, 1.0 = disabled) | | `--min-p N` | min-p sampling (default: 0.05, 0.0 = disabled) | -| `--top-nsigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | +| `--top-nsigma, --top-n-sigma N` | top-n-sigma sampling (default: -1.00, -1.0 = disabled) | | `--xtc-probability N` | xtc probability (default: 0.00, 0.0 = disabled) | | `--xtc-threshold N` | xtc threshold (default: 0.10, 1.0 = disabled) | -| `--typical N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | +| `--typical, --typical-p N` | locally typical sampling, parameter p (default: 1.00, 1.0 = disabled) | | `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) | | `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.00, 1.0 = disabled) | | `--presence-penalty N` | repeat alpha presence penalty (default: 0.00, 0.0 = disabled) | @@ -162,9 +162,11 @@ For the full list of features, please refer to [server's changelog](https://gith | Argument | Explanation | | -------- | ----------- | +| `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) | +| `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) | | `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | | `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | -| `-kvu, --kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)
(env: LLAMA_ARG_KV_UNIFIED) | +| `-kvu, --kv-unified, -no-kvu, --no-kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)
(env: LLAMA_ARG_KV_UNIFIED) | | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode | | `-sp, --special` | special tokens output enabled (default: false) | @@ -182,7 +184,8 @@ For the full list of features, please refer to [server's changelog](https://gith | `-otd, --override-tensor-draft =,...` | override tensor buffer type for draft model | | `-cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model
(env: LLAMA_ARG_CPU_MOE_DRAFT) | | `-ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model
(env: LLAMA_ARG_N_CPU_MOE_DRAFT) | -| `-a, --alias STRING` | set alias for model name (to be used by REST API)
(env: LLAMA_ARG_ALIAS) | +| `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)
(env: LLAMA_ARG_ALIAS) | +| `--tags STRING` | set model tags, comma-separated (informational, not used for routing)
(env: LLAMA_ARG_TAGS) | | `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)
(env: LLAMA_ARG_HOST) | | `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | | `--path PATH` | path to serve static files from (default: )
(env: LLAMA_ARG_STATIC_PATH) | @@ -229,6 +232,10 @@ For the full list of features, please refer to [server's changelog](https://gith | `-ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | | `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_MODEL_DRAFT) | | `--spec-replace TARGET DRAFT` | translate the string in TARGET into DRAFT if the draft model and main model are not compatible | +| `--spec-type [none\|ngram-cache\|ngram-simple\|ngram-map-k\|ngram-map-k4v\|ngram-mod]` | type of speculative decoding to use when no draft model is provided (default: none) | +| `--spec-ngram-size-n N` | ngram size N for ngram-simple/ngram-map speculative decoding, length of lookup n-gram (default: 12) | +| `--spec-ngram-size-m N` | ngram size M for ngram-simple/ngram-map speculative decoding, length of draft m-gram (default: 48) | +| `--spec-ngram-min-hits N` | minimum hits for ngram-map speculative decoding (default: 1) | | `-mv, --model-vocoder FNAME` | vocoder model for audio generation (default: unused) | | `--tts-use-guide-tokens` | Use guide tokens to improve TTS word recall | | `--embd-gemma-default` | use default EmbeddingGemma model (note: can download weights from the internet) | diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index eba463e4d..aafed4950 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -580,6 +580,8 @@ private: float slot_prompt_similarity = 0.0f; std::string model_name; // name of the loaded model, to be used by API + std::set model_aliases; // additional names for the model + std::set model_tags; // informational tags bool sleeping = false; @@ -813,10 +815,9 @@ private: SRV_WRN("%s", "for more info see https://github.com/ggml-org/llama.cpp/pull/16391\n"); if (!params_base.model_alias.empty()) { - // user explicitly specified model name - model_name = params_base.model_alias; + // backward compat: use first alias as model name + model_name = *params_base.model_alias.begin(); } else if (!params_base.model.name.empty()) { - // use model name in registry format (for models in cache) model_name = params_base.model.name; } else { // fallback: derive model name from file name @@ -824,6 +825,9 @@ private: model_name = model_path.filename().string(); } + model_aliases = params_base.model_alias; + model_tags = params_base.model_tags; + if (!is_resume) { return init(); } @@ -2892,6 +2896,8 @@ server_context_meta server_context::get_meta() const { return server_context_meta { /* build_info */ build_info, /* model_name */ impl->model_name, + /* model_aliases */ impl->model_aliases, + /* model_tags */ impl->model_tags, /* model_path */ impl->params_base.model.path, /* has_mtmd */ impl->mctx != nullptr, /* has_inp_image */ impl->chat_params.allow_image, @@ -3688,6 +3694,8 @@ void server_routes::init_routes() { {"data", { { {"id", meta->model_name}, + {"aliases", meta->model_aliases}, + {"tags", meta->model_tags}, {"object", "model"}, {"created", std::time(0)}, {"owned_by", "llamacpp"}, diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 03c29f513..631d573fc 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -6,12 +6,15 @@ #include #include +#include struct server_context_impl; // private implementation struct server_context_meta { std::string build_info; std::string model_name; + std::set model_aliases; + std::set model_tags; std::string model_path; bool has_mtmd; bool has_inp_image; diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index e16254779..bc601237b 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -184,6 +184,51 @@ void server_models::add_model(server_model_meta && meta) { if (mapping.find(meta.name) != mapping.end()) { throw std::runtime_error(string_format("model '%s' appears multiple times", meta.name.c_str())); } + + // check model name does not conflict with existing aliases + for (const auto & [key, inst] : mapping) { + if (inst.meta.aliases.count(meta.name)) { + throw std::runtime_error(string_format("model name '%s' conflicts with alias of model '%s'", + meta.name.c_str(), key.c_str())); + } + } + + // parse aliases from preset's --alias option (comma-separated) + std::string alias_str; + if (meta.preset.get_option("LLAMA_ARG_ALIAS", alias_str) && !alias_str.empty()) { + for (auto & alias : string_split(alias_str, ',')) { + alias = string_strip(alias); + if (!alias.empty()) { + meta.aliases.insert(alias); + } + } + } + + // parse tags from preset's --tags option (comma-separated) + std::string tags_str; + if (meta.preset.get_option("LLAMA_ARG_TAGS", tags_str) && !tags_str.empty()) { + for (auto & tag : string_split(tags_str, ',')) { + tag = string_strip(tag); + if (!tag.empty()) { + meta.tags.insert(tag); + } + } + } + + // validate aliases do not conflict with existing names or aliases + for (const auto & alias : meta.aliases) { + if (mapping.find(alias) != mapping.end()) { + throw std::runtime_error(string_format("alias '%s' for model '%s' conflicts with existing model name", + alias.c_str(), meta.name.c_str())); + } + for (const auto & [key, inst] : mapping) { + if (inst.meta.aliases.count(alias)) { + throw std::runtime_error(string_format("alias '%s' for model '%s' conflicts with alias of model '%s'", + alias.c_str(), meta.name.c_str(), key.c_str())); + } + } + } + meta.update_args(ctx_preset, bin_path); // render args std::string name = meta.name; mapping[name] = instance_t{ @@ -249,6 +294,8 @@ void server_models::load_models() { server_model_meta meta{ /* preset */ preset.second, /* name */ preset.first, + /* aliases */ {}, + /* tags */ {}, /* port */ 0, /* status */ SERVER_MODEL_STATUS_UNLOADED, /* last_used */ 0, @@ -265,10 +312,28 @@ void server_models::load_models() { for (const auto & [name, preset] : custom_presets) { custom_names.insert(name); } + auto join_set = [](const std::set & s) { + std::string result; + for (const auto & v : s) { + if (!result.empty()) { + result += ", "; + } + result += v; + } + return result; + }; + SRV_INF("Available models (%zu) (*: custom preset)\n", mapping.size()); for (const auto & [name, inst] : mapping) { bool has_custom = custom_names.find(name) != custom_names.end(); - SRV_INF(" %c %s\n", has_custom ? '*' : ' ', name.c_str()); + std::string info; + if (!inst.meta.aliases.empty()) { + info += " (aliases: " + join_set(inst.meta.aliases) + ")"; + } + if (!inst.meta.tags.empty()) { + info += " [tags: " + join_set(inst.meta.tags) + "]"; + } + SRV_INF(" %c %s%s\n", has_custom ? '*' : ' ', name.c_str(), info.c_str()); } } @@ -320,7 +385,15 @@ void server_models::update_meta(const std::string & name, const server_model_met bool server_models::has_model(const std::string & name) { std::lock_guard lk(mutex); - return mapping.find(name) != mapping.end(); + if (mapping.find(name) != mapping.end()) { + return true; + } + for (const auto & [key, inst] : mapping) { + if (inst.meta.aliases.count(name)) { + return true; + } + } + return false; } std::optional server_models::get_meta(const std::string & name) { @@ -329,6 +402,11 @@ std::optional server_models::get_meta(const std::string & nam if (it != mapping.end()) { return it->second.meta; } + for (const auto & [key, inst] : mapping) { + if (inst.meta.aliases.count(name)) { + return inst.meta; + } + } return std::nullopt; } @@ -766,7 +844,7 @@ static void res_err(std::unique_ptr & res, const json & error_d res->data = safe_json_to_str({{ "error", error_data }}); } -static bool router_validate_model(const std::string & name, server_models & models, bool models_autoload, std::unique_ptr & res) { +static bool router_validate_model(std::string & name, server_models & models, bool models_autoload, std::unique_ptr & res) { if (name.empty()) { res_err(res, format_error_response("model name is missing from the request", ERROR_TYPE_INVALID_REQUEST)); return false; @@ -776,6 +854,8 @@ static bool router_validate_model(const std::string & name, server_models & mode res_err(res, format_error_response(string_format("model '%s' not found", name.c_str()), ERROR_TYPE_INVALID_REQUEST)); return false; } + // resolve alias to canonical model name + name = meta->name; if (models_autoload) { models.ensure_model_loaded(name); } else { @@ -847,16 +927,16 @@ void server_models_routes::init_routes() { auto res = std::make_unique(); json body = json::parse(req.body); std::string name = json_value(body, "model", std::string()); - auto model = models.get_meta(name); - if (!model.has_value()) { + auto meta = models.get_meta(name); + if (!meta.has_value()) { res_err(res, format_error_response("model is not found", ERROR_TYPE_NOT_FOUND)); return res; } - if (model->status == SERVER_MODEL_STATUS_LOADED) { + if (meta->status == SERVER_MODEL_STATUS_LOADED) { res_err(res, format_error_response("model is already loaded", ERROR_TYPE_INVALID_REQUEST)); return res; } - models.load(name); + models.load(meta->name); res_ok(res, {{"success", true}}); return res; }; @@ -877,6 +957,7 @@ void server_models_routes::init_routes() { preset_copy.unset_option("LLAMA_ARG_HOST"); preset_copy.unset_option("LLAMA_ARG_PORT"); preset_copy.unset_option("LLAMA_ARG_ALIAS"); + preset_copy.unset_option("LLAMA_ARG_TAGS"); status["preset"] = preset_copy.to_ini(); } if (meta.is_failed()) { @@ -885,6 +966,8 @@ void server_models_routes::init_routes() { } models_json.push_back(json { {"id", meta.name}, + {"aliases", meta.aliases}, + {"tags", meta.tags}, {"object", "model"}, // for OAI-compat {"owned_by", "llamacpp"}, // for OAI-compat {"created", t}, // for OAI-compat @@ -912,7 +995,7 @@ void server_models_routes::init_routes() { res_err(res, format_error_response("model is not loaded", ERROR_TYPE_INVALID_REQUEST)); return res; } - models.unload(name); + models.unload(model->name); res_ok(res, {{"success", true}}); return res; }; diff --git a/tools/server/server-models.h b/tools/server/server-models.h index a397abda4..78abc8d72 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -52,6 +52,8 @@ static std::string server_model_status_to_string(server_model_status status) { struct server_model_meta { common_preset preset; std::string name; + std::set aliases; // additional names that resolve to this model + std::set tags; // informational tags, not used for routing int port = 0; server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading diff --git a/tools/server/server.cpp b/tools/server/server.cpp index d3d431602..542b98453 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -92,7 +92,7 @@ int main(int argc, char ** argv) { // for consistency between server router mode and single-model mode, we set the same model name as alias if (params.model_alias.empty() && !params.model.name.empty()) { - params.model_alias = params.model.name; + params.model_alias.insert(params.model.name); } common_init(); diff --git a/tools/server/tests/unit/test_basic.py b/tools/server/tests/unit/test_basic.py index 3405be3e2..d1b89cf1a 100644 --- a/tools/server/tests/unit/test_basic.py +++ b/tools/server/tests/unit/test_basic.py @@ -94,3 +94,20 @@ def test_no_webui(): server.start() res = requests.get(url) assert res.status_code == 404 + + +def test_server_model_aliases_and_tags(): + global server + server.model_alias = "tinyllama-2,fim,code" + server.model_tags = "chat,fim,small" + server.start() + res = server.make_request("GET", "/models") + assert res.status_code == 200 + assert len(res.body["data"]) == 1 + model = res.body["data"][0] + # aliases field must contain all aliases + assert set(model["aliases"]) == {"tinyllama-2", "fim", "code"} + # tags field must contain all tags + assert set(model["tags"]) == {"chat", "fim", "small"} + # id is derived from first alias (alphabetical order from std::set) + assert model["id"] == "code" diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index f76bb1a91..5002999d9 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -56,6 +56,7 @@ class ServerProcess: # custom options model_alias: str | None = None + model_tags: str | None = None model_url: str | None = None model_file: str | None = None model_draft: str | None = None @@ -180,6 +181,8 @@ class ServerProcess: server_args.extend(["--pooling", self.pooling]) if self.model_alias: server_args.extend(["--alias", self.model_alias]) + if self.model_tags: + server_args.extend(["--tags", self.model_tags]) if self.n_ctx: server_args.extend(["--ctx-size", self.n_ctx]) if self.n_slots: From 8387ffb28d3467b81bc73727588c3fde772f8ebe Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Fri, 27 Feb 2026 11:02:53 +0100 Subject: [PATCH 15/15] gguf-py : dump version to 0.18.0 (#19950) This commit updates the gguf-py package version to 0.18.0 in preperation of a new release to PyPI. Refs: https://github.com/ggml-org/llama.cpp/discussions/19948 --- gguf-py/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index 48693ae3e..5fb2755f1 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.17.1" +version = "0.18.0" description = "Read and write ML models in GGUF for GGML" authors = ["GGML "] packages = [