From eadc4184caee5b5f68f31f19a2f65c6961748e46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Mon, 5 Jan 2026 09:14:04 +0100 Subject: [PATCH 1/4] llama : refactor rope_freq_base/scale_swa conversion and init (#18553) * refactor rope_freq_base/scale_swa conversion and init * safe defaults for unknowns * update relevant models * grammar * add get_rope_freq_scale to modern-bert * const * const * log swa info --- convert_hf_to_gguf.py | 14 +++++++--- src/llama-hparams.h | 4 +-- src/llama-model.cpp | 49 ++++++++++++++++++++++++++-------- src/models/afmoe.cpp | 14 ++++++---- src/models/cohere2-iswa.cpp | 3 +++ src/models/gemma2-iswa.cpp | 7 +++-- src/models/llama-iswa.cpp | 8 ++++-- src/models/modern-bert.cpp | 7 ++--- src/models/openai-moe-iswa.cpp | 7 +++-- src/models/smallthinker.cpp | 18 ++++++++----- 10 files changed, 94 insertions(+), 37 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 3340a0a7d..68446aa44 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -771,9 +771,14 @@ class TextModel(ModelBase): self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {} + rope_theta = self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True) + local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "swa_rope_theta", "rope_local_base_freq"], optional=True) + # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters: - if "rope_theta" not in self.rope_parameters and (rope_theta := self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)) is not None: + if local_rope_theta is not None: + self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta} + if "rope_theta" not in self.rope_parameters and rope_theta is not None: self.rope_parameters["rope_theta"] = rope_theta if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None: self.rope_parameters["rope_type"] = rope_type @@ -839,6 +844,7 @@ class TextModel(ModelBase): self.gguf_writer.add_head_count_kv(n_head_kv) logger.info(f"gguf: key-value head count = {n_head_kv}") + # TODO: Handle "sliding_attention" similarly when models start implementing it rope_params = self.rope_parameters.get("full_attention", self.rope_parameters) if (rope_type := rope_params.get("rope_type")) is not None: rope_factor = rope_params.get("factor") @@ -885,6 +891,9 @@ class TextModel(ModelBase): if (rope_theta := rope_params.get("rope_theta")) is not None: self.gguf_writer.add_rope_freq_base(rope_theta) logger.info(f"gguf: rope theta = {rope_theta}") + if (local_rope_theta := self.rope_parameters.get("sliding_attention", {}).get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base_swa(local_rope_theta) + logger.info(f"gguf: rope theta swa = {local_rope_theta}") if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None: self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") @@ -5004,7 +5013,6 @@ class Plamo3Model(TextModel): if (sliding_window := self.find_hparam(["window_size", "sliding_window"], optional=True)) is not None: self.gguf_writer.add_sliding_window(sliding_window) self.gguf_writer.add_sliding_window_pattern(self.hparams["sliding_window_pattern"]) - self.gguf_writer.add_rope_freq_base_swa(self.rope_parameters.get("sliding_attention", {"rope_theta": self.hparams.get("rope_local_theta")})["rope_theta"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: @@ -7480,7 +7488,6 @@ class MimoV2Model(TextModel): self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) self.gguf_writer.add_sliding_window_pattern(self.hparams["hybrid_layer_pattern"]) - self.gguf_writer.add_rope_freq_base_swa(self.hparams["swa_rope_theta"]) self.gguf_writer.add_value_length(self.hparams["v_head_dim"]) self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"]) self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) @@ -10218,7 +10225,6 @@ class ModernBertModel(BertModel): self.gguf_writer.add_sliding_window(self.hparams["local_attention"]) if (sliding_window_pattern := self.hparams.get("global_attn_every_n_layers")) is not None: self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) - self.gguf_writer.add_rope_freq_base_swa(self.rope_parameters.get("sliding_attention", {"rope_theta": self.hparams.get("local_rope_theta")})["rope_theta"]) self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 42def73f0..fc5708fc4 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -105,9 +105,9 @@ struct llama_hparams { float rope_attn_factor = 1.0f; float rope_freq_base_train; - float rope_freq_base_train_swa; + float rope_freq_base_train_swa = 10000.0f; float rope_freq_scale_train; - float rope_freq_scale_train_swa; + float rope_freq_scale_train_swa = 1.0f; uint32_t n_ctx_orig_yarn; float rope_yarn_log_mul = 0.0f; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index c739b0b48..28dcc2840 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -578,6 +578,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED); + // TODO: Handle SWA metadata similarly when models start implementing it // rope_freq_scale (inverse of the kv) is optional float ropescale = 0.0f; if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) { @@ -586,10 +587,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { } hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale; - // by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers - hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; - hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; - ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false); // non-transformer models do not have attention heads @@ -677,6 +674,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.f_attn_temp_scale = 0.1f; hparams.f_attn_temp_offset = 1.0f; hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full + + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); } switch (hparams.n_expert) { @@ -722,6 +723,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { if (hparams.n_swa > 0) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.set_swa_pattern(4); + + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); } else { hparams.swa_type = LLAMA_SWA_TYPE_NONE; } @@ -1243,7 +1248,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { if (found_swa && hparams.n_swa > 0) { uint32_t swa_period = 8; hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; - hparams.rope_freq_scale_train_swa = 1.0f; ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa); ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false); hparams.set_swa_pattern(swa_period); @@ -1309,7 +1313,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.n_swa = 4096; // default value of gemma 2 hparams.set_swa_pattern(2); hparams.attn_soft_cap = true; + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false); @@ -1334,8 +1341,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.set_swa_pattern(6); - hparams.rope_freq_base_train_swa = 10000.0f; - hparams.rope_freq_scale_train_swa = 1.0f; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); } else { hparams.swa_type = LLAMA_SWA_TYPE_NONE; } @@ -1365,10 +1371,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.set_swa_pattern(5); hparams.n_layer_kv_from_start = 20; - hparams.rope_freq_base_train_swa = 10000.0f; - hparams.rope_freq_scale_train_swa = 1.0f; hparams.f_attention_scale = 1.0f; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -1384,9 +1389,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.set_swa_pattern(6); hparams.causal_attn = false; // embeddings do not use causal attention - hparams.rope_freq_base_train_swa = 10000.0f; - hparams.rope_freq_scale_train_swa = 1.0f; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); @@ -1525,7 +1529,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.set_swa_pattern(4); + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -1564,6 +1571,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { if (found_swa && hparams.n_swa > 0) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.set_swa_pattern(4); + + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_scale_train_swa = 1.0; // See olmo2.cpp + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); } else { hparams.swa_type = LLAMA_SWA_TYPE_NONE; } @@ -1906,6 +1917,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.n_swa = 4096; hparams.set_swa_pattern(4); + + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); } ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); @@ -2208,6 +2223,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.set_swa_pattern(2); + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); + switch (hparams.n_layer) { case 24: type = LLM_TYPE_20B; break; case 36: type = LLM_TYPE_120B; break; @@ -2252,6 +2271,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.n_swa = 4096; hparams.set_swa_pattern(4, true); + + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); } else { hparams.swa_type = LLAMA_SWA_TYPE_NONE; hparams.n_no_rope_layer_step = hparams.n_layer; @@ -7098,6 +7121,10 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str()); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); + if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { + LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa); + LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa); + } LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn); LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul); LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); diff --git a/src/models/afmoe.cpp b/src/models/afmoe.cpp index 0192e344c..6a752a403 100644 --- a/src/models/afmoe.cpp +++ b/src/models/afmoe.cpp @@ -22,8 +22,15 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para const float kq_scale = 1.0f/sqrtf(float(n_embd_head)); for (int il = 0; il < n_layer; ++il) { + const float freq_base_l = model.get_rope_freq_base (cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + ggml_tensor * inpSA = inpL; + // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous + const bool use_rope = hparams.n_no_rope_layer_step > 0 && + (il + 1) % hparams.n_no_rope_layer_step != 0; + // dual attention normalization (pre) cur = build_norm(inpL, model.layers[il].attn_norm, NULL, @@ -56,19 +63,16 @@ llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_para cb(Qcur, "Qcur_normed", il); cb(Kcur, "Kcur_normed", il); - // RoPE only for sliding_attention layers - const bool use_rope = hparams.n_no_rope_layer_step > 0 && - ((il + 1) % hparams.n_no_rope_layer_step) != 0; if (use_rope) { Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur_rope", il); Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur_rope", il); } diff --git a/src/models/cohere2-iswa.cpp b/src/models/cohere2-iswa.cpp index b18aa8c4e..9334b5e42 100644 --- a/src/models/cohere2-iswa.cpp +++ b/src/models/cohere2-iswa.cpp @@ -21,6 +21,9 @@ llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const for (int il = 0; il < n_layer; ++il) { const bool is_swa = hparams.is_swa(il); + // UNUSED: + // const float freq_base_l = model.get_rope_freq_base (cparams, il); + // const float freq_scale_l = model.get_rope_freq_scale(cparams, il); // norm cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM, il); diff --git a/src/models/gemma2-iswa.cpp b/src/models/gemma2-iswa.cpp index 9cc59a53e..7a9198193 100644 --- a/src/models/gemma2-iswa.cpp +++ b/src/models/gemma2-iswa.cpp @@ -19,6 +19,9 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + const float freq_base_l = model.get_rope_freq_base (cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + // norm cur = build_norm(inpL, model.layers[il].attn_norm, NULL, @@ -43,12 +46,12 @@ llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const ll Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow); Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); diff --git a/src/models/llama-iswa.cpp b/src/models/llama-iswa.cpp index 03f806168..61dd2c179 100644 --- a/src/models/llama-iswa.cpp +++ b/src/models/llama-iswa.cpp @@ -25,8 +25,12 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_ ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + const float freq_base_l = model.get_rope_freq_base (cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + ggml_tensor * inpSA = inpL; + // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous const bool use_rope = hparams.n_no_rope_layer_step > 0 && (il + 1) % hparams.n_no_rope_layer_step != 0; @@ -67,13 +71,13 @@ llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_ if (use_rope) { Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow ); Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, rope_factors, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow ); } else if (inp_attn_scale) { diff --git a/src/models/modern-bert.cpp b/src/models/modern-bert.cpp index 6df418ecd..bb12ed819 100644 --- a/src/models/modern-bert.cpp +++ b/src/models/modern-bert.cpp @@ -23,7 +23,8 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll auto * inp_attn = build_attn_inp_no_cache(); for (int il = 0; il < n_layer; ++il) { - float freq_base_l = model.get_rope_freq_base(cparams, il); + const float freq_base_l = model.get_rope_freq_base(cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); cur = inpL; @@ -48,13 +49,13 @@ llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const ll // RoPE Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow ); Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow ); diff --git a/src/models/openai-moe-iswa.cpp b/src/models/openai-moe-iswa.cpp index 96596709e..dbe3ca185 100644 --- a/src/models/openai-moe-iswa.cpp +++ b/src/models/openai-moe-iswa.cpp @@ -14,6 +14,9 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model, ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + const float freq_base_l = model.get_rope_freq_base (cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + ggml_tensor * inpSA = inpL; // norm @@ -49,13 +52,13 @@ llm_build_openai_moe_iswa::llm_build_openai_moe_iswa(const llama_model & model, Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow ); Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow ); diff --git a/src/models/smallthinker.cpp b/src/models/smallthinker.cpp index 277eec295..4c497ca76 100644 --- a/src/models/smallthinker.cpp +++ b/src/models/smallthinker.cpp @@ -26,10 +26,16 @@ llm_build_smallthinker::llm_build_smallthinker(const llama_model & model, ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { - ggml_tensor * inpSA = inpL; - ggml_tensor * probs = nullptr; + const float freq_base_l = model.get_rope_freq_base (cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); - probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens] + ggml_tensor * inpSA = inpL; + + // This overlaps with SWA layers in current models, so get_rope_freq_base/scale may be superfluous + const bool use_rope = hparams.n_no_rope_layer_step == n_layer || + il % hparams.n_no_rope_layer_step != 0; + + ggml_tensor * probs = build_lora_mm(model.layers[il].ffn_gate_inp, inpL); // [n_expert, n_tokens] cb(probs, "ffn_moe_logits", il); // norm @@ -52,11 +58,11 @@ llm_build_smallthinker::llm_build_smallthinker(const llama_model & model, Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - if (hparams.n_no_rope_layer_step == n_layer || il % hparams.n_no_rope_layer_step != 0) { - Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + if (use_rope) { + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow); - Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow); } cb(Qcur, "Qcur", il); From b37124d2d2f0e4f6468746f2cbda286944627a75 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Mon, 5 Jan 2026 04:30:14 -0600 Subject: [PATCH 2/4] vulkan: handle quantize_q8_1 overflowing the max workgroup count (#18515) * vulkan: handle quantize_q8_1 overflowing the max workgroup count * vulkan: Fix small tile size matmul on lavapipe * fix mul_mat_id failures --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 57 +++++++++++-------- .../vulkan-shaders/quantize_q8_1.comp | 16 +++--- tests/test-backend-ops.cpp | 4 ++ 3 files changed, 44 insertions(+), 33 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 16254457b..502a4deeb 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -2898,39 +2898,41 @@ static void ggml_vk_load_shaders(vk_device& device) { const uint32_t tk_m = device->coopmat_support ? device->coopmat_k : 1; const uint32_t tk_s = device->coopmat_support ? device->coopmat_k : 1; - l_warptile = { 128, 128, 128, 16, subgroup_size_8 * 2, 64, 2, tm_l, tn_l, tk_l, subgroup_size_8 }; - m_warptile = { 128, 64, 64, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; - s_warptile = { subgroup_size_16, 32, 32, 16, 32, 32, 2, tm_s, tn_s, tk_s, subgroup_size_8 }; + const uint32_t s_warptile_wm = device->subgroup_size == 8 ? 8 : 32; - l_warptile_mmq = { 128, 128, 128, 32, subgroup_size_8 * 2, 64, 2, tm_l, tn_l, tk_l, subgroup_size_8 }; - m_warptile_mmq = { 128, 64, 64, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; - s_warptile_mmq = { subgroup_size_32, 32, 32, 32, 32, 32, 2, tm_s, tn_s, tk_s, subgroup_size_8 }; + l_warptile = { 128, 128, 128, 16, subgroup_size_8 * 2, 64, 2, tm_l, tn_l, tk_l, subgroup_size_8 }; + m_warptile = { 128, 64, 64, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; + s_warptile = { subgroup_size_32, 32, 32, 16, s_warptile_wm, 32, 2, tm_s, tn_s, tk_s, subgroup_size_8 }; + + l_warptile_mmq = { 128, 128, 128, 32, subgroup_size_8 * 2, 64, 2, tm_l, tn_l, tk_l, subgroup_size_8 }; + m_warptile_mmq = { 128, 64, 64, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; + s_warptile_mmq = { subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 2, tm_s, tn_s, tk_s, subgroup_size_8 }; // Integer MMQ has a smaller shared memory profile, but heavier register use - l_warptile_mmq_int = { 128, 128, 128, 32, subgroup_size_8 * 2, 64, 2, 4, 4, 1, subgroup_size_8 }; - m_warptile_mmq_int = { 128, 64, 64, 32, subgroup_size_8, 32, 2, 2, 2, 1, subgroup_size_8 }; - s_warptile_mmq_int = { subgroup_size_32, 32, 32, 32, 32, 32, 2, 2, 1, 1, subgroup_size_8 }; + l_warptile_mmq_int = { 128, 128, 128, 32, subgroup_size_8 * 2, 64, 2, 4, 4, 1, subgroup_size_8 }; + m_warptile_mmq_int = { 128, 64, 64, 32, subgroup_size_8, 32, 2, 2, 2, 1, subgroup_size_8 }; + s_warptile_mmq_int = { subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 2, 2, 1, 1, subgroup_size_8 }; // K-quants use even more registers, mitigate by setting WMITER to 1 - l_warptile_mmq_int_k = { 128, 128, 128, 32, subgroup_size_8 * 2, 64, 1, 4, 4, 1, subgroup_size_8 }; - m_warptile_mmq_int_k = { 128, 64, 64, 32, subgroup_size_8, 32, 1, 2, 2, 1, subgroup_size_8 }; - s_warptile_mmq_int_k = { subgroup_size_32, 32, 32, 32, 32, 32, 1, 2, 1, 1, subgroup_size_8 }; + l_warptile_mmq_int_k = { 128, 128, 128, 32, subgroup_size_8 * 2, 64, 1, 4, 4, 1, subgroup_size_8 }; + m_warptile_mmq_int_k = { 128, 64, 64, 32, subgroup_size_8, 32, 1, 2, 2, 1, subgroup_size_8 }; + s_warptile_mmq_int_k = { subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 1, 2, 1, 1, subgroup_size_8 }; - l_warptile_id = { 128, 128, 128, 16, mul_mat_subgroup_size_16 * 2, 64, 2, tm_l, tn_l, tk_l, mul_mat_subgroup_size_16 }; - m_warptile_id = { 128, 64, 64, 16, mul_mat_subgroup_size_16, 32, 2, tm_m, tn_m, tk_m, mul_mat_subgroup_size_16 }; - s_warptile_id = { mul_mat_subgroup_size_16, 32, 32, 16, 32, 32, 2, tm_s, tn_s, tk_s, mul_mat_subgroup_size_16 }; + l_warptile_id = { 128, 128, 128, 16, mul_mat_subgroup_size_16 * 2, 64, 2, tm_l, tn_l, tk_l, mul_mat_subgroup_size_16 }; + m_warptile_id = { 128, 64, 64, 16, mul_mat_subgroup_size_16, 32, 2, tm_m, tn_m, tk_m, mul_mat_subgroup_size_16 }; + s_warptile_id = { mul_mat_subgroup_size_16, 32, 32, 16, s_warptile_wm, 32, 2, tm_s, tn_s, tk_s, mul_mat_subgroup_size_16 }; - l_warptile_mmqid = { 128, 128, 128, 32, mul_mat_subgroup_size_8 * 2, 64, 2, tm_l, tn_l, tk_l, mul_mat_subgroup_size_8 }; - m_warptile_mmqid = { 128, 64, 64, 32, mul_mat_subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, mul_mat_subgroup_size_8 }; - s_warptile_mmqid = { mul_mat_subgroup_size_32, 32, 32, 32, 32, 32, 2, tm_s, tn_s, tk_s, mul_mat_subgroup_size_8 }; + l_warptile_mmqid = { 128, 128, 128, 32, mul_mat_subgroup_size_8 * 2, 64, 2, tm_l, tn_l, tk_l, mul_mat_subgroup_size_8 }; + m_warptile_mmqid = { 128, 64, 64, 32, mul_mat_subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, mul_mat_subgroup_size_8 }; + s_warptile_mmqid = { mul_mat_subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 2, tm_s, tn_s, tk_s, mul_mat_subgroup_size_8 }; - l_warptile_mmqid_int = { 128, 128, 128, 32, mul_mat_subgroup_size_8 * 2, 64, 2, 4, 4, 1, mul_mat_subgroup_size_8 }; - m_warptile_mmqid_int = { 128, 64, 64, 32, mul_mat_subgroup_size_8, 32, 2, 2, 2, 1, mul_mat_subgroup_size_8 }; - s_warptile_mmqid_int = { mul_mat_subgroup_size_32, 32, 32, 32, 32, 32, 2, 2, 1, 1, mul_mat_subgroup_size_8 }; + l_warptile_mmqid_int = { 128, 128, 128, 32, mul_mat_subgroup_size_8 * 2, 64, 2, 4, 4, 1, mul_mat_subgroup_size_8 }; + m_warptile_mmqid_int = { 128, 64, 64, 32, mul_mat_subgroup_size_8, 32, 2, 2, 2, 1, mul_mat_subgroup_size_8 }; + s_warptile_mmqid_int = { mul_mat_subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 2, 2, 1, 1, mul_mat_subgroup_size_8 }; - l_warptile_mmqid_int_k = { 128, 128, 128, 32, mul_mat_subgroup_size_16 * 2, 64, 1, 4, 4, 1, mul_mat_subgroup_size_16 }; - m_warptile_mmqid_int_k = { 128, 64, 64, 32, mul_mat_subgroup_size_16, 32, 1, 2, 2, 1, mul_mat_subgroup_size_16 }; - s_warptile_mmqid_int_k = { mul_mat_subgroup_size_32, 32, 32, 32, 32, 32, 1, 2, 1, 1, mul_mat_subgroup_size_16 }; + l_warptile_mmqid_int_k = { 128, 128, 128, 32, mul_mat_subgroup_size_16 * 2, 64, 1, 4, 4, 1, mul_mat_subgroup_size_16 }; + m_warptile_mmqid_int_k = { 128, 64, 64, 32, mul_mat_subgroup_size_16, 32, 1, 2, 2, 1, mul_mat_subgroup_size_16 }; + s_warptile_mmqid_int_k = { mul_mat_subgroup_size_32, 32, 32, 32, s_warptile_wm, 32, 1, 2, 1, 1, mul_mat_subgroup_size_16 }; // chip specific tuning if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) { @@ -6773,7 +6775,12 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub vk_pipeline pipeline = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, std::array{ne}, { ne, 1, 1 }); + const uint32_t num_blocks = CEIL_DIV(ne, pipeline->wg_denoms[0]); + // clamp the number of elements to the max workgroup count. The shader will iterate over the total number of blocks. + const uint64_t max_elements = std::min(uint64_t{ctx->device->properties.limits.maxComputeWorkGroupCount[0]} * pipeline->wg_denoms[0], std::numeric_limits::max()); + const uint32_t elements = std::min(ne, static_cast(max_elements)); + + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, std::array{ ne, num_blocks }, { elements, 1, 1 }); ggml_vk_sync_buffers(ctx, subctx); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp index 20e45d025..7ea29a07e 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp @@ -15,6 +15,7 @@ layout (push_constant) uniform parameter { uint ne; + uint num_blocks; } p; #include "types.glsl" @@ -33,8 +34,7 @@ layout (binding = 1) writeonly buffer D {block_q8_1_x4 data_b[];}; shared float shmem[GROUP_SIZE]; #endif -void quantize() { - const uint wgid = gl_WorkGroupID.x; +void quantize(const uint wgid) { const uint tid = INVOCATION_ID; // Each thread handles a vec4, so 8 threads handle a block @@ -45,11 +45,7 @@ void quantize() { const uint ib = wgid * blocks_per_group + block_in_wg; const uint iqs = tid % 8; -#ifndef QBLOCK_X4 - if (ib >= gl_NumWorkGroups.x * blocks_per_group) { - return; - } -#else +#ifdef QBLOCK_X4 const uint ibx4_outer = ib / 4; const uint ibx4_inner = ib % 4; @@ -123,5 +119,9 @@ void quantize() { } void main() { - quantize(); + uint wgid = gl_WorkGroupID.x; + while (wgid < p.num_blocks) { + quantize(wgid); + wgid += gl_NumWorkGroups.x; + } } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index fa6e80e3f..8df994e91 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7625,6 +7625,10 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 64, 77, 77, {12,1}, {1,1})); test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 576, 512, 576, {1,1}, {1,1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 1, 2048, 8192, {1, 1}, {1, 1})); + for (ggml_type type_a : all_types) { + test_cases.emplace_back(new test_mul_mat(type_a, GGML_TYPE_F32, 1, 64, 256, {1, 1}, {1, 1})); + } #if 0 // test the mat-mat path for Metal From 2da64a2f8a47894d70bc87a160dcb57a3df63b6e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 5 Jan 2026 12:34:23 +0200 Subject: [PATCH 3/4] models : fix backend assignment for Granite/Nemotron graphs (#18599) * models : fix backend assignment for Granite/Nemotron graphs * cont : add ref * cont : move call to build_inp_embd() --- src/llama-graph.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 6cde7338c..86c547263 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1326,6 +1326,10 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { res->add_input(std::move(inp)); + // make sure the produced embeddings are immediately materialized in the ggml graph + // ref: https://github.com/ggml-org/llama.cpp/pull/18599 + ggml_build_forward_expand(gf, cur); + return cur; } From f1768d8f03fe514794349790de0785eafded6c0b Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Mon, 5 Jan 2026 04:51:39 -0600 Subject: [PATCH 4/4] vulkan: fix topk_moe_sigmoid_norm_bias failures in GLM-4.6 (#18582) --- .../ggml-vulkan/vulkan-shaders/topk_moe.comp | 19 ++++++++++++------- tests/test-backend-ops.cpp | 1 + 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp b/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp index 4bf6d2bcb..ef2f202ec 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp @@ -101,6 +101,10 @@ void main() { const uint lane = gl_SubgroupInvocationID; float probs[experts_per_thread]; + [[unroll]] + for (int i = 0; i < experts_per_thread; i++) { + probs[i] = -INFINITY; + } [[unroll]] for (uint i = 0; i < n_experts; i += WARP_SIZE) { @@ -112,8 +116,9 @@ void main() { softmax_warp_inplace(probs, n_experts, lane, nexperts_use_push); } else if (gating_func == GATING_FUNC_SIGMOID) { [[unroll]] - for (int i = 0; i < experts_per_thread; i++) { - probs[i] = 1.f / (1.f + exp(-probs[i])); + for (uint i = 0; i < n_experts; i += WARP_SIZE) { + const uint expert = i + lane; + probs[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? 1.f / (1.f + exp(-probs[i / WARP_SIZE])) : -INFINITY; } } @@ -150,11 +155,11 @@ void main() { uint max_expert = lane; [[unroll]] - for (int i = 1; i < experts_per_thread; i++) { - const uint expert = lane + i * WARP_SIZE; - if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && selection_probs[i] > max_val_s) { - max_val = probs[i]; - max_val_s = selection_probs[i]; + for (uint i = WARP_SIZE; i < n_experts; i += WARP_SIZE) { + const uint expert = i + lane; + if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && selection_probs[i / WARP_SIZE] > max_val_s) { + max_val = probs[i / WARP_SIZE]; + max_val_s = selection_probs[i / WARP_SIZE]; max_expert = expert; } } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 8df994e91..15567abed 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -8184,6 +8184,7 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_topk_moe({71, 22, 1, 1}, 8, with_norm, bias_probs, gate, scale_w)); test_cases.emplace_back(new test_topk_moe({128, 1, 1, 1}, 128, with_norm, bias_probs, gate, scale_w)); test_cases.emplace_back(new test_topk_moe({129, 1, 1, 1}, 128, with_norm, bias_probs, gate, scale_w)); + test_cases.emplace_back(new test_topk_moe({160, 4, 1, 1}, 160, with_norm, bias_probs, gate, scale_w)); } } }