From 42928bc14d33bde7b8fe8ea436ca44b40357737e Mon Sep 17 00:00:00 2001 From: ynankani Date: Sat, 16 May 2026 09:09:27 +0000 Subject: [PATCH] model : NvFP4 quantized LM head support (#23046) * NvFP4 quantized LM head support Signed-off-by: ynankani * Address review commnets Signed-off-by: ynankani * Add assert for NvFp4 lm head and tied embeddings Signed-off-by: ynankani * Address review commnets Signed-off-by: ynankani * Create output_s tensor only when LM head NvFp4 Signed-off-by: ynankani --------- Signed-off-by: ynankani --- src/llama-model-saver.cpp | 2 ++ src/llama-model.cpp | 15 ++++++++++++++- src/llama-model.h | 5 +++++ src/models/afmoe.cpp | 2 +- src/models/apertus.cpp | 2 +- src/models/arcee.cpp | 2 +- src/models/arctic.cpp | 2 +- src/models/arwkv7.cpp | 2 +- src/models/baichuan.cpp | 2 +- src/models/bailingmoe.cpp | 2 +- src/models/bailingmoe2.cpp | 2 +- src/models/bloom.cpp | 2 +- src/models/chameleon.cpp | 2 +- src/models/chatglm.cpp | 2 +- src/models/codeshell.cpp | 2 +- src/models/cogvlm.cpp | 2 +- src/models/cohere2.cpp | 2 +- src/models/command-r.cpp | 2 +- src/models/dbrx.cpp | 2 +- src/models/deci.cpp | 2 +- src/models/deepseek.cpp | 2 +- src/models/dots1.cpp | 2 +- src/models/dream.cpp | 2 +- src/models/ernie4-5-moe.cpp | 2 +- src/models/ernie4-5.cpp | 2 +- src/models/exaone-moe.cpp | 2 +- src/models/exaone.cpp | 2 +- src/models/exaone4.cpp | 2 +- src/models/falcon-h1.cpp | 2 +- src/models/falcon.cpp | 2 +- src/models/gemma.cpp | 2 +- src/models/gemma2.cpp | 2 +- src/models/gemma3.cpp | 2 +- src/models/gemma3n.cpp | 2 +- src/models/gemma4.cpp | 2 +- src/models/glm4-moe.cpp | 2 +- src/models/glm4.cpp | 2 +- src/models/gpt2.cpp | 2 +- src/models/gptneox.cpp | 2 +- src/models/granite-hybrid.cpp | 2 +- src/models/granite.cpp | 2 +- src/models/grok.cpp | 2 +- src/models/grovemoe.cpp | 2 +- src/models/hunyuan-moe.cpp | 2 +- src/models/hunyuan-vl.cpp | 2 +- src/models/internlm2.cpp | 2 +- src/models/jais.cpp | 2 +- src/models/jais2.cpp | 2 +- src/models/jamba.cpp | 2 +- src/models/lfm2.cpp | 2 +- src/models/llada-moe.cpp | 2 +- src/models/llada.cpp | 2 +- src/models/llama.cpp | 2 +- src/models/llama4.cpp | 2 +- src/models/maincoder.cpp | 2 +- src/models/mamba.cpp | 2 +- src/models/mimo2.cpp | 2 +- src/models/minicpm3.cpp | 2 +- src/models/minimax-m2.cpp | 2 +- src/models/mistral3.cpp | 2 +- src/models/mpt.cpp | 2 +- src/models/nemotron-h.cpp | 2 +- src/models/nemotron.cpp | 2 +- src/models/olmo.cpp | 2 +- src/models/olmo2.cpp | 2 +- src/models/olmoe.cpp | 2 +- src/models/openai-moe.cpp | 2 +- src/models/openelm.cpp | 2 +- src/models/orion.cpp | 2 +- src/models/paddleocr.cpp | 2 +- src/models/pangu-embed.cpp | 2 +- src/models/phi2.cpp | 2 +- src/models/phi3.cpp | 2 +- src/models/plamo.cpp | 2 +- src/models/plamo2.cpp | 2 +- src/models/plamo3.cpp | 2 +- src/models/plm.cpp | 2 +- src/models/qwen.cpp | 2 +- src/models/qwen2.cpp | 2 +- src/models/qwen2moe.cpp | 2 +- src/models/qwen2vl.cpp | 2 +- src/models/qwen3.cpp | 2 +- src/models/qwen35.cpp | 2 +- src/models/qwen35moe.cpp | 2 +- src/models/qwen3moe.cpp | 2 +- src/models/qwen3next.cpp | 2 +- src/models/qwen3vl.cpp | 2 +- src/models/qwen3vlmoe.cpp | 2 +- src/models/refact.cpp | 2 +- src/models/rnd1.cpp | 2 +- src/models/rwkv6.cpp | 2 +- src/models/rwkv6qwen2.cpp | 2 +- src/models/rwkv7.cpp | 2 +- src/models/seed-oss.cpp | 2 +- src/models/smallthinker.cpp | 2 +- src/models/smollm3.cpp | 2 +- src/models/stablelm.cpp | 2 +- src/models/starcoder.cpp | 2 +- src/models/starcoder2.cpp | 2 +- src/models/step35.cpp | 2 +- src/models/t5.cpp | 2 +- src/models/wavtokenizer-dec.cpp | 2 +- src/models/xverse.cpp | 2 +- 103 files changed, 121 insertions(+), 101 deletions(-) diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index e83056557..528e4c9c0 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -393,6 +393,8 @@ void llama_model_saver::add_tensors_from_model() { add_tensor(model->output); add_tensor(model->output_b); add_tensor(model->output_norm_enc); + add_tensor(model->output_s); + add_tensor(model->output_in_s); add_tensor(model->cls); add_tensor(model->cls_b); add_tensor(model->cls_out); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index ff30a2ae7..46ae010f8 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1394,10 +1394,23 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); } } + // output scales + if (output && output->type == GGML_TYPE_NVFP4) { + // weight scale + if (!output_s) { + output_s = create_tensor(tn(LLM_TENSOR_OUTPUT, "scale"), {1}, TENSOR_NOT_REQUIRED); + } + // input scale + if (!output_in_s) { + output_in_s = create_tensor(tn(LLM_TENSOR_OUTPUT, "input_scale"), {1}, TENSOR_NOT_REQUIRED); + } + } } - ml.done_getting_tensors(); + GGML_ASSERT(!(output && tok_embd && + strcmp(output->name, tok_embd->name) == 0 && + output->type == GGML_TYPE_NVFP4)); // populate tensors_by_name for (auto & [_, ctx_ptr] : ml.ctx_map) { for (auto * cur = ggml_get_first_tensor(ctx_ptr.get()); cur != NULL; cur = ggml_get_next_tensor(ctx_ptr.get(), cur)) { diff --git a/src/llama-model.h b/src/llama-model.h index d63c68918..01c87a752 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -533,6 +533,11 @@ struct llama_model { struct ggml_tensor * output_b = nullptr; struct ggml_tensor * output_norm_enc = nullptr; + + // NVFP4 per-tensor scale2, input_scale for LM head + struct ggml_tensor * output_s = nullptr; + struct ggml_tensor * output_in_s = nullptr; + // classifier struct ggml_tensor * cls = nullptr; struct ggml_tensor * cls_b = nullptr; diff --git a/src/models/afmoe.cpp b/src/models/afmoe.cpp index 602e3176a..a7c77ee5d 100644 --- a/src/models/afmoe.cpp +++ b/src/models/afmoe.cpp @@ -277,7 +277,7 @@ llama_model_afmoe::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/apertus.cpp b/src/models/apertus.cpp index 136ff7029..bec713652 100644 --- a/src/models/apertus.cpp +++ b/src/models/apertus.cpp @@ -160,7 +160,7 @@ llama_model_apertus::graph::graph(const llama_model & model, const llm_graph_par res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/arcee.cpp b/src/models/arcee.cpp index 70e86d411..d086c4717 100644 --- a/src/models/arcee.cpp +++ b/src/models/arcee.cpp @@ -148,7 +148,7 @@ llama_model_arcee::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/arctic.cpp b/src/models/arctic.cpp index d8653a446..27deadffe 100644 --- a/src/models/arctic.cpp +++ b/src/models/arctic.cpp @@ -171,7 +171,7 @@ llama_model_arctic::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/arwkv7.cpp b/src/models/arwkv7.cpp index 79aa8c908..9bd04127b 100644 --- a/src/models/arwkv7.cpp +++ b/src/models/arwkv7.cpp @@ -193,7 +193,7 @@ llama_model_arwkv7::graph::graph(const llama_model & model, const llm_graph_para cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/baichuan.cpp b/src/models/baichuan.cpp index 4e55290e4..4d26081cd 100644 --- a/src/models/baichuan.cpp +++ b/src/models/baichuan.cpp @@ -146,7 +146,7 @@ llama_model_baichuan::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/bailingmoe.cpp b/src/models/bailingmoe.cpp index 030dd4f42..fe1ae1086 100644 --- a/src/models/bailingmoe.cpp +++ b/src/models/bailingmoe.cpp @@ -171,7 +171,7 @@ llama_model_bailingmoe::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/bailingmoe2.cpp b/src/models/bailingmoe2.cpp index e7fe3d5b4..2f0d44a62 100644 --- a/src/models/bailingmoe2.cpp +++ b/src/models/bailingmoe2.cpp @@ -210,7 +210,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/bloom.cpp b/src/models/bloom.cpp index b600fb0c9..30b0f3d07 100644 --- a/src/models/bloom.cpp +++ b/src/models/bloom.cpp @@ -142,7 +142,7 @@ llama_model_bloom::graph::graph(const llama_model & model, const llm_graph_param cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/chameleon.cpp b/src/models/chameleon.cpp index 8510b9e29..4bceaefd6 100644 --- a/src/models/chameleon.cpp +++ b/src/models/chameleon.cpp @@ -181,7 +181,7 @@ llama_model_chameleon::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output_with_img_logits", -1); // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs. diff --git a/src/models/chatglm.cpp b/src/models/chatglm.cpp index e898eff79..6766fa71c 100644 --- a/src/models/chatglm.cpp +++ b/src/models/chatglm.cpp @@ -151,7 +151,7 @@ llama_model_chatglm::graph::graph(const llama_model & model, const llm_graph_par cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/codeshell.cpp b/src/models/codeshell.cpp index e9e85d967..274dd3342 100644 --- a/src/models/codeshell.cpp +++ b/src/models/codeshell.cpp @@ -143,7 +143,7 @@ llama_model_codeshell::graph::graph(const llama_model & model, const llm_graph_p cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/cogvlm.cpp b/src/models/cogvlm.cpp index 79236121b..2e231bb3f 100644 --- a/src/models/cogvlm.cpp +++ b/src/models/cogvlm.cpp @@ -150,7 +150,7 @@ llama_model_cogvlm::graph::graph(const llama_model & model, const llm_graph_para cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; ggml_build_forward_expand(gf, cur); diff --git a/src/models/cohere2.cpp b/src/models/cohere2.cpp index 12edbae10..a514cf88f 100644 --- a/src/models/cohere2.cpp +++ b/src/models/cohere2.cpp @@ -146,7 +146,7 @@ llama_model_cohere2::graph::graph(const llama_model & model, const llm_graph_par res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (f_logit_scale) { cur = ggml_scale(ctx0, cur, f_logit_scale); diff --git a/src/models/command-r.cpp b/src/models/command-r.cpp index decb89f54..adf7fcaa2 100644 --- a/src/models/command-r.cpp +++ b/src/models/command-r.cpp @@ -131,7 +131,7 @@ llama_model_command_r::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (f_logit_scale) { cur = ggml_scale(ctx0, cur, f_logit_scale); diff --git a/src/models/dbrx.cpp b/src/models/dbrx.cpp index bce6b04bc..af71c7753 100644 --- a/src/models/dbrx.cpp +++ b/src/models/dbrx.cpp @@ -145,7 +145,7 @@ llama_model_dbrx::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/deci.cpp b/src/models/deci.cpp index 9f1a959c3..567e35352 100644 --- a/src/models/deci.cpp +++ b/src/models/deci.cpp @@ -181,7 +181,7 @@ llama_model_deci::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/deepseek.cpp b/src/models/deepseek.cpp index c79460596..f52ec9518 100644 --- a/src/models/deepseek.cpp +++ b/src/models/deepseek.cpp @@ -185,7 +185,7 @@ llama_model_deepseek::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/dots1.cpp b/src/models/dots1.cpp index 93cbcf9d9..435d27281 100644 --- a/src/models/dots1.cpp +++ b/src/models/dots1.cpp @@ -183,7 +183,7 @@ llama_model_dots1::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/dream.cpp b/src/models/dream.cpp index 60a3f0ec2..12ac6f1ce 100644 --- a/src/models/dream.cpp +++ b/src/models/dream.cpp @@ -128,7 +128,7 @@ llama_model_dream::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/ernie4-5-moe.cpp b/src/models/ernie4-5-moe.cpp index 2bd01a2c5..8d9ff1386 100644 --- a/src/models/ernie4-5-moe.cpp +++ b/src/models/ernie4-5-moe.cpp @@ -124,7 +124,7 @@ llama_model_ernie4_5_moe::graph::graph(const llama_model & model, const llm_grap res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/ernie4-5.cpp b/src/models/ernie4-5.cpp index fa989fe92..9b39c605e 100644 --- a/src/models/ernie4-5.cpp +++ b/src/models/ernie4-5.cpp @@ -155,7 +155,7 @@ llama_model_ernie4_5::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/exaone-moe.cpp b/src/models/exaone-moe.cpp index 54bb3ca86..76d91982f 100644 --- a/src/models/exaone-moe.cpp +++ b/src/models/exaone-moe.cpp @@ -237,7 +237,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/exaone.cpp b/src/models/exaone.cpp index 75d5f6063..c7e9960d7 100644 --- a/src/models/exaone.cpp +++ b/src/models/exaone.cpp @@ -127,7 +127,7 @@ llama_model_exaone::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/exaone4.cpp b/src/models/exaone4.cpp index 5506e7642..499e22dde 100644 --- a/src/models/exaone4.cpp +++ b/src/models/exaone4.cpp @@ -163,7 +163,7 @@ llama_model_exaone4::graph::graph(const llama_model & model, const llm_gra res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/falcon-h1.cpp b/src/models/falcon-h1.cpp index d353befdb..94b65a3c7 100644 --- a/src/models/falcon-h1.cpp +++ b/src/models/falcon-h1.cpp @@ -200,7 +200,7 @@ llama_model_falcon_h1::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/falcon.cpp b/src/models/falcon.cpp index 75f2cfef5..ad546ef2d 100644 --- a/src/models/falcon.cpp +++ b/src/models/falcon.cpp @@ -152,7 +152,7 @@ llama_model_falcon::graph::graph(const llama_model & model, const llm_graph_para cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/gemma.cpp b/src/models/gemma.cpp index 067316700..1519682fd 100644 --- a/src/models/gemma.cpp +++ b/src/models/gemma.cpp @@ -130,7 +130,7 @@ llama_model_gemma::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/gemma2.cpp b/src/models/gemma2.cpp index 6255bf740..ae3f9ffb5 100644 --- a/src/models/gemma2.cpp +++ b/src/models/gemma2.cpp @@ -163,7 +163,7 @@ llama_model_gemma2::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); // final logit soft-capping cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); diff --git a/src/models/gemma3.cpp b/src/models/gemma3.cpp index ee510fe38..63a2b380e 100644 --- a/src/models/gemma3.cpp +++ b/src/models/gemma3.cpp @@ -207,7 +207,7 @@ llama_model_gemma3::graph::graph(const llama_model & model, const llm_grap res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (hparams.f_final_logit_softcapping) { cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); diff --git a/src/models/gemma3n.cpp b/src/models/gemma3n.cpp index 881499b0c..6ec3a0060 100644 --- a/src/models/gemma3n.cpp +++ b/src/models/gemma3n.cpp @@ -296,7 +296,7 @@ llama_model_gemma3n::graph::graph(const llama_model & model, const llm_graph_par cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); { // final logit soft-capping diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp index f45ae4cad..4f9d8b18b 100644 --- a/src/models/gemma4.cpp +++ b/src/models/gemma4.cpp @@ -380,7 +380,7 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (hparams.f_final_logit_softcapping) { cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); diff --git a/src/models/glm4-moe.cpp b/src/models/glm4-moe.cpp index 45886b51a..27654b8cb 100644 --- a/src/models/glm4-moe.cpp +++ b/src/models/glm4-moe.cpp @@ -275,7 +275,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp index d6ef76e26..7c242fed2 100644 --- a/src/models/glm4.cpp +++ b/src/models/glm4.cpp @@ -185,7 +185,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // Output projection - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/gpt2.cpp b/src/models/gpt2.cpp index ba49c31b5..e2dcc8b15 100644 --- a/src/models/gpt2.cpp +++ b/src/models/gpt2.cpp @@ -138,7 +138,7 @@ llama_model_gpt2::graph::graph(const llama_model & model, const llm_graph_params cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/gptneox.cpp b/src/models/gptneox.cpp index 33ebe2d88..443e35add 100644 --- a/src/models/gptneox.cpp +++ b/src/models/gptneox.cpp @@ -209,7 +209,7 @@ llama_model_gptneox::graph::graph(const llama_model & model, const llm_graph_par cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/granite-hybrid.cpp b/src/models/granite-hybrid.cpp index 12e4790ae..27f6706ea 100644 --- a/src/models/granite-hybrid.cpp +++ b/src/models/granite-hybrid.cpp @@ -186,7 +186,7 @@ llama_model_granite_hybrid::graph::graph(const llama_model & model, const llm_gr res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); // For Granite architectures - scale logits if (hparams.f_logit_scale) { diff --git a/src/models/granite.cpp b/src/models/granite.cpp index 5e7c7b681..cda4aa231 100644 --- a/src/models/granite.cpp +++ b/src/models/granite.cpp @@ -145,7 +145,7 @@ llama_model_granite::graph::graph( res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); // For Granite architectures - scale logits cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); diff --git a/src/models/grok.cpp b/src/models/grok.cpp index 0bc49d002..7c46ec1c0 100644 --- a/src/models/grok.cpp +++ b/src/models/grok.cpp @@ -206,7 +206,7 @@ llama_model_grok::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cur = ggml_scale(ctx0, cur, hparams.f_logit_scale); diff --git a/src/models/grovemoe.cpp b/src/models/grovemoe.cpp index feef81516..1cab75adc 100644 --- a/src/models/grovemoe.cpp +++ b/src/models/grovemoe.cpp @@ -184,7 +184,7 @@ llama_model_grovemoe::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/hunyuan-moe.cpp b/src/models/hunyuan-moe.cpp index 44af42412..deb3c9671 100644 --- a/src/models/hunyuan-moe.cpp +++ b/src/models/hunyuan-moe.cpp @@ -179,7 +179,7 @@ llama_model_hunyuan_moe::graph::graph(const llama_model & model, const llm_graph res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/hunyuan-vl.cpp b/src/models/hunyuan-vl.cpp index 5fb9154be..da9bb74de 100644 --- a/src/models/hunyuan-vl.cpp +++ b/src/models/hunyuan-vl.cpp @@ -181,7 +181,7 @@ llama_model_hunyuan_vl::graph::graph(const llama_model & model, const llm_graph_ cb(cur, "result_norm", -1); res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/internlm2.cpp b/src/models/internlm2.cpp index f0c5580a6..f9ee37a24 100644 --- a/src/models/internlm2.cpp +++ b/src/models/internlm2.cpp @@ -129,7 +129,7 @@ llama_model_internlm2::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/jais.cpp b/src/models/jais.cpp index a6451dca0..2ba162605 100644 --- a/src/models/jais.cpp +++ b/src/models/jais.cpp @@ -123,7 +123,7 @@ llama_model_jais::graph::graph(const llama_model & model, const llm_graph_params cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/jais2.cpp b/src/models/jais2.cpp index ad59b953e..896613144 100644 --- a/src/models/jais2.cpp +++ b/src/models/jais2.cpp @@ -152,7 +152,7 @@ llama_model_jais2::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // Output projection - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/jamba.cpp b/src/models/jamba.cpp index e1b8d137e..84ea63c31 100644 --- a/src/models/jamba.cpp +++ b/src/models/jamba.cpp @@ -189,7 +189,7 @@ llama_model_jamba::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/lfm2.cpp b/src/models/lfm2.cpp index df6a80287..29081344b 100644 --- a/src/models/lfm2.cpp +++ b/src/models/lfm2.cpp @@ -262,7 +262,7 @@ llama_model_lfm2::graph::graph(const llama_model & model, const llm_graph_ cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/llada-moe.cpp b/src/models/llada-moe.cpp index b60f67f6c..9722dde9f 100644 --- a/src/models/llada-moe.cpp +++ b/src/models/llada-moe.cpp @@ -153,7 +153,7 @@ llama_model_llada_moe::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/llada.cpp b/src/models/llada.cpp index fa21c5fe3..58b2c466e 100644 --- a/src/models/llada.cpp +++ b/src/models/llada.cpp @@ -147,7 +147,7 @@ llama_model_llada::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/llama.cpp b/src/models/llama.cpp index 8ddb59368..cef66d054 100644 --- a/src/models/llama.cpp +++ b/src/models/llama.cpp @@ -235,7 +235,7 @@ llama_model_llama::graph::graph(const llama_model & model, const llm_grap if constexpr (!embed) { // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/llama4.cpp b/src/models/llama4.cpp index 899611d53..0ff5376d5 100644 --- a/src/models/llama4.cpp +++ b/src/models/llama4.cpp @@ -260,7 +260,7 @@ llama_model_llama4::graph::graph(const llama_model & model, const llm_grap res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/maincoder.cpp b/src/models/maincoder.cpp index 3dbd82fd3..84cfe3990 100644 --- a/src/models/maincoder.cpp +++ b/src/models/maincoder.cpp @@ -141,7 +141,7 @@ llama_model_maincoder::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/mamba.cpp b/src/models/mamba.cpp index b7708d7fd..887a1fa50 100644 --- a/src/models/mamba.cpp +++ b/src/models/mamba.cpp @@ -128,7 +128,7 @@ llama_model_mamba::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/mimo2.cpp b/src/models/mimo2.cpp index 719966166..d0295ec11 100644 --- a/src/models/mimo2.cpp +++ b/src/models/mimo2.cpp @@ -231,7 +231,7 @@ llama_model_mimo2::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/minicpm3.cpp b/src/models/minicpm3.cpp index ff5eb6ffa..1ffc54fa7 100644 --- a/src/models/minicpm3.cpp +++ b/src/models/minicpm3.cpp @@ -251,7 +251,7 @@ llama_model_minicpm3::graph::graph(const llama_model & model, const llm_graph_pa cb(cur, "lmhead_scaling", -1); // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/minimax-m2.cpp b/src/models/minimax-m2.cpp index 0dee89346..22e291d73 100644 --- a/src/models/minimax-m2.cpp +++ b/src/models/minimax-m2.cpp @@ -158,7 +158,7 @@ llama_model_minimax_m2::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/mistral3.cpp b/src/models/mistral3.cpp index 708da49af..4e6ebef82 100644 --- a/src/models/mistral3.cpp +++ b/src/models/mistral3.cpp @@ -222,7 +222,7 @@ llama_model_mistral3::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/mpt.cpp b/src/models/mpt.cpp index cfc60e8de..0229d20ed 100644 --- a/src/models/mpt.cpp +++ b/src/models/mpt.cpp @@ -161,7 +161,7 @@ llama_model_mpt::graph::graph(const llama_model & model, const llm_graph_params cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/nemotron-h.cpp b/src/models/nemotron-h.cpp index 865461f61..a82f9c170 100644 --- a/src/models/nemotron-h.cpp +++ b/src/models/nemotron-h.cpp @@ -174,7 +174,7 @@ llama_model_nemotron_h::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/nemotron.cpp b/src/models/nemotron.cpp index 0c72ed297..5d4a3b5c6 100644 --- a/src/models/nemotron.cpp +++ b/src/models/nemotron.cpp @@ -140,7 +140,7 @@ llama_model_nemotron::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/olmo.cpp b/src/models/olmo.cpp index 161035e72..cfcf17bcb 100644 --- a/src/models/olmo.cpp +++ b/src/models/olmo.cpp @@ -133,7 +133,7 @@ llama_model_olmo::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/olmo2.cpp b/src/models/olmo2.cpp index 9633f2699..7cc262f55 100644 --- a/src/models/olmo2.cpp +++ b/src/models/olmo2.cpp @@ -198,7 +198,7 @@ llama_model_olmo2::graph::graph(const llama_model & model, const llm_graph res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/olmoe.cpp b/src/models/olmoe.cpp index 4bb901305..7976ae44a 100644 --- a/src/models/olmoe.cpp +++ b/src/models/olmoe.cpp @@ -164,7 +164,7 @@ llama_model_olmoe::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/openai-moe.cpp b/src/models/openai-moe.cpp index 13a590ce6..15b6c8c12 100644 --- a/src/models/openai-moe.cpp +++ b/src/models/openai-moe.cpp @@ -160,7 +160,7 @@ llama_model_openai_moe::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/openelm.cpp b/src/models/openelm.cpp index b4128e116..9f76350fd 100644 --- a/src/models/openelm.cpp +++ b/src/models/openelm.cpp @@ -162,7 +162,7 @@ llama_model_openelm::graph::graph(const llama_model & model, const llm_graph_par cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/orion.cpp b/src/models/orion.cpp index 7ace0a513..bcb4bbba4 100644 --- a/src/models/orion.cpp +++ b/src/models/orion.cpp @@ -132,7 +132,7 @@ llama_model_orion::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/paddleocr.cpp b/src/models/paddleocr.cpp index 1c0eadefa..d39220bd7 100644 --- a/src/models/paddleocr.cpp +++ b/src/models/paddleocr.cpp @@ -98,7 +98,7 @@ llama_model_paddleocr::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/pangu-embed.cpp b/src/models/pangu-embed.cpp index 41b7e2ac2..7593f879b 100644 --- a/src/models/pangu-embed.cpp +++ b/src/models/pangu-embed.cpp @@ -148,7 +148,7 @@ llama_model_pangu_embed::graph::graph(const llama_model & model, const llm_graph res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (model.output_b != nullptr) { cur = ggml_add(ctx0, cur, model.output_b); diff --git a/src/models/phi2.cpp b/src/models/phi2.cpp index a333602c7..8f3ed5f7b 100644 --- a/src/models/phi2.cpp +++ b/src/models/phi2.cpp @@ -130,7 +130,7 @@ llama_model_phi2::graph::graph(const llama_model & model, const llm_graph_params cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output_no_bias", -1); cur = ggml_add(ctx0, cur, model.output_b); diff --git a/src/models/phi3.cpp b/src/models/phi3.cpp index 0a65e91fe..f8a4a4d5a 100644 --- a/src/models/phi3.cpp +++ b/src/models/phi3.cpp @@ -179,7 +179,7 @@ llama_model_phi3::graph::graph(const llama_model & model, const llm_graph_ cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (model.output_b != nullptr) { cb(cur, "result_output_no_bias", -1); diff --git a/src/models/plamo.cpp b/src/models/plamo.cpp index 4c16c20a0..c7ed1211c 100644 --- a/src/models/plamo.cpp +++ b/src/models/plamo.cpp @@ -127,7 +127,7 @@ llama_model_plamo::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/plamo2.cpp b/src/models/plamo2.cpp index 29c870260..b713889fe 100644 --- a/src/models/plamo2.cpp +++ b/src/models/plamo2.cpp @@ -185,7 +185,7 @@ llama_model_plamo2::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); // Explicitly mark as output tensor to ensure proper backend assignment diff --git a/src/models/plamo3.cpp b/src/models/plamo3.cpp index 849f1579e..29f3e803d 100644 --- a/src/models/plamo3.cpp +++ b/src/models/plamo3.cpp @@ -186,7 +186,7 @@ llama_model_plamo3::graph::graph(const llama_model & model, const llm_grap cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); res->t_logits = cur; ggml_build_forward_expand(gf, cur); diff --git a/src/models/plm.cpp b/src/models/plm.cpp index 57f599510..ce050919e 100644 --- a/src/models/plm.cpp +++ b/src/models/plm.cpp @@ -204,7 +204,7 @@ llama_model_plm::graph::graph(const llama_model & model, const llm_graph_params cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen.cpp b/src/models/qwen.cpp index cdc076cdf..00467dbad 100644 --- a/src/models/qwen.cpp +++ b/src/models/qwen.cpp @@ -131,7 +131,7 @@ llama_model_qwen::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen2.cpp b/src/models/qwen2.cpp index 6320458a1..a5147460b 100644 --- a/src/models/qwen2.cpp +++ b/src/models/qwen2.cpp @@ -141,7 +141,7 @@ llama_model_qwen2::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); if (model.output_b != nullptr) { cur = ggml_add(ctx0, cur, model.output_b); diff --git a/src/models/qwen2moe.cpp b/src/models/qwen2moe.cpp index 7587c802c..7cb03859d 100644 --- a/src/models/qwen2moe.cpp +++ b/src/models/qwen2moe.cpp @@ -184,7 +184,7 @@ llama_model_qwen2moe::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen2vl.cpp b/src/models/qwen2vl.cpp index 1a40fa89b..d79db682c 100644 --- a/src/models/qwen2vl.cpp +++ b/src/models/qwen2vl.cpp @@ -134,7 +134,7 @@ llama_model_qwen2vl::graph::graph(const llama_model & model, const llm_graph_par res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp index fa656c84e..41b97fed9 100644 --- a/src/models/qwen3.cpp +++ b/src/models/qwen3.cpp @@ -147,7 +147,7 @@ llama_model_qwen3::graph::graph(const llama_model & model, const llm_graph_param res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index f276be61b..b188810f9 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -167,7 +167,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // LM head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index cf05dc9d6..8ec9b8c6f 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -180,7 +180,7 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // LM head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen3moe.cpp b/src/models/qwen3moe.cpp index 4440b83aa..a4f8e1379 100644 --- a/src/models/qwen3moe.cpp +++ b/src/models/qwen3moe.cpp @@ -168,7 +168,7 @@ llama_model_qwen3moe::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index cb1b4814c..bdc3026c1 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -176,7 +176,7 @@ llama_model_qwen3next::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // LM head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen3vl.cpp b/src/models/qwen3vl.cpp index 7871f8f79..5defd8939 100644 --- a/src/models/qwen3vl.cpp +++ b/src/models/qwen3vl.cpp @@ -163,7 +163,7 @@ llama_model_qwen3vl::graph::graph(const llama_model & model, const llm_graph_par res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen3vlmoe.cpp b/src/models/qwen3vlmoe.cpp index b99143c89..5b77df571 100644 --- a/src/models/qwen3vlmoe.cpp +++ b/src/models/qwen3vlmoe.cpp @@ -180,7 +180,7 @@ llama_model_qwen3vlmoe::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/refact.cpp b/src/models/refact.cpp index f14f10917..bf3949a90 100644 --- a/src/models/refact.cpp +++ b/src/models/refact.cpp @@ -150,7 +150,7 @@ llama_model_refact::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/rnd1.cpp b/src/models/rnd1.cpp index 325ee73ba..ca8e00961 100644 --- a/src/models/rnd1.cpp +++ b/src/models/rnd1.cpp @@ -167,7 +167,7 @@ llama_model_rnd1::graph::graph(const llama_model & model, const llm_graph_params res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/rwkv6.cpp b/src/models/rwkv6.cpp index 2944711ac..ba2a9dfa0 100644 --- a/src/models/rwkv6.cpp +++ b/src/models/rwkv6.cpp @@ -176,7 +176,7 @@ llama_model_rwkv6::graph::graph(const llama_model & model, const llm_graph_param cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/rwkv6qwen2.cpp b/src/models/rwkv6qwen2.cpp index 6f7d1f572..566b8cdcb 100644 --- a/src/models/rwkv6qwen2.cpp +++ b/src/models/rwkv6qwen2.cpp @@ -158,7 +158,7 @@ llama_model_rwkv6qwen2::graph::graph(const llama_model & model, const llm_graph_ cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/rwkv7.cpp b/src/models/rwkv7.cpp index b205e3935..7574b2526 100644 --- a/src/models/rwkv7.cpp +++ b/src/models/rwkv7.cpp @@ -202,7 +202,7 @@ llama_model_rwkv7::graph::graph(const llama_model & model, const llm_graph_param cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/seed-oss.cpp b/src/models/seed-oss.cpp index 83e114740..806cba574 100644 --- a/src/models/seed-oss.cpp +++ b/src/models/seed-oss.cpp @@ -141,7 +141,7 @@ llama_model_seed_oss::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/smallthinker.cpp b/src/models/smallthinker.cpp index 3214e7cba..4231cccc6 100644 --- a/src/models/smallthinker.cpp +++ b/src/models/smallthinker.cpp @@ -178,7 +178,7 @@ llama_model_smallthinker::graph::graph(const llama_model & model, const ll res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/smollm3.cpp b/src/models/smollm3.cpp index 7adaf34c5..90e7d473e 100644 --- a/src/models/smollm3.cpp +++ b/src/models/smollm3.cpp @@ -143,7 +143,7 @@ llama_model_smollm3::graph::graph(const llama_model & model, const llm_graph_par res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/stablelm.cpp b/src/models/stablelm.cpp index 8f613e559..4da7f7aef 100644 --- a/src/models/stablelm.cpp +++ b/src/models/stablelm.cpp @@ -163,7 +163,7 @@ llama_model_stablelm::graph::graph(const llama_model & model, const llm_graph_pa res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/starcoder.cpp b/src/models/starcoder.cpp index 58cf0ac0e..e131af058 100644 --- a/src/models/starcoder.cpp +++ b/src/models/starcoder.cpp @@ -135,7 +135,7 @@ llama_model_starcoder::graph::graph(const llama_model & model, const llm_graph_p cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/starcoder2.cpp b/src/models/starcoder2.cpp index 45dae0602..9c207c028 100644 --- a/src/models/starcoder2.cpp +++ b/src/models/starcoder2.cpp @@ -148,7 +148,7 @@ llama_model_starcoder2::graph::graph(const llama_model & model, const llm_graph_ res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/step35.cpp b/src/models/step35.cpp index c4789752d..3b68e6870 100644 --- a/src/models/step35.cpp +++ b/src/models/step35.cpp @@ -261,7 +261,7 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/t5.cpp b/src/models/t5.cpp index 27a0711ba..73e327414 100644 --- a/src/models/t5.cpp +++ b/src/models/t5.cpp @@ -265,7 +265,7 @@ llama_model_t5::graph::graph(const llama_model & model, const llm_graph_p res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/wavtokenizer-dec.cpp b/src/models/wavtokenizer-dec.cpp index a873e5d2e..214fed99b 100644 --- a/src/models/wavtokenizer-dec.cpp +++ b/src/models/wavtokenizer-dec.cpp @@ -253,7 +253,7 @@ llama_model_wavtokenizer_dec::graph::graph(const llama_model & model, const llm_ LLM_NORM, -1); // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cur = ggml_add(ctx0, cur, model.output_b); diff --git a/src/models/xverse.cpp b/src/models/xverse.cpp index e4d111e62..d6d1c7a2e 100644 --- a/src/models/xverse.cpp +++ b/src/models/xverse.cpp @@ -126,7 +126,7 @@ llama_model_xverse::graph::graph(const llama_model & model, const llm_graph_para res->t_embd = cur; // lm_head - cur = build_lora_mm(model.output, cur); + cur = build_lora_mm(model.output, cur, model.output_s); cb(cur, "result_output", -1); res->t_logits = cur;