mtp: use inp_out_ids for skipping logit computation (#23433)

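When doing a follow-up decode for the draft model, the logit computation was always performed even though it is not required. The MTP graphs now gather the rows selected by inp_out_ids (via ggml_get_rows) after t_h_pre_norm is recorded and before the head norm, so the logit computation only runs for the requested outputs.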
2026-05-22 11:16:08 +00:00 · 2026-05-21 15:23:14 +08:00 · 2026-05-21 15:23:14 +08:00 · 12e5d99078
commit 12e5d99078
parent 7ea23ddf7b
2 changed files with 11 additions and 4 deletions
--- a/src/models/qwen35.cpp
+++ b/src/models/qwen35.cpp
@ -525,8 +525,9 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr

    res->add_input(std::move(inp));

-    ggml_tensor * inp_pos = build_inp_pos();
-    auto * inp_attn       = build_attn_inp_kv();
+    ggml_tensor * inp_pos     = build_inp_pos();
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+    auto * inp_attn           = build_attn_inp_kv();

    ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
    cb(h_norm, "mtp_hnorm", il);
@ -615,6 +616,8 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr
    cb(cur, "h_pre_norm", -1);
    res->t_h_pre_norm = cur;

+    cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+
    ggml_tensor * head_norm_w = layer.nextn.shared_head_norm
            ? layer.nextn.shared_head_norm
            : model.output_norm;
--- a/src/models/qwen35moe.cpp
+++ b/src/models/qwen35moe.cpp
@ -588,8 +588,10 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm

    res->add_input(std::move(inp));

-    ggml_tensor * inp_pos = build_inp_pos();
-    auto * inp_attn       = build_attn_inp_kv();
+    ggml_tensor * inp_pos     = build_inp_pos();
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+    auto * inp_attn           = build_attn_inp_kv();
+

    ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
    cb(h_norm, "mtp_hnorm", il);
@ -710,6 +712,8 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm
    cb(cur, "h_pre_norm", -1);
    res->t_h_pre_norm = cur;

+    cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+
    ggml_tensor * head_norm_w = layer.nextn.shared_head_norm
            ? layer.nextn.shared_head_norm
            : model.output_norm;