From 328874d054e0eb44591202a23c209cf02c18e3cb Mon Sep 17 00:00:00 2001 From: Pascal Date: Mon, 25 May 2026 16:05:04 +0200 Subject: [PATCH] model: tag ffn_latent as MUL_MAT to fix buft probe (#23664) ffn_latent_down/up are declared GGML_OP_MUL in LLM_TENSOR_INFOS but nemotron-h feeds them through ggml_mul_mat. The loader buft probe asks the backend about the declared op, so it tested an elementwise MUL on a q8_0 weight. That used to return true unconditionally and the weight stayed on GPU by luck. Once supports_op told the truth, the probe got a no and the loader pushed the weight and its matmul to CPU, splitting the graph. Tagging it MUL_MAT asks the real question, the math is unchanged. Verified on Nemotron 3 Super 120B Q5_K_M: from 64.9 back to 103.22 t/s. --- src/llama-arch.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index c9eead18a..e7d027c98 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -767,8 +767,9 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, // Nemotron 3 Super - {LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + // latent projections feed ggml_mul_mat, the buft probe must use MUL_MAT to keep them on GPU + {LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, }; LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}