llama: end-to-end tests (#19802)

* tests: add end-to-end tests per model architecture

* fixup for rebase

* fix use-after-free in llama-model-loader.cpp

* fix CI

* fix WebGPU

* fix CI

* disable CI for macOS-latest-cmake-arm64

* use expert_weights_scale only if != 0.0f

* comments
Johannes Gäßler 2026-03-08 12:30:21 +01:00 committed by GitHub
parent a95047979a
commit a976ff081b
33 changed files with 1607 additions and 633 deletions

@@ -118,12 +118,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
         ggml_build_forward_expand(gf, cur);
 
-        // Check layer type by checking which tensors exist
-        // KDA layers have ssm_a_log tensor, MLA layers have wkv_a_mqa tensor
-        bool is_kda = (layer.ssm_a != nullptr);
-        bool is_mla = (layer.wkv_a_mqa != nullptr);
-        if (is_kda) {
+        if (hparams.is_recurrent(il)) {
             // === KDA Layer (Kimi Delta Attention) with Recurrent State ===
             // Reference: vLLM kda.py
             const auto * mctx_cur = inp_rs->mctx;
@@ -211,7 +206,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
             cur = ggml_mul_mat(ctx0, layer.wo, gated);
             cb(cur, "kda_out", il);
-        } else if (is_mla) {
+        } else {
             // === MLA Layer (Multi-head Latent Attention) without KV Cache ===
             // Reference: vLLM mla.py
             // Step 1: Q projection and reshape
@@ -310,9 +305,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
             cur = build_attn(inp_attn_kv, layer.wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale_mla, il);
             cb(cur, "mla_out", il);
             }
-        } else {
-            // Unknown layer type - this should not happen
-            GGML_ABORT("Kimi layer is neither KDA nor MLA - missing required tensors");
         }
 
         // On last layer, select only the output tokens
@@ -349,7 +341,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
                     hparams.n_expert,
                     hparams.n_expert_used,
                     LLM_FFN_SILU, true,
-                    true, hparams.expert_weights_scale,
+                    hparams.expert_weights_scale, hparams.expert_weights_scale,
                     (llama_expert_gating_func_type) hparams.expert_gating_func,
                     il);
             cb(moe_out, "ffn_moe_out", il);
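
A minimal, self-contained sketch of the dispatch change in the first three hunks, using hypothetical types rather than the real llama.cpp graph code: the KDA/MLA split is now driven by `hparams.is_recurrent(il)` instead of being inferred from which tensors are non-null, so the branch is exhaustive and the GGML_ABORT fallback can be dropped.

```cpp
// Hypothetical hparams_t; the real check is hparams.is_recurrent(il) inside
// llm_build_kimi_linear. The point: an explicit per-layer flag replaces
// tensor-null sniffing, so there is no third "unknown layer" case to abort on.
#include <cstdio>

struct hparams_t {
    bool recurrent[4] = {true, false, true, false}; // per-layer KDA/MLA layout
    bool is_recurrent(int il) const { return recurrent[il]; }
};

static void build_layer(const hparams_t & hparams, int il) {
    if (hparams.is_recurrent(il)) {
        printf("layer %d: KDA (Kimi Delta Attention) path\n", il);
    } else {
        printf("layer %d: MLA (Multi-head Latent Attention) path\n", il);
    }
}

int main() {
    hparams_t hparams;
    for (int il = 0; il < 4; ++il) {
        build_layer(hparams, il);
    }
    return 0;
}
```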
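And a minimal sketch of the commit note "use expert_weights_scale only if != 0.0f", as reflected in the last hunk: the hard-coded `true` enable flag is replaced by a value derived from `expert_weights_scale` itself, so scaling is applied only when the hparam is nonzero and an unset value of 0.0f no longer zeroes the expert weights. The helper below is a hypothetical stand-in, not the `build_moe_ffn` API.

```cpp
// apply_expert_scale is a made-up stand-in for the scale_w/w_scale argument
// pair seen in the hunk above; it is not a llama.cpp function.
#include <cstdio>

static void apply_expert_scale(float * weights, int n, bool scale_w, float w_scale) {
    if (!scale_w) {
        return; // scaling disabled: weights pass through unchanged
    }
    for (int i = 0; i < n; ++i) {
        weights[i] *= w_scale;
    }
}

int main() {
    float w[3] = {0.25f, 0.50f, 0.25f};
    const float expert_weights_scale = 0.0f; // hparam left unset by the model

    // Before: a hard-coded `true` would multiply every weight by 0.0f here.
    // After: the flag is true only when the scale is actually set (nonzero).
    apply_expert_scale(w, 3, expert_weights_scale != 0.0f, expert_weights_scale);

    printf("%.2f %.2f %.2f\n", w[0], w[1], w[2]); // prints 0.25 0.50 0.25
    return 0;
}
```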