graph : add optional scale parameter to build_lora_mm [no ci] (#20427)

Author: Richard Davison, 2026-03-12 00:22:49 +01:00 (committed by GitHub)
parent 4a748b8f15
commit 1eea6a2968
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 23 additions and 59 deletions
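
The change folds the per-tensor weight scale into build_lora_mm itself: the helper takes an optional scale tensor w_s, and when it is non-null the matmul result is multiplied by it after any LoRA contributions have been added. Every call site that previously applied the scale by hand collapses into a single call. A minimal sketch of the call-site effect, using names from the diffs below (surrounding graph code elided):

    // before: each call site scaled the result itself
    ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
    if (model.layers[il].wq_s) {
        Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
    }

    // after: pass the (possibly null) scale tensor through;
    // build_lora_mm applies ggml_mul only when w_s is non-null
    ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);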


@@ -900,7 +900,8 @@ ggml_tensor * llm_graph_context::build_cvec(
 
 ggml_tensor * llm_graph_context::build_lora_mm(
         ggml_tensor * w,
-        ggml_tensor * cur) const {
+        ggml_tensor * cur,
+        ggml_tensor * w_s) const {
     ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
 
     for (const auto & lora : *loras) {
@@ -921,6 +922,10 @@ ggml_tensor * llm_graph_context::build_lora_mm(
         res = ggml_add(ctx0, res, ab_cur);
     }
 
+    if (w_s) {
+        res = ggml_mul(ctx0, res, w_s);
+    }
+
     return res;
 }
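
Note that the scale is applied after the LoRA deltas have been accumulated into res, so w_s scales the combined base-plus-adapter output. This preserves the previous semantics exactly: the old call sites ran ggml_mul on the value returned by build_lora_mm, which already included the LoRA terms.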


@@ -764,10 +764,11 @@ struct llm_graph_context {
             ggml_tensor * cur,
                     int   il) const;
 
-    // do mat_mul, while optionally apply lora
+    // do mat_mul, while optionally apply lora and per-tensor scale
     ggml_tensor * build_lora_mm(
             ggml_tensor * w,
-            ggml_tensor * cur) const;
+            ggml_tensor * cur,
+            ggml_tensor * w_s = nullptr) const;
 
     // do mat_mul_id, while optionally apply lora
     ggml_tensor * build_lora_mm_id(
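
Because w_s defaults to nullptr, all existing build_lora_mm call sites compile and behave unchanged; only models that carry per-tensor scale tensors (the wq_s/wk_s/wv_s/wo_s/ffn_down_s fields touched below) pass the extra argument:

    cur = build_lora_mm(w, cur);        // unscaled, as before
    cur = build_lora_mm(w, cur, w_s);   // scaled iff w_s != nullptr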


@@ -29,10 +29,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
         // self-attention
         {
             // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            if (model.layers[il].wq_s) {
-                Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
-            }
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
             cb(Qcur, "Qcur", il);
             if (model.layers[il].bq) {
                 Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
@@ -40,10 +37,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
             }
 
             // B1.K
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            if (model.layers[il].wk_s) {
-                Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_s);
-            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
             cb(Kcur, "Kcur", il);
             if (model.layers[il].bk) {
                 Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
@@ -51,10 +45,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
             }
 
             // B1.V
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            if (model.layers[il].wv_s) {
-                Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_s);
-            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
             cb(Vcur, "Vcur", il);
             if (model.layers[il].bv) {
                 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
@@ -90,10 +81,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
                 LLM_NORM_RMS, il);
             cb(cur, "attn_sub_norm", il);
 
-            cur = build_lora_mm(model.layers[il].wo, cur);
-            if (model.layers[il].wo_s) {
-                cur = ggml_mul(ctx0, cur, model.layers[il].wo_s);
-            }
+            cur = build_lora_mm(model.layers[il].wo, cur, model.layers[il].wo_s);
             if (model.layers[il].bo) {
                 cur = ggml_add(ctx0, cur, model.layers[il].bo);
             }
@@ -127,10 +115,7 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
                 LLM_NORM_RMS, il);
             cb(cur, "ffn_sub_norm", il);
 
-            cur = build_lora_mm(model.layers[il].ffn_down, cur);
-            if (model.layers[il].ffn_down_s) {
-                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_s);
-            }
+            cur = build_lora_mm(model.layers[il].ffn_down, cur, model.layers[il].ffn_down_s);
             cb(cur, "ffn_down", il);
 
             cur = ggml_add(ctx0, cur, ffn_inp);


@@ -43,28 +43,19 @@ llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_gra
             ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
 
             // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            if (model.layers[il].wq_s) {
-                Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
-            }
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
             cb(Qcur, "Qcur", il);
             if (model.layers[il].bq) {
                 Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                 cb(Qcur, "Qcur", il);
             }
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            if (model.layers[il].wk_s) {
-                Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_s);
-            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
             cb(Kcur, "Kcur", il);
             if (model.layers[il].bk) {
                 Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                 cb(Kcur, "Kcur", il);
             }
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            if (model.layers[il].wv_s) {
-                Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_s);
-            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
             cb(Vcur, "Vcur", il);
             if (model.layers[il].bv) {
                 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);


@@ -30,22 +30,13 @@ llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_para
         // self-attention
         {
             // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            if (model.layers[il].wq_s) {
-                Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
-            }
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
             cb(Qcur, "Qcur", il);
 
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            if (model.layers[il].wk_s) {
-                Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_s);
-            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
             cb(Kcur, "Kcur", il);
 
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            if (model.layers[il].wv_s) {
-                Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_s);
-            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
             cb(Vcur, "Vcur", il);
 
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);


@@ -30,22 +30,13 @@ llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_grap
         // self_attention
         {
             // compute Q and K and RoPE them
-            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            if (model.layers[il].wq_s) {
-                Qcur = ggml_mul(ctx0, Qcur, model.layers[il].wq_s);
-            }
+            ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur, model.layers[il].wq_s);
             cb(Qcur, "Qcur", il);
 
-            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            if (model.layers[il].wk_s) {
-                Kcur = ggml_mul(ctx0, Kcur, model.layers[il].wk_s);
-            }
+            ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur, model.layers[il].wk_s);
             cb(Kcur, "Kcur", il);
 
-            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            if (model.layers[il].wv_s) {
-                Vcur = ggml_mul(ctx0, Vcur, model.layers[il].wv_s);
-            }
+            ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur, model.layers[il].wv_s);
             cb(Vcur, "Vcur", il);
 
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);