Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/vulkan.Dockerfile
#	.github/workflows/build-cache.yml
#	.github/workflows/build-cmake-pkg.yml
#	.github/workflows/build-linux-cross.yml
#	.github/workflows/build.yml
#	.github/workflows/release.yml
#	.github/workflows/server-self-hosted.yml
#	.github/workflows/server-webui.yml
#	ggml/src/ggml-hexagon/ggml-hexagon.cpp
#	ggml/src/ggml-hexagon/htp/matmul-ops.c
#	tests/test-backend-ops.cpp
This commit is contained in:
Concedo 2026-03-15 15:20:38 +08:00
commit 893b8abc21
6 changed files with 82 additions and 25 deletions

View file

@@ -7620,6 +7620,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
if (!layer.wo_s && layer.wo) {
layer.wo_s = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
}
if (!layer.wqkv_s && layer.wqkv) {
layer.wqkv_s = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "scale", i), {1}, TENSOR_NOT_REQUIRED);
}
if (!layer.wqkv_gate_s && layer.wqkv_gate) {
layer.wqkv_gate_s = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
}
// dense FFN weight scales (per-tensor, shape {1})
if (!layer.ffn_gate_s && layer.ffn_gate) {
@@ -7631,6 +7637,15 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
if (!layer.ffn_up_s && layer.ffn_up) {
layer.ffn_up_s = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
}
if (!layer.ffn_gate_shexp_s && layer.ffn_gate_shexp) {
layer.ffn_gate_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
}
if (!layer.ffn_down_shexp_s && layer.ffn_down_shexp) {
layer.ffn_down_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
}
if (!layer.ffn_up_shexp_s && layer.ffn_up_shexp) {
layer.ffn_up_shexp_s = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
}
// MoE expert weight scales (per-expert, shape {n_expert})
if (!layer.ffn_gate_exps_s && layer.ffn_gate_exps) {
@@ -7642,6 +7657,17 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
if (!layer.ffn_up_exps_s && layer.ffn_up_exps) {
layer.ffn_up_exps_s = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "scale", i), {n_expert}, TENSOR_NOT_REQUIRED);
}
// recurrent / linear-attention weight scales (per-tensor, shape {1})
if (!layer.ssm_out_s && layer.ssm_out) {
layer.ssm_out_s = create_tensor(tn(LLM_TENSOR_SSM_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
}
if (!layer.ssm_alpha_s && layer.ssm_alpha) {
layer.ssm_alpha_s = create_tensor(tn(LLM_TENSOR_SSM_ALPHA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
}
if (!layer.ssm_beta_s && layer.ssm_beta) {
layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED);
}
}
}