Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build-android.yml
#	.github/workflows/build.yml
#	.github/workflows/release.yml
#	CMakeLists.txt
#	CODEOWNERS
#	common/CMakeLists.txt
#	common/common.h
#	docs/ops.md
#	docs/ops/Metal.csv
#	examples/batched/CMakeLists.txt
#	examples/convert-llama2c-to-ggml/CMakeLists.txt
#	examples/debug/CMakeLists.txt
#	examples/diffusion/CMakeLists.txt
#	examples/embedding/CMakeLists.txt
#	examples/eval-callback/CMakeLists.txt
#	examples/gen-docs/CMakeLists.txt
#	examples/idle/CMakeLists.txt
#	examples/lookahead/CMakeLists.txt
#	examples/lookup/CMakeLists.txt
#	examples/parallel/CMakeLists.txt
#	examples/passkey/CMakeLists.txt
#	examples/retrieval/CMakeLists.txt
#	examples/save-load-state/CMakeLists.txt
#	examples/speculative-simple/CMakeLists.txt
#	examples/speculative/CMakeLists.txt
#	examples/sycl/CMakeLists.txt
#	examples/training/CMakeLists.txt
#	ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
#	ggml/src/ggml-hexagon/htp/htp-ops.h
#	ggml/src/ggml-hexagon/htp/main.c
#	ggml/src/ggml-opencl/CMakeLists.txt
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-opencl/kernels/cvt.cl
#	pocs/vdot/CMakeLists.txt
#	src/CMakeLists.txt
#	tests/CMakeLists.txt
#	tests/test-quantize-stats.cpp
#	tools/batched-bench/CMakeLists.txt
#	tools/cli/CMakeLists.txt
#	tools/cli/cli.cpp
#	tools/completion/CMakeLists.txt
#	tools/cvector-generator/CMakeLists.txt
#	tools/cvector-generator/cvector-generator.cpp
#	tools/export-lora/CMakeLists.txt
#	tools/gguf-split/CMakeLists.txt
#	tools/gguf-split/gguf-split.cpp
#	tools/imatrix/CMakeLists.txt
#	tools/llama-bench/CMakeLists.txt
#	tools/llama-bench/llama-bench.cpp
#	tools/mtmd/CMakeLists.txt
#	tools/perplexity/CMakeLists.txt
#	tools/quantize/CMakeLists.txt
#	tools/quantize/quantize.cpp
#	tools/results/CMakeLists.txt
#	tools/server/CMakeLists.txt
#	tools/tokenize/CMakeLists.txt
#	tools/tts/CMakeLists.txt
commit 79882d669a
Author: Concedo
Date:   2026-04-17 22:37:37 +08:00

146 changed files with 1507 additions and 2103 deletions


@@ -1,6 +1,7 @@
 #include "llama-graph.h"
 
 #include "llama-impl.h"
+#include "llama-model.h"
 #include "llama-batch.h"
 #include "llama-cparams.h"
@@ -1059,6 +1060,84 @@ ggml_tensor * llm_graph_context::build_norm(
     return cur;
 }
 
+llm_graph_qkv llm_graph_context::build_qkv(
+        const llama_layer & layer,
+        ggml_tensor * cur,
+        int64_t n_embd_head,
+        int64_t n_head,
+        int64_t n_head_kv,
+        int il) const {
+    const int64_t n_embd_q  = n_embd_head * n_head;
+    const int64_t n_embd_kv = n_embd_head * n_head_kv;
+
+    ggml_tensor * Qcur, * Kcur, * Vcur;
+
+    if (layer.wqkv) {
+        // fused QKV path
+        ggml_tensor * qkv = build_lora_mm(layer.wqkv, cur, layer.wqkv_s);
+        cb(qkv, "wqkv", il);
+        if (layer.bqkv) {
+            qkv = ggml_add(ctx0, qkv, layer.bqkv);
+            cb(qkv, "bqkv", il);
+        }
+        if (hparams.f_clamp_kqv > 0.0f) {
+            qkv = ggml_clamp(ctx0, qkv, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+            cb(qkv, "wqkv_clamped", il);
+        }
+
+        Qcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head, n_tokens,
+                ggml_row_size(qkv->type, n_embd_head), qkv->nb[1], 0);
+        Kcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens,
+                ggml_row_size(qkv->type, n_embd_head), qkv->nb[1],
+                ggml_row_size(qkv->type, n_embd_q));
+        Vcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens,
+                ggml_row_size(qkv->type, n_embd_head), qkv->nb[1],
+                ggml_row_size(qkv->type, n_embd_q + n_embd_kv));
+    } else {
+        // separate Q/K/V path
+        Qcur = build_lora_mm(layer.wq, cur, layer.wq_s);
+        cb(Qcur, "Qcur", il);
+        if (layer.bq) {
+            Qcur = ggml_add(ctx0, Qcur, layer.bq);
+            cb(Qcur, "Qcur", il);
+        }
+        if (hparams.f_clamp_kqv > 0.0f) {
+            Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+            cb(Qcur, "Qcur_clamped", il);
+        }
+
+        Kcur = build_lora_mm(layer.wk, cur, layer.wk_s);
+        cb(Kcur, "Kcur", il);
+        if (layer.bk) {
+            Kcur = ggml_add(ctx0, Kcur, layer.bk);
+            cb(Kcur, "Kcur", il);
+        }
+        if (hparams.f_clamp_kqv > 0.0f) {
+            Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+            cb(Kcur, "Kcur_clamped", il);
+        }
+
+        Vcur = build_lora_mm(layer.wv, cur, layer.wv_s);
+        cb(Vcur, "Vcur", il);
+        if (layer.bv) {
+            Vcur = ggml_add(ctx0, Vcur, layer.bv);
+            cb(Vcur, "Vcur", il);
+        }
+        if (hparams.f_clamp_kqv > 0.0f) {
+            Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
+            cb(Vcur, "Vcur_clamped", il);
+        }
+
+        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+    }
+    cb(Qcur, "Qcur", il);
+    cb(Kcur, "Kcur", il);
+    cb(Vcur, "Vcur", il);
+
+    return { Qcur, Kcur, Vcur };
+}
+
 ggml_tensor * llm_graph_context::build_ffn(
         ggml_tensor * cur,
         ggml_tensor * up,
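
Note on the fused path above: the single wqkv matmul produces one row of n_embd_q + 2*n_embd_kv values per token, and the Q/K/V views are carved out of that row at byte offsets computed by ggml_row_size. A minimal standalone sketch of the offset arithmetic, with hypothetical head counts (the 128/32/8 values are illustrative, not taken from this patch):

// Sketch only: the element offsets build_qkv's views start at.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n_embd_head = 128;
    const int64_t n_head      = 32;  // query heads
    const int64_t n_head_kv   = 8;   // key/value heads (grouped-query attention)

    const int64_t n_embd_q  = n_embd_head * n_head;     // 4096 elements
    const int64_t n_embd_kv = n_embd_head * n_head_kv;  // 1024 elements

    // One fused row holds [Q | K | V] per token. For an F32 tensor,
    // ggml_row_size(type, n) is n * sizeof(float), so these element offsets
    // correspond directly to the byte offsets passed to ggml_view_3d.
    printf("row width: %lld elements\n", (long long)(n_embd_q + 2 * n_embd_kv));
    printf("Q offset : 0\n");
    printf("K offset : %lld\n", (long long) n_embd_q);                // 4096
    printf("V offset : %lld\n", (long long)(n_embd_q + n_embd_kv));   // 5120
    return 0;
}

Using ggml_row_size rather than multiplying by an element size also keeps the offsets correct for quantized types, where a row is stored in blocks.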
@@ -2011,6 +2090,7 @@ ggml_tensor * llm_graph_context::build_attn(
         llm_graph_input_attn_no_cache * inp,
         ggml_tensor * wo,
         ggml_tensor * wo_b,
+        ggml_tensor * wo_s,
         ggml_tensor * q_cur,
         ggml_tensor * k_cur,
         ggml_tensor * v_cur,
@@ -2044,7 +2124,7 @@ ggml_tensor * llm_graph_context::build_attn(
     cb(cur, "kqv_out", il);
 
     if (wo) {
-        cur = build_lora_mm(wo, cur);
+        cur = build_lora_mm(wo, cur, wo_s);
     }
 
     if (wo_b) {
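
Each build_attn overload now threads a wo_s tensor into build_lora_mm. Judging from the explicit GLM4 branch later in this diff (cur = ggml_mul(ctx0, cur, wo_s)), the scale is an optional elementwise multiply applied after the output projection. A minimal sketch of that effective computation (the helper name is hypothetical and LoRA handling is omitted):

#include "ggml.h"

// Hypothetical helper, not part of the patch: the effective math of passing
// wo_s through build_lora_mm, inferred from the explicit GLM4 branch below.
static ggml_tensor * proj_out_scaled(ggml_context * ctx,
        ggml_tensor * wo, ggml_tensor * cur, ggml_tensor * wo_s) {
    cur = ggml_mul_mat(ctx, wo, cur);   // output projection
    if (wo_s) {
        cur = ggml_mul(ctx, cur, wo_s); // optional elementwise scale
    }
    return cur;
}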
@@ -2095,6 +2175,7 @@ ggml_tensor * llm_graph_context::build_attn(
         llm_graph_input_attn_kv * inp,
         ggml_tensor * wo,
         ggml_tensor * wo_b,
+        ggml_tensor * wo_s,
         ggml_tensor * q_cur,
         ggml_tensor * k_cur,
         ggml_tensor * v_cur,
@@ -2146,10 +2227,15 @@ ggml_tensor * llm_graph_context::build_attn(
     }
 
     if (wo) {
-        cur = build_lora_mm(wo, cur);
         if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_JAIS2) {
             // GLM4, GLM4_MOE, and JAIS2 seem to have numerical issues with half-precision accumulators
+            cur = build_lora_mm(wo, cur);
             ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+            if (wo_s) {
+                cur = ggml_mul(ctx0, cur, wo_s);
+            }
+        } else {
+            cur = build_lora_mm(wo, cur, wo_s);
         }
     }
 
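In the branch above, the projection is kept un-fused so that ggml_mul_mat_set_prec can tag that specific graph node for F32 accumulation before the optional wo_s scale is applied; the setting affects only the node it is set on. A minimal sketch of the pattern in isolation (helper name is hypothetical):

#include "ggml.h"

// Sketch: request full-precision accumulation on a single matmul node, as the
// patch does for architectures with half-precision accumulator issues.
static ggml_tensor * matmul_f32_acc(ggml_context * ctx,
        ggml_tensor * w, ggml_tensor * x) {
    ggml_tensor * y = ggml_mul_mat(ctx, w, x);
    // Per-node request; backends that would otherwise accumulate in F16
    // are expected to use F32 for this operation only.
    ggml_mul_mat_set_prec(y, GGML_PREC_F32);
    return y;
}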
@@ -2193,6 +2279,7 @@ ggml_tensor * llm_graph_context::build_attn(
         llm_graph_input_attn_k * inp,
         ggml_tensor * wo,
         ggml_tensor * wo_b,
+        ggml_tensor * wo_s,
         ggml_tensor * q_cur,
         ggml_tensor * k_cur,
         ggml_tensor * v_cur,
@@ -2227,10 +2314,15 @@ ggml_tensor * llm_graph_context::build_attn(
     cb(cur, "kqv_out", il);
 
     if (wo) {
-        cur = build_lora_mm(wo, cur);
         if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
             // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
+            cur = build_lora_mm(wo, cur);
             ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+            if (wo_s) {
+                cur = ggml_mul(ctx0, cur, wo_s);
+            }
+        } else {
+            cur = build_lora_mm(wo, cur, wo_s);
         }
     }
 
@@ -2245,6 +2337,7 @@ ggml_tensor * llm_graph_context::build_attn(
         llm_graph_input_attn_kv_iswa * inp,
         ggml_tensor * wo,
         ggml_tensor * wo_b,
+        ggml_tensor * wo_s,
         ggml_tensor * q_cur,
         ggml_tensor * k_cur,
         ggml_tensor * v_cur,
@@ -2313,7 +2406,7 @@ ggml_tensor * llm_graph_context::build_attn(
     }
 
     if (wo) {
-        cur = build_lora_mm(wo, cur);
+        cur = build_lora_mm(wo, cur, wo_s);
     }
 
     if (wo_b) {
@@ -2344,6 +2437,7 @@ ggml_tensor * llm_graph_context::build_attn(
         llm_graph_input_attn_cross * inp,
         ggml_tensor * wo,
         ggml_tensor * wo_b,
+        ggml_tensor * wo_s,
         ggml_tensor * q_cur,
         ggml_tensor * k_cur,
         ggml_tensor * v_cur,
@@ -2368,7 +2462,7 @@ ggml_tensor * llm_graph_context::build_attn(
     cb(cur, "kqv_out", il);
 
     if (wo) {
-        cur = build_lora_mm(wo, cur);
+        cur = build_lora_mm(wo, cur, wo_s);
     }
 
     if (wo_b) {
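
Taken together, an attention block would obtain Q/K/V via build_qkv and pass the new wo_s slot, which sits between wo_b and q_cur in every overload. A hypothetical call-site sketch (the llm_graph_qkv member names and the trailing build_attn arguments are assumptions, not shown in this diff):

// Fragment from inside a hypothetical graph-building function; `model`, `il`,
// `cur`, and `inp_attn` are assumed in scope, as in existing llama.cpp code.
const auto & layer = model.layers[il];

// build_qkv picks the fused or separate projection path based on layer.wqkv.
llm_graph_qkv qkv = build_qkv(layer, cur, n_embd_head, n_head, n_head_kv, il);

// wo_s is the new third weight argument; nullptr when the model has no scale.
cur = build_attn(inp_attn,
        layer.wo,    // output projection weight
        layer.bo,    // output projection bias (may be nullptr)
        layer.wo_s,  // output projection scale (may be nullptr)
        qkv.Qcur, qkv.Kcur, qkv.Vcur,   // member names assumed
        nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);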