Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/release.yml
#	CONTRIBUTING.md
#	docs/backend/CANN.md
#	examples/eval-callback/eval-callback.cpp
#	examples/model-conversion/requirements.txt
#	examples/model-conversion/scripts/causal/run-org-model.py
#	ggml/src/ggml-cann/aclnn_ops.cpp
#	ggml/src/ggml-cann/common.h
#	ggml/src/ggml-cann/ggml-cann.cpp
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
#	ggml/src/ggml-cuda/CMakeLists.txt
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-rpc/ggml-rpc.cpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	ggml/src/ggml-webgpu/ggml-webgpu.cpp
#	ggml/src/ggml-zdnn/ggml-zdnn.cpp
#	models/templates/README.md
#	requirements/requirements-convert_hf_to_gguf.txt
#	requirements/requirements-convert_legacy_llama.txt
#	requirements/requirements-tool_bench.txt
#	tests/.gitignore
#	tests/test-backend-ops.cpp
#	tests/test-chat-parser.cpp
#	tests/test-chat.cpp
#	tests/test-json-schema-to-grammar.cpp
#	tests/test-tokenizer-random.py
This commit is contained in:
Concedo 2025-09-11 22:34:45 +08:00
commit 6463f5c26b
62 changed files with 3507 additions and 2153 deletions

View file

@ -137,6 +137,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
{ LLM_KV_DECODER_BLOCK_COUNT, "%s.decoder_block_count" },
{ LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" },
{ LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
{ LLM_KV_SWIN_NORM, "%s.swin_norm" },

View file

@ -141,6 +141,7 @@ enum llm_kv {
LLM_KV_POOLING_TYPE,
LLM_KV_LOGIT_SCALE,
LLM_KV_DECODER_START_TOKEN_ID,
LLM_KV_DECODER_BLOCK_COUNT,
LLM_KV_ATTN_LOGIT_SOFTCAPPING,
LLM_KV_FINAL_LOGIT_SOFTCAPPING,
LLM_KV_SWIN_NORM,

View file

@ -1447,7 +1447,9 @@ ggml_status llama_context::graph_compute(
if (backend_cpu != nullptr) {
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));
auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
set_threadpool_fn(backend_cpu, tp);
if (set_threadpool_fn) {
set_threadpool_fn(backend_cpu, tp);
}
}
// set the number of threads for all the backends

View file

@ -1273,7 +1273,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
// split the batch into streams if needed
const auto n_stream = k->ne[3];
q = ggml_reshape_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream);
q = ggml_view_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream, q->nb[1], q->nb[2], q->nb[3]/n_stream, 0);
q = ggml_permute(ctx0, q, 0, 2, 1, 3);
k = ggml_permute(ctx0, k, 0, 2, 1, 3);

View file

@ -159,6 +159,7 @@ struct llama_hparams {
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
// ref: https://github.com/ggerganov/llama.cpp/pull/8141
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
uint32_t dec_n_layer = 0;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;

View file

@ -1547,6 +1547,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.dec_start_token_id = dec_start_token_id;
}
hparams.dec_n_layer = hparams.n_layer;
ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
switch (hparams.n_layer) {
case 6: type = LLM_TYPE_60M; break; // t5-small
case 8: type = LLM_TYPE_80M; break; // flan-t5-small
@ -4510,6 +4513,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}
// n_layer: number of encoder_layers
// dec_n_layer: number of decoder_layers
const int dec_n_layer = hparams.dec_n_layer;
if (dec_n_layer > n_layer) {
layers.resize(dec_n_layer);
}
// load encoder layers
for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];
@ -4525,6 +4536,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
}
// load decoder layers
for (int i = 0; i < dec_n_layer; ++i) {
auto & layer = layers[i];
layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
@ -13609,7 +13625,9 @@ struct llm_build_t5_dec : public llm_graph_context {
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
const int64_t dec_n_layer = hparams.dec_n_layer;
for (int il = 0; il < dec_n_layer; ++il) {
ggml_tensor * inpSA = inpL;
// norm
@ -13700,7 +13718,7 @@ struct llm_build_t5_dec : public llm_graph_context {
//cb(cur, "kqv_out", il);
}
if (il == n_layer - 1 && inp_out_ids) {
if (il == dec_n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
}
@ -13721,8 +13739,8 @@ struct llm_build_t5_dec : public llm_graph_context {
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
model.layers[il].ffn_gate_enc ? LLM_FFN_PAR : LLM_FFN_SEQ,
model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_RELU,
model.layers[il].ffn_gate ? LLM_FFN_PAR : LLM_FFN_SEQ,
il);
cb(cur, "ffn_out", il);
}

View file

@ -114,7 +114,9 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
GGML_ASSERT(dev && "CPU backend is not loaded");
auto * reg = ggml_backend_dev_backend_reg(dev);
auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
numa_init_fn(numa);
if (numa_init_fn) {
numa_init_fn(numa);
}
}
}