Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2026-05-10 20:31:01 +00:00)
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/release.yml
#	CONTRIBUTING.md
#	docs/backend/CANN.md
#	examples/eval-callback/eval-callback.cpp
#	examples/model-conversion/requirements.txt
#	examples/model-conversion/scripts/causal/run-org-model.py
#	ggml/src/ggml-cann/aclnn_ops.cpp
#	ggml/src/ggml-cann/common.h
#	ggml/src/ggml-cann/ggml-cann.cpp
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
#	ggml/src/ggml-cuda/CMakeLists.txt
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-rpc/ggml-rpc.cpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	ggml/src/ggml-webgpu/ggml-webgpu.cpp
#	ggml/src/ggml-zdnn/ggml-zdnn.cpp
#	models/templates/README.md
#	requirements/requirements-convert_hf_to_gguf.txt
#	requirements/requirements-convert_legacy_llama.txt
#	requirements/requirements-tool_bench.txt
#	tests/.gitignore
#	tests/test-backend-ops.cpp
#	tests/test-chat-parser.cpp
#	tests/test-chat.cpp
#	tests/test-json-schema-to-grammar.cpp
#	tests/test-tokenizer-random.py
Commit 6463f5c26b: 62 changed files with 3507 additions and 2153 deletions
src/llama-arch.cpp
@@ -137,6 +137,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_POOLING_TYPE,            "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE,             "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID,  "%s.decoder_start_token_id" },
+    { LLM_KV_DECODER_BLOCK_COUNT,     "%s.decoder_block_count" },
     { LLM_KV_ATTN_LOGIT_SOFTCAPPING,  "%s.attn_logit_softcapping" },
     { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" },
     { LLM_KV_SWIN_NORM,               "%s.swin_norm" },
src/llama-arch.h
@@ -141,6 +141,7 @@ enum llm_kv {
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
+    LLM_KV_DECODER_BLOCK_COUNT,
     LLM_KV_ATTN_LOGIT_SOFTCAPPING,
     LLM_KV_FINAL_LOGIT_SOFTCAPPING,
     LLM_KV_SWIN_NORM,
src/llama-context.cpp
@@ -1447,7 +1447,9 @@ ggml_status llama_context::graph_compute(
     if (backend_cpu != nullptr) {
         auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));
         auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
-        set_threadpool_fn(backend_cpu, tp);
+        if (set_threadpool_fn) {
+            set_threadpool_fn(backend_cpu, tp);
+        }
     }

     // set the number of threads for all the backends
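The two guarded call sites in this commit (here and in llama_numa_init further down) follow the same pattern: ggml_backend_reg_get_proc_address can return nullptr when the backend does not export the requested symbol, so the returned function pointer must be checked before it is invoked. A minimal standalone analogue of that pattern (hypothetical registry, not ggml code):

// Sketch of the guarded proc-address pattern: a registry lookup may yield
// nullptr, so the caller checks the pointer before calling through it.
#include <cstdio>
#include <map>
#include <string>

using proc_fn = void (*)(int);

static void set_threads_impl(int n) { std::printf("threads = %d\n", n); }

// stand-in for ggml_backend_reg_get_proc_address: unknown names yield nullptr
static proc_fn get_proc_address(const std::string & name) {
    static const std::map<std::string, proc_fn> procs = {
        { "set_threads", set_threads_impl },
    };
    auto it = procs.find(name);
    return it == procs.end() ? nullptr : it->second;
}

int main() {
    auto * fn = get_proc_address("set_threads");
    if (fn) {          // same guard the patch adds: skip silently when absent
        fn(8);
    }
    auto * missing = get_proc_address("set_threadpool"); // not registered
    if (missing) {
        missing(4);    // never reached; an unguarded call would crash here
    }
    return 0;
}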
src/llama-graph.cpp
@@ -1273,7 +1273,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
     // split the batch into streams if needed
     const auto n_stream = k->ne[3];

-    q = ggml_reshape_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream);
+    q = ggml_view_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream, q->nb[1], q->nb[2], q->nb[3]/n_stream, 0);

     q = ggml_permute(ctx0, q, 0, 2, 1, 3);
     k = ggml_permute(ctx0, k, 0, 2, 1, 3);
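A reshape reinterprets the data with freshly computed contiguous strides, while the view carries explicit byte strides (q->nb[1], q->nb[2], q->nb[3]/n_stream), so the stream split preserves q's existing memory layout rather than assuming contiguity. A simplified stride sketch (hypothetical shapes, not ggml internals) showing that the split view addresses the same bytes:

// Sketch: a 4-D view is addressed through per-dimension byte strides nb[],
// so re-splitting ne[2]/ne[3] with nb[3]/n_stream moves no data.
#include <cstddef>
#include <cstdio>

struct view4d {
    long   ne[4]; // elements per dimension
    size_t nb[4]; // byte stride per dimension
};

static size_t offset_of(const view4d & v, long i0, long i1, long i2, long i3) {
    return i0 * v.nb[0] + i1 * v.nb[1] + i2 * v.nb[2] + i3 * v.nb[3];
}

int main() {
    // a contiguous float tensor of shape [64, 32, 8, 1]
    view4d q = { {64, 32, 8, 1},
                 {4, 4 * 64, 4 * 64 * 32, 4 * 64 * 32 * 8} };

    // split dim 2 into n_stream = 2 streams, mirroring ne[2]/n_stream,
    // n_stream and nb[3]/n_stream from the patch; only strides change
    view4d split = { {64, 32, 4, 2},
                     {q.nb[0], q.nb[1], q.nb[2], q.nb[3] / 2} };

    // original element (3,5,6,0) and split element (3,5,2,1) share a byte offset
    std::printf("%zu == %zu\n", offset_of(q, 3, 5, 6, 0),
                                offset_of(split, 3, 5, 2, 1));
    return 0;
}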
src/llama-hparams.h
@@ -159,6 +159,7 @@ struct llama_hparams {
     // needed by encoder-decoder models (e.g. T5, FLAN-T5)
     // ref: https://github.com/ggerganov/llama.cpp/pull/8141
     llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
+    uint32_t dec_n_layer = 0;

     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type    rope_type    = LLAMA_ROPE_TYPE_NONE;
src/llama-model.cpp
@@ -1547,6 +1547,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.dec_start_token_id = dec_start_token_id;
                 }

+                hparams.dec_n_layer = hparams.n_layer;
+                ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false);
+
                 switch (hparams.n_layer) {
                     case 6: type = LLM_TYPE_60M; break; // t5-small
                     case 8: type = LLM_TYPE_80M; break; // flan-t5-small
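The pattern here is "default, then optional override": dec_n_layer is seeded with n_layer, and the get_key call only overwrites it when the GGUF file actually carries the decoder_block_count key, since the final false argument marks the key as not required. A minimal stand-in (hypothetical get_key, not the llama_model_loader API) showing both cases:

// Sketch of the optional-key override: `out` keeps its default when the
// key is absent, and is replaced when the key is present.
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>

static bool get_key(const std::map<std::string, uint32_t> & kv,
                    const std::string & key, uint32_t & out, bool required) {
    auto it = kv.find(key);
    if (it == kv.end()) {
        if (required) { std::fprintf(stderr, "missing key: %s\n", key.c_str()); }
        return false; // out keeps its previous (default) value
    }
    out = it->second;
    return true;
}

int main() {
    std::map<std::string, uint32_t> gguf; // starts without the key
    uint32_t n_layer     = 12;
    uint32_t dec_n_layer = n_layer;                               // default
    get_key(gguf, "t5.decoder_block_count", dec_n_layer, false);  // optional
    std::printf("dec_n_layer = %u\n", (unsigned) dec_n_layer);    // 12

    gguf["t5.decoder_block_count"] = 24;                          // override present
    get_key(gguf, "t5.decoder_block_count", dec_n_layer, false);
    std::printf("dec_n_layer = %u\n", (unsigned) dec_n_layer);    // 24
    return 0;
}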
src/llama-model.cpp
@@ -4510,6 +4513,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
                 }

+                // n_layer:     number of encoder_layers
+                // dec_n_layer: number of decoder_layers
+                const int dec_n_layer = hparams.dec_n_layer;
+                if (dec_n_layer > n_layer) {
+                    layers.resize(dec_n_layer);
+                }
+
+                // load encoder layers
                 for (int i = 0; i < n_layer; ++i) {
                     auto & layer = layers[i];

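The resize exists because encoder and decoder tensors share the single layers vector: the encoder loop fills entries [0, n_layer) and the decoder loop in the next hunk fills [0, dec_n_layer), so the vector must first be grown to the larger count. A small self-contained sketch (hypothetical layer struct) of that invariant:

// Sketch: std::vector::resize value-initializes the new tail and keeps
// existing elements, so both loops can index the same vector safely.
#include <cstdio>
#include <vector>

struct layer_t { bool has_enc = false; bool has_dec = false; };

int main() {
    int n_layer     = 6;  // encoder blocks
    int dec_n_layer = 12; // decoder blocks, may exceed the encoder count

    std::vector<layer_t> layers(n_layer);
    if (dec_n_layer > n_layer) {
        layers.resize(dec_n_layer);
    }

    for (int i = 0; i < n_layer; ++i)     { layers[i].has_enc = true; }
    for (int i = 0; i < dec_n_layer; ++i) { layers[i].has_dec = true; }

    std::printf("layers: %zu (last: enc=%d dec=%d)\n", layers.size(),
                (int) layers.back().has_enc, (int) layers.back().has_dec);
    return 0;
}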
src/llama-model.cpp
@@ -4525,6 +4536,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
                     layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
                     layer.ffn_up_enc   = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+                }
+
+                // load decoder layers
+                for (int i = 0; i < dec_n_layer; ++i) {
+                    auto & layer = layers[i];

                     layer.attn_norm  = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM,  "weight", i), {n_embd}, 0);
                     layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
src/llama-model.cpp
@@ -13609,7 +13625,9 @@ struct llm_build_t5_dec : public llm_graph_context {

         ggml_tensor * inp_out_ids = build_inp_out_ids();

-        for (int il = 0; il < n_layer; ++il) {
+        const int64_t dec_n_layer = hparams.dec_n_layer;
+
+        for (int il = 0; il < dec_n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

             // norm
src/llama-model.cpp
@@ -13700,7 +13718,7 @@ struct llm_build_t5_dec : public llm_graph_context {
                 //cb(cur, "kqv_out", il);
             }

-            if (il == n_layer - 1 && inp_out_ids) {
+            if (il == dec_n_layer - 1 && inp_out_ids) {
                 cur  = ggml_get_rows(ctx0, cur,  inp_out_ids);
                 inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids);
             }
src/llama-model.cpp
@@ -13721,8 +13739,8 @@ struct llm_build_t5_dec : public llm_graph_context {
                     model.layers[il].ffn_gate, NULL, NULL,
                     model.layers[il].ffn_down, NULL, NULL,
                     NULL,
-                    model.layers[il].ffn_gate_enc ? LLM_FFN_GELU : LLM_FFN_RELU,
-                    model.layers[il].ffn_gate_enc ? LLM_FFN_PAR  : LLM_FFN_SEQ,
+                    model.layers[il].ffn_gate ? LLM_FFN_GELU : LLM_FFN_RELU,
+                    model.layers[il].ffn_gate ? LLM_FFN_PAR  : LLM_FFN_SEQ,
                     il);
             cb(cur, "ffn_out", il);
         }
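The last two changed lines fix a selector bug rather than performing a rename: the decoder FFN is built from model.layers[il].ffn_gate, so the activation (GELU vs RELU) and wiring (LLM_FFN_PAR vs LLM_FFN_SEQ) must be keyed on that same tensor; testing ffn_gate_enc picks the wrong variant whenever encoder and decoder gating differ. A tiny sketch (hypothetical types, not llama.cpp code) of the before/after behaviour:

// Sketch: the selector must test the tensor actually passed to the FFN.
#include <cstdio>

enum ffn_act { FFN_RELU, FFN_GELU };

struct layer { const void * ffn_gate_enc; const void * ffn_gate; };

static ffn_act pick_act(const layer & l) {
    // after the fix: keyed on the decoder's own gate tensor
    return l.ffn_gate ? FFN_GELU : FFN_RELU;
}

int main() {
    int enc_gate = 0;
    layer l = { &enc_gate, nullptr }; // gated encoder, ungated decoder
    // the old selector (l.ffn_gate_enc ? ...) would wrongly report GELU here
    std::printf("decoder act = %s\n", pick_act(l) == FFN_GELU ? "GELU" : "RELU");
    return 0;
}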
src/llama.cpp
@@ -114,7 +114,9 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
         GGML_ASSERT(dev && "CPU backend is not loaded");
         auto * reg = ggml_backend_dev_backend_reg(dev);
         auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
-        numa_init_fn(numa);
+        if (numa_init_fn) {
+            numa_init_fn(numa);
+        }
     }
 }
