Merge branch 'master' into concedo_experimental

# Conflicts: # CMakeLists.txt # Makefile # README.md # docs/BLIS.md
2025-09-10 09:04:36 +00:00 · 2023-10-15 15:50:20 +08:00 · 2023-10-15 15:50:20 +08:00 · 5cfabaee25
commit 5cfabaee25
parent 643902fbbb 11bff29045
21 changed files with 10952 additions and 278 deletions
--- a/llama.cpp
+++ b/llama.cpp
@ -2845,8 +2845,8 @@ static void llm_load_tensors(
                        auto & layer = model.layers[i];

                        layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
-                        layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split);
-                        layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     backend_split);
+                        layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                        layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},                backend_split);

                        layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);

@ -5374,7 +5374,7 @@ static struct ggml_cgraph * llm_build_mpt(
    const int64_t n_layer     = hparams.n_layer;
    const int64_t n_ctx       = cparams.n_ctx;
    const int64_t n_head      = hparams.n_head;
-    const int64_t n_head_kv   = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA
+    const int64_t n_head_kv   = hparams.n_head_kv;
    const int64_t n_embd_head = hparams.n_embd_head();
    const int64_t n_embd_gqa  = hparams.n_embd_gqa();

@ -5727,7 +5727,6 @@ static struct ggml_cgraph * llama_build_graph(
 //
 //   - lctx:      llama context
 //   - batch:     batch to evaluate
-//   - n_threads: number of threads to use
 //
 // return 0 on success
 // return positive int on warning