Merge branch 'upstream' into concedo_experimental

# Conflicts: # ci/README.md # ci/run.sh # docs/backend/CUDA-FEDORA.md # docs/build.md # docs/install.md # ggml/src/ggml-cpu/CMakeLists.txt # ggml/src/ggml-cuda/common.cuh # tests/test-backend-ops.cpp
2025-09-11 09:34:37 +00:00 · 2025-03-26 00:18:01 +08:00 · 2025-03-26 00:18:01 +08:00 · ea358369cc
commit ea358369cc
parent 2bdf1dacff 053b3f9aae
26 changed files with 393 additions and 246 deletions
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@ -2666,7 +2666,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                    // output
                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output      = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD,   "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }

                    for (int i = 0; i < n_layer; ++i) {
                        auto & layer = layers[i];