mirror of https://github.com/LostRuins/koboldcpp.git
synced 2026-05-10 04:00:53 +00:00
Merge commit '2cd20b72ed' into concedo_experimental

# Conflicts:
#	CONTRIBUTING.md
#	docs/backend/CANN.md
#	docs/backend/SYCL.md
#	docs/backend/snapdragon/README.md
#	docs/backend/snapdragon/windows.md
#	docs/build.md
#	docs/multimodal/MobileVLM.md
#	docs/ops.md
#	docs/ops/WebGPU.csv
#	examples/debug/README.md
#	examples/llama.vim
#	examples/model-conversion/README.md
#	examples/sycl/README.md
#	ggml/src/ggml-cpu/amx/mmq.cpp
#	ggml/src/ggml-cpu/arch/x86/repack.cpp
#	ggml/src/ggml-hexagon/ggml-hexagon.cpp
#	ggml/src/ggml-hexagon/htp-drv.cpp
#	ggml/src/ggml-hexagon/htp/flash-attn-ops.c
#	ggml/src/ggml-hexagon/htp/hvx-base.h
#	ggml/src/ggml-hexagon/htp/hvx-copy.h
#	ggml/src/ggml-hexagon/htp/hvx-inverse.h
#	ggml/src/ggml-hexagon/htp/hvx-reduce.h
#	ggml/src/ggml-hexagon/htp/matmul-ops.c
#	ggml/src/ggml-hexagon/htp/rope-ops.c
#	ggml/src/ggml-hexagon/htp/worker-pool.c
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-opencl/kernels/cpy.cl
#	ggml/src/ggml-sycl/common.hpp
#	ggml/src/ggml-sycl/quants.hpp
#	ggml/src/ggml-sycl/softmax.cpp
#	ggml/src/ggml-vulkan/CMakeLists.txt
#	ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
#	ggml/src/ggml-webgpu/ggml-webgpu.cpp
#	scripts/pr2wt.sh
#	scripts/server-bench.py
#	scripts/snapdragon/windows/run-cli.ps1
#	tests/test-alloc.cpp
#	tests/test-backend-ops.cpp
#	tests/test-chat.cpp
#	tools/cli/cli.cpp
#	tools/completion/README.md
#	tools/cvector-generator/cvector-generator.cpp
#	tools/imatrix/README.md
#	tools/perplexity/README.md
#	tools/server/public_simplechat/readme.md
#	tools/server/tests/README.md
commit 746664fde6
107 changed files with 1037 additions and 314 deletions
@@ -161,7 +161,7 @@ llama_context::llama_context(
     cparams.op_offload = params.op_offload;
     cparams.kv_unified = params.kv_unified;
 
-    // intialized later
+    // initialized later
     cparams.pipeline_parallel = false;
 
     {
@@ -1991,7 +1991,7 @@ ggml_cgraph * llama_context::graph_reserve(
 
     ggml_backend_sched_reset(sched.get());
 
-    // when the scheduler is reset, we cannnot reuse the old graph, so we reset the previous graph result to prevent that
+    // when the scheduler is reset, we cannot reuse the old graph, so we reset the previous graph result to prevent that
     gf_res_prev->reset();
 
     // store the n_outputs as it is, and restore it afterwards
@@ -1616,7 +1616,7 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
 ggml_tensor * llm_graph_context::build_inp_out_ids() const {
     // note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
     // but this would make the graph topology depend on the number of output tokens, which can interfere with
-    // features that require constant topology such as pipline parallelism
+    // features that require constant topology such as pipeline parallelism
     // ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471
     //if (n_outputs < n_tokens) {
     //    return nullptr;
@@ -1779,7 +1779,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
         if (v_mla) {
 #if 0
             // v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
-            // However, the code is optimized for dimensions 0 and 1 being large, so this is ineffient.
+            // However, the code is optimized for dimensions 0 and 1 being large, so this is inefficient.
             cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
             cur = ggml_mul_mat(ctx0, v_mla, cur);
 #else
@@ -583,7 +583,7 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_
             break;
         }
 
-        // remeber the position that we found
+        // remember the position that we found
         res.push_back(sinfo_new);
 
         // store the old state of the cells in the recovery stack
@@ -1293,7 +1293,7 @@ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float *
     }
 
     for (uint32_t s = 0; s < n_stream; ++s) {
-        // bookeeping of the KQ mask cells that could change for other tokens of the same sequence
+        // bookkeeping of the KQ mask cells that could change for other tokens of the same sequence
         std::unordered_map<llama_seq_id, uint32_t> seq_srct;
         std::unordered_map<llama_seq_id, std::vector<uint32_t>> seq_idxs;
 
@@ -175,6 +175,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_0_3B: return "0.3B";
         case LLM_TYPE_0_5B: return "0.5B";
         case LLM_TYPE_0_6B: return "0.6B";
+        case LLM_TYPE_0_8B: return "0.8B";
         case LLM_TYPE_1B: return "1B";
         case LLM_TYPE_1_2B: return "1.2B";
         case LLM_TYPE_1_3B: return "1.3B";
@@ -246,12 +247,14 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_100B_A6B: return "100B.A6B";
         case LLM_TYPE_102B_A12B: return "102B.A12B";
         case LLM_TYPE_106B_A12B: return "106B.A12B";
+        case LLM_TYPE_122B_A10B: return "122B.A10B";
         case LLM_TYPE_196B_A11B: return "196B.A11B";
         case LLM_TYPE_230B_A10B: return "230B.A10B";
         case LLM_TYPE_235B_A22B: return "235B.A22B";
         case LLM_TYPE_300B_A47B: return "300B.A47B";
         case LLM_TYPE_310B_A15B: return "310B.A15B";
         case LLM_TYPE_355B_A32B: return "355B.A32B";
+        case LLM_TYPE_397B_A17B: return "397B.A17B";
         case LLM_TYPE_744B_A40B: return "744B.A40B";
         case LLM_TYPE_E2B: return "E2B";
         case LLM_TYPE_E4B: return "E4B";
@@ -1638,7 +1641,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
 
                 switch (hparams.n_layer) {
-                    // TODO: Jamba layers are a bit heterogenous, so naming this is hard.
+                    // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
                     case 12: // 900M 8x???M
                     case 32: // 51B 16x?B
                     default: type = LLM_TYPE_UNKNOWN;
@@ -2642,7 +2645,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
 
                 switch (hparams.n_layer) {
-                    case 24: type = LLM_TYPE_2B; break;
+                    case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_8B : LLM_TYPE_2B; break;
+                    case 32: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_9B; break;
                     case 64: type = LLM_TYPE_27B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -2671,8 +2676,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 }
 
                 switch (hparams.n_layer) {
-                    case 28: type = LLM_TYPE_35B_A3B; break;
-                    case 48: type = LLM_TYPE_80B_A3B; break;
+                    case 40: type = LLM_TYPE_35B_A3B; break;
+                    case 48: type = LLM_TYPE_122B_A10B; break;
+                    case 60: type = LLM_TYPE_397B_A17B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -54,6 +54,7 @@ enum llm_type {
     LLM_TYPE_0_3B,
     LLM_TYPE_0_5B,
     LLM_TYPE_0_6B,
+    LLM_TYPE_0_8B,
     LLM_TYPE_1B,
     LLM_TYPE_1_2B,
     LLM_TYPE_1_3B,
@@ -125,12 +126,14 @@ enum llm_type {
     LLM_TYPE_100B_A6B,
     LLM_TYPE_102B_A12B, // Solar-Open
     LLM_TYPE_106B_A12B, // GLM-4.5-Air
+    LLM_TYPE_122B_A10B, // Qwen3.5
     LLM_TYPE_196B_A11B, // Step3.5-Flash
     LLM_TYPE_230B_A10B, // Minimax M2
     LLM_TYPE_235B_A22B,
     LLM_TYPE_300B_A47B, // Ernie MoE big
     LLM_TYPE_310B_A15B, // MiMo-V2-Flash
     LLM_TYPE_355B_A32B, // GLM-4.5
+    LLM_TYPE_397B_A17B, // Qwen3.5
     LLM_TYPE_744B_A40B, // GLM-5
     LLM_TYPE_E2B,
     LLM_TYPE_E4B,
@@ -2069,7 +2069,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
             precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
 #if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-            // correct endiannes of data in precompiled_charsmap binary blob
+            // correct endianness of data in precompiled_charsmap binary blob
             uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
            *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
             assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
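The hunk above only fixes a comment, but the pattern it documents is worth spelling out: the blob stores a little-endian uint32_t size prefix, so a big-endian host must byte-swap it before use. A minimal standalone sketch of that idea follows; the read_le_u32 helper is hypothetical, not llama.cpp's API.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Hypothetical helper: read a little-endian uint32_t size prefix from a
// binary blob, byte-swapping only when the host is big-endian.
static uint32_t read_le_u32(const std::vector<char> & blob) {
    assert(blob.size() >= sizeof(uint32_t));
    uint32_t v;
    std::memcpy(&v, blob.data(), sizeof(v)); // memcpy avoids alignment/aliasing issues
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    v = __builtin_bswap32(v); // the blob is little-endian on disk
#endif
    return v;
}

int main() {
    const std::vector<char> blob = { 0x10, 0x00, 0x00, 0x00 }; // little-endian 16
    std::printf("blob size prefix: %u\n", read_le_u32(blob));  // 16 on either host
    return 0;
}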
@@ -146,7 +146,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                 cb(Qcur, "Qcur_attn_temp_scaled", il);
             }
 
-            // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
+            // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
             cur = build_attn(inp_attn_k,
                     model.layers[il].wo, NULL,
                     Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
@@ -3,7 +3,7 @@
 #include "llama-model.h"
 #include "llama-graph.h"
 
-// note: almost all graphs require atleast sqrtf, so include cmath globally
+// note: almost all graphs require at least sqrtf, so include cmath globally
 #include <cmath>
 
 //
@@ -773,7 +773,7 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
         // tiny_aya digit grouping pattern from tokenizer.json:
         // {"type": "Split", "pattern": {"Regex": "\\d{1,3}(?=(?:\\d{3})*\\b)"}, "behavior": "Isolated"}
         // Splits digits into groups of 3 from the right (e.g., 1234567 -> 1, 234, 567)
-        // TODO: Revisit this regex, incase there are any subtle tokenization differences with the original regex.
+        // TODO: Revisit this regex, in case there are any subtle tokenization differences with the original regex.
         bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
     }
 
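Since the right-to-left grouping behavior of that regex is easy to misread, here is a standalone illustration using std::regex (which supports the ECMAScript lookahead the pattern relies on). This is only a sketch of what the pattern matches, not the tokenizer's actual code path, which goes through unicode_regex_split_custom_afmoe.

#include <iostream>
#include <regex>
#include <string>

int main() {
    // \d{1,3} digits, with a lookahead requiring that what follows is zero
    // or more complete groups of 3 digits ending at a word boundary. This
    // forces the splits to align from the right.
    const std::regex grouping(R"(\d{1,3}(?=(?:\d{3})*\b))");
    const std::string text = "1234567";

    for (std::sregex_iterator it(text.begin(), text.end(), grouping), end;
         it != end; ++it) {
        std::cout << it->str() << '\n'; // prints: 1, 234, 567
    }
    return 0;
}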