Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build-linux-cross.yml
#	.github/workflows/build.yml
#	CODEOWNERS
#	ggml/CMakeLists.txt
#	ggml/src/ggml-cuda/fattn.cu
#	ggml/src/ggml-webgpu/CMakeLists.txt
#	ggml/src/ggml-webgpu/ggml-webgpu.cpp
#	ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl
#	tests/test-backend-ops.cpp
#	tests/test-chat-template.cpp
#	tools/llama-bench/llama-bench.cpp
#	tools/rpc/README.md
#	tools/server/README.md
Author: Concedo
Date:   2025-10-09 01:33:27 +08:00
Commit: b6f6338bba

32 files changed, 1556 insertions(+), 636 deletions(-)

src/llama-memory-recurrent.cpp

@@ -382,7 +382,9 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr &
             // if all tokens are output, split by sequence
             ubatch = balloc.split_seq(n_ubatch);
         } else {
-            ubatch = balloc.split_equal(n_ubatch, false);
+            // TODO: non-sequential equal split can be done if using unified KV cache
+            //       for simplicity, we always use sequential equal split for now
+            ubatch = balloc.split_equal(n_ubatch, true);
         }
 
         if (ubatch.n_tokens == 0) {
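Note on the hunk above: `split_equal(n_ubatch, true)` asks the batch allocator for the sequential variant of the equal split, which recurrent memories need because each sequence advances a single state and must consume its tokens in order. Below is a minimal standalone sketch of that idea; `token` and `split_equal_sequential` are hypothetical names for illustration, not llama.cpp's `llama_batch_allocr` API, and the real split enforces more constraints than this simplified take.

```cpp
// Sketch of a "sequential equal split": every ubatch takes the same number of
// tokens from each sequence, and sequences are consumed strictly in id order.
#include <algorithm>
#include <cstdio>
#include <map>
#include <vector>

struct token {
    int seq_id; // sequence the token belongs to
    int pos;    // position within that sequence
};

static std::vector<std::vector<token>> split_equal_sequential(const std::vector<token> & batch, size_t n_ubatch) {
    // bucket tokens by sequence, keeping per-sequence order
    std::map<int, std::vector<token>> seqs;
    for (const token & t : batch) {
        seqs[t.seq_id].push_back(t);
    }
    if (seqs.empty()) {
        return {};
    }

    // equal share of the ubatch budget for each sequence
    const size_t per_seq = std::max<size_t>(1, n_ubatch / seqs.size());

    std::vector<std::vector<token>> ubatches;
    bool remaining = true;
    while (remaining) {
        remaining = false;
        std::vector<token> ub;
        // std::map iterates ids in ascending order -> "sequential"
        for (auto & [id, toks] : seqs) {
            const size_t take = std::min(per_seq, toks.size());
            ub.insert(ub.end(), toks.begin(), toks.begin() + take);
            toks.erase(toks.begin(), toks.begin() + take);
            remaining = remaining || !toks.empty();
        }
        if (!ub.empty()) {
            ubatches.push_back(std::move(ub));
        }
    }
    return ubatches;
}

int main() {
    // two sequences, three tokens each
    const std::vector<token> batch = {
        {0, 0}, {0, 1}, {0, 2},
        {1, 0}, {1, 1}, {1, 2},
    };

    // n_ubatch = 4 -> two tokens per sequence per ubatch
    for (const auto & ub : split_equal_sequential(batch, 4)) {
        for (const token & t : ub) {
            printf("(seq %d, pos %d) ", t.seq_id, t.pos);
        }
        printf("\n");
    }
    // prints: (seq 0, pos 0) (seq 0, pos 1) (seq 1, pos 0) (seq 1, pos 1)
    //         (seq 0, pos 2) (seq 1, pos 2)
}
```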
@@ -859,9 +861,12 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
 bool llama_memory_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id) {
     if (dest_seq_id != -1) {
         // single sequence
         seq_rm(dest_seq_id, -1, -1);
 
+        if (cell_count == 0) {
+            return true;
+        }
 
         llama_batch_allocr balloc(hparams.n_pos_per_embd());
 
         llama_ubatch ubatch = balloc.ubatch_reserve(cell_count, 1);
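Note on the hunk above: when the serialized state holds zero cells for the destination sequence, the reader now returns success right after the `seq_rm()` wipe instead of going on to reserve a zero-token ubatch. This path is reached through the public per-sequence session API; a minimal usage sketch follows, assuming an already-created `ctx` and with error handling elided.

```cpp
// Copy the state of one sequence onto another via the session API; with the
// guard above, a source sequence that holds no cells (e.g. one that was never
// decoded) should round-trip as an empty blob instead of failing on restore.
#include <cstdint>
#include <vector>
#include "llama.h"

void copy_seq_state(llama_context * ctx, llama_seq_id src, llama_seq_id dst) {
    // size of the serialized state for `src`
    const size_t size = llama_state_seq_get_size(ctx, src);

    std::vector<uint8_t> buf(size);
    llama_state_seq_get_data(ctx, buf.data(), buf.size(), src);

    // restoring clears `dst` (seq_rm) and rebuilds it from the blob;
    // an empty state now returns success right after the clear
    llama_state_seq_set_data(ctx, buf.data(), buf.size(), dst);
}
```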