Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	README.md
#	ci/run.sh
#	docs/build.md
#	examples/CMakeLists.txt
#	examples/parallel/parallel.cpp
#	ggml/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	scripts/server-bench.py
#	src/llama-kv-cache-unified.cpp
#	tests/test-backend-ops.cpp
#	tools/batched-bench/batched-bench.cpp
#	tools/server/README.md
2025-09-11 01:24:36 +00:00 · 2025-07-17 00:28:37 +08:00 · 2025-07-17 00:28:37 +08:00 · bdff33e0de
commit bdff33e0de
parent f0564f9caf 21c021745d
47 changed files with 3128 additions and 509 deletions
--- a/include/llama.h
+++ b/include/llama.h
@@ -338,6 +338,9 @@ extern "C" {
        bool swa_full;    // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
                          // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
                          //       ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
+        bool kv_unified;  // use a unified buffer across the input sequences when computing the attention
+                          // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
+                          // ref: https://github.com/ggml-org/llama.cpp/pull/14363
    };

    // model quantization parameters
@@ -728,7 +731,7 @@ extern "C" {
    //   - lazily on next llama_decode()
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
-    DEPRECATED(void llama_kv_self_seq_div(
+    DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
@@ -1008,6 +1011,7 @@ extern "C" {
    LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
    LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
    LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
+    LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask

    LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
    LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);