Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	.github/workflows/release.yml
#	CMakeLists.txt
#	examples/simple-chat/simple-chat.cpp
#	src/llama-quant.cpp
#	tools/run/run.cpp
#	tools/server/README.md
commit ace537d44e (Concedo, 2025-06-24 23:06:16 +08:00)
17 changed files with 554 additions and 212 deletions


@@ -393,6 +393,7 @@ extern "C" {
         void * imatrix;      // pointer to importance matrix data
         void * kv_overrides; // pointer to vector containing overrides
         void * tensor_types; // pointer to vector containing tensor types
+        void * prune_layers; // pointer to vector containing layer indices to prune
     } llama_model_quantize_params;

     typedef struct llama_logit_bias {
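
The new prune_layers field follows the same convention as kv_overrides and tensor_types: an opaque pointer to a caller-owned vector. A minimal sketch of how a caller might request pruning, assuming the pointer is read as a std::vector<int> of layer indices (the element type is an assumption; the diff only says "layer indices to prune", and the filenames are placeholders):

    #include <vector>

    #include "llama.h"

    int main() {
        // start from the library defaults, then opt in to pruning
        llama_model_quantize_params params = llama_model_quantize_default_params();

        // layer indices to drop during quantization (hypothetical choice of layers);
        // the vector must outlive the llama_model_quantize() call
        std::vector<int> prune = { 20, 21, 22 };
        params.prune_layers = &prune;

        const uint32_t res = llama_model_quantize("model-f16.gguf", "model-q4_k_m-pruned.gguf", &params);
        return res == 0 ? 0 : 1;
    }
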
@@ -946,12 +947,14 @@ extern "C" {
     // Requires the context to have a memory.
     // For encoder-decoder contexts, processes the batch using the decoder.
     // Positive return values do not mean a fatal error, but rather a warning.
-    //   Upon non-zero return values, the memory state is restored to the state before this call
+    //   Upon a fatal error or abort, the ubatches that managed to be processed will remain in the memory state of the context
+    //     To handle this correctly, query the memory state using llama_memory_seq_pos_min() and llama_memory_seq_pos_max()
+    //   Upon other return values, the memory state is restored to the state before this call
     //      0 - success
     //      1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
-    //      2 - aborted
+    //      2 - aborted       (processed ubatches will remain in the context's memory)
     //     -1 - invalid input batch
-    //   < -1 - error
+    //   < -1 - fatal error   (processed ubatches will remain in the context's memory)
     LLAMA_API int32_t llama_decode(
                 struct llama_context * ctx,
                   struct llama_batch   batch);
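
To make the recovery guidance above concrete: a minimal sketch of a decode wrapper under the new contract, assuming a single sequence with id 0 and that the context's memory handle comes from llama_get_memory() (decode_checked itself is a hypothetical helper, not part of the API):

    #include <cstdio>

    #include "llama.h"

    // sketch: call llama_decode and sort its return codes into the three
    // classes described by the header comment above
    static bool decode_checked(llama_context * ctx, llama_batch batch) {
        const int32_t ret = llama_decode(ctx, batch);
        if (ret == 0) {
            return true; // success: the whole batch was processed
        }
        if (ret == 1 || ret == -1) {
            // memory was restored to its pre-call state:
            //   1 -> no KV slot (retry with a smaller batch or a larger context)
            //  -1 -> invalid input batch
            return false;
        }
        // ret == 2 (aborted) or ret < -1 (fatal error): some ubatches may remain
        // in the context's memory -- query the surviving position range for the
        // sequence before deciding how to recover
        llama_memory_t mem = llama_get_memory(ctx); // assumed accessor
        const llama_pos p_min = llama_memory_seq_pos_min(mem, 0);
        const llama_pos p_max = llama_memory_seq_pos_max(mem, 0);
        fprintf(stderr, "decode returned %d; seq 0 now spans positions [%d, %d]\n", ret, p_min, p_max);
        return false;
    }
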