Merge branch 'upstream' into concedo_experimental

# Conflicts: # .devops/cuda.Dockerfile # CODEOWNERS # README.md # ggml/src/ggml-cann/aclnn_ops.cpp # ggml/src/ggml-cann/common.h # ggml/src/ggml-opencl/ggml-opencl.cpp # scripts/sync-ggml-am.sh # scripts/sync-ggml.last # scripts/sync-ggml.sh # tests/test-chat.cpp # tools/batched-bench/batched-bench.cpp # tools/mtmd/clip.h
2025-09-11 01:24:36 +00:00 · 2025-08-20 20:34:45 +08:00 · 2025-08-20 20:34:45 +08:00 · 1c41c38a6a
commit 1c41c38a6a
parent eb33467c8c a094f38143
40 changed files with 426 additions and 274 deletions
--- a/common/common.h
+++ b/common/common.h
@ -235,12 +235,15 @@ struct common_params_diffusion {
    bool    add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0
 };

+// reasoning API response format (not to be confused as chat template's reasoning format)
 enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
-    COMMON_REASONING_FORMAT_AUTO,
+    COMMON_REASONING_FORMAT_AUTO,            // Same as deepseek, using `message.reasoning_content`
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
    COMMON_REASONING_FORMAT_DEEPSEEK,        // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
-    COMMON_REASONING_FORMAT_GRANITE,         // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
+    // do not extend this enum unless you absolutely have to
+    // in most cases, use COMMON_REASONING_FORMAT_AUTO
+    // see: https://github.com/ggml-org/llama.cpp/pull/15408
 };


@ -368,7 +371,7 @@ struct common_params {
    bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
    bool flash_attn        = false; // flash attention
    bool no_perf           = false; // disable performance metrics
-    bool ctx_shift         = true;  // context shift on inifinite text generation
+    bool ctx_shift         = false;  // context shift on inifinite text generation
    bool swa_full          = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
    bool kv_unified        = false; // enable unified KV cache