From 287a33017b32600bfc0e81feeb0ad6e81e0dd484 Mon Sep 17 00:00:00 2001
From: Julius Tischbein <jtischbein@nvidia.com>
Date: Sun, 18 Jan 2026 17:35:57 +0100
Subject: [PATCH 01/17] llama : Extend fallback, fix fileno for dio file,
 exclude case that mmap uses dio file (#18887)

---
 src/llama-mmap.cpp         |  6 +++++-
 src/llama-model-loader.cpp | 18 ++++++++++++------
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
index fe0847fe1..0261e4c72 100644
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -265,7 +265,8 @@ struct llama_file::impl {
                         continue;  // Interrupted by signal, retry
                     }
                     // Fallback to std::fread in case the DMA controller cannot access the buffer
-                    if (errno == EFAULT) {
+                    if (errno == EFAULT || errno == EINVAL) {
+                        LLAMA_LOG_WARN("%s: Falling back to buffered IO due to %s\n", __func__, strerror(errno));
                         auto curr_off = tell();
                         close(fd);
                         fd = -1;
@@ -384,6 +385,9 @@ int llama_file::file_id() const {
 #ifdef _WIN32
     return _fileno(pimpl->fp);
 #else
+    if (pimpl->fd != -1) {
+        return pimpl->fd;
+    }
 #if defined(fileno)
     return fileno(pimpl->fp);
 #else
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 300a322c5..383b8dc76 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -539,12 +539,18 @@ llama_model_loader::llama_model_loader(
     files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io));
     contexts.emplace_back(ctx);
 
-    use_direct_io = use_direct_io && files.back()->has_direct_io();
-
-    // Disable mmap in case Direct I/O is enabled and available
-    if (use_direct_io && use_mmap) {
-        use_mmap = false;
-        LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
+    if (use_mmap && use_direct_io) {
+        if (files.back()->has_direct_io()) {
+            // Disable mmap, as DirectIO is available
+            use_mmap = false;
+            LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__);
+        } else {
+            // Disable DirectIO and reopen file using std::fopen for mmap
+            use_direct_io = false;
+            files.pop_back();
+            files.emplace_back(new llama_file(fname.c_str(), "rb", false));
+            LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__);
+        }
     }
 
     // Save tensors data offset of the main file.

From 3d55846a5c626e2e608db8c24fa9ee6defaacca9 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Mon, 19 Jan 2026 13:12:38 +0100
Subject: [PATCH 02/17] model-conversion : add BUILD_DIR variable to
 run-converted-model scripts (#18927)

This commit adds a BUILD_DIR variable to the scripts used for running
converted models.

The motivation for this is that currently the `build` directory is
hardcoded and it can be useful to specify a different build directory,
with builds for different configurations.
---
 .../causal/run-converted-model-embeddings-logits.sh   |  9 +++++++--
 .../scripts/causal/run-converted-model.sh             | 11 ++++++++---
 .../scripts/embedding/run-converted-model.sh          |  5 +++--
 3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh b/examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh
index 3cce3fc94..1b5ff8611 100755
--- a/examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh
+++ b/examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh
@@ -4,6 +4,7 @@ set -e
 
 # First try command line argument, then environment variable, then file
 CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
+BUILD_DIR="${2:-"$BUILD_DIR"}"
 
 # Final check if we have a model path
 if [ -z "$CONVERTED_MODEL" ]; then
@@ -13,6 +14,10 @@ if [ -z "$CONVERTED_MODEL" ]; then
     exit 1
 fi
 
-cmake --build ../../build --target llama-debug -j8
+if [ -z "$BUILD_DIR" ]; then
+    BUILD_DIR="../../build"
+fi
 
-../../build/bin/llama-debug -m $CONVERTED_MODEL --embedding -p "Hello world today" --save-logits
+cmake --build ${BUILD_DIR} --target llama-debug -j8
+
+${BUILD_DIR}/bin/llama-debug -m $CONVERTED_MODEL --embedding -p "Hello world today" --save-logits
diff --git a/examples/model-conversion/scripts/causal/run-converted-model.sh b/examples/model-conversion/scripts/causal/run-converted-model.sh
index b6c3d3866..b684804e0 100755
--- a/examples/model-conversion/scripts/causal/run-converted-model.sh
+++ b/examples/model-conversion/scripts/causal/run-converted-model.sh
@@ -5,11 +5,16 @@ set -e
 # First try command line argument, then environment variable, then file
 CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"
 MODEL_TESTING_PROMPT="${2:-"$MODEL_TESTING_PROMPT"}"
+BUILD_DIR="${3:-"$BUILD_DIR"}"
 
-if [ -z "$MODEL_TESTING_PROMPT"]; then
+if [ -z "$MODEL_TESTING_PROMPT" ]; then
     MODEL_TESTING_PROMPT="Hello, my name is"
 fi
 
+if [ -z "$BUILD_DIR" ]; then
+    BUILD_DIR="../../build"
+fi
+
 # Final check if we have a model path
 if [ -z "$CONVERTED_MODEL" ]; then
     echo "Error: Model path must be provided either as:" >&2
@@ -21,6 +26,6 @@ fi
 echo $CONVERTED_MODEL
 echo $MODEL_TESTING_PROMPT
 
-cmake --build ../../build --target llama-debug -j8
+cmake --build ${BUILD_DIR} --target llama-debug -j8
 
-../../build/bin/llama-debug -m "$CONVERTED_MODEL" -p "$MODEL_TESTING_PROMPT" --save-logits
+${BUILD_DIR}/bin/llama-debug -m "$CONVERTED_MODEL" -p "$MODEL_TESTING_PROMPT" --save-logits
diff --git a/examples/model-conversion/scripts/embedding/run-converted-model.sh b/examples/model-conversion/scripts/embedding/run-converted-model.sh
index 84625cec3..ba8a3afae 100755
--- a/examples/model-conversion/scripts/embedding/run-converted-model.sh
+++ b/examples/model-conversion/scripts/embedding/run-converted-model.sh
@@ -28,6 +28,7 @@ done
 
 # First try command line argument, then environment variable
 CONVERTED_MODEL="${CONVERTED_MODEL:-"$CONVERTED_EMBEDDING_MODEL"}"
+BUILD_DIR="${BUILD_DIR:-"../../build"}"
 
 # Final check if we have a model path
 if [ -z "$CONVERTED_MODEL" ]; then
@@ -50,5 +51,5 @@ fi
 
 echo $CONVERTED_MODEL
 
-cmake --build ../../build --target llama-debug -j8
-../../build/bin/llama-debug -m "$CONVERTED_MODEL" --embedding -p "$PROMPT" --save-logits --embd-normalize $EMBD_NORMALIZE
+cmake --build ${BUILD_DIR} --target llama-debug -j8
+${BUILD_DIR}/bin/llama-debug -m "$CONVERTED_MODEL" --embedding -p "$PROMPT" --save-logits --embd-normalize $EMBD_NORMALIZE

From 365a3e8c319ddb5442afdf17d0d23dfa0ff26c78 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 19 Jan 2026 20:03:19 +0200
Subject: [PATCH 03/17] ggml : add ggml_build_forward_select (#18550)

* ggml : add ggml_build_forward_select

* cuda : adapt CUDA graph compat to new feature

* vulkan : update logic to handle command buffer closing

* ggml : check compute for fusion

* ggml : add comment
---
 ggml/include/ggml.h                    | 46 ++++++++++++++---
 ggml/src/ggml-backend.cpp              |  5 +-
 ggml/src/ggml-blas/ggml-blas.cpp       |  4 ++
 ggml/src/ggml-cann/ggml-cann.cpp       |  4 ++
 ggml/src/ggml-cpu/ggml-cpu.c           |  4 ++
 ggml/src/ggml-cuda/common.cuh          |  1 +
 ggml/src/ggml-cuda/ggml-cuda.cu        |  8 +++
 ggml/src/ggml-hexagon/ggml-hexagon.cpp |  4 ++
 ggml/src/ggml-impl.h                   |  3 ++
 ggml/src/ggml-metal/ggml-metal-ops.cpp |  4 ++
 ggml/src/ggml-opencl/ggml-opencl.cpp   |  4 ++
 ggml/src/ggml-sycl/ggml-sycl.cpp       |  3 ++
 ggml/src/ggml-vulkan/ggml-vulkan.cpp   |  5 +-
 ggml/src/ggml-webgpu/ggml-webgpu.cpp   |  3 ++
 ggml/src/ggml-zdnn/ggml-zdnn.cpp       |  4 ++
 ggml/src/ggml-zendnn/ggml-zendnn.cpp   |  4 ++
 ggml/src/ggml.c                        | 70 +++++++++++++++++++-------
 17 files changed, 148 insertions(+), 28 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index b69583dd3..1988d16dc 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -630,10 +630,11 @@ extern "C" {
 
     // this tensor...
     enum ggml_tensor_flag {
-        GGML_TENSOR_FLAG_INPUT  =  1, // ...is an input for the GGML compute graph
-        GGML_TENSOR_FLAG_OUTPUT =  2, // ...is an output for the GGML compute graph
-        GGML_TENSOR_FLAG_PARAM  =  4, // ...contains trainable parameters
-        GGML_TENSOR_FLAG_LOSS   =  8, // ...defines loss for numerical optimization (multiple loss tensors add up)
+        GGML_TENSOR_FLAG_INPUT   =  1, // ...is an input for the GGML compute graph
+        GGML_TENSOR_FLAG_OUTPUT  =  2, // ...is an output for the GGML compute graph
+        GGML_TENSOR_FLAG_PARAM   =  4, // ...contains trainable parameters
+        GGML_TENSOR_FLAG_LOSS    =  8, // ...defines loss for numerical optimization (multiple loss tensors add up)
+        GGML_TENSOR_FLAG_COMPUTE = 16, // ...must be computed
     };
 
     enum ggml_tri_type {
@@ -2577,11 +2578,42 @@ extern "C" {
         struct ggml_tensor *  grad,
         struct ggml_tensor *  sgd_params); // alpha, weight decay
 
+    // build forward mutiple tensors and select one of them for computing
+    // this is useful for creating graphs that have constant topology but compute different things based on the input
+    // ref: https://github.com/ggml-org/llama.cpp/pull/18550
     //
-    // automatic differentiation
+    // nodes:
+    //   | - build forward into the graph but do not compute
+    //   c - build forward into the graph and compute
     //
+    //    |  |  ...  c  ...  |
+    //    |  |  ...  c  ...  |
+    //    |  |  ...  c  ...  |
+    //   [0  1  ... idx ...  n-1]        <-- ggml_build_forward_select(..., n, idx)
+    //               c
+    //               c
+    //
+    // example:
+    //   struct ggml_tensor * curs[3];
+    //
+    //   curs[0]  = compute0(...);
+    //   curs[1]  = compute1(...);
+    //   curs[2]  = compute2(...);
+    //
+    //   int idx = select_branch(some_input);
+    //
+    //   struct ggml_tensor * out = ggml_build_forward_select(cgraph, curs, 3, idx);
+    //
+    GGML_API struct ggml_tensor * ggml_build_forward_select(
+            struct ggml_cgraph  * cgraph,
+            struct ggml_tensor ** tensors,
+            int                   n_tensors,
+            int                   idx);
+
+    GGML_API void ggml_build_forward_expand(
+            struct ggml_cgraph * cgraph,
+            struct ggml_tensor * tensor);
 
-    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
     GGML_API void ggml_build_backward_expand(
         struct ggml_context *  ctx,        // context for gradient computation
         struct ggml_cgraph  *  cgraph,
@@ -2613,7 +2645,7 @@ extern "C" {
     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
 
     // dump the graph into a file using the dot format
-    GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
+    GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * cgraph, const char * filename);
 
     // TODO these functions were sandwiched in the old optimization interface, is there a better place for them?
     typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 1b59924b8..354876574 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -874,9 +874,9 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
         }
         if (sched->debug > 1) {
             ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
-            GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d:", i, ggml_op_name(node->op), node->name,
+            GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d,c=%d:", i, ggml_op_name(node->op), node->name,
                 fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node),
-                graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)]);
+                graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)], node->flags & GGML_TENSOR_FLAG_COMPUTE ? 1 : 0);
             for (int j = 0; j < GGML_MAX_SRC; j++) {
                 struct ggml_tensor * src = node->src[j];
                 if (src == NULL) {
@@ -1922,6 +1922,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
         dst->view_offs = src->view_offs;
     }
     dst->op = src->op;
+    dst->flags = src->flags;
     memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
     ggml_set_name(dst, src->name);
 
diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp
index 84956cbb9..2e9ddf224 100644
--- a/ggml/src/ggml-blas/ggml-blas.cpp
+++ b/ggml/src/ggml-blas/ggml-blas.cpp
@@ -226,6 +226,10 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];
 
+        if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+            continue;
+        }
+
         switch (node->op) {
             case GGML_OP_MUL_MAT:
                 ggml_backend_blas_mul_mat(ctx, node);
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index eba83327f..42c6c67a4 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -2146,6 +2146,10 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
                 continue;
             }
 
+            if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+                continue;
+            }
+
             bool ok = ggml_cann_compute_forward(*cann_ctx, node);
             if (!ok) {
                 GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index f7ba1fe31..4c7a75e76 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2943,6 +2943,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             continue;
         }
 
+        if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+            continue;
+        }
+
         ggml_compute_forward(&params, node);
 
         if (state->ith == 0 && cplan->abort_callback &&
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index eaaf87612..179522d83 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1123,6 +1123,7 @@ struct ggml_tensor_extra_gpu {
 struct ggml_cuda_graph_node_properties {
     void * node_address;
     ggml_op node_op;
+    int32_t flags;
     int64_t ne[GGML_MAX_DIMS];
     size_t nb[GGML_MAX_DIMS];
     void * src_address[GGML_MAX_SRC];
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index ed1021469..cda422def 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2918,6 +2918,7 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
 static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties * props, ggml_tensor * node) {
     props->node_address = node->data;
     props->node_op = node->op;
+    props->flags = node->flags;
     for (int i = 0; i < GGML_MAX_DIMS; i++) {
         props->ne[i] = node->ne[i];
         props->nb[i] = node->nb[i];
@@ -2961,6 +2962,10 @@ static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_
         return false;
     }
 
+    if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) != (props->flags & GGML_TENSOR_FLAG_COMPUTE)) {
+        return false;
+    }
+
     return true;
 }
 
@@ -3378,6 +3383,9 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
                     continue;
                 }
 
+                if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+                    continue;
+                }
 
                 // start of fusion operations
                 static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index cf1eb994c..5b835c11c 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2497,6 +2497,10 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
             continue;
         }
 
+        if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+            continue;
+        }
+
         uint32_t flags = 0;
 
         // skip quantizer if src1 is reused
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 80e0fd2ff..baadfe9a7 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -611,6 +611,9 @@ static inline bool ggml_can_fuse_ext(const struct ggml_cgraph * cgraph, const in
         if (node->op != ops[i]) {
             return false;
         }
+        if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+            return false;
+        }
         if (i < num_ops - 1 && !ggml_node_has_n_uses(cgraph, node_idxs[i], 1)) {
             return false;
         }
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index 680ad794d..3d97d3dfd 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -203,6 +203,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
         GGML_ABORT("unsupported op");
     }
 
+    if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+        return 1;
+    }
+
     int n_fuse = 1;
 
     // check if the current node can run concurrently with other nodes before it
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp
index d89d5e724..8059240b1 100644
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -3058,6 +3058,10 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm
             continue;
         }
 
+        if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+            continue;
+        }
+
         if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_NORM, GGML_OP_MUL, GGML_OP_ADD })) {
             ggml_opencl_op_norm_fused(backend, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
             i += 2;
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 8f8176b67..bb8acc922 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -4109,6 +4109,9 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * syc
         if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
             continue;
         }
+        if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+            continue;
+        }
 #ifndef NDEBUG
         assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
         for (int j = 0; j < GGML_MAX_SRC; j++) {
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 0fabbcec3..08fd044ca 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -12191,6 +12191,9 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
     if (ggml_is_empty(node) || ggml_op_is_empty(node->op) || !node->buffer) {
         return false;
     }
+    if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+        return false;
+    }
 
     VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
     ctx->semaphore_idx = 0;
@@ -13645,7 +13648,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
     int last_node = cgraph->n_nodes - 1;
 
     // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
-    while (last_node > 0 && ggml_vk_is_empty(cgraph->nodes[last_node])) {
+    while (last_node > 0 && (ggml_vk_is_empty(cgraph->nodes[last_node]) || ((cgraph->nodes[last_node]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0))) {
         last_node -= 1;
     }
 
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
index 1470378af..584cea769 100644
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -1982,6 +1982,9 @@ static std::optional<webgpu_command> ggml_webgpu_encode_node(webgpu_context ctx,
     if (ggml_is_empty(node)) {
         return std::nullopt;
     }
+    if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+        return std::nullopt;
+    }
     WEBGPU_LOG_DEBUG("ggml_webgpu_encode_node(" << node << ", " << ggml_op_name(node->op) << ")");
 
     ggml_tensor * src0 = node->src[0];
diff --git a/ggml/src/ggml-zdnn/ggml-zdnn.cpp b/ggml/src/ggml-zdnn/ggml-zdnn.cpp
index edbeb8eef..906d25417 100644
--- a/ggml/src/ggml-zdnn/ggml-zdnn.cpp
+++ b/ggml/src/ggml-zdnn/ggml-zdnn.cpp
@@ -58,6 +58,10 @@ static enum ggml_status ggml_zdnn_graph_compute(ggml_backend_t backend, ggml_cgr
             continue;
         }
 
+        if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+            continue;
+        }
+
         bool ok = ggml_zdnn_compute_forward(ctx, node);
         if (!ok) {
             GGML_LOG_ERROR("%s: unsupported op %s (%s)\n",
diff --git a/ggml/src/ggml-zendnn/ggml-zendnn.cpp b/ggml/src/ggml-zendnn/ggml-zendnn.cpp
index fd07f983d..afbecde7a 100644
--- a/ggml/src/ggml-zendnn/ggml-zendnn.cpp
+++ b/ggml/src/ggml-zendnn/ggml-zendnn.cpp
@@ -211,6 +211,10 @@ static ggml_status ggml_backend_zendnn_graph_compute(ggml_backend_t backend, ggm
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];
 
+        if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+            continue;
+        }
+
         switch (node->op) {
             case GGML_OP_MUL_MAT:
                 ggml_zendnn_compute_forward_mul_mat(ctx, node);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index c75fe7d27..1725ad165 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -3441,7 +3441,8 @@ struct ggml_tensor * ggml_cast(
 
     result->op     = GGML_OP_CPY;
     result->src[0] = a;
-    result->src[1] = result;
+    result->src[1] = result; // note: this self-reference might seem redundant, but it's actually needed by some
+                             //       backends for consistency with ggml_cpy_impl() above
 
     return result;
 }
@@ -6725,20 +6726,35 @@ static void ggml_compute_backward(
     GGML_ASSERT(!src2_needs_grads || ggml_are_same_shape(src2, cgraph->grads[isrc2]));
 }
 
-static size_t ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
-    // check if already visited
-    size_t node_hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node);
+static size_t ggml_visit_parents_graph(struct ggml_cgraph * cgraph, struct ggml_tensor * node, bool compute) {
+    if (node->op != GGML_OP_NONE && compute) {
+        node->flags |= GGML_TENSOR_FLAG_COMPUTE;
+    }
+
+    const size_t node_hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node);
     GGML_ASSERT(node_hash_pos != GGML_HASHSET_FULL);
-    if (!ggml_bitset_get(cgraph->visited_hash_set.used, node_hash_pos)) {
-        // This is the first time we see this node in the current graph.
-        cgraph->visited_hash_set.keys[node_hash_pos] = node;
-        ggml_bitset_set(cgraph->visited_hash_set.used, node_hash_pos);
-        cgraph->use_counts[node_hash_pos] = 0;
-    } else {
+
+    if (ggml_bitset_get(cgraph->visited_hash_set.used, node_hash_pos)) {
         // already visited
+
+        if (compute) {
+            // update the compute flag regardless
+            for (int i = 0; i < GGML_MAX_SRC; ++i) {
+                struct ggml_tensor * src = node->src[i];
+                if (src && ((src->flags & GGML_TENSOR_FLAG_COMPUTE) == 0)) {
+                    ggml_visit_parents_graph(cgraph, src, true);
+                }
+            }
+        }
+
         return node_hash_pos;
     }
 
+    // This is the first time we see this node in the current graph.
+    cgraph->visited_hash_set.keys[node_hash_pos] = node;
+    ggml_bitset_set(cgraph->visited_hash_set.used, node_hash_pos);
+    cgraph->use_counts[node_hash_pos] = 0;
+
     for (int i = 0; i < GGML_MAX_SRC; ++i) {
         const int k =
             (cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i :
@@ -6747,7 +6763,7 @@ static size_t ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor
 
         struct ggml_tensor * src = node->src[k];
         if (src) {
-            size_t src_hash_pos = ggml_visit_parents(cgraph, src);
+            const size_t src_hash_pos = ggml_visit_parents_graph(cgraph, src, compute);
 
             // Update the use count for this operand.
             cgraph->use_counts[src_hash_pos]++;
@@ -6778,17 +6794,17 @@ static size_t ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor
     return node_hash_pos;
 }
 
-static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
+static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand, bool compute) {
     if (!expand) {
         // TODO: this branch isn't accessible anymore, maybe move this to ggml_build_forward_expand
         ggml_graph_clear(cgraph);
     }
 
-    const int n0 = cgraph->n_nodes;
+    const int n_old = cgraph->n_nodes;
 
-    ggml_visit_parents(cgraph, tensor);
+    ggml_visit_parents_graph(cgraph, tensor, compute);
 
-    const int n_new = cgraph->n_nodes - n0;
+    const int n_new = cgraph->n_nodes - n_old;
     GGML_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new);
 
     if (n_new > 0) {
@@ -6797,8 +6813,22 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten
     }
 }
 
+struct ggml_tensor * ggml_build_forward_select(
+        struct ggml_cgraph  * cgraph,
+        struct ggml_tensor ** tensors,
+        int                   n_tensors,
+        int                   idx) {
+    GGML_ASSERT(idx >= 0 && idx < n_tensors);
+
+    for (int i = 0; i < n_tensors; i++) {
+        ggml_build_forward_impl(cgraph, tensors[i], true, i == idx ? true : false);
+    }
+
+    return tensors[idx];
+}
+
 void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
-    ggml_build_forward_impl(cgraph, tensor, true);
+    ggml_build_forward_impl(cgraph, tensor, true, true);
 }
 
 void ggml_build_backward_expand(
@@ -7229,6 +7259,10 @@ bool ggml_can_fuse_subgraph_ext(const struct ggml_cgraph * cgraph,
             return false;
         }
 
+        if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+            return false;
+        }
+
         if (ggml_node_list_find_tensor(cgraph, outputs, num_outputs, node) != -1) {
             continue;
         }
@@ -7310,7 +7344,7 @@ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node,
             label);
 }
 
-void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
+void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * cgraph, const char * filename) {
     char color[16];
 
     FILE * fp = ggml_fopen(filename, "w");
@@ -7331,7 +7365,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
         if (node->flags & GGML_TENSOR_FLAG_PARAM) {
             snprintf(color, sizeof(color), "yellow");
         } else if (grad) {
-            if (ggml_graph_find(gf, node)) {
+            if (ggml_graph_find(cgraph, node)) {
                 snprintf(color, sizeof(color), "green");
             } else {
                 snprintf(color, sizeof(color), "lightblue");

From 18361c579cc7ed0a06ed9085eaf900326b537fa7 Mon Sep 17 00:00:00 2001
From: Lennart Austenfeld <53152202+l-austenfeld@users.noreply.github.com>
Date: Mon, 19 Jan 2026 19:13:31 +0100
Subject: [PATCH 04/17] server: fix memory reservations in populate_token_probs
 (#18787)

---
 tools/server/server-context.cpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 82294d940..c790ac79e 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1326,11 +1326,12 @@ private:
     }
 
     void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) const {
-        const size_t n_probs = slot.task->params.sampling.n_probs;
+        const size_t n_probs_request = slot.task->params.sampling.n_probs;
 
         if (post_sampling) {
             const auto * cur_p = common_sampler_get_candidates(slot.smpl.get(), true);
             const size_t max_probs = cur_p->size;
+            const size_t n_probs = std::min(max_probs, n_probs_request);
 
             // set probability for sampled token
             for (size_t i = 0; i < max_probs; i++) {
@@ -1341,8 +1342,8 @@ private:
             }
 
             // set probability for top n_probs tokens
-            result.probs.reserve(max_probs);
-            for (size_t i = 0; i < std::min(max_probs, n_probs); i++) {
+            result.probs.reserve(n_probs);
+            for (size_t i = 0; i < n_probs; i++) {
                 result.probs.push_back({
                     cur_p->data[i].id,
                     common_token_to_piece(ctx, cur_p->data[i].id, special),
@@ -1352,9 +1353,11 @@ private:
         } else {
             // TODO: optimize this with min-p optimization
             std::vector<llama_token_data> cur = get_token_probabilities(ctx, idx);
+            const size_t max_probs = cur.size();
+            const size_t n_probs = std::min(max_probs, n_probs_request);
 
             // set probability for sampled token
-            for (size_t i = 0; i < cur.size(); i++) {
+            for (size_t i = 0; i < max_probs; i++) {
                 // set probability for sampled token
                 if (cur[i].id == result.tok) {
                     result.prob = cur[i].p;
@@ -1364,7 +1367,7 @@ private:
 
             // set probability for top n_probs tokens
             result.probs.reserve(n_probs);
-            for (size_t i = 0; i < std::min(cur.size(), n_probs); i++) {
+            for (size_t i = 0; i < n_probs; i++) {
                 result.probs.push_back({
                     cur[i].id,
                     common_token_to_piece(ctx, cur[i].id, special),

From 4037093c66bfe53b9b27d72765416815fdb0398f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <sigbjorn.skjaeret@scala.com>
Date: Mon, 19 Jan 2026 20:29:15 +0100
Subject: [PATCH 05/17] ci : run test-jinja -py on high perf [no ci] (#18916)

---
 ci/run.sh            | 2 +-
 tests/CMakeLists.txt | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/ci/run.sh b/ci/run.sh
index 6ca6ea566..dfcf95966 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -254,7 +254,7 @@ function gg_run_ctest_release {
     (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log
 
     if [ -z ${GG_BUILD_LOW_PERF} ]; then
-        (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+        (time ctest --output-on-failure -L 'main|python' ) 2>&1 | tee -a $OUT/${ci}-ctest.log
     else
         (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
     fi
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 3eae18eef..c9436c599 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -187,6 +187,7 @@ llama_build_and_test(test-chat-parser.cpp)
 llama_build_and_test(test-chat-peg-parser.cpp peg-parser/simple-tokenize.cpp)
 llama_build_and_test(test-chat-template.cpp)
 llama_build_and_test(test-jinja.cpp)
+llama_test(test-jinja NAME test-jinja-py ARGS -py LABEL python)
 llama_build_and_test(test-json-partial.cpp)
 llama_build_and_test(test-log.cpp)
 llama_build_and_test(

From 959ecf7f234dc0bc0cd6829b25cb0ee1481aa78a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= <sigbjorn.skjaeret@scala.com>
Date: Mon, 19 Jan 2026 20:29:43 +0100
Subject: [PATCH 06/17] jinja : fix undefined keys and attributes and int/float
 as bool (#18924)

* fix undefined keys and attributes

* add falsy tests

* as_bool for integers and floats

* more falsy/truthy tests

* --typo
---
 common/jinja/runtime.cpp |  4 +--
 common/jinja/value.h     |  6 ++++
 tests/test-jinja.cpp     | 78 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 86 insertions(+), 2 deletions(-)

diff --git a/common/jinja/runtime.cpp b/common/jinja/runtime.cpp
index d8ef27908..e3e4ebf1e 100644
--- a/common/jinja/runtime.cpp
+++ b/common/jinja/runtime.cpp
@@ -805,7 +805,7 @@ value member_expression::execute_impl(context & ctx) {
         } else if (is_val<value_string>(property)) {
             auto key = property->as_string().str();
             JJ_DEBUG("Accessing %s built-in '%s'", is_val<value_array>(object) ? "array" : "string", key.c_str());
-            val = try_builtin_func(ctx, key, object);
+            val = try_builtin_func(ctx, key, object, true);
         } else {
             throw std::runtime_error("Cannot access property with non-string/non-number: got " + property->type());
         }
@@ -814,7 +814,7 @@ value member_expression::execute_impl(context & ctx) {
             throw std::runtime_error("Cannot access property with non-string: got " + property->type());
         }
         auto key = property->as_string().str();
-        val = try_builtin_func(ctx, key, object);
+        val = try_builtin_func(ctx, key, object, true);
     }
 
     if (ctx.is_get_stats && val && object && property) {
diff --git a/common/jinja/value.h b/common/jinja/value.h
index 4e916919b..7bd0202ce 100644
--- a/common/jinja/value.h
+++ b/common/jinja/value.h
@@ -203,6 +203,9 @@ struct value_int_t : public value_t {
     virtual int64_t as_int() const override { return val_int; }
     virtual double as_float() const override { return static_cast<double>(val_int); }
     virtual string as_string() const override { return std::to_string(val_int); }
+    virtual bool as_bool() const override {
+        return val_int != 0;
+    }
     virtual const func_builtins & get_builtins() const override;
 };
 using value_int = std::shared_ptr<value_int_t>;
@@ -219,6 +222,9 @@ struct value_float_t : public value_t {
         if (out.back() == '.') out.push_back('0'); // leave one zero if no decimals
         return out;
     }
+    virtual bool as_bool() const override {
+        return val_flt != 0.0;
+    }
     virtual const func_builtins & get_builtins() const override;
 };
 using value_float = std::shared_ptr<value_float_t>;
diff --git a/tests/test-jinja.cpp b/tests/test-jinja.cpp
index 13818381e..99630ecb3 100644
--- a/tests/test-jinja.cpp
+++ b/tests/test-jinja.cpp
@@ -191,6 +191,84 @@ static void test_conditionals(testing & t) {
         json::object(),
         "yes"
     );
+
+    test_template(t, "is undefined falsy",
+        "{{ 'yes' if not y else 'no' }}",
+        json::object(),
+        "yes"
+    );
+
+    test_template(t, "is undefined attribute falsy",
+        "{{ 'yes' if not y.x else 'no' }}",
+        {{"y", true}},
+        "yes"
+    );
+
+    test_template(t, "is undefined key falsy",
+        "{{ 'yes' if not y['x'] else 'no' }}",
+        {{"y", {{}}}},
+        "yes"
+    );
+
+    test_template(t, "is empty array falsy",
+        "{{ 'yes' if not y else 'no' }}",
+        {{"y", json::array()}},
+        "yes"
+    );
+
+    test_template(t, "is empty object falsy",
+        "{{ 'yes' if not y else 'no' }}",
+        {{"y", json::object()}},
+        "yes"
+    );
+
+    test_template(t, "is empty string falsy",
+        "{{ 'yes' if not y else 'no' }}",
+        {{"y", ""}},
+        "yes"
+    );
+
+    test_template(t, "is 0 falsy",
+        "{{ 'yes' if not y else 'no' }}",
+        {{"y", 0}},
+        "yes"
+    );
+
+    test_template(t, "is 0.0 falsy",
+        "{{ 'yes' if not y else 'no' }}",
+        {{"y", 0.0}},
+        "yes"
+    );
+
+    test_template(t, "is non-empty array truthy",
+        "{{ 'yes' if y else 'no' }}",
+        {{"y", json::array({""})}},
+        "yes"
+    );
+
+    test_template(t, "is non-empty object truthy",
+        "{{ 'yes' if y else 'no' }}",
+        {{"y", {"x", false}}},
+        "yes"
+    );
+
+    test_template(t, "is non-empty string truthy",
+        "{{ 'yes' if y else 'no' }}",
+        {{"y", "0"}},
+        "yes"
+    );
+
+    test_template(t, "is 1 truthy",
+        "{{ 'yes' if y else 'no' }}",
+        {{"y", 1}},
+        "yes"
+    );
+
+    test_template(t, "is 1.0 truthy",
+        "{{ 'yes' if y else 'no' }}",
+        {{"y", 1.0}},
+        "yes"
+    );
 }
 
 static void test_loops(testing & t) {

From 1706a6d7c68fc54374b0807324079b2c6ebf674a Mon Sep 17 00:00:00 2001
From: ddh0 <chemist-mulches-39@icloud.com>
Date: Mon, 19 Jan 2026 16:09:20 -0600
Subject: [PATCH 07/17] convert : support Glm4MoeLite (#18936)

* initial commit for branch

* add glm-4.7-flash, move tokenizer hash

* use `glm4` pretok

* silence flake8 E302 (CI)

* apply review feedback

* add <|user|> as eog

* also add EOG `<|observation|>`

* revert llama-vocab

* inherit vocab from glm4

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
---
 convert_hf_to_gguf.py        | 31 ++++++++++++++++++++++++++++++-
 convert_hf_to_gguf_update.py |  1 +
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 464ecbaab..becbad046 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1078,6 +1078,9 @@ class TextModel(ModelBase):
         if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df":
             # ref: https://huggingface.co/aari1995/German_Semantic_V3
             res = "jina-v2-de"
+        if chkhsh == "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267":
+            # ref: https://huggingface.co/zai-org/GLM-4.7-Flash
+            res = "glm4"
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
             res = "llama-bpe"
@@ -7458,7 +7461,7 @@ class DeepseekModel(TextModel):
     "DeepseekV3ForCausalLM",
     "KimiVLForConditionalGeneration",
     "YoutuForCausalLM",
-    "YoutuVLForConditionalGeneration"
+    "YoutuVLForConditionalGeneration",
 )
 class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
@@ -8446,6 +8449,32 @@ class Glm4MoeModel(TextModel):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("Glm4MoeLiteForCausalLM")
+class Glm4MoeLiteModel(DeepseekV2Model):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK2
+
+    # copied from Glm4MoeModel
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        # Special tokens
+        # Note: Using <|endoftext|> (151329) for eot causes endless generation
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"])  # 151331
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])  # 151336
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329
+        special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"])  # 151338
+
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+
 @ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
 class ChatGLMModel(TextModel):
     model_arch = gguf.MODEL_ARCH.CHATGLM
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index aa9843ea1..2811f7f88 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -170,6 +170,7 @@ pre_computed_hashes = [
     {"name": "grok-2",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
     # jina-v2-de variants
     {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
+    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.7-Flash", "chkhsh": "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267"},
 ]
 
 

From 6df686bee68ff109f62123c7a8eac003f3dd9e20 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen <son@huggingface.co>
Date: Mon, 19 Jan 2026 23:28:01 +0100
Subject: [PATCH 08/17] server : refactor oai_parser_opt, move it to
 server_chat_params (#18937)

* server_chat_params

* move chat format into CLI

* use meta whenever possible

* clean up, no more chatml fallback
---
 common/chat.cpp                 |  14 ++--
 common/chat.h                   |   2 +-
 tools/cli/cli.cpp               |  32 ++++++--
 tools/server/server-common.cpp  |   4 +-
 tools/server/server-common.h    |  14 ++--
 tools/server/server-context.cpp | 138 ++++++++++++++------------------
 tools/server/server-context.h   |   5 +-
 tools/server/server-task.h      |   6 +-
 8 files changed, 112 insertions(+), 103 deletions(-)

diff --git a/common/chat.cpp b/common/chat.cpp
index 28721ac7d..b29544dac 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -601,18 +601,18 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
     return tmpls->has_explicit_template;
 }
 
-const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant) {
-    if (variant != nullptr) {
-        if (strcmp(variant, "tool_use") == 0) {
+std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
+    if (!variant.empty()) {
+        if (variant == "tool_use") {
             if (tmpls->template_tool_use) {
-                return tmpls->template_tool_use->source().c_str();
+                return tmpls->template_tool_use->source();
             }
-            return nullptr;
+            return "";
         } else {
-            LOG_DBG("%s: unknown template variant: %s\n", __func__, variant);
+            LOG_DBG("%s: unknown template variant: %s\n", __func__, variant.c_str());
         }
     }
-    return tmpls->template_default->source().c_str();
+    return tmpls->template_default->source();
 }
 
 common_chat_templates_ptr common_chat_templates_init(
diff --git a/common/chat.h b/common/chat.h
index 454085e90..148801738 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -191,7 +191,7 @@ common_chat_templates_ptr common_chat_templates_init(
                                            const std::string & eos_token_override = "");
 
 bool         common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
-const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr);
+std::string  common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = "");
 
 
 struct common_chat_params      common_chat_templates_apply(
diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp
index 2f0ffea1c..caad29bac 100644
--- a/tools/cli/cli.cpp
+++ b/tools/cli/cli.cpp
@@ -71,14 +71,16 @@ struct cli_context {
 
     std::string generate_completion(result_timings & out_timings) {
         server_response_reader rd = ctx_server.get_response_reader();
+        auto formatted = format_chat();
         {
             // TODO: reduce some copies here in the future
             server_task task = server_task(SERVER_TASK_TYPE_COMPLETION);
-            task.id        = rd.get_new_id();
-            task.index     = 0;
-            task.params    = defaults;    // copy
-            task.cli_input = messages;    // copy
-            task.cli_files = input_files; // copy
+            task.id         = rd.get_new_id();
+            task.index      = 0;
+            task.params     = defaults;         // copy
+            task.cli_prompt = formatted.prompt; // copy
+            task.cli_files  = input_files;      // copy
+            task.cli        = true;
             rd.post_task({std::move(task)});
         }
 
@@ -156,6 +158,26 @@ struct cli_context {
             return content;
         }
     }
+
+    common_chat_params format_chat() {
+        auto meta = ctx_server.get_meta();
+        auto & chat_params = meta.chat_params;
+
+        common_chat_templates_inputs inputs;
+        inputs.messages              = common_chat_msgs_parse_oaicompat(messages);
+        inputs.tools                 = {}; // TODO
+        inputs.tool_choice           = COMMON_CHAT_TOOL_CHOICE_NONE;
+        inputs.json_schema           = ""; // TODO
+        inputs.grammar               = ""; // TODO
+        inputs.use_jinja             = chat_params.use_jinja;
+        inputs.parallel_tool_calls   = false;
+        inputs.add_generation_prompt = true;
+        inputs.reasoning_format      = chat_params.reasoning_format;
+        inputs.enable_thinking       = chat_params.enable_thinking;
+
+        // Apply chat template to the list of messages
+        return common_chat_templates_apply(chat_params.tmpls.get(), inputs);
+    }
 };
 
 int main(int argc, char ** argv) {
diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index 16b0db298..1bbe85322 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -831,7 +831,7 @@ static void handle_media(
 // used by /chat/completions endpoint
 json oaicompat_chat_params_parse(
     json & body, /* openai api json semantics */
-    const oaicompat_parser_options & opt,
+    const server_chat_params & opt,
     std::vector<raw_buffer> & out_files)
 {
     json llama_params;
@@ -1012,7 +1012,7 @@ json oaicompat_chat_params_parse(
     }
 
     // Apply chat template to the list of messages
-    auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
+    auto chat_params = common_chat_templates_apply(opt.tmpls.get(), inputs);
 
     /* Append assistant prefilled message */
     if (prefill_assistant_message) {
diff --git a/tools/server/server-common.h b/tools/server/server-common.h
index 152a2a3c4..7f4c07387 100644
--- a/tools/server/server-common.h
+++ b/tools/server/server-common.h
@@ -274,25 +274,25 @@ std::vector<server_tokens> tokenize_input_prompts(
 // OAI utils
 //
 
-// used by /completions endpoint
-json oaicompat_completion_params_parse(const json & body);
-
-struct oaicompat_parser_options {
+struct server_chat_params {
     bool use_jinja;
     bool prefill_assistant;
     common_reasoning_format reasoning_format;
-    std::map<std::string,std::string> chat_template_kwargs;
-    common_chat_templates * tmpls;
+    std::map<std::string, std::string> chat_template_kwargs; // mapping key --> json value
+    common_chat_templates_ptr tmpls;
     bool allow_image;
     bool allow_audio;
     bool enable_thinking = true;
     std::string media_path;
 };
 
+// used by /completions endpoint
+json oaicompat_completion_params_parse(const json & body);
+
 // used by /chat/completions endpoint
 json oaicompat_chat_params_parse(
     json & body, /* openai api json semantics */
-    const oaicompat_parser_options & opt,
+    const server_chat_params & opt,
     std::vector<raw_buffer> & out_files);
 
 // convert Anthropic Messages API format to OpenAI Chat Completions API format
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index c790ac79e..f1f677add 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -534,8 +534,8 @@ public:
     server_queue    queue_tasks;
     server_response queue_results;
 
-    common_chat_templates_ptr chat_templates;
-    oaicompat_parser_options  oai_parser_opt;
+    // note: chat_params must not be refreshed upon existing sleeping state
+    server_chat_params chat_params;
 
     ~server_context_impl() {
         if (!sleeping) {
@@ -688,15 +688,6 @@ private:
             llama_init_dft->free_context();
         }
 
-        chat_templates = common_chat_templates_init(model, params_base.chat_template);
-        try {
-            common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs);
-        } catch (const std::exception & e) {
-            SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what());
-            SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
-            chat_templates = common_chat_templates_init(model, "chatml");
-        }
-
         std::string & mmproj_path = params_base.mmproj.path;
         if (!mmproj_path.empty()) {
             if (!is_resume) {
@@ -845,30 +836,6 @@ private:
             model_name = model_path.filename().string();
         }
 
-        // thinking is enabled if:
-        // 1. It's not explicitly disabled (reasoning_budget == 0)
-        // 2. The chat template supports it
-        const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
-        SRV_INF("thinking = %d\n", enable_thinking);
-
-        oai_parser_opt = {
-            /* use_jinja             */ params_base.use_jinja,
-            /* prefill_assistant     */ params_base.prefill_assistant,
-            /* reasoning_format      */ params_base.reasoning_format,
-            /* chat_template_kwargs  */ params_base.default_template_kwargs,
-            /* common_chat_templates */ chat_templates.get(),
-            /* allow_image           */ mctx ? mtmd_support_vision(mctx) : false,
-            /* allow_audio           */ mctx ? mtmd_support_audio (mctx) : false,
-            /* enable_thinking       */ enable_thinking,
-            /* media_path            */ params_base.media_path,
-        };
-
-        // print sample chat example to make it clear which template is used
-        // @ngxson modern templates are too long, spam the logs; printing the example is enough
-        LOG_INF("%s: chat template, example_format: '%s'\n", __func__,
-        //      common_chat_templates_source(chat_templates.get()),
-                common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str());
-
         if (!is_resume) {
             return init();
         }
@@ -907,6 +874,42 @@ private:
             }
         }
 
+        // populate chat template params
+        {
+            common_chat_templates_ptr chat_templates;
+
+            try {
+                chat_templates = common_chat_templates_init(model, params_base.chat_template);
+
+                LOG_INF("%s: chat template, example_format: '%s'\n", __func__,
+                    common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str());
+
+            } catch (const std::exception & e) {
+                SRV_ERR("%s: chat template parsing error: %s\n", __func__, e.what());
+                SRV_ERR("%s: please consider disabling jinja via --no-jinja, or use a custom chat template via --chat-template\n", __func__);
+                SRV_ERR("%s: for example: --no-jinja --chat-template chatml\n", __func__);
+                return false;
+            }
+
+            // thinking is enabled if:
+            // 1. It's not explicitly disabled (reasoning_budget == 0)
+            // 2. The chat template supports it
+            const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
+            SRV_INF("%s: chat template, thinking = %d\n", __func__, enable_thinking);
+
+            chat_params = {
+                /* use_jinja             */ params_base.use_jinja,
+                /* prefill_assistant     */ params_base.prefill_assistant,
+                /* reasoning_format      */ params_base.reasoning_format,
+                /* chat_template_kwargs  */ params_base.default_template_kwargs,
+                /* tmpls                 */ std::move(chat_templates),
+                /* allow_image           */ mctx ? mtmd_support_vision(mctx) : false,
+                /* allow_audio           */ mctx ? mtmd_support_audio (mctx) : false,
+                /* enable_thinking       */ enable_thinking,
+                /* media_path            */ params_base.media_path,
+            };
+        }
+
         return true;
     }
 
@@ -1588,32 +1591,14 @@ private:
 
     // tokenize the input if it's set by CLI, return false on error
     bool tokenize_cli_input(server_task & task) {
-        GGML_ASSERT(task.cli_input != nullptr);
         try {
-            auto & opt = oai_parser_opt;
-            common_chat_templates_inputs inputs;
-            inputs.messages              = common_chat_msgs_parse_oaicompat(task.cli_input);
-            inputs.tools                 = {}; // TODO
-            inputs.tool_choice           = COMMON_CHAT_TOOL_CHOICE_NONE;
-            inputs.json_schema           = ""; // TODO
-            inputs.grammar               = ""; // TODO
-            inputs.use_jinja             = opt.use_jinja;
-            inputs.parallel_tool_calls   = false;
-            inputs.add_generation_prompt = true;
-            inputs.reasoning_format      = opt.reasoning_format;
-            inputs.enable_thinking       = opt.enable_thinking;
-
-            // Apply chat template to the list of messages
-            auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
-
-            // tokenize the resulting prompt
-            auto & prompt = chat_params.prompt;
+            auto & prompt = task.cli_prompt;
             if (mctx != nullptr) {
                 task.tokens = process_mtmd_prompt(mctx, prompt, task.cli_files);
             } else {
                 task.tokens = std::move(tokenize_input_prompts(vocab, mctx, prompt, true, true)[0]);
             }
-            task.cli_input.clear();
+            task.cli_prompt.clear();
             task.cli_files.clear();
         } catch (const std::exception & e) {
             send_error(task, std::string("Failed to format input: ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
@@ -1689,7 +1674,7 @@ private:
                 {
                     // special case: if input is provided via CLI, tokenize it first
                     // otherwise, no need to tokenize as it's already done inside the HTTP thread
-                    if (task.cli_input != nullptr) {
+                    if (task.cli) {
                         if (!tokenize_cli_input(task)) {
                             break;
                         }
@@ -2901,8 +2886,6 @@ server_response_reader server_context::get_response_reader() {
 }
 
 server_context_meta server_context::get_meta() const {
-    auto tool_use_src = common_chat_templates_source(impl->chat_templates.get(), "tool_use");
-
     auto bos_id = llama_vocab_bos(impl->vocab);
     auto eos_id = llama_vocab_eos(impl->vocab);
     auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, bos_id, true) : "";
@@ -2913,14 +2896,13 @@ server_context_meta server_context::get_meta() const {
         /* model_name             */ impl->model_name,
         /* model_path             */ impl->params_base.model.path,
         /* has_mtmd               */ impl->mctx != nullptr,
-        /* has_inp_image          */ impl->oai_parser_opt.allow_image,
-        /* has_inp_audio          */ impl->oai_parser_opt.allow_audio,
+        /* has_inp_image          */ impl->chat_params.allow_image,
+        /* has_inp_audio          */ impl->chat_params.allow_audio,
         /* json_webui_settings    */ impl->json_webui_settings,
         /* slot_n_ctx             */ impl->get_slot_n_ctx(),
         /* pooling_type           */ llama_pooling_type(impl->ctx),
 
-        /* chat_template          */ common_chat_templates_source(impl->chat_templates.get()),
-        /* chat_template_tool_use */ tool_use_src ? tool_use_src : "",
+        /* chat_params            */ impl->chat_params,
 
         /* bos_token_str          */ bos_token_str,
         /* eos_token_str          */ eos_token_str,
@@ -3202,8 +3184,8 @@ void server_routes::init_routes() {
 
         // this endpoint can be accessed during sleeping
         // the next LOC is to avoid someone accidentally use ctx_server
-        bool server_ctx; // do NOT delete this line
-        GGML_UNUSED(server_ctx);
+        bool ctx_server; // do NOT delete this line
+        GGML_UNUSED(ctx_server);
 
         res->ok({{"status", "ok"}});
         return res;
@@ -3393,8 +3375,8 @@ void server_routes::init_routes() {
 
         // this endpoint can be accessed during sleeping
         // the next LOC is to avoid someone accidentally use ctx_server
-        bool server_ctx; // do NOT delete this line
-        GGML_UNUSED(server_ctx);
+        bool ctx_server; // do NOT delete this line
+        GGML_UNUSED(ctx_server);
 
         task_params tparams;
         tparams.sampling = params.sampling;
@@ -3403,6 +3385,9 @@ void server_routes::init_routes() {
             { "n_ctx",  meta->slot_n_ctx },
         };
 
+        std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), "");
+        std::string tmpl_tools   = common_chat_templates_source(meta->chat_params.tmpls.get(), "tool_use");
+
         json props = {
             { "default_generation_settings", default_generation_settings_for_props },
             { "total_slots",                 params.n_parallel },
@@ -3417,15 +3402,15 @@ void server_routes::init_routes() {
             { "endpoint_metrics",            params.endpoint_metrics },
             { "webui",                       params.webui },
             { "webui_settings",              meta->json_webui_settings },
-            { "chat_template",               meta->chat_template },
+            { "chat_template",               tmpl_default },
             { "bos_token",                   meta->bos_token_str },
             { "eos_token",                   meta->eos_token_str },
             { "build_info",                  meta->build_info },
             { "is_sleeping",                 queue_tasks.is_sleeping() },
         };
         if (params.use_jinja) {
-            if (!meta->chat_template_tool_use.empty()) {
-                props["chat_template_tool_use"] = meta->chat_template_tool_use;
+            if (!tmpl_tools.empty()) {
+                props["chat_template_tool_use"] = tmpl_tools;
             }
         }
         res->ok(props);
@@ -3446,6 +3431,7 @@ void server_routes::init_routes() {
 
     this->get_api_show = [this](const server_http_req &) {
         auto res = create_response();
+        std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), "");
         json data = {
             {
                 "model_info", {
@@ -3454,7 +3440,7 @@ void server_routes::init_routes() {
             },
             {"modelfile", ""},
             {"parameters", ""},
-            {"template", meta->chat_template},
+            {"template", tmpl_default},
             {"details", {
                 {"parent_model", ""},
                 {"format", "gguf"},
@@ -3579,7 +3565,7 @@ void server_routes::init_routes() {
         json body = json::parse(req.body);
         json body_parsed = oaicompat_chat_params_parse(
             body,
-            ctx_server.oai_parser_opt,
+            meta->chat_params,
             files);
         return handle_completions_impl(
             req,
@@ -3595,7 +3581,7 @@ void server_routes::init_routes() {
         json body = convert_anthropic_to_oai(json::parse(req.body));
         json body_parsed = oaicompat_chat_params_parse(
             body,
-            ctx_server.oai_parser_opt,
+            meta->chat_params,
             files);
         return handle_completions_impl(
             req,
@@ -3611,7 +3597,7 @@ void server_routes::init_routes() {
         json body = convert_anthropic_to_oai(json::parse(req.body));
         json body_parsed = oaicompat_chat_params_parse(
             body,
-            ctx_server.oai_parser_opt,
+            meta->chat_params,
             files);
 
         json prompt = body_parsed.at("prompt");
@@ -3627,7 +3613,7 @@ void server_routes::init_routes() {
         json body = json::parse(req.body);
         json data = oaicompat_chat_params_parse(
             body,
-            ctx_server.oai_parser_opt,
+            meta->chat_params,
             files);
         res->ok({{ "prompt", std::move(data.at("prompt")) }});
         return res;
@@ -3638,8 +3624,8 @@ void server_routes::init_routes() {
 
         // this endpoint can be accessed during sleeping
         // the next LOC is to avoid someone accidentally use ctx_server
-        bool server_ctx; // do NOT delete this line
-        GGML_UNUSED(server_ctx);
+        bool ctx_server; // do NOT delete this line
+        GGML_UNUSED(ctx_server);
 
         json models = {
             {"models", {
diff --git a/tools/server/server-context.h b/tools/server/server-context.h
index 09bec15ae..ec1df9695 100644
--- a/tools/server/server-context.h
+++ b/tools/server/server-context.h
@@ -20,9 +20,8 @@ struct server_context_meta {
     int slot_n_ctx;
     enum llama_pooling_type pooling_type;
 
-    // chat template
-    std::string chat_template;
-    std::string chat_template_tool_use;
+    // chat params
+    server_chat_params & chat_params;
 
     // tokens
     std::string bos_token_str;
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
index 11943ee4f..daffe0c90 100644
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -130,8 +130,10 @@ struct server_task {
     task_params   params;
     server_tokens tokens;
 
-    // only used by CLI, this delegates the tokenization to the server
-    json                    cli_input = nullptr;
+    // only used by CLI, this allow tokenizing CLI inputs on server side
+    // we need this because mtmd_context and vocab are not accessible outside of server_context
+    bool                    cli = false;
+    std::string             cli_prompt;
     std::vector<raw_buffer> cli_files;
 
     server_task_type type;

From 7dee9ff59ad507304bf43a2682dbe0a89bbc3dce Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Tue, 20 Jan 2026 06:55:24 +0100
Subject: [PATCH 09/17] convert : use n_groups instead of hardcoded values in
 reshape (#18929)

* convert : use n_groups instead of hardcoded values in reshape

This commit modifies the conversion script for NemotronHModel to use
the 'n_groups' hyperparameter, and allow Python to calculate the the
last dimension, using -1, when reshaping the 'mixer.norm.weight' tensor.

* use self.n_group instead of self.hparams["n_groups"]
---
 convert_hf_to_gguf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index becbad046..ab015dd2c 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -9212,7 +9212,7 @@ class NemotronHModel(GraniteHybridModel):
                 return [(mapped_name, reshaped_data)]
 
             if name.endswith("mixer.norm.weight"):
-                reshaped_data = data_torch.reshape(8, 512)
+                reshaped_data = data_torch.reshape(self.n_group, -1)
                 mapped_name = self.map_tensor_name(name)
                 return [(mapped_name, reshaped_data)]
 

From 271191906c3ff0a02916622f703166b6891fce0e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 20 Jan 2026 12:21:28 +0200
Subject: [PATCH 10/17] metal : enable FA for MLA heads (#18950)

---
 ggml/src/ggml-metal/ggml-metal-device.m |  8 ++------
 ggml/src/ggml-metal/ggml-metal-ops.cpp  |  2 +-
 ggml/src/ggml-metal/ggml-metal.metal    | 13 ++++++++-----
 3 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
index c418afe9c..eb4e2c209 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -1078,12 +1078,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
                 op->src[0]->ne[0] != 112 &&
                 op->src[0]->ne[0] != 128 &&
                 op->src[0]->ne[0] != 192 &&
-                op->src[0]->ne[0] != 256) {
-                return false;
-            }
-            if (op->src[0]->ne[0] == 576) {
-                // DeepSeek sizes
-                // TODO: disabled for now, until optmized
+                op->src[0]->ne[0] != 256 &&
+                op->src[0]->ne[0] != 576) {
                 return false;
             }
             if (op->src[1]->type != op->src[2]->type) {
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index 3d97d3dfd..7f4cfbba2 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -2520,7 +2520,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
 
         // simdgroups per threadgroup (a.k.a. warps)
         //nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4;
-        int32_t nsg = 4;
+        int32_t nsg = ne00 >= 512 ? 8 : 4;
 
         const size_t smem = FATTN_SMEM(nsg);
 
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index a4e1cafe5..17e358d1a 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -5552,9 +5552,7 @@ void kernel_flash_attn_ext_impl(
 
                 constexpr short NC = (C/8)/NSG;
 
-                // note: do not unroll for large heads
-                #pragma unroll (DK <= 64 ? NC : 1)
-                for (short cc = 0; cc < NC; ++cc) {
+                FOR_UNROLL (short cc = 0; cc < NC; ++cc) {
                     qk8x8_t mqk = make_filled_simdgroup_matrix<qk_t, 8>((qk_t) 0.0f);
 
                     if (DK % 16 != 0) {
@@ -5575,7 +5573,9 @@ void kernel_flash_attn_ext_impl(
                         k8x8_t mk[2];
                         q8x8_t mq[2];
 
-                        FOR_UNROLL (short i = 0; i < DK8/2; ++i) {
+                        // note: too much unroll can tank the performance for large heads
+                        #pragma unroll (MIN(DK8/2, 4*NSG))
+                        for (short i = 0; i < DK8/2; ++i) {
                             simdgroup_barrier(mem_flags::mem_none);
 
                             simdgroup_load(mq[0], pq + 0*8 + 16*i, DK);
@@ -5749,7 +5749,9 @@ void kernel_flash_attn_ext_impl(
                                 pv  += 8*NS20;
                             }
                         } else {
-                            FOR_UNROLL (short cc = 0; cc < (C/8)/2; ++cc) {
+                            constexpr short NC = (C/8)/2;
+
+                            FOR_UNROLL (short cc = 0; cc < NC; ++cc) {
                                 s8x8_t vs[2];
 
                                 simdgroup_load(vs[0], ss + 16*cc + 0, SH, 0, false);
@@ -5952,6 +5954,7 @@ kernel void kernel_flash_attn_ext(
       //case 1: kernel_flash_attn_ext_impl<FWD_TMPL, 1>(FWD_ARGS); break;
       //case 2: kernel_flash_attn_ext_impl<FWD_TMPL, 2>(FWD_ARGS); break;
         case 4: kernel_flash_attn_ext_impl<FWD_TMPL, 4>(FWD_ARGS); break;
+        case 8: kernel_flash_attn_ext_impl<FWD_TMPL, 8>(FWD_ARGS); break;
     }
 #undef FWD_TMPL
 #undef FWD_ARGS

From 08f3f4a8a30633491b031bf833441de2a1ab5029 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= <angt@huggingface.co>
Date: Tue, 20 Jan 2026 11:42:49 +0100
Subject: [PATCH 11/17] ggml : cleanup path_str() (#18928)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove pragmas as `std::codecvt_utf8` is not used.
- Avoid implicit `strlen()`.

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
---
 ggml/src/ggml-backend-reg.cpp | 24 ++++--------------------
 1 file changed, 4 insertions(+), 20 deletions(-)

diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 4181a714a..6bee1bc4b 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -77,39 +77,23 @@
 #include "ggml-zendnn.h"
 #endif
 
-// disable C++17 deprecation warning for std::codecvt_utf8
-#if defined(__clang__)
-#    pragma clang diagnostic push
-#    pragma clang diagnostic ignored "-Wdeprecated-declarations"
-#elif defined(__GNUC__)
-#    pragma GCC diagnostic push
-#    pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
 namespace fs = std::filesystem;
 
 static std::string path_str(const fs::path & path) {
-    std::string u8path;
     try {
 #if defined(__cpp_lib_char8_t)
         // C++20 and later: u8string() returns std::u8string
-        std::u8string u8str = path.u8string();
-        u8path = std::string(reinterpret_cast<const char*>(u8str.c_str()));
+        const std::u8string u8str = path.u8string();
+        return std::string(reinterpret_cast<const char *>(u8str.data()), u8str.size());
 #else
         // C++17: u8string() returns std::string
-        u8path = path.u8string();
+        return path.u8string();
 #endif
     } catch (...) {
+        return std::string();
     }
-    return u8path;
 }
 
-#if defined(__clang__)
-#    pragma clang diagnostic pop
-#elif defined(__GNUC__)
-#    pragma GCC diagnostic pop
-#endif
-
 #ifdef _WIN32
 
 using dl_handle = std::remove_pointer_t<HMODULE>;

From d1e3556481c8b351f9b7b69ba3febf6cb77fffa6 Mon Sep 17 00:00:00 2001
From: Oliver Simons <osimons@nvidia.com>
Date: Tue, 20 Jan 2026 13:11:01 +0100
Subject: [PATCH 12/17] CUDA: Replace init_offsets kernel with iterators in
 cub-based argsort (#18930)

* CUDA: Replace `init_offsets` with iterators in argsort

This is a QOL improvement, saving us the cost of materializing the
iterator

* Remove unnecessary include from top-k.cu
---
 ggml/src/ggml-cuda/argsort.cu | 22 +++++++---------------
 ggml/src/ggml-cuda/top-k.cu   |  1 -
 2 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
index 57c8a99a2..cf7a44f7a 100644
--- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu
@@ -14,12 +14,6 @@ static __global__ void init_indices(int * indices, const int ncols, const int nr
     }
 }
 
-static __global__ void init_offsets(int * offsets, const int ncols, const int nrows) {
-    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx <= nrows) {
-        offsets[idx] = idx * ncols;
-    }
-}
 
 #ifdef GGML_CUDA_USE_CUB
 void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
@@ -31,18 +25,15 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
                               cudaStream_t     stream) {
     ggml_cuda_pool_alloc<int>   temp_indices_alloc(pool, ncols * nrows);
     ggml_cuda_pool_alloc<float> temp_keys_alloc(pool, ncols * nrows);
-    ggml_cuda_pool_alloc<int>   offsets_alloc(pool, nrows + 1);
 
     int *   temp_indices = temp_indices_alloc.get();
     float * temp_keys    = temp_keys_alloc.get();
-    int *   d_offsets    = offsets_alloc.get();
 
     static const int block_size = 256;
     const dim3 grid_size((ncols + block_size - 1) / block_size, nrows);
     init_indices<<<grid_size, block_size, 0, stream>>>(temp_indices, ncols, nrows);
 
-    const dim3 offset_grid((nrows + block_size - 1) / block_size);
-    init_offsets<<<offset_grid, block_size, 0, stream>>>(d_offsets, ncols, nrows);
+    auto offset_iterator = cuda::make_strided_iterator(cuda::make_counting_iterator(0), ncols);
 
     CUDA_CHECK(cudaMemcpyAsync(temp_keys, x, ncols * nrows * sizeof(float), cudaMemcpyDeviceToDevice, stream));
 
@@ -57,7 +48,7 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
             DeviceSegmentedSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys,  // keys (in-place)
                                            temp_indices, dst,                                  // values (indices)
                                            ncols * nrows, nrows,  // num items, num segments
-                                           d_offsets, d_offsets + 1, stream);
+                                           offset_iterator, offset_iterator + 1, stream);
         }
     } else {
         if (nrows == 1) {
@@ -66,7 +57,8 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
                                                  ncols, 0, sizeof(float) * 8, stream);
         } else {
             DeviceSegmentedSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices,
-                                                     dst, ncols * nrows, nrows, d_offsets, d_offsets + 1, stream);
+                                                     dst, ncols * nrows, nrows, offset_iterator, offset_iterator + 1,
+                                                     stream);
         }
     }
 
@@ -80,7 +72,7 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
                                        ncols, 0, sizeof(float) * 8, stream);
         } else {
             DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst,
-                                           ncols * nrows, nrows, d_offsets, d_offsets + 1, stream);
+                                           ncols * nrows, nrows, offset_iterator, offset_iterator + 1, stream);
         }
     } else {
         if (nrows == 1) {
@@ -89,8 +81,8 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
                                                  ncols, 0, sizeof(float) * 8, stream);
         } else {
             DeviceSegmentedSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
-                                                     temp_indices, dst, ncols * nrows, nrows, d_offsets, d_offsets + 1,
-                                                     stream);
+                                                     temp_indices, dst, ncols * nrows, nrows, offset_iterator,
+                                                     offset_iterator + 1, stream);
         }
     }
 }
diff --git a/ggml/src/ggml-cuda/top-k.cu b/ggml/src/ggml-cuda/top-k.cu
index 318ac3869..785a18389 100644
--- a/ggml/src/ggml-cuda/top-k.cu
+++ b/ggml/src/ggml-cuda/top-k.cu
@@ -4,7 +4,6 @@
 #ifdef GGML_CUDA_USE_CUB
 #    include <cub/cub.cuh>
 #    if (CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2)
-#        include <cuda/iterator>
 #        define CUB_TOP_K_AVAILABLE
 using namespace cub;
 #    endif  // CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2

From 2c1f1996535cd49132e6e7d1b28908bcd4f56819 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen <son@huggingface.co>
Date: Tue, 20 Jan 2026 18:23:25 +0100
Subject: [PATCH 13/17] cli : fix reasoning responses in CLI (#18961)

* cli : fix reasoning responses in CLI

* fix build

* fix build (2)
---
 common/chat-parser.cpp         |   6 +-
 common/chat-parser.h           |   8 +-
 common/chat.h                  |  23 +-
 common/common.h                |   1 +
 common/json-partial.h          |   1 +
 tests/test-chat-parser.cpp     | 240 +++++++++--------
 tests/test-chat-peg-parser.cpp |  12 +-
 tests/test-chat.cpp            | 454 +++++++++++++++++----------------
 tools/cli/cli.cpp              |  15 +-
 tools/server/server-common.h   |   1 +
 tools/server/server-task.cpp   |  34 +--
 tools/server/server-task.h     |  12 +-
 12 files changed, 417 insertions(+), 390 deletions(-)

diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp
index 2f073512e..c2d1e30f3 100644
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
@@ -129,7 +129,7 @@ static void parse_json_tool_calls(
     }
 }
 
-common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax)
+common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_parser_params & syntax)
     : input_(input), is_partial_(is_partial), syntax_(syntax)
 {
     result_.role = "assistant";
@@ -1611,7 +1611,7 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
     builder.finish();
 }
 
-common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
+common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & syntax) {
     if (syntax.format == COMMON_CHAT_FORMAT_PEG_SIMPLE ||
         syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE ||
         syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) {
@@ -1635,7 +1635,7 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
     return msg;
 }
 
-common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax) {
+common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_parser_params & syntax) {
     if (parser.empty()) {
         throw std::runtime_error("Failed to parse due to missing parser definition.");
     }
diff --git a/common/chat-parser.h b/common/chat-parser.h
index 78c4b74c2..3ed9c30a2 100644
--- a/common/chat-parser.h
+++ b/common/chat-parser.h
@@ -5,7 +5,7 @@
 #include "json-partial.h"
 #include "regex-partial.h"
 
-#include <nlohmann/json.hpp>
+#include <nlohmann/json_fwd.hpp>
 
 #include <optional>
 #include <string>
@@ -19,20 +19,20 @@ class common_chat_msg_partial_exception : public std::runtime_error {
 class common_chat_msg_parser {
     std::string input_;
     bool is_partial_;
-    common_chat_syntax syntax_;
+    common_chat_parser_params syntax_; // TODO: rename to params
     std::string healing_marker_;
 
     size_t pos_ = 0;
     common_chat_msg result_;
 
   public:
-    common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
+    common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
     const std::string & input() const { return input_; }
     size_t pos() const { return pos_; }
     const std::string & healing_marker() const { return healing_marker_; }
     const bool & is_partial() const { return is_partial_; }
     const common_chat_msg & result() const { return result_; }
-    const common_chat_syntax & syntax() const { return syntax_; }
+    const common_chat_parser_params & syntax() const { return syntax_; }
 
     void move_to(size_t pos) {
         if (pos > input_.size()) {
diff --git a/common/chat.h b/common/chat.h
index 148801738..ac19348ec 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -145,7 +145,7 @@ struct common_chat_templates_inputs {
     std::vector<common_chat_tool> tools;
     common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
     bool parallel_tool_calls = false;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool enable_thinking"
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
     std::map<std::string, std::string> chat_template_kwargs;
@@ -165,14 +165,21 @@ struct common_chat_params {
     std::string                         parser;
 };
 
-struct common_chat_syntax {
+// per-message parsing syntax
+// should be derived from common_chat_params
+struct common_chat_parser_params {
     common_chat_format       format                = COMMON_CHAT_FORMAT_CONTENT_ONLY;
-    common_reasoning_format  reasoning_format      = COMMON_REASONING_FORMAT_NONE;
+    common_reasoning_format  reasoning_format      = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool parse_reasoning"
     // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
     bool                     reasoning_in_content  = false;
     bool                     thinking_forced_open  = false;
     bool                     parse_tool_calls      = true;
     common_peg_arena         parser                = {};
+    common_chat_parser_params() = default;
+    common_chat_parser_params(const common_chat_params & chat_params) {
+        format               = chat_params.format;
+        thinking_forced_open = chat_params.thinking_forced_open;
+    }
 };
 
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
@@ -213,10 +220,12 @@ std::string common_chat_format_example(
     const std::map<std::string, std::string> & chat_template_kwargs);
 
 const char*               common_chat_format_name(common_chat_format format);
-const char*               common_reasoning_format_name(common_reasoning_format format);
-common_reasoning_format   common_reasoning_format_from_name(const std::string & format);
-common_chat_msg           common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
-common_chat_msg           common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax);
+common_chat_msg           common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
+common_chat_msg           common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_parser_params & syntax);
+
+// used by arg and server
+const char *             common_reasoning_format_name(common_reasoning_format format);
+common_reasoning_format  common_reasoning_format_from_name(const std::string & format);
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
 
diff --git a/common/common.h b/common/common.h
index b9566df62..8247949de 100644
--- a/common/common.h
+++ b/common/common.h
@@ -284,6 +284,7 @@ struct common_params_diffusion {
 };
 
 // reasoning API response format (not to be confused as chat template's reasoning format)
+// only used by server
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
     COMMON_REASONING_FORMAT_AUTO,            // Same as deepseek, using `message.reasoning_content`
diff --git a/common/json-partial.h b/common/json-partial.h
index f63356dc4..be51aabfb 100644
--- a/common/json-partial.h
+++ b/common/json-partial.h
@@ -1,5 +1,6 @@
 #pragma once
 
+// TODO: use json_fwd.hpp when possible
 #include <nlohmann/json.hpp>
 
 // Healing marker (empty if the JSON was fully parsed / wasn't healed).
diff --git a/tests/test-chat-parser.cpp b/tests/test-chat-parser.cpp
index 4766518fe..6f44a2b42 100644
--- a/tests/test-chat-parser.cpp
+++ b/tests/test-chat-parser.cpp
@@ -54,113 +54,109 @@ static void assert_throws(const std::function<void()> & fn, const std::string &
 static void test_reasoning() {
   //common_log_set_verbosity_thold(LOG_DEFAULT_DEBUG);
   {
-    common_chat_msg_parser builder("<tnk>Cogito</tnk>Ergo sum", /* is_partial= */ false, {
-        /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
-        /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
-        /* .reasoning_in_content = */ false,
-        /* .thinking_forced_open = */ false,
-    });
+    common_chat_parser_params params;
+    params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    params.reasoning_in_content = false;
+    params.thinking_forced_open = false;
+    common_chat_msg_parser builder("<tnk>Cogito</tnk>Ergo sum", /* is_partial= */ false, params);
     assert_equals(false, builder.try_parse_reasoning("<tnk>", "</tnk>"));
     assert_equals("<tnk>Cogito</tnk>Ergo sum", builder.consume_rest());
   }
   {
-    common_chat_msg_parser builder("<tnk>Cogito</tnk>Ergo sum", /* is_partial= */ false, {
-        /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
-        /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
-        /* .reasoning_in_content = */ false,
-        /* .thinking_forced_open = */ false,
-    });
+    common_chat_parser_params params;
+    params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    params.reasoning_in_content = false;
+    params.thinking_forced_open = false;
+    common_chat_msg_parser builder("<tnk>Cogito</tnk>Ergo sum", /* is_partial= */ false, params);
     assert_equals(true, builder.try_parse_reasoning("<tnk>", "</tnk>"));
     assert_equals(std::string("Cogito"), builder.result().reasoning_content);
     assert_equals("Ergo sum", builder.consume_rest());
   }
   {
-    common_chat_msg_parser builder("Cogito</tnk>Ergo sum", /* is_partial= */ false, {
-        /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
-        /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
-        /* .reasoning_in_content = */ false,
-        /* .thinking_forced_open = */ false,
-    });
+    common_chat_parser_params params;
+    params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    params.reasoning_in_content = false;
+    params.thinking_forced_open = false;
+    common_chat_msg_parser builder("Cogito</tnk>Ergo sum", /* is_partial= */ false, params);
     assert_equals(false, builder.try_parse_reasoning("<tnk>", "</tnk>"));
     assert_equals("Cogito</tnk>Ergo sum", builder.consume_rest());
   }
   {
-    common_chat_msg_parser builder("Cogito</tnk>Ergo sum", /* is_partial= */ false, {
-        /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
-        /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
-        /* .reasoning_in_content = */ false,
-        /* .thinking_forced_open = */ true,
-    });
+    common_chat_parser_params params;
+    params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    params.reasoning_in_content = false;
+    params.thinking_forced_open = true;
+    common_chat_msg_parser builder("Cogito</tnk>Ergo sum", /* is_partial= */ false, params);
     assert_equals(true, builder.try_parse_reasoning("<tnk>", "</tnk>"));
     assert_equals(std::string("Cogito"), builder.result().reasoning_content);
     assert_equals("Ergo sum", builder.consume_rest());
   }
   {
-    common_chat_msg_parser builder("Cogito</tnk>Ergo sum", /* is_partial= */ false, {
-        /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
-        /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
-        /* .reasoning_in_content = */ true,
-        /* .thinking_forced_open = */ true,
-    });
+    common_chat_parser_params params;
+    params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    params.reasoning_in_content = true;
+    params.thinking_forced_open = true;
+    common_chat_msg_parser builder("Cogito</tnk>Ergo sum", /* is_partial= */ false, params);
     assert_equals(true, builder.try_parse_reasoning("<tnk>", "</tnk>"));
     assert_equals("<think>Cogito</think>", builder.result().content);
     assert_equals("Ergo sum", builder.consume_rest());
   }
   {
     const std::string variant("content_only_inline_think");
-    common_chat_syntax syntax = {
-        /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
-        /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
-        /* .reasoning_in_content = */ false,
-        /* .thinking_forced_open = */ false,
-        /* .parse_tool_calls = */ false,
-    };
+    common_chat_parser_params params;
+    params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    params.reasoning_in_content = false;
+    params.thinking_forced_open = false;
+    params.parse_tool_calls = false;
     const std::string input = "<think>Pense</think>Bonjour";
-    auto msg = common_chat_parse(input, false, syntax);
+    auto msg = common_chat_parse(input, false, params);
     assert_equals(variant, std::string("Pense"), msg.reasoning_content);
     assert_equals(variant, std::string("Bonjour"), msg.content);
   }
   {
     const std::string variant("llama_3_inline_think");
-    common_chat_syntax syntax = {
-        /* .format = */ COMMON_CHAT_FORMAT_LLAMA_3_X,
-        /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
-        /* .reasoning_in_content = */ false,
-        /* .thinking_forced_open = */ false,
-        /* .parse_tool_calls = */ false,
-    };
+    common_chat_parser_params params;
+    params.format = COMMON_CHAT_FORMAT_LLAMA_3_X;
+    params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    params.reasoning_in_content = false;
+    params.thinking_forced_open = false;
+    params.parse_tool_calls = false;
     const std::string input = "<think>Plan</think>Réponse";
-    auto msg = common_chat_parse(input, false, syntax);
+    auto msg = common_chat_parse(input, false, params);
     assert_equals(variant, std::string("Plan"), msg.reasoning_content);
     assert_equals(variant, std::string("Réponse"), msg.content);
   }
   // Test DeepSeek V3.1 parsing - reasoning content followed by "</think>" and then regular content
   {
-    common_chat_syntax syntax = {
-        /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
-        /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
-        /* .reasoning_in_content = */ false,
-        /* .thinking_forced_open = */ true,
-        /* .parse_tool_calls = */ true,
-    };
+    common_chat_parser_params params;
+    params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+    params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    params.reasoning_in_content = false;
+    params.thinking_forced_open = true;
+    params.parse_tool_calls = true;
     const std::string variant("deepseek_v3_1_reasoning_format_deepseek");
-    common_chat_msg_parser builder("REASONING</think>ok", /* is_partial= */ false, syntax);
+    common_chat_msg_parser builder("REASONING</think>ok", /* is_partial= */ false, params);
     assert_equals(variant, true, builder.try_parse_reasoning("<think>", "</think>"));
     assert_equals(variant, std::string("REASONING"), builder.result().reasoning_content);
     assert_equals(variant, std::string("ok"), builder.consume_rest());
   }
   // Test DeepSeek V3.1 parsing - reasoning_format none - reasoning content followed by "</think>" and then regular content
   {
-    common_chat_syntax syntax = {
-        /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
-        /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE,
-        /* .reasoning_in_content = */ false,
-        /* .thinking_forced_open = */ true,
-        /* .parse_tool_calls = */ true,
-    };
+    common_chat_parser_params params;
+    params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+    params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    params.reasoning_in_content = false;
+    params.thinking_forced_open = true;
+    params.parse_tool_calls = true;
     const std::string variant("deepseek_v3_1_reasoning_format_none");
     const std::string input = "REASONING</think>ok";
-    auto msg = common_chat_parse(input, false, syntax);
+    auto msg = common_chat_parse(input, false, params);
     assert_equals(variant, std::string("REASONING</think>ok"), msg.content);
     assert_equals(variant, std::string(""), msg.reasoning_content);
   }
@@ -256,15 +252,14 @@ static void test_deepseek_v3_1_tool_calls() {
     //common_log_set_verbosity_thold(LOG_DEFAULT_DEBUG);
     // variant: happy path for when it works as the model card says it should
     const std::string variant("simple");
-    common_chat_syntax syntax = {
-        /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
-        /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
-        /* .reasoning_in_content = */ false,
-        /* .thinking_forced_open = */ false,
-        /* .parse_tool_calls = */ true,
-    };
+    common_chat_parser_params params;
+    params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+    params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    params.reasoning_in_content = false;
+    params.thinking_forced_open = false;
+    params.parse_tool_calls = true;
     const std::string input = "<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>get_time<｜tool▁sep｜>{\"city\": \"Tokyo\"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>";
-    auto msg = common_chat_parse(input, false, syntax);
+    auto msg = common_chat_parse(input, false, params);
     assert_equals<std::size_t>(variant, 1, msg.tool_calls.size());
     assert_equals(variant, std::string("get_time"), msg.tool_calls[0].name);
     // JSON arguments are dumped without spaces
@@ -274,16 +269,15 @@ static void test_deepseek_v3_1_tool_calls() {
 
     // variant: simple + thinking open
     {
-        common_chat_syntax syntax = {
-            /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
-            /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
-            /* .reasoning_in_content = */ false,
-            /* .thinking_forced_open = */ true,
-            /* .parse_tool_calls = */ true,
-        };
+        common_chat_parser_params params;
+        params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+        params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+        params.reasoning_in_content = false;
+        params.thinking_forced_open = true;
+        params.parse_tool_calls = true;
         const std::string variant("simple_thinking");
         const std::string in = "REASONING</think><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>get_time<｜tool▁sep｜>{\"city\": \"Tokyo\"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>";
-        auto m = common_chat_parse(in, false, syntax);
+        auto m = common_chat_parse(in, false, params);
         assert_equals<std::size_t>(variant, 1, m.tool_calls.size());
         assert_equals(variant, std::string("get_time"), m.tool_calls[0].name);
         assert_equals(variant, std::string("{\"city\":\"Tokyo\"}"), m.tool_calls[0].arguments);
@@ -292,16 +286,15 @@ static void test_deepseek_v3_1_tool_calls() {
     }
     // variant: simple + multiple tool calls
     {
-        common_chat_syntax syntax = {
-            /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
-            /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
-            /* .reasoning_in_content = */ false,
-            /* .thinking_forced_open = */ false,
-            /* .parse_tool_calls = */ true,
-        };
+        common_chat_parser_params params;
+        params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+        params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+        params.reasoning_in_content = false;
+        params.thinking_forced_open = false;
+        params.parse_tool_calls = true;
         const std::string variant("simple_multiple_tool_calls");
         const std::string in = "CONTENT<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>get_time<｜tool▁sep｜>{\"city\": \"Paris\"}<｜tool▁call▁end｜><｜tool▁call▁begin｜>get_weather<｜tool▁sep｜>{\"city\": \"Paris\"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>";
-        auto m = common_chat_parse(in, false, syntax);
+        auto m = common_chat_parse(in, false, params);
         assert_equals<std::size_t>(variant, 2, m.tool_calls.size());
         assert_equals(variant, std::string("get_time"), m.tool_calls[0].name);
         assert_equals(variant, std::string("{\"city\":\"Paris\"}"), m.tool_calls[0].arguments);
@@ -314,16 +307,15 @@ static void test_deepseek_v3_1_tool_calls() {
 
     // variant: thinking forced open + tool call in reasoning content
     {
-        common_chat_syntax syntax = {
-            /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
-            /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
-            /* .reasoning_in_content = */ false,
-            /* .thinking_forced_open = */ true,
-            /* .parse_tool_calls = */ true,
-        };
+        common_chat_parser_params params;
+        params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+        params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+        params.reasoning_in_content = false;
+        params.thinking_forced_open = true;
+        params.parse_tool_calls = true;
         const std::string variant("thinking_forced_open_tool_call_in_reasoning");
         const std::string in = "REASONING<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>get_time2<｜tool▁sep｜>{\"city\": \"Tokyo2\"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>REASONING</think><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>get_time<｜tool▁sep｜>{\"city\": \"Tokyo\"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>";
-        auto m = common_chat_parse(in, false, syntax);
+        auto m = common_chat_parse(in, false, params);
         assert_equals<std::size_t>(variant, 1, m.tool_calls.size());
         assert_equals(variant, std::string("get_time"), m.tool_calls[0].name);
         assert_equals(variant, std::string("{\"city\":\"Tokyo\"}"), m.tool_calls[0].arguments);
@@ -336,16 +328,15 @@ static void test_deepseek_v3_1_tool_calls() {
     //          to make tool calls in reasoning content according to the model card, but it does sometimes, so
     //          add the reasoning content as regular content and parse the tool calls.
     {
-        common_chat_syntax syntax = {
-            /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
-            /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
-            /* .reasoning_in_content = */ false,
-            /* .thinking_forced_open = */ true,
-            /* .parse_tool_calls = */ true,
-        };
+        common_chat_parser_params params;
+        params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+        params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+        params.reasoning_in_content = false;
+        params.thinking_forced_open = true;
+        params.parse_tool_calls = true;
         const std::string variant("thinking_forced_open_tool_call_in_reasoning_no_closing_think_not_partial");
         const std::string in = "REASONING<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>get_time<｜tool▁sep｜>{\"city\": \"Tokyo\"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>";
-        auto m = common_chat_parse(in, false, syntax);
+        auto m = common_chat_parse(in, false, params);
         assert_equals(variant, std::string("REASONING"), m.content);
         assert_equals(variant, std::string(""), m.reasoning_content);
         assert_equals<std::size_t>(variant, 1, m.tool_calls.size());
@@ -355,16 +346,15 @@ static void test_deepseek_v3_1_tool_calls() {
 
     // variant: thinking forced open + tool call in reasoning content + no closing think + partial
     {
-        common_chat_syntax syntax = {
-            /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
-            /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
-            /* .reasoning_in_content = */ false,
-            /* .thinking_forced_open = */ true,
-            /* .parse_tool_calls = */ true,
-        };
+        common_chat_parser_params params;
+        params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+        params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+        params.reasoning_in_content = false;
+        params.thinking_forced_open = true;
+        params.parse_tool_calls = true;
         const std::string variant("thinking_forced_open_tool_call_in_reasoning_no_closing_think_partial");
         const std::string in = "REASONING<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>get_time<｜tool▁sep｜>{\"city\": \"Tokyo\"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>";
-        auto m = common_chat_parse(in, /* is_partial= */ true, syntax);
+        auto m = common_chat_parse(in, /* is_partial= */ true, params);
         assert_equals(variant, std::string("REASONING<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>get_time<｜tool▁sep｜>{\"city\": \"Tokyo\"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>"), m.reasoning_content);
         assert_equals(variant, std::string(""), m.content);
         assert_equals<std::size_t>(variant, 0, m.tool_calls.size());
@@ -372,32 +362,30 @@ static void test_deepseek_v3_1_tool_calls() {
 
     // variant: thinking not forced open + reasoning + regular content + no tool calls
     {
-        common_chat_syntax syntax = {
-            /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
-            /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
-            /* .reasoning_in_content = */ false,
-            /* .thinking_forced_open = */ true,
-            /* .parse_tool_calls = */ true,
-        };
+        common_chat_parser_params params;
+        params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+        params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+        params.reasoning_in_content = false;
+        params.thinking_forced_open = true;
+        params.parse_tool_calls = true;
         const std::string variant("thinking_forced_open_reasoning_regular_content_no_tool_calls");
         const std::string in = "REASONING</think>CONTENT";
-        auto m = common_chat_parse(in, false, syntax);
+        auto m = common_chat_parse(in, false, params);
         assert_equals<std::size_t>(variant, 0, m.tool_calls.size());
         assert_equals(variant, std::string("CONTENT"), m.content);
         assert_equals(variant, std::string("REASONING"), m.reasoning_content);
     }
     // variant: thinking not forced open + missing reasoning + no tool calls
     {
-        common_chat_syntax syntax = {
-            /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
-            /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
-            /* .reasoning_in_content = */ false,
-            /* .thinking_forced_open = */ false,
-            /* .parse_tool_calls = */ true,
-        };
+        common_chat_parser_params params;
+        params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
+        params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+        params.reasoning_in_content = false;
+        params.thinking_forced_open = false;
+        params.parse_tool_calls = true;
         const std::string variant("thinking_not_forced_open_missing_reasoning_no_tool_calls");
         const std::string in = "CONTENT";
-        auto m = common_chat_parse(in, false, syntax);
+        auto m = common_chat_parse(in, false, params);
         assert_equals<std::size_t>(variant, 0, m.tool_calls.size());
         assert_equals(variant, std::string("CONTENT"), m.content);
         assert_equals(variant, std::string(""), m.reasoning_content);
diff --git a/tests/test-chat-peg-parser.cpp b/tests/test-chat-peg-parser.cpp
index d3a4cfd22..f767c73c2 100644
--- a/tests/test-chat-peg-parser.cpp
+++ b/tests/test-chat-peg-parser.cpp
@@ -616,15 +616,15 @@ void test_command7_parser_compare(testing & t) {
 
     auto test_legacy = [&](const std::string & input, bool need_more_input, bool print_results) {
         // Original common_chat_combinator_parser taken from chat.cpp
+        common_chat_parser_params params;
+        params.format = COMMON_CHAT_FORMAT_GENERIC;
+        params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+        params.reasoning_in_content = false;
+        params.thinking_forced_open = false;
         common_chat_msg_parser builder(
             input,
             /* .is_partial = */ need_more_input,
-            {
-                /* .format = */ COMMON_CHAT_FORMAT_GENERIC,
-                /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
-                /* .reasoning_in_content = */ false,
-                /* .thinking_forced_open = */ false,
-            }
+            params
         );
 
         builder.try_parse_reasoning("<|START_THINKING|>", "<|END_THINKING|>");
diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp
index e1264b8e8..6820acf67 100644
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@@ -341,10 +341,11 @@ static void test_templates(const struct common_chat_templates * tmpls, const std
         }
 
         if (expect_grammar_triggered) {
-            common_chat_syntax syntax;
-            syntax.format = data.params.format;
-            syntax.reasoning_format = reasoning_format;
-            const auto msg = common_chat_parse(data.delta, /* is_partial= */ false, syntax);
+            // TODO @ngxson : refactor common_chat_parse to avoid passing format/reasoning_format every time
+            common_chat_parser_params params;
+            params.format = data.params.format;
+            params.reasoning_format = reasoning_format;
+            const auto msg = common_chat_parse(data.delta, /* is_partial= */ false, params);
             assert_msg_equals(test_message, msg, ignore_whitespace_differences);
         }
 
@@ -556,7 +557,9 @@ struct make_peg_parser {
     }
 
     common_chat_msg parse(const std::string & msg, bool is_partial) {
-        return common_chat_peg_parse(arena_, msg, is_partial, /* syntax = */ {params_.format});
+        common_chat_parser_params parser_params;
+        parser_params.format = params_.format;
+        return common_chat_peg_parse(arena_, msg, is_partial, parser_params);
     }
 };
 
@@ -750,6 +753,25 @@ static void test_tools_oaicompat_json_conversion() {
     }
 }
 
+// for compat; ref: https://github.com/ggml-org/llama.cpp/pull/18961
+struct test_parser_params {
+    common_chat_format       format                = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    common_reasoning_format  reasoning_format      = COMMON_REASONING_FORMAT_NONE;
+    bool                     reasoning_in_content  = false;
+    bool                     thinking_forced_open  = false;
+    bool                     parse_tool_calls      = true;
+};
+
+static common_chat_msg test_chat_parse(const std::string & input, bool is_partial, const test_parser_params & syntax) {
+    common_chat_parser_params params;
+    params.format               = syntax.format;
+    params.reasoning_format     = syntax.reasoning_format;
+    params.reasoning_in_content = syntax.reasoning_in_content;
+    params.thinking_forced_open = syntax.thinking_forced_open;
+    params.parse_tool_calls     = syntax.parse_tool_calls;
+    return common_chat_parse(input, is_partial, params);
+}
+
 static void test_template_output_parsers() {
     printf("[%s]\n", __func__);
 
@@ -781,17 +803,17 @@ static void test_template_output_parsers() {
         }
 
         assert_msg_equals(message_assist,
-            common_chat_parse(
+            test_chat_parse(
                 "Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_COMMAND_R7B}));
         assert_msg_equals(message_assist,
-            common_chat_parse(
+            test_chat_parse(
                 "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_COMMAND_R7B}));
         assert_msg_equals(message_assist_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "<|START_THINKING|>I'm\nthinking<|END_THINKING|>"
                 "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>",
                 /* is_partial= */ false,
@@ -800,7 +822,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
                 }));
         assert_msg_equals(message_assist_thoughts_unparsed_deepseek,
-            common_chat_parse(
+            test_chat_parse(
                 "<|START_THINKING|>I'm\nthinking<|END_THINKING|>"
                 "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>",
                 /* is_partial= */ false,
@@ -811,13 +833,13 @@ static void test_template_output_parsers() {
                     /* .thinking_forced_open = */ false,
                 }));
         assert_msg_equals(message_assist_thoughts_unparsed_r7b,
-            common_chat_parse(
+            test_chat_parse(
                 "<|START_THINKING|>I'm\nthinking<|END_THINKING|>"
                 "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_COMMAND_R7B}));
         assert_msg_equals(message_assist_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "<|START_THINKING|>I'm\nthinking<|END_THINKING|>"
                 "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>",
                 /* is_partial= */ false,
@@ -826,7 +848,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
                 }));
         assert_msg_equals(message_assist_thoughts_call_idx,
-            common_chat_parse(
+            test_chat_parse(
                 "<|START_THINKING|>I'm\nthinking<|END_THINKING|>"
                 "<|START_ACTION|>[\n"
                 "    {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n"
@@ -837,7 +859,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
                 }));
         assert_msg_equals(message_assist_thoughts_no_content,
-            common_chat_parse(
+            test_chat_parse(
                 "<|START_THINKING|>I'm\nthinking<|END_THINKING|>"
                 "<|START_ACTION|>[\n"
                 "    {\"tool_call_id\": \"0\", \"tool_name\": \"special",
@@ -877,7 +899,7 @@ static void test_template_output_parsers() {
 
         assert_equals(
             simple_assist_msg("{ \"tool_call\" : { \"name\" : \"t"),
-            common_chat_parse(
+            test_chat_parse(
                 "{ \"tool_call\" : { \"name\" : \"t",
                 /* is_partial= */ true,
                 {
@@ -889,33 +911,33 @@ static void test_template_output_parsers() {
                 }));
         assert_equals(
             message_assist_empty,
-            common_chat_parse(
+            test_chat_parse(
                 "{ \"tool_call\" : { \"name\" : \"t",
                 /* is_partial= */ true,
                 {COMMON_CHAT_FORMAT_GENERIC}));
 
         assert_equals(
             simple_assist_msg("", "", "puppeteer_screenshot", "{\"name\":\"servethehome_homepage\","),
-            common_chat_parse(
+            test_chat_parse(
                 R"({"tool_call": {"name": "puppeteer_screenshot", "arguments": {"name": "servethehome_homepage",)",
                 /* is_partial= */ true,
                 {COMMON_CHAT_FORMAT_GENERIC}));
 
         assert_equals(
             message_assist_call_empty_args,
-            common_chat_parse(
+            test_chat_parse(
                 "{ \"tool_call\" : { \"name\" : \"special_function\"",
                 /* is_partial= */ true,
                 {COMMON_CHAT_FORMAT_GENERIC}));
         assert_equals(
             message_assist_call_cutoff_args,
-            common_chat_parse(
+            test_chat_parse(
                 "{ \"tool_call\" : { \"name\" : \"special_function\", \"arguments\" : { \"arg",
                 /* is_partial= */ true,
                 {COMMON_CHAT_FORMAT_GENERIC}));
 
         assert_msg_equals(message_assist,
-            common_chat_parse(
+            test_chat_parse(
                 "{\n"
                 "  \"response\": \"Hello, world!\\nWhat's up?\"\n"
                 "}",
@@ -951,7 +973,7 @@ static void test_template_output_parsers() {
     {
         assert_msg_equals(
             simple_assist_msg("Réponse", "raisonnement"),
-            common_chat_parse(
+            test_chat_parse(
                 message_assist_thoughts_unparsed_magistral.content,
                 /* is_partial= */ false,
                 {
@@ -988,14 +1010,14 @@ static void test_template_output_parsers() {
         // Test parsing
         assert_msg_equals(
             simple_assist_msg("", "", "python", ""),
-            common_chat_parse(
+            test_chat_parse(
                 "```json\n"
                 "<function_call> { \"name\" : \"python\"",
                 /* is_partial= */ true,
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         assert_msg_equals(
             simple_assist_msg("Let's call something\n"),
-            common_chat_parse(
+            test_chat_parse(
                 "Let's call something\n"
                 "<tool_call>{\"name\"",
                 /* is_partial= */ true,
@@ -1005,7 +1027,7 @@ static void test_template_output_parsers() {
                 }));
         assert_msg_equals(
             simple_assist_msg("Let's call something\n"),
-            common_chat_parse(
+            test_chat_parse(
                 "Let's call something\n"
                 "<tool_call>{\"name",
                 /* is_partial= */ true,
@@ -1014,7 +1036,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
                 }));
         assert_msg_equals(message_assist_call_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 // QwQ-32B's template adds a trailing <think> if add_generation_prompt
                 "I'm\nthinking</think>\n"
                 "<tool_call>{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}</tool_call>",
@@ -1027,14 +1049,14 @@ static void test_template_output_parsers() {
                 }));
         assert_msg_equals(
             message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "<tool_call>\n"
                 "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
                 "</tool_call>",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         assert_msg_equals(message_assist_call_content,
-            common_chat_parse(
+            test_chat_parse(
                 "Hello, world!\nWhat's up?<tool_call>\n"
                 "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
                 "</tool_call>",
@@ -1042,13 +1064,13 @@ static void test_template_output_parsers() {
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         assert_msg_equals(
             message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "<function=special_function>{\"arg1\": 1}</function>",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         assert_msg_equals(
             message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "<function name=\"special_function\">\n"
                 "{\"arg1\": 1}\n"
                 "</function>",
@@ -1056,7 +1078,7 @@ static void test_template_output_parsers() {
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         assert_msg_equals(
             message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "<tool>\n"
                 "  {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
                 "</tool>",
@@ -1064,7 +1086,7 @@ static void test_template_output_parsers() {
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         assert_msg_equals(
             message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "<tools>\n"
                 "  {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
                 "</tools>",
@@ -1072,7 +1094,7 @@ static void test_template_output_parsers() {
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         assert_msg_equals(
             message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "<response>\n"
                 "  {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
                 "</response>",
@@ -1080,7 +1102,7 @@ static void test_template_output_parsers() {
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         assert_msg_equals(
             message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "```xml\n"
                 "<response>\n"
                 "    {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
@@ -1090,7 +1112,7 @@ static void test_template_output_parsers() {
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         assert_msg_equals(
             message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "```xml\n"
                 "  {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
                 "```",
@@ -1098,7 +1120,7 @@ static void test_template_output_parsers() {
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         assert_msg_equals(
             message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "```\n"
                 "  {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
                 "```",
@@ -1106,7 +1128,7 @@ static void test_template_output_parsers() {
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         assert_msg_equals(
             message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "```\n"
                 "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
                 "```",
@@ -1114,7 +1136,7 @@ static void test_template_output_parsers() {
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         assert_msg_equals(
             message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "```json\n"
                 "  {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
                 "```",
@@ -1122,7 +1144,7 @@ static void test_template_output_parsers() {
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         assert_msg_equals(
             message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "```json\n"
                 "\n"
                 "                    <function_call> {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}} \n"
@@ -1132,7 +1154,7 @@ static void test_template_output_parsers() {
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         assert_msg_equals(
             message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "<json>\n"
                 "  {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
                 "</json>",
@@ -1140,7 +1162,7 @@ static void test_template_output_parsers() {
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         assert_msg_equals(
             message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "<xml>\n"
                 "  {\n"
                 "    \"name\": \"special_function\", \"arguments\": {\"arg1\": 1}\n"
@@ -1150,7 +1172,7 @@ static void test_template_output_parsers() {
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         assert_msg_equals(
             message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "<JSON>\n"
                 "  {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
                 "</JSON>",
@@ -1158,13 +1180,13 @@ static void test_template_output_parsers() {
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         assert_msg_equals(
             message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         assert_msg_equals(
             message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "{\n  \"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
@@ -1178,7 +1200,7 @@ static void test_template_output_parsers() {
 
         assert_msg_equals(
             message_assist_multiple_calls,
-            common_chat_parse(
+            test_chat_parse(
                 "<tool_call>\n"
                 "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
                 "</tool_call>\n"
@@ -1190,7 +1212,7 @@ static void test_template_output_parsers() {
 
         assert_msg_equals(
             message_assist_multiple_calls,
-            common_chat_parse(
+            test_chat_parse(
                 "<function=special_function>{\"arg1\": 1}</function>\n"
                 "<function=python>{\"code\":\"print('hello')\"}</function>",
                 /* is_partial= */ false,
@@ -1202,27 +1224,27 @@ static void test_template_output_parsers() {
                 "",
                 "special_function",
                 "{\"arg1\": 1}"),
-            common_chat_parse(
+            test_chat_parse(
                 "This is not a tool call:\n"
                 "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         assert_msg_equals(message_assist,
-            common_chat_parse(
+            test_chat_parse(
                 "Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         assert_msg_equals(message_assist_thoughts_unparsed_deepseek,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_HERMES_2_PRO}));
         // assert_msg_equals(message_assist_thoughts_unparsed_deepseek,
-        //     common_chat_parse(
+        //     test_chat_parse(
         //         "I'm\nthinking</think>Hello, world!\nWhat's up?",
         //         COMMON_CHAT_FORMAT_HERMES_2_PRO));
         assert_msg_equals(message_assist_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -1230,7 +1252,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
                 }));
         assert_msg_equals(message_assist_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think>Hello, world!\nWhat's up?",
                 /* is_partial= */ true,
                 {
@@ -1238,7 +1260,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
                 }));
         assert_msg_equals(message_assist_thoughts_unparsed_md,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n```json\n{}```",
                 /* is_partial= */ false,
                 {
@@ -1249,7 +1271,7 @@ static void test_template_output_parsers() {
                     /* .parse_tool_calls = */ false,
                 }));
         assert_msg_equals(message_assist_thoughts_unparsed_md_partial,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n```json\n{}```",
                 /* is_partial= */ true,
                 {
@@ -1259,7 +1281,7 @@ static void test_template_output_parsers() {
                     /* .thinking_forced_open = */ false,
                 }));
         assert_msg_equals(message_assist_thoughts_unopened_unparsed,
-            common_chat_parse(
+            test_chat_parse(
                 "I'm\nthinking</think>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -1267,7 +1289,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
                 }));
         assert_msg_equals(message_assist_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "I'm\nthinking</think>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -1304,7 +1326,7 @@ static void test_template_output_parsers() {
                       "</tool_call>");
         assert_msg_equals(
             simple_assist_msg("", /* reasoning_content= */ "<tool_call>nah uhg</tool_call>"),
-            common_chat_parse(
+            test_chat_parse(
                 "<think><tool_call>nah uhg</tool_call>",
                 /* is_partial= */ false,
                 {
@@ -1328,7 +1350,7 @@ static void test_template_output_parsers() {
 
         assert_equals(
             message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "{\"name\": \"special_function\", \"parameters\": {\"arg1\": 1}}",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_LLAMA_3_X}));
@@ -1366,7 +1388,7 @@ static void test_template_output_parsers() {
         for (auto is_partial : { false, true }) {
             assert_equals(
                 message_assist_call,
-                common_chat_parse(
+                test_chat_parse(
                     "<function=special_function>{\"arg1\": 1}</function>",
                     is_partial,
                     {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1}));
@@ -1374,7 +1396,7 @@ static void test_template_output_parsers() {
 
         assert_equals(
             message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "<function=special_function>{\"arg1\": 1}<",
                 /* is_partial= */ true,
                 {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1}));
@@ -1396,7 +1418,7 @@ static void test_template_output_parsers() {
                 "",
                 "special_function",
                 "{\"arg1\": 1}"),
-            common_chat_parse(
+            test_chat_parse(
                 "all\n"
                 "Hello, world!\n"
                 "nono\n"
@@ -1405,27 +1427,27 @@ static void test_template_output_parsers() {
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2}));
         assert_msg_equals(message_assist_call_python_lines,
-            common_chat_parse(
+            test_chat_parse(
                 "python\n"
                 "# This is a program:\n"
                 "print('hey')",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2}));
         assert_msg_equals(message_assist_call_python_lines_unclosed,
-            common_chat_parse(
+            test_chat_parse(
                 "python\n"
                 "# This is a program:\n"
                 "print('hey')",
                 /* is_partial= */ true,
                 {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2}));
         assert_msg_equals(message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "special_function\n"
                 "{\"arg1\": 1} \n                    ",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2}));
         assert_msg_equals(message_assist,
-            common_chat_parse(
+            test_chat_parse(
                 "all\n"
                 "Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
@@ -1466,7 +1488,7 @@ static void test_template_output_parsers() {
         test_templates(tmpls.get(), end_tokens, message_assist_thoughts, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
         assert_msg_equals(
             simple_assist_msg("Hello, world!\nWhat's up?", "<think>I'm\nthinking"),
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -1477,7 +1499,7 @@ static void test_template_output_parsers() {
                 }));
         assert_msg_equals(
             simple_assist_msg("", "I need to remember the correct syntax. It starts with <｜tool▁calls▁begin｜> and ends with"),
-            common_chat_parse(
+            test_chat_parse(
                 "I need to remember the correct syntax. It starts with <｜tool▁calls▁begin｜> and ends with",
                 /* is_partial= */ true,
                 {
@@ -1487,7 +1509,7 @@ static void test_template_output_parsers() {
                     /* .thinking_forced_open = */ true,
                 }));
         assert_msg_equals(message_assist_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -1495,7 +1517,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
                 }));
         assert_msg_equals(message_assist_thoughts_unopened_unparsed,
-            common_chat_parse(
+            test_chat_parse(
                 "I'm\nthinking</think>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -1503,7 +1525,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
                 }));
         assert_msg_equals(message_assist_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "I'm\nthinking</think>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -1514,7 +1536,7 @@ static void test_template_output_parsers() {
                 }));
         assert_msg_equals(message_assist_thoughts,
             // Latest template update (ast of 20250209) adds a trailing <think>\n if add_generation_prompt is true.
-            common_chat_parse(
+            test_chat_parse(
                 "I'm\nthinking</think>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -1543,12 +1565,12 @@ static void test_template_output_parsers() {
         test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
         test_templates(tmpls.get(), end_tokens, message_assist_thoughts, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
         assert_msg_equals(message_assist_thoughts_unparsed_deepseek,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_DEEPSEEK_R1}));
         assert_msg_equals(message_assist_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -1556,7 +1578,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
                 }));
         assert_msg_equals(message_assist_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "I'm\nthinking</think>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -1567,7 +1589,7 @@ static void test_template_output_parsers() {
                 }));
 
         assert_msg_equals(message_assist_call_thoughts_unparsed,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think>\n\n"
                 "<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>special_function\n"
                 "```json\n"
@@ -1576,7 +1598,7 @@ static void test_template_output_parsers() {
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_DEEPSEEK_R1}));
         assert_msg_equals(message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "<｜tool▁calls｜>function<｜tool▁sep｜>special_function\n"
                 "```json\n"
                 "{\"arg1\": 1}\n"
@@ -1585,7 +1607,7 @@ static void test_template_output_parsers() {
                 {COMMON_CHAT_FORMAT_DEEPSEEK_R1}));
 
         assert_msg_equals(message_assist_call_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think>\n\n"
                 "<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>function<｜tool▁sep｜>special_function\n"
                 "```json\n"
@@ -1612,20 +1634,20 @@ static void test_template_output_parsers() {
 
         // Test parsing regular content
         assert_msg_equals(message_assist,
-            common_chat_parse(
+            test_chat_parse(
                 "Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_GRANITE}));
         assert_msg_equals(
             message_assist,
-            common_chat_parse(
+            test_chat_parse(
                 "Hello, world!\nWhat's up?",
                 /* is_partial= */ true,
                 {COMMON_CHAT_FORMAT_GRANITE}));
 
         // Test parsing content with thinking
         assert_msg_equals(message_assist_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -1633,12 +1655,12 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
                 }));
         assert_msg_equals(message_assist_thoughts_unparsed_deepseek,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_GRANITE}));
         assert_msg_equals(message_assist_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think><response>Hello, world!\nWhat's up?",
                 /* is_partial= */ true,
                 {
@@ -1646,7 +1668,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
                 }));
         assert_msg_equals(message_assist_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think><response>Hello, world!\nWhat's up?</response>",
                 /* is_partial= */ false,
                 {
@@ -1654,12 +1676,12 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
                 }));
         assert_msg_equals(simple_assist_msg("<think>I'm\nthinking</think><response>Hello, world!\nWhat's up?</response>"),
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think><response>Hello, world!\nWhat's up?</response>",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_GRANITE}));
         assert_msg_equals(message_assist_empty,
-            common_chat_parse(
+            test_chat_parse(
                 "<think",
                 /* is_partial= */ true,
                 {
@@ -1667,12 +1689,12 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
                 }));
         assert_msg_equals(message_assist_empty,
-            common_chat_parse(
+            test_chat_parse(
                 "<think",
                 /* is_partial= */ true,
                 {COMMON_CHAT_FORMAT_GRANITE}));
         assert_msg_equals(message_assist_thoughts_no_content,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking",
                 /* is_partial= */ true,
                 {
@@ -1681,32 +1703,32 @@ static void test_template_output_parsers() {
                 }));
         assert_msg_equals(
             message_assist_empty,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think><response",
                 /* is_partial= */ true,
                 {COMMON_CHAT_FORMAT_GRANITE}));
 
         // Test parsing tool calls
         assert_msg_equals(message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "<|tool_call|>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_GRANITE}));
         assert_msg_equals(
             message_assist_call_empty_args,
-            common_chat_parse(
+            test_chat_parse(
                 "<|tool_call|>[{\"name\": \"special_function\"",
                 /* is_partial= */ true,
                 {COMMON_CHAT_FORMAT_GRANITE}));
         assert_msg_equals(
             message_assist_call_cutoff_args,
-            common_chat_parse(
+            test_chat_parse(
                 "<|tool_call|>[{\"name\": \"special_function\", \"arguments\": {\"arg",
                 /* is_partial= */ true,
                 {COMMON_CHAT_FORMAT_GRANITE}));
         assert_msg_equals(
             message_assist_call_cutoff_args,
-            common_chat_parse(
+            test_chat_parse(
                 "<|tool_call|>[{\"name\": \"special_function\", \"arguments\": {\"arg",
                 /* is_partial= */ true,
                 {
@@ -1717,7 +1739,7 @@ static void test_template_output_parsers() {
         // Test parsing tool calls with thinking
         assert_msg_equals(
             message_assist_call_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think><|tool_call|>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}, {",
                 /* is_partial= */ true,
                 {
@@ -1757,7 +1779,7 @@ static void test_template_output_parsers() {
         assert_equals(COMMON_CHAT_FORMAT_GPT_OSS, common_chat_templates_apply(tmpls.get(), inputs_tools).format);
 
         assert_msg_equals(simple_assist_msg("", "I'm\nthink"),
-            common_chat_parse(
+            test_chat_parse(
                 "<|channel|>analysis<|message|>I'm\nthink",
                 /* is_partial= */ true,
                 {
@@ -1765,7 +1787,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
                 }));
         assert_msg_equals(simple_assist_msg("", "I'm\nthinking"),
-            common_chat_parse(
+            test_chat_parse(
                 "<|channel|>analysis<|message|>I'm\nthinking<|end|>",
                 /* is_partial= */ true,
                 {
@@ -1773,7 +1795,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
                 }));
         assert_msg_equals(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking"),
-            common_chat_parse(
+            test_chat_parse(
                 "<|channel|>analysis<|message|>I'm\nthinking<|end|>"
                 "<|start|>assistant<|channel|>final<|message|>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
@@ -1782,7 +1804,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
                 }));
         assert_msg_equals(simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1"),
-            common_chat_parse(
+            test_chat_parse(
                 "<|channel|>analysis<|message|>I'm\nthinking<|end|>"
                 "<|start|>assistant<|channel|>commentary to=functions.special_function <|constrain|>json<|message|>{\"arg1",
                 /* is_partial= */ true,
@@ -1791,7 +1813,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
                 }));
         assert_msg_equals(simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1"),
-            common_chat_parse(
+            test_chat_parse(
                 "<|channel|>analysis<|message|>I'm\nthinking<|end|>"
                 "<|start|>assistant<|channel|>commentary to=functions.special_function<|message|>{\"arg1",
                 /* is_partial= */ true,
@@ -1800,7 +1822,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
                 }));
         assert_msg_equals(simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1\": 1}"),
-            common_chat_parse(
+            test_chat_parse(
                 "<|channel|>analysis<|message|>I'm\nthinking<|end|>"
                 "<|start|>assistant<|channel|>commentary to=functions.special_function <|constrain|>json<|message|>{\"arg1\": 1}",
                 /* is_partial= */ false,
@@ -1809,7 +1831,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
                 }));
         assert_msg_equals(simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1\": 1}"),
-            common_chat_parse(
+            test_chat_parse(
                 "<|channel|>analysis<|message|>I'm\nthinking<|end|>"
                 "<|start|>assistant<|channel|>analysis to=functions.special_function <|constrain|>json<|message|>{\"arg1\": 1}",
                 /* is_partial= */ false,
@@ -1818,7 +1840,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
                 }));
         assert_msg_equals(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking"),
-            common_chat_parse(
+            test_chat_parse(
                 "<|channel|>analysis<|message|>I'm\nthinking<|end|>"
                 "<|start|>assistant<|channel|>commentary<|message|>Hello, world!\nWhat's up?",
                 /* is_partial= */ true,
@@ -1827,7 +1849,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
                 }));
         assert_msg_equals(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking", "special_function", "{\"arg1\": 1}"),
-            common_chat_parse(
+            test_chat_parse(
                 "<|channel|>analysis<|message|>I'm\nthinking<|end|>"
                 "<|start|>assistant<|channel|>commentary<|message|>Hello, world!\nWhat's up?<|end|>"
                 "<|start|>assistant<|channel|>commentary to=functions.special_function <|constrain|>json<|message|>{\"arg1\": 1}",
@@ -1840,7 +1862,7 @@ static void test_template_output_parsers() {
         // Test parse_tool_calls == false
         assert_msg_equals(
             simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking"),
-            common_chat_parse(
+            test_chat_parse(
                 "<|channel|>analysis<|message|>I'm\nthinking<|end|>"
                 "<|start|>assistant<|channel|>final<|message|>Hello, world!\nWhat's up?",
                 /* is_partial= */ true,
@@ -1853,7 +1875,7 @@ static void test_template_output_parsers() {
                 }));
         assert_msg_equals(
             simple_assist_msg("", "I'm\nthinking"),
-            common_chat_parse(
+            test_chat_parse(
                 "<|channel|>analysis<|message|>I'm\nthinking<|end|>"
                 "<|start|>assistant<|channel|>commentary to=functions.special_function<|message|>{\"arg1",
                 /* is_partial= */ true,
@@ -1866,7 +1888,7 @@ static void test_template_output_parsers() {
                 }));
         assert_msg_equals(
             simple_assist_msg("", "I'm\nthinking"),
-            common_chat_parse(
+            test_chat_parse(
                 "<|channel|>analysis<|message|>I'm\nthinking<|end|>"
                 "<|start|>assistant<|channel|>commentary to=functions.special_function <|constrain|>json<|message|>{\"arg1\": 1}",
                 /* is_partial= */ false,
@@ -1882,7 +1904,7 @@ static void test_template_output_parsers() {
         assert_msg_equals(
             simple_assist_msg(
                 "<|channel|>analysis<|message|>I'm\nthinking<|end|>Hello, world!\nWhat's up?"),
-            common_chat_parse(
+            test_chat_parse(
                 "<|channel|>analysis<|message|>I'm\nthinking<|end|>"
                 "<|start|>assistant<|channel|>final<|message|>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
@@ -1894,7 +1916,7 @@ static void test_template_output_parsers() {
         assert_msg_equals(
             simple_assist_msg(
                 "<|channel|>analysis<|message|>I'm\nthinking<|end|>Hello, world!\nWhat's up?"),
-            common_chat_parse(
+            test_chat_parse(
                 "<|channel|>analysis<|message|>I'm\nthinking<|end|>"
                 "<|start|>assistant<|channel|>final<|message|>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
@@ -1906,7 +1928,7 @@ static void test_template_output_parsers() {
 
         // Test tool calling in role header
         assert_msg_equals(simple_assist_msg("", "", "special_function", "{\"arg1\": 1}"),
-            common_chat_parse(
+            test_chat_parse(
                 " to=functions.special_function<|channel|>commentary <|constrain|>json<|message|>{\"arg1\": 1}",
                 /* is_partial= */ false,
                 {
@@ -1914,7 +1936,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
                 }));
         assert_msg_equals(simple_assist_msg("", "", "special_function", "{\"arg1\": 1}"),
-            common_chat_parse(
+            test_chat_parse(
                 " to=functions.special_function<|channel|>analysis <|constrain|>json<|message|>{\"arg1\": 1}",
                 /* is_partial= */ false,
                 {
@@ -1922,7 +1944,7 @@ static void test_template_output_parsers() {
                     /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO,
                 }));
         assert_msg_equals(simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1\": 1}"),
-            common_chat_parse(
+            test_chat_parse(
                 "<|channel|>analysis<|message|>I'm\nthinking<|end|>"
                 "<|start|>assistant to=functions.special_function<|channel|>analysis <|constrain|>json<|message|>{\"arg1\": 1}",
                 /* is_partial= */ false,
@@ -1944,7 +1966,7 @@ static void test_template_output_parsers() {
         // Test simple reasoning content
         assert_msg_equals(
             simple_assist_msg("Hello, world!", "I'm thinking about the answer"),
-            common_chat_parse(
+            test_chat_parse(
                 "<seed:think>I'm thinking about the answer</seed:think>Hello, world!",
                 /* is_partial= */ false,
                 {
@@ -1959,7 +1981,7 @@ static void test_template_output_parsers() {
         msg_budget_reflect.reasoning_content = "Token usage: 45/1000\nI should continue thinking to find the best solution.";
         assert_msg_equals(
             msg_budget_reflect,
-            common_chat_parse(
+            test_chat_parse(
                 "<seed:think>Token usage: 45/1000\nI should continue thinking to find the best solution.</seed:think>"
                 "<seed:cot_budget_reflect>Token usage: 45/1000\nI should continue thinking to find the best solution.</seed:cot_budget_reflect>"
                 "I need to calculate this step by step.",
@@ -1975,7 +1997,7 @@ static void test_template_output_parsers() {
         msg_tool_call.tool_calls.push_back({"calculate_sum", "{\"numbers\": [1, 2, 3]}", ""});
         assert_msg_equals(
             msg_tool_call,
-            common_chat_parse(
+            test_chat_parse(
                 "<seed:tool_call>\n"
                 "<function=calculate_sum>\n"
                 "<parameter=numbers>[1, 2, 3]</parameter>\n"
@@ -1992,7 +2014,7 @@ static void test_template_output_parsers() {
         msg_reasoning_tool.tool_calls.push_back({"calculate_sum", "{\"numbers\": [1, 2, 3]}", ""});
         assert_msg_equals(
             msg_reasoning_tool,
-            common_chat_parse(
+            test_chat_parse(
                 "<seed:think>I need to calculate the sum of these numbers</seed:think>"
                 "<seed:tool_call>\n"
                 "<function=calculate_sum>\n"
@@ -2013,7 +2035,7 @@ static void test_template_output_parsers() {
         std::size_t previousToolCalls = 0;
         for (std::size_t i = std::string("<seed:tool_call>").length(); i < tool_msg.length() - 1; i++) {
             auto partial = tool_msg.substr(0, i);
-            auto partial_res = common_chat_parse(partial, true, { COMMON_CHAT_FORMAT_SEED_OSS, COMMON_REASONING_FORMAT_DEEPSEEK });
+            auto partial_res = test_chat_parse(partial, true, { COMMON_CHAT_FORMAT_SEED_OSS, COMMON_REASONING_FORMAT_DEEPSEEK });
             if (partial_res.tool_calls.size() < previousToolCalls) {
                 throw std::runtime_error("Tool call size decreased on partial: " + partial + " from " + std::to_string(previousToolCalls) + " to " + std::to_string(partial_res.tool_calls.size()));
             }
@@ -2026,7 +2048,7 @@ static void test_template_output_parsers() {
         msg_multi_param.tool_calls.push_back({"process_data", "{\"input\": \"test\", \"format\": \"json\"}", ""});
         assert_msg_equals(
             msg_multi_param,
-            common_chat_parse(
+            test_chat_parse(
                 "<seed:tool_call>\n"
                 "<function=process_data>\n"
                 "<parameter=input>test</parameter>\n"
@@ -2039,7 +2061,7 @@ static void test_template_output_parsers() {
         // Test partial parsing for incomplete tool call - don't actually add the call until parsing parameters is done
         assert_msg_equals(
             simple_assist_msg("", "", "calculate_sum", "{\"numbers\":"),
-            common_chat_parse(
+            test_chat_parse(
                 "<seed:tool_call>\n"
                 "<function=calculate_sum>\n"
                 "<parameter=numbers>[1,\n",
@@ -2049,7 +2071,7 @@ static void test_template_output_parsers() {
         // Test incomplete reasoning tag
         assert_msg_equals(
             simple_assist_msg("", "I was thinking"),
-            common_chat_parse(
+            test_chat_parse(
                 "<seed:think>I was thinking",
                 /* is_partial= */ true,
                 {
@@ -2060,7 +2082,7 @@ static void test_template_output_parsers() {
         // Test content without reasoning
         assert_msg_equals(
             simple_assist_msg("This is a simple response without reasoning."),
-            common_chat_parse(
+            test_chat_parse(
                 "This is a simple response without reasoning.",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_SEED_OSS}));
@@ -2074,14 +2096,14 @@ static void test_template_output_parsers() {
 
         // Test parsing regular content
         assert_msg_equals(message_assist,
-            common_chat_parse(
+            test_chat_parse(
                 "Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_NEMOTRON_V2}));
 
         // Test parsing content with thinking
         assert_msg_equals(message_assist_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -2091,14 +2113,14 @@ static void test_template_output_parsers() {
 
         // Test parsing tool calls
         assert_msg_equals(message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "<TOOLCALL>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]</TOOLCALL>",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_NEMOTRON_V2}));
 
         // Test parsing tool calls with thinking
         assert_msg_equals(message_assist_call_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think><TOOLCALL>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]</TOOLCALL>",
                 /* is_partial= */ false,
                 {
@@ -2108,7 +2130,7 @@ static void test_template_output_parsers() {
 
         // Test tool calls with extra content
         assert_msg_equals(message_assist_call_content,
-            common_chat_parse(
+            test_chat_parse(
                 "<TOOLCALL>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]</TOOLCALL>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_NEMOTRON_V2}
@@ -2116,7 +2138,7 @@ static void test_template_output_parsers() {
 
         // Test tool calls with extra content AND thinking
         assert_msg_equals(message_assist_call_thoughts_content,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think><TOOLCALL>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]</TOOLCALL>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -2149,7 +2171,7 @@ static void test_template_output_parsers() {
         test_templates(tmpls.get(), end_tokens, message_assist_thoughts, tools, "</think>Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
         assert_msg_equals(
             simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking"),
-            common_chat_parse(
+            test_chat_parse(
                 "I'm\nthinking</think>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -2161,7 +2183,7 @@ static void test_template_output_parsers() {
         // variant: thinking forced open, reasoning_format none
         assert_msg_equals(
             simple_assist_msg("REASONING</think>ok", ""),
-            common_chat_parse(
+            test_chat_parse(
                 "REASONING</think>ok",
                 /* is_partial= */ false,
                 {
@@ -2174,7 +2196,7 @@ static void test_template_output_parsers() {
         // variant: happy path for when it works as the model card says it should
         assert_msg_equals(
             simple_assist_msg("", "", "get_time", "{\"city\":\"Tokyo\"}"),
-            common_chat_parse(
+            test_chat_parse(
                 "<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>get_time<｜tool▁sep｜>{\"city\": \"Tokyo\"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>",
                 /* is_partial= */ false,
                 {
@@ -2187,7 +2209,7 @@ static void test_template_output_parsers() {
         // variant: simple + thinking open
         assert_msg_equals(
             simple_assist_msg("", "REASONING", "get_time", "{\"city\":\"Tokyo\"}"),
-            common_chat_parse(
+            test_chat_parse(
                 "REASONING</think><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>get_time<｜tool▁sep｜>{\"city\": \"Tokyo\"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>",
                 /* is_partial= */ false,
                 {
@@ -2205,7 +2227,7 @@ static void test_template_output_parsers() {
         message_assist_multiple_calls.tool_calls.push_back({"get_weather", "{\"city\":\"Paris\"}", ""});
         assert_msg_equals(
             message_assist_multiple_calls,
-            common_chat_parse(
+            test_chat_parse(
                 "CONTENT<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>get_time<｜tool▁sep｜>{\"city\": \"Paris\"}<｜tool▁call▁end｜><｜tool▁call▁begin｜>get_weather<｜tool▁sep｜>{\"city\": \"Paris\"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>",
                 /* is_partial= */ false,
                 {
@@ -2218,7 +2240,7 @@ static void test_template_output_parsers() {
         // variant: thinking forced open + tool call in reasoning content
         assert_msg_equals(
             simple_assist_msg("", "REASONING<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>get_time2<｜tool▁sep｜>{\"city\": \"Tokyo2\"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>REASONING", "get_time", "{\"city\":\"Tokyo\"}"),
-            common_chat_parse(
+            test_chat_parse(
                 "REASONING<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>get_time2<｜tool▁sep｜>{\"city\": \"Tokyo2\"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>REASONING</think><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>get_time<｜tool▁sep｜>{\"city\": \"Tokyo\"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>",
                 /* is_partial= */ false,
                 {
@@ -2234,7 +2256,7 @@ static void test_template_output_parsers() {
         //          add the reasoning content as regular content and parse the tool calls.
         assert_msg_equals(
             simple_assist_msg("REASONING", "", "get_time", "{\"city\":\"Tokyo\"}"),
-            common_chat_parse(
+            test_chat_parse(
                 "REASONING<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>get_time<｜tool▁sep｜>{\"city\": \"Tokyo\"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>",
                 /* is_partial= */ false,
                 {
@@ -2247,7 +2269,7 @@ static void test_template_output_parsers() {
         // variant: thinking forced open + tool call in reasoning content + no closing think + partial
         assert_msg_equals(
             simple_assist_msg("", "REASONING<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>get_time<｜tool▁sep｜>{\"city\": \"Tokyo\"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>", "", ""),
-            common_chat_parse(
+            test_chat_parse(
                 "REASONING<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>get_time<｜tool▁sep｜>{\"city\": \"Tokyo\"}<｜tool▁call▁end｜><｜tool▁calls▁end｜>",
                 /* is_partial= */ true,
                 {
@@ -2260,7 +2282,7 @@ static void test_template_output_parsers() {
         // variant: thinking not forced open + missing reasoning + no tool calls
         assert_msg_equals(
             simple_assist_msg("CONTENT", ""),
-            common_chat_parse(
+            test_chat_parse(
                 "CONTENT",
                 /* is_partial= */ false,
                 {
@@ -2280,14 +2302,14 @@ static void test_template_output_parsers() {
 
         // Test parsing regular content
         assert_msg_equals(message_assist,
-            common_chat_parse(
+            test_chat_parse(
                 "Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_APERTUS}));
 
         // Test parsing content with thinking
         assert_msg_equals(message_assist_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "<|inner_prefix|>I'm\nthinking<|inner_suffix|>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -2297,14 +2319,14 @@ static void test_template_output_parsers() {
 
         // Test parsing tool calls
         assert_msg_equals(message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "<|tools_prefix|>[{\"special_function\": {\"arg1\": 1}}]<|tools_suffix|>",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_APERTUS}));
 
         // Test parsing tool calls with thinking
         assert_msg_equals(message_assist_call_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "<|inner_prefix|>I'm\nthinking<|inner_suffix|><|tools_prefix|>[{\"special_function\": {\"arg1\": 1}}]<|tools_suffix|>",
                 /* is_partial= */ false,
                 {
@@ -2314,7 +2336,7 @@ static void test_template_output_parsers() {
 
         // Test tool calls with extra content
         assert_msg_equals(message_assist_call_content,
-            common_chat_parse(
+            test_chat_parse(
                 "<|tools_prefix|>[{\"special_function\": {\"arg1\": 1}}]<|tools_suffix|>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_APERTUS}
@@ -2322,7 +2344,7 @@ static void test_template_output_parsers() {
 
         // Test tool calls with extra content AND thinking
         assert_msg_equals(message_assist_call_thoughts_content,
-            common_chat_parse(
+            test_chat_parse(
                 "<|inner_prefix|>I'm\nthinking<|inner_suffix|><|tools_prefix|>[{\"special_function\": {\"arg1\": 1}}]<|tools_suffix|>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -2402,7 +2424,7 @@ Hey there!<|im_end|>
 
         // Test parsing regular content
         assert_msg_equals(message_assist,
-            common_chat_parse(
+            test_chat_parse(
                 "Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS}));
@@ -2413,7 +2435,7 @@ Hey there!<|im_end|>
         msg_single_tool_call.tool_calls.push_back({"special_function", "{\"arg1\":1}", ""});
         assert_msg_equals(
             msg_single_tool_call,
-            common_chat_parse(
+            test_chat_parse(
                 "<|tool_call_start|>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]<|tool_call_end|>",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS}));
@@ -2424,7 +2446,7 @@ Hey there!<|im_end|>
         msg_tool_call_string.tool_calls.push_back({"get_weather", "{\"location\":\"Paris\"}", ""});
         assert_msg_equals(
             msg_tool_call_string,
-            common_chat_parse(
+            test_chat_parse(
                 "<|tool_call_start|>[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"Paris\"}}]<|tool_call_end|>",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS}));
@@ -2435,7 +2457,7 @@ Hey there!<|im_end|>
         msg_multi_args.tool_calls.push_back({"calculate", "{\"x\":10,\"y\":20,\"operation\":\"add\"}", ""});
         assert_msg_equals(
             msg_multi_args,
-            common_chat_parse(
+            test_chat_parse(
                 "<|tool_call_start|>[{\"name\": \"calculate\", \"arguments\": {\"x\": 10, \"y\": 20, \"operation\": \"add\"}}]<|tool_call_end|>",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS}));
@@ -2447,7 +2469,7 @@ Hey there!<|im_end|>
         msg_multiple_tools.tool_calls.push_back({"get_time", "{\"timezone\":\"UTC\"}", ""});
         assert_msg_equals(
             msg_multiple_tools,
-            common_chat_parse(
+            test_chat_parse(
                 "<|tool_call_start|>[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"Paris\"}}, {\"name\": \"get_time\", \"arguments\": {\"timezone\": \"UTC\"}}]<|tool_call_end|>",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS}));
@@ -2459,7 +2481,7 @@ Hey there!<|im_end|>
         msg_content_before_tool.tool_calls.push_back({"get_weather", "{\"location\":\"Paris\"}", ""});
         assert_msg_equals(
             msg_content_before_tool,
-            common_chat_parse(
+            test_chat_parse(
                 "Let me check the weather for you.<|tool_call_start|>[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"Paris\"}}]<|tool_call_end|>",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS}));
@@ -2471,7 +2493,7 @@ Hey there!<|im_end|>
         msg_content_after_tool.tool_calls.push_back({"get_weather", "{\"location\":\"Paris\"}", ""});
         assert_msg_equals(
             msg_content_after_tool,
-            common_chat_parse(
+            test_chat_parse(
                 "<|tool_call_start|>[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"Paris\"}}]<|tool_call_end|>Here's the result.",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS}));
@@ -2482,7 +2504,7 @@ Hey there!<|im_end|>
         msg_tool_call_newlines.tool_calls.push_back({"get_current_time", "{\"location\":\"Paris\"}", ""});
         assert_msg_equals(
             msg_tool_call_newlines,
-            common_chat_parse(
+            test_chat_parse(
                 "<|tool_call_start|>[{\n    \"name\": \"get_current_time\",\n    \"arguments\": {\n        \"location\": \"Paris\"\n    }\n}]<|tool_call_end|>",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS}));
@@ -2502,14 +2524,14 @@ Hey there!<|im_end|>
 
         // Test parsing regular content
         assert_msg_equals(message_assist,
-            common_chat_parse(
+            test_chat_parse(
                 "Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_MINIMAX_M2}));
 
         // Test parsing content with thinking
         assert_msg_equals(message_assist_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -2519,14 +2541,14 @@ Hey there!<|im_end|>
 
         // Test parsing tool calls
         assert_msg_equals(message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "<minimax:tool_call><invoke name=\"special_function\"><parameter name=\"arg1\">1</parameter></invoke></minimax:tool_call>",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_MINIMAX_M2}));
 
         // Test parsing tool calls with thinking
         assert_msg_equals(message_assist_call_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think><minimax:tool_call><invoke name=\"special_function\"><parameter name=\"arg1\">1</parameter></invoke></minimax:tool_call>",
                 /* is_partial= */ false,
                 {
@@ -2536,7 +2558,7 @@ Hey there!<|im_end|>
 
         // Test tool calls with extra content
         assert_msg_equals(message_assist_call_content,
-            common_chat_parse(
+            test_chat_parse(
                 "<minimax:tool_call><invoke name=\"special_function\"><parameter name=\"arg1\">1</parameter></invoke></minimax:tool_call>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_MINIMAX_M2}
@@ -2544,7 +2566,7 @@ Hey there!<|im_end|>
 
         // Test tool calls with extra content AND thinking
         assert_msg_equals(message_assist_call_thoughts_content,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think><minimax:tool_call><invoke name=\"special_function\"><parameter name=\"arg1\">1</parameter></invoke></minimax:tool_call>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -2555,25 +2577,25 @@ Hey there!<|im_end|>
         // Test streaming
         test_parser_with_streaming(message_assist_call_thoughts_content,
             "<think>I'm\nthinking\n</think>Hello, world!\nWhat's up?\n<minimax:tool_call><invoke name=\"special_function\"><parameter name=\"arg1\">1</parameter></invoke></minimax:tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {
                     /*  .format = */ COMMON_CHAT_FORMAT_MINIMAX_M2,
                     /*  .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK
             }); });
         test_parser_with_streaming(message_assist_call_thoughts_unparsed,
             "<think>I'm\nthinking</think>\n\n<minimax:tool_call><invoke name=\"special_function\"><parameter name=\"arg1\">1</parameter></invoke></minimax:tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {
                     /*  .format = */ COMMON_CHAT_FORMAT_MINIMAX_M2,
                     /*  .reasoning_format = */ COMMON_REASONING_FORMAT_NONE
             }); });
         test_parser_with_streaming(message_assist_call_thoughts_content,
             "<think>I'm\nthinking\n</think>\n\nHello, world!\nWhat's up?\n\n<minimax:tool_call>\n<invoke name=\"special_function\">\n<parameter name=\"arg1\">1</parameter>\n</invoke>\n</minimax:tool_call>\n",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {
                     /*  .format = */ COMMON_CHAT_FORMAT_MINIMAX_M2,
                     /*  .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK
             }); });
         test_parser_with_streaming(message_assist_call_withopt,
             "<minimax:tool_call>\n<invoke name=\"special_function_with_opt\">\n<parameter name=\"arg1\">1</parameter>\n<parameter name=\"arg2\">2</parameter>\n</invoke>\n</minimax:tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {
                     /*  .format = */ COMMON_CHAT_FORMAT_MINIMAX_M2,
                     /*  .reasoning_format = */ COMMON_REASONING_FORMAT_NONE
             }); });
@@ -2618,14 +2640,14 @@ Hey there!<|im_end|>
 
         // Test parsing regular content
         assert_msg_equals(message_assist,
-            common_chat_parse(
+            test_chat_parse(
                 "Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_GLM_4_5}));
 
         // Test parsing content with thinking
         assert_msg_equals(message_assist_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "\n<think>I'm\nthinking</think>\nHello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -2635,14 +2657,14 @@ Hey there!<|im_end|>
 
         // Test parsing tool calls
         assert_msg_equals(message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "\n<tool_call>special_function\n<arg_key>arg1</arg_key>\n<arg_value>1</arg_value>\n</tool_call>",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_GLM_4_5}), true);
 
         // Test parsing tool calls with thinking
         assert_msg_equals(message_assist_call_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "\n<think>I'm\nthinking</think>\n<tool_call>special_function\n<arg_key>arg1</arg_key>\n<arg_value>1</arg_value>\n</tool_call>",
                 /* is_partial= */ false,
                 {
@@ -2652,7 +2674,7 @@ Hey there!<|im_end|>
 
         // Test tool calls with extra content
         assert_msg_equals(message_assist_call_content,
-            common_chat_parse(
+            test_chat_parse(
                 "\n<tool_call>special_function\n<arg_key>arg1</arg_key>\n<arg_value>1</arg_value>\n</tool_call>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_GLM_4_5}
@@ -2660,7 +2682,7 @@ Hey there!<|im_end|>
 
         // Test tool calls with extra content AND thinking
         assert_msg_equals(message_assist_call_thoughts_content,
-            common_chat_parse(
+            test_chat_parse(
                 "\n<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n<tool_call>special_function\n<arg_key>arg1</arg_key>\n<arg_value>1</arg_value>\n</tool_call>",
                 /* is_partial= */ false,
                 {
@@ -2671,19 +2693,19 @@ Hey there!<|im_end|>
         // Test streaming
         test_parser_with_streaming(message_assist_call_thoughts_content,
             "\n<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n<tool_call>special_function\n<arg_key>arg1</arg_key>\n<arg_value>1</arg_value>\n</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {
                     /*  .format = */ COMMON_CHAT_FORMAT_GLM_4_5,
                     /*  .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK
             }); });
         test_parser_with_streaming(message_assist_call_thoughts_unparsed,
             "\n<think>I'm\nthinking</think>\n\n<tool_call>special_function\n<arg_key>arg1</arg_key>\n<arg_value>1</arg_value>\n</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {
                     /*  .format = */ COMMON_CHAT_FORMAT_GLM_4_5,
                     /*  .reasoning_format = */ COMMON_REASONING_FORMAT_NONE
             }); });
         test_parser_with_streaming(message_assist_call_withopt,
             "\n<think></think>\n<tool_call>special_function_with_opt\n<arg_key>arg1</arg_key>\n<arg_value>1</arg_value>\n<arg_key>arg2</arg_key>\n<arg_value>2</arg_value>\n</tool_call>\n",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {
                     /*  .format = */ COMMON_CHAT_FORMAT_GLM_4_5,
                     /*  .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK
             }); });
@@ -2699,7 +2721,7 @@ Hey there!<|im_end|>
                 "<arg_key>score</arg_key>\n"
                 "<arg_value>95.5</arg_value>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_GLM_4_5}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_GLM_4_5}); });
         test_parser_with_streaming(
                 simple_assist_msg("", "", "web_search", "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}"),
                 "<tool_call>web_search\n"
@@ -2710,18 +2732,18 @@ Hey there!<|im_end|>
                 "<arg_key>type</arg_key>\n"
                 "<arg_value>text</arg_value>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_GLM_4_5}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_GLM_4_5}); });
 
         // Test interleaved thinking
         test_parser_with_streaming(simple_assist_msg("Hello, world!\n\nWhat's up?", "I'm\nthinkingThinking2", "special_function", "{\"arg1\": 1}"),
             "\n<think>I'm\nthinking</think>Hello, world!\n<think>Thinking2</think>What's up?\n<tool_call>special_function\n<arg_key>arg1</arg_key>\n<arg_value>1</arg_value>\n</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {
                     /*  .format = */ COMMON_CHAT_FORMAT_GLM_4_5,
                     /*  .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK
             }); });
         test_parser_with_streaming(simple_assist_msg("\n<think>I'm\nthinking</think>Hello, world!\n<think>Thinking2</think>What's up?", "", "special_function", "{\"arg1\": 1}"),
             "\n<think>I'm\nthinking</think>Hello, world!\n<think>Thinking2</think>What's up?\n<tool_call>special_function\n<arg_key>arg1</arg_key>\n<arg_value>1</arg_value>\n</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {
                     /*  .format = */ COMMON_CHAT_FORMAT_GLM_4_5,
                     /*  .reasoning_format = */ COMMON_REASONING_FORMAT_NONE
             }); });
@@ -2766,14 +2788,14 @@ Hey there!<|im_end|>
 
         // Test parsing regular content
         assert_msg_equals(message_assist,
-            common_chat_parse(
+            test_chat_parse(
                 "Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_KIMI_K2}));
 
         // Test parsing content with thinking
         assert_msg_equals(message_assist_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -2783,14 +2805,14 @@ Hey there!<|im_end|>
 
         // Test parsing tool calls
         assert_msg_equals(message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_KIMI_K2}));
 
         // Test parsing tool calls with thinking
         assert_msg_equals(message_assist_call_thoughts,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think><|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>",
                 /* is_partial= */ false,
                 {
@@ -2800,7 +2822,7 @@ Hey there!<|im_end|>
 
         // Test tool calls with extra content
         assert_msg_equals(message_assist_call_content,
-            common_chat_parse(
+            test_chat_parse(
                 "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {COMMON_CHAT_FORMAT_KIMI_K2}
@@ -2808,7 +2830,7 @@ Hey there!<|im_end|>
 
         // Test tool calls with extra content AND thinking
         assert_msg_equals(message_assist_call_thoughts_content,
-            common_chat_parse(
+            test_chat_parse(
                 "<think>I'm\nthinking</think><|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>Hello, world!\nWhat's up?",
                 /* is_partial= */ false,
                 {
@@ -2819,43 +2841,43 @@ Hey there!<|im_end|>
         // Test streaming
         test_parser_with_streaming(message_assist_call_thoughts_content,
             "<think>I'm\nthinking\n</think>Hello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {
                     /*  .format = */ COMMON_CHAT_FORMAT_KIMI_K2,
                     /*  .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK
             }); });
         test_parser_with_streaming(message_assist_call_thoughts_unparsed,
             "<think>I'm\nthinking</think>\n\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {
                     /*  .format = */ COMMON_CHAT_FORMAT_KIMI_K2,
                     /*  .reasoning_format = */ COMMON_REASONING_FORMAT_NONE
             }); });
         test_parser_with_streaming(message_assist_call_thoughts_content,
             "<think>I'm\nthinking\n</think>\n\nHello, world!\nWhat's up?\n\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>\n",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {
                     /*  .format = */ COMMON_CHAT_FORMAT_KIMI_K2,
                     /*  .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK
             }); });
         test_parser_with_streaming(message_assist_call_withopt,
             "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function_with_opt:0<|tool_call_argument_begin|>{\"arg1\": 1, \"arg2\": 2}<|tool_call_end|><|tool_calls_section_end|>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {
                     /*  .format = */ COMMON_CHAT_FORMAT_KIMI_K2,
                     /*  .reasoning_format = */ COMMON_REASONING_FORMAT_NONE
             }); });
         test_parser_with_streaming(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking", "special_function", "{\"arg1\": \"123456\"}"),
             "<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": \"123456\"}<|tool_call_end|><|tool_calls_section_end|>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {
                     /*  .format = */ COMMON_CHAT_FORMAT_KIMI_K2,
                     /*  .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK
             }); });
         test_parser_with_streaming(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking", "special_function", "{\"arg1\": [1, 2, \"345\", 6]}"),
             "<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": [1, 2, \"345\", 6]}<|tool_call_end|><|tool_calls_section_end|>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {
                     /*  .format = */ COMMON_CHAT_FORMAT_KIMI_K2,
                     /*  .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK
             }); });
         test_parser_with_streaming(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking", "special_function", "{\"arg1\": {\"12\": 34, \"5\": [67, 8], \"9\": \"10\"}}"),
             "<think>I'm\nthinking</think>Hello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": {\"12\": 34, \"5\": [67, 8], \"9\": \"10\"}}<|tool_call_end|><|tool_calls_section_end|>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {
                     /*  .format = */ COMMON_CHAT_FORMAT_KIMI_K2,
                     /*  .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK
             }); });
@@ -2864,19 +2886,19 @@ Hey there!<|im_end|>
                 "<|tool_calls_section_begin|><|tool_call_begin|>functions.complex_function:0<|tool_call_argument_begin|>"
                 "{\"name\": \"John Doe\", \"age\": 30, \"active\": true, \"score\": 95.5}"
                 "<|tool_call_end|><|tool_calls_section_end|>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); });
         test_parser_with_streaming(
                 simple_assist_msg("", "", "web_search", "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}"),
                 "<|tool_calls_section_begin|><|tool_call_begin|>functions.web_search:0<|tool_call_argument_begin|>"
                 "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}"
                 "<|tool_call_end|><|tool_calls_section_end|>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); });
         test_parser_with_streaming(
                 simple_assist_msg("", "", "read_file", "{\"args\": [{\"path\": \"src/providers/ThemeProvider.tsx\"}, {\"path\": \"src/components/Header.tsx\"}, {\"path\": \"src/components/ThemeToggle.tsx\"}, {\"path\": \"src/app/globals.css\"}, {\"path\": \"src/app/layout.tsx\"}]}"),
                 "<|tool_calls_section_begin|><|tool_call_begin|>functions.read_file:0<|tool_call_argument_begin|>"
                 "{\"args\": [{\"path\": \"src/providers/ThemeProvider.tsx\"}, {\"path\": \"src/components/Header.tsx\"}, {\"path\": \"src/components/ThemeToggle.tsx\"}, {\"path\": \"src/app/globals.css\"}, {\"path\": \"src/app/layout.tsx\"}]}"
                 "<|tool_call_end|><|tool_calls_section_end|>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); });
         test_parser_with_streaming(
                 simple_assist_msg(
                         "Let me start by examining the relevant files to understand the current implementation.", "",
@@ -2886,7 +2908,7 @@ Hey there!<|im_end|>
                 "<|tool_calls_section_begin|><|tool_call_begin|>functions.read_file:0<|tool_call_argument_begin|>"
                 "{\"files\":[{\"path\":\"src/app/Partners.tsx\",\"line_ranges\":[\"1-100\"]}]}"
                 "<|tool_call_end|><|tool_calls_section_end|>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); });
         auto multi_tool_msg = simple_assist_msg("Let me call multiple tools.", "I'm thinking.");
         multi_tool_msg.tool_calls.push_back({ "read_file", "{\"files\": [{\"path\": \"src/app/Partners.tsx\", \"line_ranges\": [\"1-100\"]}]}", "" });
         multi_tool_msg.tool_calls.push_back({ "web_search", "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}", "" });
@@ -2908,7 +2930,7 @@ Hey there!<|im_end|>
                 "{\"message\":\"Hello! 👋 🌟 🚀 Testing emojis: 😀😃😄😁 and symbols: ∑∏∆∇\"}"
                 "<|tool_call_end|>"
                 "<|tool_calls_section_end|>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {
                     COMMON_CHAT_FORMAT_KIMI_K2,
                     COMMON_REASONING_FORMAT_DEEPSEEK
             }); });
@@ -2917,7 +2939,7 @@ Hey there!<|im_end|>
                 "<think>I'm thinking<|tool_calls_section_begin|><|tool_call_begin|>functions.complex_function_in_think:0<|tool_call_argument_begin|>"
                 "{\"name\": \"John Doe\", \"age\": 30, \"active\": true, \"score\": 95.5}"
                 "<|tool_call_end|><|tool_calls_section_end|>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {
                     COMMON_CHAT_FORMAT_KIMI_K2,
                     COMMON_REASONING_FORMAT_DEEPSEEK
             }); });
@@ -2926,7 +2948,7 @@ Hey there!<|im_end|>
                 "<think>I'm thinking<|tool_calls_section_begin|><|tool_call_begin|>functions.complex_function_in_think:0<|tool_call_argument_begin|>"
                 "{\"name\": \"John Doe\", \"age\": 30, \"active\": true, \"score\": 95.5}"
                 "<|tool_call_end|><|tool_calls_section_end|>I'm still thinking</think>Hello",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {
                     COMMON_CHAT_FORMAT_KIMI_K2,
                     COMMON_REASONING_FORMAT_DEEPSEEK
             }); });
@@ -3001,7 +3023,7 @@ Hey there!<|im_end|>
         // Basic XML tool call parsing
         assert_msg_equals(
             message_assist_call,
-            common_chat_parse(
+            test_chat_parse(
                 "<tool_call>\n"
                 "  <function=special_function>\n"
                 "    <parameter=arg1>\n"
@@ -3036,7 +3058,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Special characters and Unicode
         common_chat_msg expected_special_chars;
@@ -3053,7 +3075,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Multiline content with newlines and indentation
         common_chat_msg expected_multiline;
@@ -3072,7 +3094,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // JSON object as parameter value
         common_chat_msg expected_json_param;
@@ -3090,7 +3112,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Array as parameter value
         common_chat_msg expected_array_param;
@@ -3108,7 +3130,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Empty parameter
         common_chat_msg expected_empty_param;
@@ -3125,7 +3147,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Boolean values (true/false)
         common_chat_msg expected_boolean;
@@ -3146,7 +3168,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Null value
         common_chat_msg expected_null;
@@ -3164,7 +3186,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Negative numbers and scientific notation
         common_chat_msg expected_numbers;
@@ -3188,7 +3210,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // XML-like content in parameters (should be escaped)
         common_chat_msg expected_xml_content;
@@ -3206,7 +3228,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Quotes and escape characters
         common_chat_msg expected_quotes;
@@ -3224,7 +3246,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Long parameter value (simplified)
         std::string long_text = "This is a long text parameter that should test the parser's ability to handle larger amounts of text data.";
@@ -3244,7 +3266,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Mixed content with text before and after tool call
         common_chat_msg expected_mixed_content;
@@ -3263,7 +3285,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Compact format (no extra whitespace)
         common_chat_msg expected_compact;
@@ -3275,7 +3297,7 @@ Hey there!<|im_end|>
         test_parser_with_streaming(
             expected_compact,
                 "<tool_call><function=compact_function><parameter=param>value</parameter></function></tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Function name with underscores and numbers
         common_chat_msg expected_complex_name;
@@ -3293,7 +3315,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Parameter names with underscores and numbers
         common_chat_msg expected_complex_params;
@@ -3317,7 +3339,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Very deeply nested XML content in parameter
         common_chat_msg expected_deep_xml;
@@ -3335,7 +3357,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Parameter with only whitespace
         common_chat_msg expected_whitespace_param;
@@ -3353,7 +3375,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Parameter with tabs and mixed whitespace
         common_chat_msg expected_mixed_whitespace;
@@ -3373,7 +3395,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Control characters and special Unicode
         common_chat_msg expected_control_chars;
@@ -3391,7 +3413,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Emoji and extended Unicode characters
         common_chat_msg expected_emoji;
@@ -3409,7 +3431,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Mathematical expressions and formulas
         common_chat_msg expected_math;
@@ -3427,7 +3449,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // SQL injection-like content (should be safely escaped)
         common_chat_msg expected_sql;
@@ -3445,7 +3467,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // HTML/XML injection content
         common_chat_msg expected_html;
@@ -3463,7 +3485,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Binary-like content (base64)
         common_chat_msg expected_binary;
@@ -3481,7 +3503,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
 
         // Very large numbers (should be parsed as scientific notation)
         common_chat_msg expected_large_numbers;
@@ -3499,7 +3521,7 @@ Hey there!<|im_end|>
                 "    </parameter>\n"
                 "  </function>\n"
                 "</tool_call>",
-            [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
+            [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); });
     }
 
     {
diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp
index caad29bac..0926e552e 100644
--- a/tools/cli/cli.cpp
+++ b/tools/cli/cli.cpp
@@ -66,21 +66,25 @@ struct cli_context {
         defaults.stream = true; // make sure we always use streaming mode
         defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way
         // defaults.return_progress = true; // TODO: show progress
-        defaults.oaicompat_chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     }
 
     std::string generate_completion(result_timings & out_timings) {
         server_response_reader rd = ctx_server.get_response_reader();
-        auto formatted = format_chat();
+        auto chat_params = format_chat();
         {
             // TODO: reduce some copies here in the future
             server_task task = server_task(SERVER_TASK_TYPE_COMPLETION);
             task.id         = rd.get_new_id();
             task.index      = 0;
-            task.params     = defaults;         // copy
-            task.cli_prompt = formatted.prompt; // copy
-            task.cli_files  = input_files;      // copy
+            task.params     = defaults;           // copy
+            task.cli_prompt = chat_params.prompt; // copy
+            task.cli_files  = input_files;        // copy
             task.cli        = true;
+
+            // chat template settings
+            task.params.chat_parser_params = common_chat_parser_params(chat_params);
+            task.params.chat_parser_params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+
             rd.post_task({std::move(task)});
         }
 
@@ -172,7 +176,6 @@ struct cli_context {
         inputs.use_jinja             = chat_params.use_jinja;
         inputs.parallel_tool_calls   = false;
         inputs.add_generation_prompt = true;
-        inputs.reasoning_format      = chat_params.reasoning_format;
         inputs.enable_thinking       = chat_params.enable_thinking;
 
         // Apply chat template to the list of messages
diff --git a/tools/server/server-common.h b/tools/server/server-common.h
index 7f4c07387..99e9c5e6f 100644
--- a/tools/server/server-common.h
+++ b/tools/server/server-common.h
@@ -274,6 +274,7 @@ std::vector<server_tokens> tokenize_input_prompts(
 // OAI utils
 //
 
+// global server parameters for chat formatting / parsing
 struct server_chat_params {
     bool use_jinja;
     bool prefill_assistant;
diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp
index 35ec7ad2a..2add9667d 100644
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -68,10 +68,10 @@ json task_params::to_json(bool only_metrics) const {
             {"stream",                    stream},
             {"n_probs",                   sampling.n_probs},
             {"min_keep",                  sampling.min_keep},
-            {"chat_format",               common_chat_format_name(oaicompat_chat_syntax.format)},
-            {"reasoning_format",          common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)},
-            {"reasoning_in_content",      oaicompat_chat_syntax.reasoning_in_content},
-            {"thinking_forced_open",      oaicompat_chat_syntax.thinking_forced_open},
+            {"chat_format",               common_chat_format_name(chat_parser_params.format)},
+            {"reasoning_format",          common_reasoning_format_name(chat_parser_params.reasoning_format)},
+            {"reasoning_in_content",      chat_parser_params.reasoning_in_content},
+            {"thinking_forced_open",      chat_parser_params.thinking_forced_open},
             {"samplers",                  samplers},
             {"speculative.n_max",         speculative.n_max},
             {"speculative.n_min",         speculative.n_min},
@@ -127,10 +127,10 @@ json task_params::to_json(bool only_metrics) const {
         {"grammar_lazy",              sampling.grammar_lazy},
         {"grammar_triggers",          grammar_triggers},
         {"preserved_tokens",          sampling.preserved_tokens},
-        {"chat_format",               common_chat_format_name(oaicompat_chat_syntax.format)},
-        {"reasoning_format",          common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)},
-        {"reasoning_in_content",      oaicompat_chat_syntax.reasoning_in_content},
-        {"thinking_forced_open",      oaicompat_chat_syntax.thinking_forced_open},
+        {"chat_format",               common_chat_format_name(chat_parser_params.format)},
+        {"reasoning_format",          common_reasoning_format_name(chat_parser_params.reasoning_format)},
+        {"reasoning_in_content",      chat_parser_params.reasoning_in_content},
+        {"thinking_forced_open",      chat_parser_params.thinking_forced_open},
         {"samplers",                  samplers},
         {"speculative.n_max",         speculative.n_max},
         {"speculative.n_min",         speculative.n_min},
@@ -291,21 +291,21 @@ task_params server_task::params_from_json_cmpl(
     {
         auto it = data.find("chat_format");
         if (it != data.end()) {
-            params.oaicompat_chat_syntax.format = static_cast<common_chat_format>(it->get<int>());
-            SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_syntax.format));
+            params.chat_parser_params.format = static_cast<common_chat_format>(it->get<int>());
+            SRV_INF("Chat format: %s\n", common_chat_format_name(params.chat_parser_params.format));
         } else {
-            params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format;
+            params.chat_parser_params.format = defaults.chat_parser_params.format;
         }
         common_reasoning_format reasoning_format = params_base.reasoning_format;
         if (data.contains("reasoning_format")) {
             reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>());
         }
-        params.oaicompat_chat_syntax.reasoning_format = reasoning_format;
-        params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
-        params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false);
-        params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false);
+        params.chat_parser_params.reasoning_format = reasoning_format;
+        params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
+        params.chat_parser_params.thinking_forced_open = json_value(data, "thinking_forced_open", false);
+        params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false);
         if (data.contains("chat_parser")) {
-            params.oaicompat_chat_syntax.parser.load(data.at("chat_parser").get<std::string>());
+            params.chat_parser_params.parser.load(data.at("chat_parser").get<std::string>());
         }
     }
 
@@ -722,7 +722,7 @@ common_chat_msg task_result_state::update_chat_msg(
     auto new_msg = common_chat_parse(
         generated_text,
         is_partial,
-        oaicompat_chat_syntax);
+        chat_parser_params);
     if (!new_msg.empty()) {
         new_msg.set_tool_call_ids(generated_tool_call_ids, gen_tool_call_id);
         chat_msg = new_msg;
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
index daffe0c90..6835eef50 100644
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -78,7 +78,9 @@ struct task_params {
     task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
     std::string        oaicompat_model;
     std::string        oaicompat_cmpl_id;
-    common_chat_syntax oaicompat_chat_syntax;
+
+    // per-request parameters for chat parsing
+    common_chat_parser_params chat_parser_params;
 
     // Embeddings
     int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm)
@@ -91,7 +93,7 @@ struct task_params {
 struct task_result_state {
     // tracking diffs for partial tool calls
     std::vector<common_chat_msg_diff> diffs;
-    common_chat_syntax oaicompat_chat_syntax;
+    common_chat_parser_params chat_parser_params;
     common_chat_msg chat_msg;
     std::string generated_text; // append new chunks of generated text here
     std::vector<std::string> generated_tool_call_ids;
@@ -100,8 +102,8 @@ struct task_result_state {
     bool anthropic_thinking_block_started = false;
     bool anthropic_text_block_started = false;
 
-    task_result_state(const common_chat_syntax & oaicompat_chat_syntax)
-        : oaicompat_chat_syntax(oaicompat_chat_syntax) {}
+    task_result_state(const common_chat_parser_params & chat_parser_params)
+        : chat_parser_params(chat_parser_params) {}
 
     // parse partial tool calls and update the internal state
     common_chat_msg update_chat_msg(
@@ -230,7 +232,7 @@ struct server_task {
     // the task will be moved into queue, then onto slots
     // however, the state must be kept by caller (e.g., HTTP thread)
     task_result_state create_state() const {
-        return task_result_state(params.oaicompat_chat_syntax);
+        return task_result_state(params.chat_parser_params);
     }
 
     bool is_parent() const {

From 1c7cf94b22a9dc6b1d32422f72a627787a4783a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= <angt@huggingface.co>
Date: Tue, 20 Jan 2026 18:28:43 +0100
Subject: [PATCH 14/17] common, server : use the same User-Agent by default
 (#18957)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit also ensures that if a custom User-Agent is used, it will be
the only one sent.

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
---
 common/common.h                |  2 ++
 common/download.cpp            | 33 +++++++++++++++++++--------------
 tools/server/server-common.cpp |  1 -
 tools/server/server-common.h   |  2 --
 4 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/common/common.h b/common/common.h
index 8247949de..96c990c05 100644
--- a/common/common.h
+++ b/common/common.h
@@ -57,6 +57,8 @@ extern const char * LLAMA_COMMIT;
 extern const char * LLAMA_COMPILER;
 extern const char * LLAMA_BUILD_TARGET;
 
+const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
+
 struct common_control_vector_load_info;
 
 //
diff --git a/common/download.cpp b/common/download.cpp
index a37780421..57f29a23b 100644
--- a/common/download.cpp
+++ b/common/download.cpp
@@ -314,23 +314,26 @@ static bool common_pull_file(httplib::Client & cli,
 
 // download one single file from remote URL to local path
 // returns status code or -1 on error
-static int common_download_file_single_online(const std::string & url,
-                                               const std::string & path,
-                                               const std::string & bearer_token,
-                                               const common_header_list & custom_headers) {
+static int common_download_file_single_online(const std::string        & url,
+                                              const std::string        & path,
+                                              const std::string        & bearer_token,
+                                              const common_header_list & custom_headers) {
     static const int max_attempts        = 3;
     static const int retry_delay_seconds = 2;
 
     auto [cli, parts] = common_http_client(url);
 
-    httplib::Headers default_headers = {{"User-Agent", "llama-cpp"}};
-    if (!bearer_token.empty()) {
-        default_headers.insert({"Authorization", "Bearer " + bearer_token});
-    }
+    httplib::Headers headers;
     for (const auto & h : custom_headers) {
-        default_headers.emplace(h.first, h.second);
+        headers.emplace(h.first, h.second);
     }
-    cli.set_default_headers(default_headers);
+    if (headers.find("User-Agent") == headers.end()) {
+        headers.emplace("User-Agent", "llama-cpp/" + build_info);
+    }
+    if (!bearer_token.empty()) {
+        headers.emplace("Authorization", "Bearer " + bearer_token);
+    }
+    cli.set_default_headers(headers);
 
     const bool file_exists = std::filesystem::exists(path);
 
@@ -437,10 +440,12 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string
                                                              const common_remote_params & params) {
     auto [cli, parts] = common_http_client(url);
 
-    httplib::Headers headers = {{"User-Agent", "llama-cpp"}};
-
-    for (const auto & header : params.headers) {
-        headers.emplace(header.first, header.second);
+    httplib::Headers headers;
+    for (const auto & h : params.headers) {
+        headers.emplace(h.first, h.second);
+    }
+    if (headers.find("User-Agent") == headers.end()) {
+        headers.emplace("User-Agent", "llama-cpp/" + build_info);
     }
 
     if (params.timeout > 0) {
diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
index 1bbe85322..4aeeda2ff 100644
--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -779,7 +779,6 @@ static void handle_media(
         // download remote image
         // TODO @ngxson : maybe make these params configurable
         common_remote_params params;
-        params.headers.push_back({"User-Agent", "llama.cpp/" + build_info});
         params.max_size = 1024 * 1024 * 10; // 10MB
         params.timeout  = 10; // seconds
         SRV_INF("downloading image from '%s'\n", url.c_str());
diff --git a/tools/server/server-common.h b/tools/server/server-common.h
index 99e9c5e6f..a88d40494 100644
--- a/tools/server/server-common.h
+++ b/tools/server/server-common.h
@@ -13,8 +13,6 @@
 #include <vector>
 #include <cinttypes>
 
-const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
-
 using json = nlohmann::ordered_json;
 
 #define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? (slot).task->id : -1), __VA_ARGS__)

From 5bd341c9a135a13f901c4cacacc27fa5b299ce19 Mon Sep 17 00:00:00 2001
From: Oliver Simons <osimons@nvidia.com>
Date: Wed, 21 Jan 2026 02:34:29 +0100
Subject: [PATCH 15/17] CUDA: Fix builds for older CCCL versions by ifdefing
 strided_iterator (#18964)

* CUDA: Fix builds for older CCCL versions by ifdefing strided_iterator

Strided iterator was added in [CCCL
3.1](https://github.com/NVIDIA/cccl/releases/tag/v3.1.0), which is packaged into
[CTK
13.1](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#id5)

* Unindent as per code review request
---
 ggml/src/ggml-cuda/argsort.cu | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu
index cf7a44f7a..4896669c3 100644
--- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu
@@ -2,6 +2,9 @@
 
 #ifdef GGML_CUDA_USE_CUB
 #    include <cub/cub.cuh>
+#    if (CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 1)
+#        define STRIDED_ITERATOR_AVAILABLE
+#    endif
 using namespace cub;
 #endif  // GGML_CUDA_USE_CUB
 
@@ -14,6 +17,14 @@ static __global__ void init_indices(int * indices, const int ncols, const int nr
     }
 }
 
+#ifndef STRIDED_ITERATOR_AVAILABLE
+static __global__ void init_offsets(int * offsets, const int ncols, const int nrows) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx <= nrows) {
+        offsets[idx] = idx * ncols;
+    }
+}
+#endif  // STRIDED_ITERATOR_AVAILABLE
 
 #ifdef GGML_CUDA_USE_CUB
 void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
@@ -33,8 +44,14 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
     const dim3 grid_size((ncols + block_size - 1) / block_size, nrows);
     init_indices<<<grid_size, block_size, 0, stream>>>(temp_indices, ncols, nrows);
 
+#ifdef STRIDED_ITERATOR_AVAILABLE
     auto offset_iterator = cuda::make_strided_iterator(cuda::make_counting_iterator(0), ncols);
-
+#else
+    ggml_cuda_pool_alloc<int> offsets_alloc(pool, nrows + 1);
+    int *                     offset_iterator = offsets_alloc.get();
+    const dim3                offset_grid((nrows + block_size - 1) / block_size);
+    init_offsets<<<offset_grid, block_size, 0, stream>>>(offset_iterator, ncols, nrows);
+#endif
     CUDA_CHECK(cudaMemcpyAsync(temp_keys, x, ncols * nrows * sizeof(float), cudaMemcpyDeviceToDevice, stream));
 
     size_t temp_storage_bytes = 0;

From 37c35f0e1c625831687b146cbb0a57654ef88ca2 Mon Sep 17 00:00:00 2001
From: Matthieu Coudron <886074+teto@users.noreply.github.com>
Date: Wed, 21 Jan 2026 07:52:46 +0100
Subject: [PATCH 16/17] gguf: display strerrno when cant load a model (#18884)

I've had issues loading models with llama-server:
[44039] E gguf_init_from_file: failed to open GGUF file 'mistral-7b-v0.1.Q8_0.gguf'

and I was sure it could access the file. Seems like --models-dir and
--models-presets dont interact like I thought they would but I salvaged
this snippet that helps troubleshooting
[44039] E gguf_init_from_file: failed to open GGUF file 'mistral-7b-v0.1.Q8_0.gguf' (errno No such file or directory)
---
 ggml/src/gguf.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp
index b165d8bdc..bfab5c4d6 100644
--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@@ -734,7 +734,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     FILE * file = ggml_fopen(fname, "rb");
 
     if (!file) {
-        GGML_LOG_ERROR("%s: failed to open GGUF file '%s'\n", __func__, fname);
+        GGML_LOG_ERROR("%s: failed to open GGUF file '%s' (%s)\n", __func__, fname, strerror(errno));
         return nullptr;
     }
 

From 12a4a47e6aaf691492644c39da453745aaee1672 Mon Sep 17 00:00:00 2001
From: "Piotr Wilkin (ilintar)" <piotr.wilkin@syndatis.com>
Date: Wed, 21 Jan 2026 12:35:20 +0100
Subject: [PATCH 17/17] Fix GLM 4.7 Lite MoE gating func (#18980)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Fix GLM 4.7 MoE gating func

* Update src/models/deepseek2.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
---
 src/llama-model.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 94c47dc24..255289b7c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1713,7 +1713,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
                     // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
                     // that have no expert_gating_func model parameter set
-                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
+                    if ((hparams.n_layer == 47 || hparams.n_layer == 48) && n_vocab == 154880) {
+                        // GLM 4.7 Lite
+                        hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+                    } else {
+                        hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
+                    }
                 }
 
                 if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {