From 287a33017b32600bfc0e81feeb0ad6e81e0dd484 Mon Sep 17 00:00:00 2001 From: Julius Tischbein Date: Sun, 18 Jan 2026 17:35:57 +0100 Subject: [PATCH 01/17] llama : Extend fallback, fix fileno for dio file, exclude case that mmap uses dio file (#18887) --- src/llama-mmap.cpp | 6 +++++- src/llama-model-loader.cpp | 18 ++++++++++++------ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index fe0847fe1..0261e4c72 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -265,7 +265,8 @@ struct llama_file::impl { continue; // Interrupted by signal, retry } // Fallback to std::fread in case the DMA controller cannot access the buffer - if (errno == EFAULT) { + if (errno == EFAULT || errno == EINVAL) { + LLAMA_LOG_WARN("%s: Falling back to buffered IO due to %s\n", __func__, strerror(errno)); auto curr_off = tell(); close(fd); fd = -1; @@ -384,6 +385,9 @@ int llama_file::file_id() const { #ifdef _WIN32 return _fileno(pimpl->fp); #else + if (pimpl->fd != -1) { + return pimpl->fd; + } #if defined(fileno) return fileno(pimpl->fp); #else diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 300a322c5..383b8dc76 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -539,12 +539,18 @@ llama_model_loader::llama_model_loader( files.emplace_back(new llama_file(fname.c_str(), "rb", use_direct_io)); contexts.emplace_back(ctx); - use_direct_io = use_direct_io && files.back()->has_direct_io(); - - // Disable mmap in case Direct I/O is enabled and available - if (use_direct_io && use_mmap) { - use_mmap = false; - LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__); + if (use_mmap && use_direct_io) { + if (files.back()->has_direct_io()) { + // Disable mmap, as DirectIO is available + use_mmap = false; + LLAMA_LOG_WARN("%s: direct I/O is enabled, disabling mmap\n", __func__); + } else { + // Disable DirectIO and reopen file using std::fopen for mmap + use_direct_io = 
false; + files.pop_back(); + files.emplace_back(new llama_file(fname.c_str(), "rb", false)); + LLAMA_LOG_WARN("%s: direct I/O is not available, using mmap\n", __func__); + } } // Save tensors data offset of the main file. From 3d55846a5c626e2e608db8c24fa9ee6defaacca9 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Mon, 19 Jan 2026 13:12:38 +0100 Subject: [PATCH 02/17] model-conversion : add BUILD_DIR variable to run-converted-model scripts (#18927) This commit adds a BUILD_DIR variable to the scripts used for running converted models. The motivation for this is that currently the `build` directory is hardcoded and it can be useful to specify a different build directory, with builds for different configurations. --- .../causal/run-converted-model-embeddings-logits.sh | 9 +++++++-- .../scripts/causal/run-converted-model.sh | 11 ++++++++--- .../scripts/embedding/run-converted-model.sh | 5 +++-- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh b/examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh index 3cce3fc94..1b5ff8611 100755 --- a/examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh +++ b/examples/model-conversion/scripts/causal/run-converted-model-embeddings-logits.sh @@ -4,6 +4,7 @@ set -e # First try command line argument, then environment variable, then file CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}" +BUILD_DIR="${2:-"$BUILD_DIR"}" # Final check if we have a model path if [ -z "$CONVERTED_MODEL" ]; then @@ -13,6 +14,10 @@ if [ -z "$CONVERTED_MODEL" ]; then exit 1 fi -cmake --build ../../build --target llama-debug -j8 +if [ -z "$BUILD_DIR" ]; then + BUILD_DIR="../../build" +fi -../../build/bin/llama-debug -m $CONVERTED_MODEL --embedding -p "Hello world today" --save-logits +cmake --build ${BUILD_DIR} --target llama-debug -j8 + +${BUILD_DIR}/bin/llama-debug -m $CONVERTED_MODEL --embedding -p "Hello world 
today" --save-logits diff --git a/examples/model-conversion/scripts/causal/run-converted-model.sh b/examples/model-conversion/scripts/causal/run-converted-model.sh index b6c3d3866..b684804e0 100755 --- a/examples/model-conversion/scripts/causal/run-converted-model.sh +++ b/examples/model-conversion/scripts/causal/run-converted-model.sh @@ -5,11 +5,16 @@ set -e # First try command line argument, then environment variable, then file CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}" MODEL_TESTING_PROMPT="${2:-"$MODEL_TESTING_PROMPT"}" +BUILD_DIR="${3:-"$BUILD_DIR"}" -if [ -z "$MODEL_TESTING_PROMPT"]; then +if [ -z "$MODEL_TESTING_PROMPT" ]; then MODEL_TESTING_PROMPT="Hello, my name is" fi +if [ -z "$BUILD_DIR" ]; then + BUILD_DIR="../../build" +fi + # Final check if we have a model path if [ -z "$CONVERTED_MODEL" ]; then echo "Error: Model path must be provided either as:" >&2 @@ -21,6 +26,6 @@ fi echo $CONVERTED_MODEL echo $MODEL_TESTING_PROMPT -cmake --build ../../build --target llama-debug -j8 +cmake --build ${BUILD_DIR} --target llama-debug -j8 -../../build/bin/llama-debug -m "$CONVERTED_MODEL" -p "$MODEL_TESTING_PROMPT" --save-logits +${BUILD_DIR}/bin/llama-debug -m "$CONVERTED_MODEL" -p "$MODEL_TESTING_PROMPT" --save-logits diff --git a/examples/model-conversion/scripts/embedding/run-converted-model.sh b/examples/model-conversion/scripts/embedding/run-converted-model.sh index 84625cec3..ba8a3afae 100755 --- a/examples/model-conversion/scripts/embedding/run-converted-model.sh +++ b/examples/model-conversion/scripts/embedding/run-converted-model.sh @@ -28,6 +28,7 @@ done # First try command line argument, then environment variable CONVERTED_MODEL="${CONVERTED_MODEL:-"$CONVERTED_EMBEDDING_MODEL"}" +BUILD_DIR="${BUILD_DIR:-"../../build"}" # Final check if we have a model path if [ -z "$CONVERTED_MODEL" ]; then @@ -50,5 +51,5 @@ fi echo $CONVERTED_MODEL -cmake --build ../../build --target llama-debug -j8 -../../build/bin/llama-debug -m "$CONVERTED_MODEL" --embedding -p 
"$PROMPT" --save-logits --embd-normalize $EMBD_NORMALIZE +cmake --build ${BUILD_DIR} --target llama-debug -j8 +${BUILD_DIR}/bin/llama-debug -m "$CONVERTED_MODEL" --embedding -p "$PROMPT" --save-logits --embd-normalize $EMBD_NORMALIZE From 365a3e8c319ddb5442afdf17d0d23dfa0ff26c78 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 19 Jan 2026 20:03:19 +0200 Subject: [PATCH 03/17] ggml : add ggml_build_forward_select (#18550) * ggml : add ggml_build_forward_select * cuda : adapt CUDA graph compat to new feature * vulkan : update logic to handle command buffer closing * ggml : check compute for fusion * ggml : add comment --- ggml/include/ggml.h | 46 ++++++++++++++--- ggml/src/ggml-backend.cpp | 5 +- ggml/src/ggml-blas/ggml-blas.cpp | 4 ++ ggml/src/ggml-cann/ggml-cann.cpp | 4 ++ ggml/src/ggml-cpu/ggml-cpu.c | 4 ++ ggml/src/ggml-cuda/common.cuh | 1 + ggml/src/ggml-cuda/ggml-cuda.cu | 8 +++ ggml/src/ggml-hexagon/ggml-hexagon.cpp | 4 ++ ggml/src/ggml-impl.h | 3 ++ ggml/src/ggml-metal/ggml-metal-ops.cpp | 4 ++ ggml/src/ggml-opencl/ggml-opencl.cpp | 4 ++ ggml/src/ggml-sycl/ggml-sycl.cpp | 3 ++ ggml/src/ggml-vulkan/ggml-vulkan.cpp | 5 +- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 3 ++ ggml/src/ggml-zdnn/ggml-zdnn.cpp | 4 ++ ggml/src/ggml-zendnn/ggml-zendnn.cpp | 4 ++ ggml/src/ggml.c | 70 +++++++++++++++++++------- 17 files changed, 148 insertions(+), 28 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index b69583dd3..1988d16dc 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -630,10 +630,11 @@ extern "C" { // this tensor... 
enum ggml_tensor_flag { - GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph - GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph - GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters - GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up) + GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph + GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph + GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters + GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up) + GGML_TENSOR_FLAG_COMPUTE = 16, // ...must be computed }; enum ggml_tri_type { @@ -2577,11 +2578,42 @@ extern "C" { struct ggml_tensor * grad, struct ggml_tensor * sgd_params); // alpha, weight decay + // build forward multiple tensors and select one of them for computing + // this is useful for creating graphs that have constant topology but compute different things based on the input + // ref: https://github.com/ggml-org/llama.cpp/pull/18550 // - // automatic differentiation + // nodes: + // | - build forward into the graph but do not compute + // c - build forward into the graph and compute // + // | | ... c ... | + // | | ... c ... | + // | | ... c ... | + // [0 1 ... idx ... 
n-1] <-- ggml_build_forward_select(..., n, idx) + // c + // c + // + // example: + // struct ggml_tensor * curs[3]; + // + // curs[0] = compute0(...); + // curs[1] = compute1(...); + // curs[2] = compute2(...); + // + // int idx = select_branch(some_input); + // + // struct ggml_tensor * out = ggml_build_forward_select(cgraph, curs, 3, idx); + // + GGML_API struct ggml_tensor * ggml_build_forward_select( + struct ggml_cgraph * cgraph, + struct ggml_tensor ** tensors, + int n_tensors, + int idx); + + GGML_API void ggml_build_forward_expand( + struct ggml_cgraph * cgraph, + struct ggml_tensor * tensor); - GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API void ggml_build_backward_expand( struct ggml_context * ctx, // context for gradient computation struct ggml_cgraph * cgraph, @@ -2613,7 +2645,7 @@ extern "C" { GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph); // dump the graph into a file using the dot format - GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); + GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * cgraph, const char * filename); // TODO these functions were sandwiched in the old optimization interface, is there a better place for them? 
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 1b59924b8..354876574 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -874,9 +874,9 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str } if (sched->debug > 1) { ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node); - GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d:", i, ggml_op_name(node->op), node->name, + GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d,c=%d:", i, ggml_op_name(node->op), node->name, fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node), - graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)]); + graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)], node->flags & GGML_TENSOR_FLAG_COMPUTE ? 
1 : 0); for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { @@ -1922,6 +1922,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, dst->view_offs = src->view_offs; } dst->op = src->op; + dst->flags = src->flags; memcpy(dst->op_params, src->op_params, sizeof(dst->op_params)); ggml_set_name(dst, src->name); diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index 84956cbb9..2e9ddf224 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -226,6 +226,10 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { + continue; + } + switch (node->op) { case GGML_OP_MUL_MAT: ggml_backend_blas_mul_mat(ctx, node); diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index eba83327f..42c6c67a4 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2146,6 +2146,10 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx continue; } + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { + continue; + } + bool ok = ggml_cann_compute_forward(*cann_ctx, node); if (!ok) { GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index f7ba1fe31..4c7a75e76 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2943,6 +2943,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { continue; } + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { + continue; + } + ggml_compute_forward(¶ms, node); if (state->ith == 0 && cplan->abort_callback && diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index eaaf87612..179522d83 
100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1123,6 +1123,7 @@ struct ggml_tensor_extra_gpu { struct ggml_cuda_graph_node_properties { void * node_address; ggml_op node_op; + int32_t flags; int64_t ne[GGML_MAX_DIMS]; size_t nb[GGML_MAX_DIMS]; void * src_address[GGML_MAX_SRC]; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index ed1021469..cda422def 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2918,6 +2918,7 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) { static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties * props, ggml_tensor * node) { props->node_address = node->data; props->node_op = node->op; + props->flags = node->flags; for (int i = 0; i < GGML_MAX_DIMS; i++) { props->ne[i] = node->ne[i]; props->nb[i] = node->nb[i]; @@ -2961,6 +2962,10 @@ static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_ return false; } + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) != (props->flags & GGML_TENSOR_FLAG_COMPUTE)) { + return false; + } + return true; } @@ -3378,6 +3383,9 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud continue; } + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { + continue; + } // start of fusion operations static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr); diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index cf1eb994c..5b835c11c 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2497,6 +2497,10 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg continue; } + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { + continue; + } + uint32_t flags = 0; // skip quantizer if src1 is reused diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 80e0fd2ff..baadfe9a7 100644 --- 
a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -611,6 +611,9 @@ static inline bool ggml_can_fuse_ext(const struct ggml_cgraph * cgraph, const in if (node->op != ops[i]) { return false; } + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { + return false; + } if (i < num_ops - 1 && !ggml_node_has_n_uses(cgraph, node_idxs[i], 1)) { return false; } diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index 680ad794d..3d97d3dfd 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -203,6 +203,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) { GGML_ABORT("unsupported op"); } + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { + return 1; + } + int n_fuse = 1; // check if the current node can run concurrently with other nodes before it diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index d89d5e724..8059240b1 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -3058,6 +3058,10 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm continue; } + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { + continue; + } + if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_NORM, GGML_OP_MUL, GGML_OP_ADD })) { ggml_opencl_op_norm_fused(backend, node, cgraph->nodes[i+1], cgraph->nodes[i+2]); i += 2; diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 8f8176b67..bb8acc922 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4109,6 +4109,9 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * syc if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) 
== 0) { + continue; + } #ifndef NDEBUG assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device)); for (int j = 0; j < GGML_MAX_SRC; j++) { diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 0fabbcec3..08fd044ca 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -12191,6 +12191,9 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr if (ggml_is_empty(node) || ggml_op_is_empty(node->op) || !node->buffer) { return false; } + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { + return false; + } VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")"); ctx->semaphore_idx = 0; @@ -13645,7 +13648,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg int last_node = cgraph->n_nodes - 1; // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly - while (last_node > 0 && ggml_vk_is_empty(cgraph->nodes[last_node])) { + while (last_node > 0 && (ggml_vk_is_empty(cgraph->nodes[last_node]) || ((cgraph->nodes[last_node]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0))) { last_node -= 1; } diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 1470378af..584cea769 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -1982,6 +1982,9 @@ static std::optional ggml_webgpu_encode_node(webgpu_context ctx, if (ggml_is_empty(node)) { return std::nullopt; } + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { + return std::nullopt; + } WEBGPU_LOG_DEBUG("ggml_webgpu_encode_node(" << node << ", " << ggml_op_name(node->op) << ")"); ggml_tensor * src0 = node->src[0]; diff --git a/ggml/src/ggml-zdnn/ggml-zdnn.cpp b/ggml/src/ggml-zdnn/ggml-zdnn.cpp index edbeb8eef..906d25417 100644 --- a/ggml/src/ggml-zdnn/ggml-zdnn.cpp +++ b/ggml/src/ggml-zdnn/ggml-zdnn.cpp @@ -58,6 +58,10 @@ 
static enum ggml_status ggml_zdnn_graph_compute(ggml_backend_t backend, ggml_cgr continue; } + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { + continue; + } + bool ok = ggml_zdnn_compute_forward(ctx, node); if (!ok) { GGML_LOG_ERROR("%s: unsupported op %s (%s)\n", diff --git a/ggml/src/ggml-zendnn/ggml-zendnn.cpp b/ggml/src/ggml-zendnn/ggml-zendnn.cpp index fd07f983d..afbecde7a 100644 --- a/ggml/src/ggml-zendnn/ggml-zendnn.cpp +++ b/ggml/src/ggml-zendnn/ggml-zendnn.cpp @@ -211,6 +211,10 @@ static ggml_status ggml_backend_zendnn_graph_compute(ggml_backend_t backend, ggm for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { + continue; + } + switch (node->op) { case GGML_OP_MUL_MAT: ggml_zendnn_compute_forward_mul_mat(ctx, node); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index c75fe7d27..1725ad165 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3441,7 +3441,8 @@ struct ggml_tensor * ggml_cast( result->op = GGML_OP_CPY; result->src[0] = a; - result->src[1] = result; + result->src[1] = result; // note: this self-reference might seem redundant, but it's actually needed by some + // backends for consistency with ggml_cpy_impl() above return result; } @@ -6725,20 +6726,35 @@ static void ggml_compute_backward( GGML_ASSERT(!src2_needs_grads || ggml_are_same_shape(src2, cgraph->grads[isrc2])); } -static size_t ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) { - // check if already visited - size_t node_hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node); +static size_t ggml_visit_parents_graph(struct ggml_cgraph * cgraph, struct ggml_tensor * node, bool compute) { + if (node->op != GGML_OP_NONE && compute) { + node->flags |= GGML_TENSOR_FLAG_COMPUTE; + } + + const size_t node_hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node); GGML_ASSERT(node_hash_pos != GGML_HASHSET_FULL); - if 
(!ggml_bitset_get(cgraph->visited_hash_set.used, node_hash_pos)) { - // This is the first time we see this node in the current graph. - cgraph->visited_hash_set.keys[node_hash_pos] = node; - ggml_bitset_set(cgraph->visited_hash_set.used, node_hash_pos); - cgraph->use_counts[node_hash_pos] = 0; - } else { + + if (ggml_bitset_get(cgraph->visited_hash_set.used, node_hash_pos)) { // already visited + + if (compute) { + // update the compute flag regardless + for (int i = 0; i < GGML_MAX_SRC; ++i) { + struct ggml_tensor * src = node->src[i]; + if (src && ((src->flags & GGML_TENSOR_FLAG_COMPUTE) == 0)) { + ggml_visit_parents_graph(cgraph, src, true); + } + } + } + return node_hash_pos; } + // This is the first time we see this node in the current graph. + cgraph->visited_hash_set.keys[node_hash_pos] = node; + ggml_bitset_set(cgraph->visited_hash_set.used, node_hash_pos); + cgraph->use_counts[node_hash_pos] = 0; + for (int i = 0; i < GGML_MAX_SRC; ++i) { const int k = (cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i : @@ -6747,7 +6763,7 @@ static size_t ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor struct ggml_tensor * src = node->src[k]; if (src) { - size_t src_hash_pos = ggml_visit_parents(cgraph, src); + const size_t src_hash_pos = ggml_visit_parents_graph(cgraph, src, compute); // Update the use count for this operand. 
cgraph->use_counts[src_hash_pos]++; @@ -6778,17 +6794,17 @@ static size_t ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor return node_hash_pos; } -static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) { +static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand, bool compute) { if (!expand) { // TODO: this branch isn't accessible anymore, maybe move this to ggml_build_forward_expand ggml_graph_clear(cgraph); } - const int n0 = cgraph->n_nodes; + const int n_old = cgraph->n_nodes; - ggml_visit_parents(cgraph, tensor); + ggml_visit_parents_graph(cgraph, tensor, compute); - const int n_new = cgraph->n_nodes - n0; + const int n_new = cgraph->n_nodes - n_old; GGML_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new); if (n_new > 0) { @@ -6797,8 +6813,22 @@ static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_ten } } +struct ggml_tensor * ggml_build_forward_select( + struct ggml_cgraph * cgraph, + struct ggml_tensor ** tensors, + int n_tensors, + int idx) { + GGML_ASSERT(idx >= 0 && idx < n_tensors); + + for (int i = 0; i < n_tensors; i++) { + ggml_build_forward_impl(cgraph, tensors[i], true, i == idx ? 
true : false); + } + + return tensors[idx]; +} + void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) { - ggml_build_forward_impl(cgraph, tensor, true); + ggml_build_forward_impl(cgraph, tensor, true, true); } void ggml_build_backward_expand( @@ -7229,6 +7259,10 @@ bool ggml_can_fuse_subgraph_ext(const struct ggml_cgraph * cgraph, return false; } + if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { + return false; + } + if (ggml_node_list_find_tensor(cgraph, outputs, num_outputs, node) != -1) { continue; } @@ -7310,7 +7344,7 @@ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, label); } -void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) { +void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * cgraph, const char * filename) { char color[16]; FILE * fp = ggml_fopen(filename, "w"); @@ -7331,7 +7365,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph if (node->flags & GGML_TENSOR_FLAG_PARAM) { snprintf(color, sizeof(color), "yellow"); } else if (grad) { - if (ggml_graph_find(gf, node)) { + if (ggml_graph_find(cgraph, node)) { snprintf(color, sizeof(color), "green"); } else { snprintf(color, sizeof(color), "lightblue"); From 18361c579cc7ed0a06ed9085eaf900326b537fa7 Mon Sep 17 00:00:00 2001 From: Lennart Austenfeld <53152202+l-austenfeld@users.noreply.github.com> Date: Mon, 19 Jan 2026 19:13:31 +0100 Subject: [PATCH 04/17] server: fix memory reservations in populate_token_probs (#18787) --- tools/server/server-context.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 82294d940..c790ac79e 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -1326,11 +1326,12 @@ private: } void populate_token_probs(const server_slot & slot, completion_token_output & 
result, bool post_sampling, bool special, int idx) const { - const size_t n_probs = slot.task->params.sampling.n_probs; + const size_t n_probs_request = slot.task->params.sampling.n_probs; if (post_sampling) { const auto * cur_p = common_sampler_get_candidates(slot.smpl.get(), true); const size_t max_probs = cur_p->size; + const size_t n_probs = std::min(max_probs, n_probs_request); // set probability for sampled token for (size_t i = 0; i < max_probs; i++) { @@ -1341,8 +1342,8 @@ private: } // set probability for top n_probs tokens - result.probs.reserve(max_probs); - for (size_t i = 0; i < std::min(max_probs, n_probs); i++) { + result.probs.reserve(n_probs); + for (size_t i = 0; i < n_probs; i++) { result.probs.push_back({ cur_p->data[i].id, common_token_to_piece(ctx, cur_p->data[i].id, special), @@ -1352,9 +1353,11 @@ private: } else { // TODO: optimize this with min-p optimization std::vector cur = get_token_probabilities(ctx, idx); + const size_t max_probs = cur.size(); + const size_t n_probs = std::min(max_probs, n_probs_request); // set probability for sampled token - for (size_t i = 0; i < cur.size(); i++) { + for (size_t i = 0; i < max_probs; i++) { // set probability for sampled token if (cur[i].id == result.tok) { result.prob = cur[i].p; @@ -1364,7 +1367,7 @@ private: // set probability for top n_probs tokens result.probs.reserve(n_probs); - for (size_t i = 0; i < std::min(cur.size(), n_probs); i++) { + for (size_t i = 0; i < n_probs; i++) { result.probs.push_back({ cur[i].id, common_token_to_piece(ctx, cur[i].id, special), From 4037093c66bfe53b9b27d72765416815fdb0398f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Mon, 19 Jan 2026 20:29:15 +0100 Subject: [PATCH 05/17] ci : run test-jinja -py on high perf [no ci] (#18916) --- ci/run.sh | 2 +- tests/CMakeLists.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/run.sh b/ci/run.sh index 6ca6ea566..dfcf95966 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ 
-254,7 +254,7 @@ function gg_run_ctest_release { (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log if [ -z ${GG_BUILD_LOW_PERF} ]; then - (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log + (time ctest --output-on-failure -L 'main|python' ) 2>&1 | tee -a $OUT/${ci}-ctest.log else (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log fi diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 3eae18eef..c9436c599 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -187,6 +187,7 @@ llama_build_and_test(test-chat-parser.cpp) llama_build_and_test(test-chat-peg-parser.cpp peg-parser/simple-tokenize.cpp) llama_build_and_test(test-chat-template.cpp) llama_build_and_test(test-jinja.cpp) +llama_test(test-jinja NAME test-jinja-py ARGS -py LABEL python) llama_build_and_test(test-json-partial.cpp) llama_build_and_test(test-log.cpp) llama_build_and_test( From 959ecf7f234dc0bc0cd6829b25cb0ee1481aa78a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Mon, 19 Jan 2026 20:29:43 +0100 Subject: [PATCH 06/17] jinja : fix undefined keys and attributes and int/float as bool (#18924) * fix undefined keys and attributes * add falsy tests * as_bool for integers and floats * more falsy/truthy tests * --typo --- common/jinja/runtime.cpp | 4 +-- common/jinja/value.h | 6 ++++ tests/test-jinja.cpp | 78 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 2 deletions(-) diff --git a/common/jinja/runtime.cpp b/common/jinja/runtime.cpp index d8ef27908..e3e4ebf1e 100644 --- a/common/jinja/runtime.cpp +++ b/common/jinja/runtime.cpp @@ -805,7 +805,7 @@ value member_expression::execute_impl(context & ctx) { } else if (is_val(property)) { auto key = property->as_string().str(); JJ_DEBUG("Accessing %s built-in '%s'", is_val(object) ? 
"array" : "string", key.c_str()); - val = try_builtin_func(ctx, key, object); + val = try_builtin_func(ctx, key, object, true); } else { throw std::runtime_error("Cannot access property with non-string/non-number: got " + property->type()); } @@ -814,7 +814,7 @@ value member_expression::execute_impl(context & ctx) { throw std::runtime_error("Cannot access property with non-string: got " + property->type()); } auto key = property->as_string().str(); - val = try_builtin_func(ctx, key, object); + val = try_builtin_func(ctx, key, object, true); } if (ctx.is_get_stats && val && object && property) { diff --git a/common/jinja/value.h b/common/jinja/value.h index 4e916919b..7bd0202ce 100644 --- a/common/jinja/value.h +++ b/common/jinja/value.h @@ -203,6 +203,9 @@ struct value_int_t : public value_t { virtual int64_t as_int() const override { return val_int; } virtual double as_float() const override { return static_cast(val_int); } virtual string as_string() const override { return std::to_string(val_int); } + virtual bool as_bool() const override { + return val_int != 0; + } virtual const func_builtins & get_builtins() const override; }; using value_int = std::shared_ptr; @@ -219,6 +222,9 @@ struct value_float_t : public value_t { if (out.back() == '.') out.push_back('0'); // leave one zero if no decimals return out; } + virtual bool as_bool() const override { + return val_flt != 0.0; + } virtual const func_builtins & get_builtins() const override; }; using value_float = std::shared_ptr; diff --git a/tests/test-jinja.cpp b/tests/test-jinja.cpp index 13818381e..99630ecb3 100644 --- a/tests/test-jinja.cpp +++ b/tests/test-jinja.cpp @@ -191,6 +191,84 @@ static void test_conditionals(testing & t) { json::object(), "yes" ); + + test_template(t, "is undefined falsy", + "{{ 'yes' if not y else 'no' }}", + json::object(), + "yes" + ); + + test_template(t, "is undefined attribute falsy", + "{{ 'yes' if not y.x else 'no' }}", + {{"y", true}}, + "yes" + ); + + test_template(t, "is 
undefined key falsy", + "{{ 'yes' if not y['x'] else 'no' }}", + {{"y", {{}}}}, + "yes" + ); + + test_template(t, "is empty array falsy", + "{{ 'yes' if not y else 'no' }}", + {{"y", json::array()}}, + "yes" + ); + + test_template(t, "is empty object falsy", + "{{ 'yes' if not y else 'no' }}", + {{"y", json::object()}}, + "yes" + ); + + test_template(t, "is empty string falsy", + "{{ 'yes' if not y else 'no' }}", + {{"y", ""}}, + "yes" + ); + + test_template(t, "is 0 falsy", + "{{ 'yes' if not y else 'no' }}", + {{"y", 0}}, + "yes" + ); + + test_template(t, "is 0.0 falsy", + "{{ 'yes' if not y else 'no' }}", + {{"y", 0.0}}, + "yes" + ); + + test_template(t, "is non-empty array truthy", + "{{ 'yes' if y else 'no' }}", + {{"y", json::array({""})}}, + "yes" + ); + + test_template(t, "is non-empty object truthy", + "{{ 'yes' if y else 'no' }}", + {{"y", {"x", false}}}, + "yes" + ); + + test_template(t, "is non-empty string truthy", + "{{ 'yes' if y else 'no' }}", + {{"y", "0"}}, + "yes" + ); + + test_template(t, "is 1 truthy", + "{{ 'yes' if y else 'no' }}", + {{"y", 1}}, + "yes" + ); + + test_template(t, "is 1.0 truthy", + "{{ 'yes' if y else 'no' }}", + {{"y", 1.0}}, + "yes" + ); } static void test_loops(testing & t) { From 1706a6d7c68fc54374b0807324079b2c6ebf674a Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 19 Jan 2026 16:09:20 -0600 Subject: [PATCH 07/17] convert : support Glm4MoeLite (#18936) * initial commit for branch * add glm-4.7-flash, move tokenizer hash * use `glm4` pretok * silence flake8 E302 (CI) * apply review feedback * add <|user|> as eog * also add EOG `<|observation|>` * revert llama-vocab * inherit vocab from glm4 --------- Co-authored-by: Xuan Son Nguyen --- convert_hf_to_gguf.py | 31 ++++++++++++++++++++++++++++++- convert_hf_to_gguf_update.py | 1 + 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 464ecbaab..becbad046 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py 
@@ -1078,6 +1078,9 @@ class TextModel(ModelBase): if chkhsh == "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df": # ref: https://huggingface.co/aari1995/German_Semantic_V3 res = "jina-v2-de" + if chkhsh == "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267": + # ref: https://huggingface.co/zai-org/GLM-4.7-Flash + res = "glm4" if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B res = "llama-bpe" @@ -7458,7 +7461,7 @@ class DeepseekModel(TextModel): "DeepseekV3ForCausalLM", "KimiVLForConditionalGeneration", "YoutuForCausalLM", - "YoutuVLForConditionalGeneration" + "YoutuVLForConditionalGeneration", ) class DeepseekV2Model(TextModel): model_arch = gguf.MODEL_ARCH.DEEPSEEK2 @@ -8446,6 +8449,32 @@ class Glm4MoeModel(TextModel): raise ValueError(f"Unprocessed experts: {experts}") +@ModelBase.register("Glm4MoeLiteForCausalLM") +class Glm4MoeLiteModel(DeepseekV2Model): + model_arch = gguf.MODEL_ARCH.DEEPSEEK2 + + # copied from Glm4MoeModel + def set_vocab(self): + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + # Special tokens + # Note: Using <|endoftext|> (151329) for eot causes endless generation + special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # 151331 + special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # 151336 + special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329 + special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # 151338 + + 
special_vocab.add_to_gguf(self.gguf_writer) + + @ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") class ChatGLMModel(TextModel): model_arch = gguf.MODEL_ARCH.CHATGLM diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index aa9843ea1..2811f7f88 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -170,6 +170,7 @@ pre_computed_hashes = [ {"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"}, # jina-v2-de variants {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"}, + {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.7-Flash", "chkhsh": "cdf5f35325780597efd76153d4d1c16778f766173908894c04afc20108536267"}, ] From 6df686bee68ff109f62123c7a8eac003f3dd9e20 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 19 Jan 2026 23:28:01 +0100 Subject: [PATCH 08/17] server : refactor oai_parser_opt, move it to server_chat_params (#18937) * server_chat_params * move chat format into CLI * use meta whenever possible * clean up, no more chatml fallback --- common/chat.cpp | 14 ++-- common/chat.h | 2 +- tools/cli/cli.cpp | 32 ++++++-- tools/server/server-common.cpp | 4 +- tools/server/server-common.h | 14 ++-- tools/server/server-context.cpp | 138 ++++++++++++++------------------ tools/server/server-context.h | 5 +- tools/server/server-task.h | 6 +- 8 files changed, 112 insertions(+), 103 deletions(-) diff --git a/common/chat.cpp b/common/chat.cpp index 28721ac7d..b29544dac 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -601,18 +601,18 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp return tmpls->has_explicit_template; } -const char * 
common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant) { - if (variant != nullptr) { - if (strcmp(variant, "tool_use") == 0) { +std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) { + if (!variant.empty()) { + if (variant == "tool_use") { if (tmpls->template_tool_use) { - return tmpls->template_tool_use->source().c_str(); + return tmpls->template_tool_use->source(); } - return nullptr; + return ""; } else { - LOG_DBG("%s: unknown template variant: %s\n", __func__, variant); + LOG_DBG("%s: unknown template variant: %s\n", __func__, variant.c_str()); } } - return tmpls->template_default->source().c_str(); + return tmpls->template_default->source(); } common_chat_templates_ptr common_chat_templates_init( diff --git a/common/chat.h b/common/chat.h index 454085e90..148801738 100644 --- a/common/chat.h +++ b/common/chat.h @@ -191,7 +191,7 @@ common_chat_templates_ptr common_chat_templates_init( const std::string & eos_token_override = ""); bool common_chat_templates_was_explicit(const struct common_chat_templates * tmpls); -const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr); +std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant = ""); struct common_chat_params common_chat_templates_apply( diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 2f0ffea1c..caad29bac 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -71,14 +71,16 @@ struct cli_context { std::string generate_completion(result_timings & out_timings) { server_response_reader rd = ctx_server.get_response_reader(); + auto formatted = format_chat(); { // TODO: reduce some copies here in the future server_task task = server_task(SERVER_TASK_TYPE_COMPLETION); - task.id = rd.get_new_id(); - task.index = 0; - task.params = defaults; // copy - task.cli_input = messages; // copy - 
task.cli_files = input_files; // copy + task.id = rd.get_new_id(); + task.index = 0; + task.params = defaults; // copy + task.cli_prompt = formatted.prompt; // copy + task.cli_files = input_files; // copy + task.cli = true; rd.post_task({std::move(task)}); } @@ -156,6 +158,26 @@ struct cli_context { return content; } } + + common_chat_params format_chat() { + auto meta = ctx_server.get_meta(); + auto & chat_params = meta.chat_params; + + common_chat_templates_inputs inputs; + inputs.messages = common_chat_msgs_parse_oaicompat(messages); + inputs.tools = {}; // TODO + inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE; + inputs.json_schema = ""; // TODO + inputs.grammar = ""; // TODO + inputs.use_jinja = chat_params.use_jinja; + inputs.parallel_tool_calls = false; + inputs.add_generation_prompt = true; + inputs.reasoning_format = chat_params.reasoning_format; + inputs.enable_thinking = chat_params.enable_thinking; + + // Apply chat template to the list of messages + return common_chat_templates_apply(chat_params.tmpls.get(), inputs); + } }; int main(int argc, char ** argv) { diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 16b0db298..1bbe85322 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -831,7 +831,7 @@ static void handle_media( // used by /chat/completions endpoint json oaicompat_chat_params_parse( json & body, /* openai api json semantics */ - const oaicompat_parser_options & opt, + const server_chat_params & opt, std::vector & out_files) { json llama_params; @@ -1012,7 +1012,7 @@ json oaicompat_chat_params_parse( } // Apply chat template to the list of messages - auto chat_params = common_chat_templates_apply(opt.tmpls, inputs); + auto chat_params = common_chat_templates_apply(opt.tmpls.get(), inputs); /* Append assistant prefilled message */ if (prefill_assistant_message) { diff --git a/tools/server/server-common.h b/tools/server/server-common.h index 152a2a3c4..7f4c07387 100644 --- 
a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -274,25 +274,25 @@ std::vector tokenize_input_prompts( // OAI utils // -// used by /completions endpoint -json oaicompat_completion_params_parse(const json & body); - -struct oaicompat_parser_options { +struct server_chat_params { bool use_jinja; bool prefill_assistant; common_reasoning_format reasoning_format; - std::map chat_template_kwargs; - common_chat_templates * tmpls; + std::map chat_template_kwargs; // mapping key --> json value + common_chat_templates_ptr tmpls; bool allow_image; bool allow_audio; bool enable_thinking = true; std::string media_path; }; +// used by /completions endpoint +json oaicompat_completion_params_parse(const json & body); + // used by /chat/completions endpoint json oaicompat_chat_params_parse( json & body, /* openai api json semantics */ - const oaicompat_parser_options & opt, + const server_chat_params & opt, std::vector & out_files); // convert Anthropic Messages API format to OpenAI Chat Completions API format diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index c790ac79e..f1f677add 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -534,8 +534,8 @@ public: server_queue queue_tasks; server_response queue_results; - common_chat_templates_ptr chat_templates; - oaicompat_parser_options oai_parser_opt; + // note: chat_params must not be refreshed upon existing sleeping state + server_chat_params chat_params; ~server_context_impl() { if (!sleeping) { @@ -688,15 +688,6 @@ private: llama_init_dft->free_context(); } - chat_templates = common_chat_templates_init(model, params_base.chat_template); - try { - common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs); - } catch (const std::exception & e) { - SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what()); - SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling 
back to chatml. This may cause the model to output suboptimal responses\n", __func__); - chat_templates = common_chat_templates_init(model, "chatml"); - } - std::string & mmproj_path = params_base.mmproj.path; if (!mmproj_path.empty()) { if (!is_resume) { @@ -845,30 +836,6 @@ private: model_name = model_path.filename().string(); } - // thinking is enabled if: - // 1. It's not explicitly disabled (reasoning_budget == 0) - // 2. The chat template supports it - const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get()); - SRV_INF("thinking = %d\n", enable_thinking); - - oai_parser_opt = { - /* use_jinja */ params_base.use_jinja, - /* prefill_assistant */ params_base.prefill_assistant, - /* reasoning_format */ params_base.reasoning_format, - /* chat_template_kwargs */ params_base.default_template_kwargs, - /* common_chat_templates */ chat_templates.get(), - /* allow_image */ mctx ? mtmd_support_vision(mctx) : false, - /* allow_audio */ mctx ? 
mtmd_support_audio (mctx) : false, - /* enable_thinking */ enable_thinking, - /* media_path */ params_base.media_path, - }; - - // print sample chat example to make it clear which template is used - // @ngxson modern templates are too long, spam the logs; printing the example is enough - LOG_INF("%s: chat template, example_format: '%s'\n", __func__, - // common_chat_templates_source(chat_templates.get()), - common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str()); - if (!is_resume) { return init(); } @@ -907,6 +874,42 @@ private: } } + // populate chat template params + { + common_chat_templates_ptr chat_templates; + + try { + chat_templates = common_chat_templates_init(model, params_base.chat_template); + + LOG_INF("%s: chat template, example_format: '%s'\n", __func__, + common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str()); + + } catch (const std::exception & e) { + SRV_ERR("%s: chat template parsing error: %s\n", __func__, e.what()); + SRV_ERR("%s: please consider disabling jinja via --no-jinja, or use a custom chat template via --chat-template\n", __func__); + SRV_ERR("%s: for example: --no-jinja --chat-template chatml\n", __func__); + return false; + } + + // thinking is enabled if: + // 1. It's not explicitly disabled (reasoning_budget == 0) + // 2. The chat template supports it + const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get()); + SRV_INF("%s: chat template, thinking = %d\n", __func__, enable_thinking); + + chat_params = { + /* use_jinja */ params_base.use_jinja, + /* prefill_assistant */ params_base.prefill_assistant, + /* reasoning_format */ params_base.reasoning_format, + /* chat_template_kwargs */ params_base.default_template_kwargs, + /* tmpls */ std::move(chat_templates), + /* allow_image */ mctx ? 
mtmd_support_vision(mctx) : false, + /* allow_audio */ mctx ? mtmd_support_audio (mctx) : false, + /* enable_thinking */ enable_thinking, + /* media_path */ params_base.media_path, + }; + } + return true; } @@ -1588,32 +1591,14 @@ private: // tokenize the input if it's set by CLI, return false on error bool tokenize_cli_input(server_task & task) { - GGML_ASSERT(task.cli_input != nullptr); try { - auto & opt = oai_parser_opt; - common_chat_templates_inputs inputs; - inputs.messages = common_chat_msgs_parse_oaicompat(task.cli_input); - inputs.tools = {}; // TODO - inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE; - inputs.json_schema = ""; // TODO - inputs.grammar = ""; // TODO - inputs.use_jinja = opt.use_jinja; - inputs.parallel_tool_calls = false; - inputs.add_generation_prompt = true; - inputs.reasoning_format = opt.reasoning_format; - inputs.enable_thinking = opt.enable_thinking; - - // Apply chat template to the list of messages - auto chat_params = common_chat_templates_apply(opt.tmpls, inputs); - - // tokenize the resulting prompt - auto & prompt = chat_params.prompt; + auto & prompt = task.cli_prompt; if (mctx != nullptr) { task.tokens = process_mtmd_prompt(mctx, prompt, task.cli_files); } else { task.tokens = std::move(tokenize_input_prompts(vocab, mctx, prompt, true, true)[0]); } - task.cli_input.clear(); + task.cli_prompt.clear(); task.cli_files.clear(); } catch (const std::exception & e) { send_error(task, std::string("Failed to format input: ") + e.what(), ERROR_TYPE_INVALID_REQUEST); @@ -1689,7 +1674,7 @@ private: { // special case: if input is provided via CLI, tokenize it first // otherwise, no need to tokenize as it's already done inside the HTTP thread - if (task.cli_input != nullptr) { + if (task.cli) { if (!tokenize_cli_input(task)) { break; } @@ -2901,8 +2886,6 @@ server_response_reader server_context::get_response_reader() { } server_context_meta server_context::get_meta() const { - auto tool_use_src = 
common_chat_templates_source(impl->chat_templates.get(), "tool_use"); - auto bos_id = llama_vocab_bos(impl->vocab); auto eos_id = llama_vocab_eos(impl->vocab); auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, bos_id, true) : ""; @@ -2913,14 +2896,13 @@ server_context_meta server_context::get_meta() const { /* model_name */ impl->model_name, /* model_path */ impl->params_base.model.path, /* has_mtmd */ impl->mctx != nullptr, - /* has_inp_image */ impl->oai_parser_opt.allow_image, - /* has_inp_audio */ impl->oai_parser_opt.allow_audio, + /* has_inp_image */ impl->chat_params.allow_image, + /* has_inp_audio */ impl->chat_params.allow_audio, /* json_webui_settings */ impl->json_webui_settings, /* slot_n_ctx */ impl->get_slot_n_ctx(), /* pooling_type */ llama_pooling_type(impl->ctx), - /* chat_template */ common_chat_templates_source(impl->chat_templates.get()), - /* chat_template_tool_use */ tool_use_src ? tool_use_src : "", + /* chat_params */ impl->chat_params, /* bos_token_str */ bos_token_str, /* eos_token_str */ eos_token_str, @@ -3202,8 +3184,8 @@ void server_routes::init_routes() { // this endpoint can be accessed during sleeping // the next LOC is to avoid someone accidentally use ctx_server - bool server_ctx; // do NOT delete this line - GGML_UNUSED(server_ctx); + bool ctx_server; // do NOT delete this line + GGML_UNUSED(ctx_server); res->ok({{"status", "ok"}}); return res; @@ -3393,8 +3375,8 @@ void server_routes::init_routes() { // this endpoint can be accessed during sleeping // the next LOC is to avoid someone accidentally use ctx_server - bool server_ctx; // do NOT delete this line - GGML_UNUSED(server_ctx); + bool ctx_server; // do NOT delete this line + GGML_UNUSED(ctx_server); task_params tparams; tparams.sampling = params.sampling; @@ -3403,6 +3385,9 @@ void server_routes::init_routes() { { "n_ctx", meta->slot_n_ctx }, }; + std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), ""); + 
std::string tmpl_tools = common_chat_templates_source(meta->chat_params.tmpls.get(), "tool_use"); + json props = { { "default_generation_settings", default_generation_settings_for_props }, { "total_slots", params.n_parallel }, @@ -3417,15 +3402,15 @@ void server_routes::init_routes() { { "endpoint_metrics", params.endpoint_metrics }, { "webui", params.webui }, { "webui_settings", meta->json_webui_settings }, - { "chat_template", meta->chat_template }, + { "chat_template", tmpl_default }, { "bos_token", meta->bos_token_str }, { "eos_token", meta->eos_token_str }, { "build_info", meta->build_info }, { "is_sleeping", queue_tasks.is_sleeping() }, }; if (params.use_jinja) { - if (!meta->chat_template_tool_use.empty()) { - props["chat_template_tool_use"] = meta->chat_template_tool_use; + if (!tmpl_tools.empty()) { + props["chat_template_tool_use"] = tmpl_tools; } } res->ok(props); @@ -3446,6 +3431,7 @@ void server_routes::init_routes() { this->get_api_show = [this](const server_http_req &) { auto res = create_response(); + std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), ""); json data = { { "model_info", { @@ -3454,7 +3440,7 @@ void server_routes::init_routes() { }, {"modelfile", ""}, {"parameters", ""}, - {"template", meta->chat_template}, + {"template", tmpl_default}, {"details", { {"parent_model", ""}, {"format", "gguf"}, @@ -3579,7 +3565,7 @@ void server_routes::init_routes() { json body = json::parse(req.body); json body_parsed = oaicompat_chat_params_parse( body, - ctx_server.oai_parser_opt, + meta->chat_params, files); return handle_completions_impl( req, @@ -3595,7 +3581,7 @@ void server_routes::init_routes() { json body = convert_anthropic_to_oai(json::parse(req.body)); json body_parsed = oaicompat_chat_params_parse( body, - ctx_server.oai_parser_opt, + meta->chat_params, files); return handle_completions_impl( req, @@ -3611,7 +3597,7 @@ void server_routes::init_routes() { json body = 
convert_anthropic_to_oai(json::parse(req.body)); json body_parsed = oaicompat_chat_params_parse( body, - ctx_server.oai_parser_opt, + meta->chat_params, files); json prompt = body_parsed.at("prompt"); @@ -3627,7 +3613,7 @@ void server_routes::init_routes() { json body = json::parse(req.body); json data = oaicompat_chat_params_parse( body, - ctx_server.oai_parser_opt, + meta->chat_params, files); res->ok({{ "prompt", std::move(data.at("prompt")) }}); return res; @@ -3638,8 +3624,8 @@ void server_routes::init_routes() { // this endpoint can be accessed during sleeping // the next LOC is to avoid someone accidentally use ctx_server - bool server_ctx; // do NOT delete this line - GGML_UNUSED(server_ctx); + bool ctx_server; // do NOT delete this line + GGML_UNUSED(ctx_server); json models = { {"models", { diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 09bec15ae..ec1df9695 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -20,9 +20,8 @@ struct server_context_meta { int slot_n_ctx; enum llama_pooling_type pooling_type; - // chat template - std::string chat_template; - std::string chat_template_tool_use; + // chat params + server_chat_params & chat_params; // tokens std::string bos_token_str; diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 11943ee4f..daffe0c90 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -130,8 +130,10 @@ struct server_task { task_params params; server_tokens tokens; - // only used by CLI, this delegates the tokenization to the server - json cli_input = nullptr; + // only used by CLI, this allow tokenizing CLI inputs on server side + // we need this because mtmd_context and vocab are not accessible outside of server_context + bool cli = false; + std::string cli_prompt; std::vector cli_files; server_task_type type; From 7dee9ff59ad507304bf43a2682dbe0a89bbc3dce Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 20 Jan 2026 
06:55:24 +0100 Subject: [PATCH 09/17] convert : use n_groups instead of hardcoded values in reshape (#18929) * convert : use n_groups instead of hardcoded values in reshape This commit modifies the conversion script for NemotronHModel to use the 'n_groups' hyperparameter, and allow Python to calculate the last dimension, using -1, when reshaping the 'mixer.norm.weight' tensor. * use self.n_group instead of self.hparams["n_groups"] --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index becbad046..ab015dd2c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -9212,7 +9212,7 @@ class NemotronHModel(GraniteHybridModel): return [(mapped_name, reshaped_data)] if name.endswith("mixer.norm.weight"): - reshaped_data = data_torch.reshape(8, 512) + reshaped_data = data_torch.reshape(self.n_group, -1) mapped_name = self.map_tensor_name(name) return [(mapped_name, reshaped_data)] From 271191906c3ff0a02916622f703166b6891fce0e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 20 Jan 2026 12:21:28 +0200 Subject: [PATCH 10/17] metal : enable FA for MLA heads (#18950) --- ggml/src/ggml-metal/ggml-metal-device.m | 8 ++------ ggml/src/ggml-metal/ggml-metal-ops.cpp | 2 +- ggml/src/ggml-metal/ggml-metal.metal | 13 ++++++++----- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index c418afe9c..eb4e2c209 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -1078,12 +1078,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te op->src[0]->ne[0] != 112 && op->src[0]->ne[0] != 128 && op->src[0]->ne[0] != 192 && - op->src[0]->ne[0] != 256) { - return false; - } - if (op->src[0]->ne[0] == 576) { - // DeepSeek sizes - // TODO: disabled for now, until optmized + op->src[0]->ne[0] != 256 && + op->src[0]->ne[0] 
!= 576) { return false; } if (op->src[1]->type != op->src[2]->type) { diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index 3d97d3dfd..7f4cfbba2 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -2520,7 +2520,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) { // simdgroups per threadgroup (a.k.a. warps) //nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4; - int32_t nsg = 4; + int32_t nsg = ne00 >= 512 ? 8 : 4; const size_t smem = FATTN_SMEM(nsg); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index a4e1cafe5..17e358d1a 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -5552,9 +5552,7 @@ void kernel_flash_attn_ext_impl( constexpr short NC = (C/8)/NSG; - // note: do not unroll for large heads - #pragma unroll (DK <= 64 ? NC : 1) - for (short cc = 0; cc < NC; ++cc) { + FOR_UNROLL (short cc = 0; cc < NC; ++cc) { qk8x8_t mqk = make_filled_simdgroup_matrix((qk_t) 0.0f); if (DK % 16 != 0) { @@ -5575,7 +5573,9 @@ void kernel_flash_attn_ext_impl( k8x8_t mk[2]; q8x8_t mq[2]; - FOR_UNROLL (short i = 0; i < DK8/2; ++i) { + // note: too much unroll can tank the performance for large heads + #pragma unroll (MIN(DK8/2, 4*NSG)) + for (short i = 0; i < DK8/2; ++i) { simdgroup_barrier(mem_flags::mem_none); simdgroup_load(mq[0], pq + 0*8 + 16*i, DK); @@ -5749,7 +5749,9 @@ void kernel_flash_attn_ext_impl( pv += 8*NS20; } } else { - FOR_UNROLL (short cc = 0; cc < (C/8)/2; ++cc) { + constexpr short NC = (C/8)/2; + + FOR_UNROLL (short cc = 0; cc < NC; ++cc) { s8x8_t vs[2]; simdgroup_load(vs[0], ss + 16*cc + 0, SH, 0, false); @@ -5952,6 +5954,7 @@ kernel void kernel_flash_attn_ext( //case 1: kernel_flash_attn_ext_impl(FWD_ARGS); break; //case 2: kernel_flash_attn_ext_impl(FWD_ARGS); break; case 4: 
kernel_flash_attn_ext_impl(FWD_ARGS); break; + case 8: kernel_flash_attn_ext_impl(FWD_ARGS); break; } #undef FWD_TMPL #undef FWD_ARGS From 08f3f4a8a30633491b031bf833441de2a1ab5029 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Tue, 20 Jan 2026 11:42:49 +0100 Subject: [PATCH 11/17] ggml : cleanup path_str() (#18928) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove pragmas as `std::codecvt_utf8` is not used. - Avoid implicit `strlen()`. Signed-off-by: Adrien Gallouët --- ggml/src/ggml-backend-reg.cpp | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 4181a714a..6bee1bc4b 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -77,39 +77,23 @@ #include "ggml-zendnn.h" #endif -// disable C++17 deprecation warning for std::codecvt_utf8 -#if defined(__clang__) -# pragma clang diagnostic push -# pragma clang diagnostic ignored "-Wdeprecated-declarations" -#elif defined(__GNUC__) -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wdeprecated-declarations" -#endif - namespace fs = std::filesystem; static std::string path_str(const fs::path & path) { - std::string u8path; try { #if defined(__cpp_lib_char8_t) // C++20 and later: u8string() returns std::u8string - std::u8string u8str = path.u8string(); - u8path = std::string(reinterpret_cast(u8str.c_str())); + const std::u8string u8str = path.u8string(); + return std::string(reinterpret_cast(u8str.data()), u8str.size()); #else // C++17: u8string() returns std::string - u8path = path.u8string(); + return path.u8string(); #endif } catch (...) 
{ + return std::string(); } - return u8path; } -#if defined(__clang__) -# pragma clang diagnostic pop -#elif defined(__GNUC__) -# pragma GCC diagnostic pop -#endif - #ifdef _WIN32 using dl_handle = std::remove_pointer_t; From d1e3556481c8b351f9b7b69ba3febf6cb77fffa6 Mon Sep 17 00:00:00 2001 From: Oliver Simons Date: Tue, 20 Jan 2026 13:11:01 +0100 Subject: [PATCH 12/17] CUDA: Replace init_offsets kernel with iterators in cub-based argsort (#18930) * CUDA: Replace `init_offsets` with iterators in argsort This is a QOL improvement, saving us the cost of materializing the iterator * Remove unnecessary include from top-k.cu --- ggml/src/ggml-cuda/argsort.cu | 22 +++++++--------------- ggml/src/ggml-cuda/top-k.cu | 1 - 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu index 57c8a99a2..cf7a44f7a 100644 --- a/ggml/src/ggml-cuda/argsort.cu +++ b/ggml/src/ggml-cuda/argsort.cu @@ -14,12 +14,6 @@ static __global__ void init_indices(int * indices, const int ncols, const int nr } } -static __global__ void init_offsets(int * offsets, const int ncols, const int nrows) { - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx <= nrows) { - offsets[idx] = idx * ncols; - } -} #ifdef GGML_CUDA_USE_CUB void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool, @@ -31,18 +25,15 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool, cudaStream_t stream) { ggml_cuda_pool_alloc temp_indices_alloc(pool, ncols * nrows); ggml_cuda_pool_alloc temp_keys_alloc(pool, ncols * nrows); - ggml_cuda_pool_alloc offsets_alloc(pool, nrows + 1); int * temp_indices = temp_indices_alloc.get(); float * temp_keys = temp_keys_alloc.get(); - int * d_offsets = offsets_alloc.get(); static const int block_size = 256; const dim3 grid_size((ncols + block_size - 1) / block_size, nrows); init_indices<<>>(temp_indices, ncols, nrows); - const dim3 offset_grid((nrows + block_size - 1) / block_size); - init_offsets<<>>(d_offsets, ncols, 
nrows); + auto offset_iterator = cuda::make_strided_iterator(cuda::make_counting_iterator(0), ncols); CUDA_CHECK(cudaMemcpyAsync(temp_keys, x, ncols * nrows * sizeof(float), cudaMemcpyDeviceToDevice, stream)); @@ -57,7 +48,7 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool, DeviceSegmentedSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place) temp_indices, dst, // values (indices) ncols * nrows, nrows, // num items, num segments - d_offsets, d_offsets + 1, stream); + offset_iterator, offset_iterator + 1, stream); } } else { if (nrows == 1) { @@ -66,7 +57,8 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool, ncols, 0, sizeof(float) * 8, stream); } else { DeviceSegmentedSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices, - dst, ncols * nrows, nrows, d_offsets, d_offsets + 1, stream); + dst, ncols * nrows, nrows, offset_iterator, offset_iterator + 1, + stream); } } @@ -80,7 +72,7 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool, ncols, 0, sizeof(float) * 8, stream); } else { DeviceSegmentedSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst, - ncols * nrows, nrows, d_offsets, d_offsets + 1, stream); + ncols * nrows, nrows, offset_iterator, offset_iterator + 1, stream); } } else { if (nrows == 1) { @@ -89,8 +81,8 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool, ncols, 0, sizeof(float) * 8, stream); } else { DeviceSegmentedSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, - temp_indices, dst, ncols * nrows, nrows, d_offsets, d_offsets + 1, - stream); + temp_indices, dst, ncols * nrows, nrows, offset_iterator, + offset_iterator + 1, stream); } } } diff --git a/ggml/src/ggml-cuda/top-k.cu b/ggml/src/ggml-cuda/top-k.cu index 318ac3869..785a18389 100644 --- a/ggml/src/ggml-cuda/top-k.cu +++ b/ggml/src/ggml-cuda/top-k.cu @@ -4,7 +4,6 @@ #ifdef GGML_CUDA_USE_CUB # include # if (CCCL_MAJOR_VERSION >= 3 && 
CCCL_MINOR_VERSION >= 2) -# include # define CUB_TOP_K_AVAILABLE using namespace cub; # endif // CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 2 From 2c1f1996535cd49132e6e7d1b28908bcd4f56819 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Tue, 20 Jan 2026 18:23:25 +0100 Subject: [PATCH 13/17] cli : fix reasoning responses in CLI (#18961) * cli : fix reasoning responses in CLI * fix build * fix build (2) --- common/chat-parser.cpp | 6 +- common/chat-parser.h | 8 +- common/chat.h | 23 +- common/common.h | 1 + common/json-partial.h | 1 + tests/test-chat-parser.cpp | 240 +++++++++-------- tests/test-chat-peg-parser.cpp | 12 +- tests/test-chat.cpp | 454 +++++++++++++++++---------------- tools/cli/cli.cpp | 15 +- tools/server/server-common.h | 1 + tools/server/server-task.cpp | 34 +-- tools/server/server-task.h | 12 +- 12 files changed, 417 insertions(+), 390 deletions(-) diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp index 2f073512e..c2d1e30f3 100644 --- a/common/chat-parser.cpp +++ b/common/chat-parser.cpp @@ -129,7 +129,7 @@ static void parse_json_tool_calls( } } -common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax) +common_chat_msg_parser::common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_parser_params & syntax) : input_(input), is_partial_(is_partial), syntax_(syntax) { result_.role = "assistant"; @@ -1611,7 +1611,7 @@ static void common_chat_parse(common_chat_msg_parser & builder) { builder.finish(); } -common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax) { +common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & syntax) { if (syntax.format == COMMON_CHAT_FORMAT_PEG_SIMPLE || syntax.format == COMMON_CHAT_FORMAT_PEG_NATIVE || syntax.format == COMMON_CHAT_FORMAT_PEG_CONSTRUCTED) { @@ -1635,7 +1635,7 @@ common_chat_msg 
common_chat_parse(const std::string & input, bool is_partial, co return msg; } -common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax) { +common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_parser_params & syntax) { if (parser.empty()) { throw std::runtime_error("Failed to parse due to missing parser definition."); } diff --git a/common/chat-parser.h b/common/chat-parser.h index 78c4b74c2..3ed9c30a2 100644 --- a/common/chat-parser.h +++ b/common/chat-parser.h @@ -5,7 +5,7 @@ #include "json-partial.h" #include "regex-partial.h" -#include +#include #include #include @@ -19,20 +19,20 @@ class common_chat_msg_partial_exception : public std::runtime_error { class common_chat_msg_parser { std::string input_; bool is_partial_; - common_chat_syntax syntax_; + common_chat_parser_params syntax_; // TODO: rename to params std::string healing_marker_; size_t pos_ = 0; common_chat_msg result_; public: - common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax); + common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_parser_params & syntax); const std::string & input() const { return input_; } size_t pos() const { return pos_; } const std::string & healing_marker() const { return healing_marker_; } const bool & is_partial() const { return is_partial_; } const common_chat_msg & result() const { return result_; } - const common_chat_syntax & syntax() const { return syntax_; } + const common_chat_parser_params & syntax() const { return syntax_; } void move_to(size_t pos) { if (pos > input_.size()) { diff --git a/common/chat.h b/common/chat.h index 148801738..ac19348ec 100644 --- a/common/chat.h +++ b/common/chat.h @@ -145,7 +145,7 @@ struct common_chat_templates_inputs { std::vector tools; common_chat_tool_choice tool_choice = 
COMMON_CHAT_TOOL_CHOICE_AUTO; bool parallel_tool_calls = false; - common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; + common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool enable_thinking" bool enable_thinking = true; std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); std::map chat_template_kwargs; @@ -165,14 +165,21 @@ struct common_chat_params { std::string parser; }; -struct common_chat_syntax { +// per-message parsing syntax +// should be derived from common_chat_params +struct common_chat_parser_params { common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY; - common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; + common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; // TODO: refactor this to "bool parse_reasoning" // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode) bool reasoning_in_content = false; bool thinking_forced_open = false; bool parse_tool_calls = true; common_peg_arena parser = {}; + common_chat_parser_params() = default; + common_chat_parser_params(const common_chat_params & chat_params) { + format = chat_params.format; + thinking_forced_open = chat_params.thinking_forced_open; + } }; // Check if the template supplied via "--chat-template" is supported or not. 
Returns true if it's valid @@ -213,10 +220,12 @@ std::string common_chat_format_example( const std::map & chat_template_kwargs); const char* common_chat_format_name(common_chat_format format); -const char* common_reasoning_format_name(common_reasoning_format format); -common_reasoning_format common_reasoning_format_from_name(const std::string & format); -common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax); -common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_syntax & syntax); +common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_parser_params & syntax); +common_chat_msg common_chat_peg_parse(const common_peg_arena & parser, const std::string & input, bool is_partial, const common_chat_parser_params & syntax); + +// used by arg and server +const char * common_reasoning_format_name(common_reasoning_format format); +common_reasoning_format common_reasoning_format_from_name(const std::string & format); common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice); diff --git a/common/common.h b/common/common.h index b9566df62..8247949de 100644 --- a/common/common.h +++ b/common/common.h @@ -284,6 +284,7 @@ struct common_params_diffusion { }; // reasoning API response format (not to be confused as chat template's reasoning format) +// only used by server enum common_reasoning_format { COMMON_REASONING_FORMAT_NONE, COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content` diff --git a/common/json-partial.h b/common/json-partial.h index f63356dc4..be51aabfb 100644 --- a/common/json-partial.h +++ b/common/json-partial.h @@ -1,5 +1,6 @@ #pragma once +// TODO: use json_fwd.hpp when possible #include // Healing marker (empty if the JSON was fully parsed / wasn't healed). 
diff --git a/tests/test-chat-parser.cpp b/tests/test-chat-parser.cpp index 4766518fe..6f44a2b42 100644 --- a/tests/test-chat-parser.cpp +++ b/tests/test-chat-parser.cpp @@ -54,113 +54,109 @@ static void assert_throws(const std::function & fn, const std::string & static void test_reasoning() { //common_log_set_verbosity_thold(LOG_DEFAULT_DEBUG); { - common_chat_msg_parser builder("CogitoErgo sum", /* is_partial= */ false, { - /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY, - /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE, - /* .reasoning_in_content = */ false, - /* .thinking_forced_open = */ false, - }); + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + params.reasoning_format = COMMON_REASONING_FORMAT_NONE; + params.reasoning_in_content = false; + params.thinking_forced_open = false; + common_chat_msg_parser builder("CogitoErgo sum", /* is_partial= */ false, params); assert_equals(false, builder.try_parse_reasoning("", "")); assert_equals("CogitoErgo sum", builder.consume_rest()); } { - common_chat_msg_parser builder("CogitoErgo sum", /* is_partial= */ false, { - /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY, - /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, - /* .reasoning_in_content = */ false, - /* .thinking_forced_open = */ false, - }); + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = false; + common_chat_msg_parser builder("CogitoErgo sum", /* is_partial= */ false, params); assert_equals(true, builder.try_parse_reasoning("", "")); assert_equals(std::string("Cogito"), builder.result().reasoning_content); assert_equals("Ergo sum", builder.consume_rest()); } { - common_chat_msg_parser builder("CogitoErgo sum", /* is_partial= */ false, { - /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY, - /* .reasoning_format = */ 
COMMON_REASONING_FORMAT_NONE, - /* .reasoning_in_content = */ false, - /* .thinking_forced_open = */ false, - }); + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + params.reasoning_format = COMMON_REASONING_FORMAT_NONE; + params.reasoning_in_content = false; + params.thinking_forced_open = false; + common_chat_msg_parser builder("CogitoErgo sum", /* is_partial= */ false, params); assert_equals(false, builder.try_parse_reasoning("", "")); assert_equals("CogitoErgo sum", builder.consume_rest()); } { - common_chat_msg_parser builder("CogitoErgo sum", /* is_partial= */ false, { - /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY, - /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, - /* .reasoning_in_content = */ false, - /* .thinking_forced_open = */ true, - }); + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = true; + common_chat_msg_parser builder("CogitoErgo sum", /* is_partial= */ false, params); assert_equals(true, builder.try_parse_reasoning("", "")); assert_equals(std::string("Cogito"), builder.result().reasoning_content); assert_equals("Ergo sum", builder.consume_rest()); } { - common_chat_msg_parser builder("CogitoErgo sum", /* is_partial= */ false, { - /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY, - /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, - /* .reasoning_in_content = */ true, - /* .thinking_forced_open = */ true, - }); + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = true; + params.thinking_forced_open = true; + common_chat_msg_parser builder("CogitoErgo sum", /* is_partial= */ false, params); assert_equals(true, builder.try_parse_reasoning("", "")); assert_equals("Cogito", 
builder.result().content); assert_equals("Ergo sum", builder.consume_rest()); } { const std::string variant("content_only_inline_think"); - common_chat_syntax syntax = { - /* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY, - /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, - /* .reasoning_in_content = */ false, - /* .thinking_forced_open = */ false, - /* .parse_tool_calls = */ false, - }; + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = false; + params.parse_tool_calls = false; const std::string input = "PenseBonjour"; - auto msg = common_chat_parse(input, false, syntax); + auto msg = common_chat_parse(input, false, params); assert_equals(variant, std::string("Pense"), msg.reasoning_content); assert_equals(variant, std::string("Bonjour"), msg.content); } { const std::string variant("llama_3_inline_think"); - common_chat_syntax syntax = { - /* .format = */ COMMON_CHAT_FORMAT_LLAMA_3_X, - /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, - /* .reasoning_in_content = */ false, - /* .thinking_forced_open = */ false, - /* .parse_tool_calls = */ false, - }; + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_LLAMA_3_X; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = false; + params.parse_tool_calls = false; const std::string input = "PlanRéponse"; - auto msg = common_chat_parse(input, false, syntax); + auto msg = common_chat_parse(input, false, params); assert_equals(variant, std::string("Plan"), msg.reasoning_content); assert_equals(variant, std::string("Réponse"), msg.content); } // Test DeepSeek V3.1 parsing - reasoning content followed by "" and then regular content { - common_chat_syntax syntax = { - /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, - /* .reasoning_format = 
*/ COMMON_REASONING_FORMAT_DEEPSEEK, - /* .reasoning_in_content = */ false, - /* .thinking_forced_open = */ true, - /* .parse_tool_calls = */ true, - }; + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = true; + params.parse_tool_calls = true; const std::string variant("deepseek_v3_1_reasoning_format_deepseek"); - common_chat_msg_parser builder("REASONINGok", /* is_partial= */ false, syntax); + common_chat_msg_parser builder("REASONINGok", /* is_partial= */ false, params); assert_equals(variant, true, builder.try_parse_reasoning("", "")); assert_equals(variant, std::string("REASONING"), builder.result().reasoning_content); assert_equals(variant, std::string("ok"), builder.consume_rest()); } // Test DeepSeek V3.1 parsing - reasoning_format none - reasoning content followed by "" and then regular content { - common_chat_syntax syntax = { - /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, - /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE, - /* .reasoning_in_content = */ false, - /* .thinking_forced_open = */ true, - /* .parse_tool_calls = */ true, - }; + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1; + params.reasoning_format = COMMON_REASONING_FORMAT_NONE; + params.reasoning_in_content = false; + params.thinking_forced_open = true; + params.parse_tool_calls = true; const std::string variant("deepseek_v3_1_reasoning_format_none"); const std::string input = "REASONINGok"; - auto msg = common_chat_parse(input, false, syntax); + auto msg = common_chat_parse(input, false, params); assert_equals(variant, std::string("REASONINGok"), msg.content); assert_equals(variant, std::string(""), msg.reasoning_content); } @@ -256,15 +252,14 @@ static void test_deepseek_v3_1_tool_calls() { //common_log_set_verbosity_thold(LOG_DEFAULT_DEBUG); // variant: happy path for when 
it works as the model card says it should const std::string variant("simple"); - common_chat_syntax syntax = { - /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, - /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, - /* .reasoning_in_content = */ false, - /* .thinking_forced_open = */ false, - /* .parse_tool_calls = */ true, - }; + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = false; + params.parse_tool_calls = true; const std::string input = "<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>"; - auto msg = common_chat_parse(input, false, syntax); + auto msg = common_chat_parse(input, false, params); assert_equals(variant, 1, msg.tool_calls.size()); assert_equals(variant, std::string("get_time"), msg.tool_calls[0].name); // JSON arguments are dumped without spaces @@ -274,16 +269,15 @@ static void test_deepseek_v3_1_tool_calls() { // variant: simple + thinking open { - common_chat_syntax syntax = { - /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, - /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, - /* .reasoning_in_content = */ false, - /* .thinking_forced_open = */ true, - /* .parse_tool_calls = */ true, - }; + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = true; + params.parse_tool_calls = true; const std::string variant("simple_thinking"); const std::string in = "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>"; - auto m = common_chat_parse(in, false, syntax); + auto m = common_chat_parse(in, false, params); assert_equals(variant, 1, m.tool_calls.size()); 
assert_equals(variant, std::string("get_time"), m.tool_calls[0].name); assert_equals(variant, std::string("{\"city\":\"Tokyo\"}"), m.tool_calls[0].arguments); @@ -292,16 +286,15 @@ static void test_deepseek_v3_1_tool_calls() { } // variant: simple + multiple tool calls { - common_chat_syntax syntax = { - /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, - /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, - /* .reasoning_in_content = */ false, - /* .thinking_forced_open = */ false, - /* .parse_tool_calls = */ true, - }; + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = false; + params.parse_tool_calls = true; const std::string variant("simple_multiple_tool_calls"); const std::string in = "CONTENT<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Paris\"}<|tool▁call▁end|><|tool▁call▁begin|>get_weather<|tool▁sep|>{\"city\": \"Paris\"}<|tool▁call▁end|><|tool▁calls▁end|>"; - auto m = common_chat_parse(in, false, syntax); + auto m = common_chat_parse(in, false, params); assert_equals(variant, 2, m.tool_calls.size()); assert_equals(variant, std::string("get_time"), m.tool_calls[0].name); assert_equals(variant, std::string("{\"city\":\"Paris\"}"), m.tool_calls[0].arguments); @@ -314,16 +307,15 @@ static void test_deepseek_v3_1_tool_calls() { // variant: thinking forced open + tool call in reasoning content { - common_chat_syntax syntax = { - /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, - /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, - /* .reasoning_in_content = */ false, - /* .thinking_forced_open = */ true, - /* .parse_tool_calls = */ true, - }; + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + 
params.thinking_forced_open = true; + params.parse_tool_calls = true; const std::string variant("thinking_forced_open_tool_call_in_reasoning"); const std::string in = "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time2<|tool▁sep|>{\"city\": \"Tokyo2\"}<|tool▁call▁end|><|tool▁calls▁end|>REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>"; - auto m = common_chat_parse(in, false, syntax); + auto m = common_chat_parse(in, false, params); assert_equals(variant, 1, m.tool_calls.size()); assert_equals(variant, std::string("get_time"), m.tool_calls[0].name); assert_equals(variant, std::string("{\"city\":\"Tokyo\"}"), m.tool_calls[0].arguments); @@ -336,16 +328,15 @@ static void test_deepseek_v3_1_tool_calls() { // to make tool calls in reasoning content according to the model card, but it does sometimes, so // add the reasoning content as regular content and parse the tool calls. { - common_chat_syntax syntax = { - /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, - /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, - /* .reasoning_in_content = */ false, - /* .thinking_forced_open = */ true, - /* .parse_tool_calls = */ true, - }; + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = true; + params.parse_tool_calls = true; const std::string variant("thinking_forced_open_tool_call_in_reasoning_no_closing_think_not_partial"); const std::string in = "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>"; - auto m = common_chat_parse(in, false, syntax); + auto m = common_chat_parse(in, false, params); assert_equals(variant, std::string("REASONING"), m.content); assert_equals(variant, std::string(""), m.reasoning_content); assert_equals(variant, 1, 
m.tool_calls.size()); @@ -355,16 +346,15 @@ static void test_deepseek_v3_1_tool_calls() { // variant: thinking forced open + tool call in reasoning content + no closing think + partial { - common_chat_syntax syntax = { - /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, - /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, - /* .reasoning_in_content = */ false, - /* .thinking_forced_open = */ true, - /* .parse_tool_calls = */ true, - }; + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = true; + params.parse_tool_calls = true; const std::string variant("thinking_forced_open_tool_call_in_reasoning_no_closing_think_partial"); const std::string in = "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>"; - auto m = common_chat_parse(in, /* is_partial= */ true, syntax); + auto m = common_chat_parse(in, /* is_partial= */ true, params); assert_equals(variant, std::string("REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>"), m.reasoning_content); assert_equals(variant, std::string(""), m.content); assert_equals(variant, 0, m.tool_calls.size()); @@ -372,32 +362,30 @@ static void test_deepseek_v3_1_tool_calls() { // variant: thinking not forced open + reasoning + regular content + no tool calls { - common_chat_syntax syntax = { - /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, - /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, - /* .reasoning_in_content = */ false, - /* .thinking_forced_open = */ true, - /* .parse_tool_calls = */ true, - }; + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + 
params.thinking_forced_open = true; + params.parse_tool_calls = true; const std::string variant("thinking_forced_open_reasoning_regular_content_no_tool_calls"); const std::string in = "REASONINGCONTENT"; - auto m = common_chat_parse(in, false, syntax); + auto m = common_chat_parse(in, false, params); assert_equals(variant, 0, m.tool_calls.size()); assert_equals(variant, std::string("CONTENT"), m.content); assert_equals(variant, std::string("REASONING"), m.reasoning_content); } // variant: thinking not forced open + missing reasoning + no tool calls { - common_chat_syntax syntax = { - /* .format = */ COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, - /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, - /* .reasoning_in_content = */ false, - /* .thinking_forced_open = */ false, - /* .parse_tool_calls = */ true, - }; + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1; + params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + params.reasoning_in_content = false; + params.thinking_forced_open = false; + params.parse_tool_calls = true; const std::string variant("thinking_not_forced_open_missing_reasoning_no_tool_calls"); const std::string in = "CONTENT"; - auto m = common_chat_parse(in, false, syntax); + auto m = common_chat_parse(in, false, params); assert_equals(variant, 0, m.tool_calls.size()); assert_equals(variant, std::string("CONTENT"), m.content); assert_equals(variant, std::string(""), m.reasoning_content); diff --git a/tests/test-chat-peg-parser.cpp b/tests/test-chat-peg-parser.cpp index d3a4cfd22..f767c73c2 100644 --- a/tests/test-chat-peg-parser.cpp +++ b/tests/test-chat-peg-parser.cpp @@ -616,15 +616,15 @@ void test_command7_parser_compare(testing & t) { auto test_legacy = [&](const std::string & input, bool need_more_input, bool print_results) { // Original common_chat_combinator_parser taken from chat.cpp + common_chat_parser_params params; + params.format = COMMON_CHAT_FORMAT_GENERIC; + params.reasoning_format = 
COMMON_REASONING_FORMAT_AUTO; + params.reasoning_in_content = false; + params.thinking_forced_open = false; common_chat_msg_parser builder( input, /* .is_partial = */ need_more_input, - { - /* .format = */ COMMON_CHAT_FORMAT_GENERIC, - /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, - /* .reasoning_in_content = */ false, - /* .thinking_forced_open = */ false, - } + params ); builder.try_parse_reasoning("<|START_THINKING|>", "<|END_THINKING|>"); diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index e1264b8e8..6820acf67 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -341,10 +341,11 @@ static void test_templates(const struct common_chat_templates * tmpls, const std } if (expect_grammar_triggered) { - common_chat_syntax syntax; - syntax.format = data.params.format; - syntax.reasoning_format = reasoning_format; - const auto msg = common_chat_parse(data.delta, /* is_partial= */ false, syntax); + // TODO @ngxson : refactor common_chat_parse to avoid passing format/reasoning_format every time + common_chat_parser_params params; + params.format = data.params.format; + params.reasoning_format = reasoning_format; + const auto msg = common_chat_parse(data.delta, /* is_partial= */ false, params); assert_msg_equals(test_message, msg, ignore_whitespace_differences); } @@ -556,7 +557,9 @@ struct make_peg_parser { } common_chat_msg parse(const std::string & msg, bool is_partial) { - return common_chat_peg_parse(arena_, msg, is_partial, /* syntax = */ {params_.format}); + common_chat_parser_params parser_params; + parser_params.format = params_.format; + return common_chat_peg_parse(arena_, msg, is_partial, parser_params); } }; @@ -750,6 +753,25 @@ static void test_tools_oaicompat_json_conversion() { } } +// for compat; ref: https://github.com/ggml-org/llama.cpp/pull/18961 +struct test_parser_params { + common_chat_format format = COMMON_CHAT_FORMAT_CONTENT_ONLY; + common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE; + bool 
reasoning_in_content = false; + bool thinking_forced_open = false; + bool parse_tool_calls = true; +}; + +static common_chat_msg test_chat_parse(const std::string & input, bool is_partial, const test_parser_params & syntax) { + common_chat_parser_params params; + params.format = syntax.format; + params.reasoning_format = syntax.reasoning_format; + params.reasoning_in_content = syntax.reasoning_in_content; + params.thinking_forced_open = syntax.thinking_forced_open; + params.parse_tool_calls = syntax.parse_tool_calls; + return common_chat_parse(input, is_partial, params); +} + static void test_template_output_parsers() { printf("[%s]\n", __func__); @@ -781,17 +803,17 @@ static void test_template_output_parsers() { } assert_msg_equals(message_assist, - common_chat_parse( + test_chat_parse( "Hello, world!\nWhat's up?", /* is_partial= */ false, {COMMON_CHAT_FORMAT_COMMAND_R7B})); assert_msg_equals(message_assist, - common_chat_parse( + test_chat_parse( "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>", /* is_partial= */ false, {COMMON_CHAT_FORMAT_COMMAND_R7B})); assert_msg_equals(message_assist_thoughts, - common_chat_parse( + test_chat_parse( "<|START_THINKING|>I'm\nthinking<|END_THINKING|>" "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>", /* is_partial= */ false, @@ -800,7 +822,7 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, })); assert_msg_equals(message_assist_thoughts_unparsed_deepseek, - common_chat_parse( + test_chat_parse( "<|START_THINKING|>I'm\nthinking<|END_THINKING|>" "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>", /* is_partial= */ false, @@ -811,13 +833,13 @@ static void test_template_output_parsers() { /* .thinking_forced_open = */ false, })); assert_msg_equals(message_assist_thoughts_unparsed_r7b, - common_chat_parse( + test_chat_parse( "<|START_THINKING|>I'm\nthinking<|END_THINKING|>" "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>", /* 
is_partial= */ false, {COMMON_CHAT_FORMAT_COMMAND_R7B})); assert_msg_equals(message_assist_thoughts, - common_chat_parse( + test_chat_parse( "<|START_THINKING|>I'm\nthinking<|END_THINKING|>" "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>", /* is_partial= */ false, @@ -826,7 +848,7 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, })); assert_msg_equals(message_assist_thoughts_call_idx, - common_chat_parse( + test_chat_parse( "<|START_THINKING|>I'm\nthinking<|END_THINKING|>" "<|START_ACTION|>[\n" " {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n" @@ -837,7 +859,7 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, })); assert_msg_equals(message_assist_thoughts_no_content, - common_chat_parse( + test_chat_parse( "<|START_THINKING|>I'm\nthinking<|END_THINKING|>" "<|START_ACTION|>[\n" " {\"tool_call_id\": \"0\", \"tool_name\": \"special", @@ -877,7 +899,7 @@ static void test_template_output_parsers() { assert_equals( simple_assist_msg("{ \"tool_call\" : { \"name\" : \"t"), - common_chat_parse( + test_chat_parse( "{ \"tool_call\" : { \"name\" : \"t", /* is_partial= */ true, { @@ -889,33 +911,33 @@ static void test_template_output_parsers() { })); assert_equals( message_assist_empty, - common_chat_parse( + test_chat_parse( "{ \"tool_call\" : { \"name\" : \"t", /* is_partial= */ true, {COMMON_CHAT_FORMAT_GENERIC})); assert_equals( simple_assist_msg("", "", "puppeteer_screenshot", "{\"name\":\"servethehome_homepage\","), - common_chat_parse( + test_chat_parse( R"({"tool_call": {"name": "puppeteer_screenshot", "arguments": {"name": "servethehome_homepage",)", /* is_partial= */ true, {COMMON_CHAT_FORMAT_GENERIC})); assert_equals( message_assist_call_empty_args, - common_chat_parse( + test_chat_parse( "{ \"tool_call\" : { \"name\" : \"special_function\"", /* is_partial= */ true, 
{COMMON_CHAT_FORMAT_GENERIC})); assert_equals( message_assist_call_cutoff_args, - common_chat_parse( + test_chat_parse( "{ \"tool_call\" : { \"name\" : \"special_function\", \"arguments\" : { \"arg", /* is_partial= */ true, {COMMON_CHAT_FORMAT_GENERIC})); assert_msg_equals(message_assist, - common_chat_parse( + test_chat_parse( "{\n" " \"response\": \"Hello, world!\\nWhat's up?\"\n" "}", @@ -951,7 +973,7 @@ static void test_template_output_parsers() { { assert_msg_equals( simple_assist_msg("Réponse", "raisonnement"), - common_chat_parse( + test_chat_parse( message_assist_thoughts_unparsed_magistral.content, /* is_partial= */ false, { @@ -988,14 +1010,14 @@ static void test_template_output_parsers() { // Test parsing assert_msg_equals( simple_assist_msg("", "", "python", ""), - common_chat_parse( + test_chat_parse( "```json\n" " { \"name\" : \"python\"", /* is_partial= */ true, {COMMON_CHAT_FORMAT_HERMES_2_PRO})); assert_msg_equals( simple_assist_msg("Let's call something\n"), - common_chat_parse( + test_chat_parse( "Let's call something\n" "{\"name\"", /* is_partial= */ true, @@ -1005,7 +1027,7 @@ static void test_template_output_parsers() { })); assert_msg_equals( simple_assist_msg("Let's call something\n"), - common_chat_parse( + test_chat_parse( "Let's call something\n" "{\"name", /* is_partial= */ true, @@ -1014,7 +1036,7 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, })); assert_msg_equals(message_assist_call_thoughts, - common_chat_parse( + test_chat_parse( // QwQ-32B's template adds a trailing if add_generation_prompt "I'm\nthinking\n" "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", @@ -1027,14 +1049,14 @@ static void test_template_output_parsers() { })); assert_msg_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "\n" "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" "", /* is_partial= */ false, {COMMON_CHAT_FORMAT_HERMES_2_PRO})); 
assert_msg_equals(message_assist_call_content, - common_chat_parse( + test_chat_parse( "Hello, world!\nWhat's up?\n" "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" "", @@ -1042,13 +1064,13 @@ static void test_template_output_parsers() { {COMMON_CHAT_FORMAT_HERMES_2_PRO})); assert_msg_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "{\"arg1\": 1}", /* is_partial= */ false, {COMMON_CHAT_FORMAT_HERMES_2_PRO})); assert_msg_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "\n" "{\"arg1\": 1}\n" "", @@ -1056,7 +1078,7 @@ static void test_template_output_parsers() { {COMMON_CHAT_FORMAT_HERMES_2_PRO})); assert_msg_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "\n" " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" "", @@ -1064,7 +1086,7 @@ static void test_template_output_parsers() { {COMMON_CHAT_FORMAT_HERMES_2_PRO})); assert_msg_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "\n" " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" "", @@ -1072,7 +1094,7 @@ static void test_template_output_parsers() { {COMMON_CHAT_FORMAT_HERMES_2_PRO})); assert_msg_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "\n" " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" "", @@ -1080,7 +1102,7 @@ static void test_template_output_parsers() { {COMMON_CHAT_FORMAT_HERMES_2_PRO})); assert_msg_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "```xml\n" "\n" " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" @@ -1090,7 +1112,7 @@ static void test_template_output_parsers() { {COMMON_CHAT_FORMAT_HERMES_2_PRO})); assert_msg_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "```xml\n" " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" "```", @@ -1098,7 +1120,7 @@ static void test_template_output_parsers() { {COMMON_CHAT_FORMAT_HERMES_2_PRO})); 
assert_msg_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "```\n" " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" "```", @@ -1106,7 +1128,7 @@ static void test_template_output_parsers() { {COMMON_CHAT_FORMAT_HERMES_2_PRO})); assert_msg_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "```\n" "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" "```", @@ -1114,7 +1136,7 @@ static void test_template_output_parsers() { {COMMON_CHAT_FORMAT_HERMES_2_PRO})); assert_msg_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "```json\n" " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" "```", @@ -1122,7 +1144,7 @@ static void test_template_output_parsers() { {COMMON_CHAT_FORMAT_HERMES_2_PRO})); assert_msg_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "```json\n" "\n" " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}} \n" @@ -1132,7 +1154,7 @@ static void test_template_output_parsers() { {COMMON_CHAT_FORMAT_HERMES_2_PRO})); assert_msg_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "\n" " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" "", @@ -1140,7 +1162,7 @@ static void test_template_output_parsers() { {COMMON_CHAT_FORMAT_HERMES_2_PRO})); assert_msg_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "\n" " {\n" " \"name\": \"special_function\", \"arguments\": {\"arg1\": 1}\n" @@ -1150,7 +1172,7 @@ static void test_template_output_parsers() { {COMMON_CHAT_FORMAT_HERMES_2_PRO})); assert_msg_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "\n" " {\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" "", @@ -1158,13 +1180,13 @@ static void test_template_output_parsers() { {COMMON_CHAT_FORMAT_HERMES_2_PRO})); assert_msg_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 
1}}", /* is_partial= */ false, {COMMON_CHAT_FORMAT_HERMES_2_PRO})); assert_msg_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "{\n \"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", /* is_partial= */ false, {COMMON_CHAT_FORMAT_HERMES_2_PRO})); @@ -1178,7 +1200,7 @@ static void test_template_output_parsers() { assert_msg_equals( message_assist_multiple_calls, - common_chat_parse( + test_chat_parse( "\n" "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n" "\n" @@ -1190,7 +1212,7 @@ static void test_template_output_parsers() { assert_msg_equals( message_assist_multiple_calls, - common_chat_parse( + test_chat_parse( "{\"arg1\": 1}\n" "{\"code\":\"print('hello')\"}", /* is_partial= */ false, @@ -1202,27 +1224,27 @@ static void test_template_output_parsers() { "", "special_function", "{\"arg1\": 1}"), - common_chat_parse( + test_chat_parse( "This is not a tool call:\n" "{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}", /* is_partial= */ false, {COMMON_CHAT_FORMAT_HERMES_2_PRO})); assert_msg_equals(message_assist, - common_chat_parse( + test_chat_parse( "Hello, world!\nWhat's up?", /* is_partial= */ false, {COMMON_CHAT_FORMAT_HERMES_2_PRO})); assert_msg_equals(message_assist_thoughts_unparsed_deepseek, - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ false, {COMMON_CHAT_FORMAT_HERMES_2_PRO})); // assert_msg_equals(message_assist_thoughts_unparsed_deepseek, - // common_chat_parse( + // test_chat_parse( // "I'm\nthinkingHello, world!\nWhat's up?", // COMMON_CHAT_FORMAT_HERMES_2_PRO)); assert_msg_equals(message_assist_thoughts, - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ false, { @@ -1230,7 +1252,7 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, })); assert_msg_equals(message_assist_thoughts, - common_chat_parse( + test_chat_parse( 
"I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ true, { @@ -1238,7 +1260,7 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, })); assert_msg_equals(message_assist_thoughts_unparsed_md, - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?\n```json\n{}```", /* is_partial= */ false, { @@ -1249,7 +1271,7 @@ static void test_template_output_parsers() { /* .parse_tool_calls = */ false, })); assert_msg_equals(message_assist_thoughts_unparsed_md_partial, - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?\n```json\n{}```", /* is_partial= */ true, { @@ -1259,7 +1281,7 @@ static void test_template_output_parsers() { /* .thinking_forced_open = */ false, })); assert_msg_equals(message_assist_thoughts_unopened_unparsed, - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ false, { @@ -1267,7 +1289,7 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, })); assert_msg_equals(message_assist_thoughts, - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ false, { @@ -1304,7 +1326,7 @@ static void test_template_output_parsers() { ""); assert_msg_equals( simple_assist_msg("", /* reasoning_content= */ "nah uhg"), - common_chat_parse( + test_chat_parse( "nah uhg", /* is_partial= */ false, { @@ -1328,7 +1350,7 @@ static void test_template_output_parsers() { assert_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "{\"name\": \"special_function\", \"parameters\": {\"arg1\": 1}}", /* is_partial= */ false, {COMMON_CHAT_FORMAT_LLAMA_3_X})); @@ -1366,7 +1388,7 @@ static void test_template_output_parsers() { for (auto is_partial : { false, true }) { assert_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "{\"arg1\": 1}", is_partial, {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1})); @@ 
-1374,7 +1396,7 @@ static void test_template_output_parsers() { assert_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "{\"arg1\": 1}<", /* is_partial= */ true, {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1})); @@ -1396,7 +1418,7 @@ static void test_template_output_parsers() { "", "special_function", "{\"arg1\": 1}"), - common_chat_parse( + test_chat_parse( "all\n" "Hello, world!\n" "nono\n" @@ -1405,27 +1427,27 @@ static void test_template_output_parsers() { /* is_partial= */ false, {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2})); assert_msg_equals(message_assist_call_python_lines, - common_chat_parse( + test_chat_parse( "python\n" "# This is a program:\n" "print('hey')", /* is_partial= */ false, {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2})); assert_msg_equals(message_assist_call_python_lines_unclosed, - common_chat_parse( + test_chat_parse( "python\n" "# This is a program:\n" "print('hey')", /* is_partial= */ true, {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2})); assert_msg_equals(message_assist_call, - common_chat_parse( + test_chat_parse( "special_function\n" "{\"arg1\": 1} \n ", /* is_partial= */ false, {COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2})); assert_msg_equals(message_assist, - common_chat_parse( + test_chat_parse( "all\n" "Hello, world!\nWhat's up?", /* is_partial= */ false, @@ -1466,7 +1488,7 @@ static void test_template_output_parsers() { test_templates(tmpls.get(), end_tokens, message_assist_thoughts, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); assert_msg_equals( simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking"), - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ false, { @@ -1477,7 +1499,7 @@ static void test_template_output_parsers() { })); assert_msg_equals( simple_assist_msg("", "I need to remember the correct syntax. It starts with <|tool▁calls▁begin|> and ends with"), - common_chat_parse( + test_chat_parse( "I need to remember the correct syntax. 
It starts with <|tool▁calls▁begin|> and ends with", /* is_partial= */ true, { @@ -1487,7 +1509,7 @@ static void test_template_output_parsers() { /* .thinking_forced_open = */ true, })); assert_msg_equals(message_assist_thoughts, - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ false, { @@ -1495,7 +1517,7 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, })); assert_msg_equals(message_assist_thoughts_unopened_unparsed, - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ false, { @@ -1503,7 +1525,7 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, })); assert_msg_equals(message_assist_thoughts, - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ false, { @@ -1514,7 +1536,7 @@ static void test_template_output_parsers() { })); assert_msg_equals(message_assist_thoughts, // Latest template update (ast of 20250209) adds a trailing \n if add_generation_prompt is true. 
- common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ false, { @@ -1543,12 +1565,12 @@ static void test_template_output_parsers() { test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); test_templates(tmpls.get(), end_tokens, message_assist_thoughts, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); assert_msg_equals(message_assist_thoughts_unparsed_deepseek, - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ false, {COMMON_CHAT_FORMAT_DEEPSEEK_R1})); assert_msg_equals(message_assist_thoughts, - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ false, { @@ -1556,7 +1578,7 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, })); assert_msg_equals(message_assist_thoughts, - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ false, { @@ -1567,7 +1589,7 @@ static void test_template_output_parsers() { })); assert_msg_equals(message_assist_call_thoughts_unparsed, - common_chat_parse( + test_chat_parse( "I'm\nthinking\n\n" "<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>special_function\n" "```json\n" @@ -1576,7 +1598,7 @@ static void test_template_output_parsers() { /* is_partial= */ false, {COMMON_CHAT_FORMAT_DEEPSEEK_R1})); assert_msg_equals(message_assist_call, - common_chat_parse( + test_chat_parse( "<|tool▁calls|>function<|tool▁sep|>special_function\n" "```json\n" "{\"arg1\": 1}\n" @@ -1585,7 +1607,7 @@ static void test_template_output_parsers() { {COMMON_CHAT_FORMAT_DEEPSEEK_R1})); assert_msg_equals(message_assist_call_thoughts, - common_chat_parse( + test_chat_parse( "I'm\nthinking\n\n" "<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>special_function\n" "```json\n" @@ -1612,20 +1634,20 @@ static 
void test_template_output_parsers() { // Test parsing regular content assert_msg_equals(message_assist, - common_chat_parse( + test_chat_parse( "Hello, world!\nWhat's up?", /* is_partial= */ false, {COMMON_CHAT_FORMAT_GRANITE})); assert_msg_equals( message_assist, - common_chat_parse( + test_chat_parse( "Hello, world!\nWhat's up?", /* is_partial= */ true, {COMMON_CHAT_FORMAT_GRANITE})); // Test parsing content with thinking assert_msg_equals(message_assist_thoughts, - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ false, { @@ -1633,12 +1655,12 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, })); assert_msg_equals(message_assist_thoughts_unparsed_deepseek, - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ false, {COMMON_CHAT_FORMAT_GRANITE})); assert_msg_equals(message_assist_thoughts, - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ true, { @@ -1646,7 +1668,7 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, })); assert_msg_equals(message_assist_thoughts, - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ false, { @@ -1654,12 +1676,12 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK, })); assert_msg_equals(simple_assist_msg("I'm\nthinkingHello, world!\nWhat's up?"), - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ false, {COMMON_CHAT_FORMAT_GRANITE})); assert_msg_equals(message_assist_empty, - common_chat_parse( + test_chat_parse( "I'm\nthinking", /* is_partial= */ true, { @@ -1681,32 +1703,32 @@ static void test_template_output_parsers() { })); assert_msg_equals( message_assist_empty, - common_chat_parse( + test_chat_parse( 
"I'm\nthinking[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]", /* is_partial= */ false, {COMMON_CHAT_FORMAT_GRANITE})); assert_msg_equals( message_assist_call_empty_args, - common_chat_parse( + test_chat_parse( "<|tool_call|>[{\"name\": \"special_function\"", /* is_partial= */ true, {COMMON_CHAT_FORMAT_GRANITE})); assert_msg_equals( message_assist_call_cutoff_args, - common_chat_parse( + test_chat_parse( "<|tool_call|>[{\"name\": \"special_function\", \"arguments\": {\"arg", /* is_partial= */ true, {COMMON_CHAT_FORMAT_GRANITE})); assert_msg_equals( message_assist_call_cutoff_args, - common_chat_parse( + test_chat_parse( "<|tool_call|>[{\"name\": \"special_function\", \"arguments\": {\"arg", /* is_partial= */ true, { @@ -1717,7 +1739,7 @@ static void test_template_output_parsers() { // Test parsing tool calls with thinking assert_msg_equals( message_assist_call_thoughts, - common_chat_parse( + test_chat_parse( "I'm\nthinking<|tool_call|>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}, {", /* is_partial= */ true, { @@ -1757,7 +1779,7 @@ static void test_template_output_parsers() { assert_equals(COMMON_CHAT_FORMAT_GPT_OSS, common_chat_templates_apply(tmpls.get(), inputs_tools).format); assert_msg_equals(simple_assist_msg("", "I'm\nthink"), - common_chat_parse( + test_chat_parse( "<|channel|>analysis<|message|>I'm\nthink", /* is_partial= */ true, { @@ -1765,7 +1787,7 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, })); assert_msg_equals(simple_assist_msg("", "I'm\nthinking"), - common_chat_parse( + test_chat_parse( "<|channel|>analysis<|message|>I'm\nthinking<|end|>", /* is_partial= */ true, { @@ -1773,7 +1795,7 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, })); assert_msg_equals(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking"), - common_chat_parse( + test_chat_parse( 
"<|channel|>analysis<|message|>I'm\nthinking<|end|>" "<|start|>assistant<|channel|>final<|message|>Hello, world!\nWhat's up?", /* is_partial= */ false, @@ -1782,7 +1804,7 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, })); assert_msg_equals(simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1"), - common_chat_parse( + test_chat_parse( "<|channel|>analysis<|message|>I'm\nthinking<|end|>" "<|start|>assistant<|channel|>commentary to=functions.special_function <|constrain|>json<|message|>{\"arg1", /* is_partial= */ true, @@ -1791,7 +1813,7 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, })); assert_msg_equals(simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1"), - common_chat_parse( + test_chat_parse( "<|channel|>analysis<|message|>I'm\nthinking<|end|>" "<|start|>assistant<|channel|>commentary to=functions.special_function<|message|>{\"arg1", /* is_partial= */ true, @@ -1800,7 +1822,7 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, })); assert_msg_equals(simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1\": 1}"), - common_chat_parse( + test_chat_parse( "<|channel|>analysis<|message|>I'm\nthinking<|end|>" "<|start|>assistant<|channel|>commentary to=functions.special_function <|constrain|>json<|message|>{\"arg1\": 1}", /* is_partial= */ false, @@ -1809,7 +1831,7 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, })); assert_msg_equals(simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1\": 1}"), - common_chat_parse( + test_chat_parse( "<|channel|>analysis<|message|>I'm\nthinking<|end|>" "<|start|>assistant<|channel|>analysis to=functions.special_function <|constrain|>json<|message|>{\"arg1\": 1}", /* is_partial= */ false, @@ -1818,7 +1840,7 @@ static void test_template_output_parsers() { /* 
.reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, })); assert_msg_equals(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking"), - common_chat_parse( + test_chat_parse( "<|channel|>analysis<|message|>I'm\nthinking<|end|>" "<|start|>assistant<|channel|>commentary<|message|>Hello, world!\nWhat's up?", /* is_partial= */ true, @@ -1827,7 +1849,7 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, })); assert_msg_equals(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking", "special_function", "{\"arg1\": 1}"), - common_chat_parse( + test_chat_parse( "<|channel|>analysis<|message|>I'm\nthinking<|end|>" "<|start|>assistant<|channel|>commentary<|message|>Hello, world!\nWhat's up?<|end|>" "<|start|>assistant<|channel|>commentary to=functions.special_function <|constrain|>json<|message|>{\"arg1\": 1}", @@ -1840,7 +1862,7 @@ static void test_template_output_parsers() { // Test parse_tool_calls == false assert_msg_equals( simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking"), - common_chat_parse( + test_chat_parse( "<|channel|>analysis<|message|>I'm\nthinking<|end|>" "<|start|>assistant<|channel|>final<|message|>Hello, world!\nWhat's up?", /* is_partial= */ true, @@ -1853,7 +1875,7 @@ static void test_template_output_parsers() { })); assert_msg_equals( simple_assist_msg("", "I'm\nthinking"), - common_chat_parse( + test_chat_parse( "<|channel|>analysis<|message|>I'm\nthinking<|end|>" "<|start|>assistant<|channel|>commentary to=functions.special_function<|message|>{\"arg1", /* is_partial= */ true, @@ -1866,7 +1888,7 @@ static void test_template_output_parsers() { })); assert_msg_equals( simple_assist_msg("", "I'm\nthinking"), - common_chat_parse( + test_chat_parse( "<|channel|>analysis<|message|>I'm\nthinking<|end|>" "<|start|>assistant<|channel|>commentary to=functions.special_function <|constrain|>json<|message|>{\"arg1\": 1}", /* is_partial= */ false, @@ -1882,7 +1904,7 @@ static void 
test_template_output_parsers() { assert_msg_equals( simple_assist_msg( "<|channel|>analysis<|message|>I'm\nthinking<|end|>Hello, world!\nWhat's up?"), - common_chat_parse( + test_chat_parse( "<|channel|>analysis<|message|>I'm\nthinking<|end|>" "<|start|>assistant<|channel|>final<|message|>Hello, world!\nWhat's up?", /* is_partial= */ false, @@ -1894,7 +1916,7 @@ static void test_template_output_parsers() { assert_msg_equals( simple_assist_msg( "<|channel|>analysis<|message|>I'm\nthinking<|end|>Hello, world!\nWhat's up?"), - common_chat_parse( + test_chat_parse( "<|channel|>analysis<|message|>I'm\nthinking<|end|>" "<|start|>assistant<|channel|>final<|message|>Hello, world!\nWhat's up?", /* is_partial= */ false, @@ -1906,7 +1928,7 @@ static void test_template_output_parsers() { // Test tool calling in role header assert_msg_equals(simple_assist_msg("", "", "special_function", "{\"arg1\": 1}"), - common_chat_parse( + test_chat_parse( " to=functions.special_function<|channel|>commentary <|constrain|>json<|message|>{\"arg1\": 1}", /* is_partial= */ false, { @@ -1914,7 +1936,7 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, })); assert_msg_equals(simple_assist_msg("", "", "special_function", "{\"arg1\": 1}"), - common_chat_parse( + test_chat_parse( " to=functions.special_function<|channel|>analysis <|constrain|>json<|message|>{\"arg1\": 1}", /* is_partial= */ false, { @@ -1922,7 +1944,7 @@ static void test_template_output_parsers() { /* .reasoning_format = */ COMMON_REASONING_FORMAT_AUTO, })); assert_msg_equals(simple_assist_msg("", "I'm\nthinking", "special_function", "{\"arg1\": 1}"), - common_chat_parse( + test_chat_parse( "<|channel|>analysis<|message|>I'm\nthinking<|end|>" "<|start|>assistant to=functions.special_function<|channel|>analysis <|constrain|>json<|message|>{\"arg1\": 1}", /* is_partial= */ false, @@ -1944,7 +1966,7 @@ static void test_template_output_parsers() { // Test simple reasoning content 
assert_msg_equals( simple_assist_msg("Hello, world!", "I'm thinking about the answer"), - common_chat_parse( + test_chat_parse( "I'm thinking about the answerHello, world!", /* is_partial= */ false, { @@ -1959,7 +1981,7 @@ static void test_template_output_parsers() { msg_budget_reflect.reasoning_content = "Token usage: 45/1000\nI should continue thinking to find the best solution."; assert_msg_equals( msg_budget_reflect, - common_chat_parse( + test_chat_parse( "Token usage: 45/1000\nI should continue thinking to find the best solution." "Token usage: 45/1000\nI should continue thinking to find the best solution." "I need to calculate this step by step.", @@ -1975,7 +1997,7 @@ static void test_template_output_parsers() { msg_tool_call.tool_calls.push_back({"calculate_sum", "{\"numbers\": [1, 2, 3]}", ""}); assert_msg_equals( msg_tool_call, - common_chat_parse( + test_chat_parse( "\n" "\n" "[1, 2, 3]\n" @@ -1992,7 +2014,7 @@ static void test_template_output_parsers() { msg_reasoning_tool.tool_calls.push_back({"calculate_sum", "{\"numbers\": [1, 2, 3]}", ""}); assert_msg_equals( msg_reasoning_tool, - common_chat_parse( + test_chat_parse( "I need to calculate the sum of these numbers" "\n" "\n" @@ -2013,7 +2035,7 @@ static void test_template_output_parsers() { std::size_t previousToolCalls = 0; for (std::size_t i = std::string("").length(); i < tool_msg.length() - 1; i++) { auto partial = tool_msg.substr(0, i); - auto partial_res = common_chat_parse(partial, true, { COMMON_CHAT_FORMAT_SEED_OSS, COMMON_REASONING_FORMAT_DEEPSEEK }); + auto partial_res = test_chat_parse(partial, true, { COMMON_CHAT_FORMAT_SEED_OSS, COMMON_REASONING_FORMAT_DEEPSEEK }); if (partial_res.tool_calls.size() < previousToolCalls) { throw std::runtime_error("Tool call size decreased on partial: " + partial + " from " + std::to_string(previousToolCalls) + " to " + std::to_string(partial_res.tool_calls.size())); } @@ -2026,7 +2048,7 @@ static void test_template_output_parsers() { 
msg_multi_param.tool_calls.push_back({"process_data", "{\"input\": \"test\", \"format\": \"json\"}", ""}); assert_msg_equals( msg_multi_param, - common_chat_parse( + test_chat_parse( "\n" "\n" "test\n" @@ -2039,7 +2061,7 @@ static void test_template_output_parsers() { // Test partial parsing for incomplete tool call - don't actually add the call until parsing parameters is done assert_msg_equals( simple_assist_msg("", "", "calculate_sum", "{\"numbers\":"), - common_chat_parse( + test_chat_parse( "\n" "\n" "[1,\n", @@ -2049,7 +2071,7 @@ static void test_template_output_parsers() { // Test incomplete reasoning tag assert_msg_equals( simple_assist_msg("", "I was thinking"), - common_chat_parse( + test_chat_parse( "I was thinking", /* is_partial= */ true, { @@ -2060,7 +2082,7 @@ static void test_template_output_parsers() { // Test content without reasoning assert_msg_equals( simple_assist_msg("This is a simple response without reasoning."), - common_chat_parse( + test_chat_parse( "This is a simple response without reasoning.", /* is_partial= */ false, {COMMON_CHAT_FORMAT_SEED_OSS})); @@ -2074,14 +2096,14 @@ static void test_template_output_parsers() { // Test parsing regular content assert_msg_equals(message_assist, - common_chat_parse( + test_chat_parse( "Hello, world!\nWhat's up?", /* is_partial= */ false, {COMMON_CHAT_FORMAT_NEMOTRON_V2})); // Test parsing content with thinking assert_msg_equals(message_assist_thoughts, - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ false, { @@ -2091,14 +2113,14 @@ static void test_template_output_parsers() { // Test parsing tool calls assert_msg_equals(message_assist_call, - common_chat_parse( + test_chat_parse( "[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]", /* is_partial= */ false, {COMMON_CHAT_FORMAT_NEMOTRON_V2})); // Test parsing tool calls with thinking assert_msg_equals(message_assist_call_thoughts, - common_chat_parse( + test_chat_parse( 
"I'm\nthinking[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]", /* is_partial= */ false, { @@ -2108,7 +2130,7 @@ static void test_template_output_parsers() { // Test tool calls with extra content assert_msg_equals(message_assist_call_content, - common_chat_parse( + test_chat_parse( "[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]Hello, world!\nWhat's up?", /* is_partial= */ false, {COMMON_CHAT_FORMAT_NEMOTRON_V2} @@ -2116,7 +2138,7 @@ static void test_template_output_parsers() { // Test tool calls with extra content AND thinking assert_msg_equals(message_assist_call_thoughts_content, - common_chat_parse( + test_chat_parse( "I'm\nthinking[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]Hello, world!\nWhat's up?", /* is_partial= */ false, { @@ -2149,7 +2171,7 @@ static void test_template_output_parsers() { test_templates(tmpls.get(), end_tokens, message_assist_thoughts, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); assert_msg_equals( simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking"), - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ false, { @@ -2161,7 +2183,7 @@ static void test_template_output_parsers() { // variant: thinking forced open, reasoning_format none assert_msg_equals( simple_assist_msg("REASONINGok", ""), - common_chat_parse( + test_chat_parse( "REASONINGok", /* is_partial= */ false, { @@ -2174,7 +2196,7 @@ static void test_template_output_parsers() { // variant: happy path for when it works as the model card says it should assert_msg_equals( simple_assist_msg("", "", "get_time", "{\"city\":\"Tokyo\"}"), - common_chat_parse( + test_chat_parse( "<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>", /* is_partial= */ false, { @@ -2187,7 +2209,7 @@ static void test_template_output_parsers() { // variant: simple + thinking open assert_msg_equals( 
simple_assist_msg("", "REASONING", "get_time", "{\"city\":\"Tokyo\"}"), - common_chat_parse( + test_chat_parse( "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>", /* is_partial= */ false, { @@ -2205,7 +2227,7 @@ static void test_template_output_parsers() { message_assist_multiple_calls.tool_calls.push_back({"get_weather", "{\"city\":\"Paris\"}", ""}); assert_msg_equals( message_assist_multiple_calls, - common_chat_parse( + test_chat_parse( "CONTENT<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Paris\"}<|tool▁call▁end|><|tool▁call▁begin|>get_weather<|tool▁sep|>{\"city\": \"Paris\"}<|tool▁call▁end|><|tool▁calls▁end|>", /* is_partial= */ false, { @@ -2218,7 +2240,7 @@ static void test_template_output_parsers() { // variant: thinking forced open + tool call in reasoning content assert_msg_equals( simple_assist_msg("", "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time2<|tool▁sep|>{\"city\": \"Tokyo2\"}<|tool▁call▁end|><|tool▁calls▁end|>REASONING", "get_time", "{\"city\":\"Tokyo\"}"), - common_chat_parse( + test_chat_parse( "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time2<|tool▁sep|>{\"city\": \"Tokyo2\"}<|tool▁call▁end|><|tool▁calls▁end|>REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>", /* is_partial= */ false, { @@ -2234,7 +2256,7 @@ static void test_template_output_parsers() { // add the reasoning content as regular content and parse the tool calls. 
assert_msg_equals( simple_assist_msg("REASONING", "", "get_time", "{\"city\":\"Tokyo\"}"), - common_chat_parse( + test_chat_parse( "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>", /* is_partial= */ false, { @@ -2247,7 +2269,7 @@ static void test_template_output_parsers() { // variant: thinking forced open + tool call in reasoning content + no closing think + partial assert_msg_equals( simple_assist_msg("", "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>", "", ""), - common_chat_parse( + test_chat_parse( "REASONING<|tool▁calls▁begin|><|tool▁call▁begin|>get_time<|tool▁sep|>{\"city\": \"Tokyo\"}<|tool▁call▁end|><|tool▁calls▁end|>", /* is_partial= */ true, { @@ -2260,7 +2282,7 @@ static void test_template_output_parsers() { // variant: thinking not forced open + missing reasoning + no tool calls assert_msg_equals( simple_assist_msg("CONTENT", ""), - common_chat_parse( + test_chat_parse( "CONTENT", /* is_partial= */ false, { @@ -2280,14 +2302,14 @@ static void test_template_output_parsers() { // Test parsing regular content assert_msg_equals(message_assist, - common_chat_parse( + test_chat_parse( "Hello, world!\nWhat's up?", /* is_partial= */ false, {COMMON_CHAT_FORMAT_APERTUS})); // Test parsing content with thinking assert_msg_equals(message_assist_thoughts, - common_chat_parse( + test_chat_parse( "<|inner_prefix|>I'm\nthinking<|inner_suffix|>Hello, world!\nWhat's up?", /* is_partial= */ false, { @@ -2297,14 +2319,14 @@ static void test_template_output_parsers() { // Test parsing tool calls assert_msg_equals(message_assist_call, - common_chat_parse( + test_chat_parse( "<|tools_prefix|>[{\"special_function\": {\"arg1\": 1}}]<|tools_suffix|>", /* is_partial= */ false, {COMMON_CHAT_FORMAT_APERTUS})); // Test parsing tool calls with thinking assert_msg_equals(message_assist_call_thoughts, - common_chat_parse( + 
test_chat_parse( "<|inner_prefix|>I'm\nthinking<|inner_suffix|><|tools_prefix|>[{\"special_function\": {\"arg1\": 1}}]<|tools_suffix|>", /* is_partial= */ false, { @@ -2314,7 +2336,7 @@ static void test_template_output_parsers() { // Test tool calls with extra content assert_msg_equals(message_assist_call_content, - common_chat_parse( + test_chat_parse( "<|tools_prefix|>[{\"special_function\": {\"arg1\": 1}}]<|tools_suffix|>Hello, world!\nWhat's up?", /* is_partial= */ false, {COMMON_CHAT_FORMAT_APERTUS} @@ -2322,7 +2344,7 @@ static void test_template_output_parsers() { // Test tool calls with extra content AND thinking assert_msg_equals(message_assist_call_thoughts_content, - common_chat_parse( + test_chat_parse( "<|inner_prefix|>I'm\nthinking<|inner_suffix|><|tools_prefix|>[{\"special_function\": {\"arg1\": 1}}]<|tools_suffix|>Hello, world!\nWhat's up?", /* is_partial= */ false, { @@ -2402,7 +2424,7 @@ Hey there!<|im_end|> // Test parsing regular content assert_msg_equals(message_assist, - common_chat_parse( + test_chat_parse( "Hello, world!\nWhat's up?", /* is_partial= */ false, {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS})); @@ -2413,7 +2435,7 @@ Hey there!<|im_end|> msg_single_tool_call.tool_calls.push_back({"special_function", "{\"arg1\":1}", ""}); assert_msg_equals( msg_single_tool_call, - common_chat_parse( + test_chat_parse( "<|tool_call_start|>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]<|tool_call_end|>", /* is_partial= */ false, {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS})); @@ -2424,7 +2446,7 @@ Hey there!<|im_end|> msg_tool_call_string.tool_calls.push_back({"get_weather", "{\"location\":\"Paris\"}", ""}); assert_msg_equals( msg_tool_call_string, - common_chat_parse( + test_chat_parse( "<|tool_call_start|>[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"Paris\"}}]<|tool_call_end|>", /* is_partial= */ false, {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS})); @@ -2435,7 +2457,7 @@ Hey there!<|im_end|> 
msg_multi_args.tool_calls.push_back({"calculate", "{\"x\":10,\"y\":20,\"operation\":\"add\"}", ""}); assert_msg_equals( msg_multi_args, - common_chat_parse( + test_chat_parse( "<|tool_call_start|>[{\"name\": \"calculate\", \"arguments\": {\"x\": 10, \"y\": 20, \"operation\": \"add\"}}]<|tool_call_end|>", /* is_partial= */ false, {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS})); @@ -2447,7 +2469,7 @@ Hey there!<|im_end|> msg_multiple_tools.tool_calls.push_back({"get_time", "{\"timezone\":\"UTC\"}", ""}); assert_msg_equals( msg_multiple_tools, - common_chat_parse( + test_chat_parse( "<|tool_call_start|>[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"Paris\"}}, {\"name\": \"get_time\", \"arguments\": {\"timezone\": \"UTC\"}}]<|tool_call_end|>", /* is_partial= */ false, {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS})); @@ -2459,7 +2481,7 @@ Hey there!<|im_end|> msg_content_before_tool.tool_calls.push_back({"get_weather", "{\"location\":\"Paris\"}", ""}); assert_msg_equals( msg_content_before_tool, - common_chat_parse( + test_chat_parse( "Let me check the weather for you.<|tool_call_start|>[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"Paris\"}}]<|tool_call_end|>", /* is_partial= */ false, {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS})); @@ -2471,7 +2493,7 @@ Hey there!<|im_end|> msg_content_after_tool.tool_calls.push_back({"get_weather", "{\"location\":\"Paris\"}", ""}); assert_msg_equals( msg_content_after_tool, - common_chat_parse( + test_chat_parse( "<|tool_call_start|>[{\"name\": \"get_weather\", \"arguments\": {\"location\": \"Paris\"}}]<|tool_call_end|>Here's the result.", /* is_partial= */ false, {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS})); @@ -2482,7 +2504,7 @@ Hey there!<|im_end|> msg_tool_call_newlines.tool_calls.push_back({"get_current_time", "{\"location\":\"Paris\"}", ""}); assert_msg_equals( msg_tool_call_newlines, - common_chat_parse( + test_chat_parse( "<|tool_call_start|>[{\n \"name\": \"get_current_time\",\n \"arguments\": {\n 
\"location\": \"Paris\"\n }\n}]<|tool_call_end|>", /* is_partial= */ false, {COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS})); @@ -2502,14 +2524,14 @@ Hey there!<|im_end|> // Test parsing regular content assert_msg_equals(message_assist, - common_chat_parse( + test_chat_parse( "Hello, world!\nWhat's up?", /* is_partial= */ false, {COMMON_CHAT_FORMAT_MINIMAX_M2})); // Test parsing content with thinking assert_msg_equals(message_assist_thoughts, - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ false, { @@ -2519,14 +2541,14 @@ Hey there!<|im_end|> // Test parsing tool calls assert_msg_equals(message_assist_call, - common_chat_parse( + test_chat_parse( "1", /* is_partial= */ false, {COMMON_CHAT_FORMAT_MINIMAX_M2})); // Test parsing tool calls with thinking assert_msg_equals(message_assist_call_thoughts, - common_chat_parse( + test_chat_parse( "I'm\nthinking1", /* is_partial= */ false, { @@ -2536,7 +2558,7 @@ Hey there!<|im_end|> // Test tool calls with extra content assert_msg_equals(message_assist_call_content, - common_chat_parse( + test_chat_parse( "1Hello, world!\nWhat's up?", /* is_partial= */ false, {COMMON_CHAT_FORMAT_MINIMAX_M2} @@ -2544,7 +2566,7 @@ Hey there!<|im_end|> // Test tool calls with extra content AND thinking assert_msg_equals(message_assist_call_thoughts_content, - common_chat_parse( + test_chat_parse( "I'm\nthinking1Hello, world!\nWhat's up?", /* is_partial= */ false, { @@ -2555,25 +2577,25 @@ Hey there!<|im_end|> // Test streaming test_parser_with_streaming(message_assist_call_thoughts_content, "I'm\nthinking\nHello, world!\nWhat's up?\n1", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_MINIMAX_M2, /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK }); }); test_parser_with_streaming(message_assist_call_thoughts_unparsed, 
"I'm\nthinking\n\n1", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_MINIMAX_M2, /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE }); }); test_parser_with_streaming(message_assist_call_thoughts_content, "I'm\nthinking\n\n\nHello, world!\nWhat's up?\n\n\n\n1\n\n\n", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_MINIMAX_M2, /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK }); }); test_parser_with_streaming(message_assist_call_withopt, "\n\n1\n2\n\n", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_MINIMAX_M2, /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE }); }); @@ -2618,14 +2640,14 @@ Hey there!<|im_end|> // Test parsing regular content assert_msg_equals(message_assist, - common_chat_parse( + test_chat_parse( "Hello, world!\nWhat's up?", /* is_partial= */ false, {COMMON_CHAT_FORMAT_GLM_4_5})); // Test parsing content with thinking assert_msg_equals(message_assist_thoughts, - common_chat_parse( + test_chat_parse( "\nI'm\nthinking\nHello, world!\nWhat's up?", /* is_partial= */ false, { @@ -2635,14 +2657,14 @@ Hey there!<|im_end|> // Test parsing tool calls assert_msg_equals(message_assist_call, - common_chat_parse( + test_chat_parse( "\nspecial_function\narg1\n1\n", /* is_partial= */ false, {COMMON_CHAT_FORMAT_GLM_4_5}), true); // Test parsing tool calls with thinking assert_msg_equals(message_assist_call_thoughts, - common_chat_parse( + test_chat_parse( "\nI'm\nthinking\nspecial_function\narg1\n1\n", /* is_partial= */ false, { @@ -2652,7 +2674,7 @@ Hey 
there!<|im_end|> // Test tool calls with extra content assert_msg_equals(message_assist_call_content, - common_chat_parse( + test_chat_parse( "\nspecial_function\narg1\n1\nHello, world!\nWhat's up?", /* is_partial= */ false, {COMMON_CHAT_FORMAT_GLM_4_5} @@ -2660,7 +2682,7 @@ Hey there!<|im_end|> // Test tool calls with extra content AND thinking assert_msg_equals(message_assist_call_thoughts_content, - common_chat_parse( + test_chat_parse( "\nI'm\nthinkingHello, world!\nWhat's up?\nspecial_function\narg1\n1\n", /* is_partial= */ false, { @@ -2671,19 +2693,19 @@ Hey there!<|im_end|> // Test streaming test_parser_with_streaming(message_assist_call_thoughts_content, "\nI'm\nthinkingHello, world!\nWhat's up?\nspecial_function\narg1\n1\n", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_GLM_4_5, /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK }); }); test_parser_with_streaming(message_assist_call_thoughts_unparsed, "\nI'm\nthinking\n\nspecial_function\narg1\n1\n", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_GLM_4_5, /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE }); }); test_parser_with_streaming(message_assist_call_withopt, "\n\nspecial_function_with_opt\narg1\n1\narg2\n2\n\n", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_GLM_4_5, /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK }); }); @@ -2699,7 +2721,7 @@ Hey there!<|im_end|> "score\n" "95.5\n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, 
{COMMON_CHAT_FORMAT_GLM_4_5}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_GLM_4_5}); }); test_parser_with_streaming( simple_assist_msg("", "", "web_search", "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}"), "web_search\n" @@ -2710,18 +2732,18 @@ Hey there!<|im_end|> "type\n" "text\n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_GLM_4_5}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_GLM_4_5}); }); // Test interleaved thinking test_parser_with_streaming(simple_assist_msg("Hello, world!\n\nWhat's up?", "I'm\nthinkingThinking2", "special_function", "{\"arg1\": 1}"), "\nI'm\nthinkingHello, world!\nThinking2What's up?\nspecial_function\narg1\n1\n", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_GLM_4_5, /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK }); }); test_parser_with_streaming(simple_assist_msg("\nI'm\nthinkingHello, world!\nThinking2What's up?", "", "special_function", "{\"arg1\": 1}"), "\nI'm\nthinkingHello, world!\nThinking2What's up?\nspecial_function\narg1\n1\n", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_GLM_4_5, /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE }); }); @@ -2766,14 +2788,14 @@ Hey there!<|im_end|> // Test parsing regular content assert_msg_equals(message_assist, - common_chat_parse( + test_chat_parse( "Hello, world!\nWhat's up?", /* is_partial= */ false, {COMMON_CHAT_FORMAT_KIMI_K2})); // Test parsing content with thinking 
assert_msg_equals(message_assist_thoughts, - common_chat_parse( + test_chat_parse( "I'm\nthinkingHello, world!\nWhat's up?", /* is_partial= */ false, { @@ -2783,14 +2805,14 @@ Hey there!<|im_end|> // Test parsing tool calls assert_msg_equals(message_assist_call, - common_chat_parse( + test_chat_parse( "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", /* is_partial= */ false, {COMMON_CHAT_FORMAT_KIMI_K2})); // Test parsing tool calls with thinking assert_msg_equals(message_assist_call_thoughts, - common_chat_parse( + test_chat_parse( "I'm\nthinking<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", /* is_partial= */ false, { @@ -2800,7 +2822,7 @@ Hey there!<|im_end|> // Test tool calls with extra content assert_msg_equals(message_assist_call_content, - common_chat_parse( + test_chat_parse( "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>Hello, world!\nWhat's up?", /* is_partial= */ false, {COMMON_CHAT_FORMAT_KIMI_K2} @@ -2808,7 +2830,7 @@ Hey there!<|im_end|> // Test tool calls with extra content AND thinking assert_msg_equals(message_assist_call_thoughts_content, - common_chat_parse( + test_chat_parse( "I'm\nthinking<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>Hello, world!\nWhat's up?", /* is_partial= */ false, { @@ -2819,43 +2841,43 @@ Hey there!<|im_end|> // Test streaming test_parser_with_streaming(message_assist_call_thoughts_content, "I'm\nthinking\nHello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 
1}<|tool_call_end|><|tool_calls_section_end|>", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK }); }); test_parser_with_streaming(message_assist_call_thoughts_unparsed, "I'm\nthinking\n\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE }); }); test_parser_with_streaming(message_assist_call_thoughts_content, "I'm\nthinking\n\n\nHello, world!\nWhat's up?\n\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>\n", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK }); }); test_parser_with_streaming(message_assist_call_withopt, "<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function_with_opt:0<|tool_call_argument_begin|>{\"arg1\": 1, \"arg2\": 2}<|tool_call_end|><|tool_calls_section_end|>", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, /* .reasoning_format = */ COMMON_REASONING_FORMAT_NONE }); }); test_parser_with_streaming(simple_assist_msg("Hello, 
world!\nWhat's up?", "I'm\nthinking", "special_function", "{\"arg1\": \"123456\"}"), "I'm\nthinkingHello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": \"123456\"}<|tool_call_end|><|tool_calls_section_end|>", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK }); }); test_parser_with_streaming(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking", "special_function", "{\"arg1\": [1, 2, \"345\", 6]}"), "I'm\nthinkingHello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": [1, 2, \"345\", 6]}<|tool_call_end|><|tool_calls_section_end|>", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK }); }); test_parser_with_streaming(simple_assist_msg("Hello, world!\nWhat's up?", "I'm\nthinking", "special_function", "{\"arg1\": {\"12\": 34, \"5\": [67, 8], \"9\": \"10\"}}"), "I'm\nthinkingHello, world!\nWhat's up?\n<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>{\"arg1\": {\"12\": 34, \"5\": [67, 8], \"9\": \"10\"}}<|tool_call_end|><|tool_calls_section_end|>", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { /* .format = */ COMMON_CHAT_FORMAT_KIMI_K2, /* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK }); }); @@ -2864,19 +2886,19 @@ Hey there!<|im_end|> 
"<|tool_calls_section_begin|><|tool_call_begin|>functions.complex_function:0<|tool_call_argument_begin|>" "{\"name\": \"John Doe\", \"age\": 30, \"active\": true, \"score\": 95.5}" "<|tool_call_end|><|tool_calls_section_end|>", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); }); test_parser_with_streaming( simple_assist_msg("", "", "web_search", "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}"), "<|tool_calls_section_begin|><|tool_call_begin|>functions.web_search:0<|tool_call_argument_begin|>" "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}" "<|tool_call_end|><|tool_calls_section_end|>", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); }); test_parser_with_streaming( simple_assist_msg("", "", "read_file", "{\"args\": [{\"path\": \"src/providers/ThemeProvider.tsx\"}, {\"path\": \"src/components/Header.tsx\"}, {\"path\": \"src/components/ThemeToggle.tsx\"}, {\"path\": \"src/app/globals.css\"}, {\"path\": \"src/app/layout.tsx\"}]}"), "<|tool_calls_section_begin|><|tool_call_begin|>functions.read_file:0<|tool_call_argument_begin|>" "{\"args\": [{\"path\": \"src/providers/ThemeProvider.tsx\"}, {\"path\": \"src/components/Header.tsx\"}, {\"path\": \"src/components/ThemeToggle.tsx\"}, {\"path\": \"src/app/globals.css\"}, {\"path\": \"src/app/layout.tsx\"}]}" "<|tool_call_end|><|tool_calls_section_end|>", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* 
is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); }); test_parser_with_streaming( simple_assist_msg( "Let me start by examining the relevant files to understand the current implementation.", "", @@ -2886,7 +2908,7 @@ Hey there!<|im_end|> "<|tool_calls_section_begin|><|tool_call_begin|>functions.read_file:0<|tool_call_argument_begin|>" "{\"files\":[{\"path\":\"src/app/Partners.tsx\",\"line_ranges\":[\"1-100\"]}]}" "<|tool_call_end|><|tool_calls_section_end|>", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_KIMI_K2}); }); auto multi_tool_msg = simple_assist_msg("Let me call multiple tools.", "I'm thinking."); multi_tool_msg.tool_calls.push_back({ "read_file", "{\"files\": [{\"path\": \"src/app/Partners.tsx\", \"line_ranges\": [\"1-100\"]}]}", "" }); multi_tool_msg.tool_calls.push_back({ "web_search", "{\"query\":\"\\\"From Zero\\\" Linkin Park album tracklist complete songs\",\"limit\":3,\"type\":\"text\"}", "" }); @@ -2908,7 +2930,7 @@ Hey there!<|im_end|> "{\"message\":\"Hello! 
👋 🌟 🚀 Testing emojis: 😀😃😄😁 and symbols: ∑∏∆∇\"}" "<|tool_call_end|>" "<|tool_calls_section_end|>", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { COMMON_CHAT_FORMAT_KIMI_K2, COMMON_REASONING_FORMAT_DEEPSEEK }); }); @@ -2917,7 +2939,7 @@ Hey there!<|im_end|> "I'm thinking<|tool_calls_section_begin|><|tool_call_begin|>functions.complex_function_in_think:0<|tool_call_argument_begin|>" "{\"name\": \"John Doe\", \"age\": 30, \"active\": true, \"score\": 95.5}" "<|tool_call_end|><|tool_calls_section_end|>", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { COMMON_CHAT_FORMAT_KIMI_K2, COMMON_REASONING_FORMAT_DEEPSEEK }); }); @@ -2926,7 +2948,7 @@ Hey there!<|im_end|> "I'm thinking<|tool_calls_section_begin|><|tool_call_begin|>functions.complex_function_in_think:0<|tool_call_argument_begin|>" "{\"name\": \"John Doe\", \"age\": 30, \"active\": true, \"score\": 95.5}" "<|tool_call_end|><|tool_calls_section_end|>I'm still thinkingHello", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, { + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, { COMMON_CHAT_FORMAT_KIMI_K2, COMMON_REASONING_FORMAT_DEEPSEEK }); }); @@ -3001,7 +3023,7 @@ Hey there!<|im_end|> // Basic XML tool call parsing assert_msg_equals( message_assist_call, - common_chat_parse( + test_chat_parse( "\n" " \n" " \n" @@ -3036,7 +3058,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Special characters and Unicode common_chat_msg expected_special_chars; @@ 
-3053,7 +3075,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Multiline content with newlines and indentation common_chat_msg expected_multiline; @@ -3072,7 +3094,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // JSON object as parameter value common_chat_msg expected_json_param; @@ -3090,7 +3112,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Array as parameter value common_chat_msg expected_array_param; @@ -3108,7 +3130,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Empty parameter common_chat_msg expected_empty_param; @@ -3125,7 +3147,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Boolean values (true/false) common_chat_msg expected_boolean; @@ -3146,7 +3168,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const 
std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Null value common_chat_msg expected_null; @@ -3164,7 +3186,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Negative numbers and scientific notation common_chat_msg expected_numbers; @@ -3188,7 +3210,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // XML-like content in parameters (should be escaped) common_chat_msg expected_xml_content; @@ -3206,7 +3228,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Quotes and escape characters common_chat_msg expected_quotes; @@ -3224,7 +3246,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Long parameter value (simplified) std::string long_text = "This is a long text parameter that should test the parser's ability to handle larger amounts of text data."; @@ -3244,7 +3266,7 @@ Hey there!<|im_end|> " 
\n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Mixed content with text before and after tool call common_chat_msg expected_mixed_content; @@ -3263,7 +3285,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Compact format (no extra whitespace) common_chat_msg expected_compact; @@ -3275,7 +3297,7 @@ Hey there!<|im_end|> test_parser_with_streaming( expected_compact, "value", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Function name with underscores and numbers common_chat_msg expected_complex_name; @@ -3293,7 +3315,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Parameter names with underscores and numbers common_chat_msg expected_complex_params; @@ -3317,7 +3339,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Very deeply nested XML content in parameter common_chat_msg expected_deep_xml; @@ 
-3335,7 +3357,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Parameter with only whitespace common_chat_msg expected_whitespace_param; @@ -3353,7 +3375,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Parameter with tabs and mixed whitespace common_chat_msg expected_mixed_whitespace; @@ -3373,7 +3395,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Control characters and special Unicode common_chat_msg expected_control_chars; @@ -3391,7 +3413,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Emoji and extended Unicode characters common_chat_msg expected_emoji; @@ -3409,7 +3431,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Mathematical expressions and formulas common_chat_msg expected_math; @@ -3427,7 +3449,7 @@ Hey 
there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // SQL injection-like content (should be safely escaped) common_chat_msg expected_sql; @@ -3445,7 +3467,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // HTML/XML injection content common_chat_msg expected_html; @@ -3463,7 +3485,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Binary-like content (base64) common_chat_msg expected_binary; @@ -3481,7 +3503,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); // Very large numbers (should be parsed as scientific notation) common_chat_msg expected_large_numbers; @@ -3499,7 +3521,7 @@ Hey there!<|im_end|> " \n" " \n" "", - [&](const std::string &msg) { return common_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); + [&](const std::string &msg) { return test_chat_parse(msg, /* is_partial= */ true, {COMMON_CHAT_FORMAT_QWEN3_CODER_XML}); }); } { diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index caad29bac..0926e552e 100644 --- a/tools/cli/cli.cpp +++ 
b/tools/cli/cli.cpp @@ -66,21 +66,25 @@ struct cli_context { defaults.stream = true; // make sure we always use streaming mode defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way // defaults.return_progress = true; // TODO: show progress - defaults.oaicompat_chat_syntax.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; } std::string generate_completion(result_timings & out_timings) { server_response_reader rd = ctx_server.get_response_reader(); - auto formatted = format_chat(); + auto chat_params = format_chat(); { // TODO: reduce some copies here in the future server_task task = server_task(SERVER_TASK_TYPE_COMPLETION); task.id = rd.get_new_id(); task.index = 0; - task.params = defaults; // copy - task.cli_prompt = formatted.prompt; // copy - task.cli_files = input_files; // copy + task.params = defaults; // copy + task.cli_prompt = chat_params.prompt; // copy + task.cli_files = input_files; // copy task.cli = true; + + // chat template settings + task.params.chat_parser_params = common_chat_parser_params(chat_params); + task.params.chat_parser_params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; + rd.post_task({std::move(task)}); } @@ -172,7 +176,6 @@ struct cli_context { inputs.use_jinja = chat_params.use_jinja; inputs.parallel_tool_calls = false; inputs.add_generation_prompt = true; - inputs.reasoning_format = chat_params.reasoning_format; inputs.enable_thinking = chat_params.enable_thinking; // Apply chat template to the list of messages diff --git a/tools/server/server-common.h b/tools/server/server-common.h index 7f4c07387..99e9c5e6f 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -274,6 +274,7 @@ std::vector tokenize_input_prompts( // OAI utils // +// global server parameters for chat formatting / parsing struct server_chat_params { bool use_jinja; bool prefill_assistant; diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 35ec7ad2a..2add9667d 100644 
--- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -68,10 +68,10 @@ json task_params::to_json(bool only_metrics) const { {"stream", stream}, {"n_probs", sampling.n_probs}, {"min_keep", sampling.min_keep}, - {"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)}, - {"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)}, - {"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content}, - {"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open}, + {"chat_format", common_chat_format_name(chat_parser_params.format)}, + {"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)}, + {"reasoning_in_content", chat_parser_params.reasoning_in_content}, + {"thinking_forced_open", chat_parser_params.thinking_forced_open}, {"samplers", samplers}, {"speculative.n_max", speculative.n_max}, {"speculative.n_min", speculative.n_min}, @@ -127,10 +127,10 @@ json task_params::to_json(bool only_metrics) const { {"grammar_lazy", sampling.grammar_lazy}, {"grammar_triggers", grammar_triggers}, {"preserved_tokens", sampling.preserved_tokens}, - {"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)}, - {"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)}, - {"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content}, - {"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open}, + {"chat_format", common_chat_format_name(chat_parser_params.format)}, + {"reasoning_format", common_reasoning_format_name(chat_parser_params.reasoning_format)}, + {"reasoning_in_content", chat_parser_params.reasoning_in_content}, + {"thinking_forced_open", chat_parser_params.thinking_forced_open}, {"samplers", samplers}, {"speculative.n_max", speculative.n_max}, {"speculative.n_min", speculative.n_min}, @@ -291,21 +291,21 @@ task_params server_task::params_from_json_cmpl( { auto it = data.find("chat_format"); if (it != 
data.end()) { - params.oaicompat_chat_syntax.format = static_cast(it->get()); - SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_syntax.format)); + params.chat_parser_params.format = static_cast(it->get()); + SRV_INF("Chat format: %s\n", common_chat_format_name(params.chat_parser_params.format)); } else { - params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format; + params.chat_parser_params.format = defaults.chat_parser_params.format; } common_reasoning_format reasoning_format = params_base.reasoning_format; if (data.contains("reasoning_format")) { reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get()); } - params.oaicompat_chat_syntax.reasoning_format = reasoning_format; - params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY); - params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false); - params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false); + params.chat_parser_params.reasoning_format = reasoning_format; + params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY); + params.chat_parser_params.thinking_forced_open = json_value(data, "thinking_forced_open", false); + params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false); if (data.contains("chat_parser")) { - params.oaicompat_chat_syntax.parser.load(data.at("chat_parser").get()); + params.chat_parser_params.parser.load(data.at("chat_parser").get()); } } @@ -722,7 +722,7 @@ common_chat_msg task_result_state::update_chat_msg( auto new_msg = common_chat_parse( generated_text, is_partial, - oaicompat_chat_syntax); + chat_parser_params); if (!new_msg.empty()) { new_msg.set_tool_call_ids(generated_tool_call_ids, gen_tool_call_id); chat_msg = new_msg; diff --git 
a/tools/server/server-task.h b/tools/server/server-task.h index daffe0c90..6835eef50 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -78,7 +78,9 @@ struct task_params { task_response_type res_type = TASK_RESPONSE_TYPE_NONE; std::string oaicompat_model; std::string oaicompat_cmpl_id; - common_chat_syntax oaicompat_chat_syntax; + + // per-request parameters for chat parsing + common_chat_parser_params chat_parser_params; // Embeddings int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm) @@ -91,7 +93,7 @@ struct task_params { struct task_result_state { // tracking diffs for partial tool calls std::vector diffs; - common_chat_syntax oaicompat_chat_syntax; + common_chat_parser_params chat_parser_params; common_chat_msg chat_msg; std::string generated_text; // append new chunks of generated text here std::vector generated_tool_call_ids; @@ -100,8 +102,8 @@ struct task_result_state { bool anthropic_thinking_block_started = false; bool anthropic_text_block_started = false; - task_result_state(const common_chat_syntax & oaicompat_chat_syntax) - : oaicompat_chat_syntax(oaicompat_chat_syntax) {} + task_result_state(const common_chat_parser_params & chat_parser_params) + : chat_parser_params(chat_parser_params) {} // parse partial tool calls and update the internal state common_chat_msg update_chat_msg( @@ -230,7 +232,7 @@ struct server_task { // the task will be moved into queue, then onto slots // however, the state must be kept by caller (e.g., HTTP thread) task_result_state create_state() const { - return task_result_state(params.oaicompat_chat_syntax); + return task_result_state(params.chat_parser_params); } bool is_parent() const { From 1c7cf94b22a9dc6b1d32422f72a627787a4783a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Tue, 20 Jan 2026 18:28:43 +0100 Subject: [PATCH 14/17] common, server : use the same User-Agent by default (#18957) MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit also ensures that if a custom User-Agent is used, it will be the only one sent. Signed-off-by: Adrien Gallouët --- common/common.h | 2 ++ common/download.cpp | 33 +++++++++++++++++++-------------- tools/server/server-common.cpp | 1 - tools/server/server-common.h | 2 -- 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/common/common.h b/common/common.h index 8247949de..96c990c05 100644 --- a/common/common.h +++ b/common/common.h @@ -57,6 +57,8 @@ extern const char * LLAMA_COMMIT; extern const char * LLAMA_COMPILER; extern const char * LLAMA_BUILD_TARGET; +const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT); + struct common_control_vector_load_info; // diff --git a/common/download.cpp b/common/download.cpp index a37780421..57f29a23b 100644 --- a/common/download.cpp +++ b/common/download.cpp @@ -314,23 +314,26 @@ static bool common_pull_file(httplib::Client & cli, // download one single file from remote URL to local path // returns status code or -1 on error -static int common_download_file_single_online(const std::string & url, - const std::string & path, - const std::string & bearer_token, - const common_header_list & custom_headers) { +static int common_download_file_single_online(const std::string & url, + const std::string & path, + const std::string & bearer_token, + const common_header_list & custom_headers) { static const int max_attempts = 3; static const int retry_delay_seconds = 2; auto [cli, parts] = common_http_client(url); - httplib::Headers default_headers = {{"User-Agent", "llama-cpp"}}; - if (!bearer_token.empty()) { - default_headers.insert({"Authorization", "Bearer " + bearer_token}); - } + httplib::Headers headers; for (const auto & h : custom_headers) { - default_headers.emplace(h.first, h.second); + headers.emplace(h.first, h.second); } - cli.set_default_headers(default_headers); + if (headers.find("User-Agent") == 
headers.end()) { + headers.emplace("User-Agent", "llama-cpp/" + build_info); + } + if (!bearer_token.empty()) { + headers.emplace("Authorization", "Bearer " + bearer_token); + } + cli.set_default_headers(headers); const bool file_exists = std::filesystem::exists(path); @@ -437,10 +440,12 @@ std::pair> common_remote_get_content(const std::string const common_remote_params & params) { auto [cli, parts] = common_http_client(url); - httplib::Headers headers = {{"User-Agent", "llama-cpp"}}; - - for (const auto & header : params.headers) { - headers.emplace(header.first, header.second); + httplib::Headers headers; + for (const auto & h : params.headers) { + headers.emplace(h.first, h.second); + } + if (headers.find("User-Agent") == headers.end()) { + headers.emplace("User-Agent", "llama-cpp/" + build_info); } if (params.timeout > 0) { diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 1bbe85322..4aeeda2ff 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -779,7 +779,6 @@ static void handle_media( // download remote image // TODO @ngxson : maybe make these params configurable common_remote_params params; - params.headers.push_back({"User-Agent", "llama.cpp/" + build_info}); params.max_size = 1024 * 1024 * 10; // 10MB params.timeout = 10; // seconds SRV_INF("downloading image from '%s'\n", url.c_str()); diff --git a/tools/server/server-common.h b/tools/server/server-common.h index 99e9c5e6f..a88d40494 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -13,8 +13,6 @@ #include #include -const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT); - using json = nlohmann::ordered_json; #define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, ((slot).task ? 
(slot).task->id : -1), __VA_ARGS__) From 5bd341c9a135a13f901c4cacacc27fa5b299ce19 Mon Sep 17 00:00:00 2001 From: Oliver Simons Date: Wed, 21 Jan 2026 02:34:29 +0100 Subject: [PATCH 15/17] CUDA: Fix builds for older CCCL versions by ifdefing strided_iterator (#18964) * CUDA: Fix builds for older CCCL versions by ifdefing strided_iterator Strided iterator was added in [CCCL 3.1](https://github.com/NVIDIA/cccl/releases/tag/v3.1.0), which is packaged into [CTK 13.1](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#id5) * Unindent as per code review request --- ggml/src/ggml-cuda/argsort.cu | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/argsort.cu b/ggml/src/ggml-cuda/argsort.cu index cf7a44f7a..4896669c3 100644 --- a/ggml/src/ggml-cuda/argsort.cu +++ b/ggml/src/ggml-cuda/argsort.cu @@ -2,6 +2,9 @@ #ifdef GGML_CUDA_USE_CUB # include +# if (CCCL_MAJOR_VERSION >= 3 && CCCL_MINOR_VERSION >= 1) +# define STRIDED_ITERATOR_AVAILABLE +# endif using namespace cub; #endif // GGML_CUDA_USE_CUB @@ -14,6 +17,14 @@ static __global__ void init_indices(int * indices, const int ncols, const int nr } } +#ifndef STRIDED_ITERATOR_AVAILABLE +static __global__ void init_offsets(int * offsets, const int ncols, const int nrows) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx <= nrows) { + offsets[idx] = idx * ncols; + } +} +#endif // STRIDED_ITERATOR_AVAILABLE #ifdef GGML_CUDA_USE_CUB void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool, @@ -33,8 +44,14 @@ void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool, const dim3 grid_size((ncols + block_size - 1) / block_size, nrows); init_indices<<>>(temp_indices, ncols, nrows); +#ifdef STRIDED_ITERATOR_AVAILABLE auto offset_iterator = cuda::make_strided_iterator(cuda::make_counting_iterator(0), ncols); - +#else + ggml_cuda_pool_alloc offsets_alloc(pool, nrows + 1); + int * offset_iterator = offsets_alloc.get(); + const dim3 offset_grid((nrows + block_size 
- 1) / block_size); + init_offsets<<>>(offset_iterator, ncols, nrows); +#endif CUDA_CHECK(cudaMemcpyAsync(temp_keys, x, ncols * nrows * sizeof(float), cudaMemcpyDeviceToDevice, stream)); size_t temp_storage_bytes = 0; From 37c35f0e1c625831687b146cbb0a57654ef88ca2 Mon Sep 17 00:00:00 2001 From: Matthieu Coudron <886074+teto@users.noreply.github.com> Date: Wed, 21 Jan 2026 07:52:46 +0100 Subject: [PATCH 16/17] gguf: display strerrno when cant load a model (#18884) I've had issues loading models with llama-server: [44039] E gguf_init_from_file: failed to open GGUF file 'mistral-7b-v0.1.Q8_0.gguf' and I was sure it could access the file. Seems like --models-dir and --models-presets dont interact like I thought they would but I salvaged this snippet that helps troubleshooting [44039] E gguf_init_from_file: failed to open GGUF file 'mistral-7b-v0.1.Q8_0.gguf' (errno No such file or directory) --- ggml/src/gguf.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index b165d8bdc..bfab5c4d6 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -734,7 +734,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p FILE * file = ggml_fopen(fname, "rb"); if (!file) { - GGML_LOG_ERROR("%s: failed to open GGUF file '%s'\n", __func__, fname); + GGML_LOG_ERROR("%s: failed to open GGUF file '%s' (%s)\n", __func__, fname, strerror(errno)); return nullptr; } From 12a4a47e6aaf691492644c39da453745aaee1672 Mon Sep 17 00:00:00 2001 From: "Piotr Wilkin (ilintar)" Date: Wed, 21 Jan 2026 12:35:20 +0100 Subject: [PATCH 17/17] Fix GLM 4.7 Lite MoE gating func (#18980) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix GLM 4.7 MoE gating func * Update src/models/deepseek2.cpp Co-authored-by: Sigbjørn Skjæret * Update src/llama-model.cpp Co-authored-by: Xuan-Son Nguyen --------- Co-authored-by: Sigbjørn Skjæret Co-authored-by: Xuan-Son Nguyen --- 
src/llama-model.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 94c47dc24..255289b7c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1713,7 +1713,12 @@ void llama_model::load_hparams(llama_model_loader & ml) { if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) { // for compatibility with existing DeepSeek V2 and V2.5 GGUFs // that have no expert_gating_func model parameter set - hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX; + if ((hparams.n_layer == 47 || hparams.n_layer == 48) && n_vocab == 154880) { + // GLM 4.7 Lite + hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID; + } else { + hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX; + } } if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {