Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/tools.sh
#	build-xcframework.sh
#	ci/run.sh
#	examples/Miku.sh
#	examples/chat-13B.sh
#	examples/chat-persistent.sh
#	examples/chat-vicuna.sh
#	examples/chat.sh
#	examples/jeopardy/jeopardy.sh
#	examples/reason-act.sh
#	examples/server-llama2-13B.sh
#	examples/sycl/build.sh
#	examples/sycl/run-llama2.sh
#	examples/sycl/run-llama3.sh
#	examples/ts-type-to-grammar.sh
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-sycl/element_wise.cpp
#	ggml/src/ggml-sycl/element_wise.hpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	scripts/apple/validate-apps.sh
#	scripts/apple/validate-ios.sh
#	scripts/apple/validate-macos.sh
#	scripts/apple/validate-tvos.sh
#	scripts/apple/validate-visionos.sh
#	scripts/check-requirements.sh
#	scripts/ci-run.sh
#	scripts/compare-commits.sh
#	scripts/debug-test.sh
#	scripts/gen-authors.sh
#	scripts/get-hellaswag.sh
#	scripts/get-pg.sh
#	scripts/get-wikitext-103.sh
#	scripts/get-wikitext-2.sh
#	scripts/get-winogrande.sh
#	scripts/hf.sh
#	scripts/qnt-all.sh
#	scripts/run-all-perf.sh
#	scripts/run-all-ppl.sh
#	scripts/sync-ggml-am.sh
#	scripts/sync-ggml.sh
#	scripts/tool_bench.sh
#	tests/test-backend-ops.cpp
#	tests/test-lora-conversion-inference.sh
#	tests/test-tokenizer-0.sh
#	tools/server/README.md
commit cdda9d16e0
Author: Concedo
Date:   2025-06-30 20:38:44 +08:00

42 changed files with 1519 additions and 118 deletions

@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 API_URL="${API_URL:-http://127.0.0.1:8080}"


@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 API_URL="${API_URL:-http://127.0.0.1:8080}"

Binary file not shown.


@@ -2110,6 +2110,7 @@ struct server_context {
                 /* use_jinja              */ params_base.use_jinja,
                 /* prefill_assistant      */ params_base.prefill_assistant,
                 /* reasoning_format       */ params_base.reasoning_format,
+                /* chat_template_kwargs   */ params_base.default_template_kwargs,
                 /* common_chat_templates  */ chat_templates.get(),
                 /* allow_image            */ mctx ? mtmd_support_vision(mctx) : false,
                 /* allow_audio            */ mctx ? mtmd_support_audio (mctx) : false,
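
The new chat_template_kwargs slot forwards server-side defaults (params_base.default_template_kwargs) into the OAI-compat parser options. A minimal launch sketch, assuming the defaults are exposed through a --chat-template-kwargs JSON flag; the flag name is an assumption, since this diff only shows the field plumbing:

    # hypothetical flag name; the diff only shows default_template_kwargs
    # being passed into the parser options
    ./llama-server -m model.gguf \
        --jinja \
        --chat-template-kwargs '{"enable_thinking": false}'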


@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 # make sure we are in the right directory
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
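
The recurring shebang swap in these scripts replaces a hard-coded interpreter path with a PATH lookup, so they keep working on systems where bash does not live at /bin/bash (NixOS, or a newer Homebrew bash on macOS). The two forms can be compared directly:

    # the old form requires this exact path to exist:
    ls -l /bin/bash
    # the new form runs whichever bash is first on PATH:
    command -v bash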


@@ -579,6 +579,7 @@ struct oaicompat_parser_options {
     bool use_jinja;
     bool prefill_assistant;
     common_reasoning_format reasoning_format;
+    std::map<std::string,std::string> chat_template_kwargs;
     common_chat_templates * tmpls;
     bool allow_image;
     bool allow_audio;
@@ -756,6 +757,13 @@ static json oaicompat_chat_params_parse(
         llama_params["parse_tool_calls"] = true;
     }
 
+    // merge the template args provided from command line with the args provided in the user request
+    auto chat_template_kwargs_object = json_value(body, "chat_template_kwargs", json::object());
+    inputs.chat_template_kwargs = opt.chat_template_kwargs;
+    for (const auto & item : chat_template_kwargs_object.items()) {
+        inputs.chat_template_kwargs[item.key()] = item.value().dump();
+    }
+
     // if the assistant message appears at the end of list, we do not add end-of-turn token
     // for ex. this can be useful to modify the reasoning process in reasoning models
     bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
@@ -771,6 +779,11 @@ static json oaicompat_chat_params_parse(
         /* TODO: test this properly */
         inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
 
+        if ( (!inputs.enable_thinking) || inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
+            throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
+        }
+
         inputs.add_generation_prompt = true;
     }
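
Per-request template arguments now merge over the command-line defaults, with the request winning on key collisions since it is applied second; per the new runtime_error above, setting enable_thinking is also incompatible with assistant-message prefill. A hedged request sketch against the default endpoint shown earlier (the effect of enable_thinking depends on the model's Jinja template):

    # "chat_template_kwargs" in the request body comes straight from this diff;
    # whether the template honors enable_thinking is model-dependent
    curl http://127.0.0.1:8080/v1/chat/completions \
        -H 'Content-Type: application/json' \
        -d '{
              "messages": [{"role": "user", "content": "hello"}],
              "chat_template_kwargs": {"enable_thinking": false}
            }'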


@@ -231,7 +231,7 @@ function ConversationItem({
         >
           {conv.name}
         </button>
-        <div className="dropdown dropdown-end h-5">
+        <div tabIndex={0} className="dropdown dropdown-end h-5">
           <BtnWithTooltips
             // on mobile, we always show the ellipsis icon
             // on desktop, we only show it when the user hovers over the conversation item
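
Adding tabIndex={0} makes the wrapper <div> focusable, which is presumably the motivation here: daisyUI dropdowns open on focus, and a plain <div> never receives focus without an explicit tabindex, so the conversation menu could not open reliably before this change.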