Merge branch 'upstream' into concedo_experimental

# Conflicts: # .devops/intel.Dockerfile # .github/workflows/build-android.yml # .github/workflows/build.yml # .github/workflows/release.yml # .gitignore # docs/backend/SYCL.md # docs/backend/snapdragon/README.md # examples/model-conversion/scripts/causal/convert-model.sh # ggml/CMakeLists.txt # ggml/src/CMakeLists.txt # ggml/src/ggml-hexagon/ggml-hexagon.cpp # ggml/src/ggml-hexagon/htp/CMakeLists.txt # ggml/src/ggml-hexagon/htp/hex-utils.h # ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c # ggml/src/ggml-hexagon/htp/htp-ctx.h # ggml/src/ggml-hexagon/htp/htp-ops.h # ggml/src/ggml-hexagon/htp/htp_iface.idl # ggml/src/ggml-hexagon/htp/hvx-base.h # ggml/src/ggml-hexagon/htp/main.c # ggml/src/ggml-hexagon/htp/matmul-ops.c # ggml/src/ggml-hexagon/libggml-htp.inf # ggml/src/ggml-sycl/ggml-sycl.cpp # ggml/src/ggml-sycl/mmvq.cpp # ggml/src/ggml-sycl/mmvq.hpp # ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp # ggml/src/ggml-webgpu/ggml-webgpu.cpp # ggml/src/ggml-webgpu/wgsl-shaders/flash_attn.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_blk.wgsl # ggml/src/ggml-webgpu/wgsl-shaders/flash_attn_vec_split.wgsl # scripts/server-test-structured.py # scripts/snapdragon/adb/run-bench.sh # scripts/snapdragon/adb/run-cli.sh # scripts/snapdragon/adb/run-completion.sh # scripts/snapdragon/adb/run-mtmd.sh # scripts/snapdragon/adb/run-tool.sh # scripts/snapdragon/qdc/requirements.txt # scripts/snapdragon/windows/run-bench.ps1 # scripts/snapdragon/windows/run-cli.ps1 # scripts/snapdragon/windows/run-completion.ps1 # scripts/snapdragon/windows/run-mtmd.ps1 # scripts/snapdragon/windows/run-tool.ps1 # tests/test-backend-ops.cpp # tools/cli/cli.cpp # ty.toml
2026-04-26 10:41:25 +00:00 · 2026-04-25 12:13:14 +08:00 · 2026-04-25 12:13:14 +08:00 · 340b22283e
commit 340b22283e
parent 4090400dff 0adede866d
20 changed files with 260 additions and 98 deletions
--- a/common/chat.cpp
+++ b/common/chat.cpp
@ -558,6 +558,26 @@ bool common_chat_templates_was_explicit(const struct common_chat_templates * tmp
    return tmpls->has_explicit_template;
 }

+// LFM2 format detection: the template wraps the tool list in <|tool_list_start|>[...]<|tool_list_end|>
+// (and each tool call in <|tool_call_start|>[...]<|tool_call_end|>); only the tool-list markers are tested
+static bool is_lfm2_template(const std::string & src) {
+    const bool has_list_open = src.find("<|tool_list_start|>") != std::string::npos;
+    return has_list_open && src.find("<|tool_list_end|>") != std::string::npos;
+}
+
+// Returns the default system/user prompt texts for an ASR (audio transcription) request.
+// LFM2 templates get a system instruction and an empty user text; all others get only a user instruction.
+common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates) {
+    const bool is_lfm2 = chat_templates
+        && chat_templates->template_default
+        && is_lfm2_template(chat_templates->template_default->source());
+
+    common_chat_prompt_preset preset;
+    preset.system = is_lfm2 ? "Perform ASR." : "";
+    preset.user   = is_lfm2 ? "" : "Transcribe audio to text";
+    return preset;
+}
+
 std::string common_chat_templates_source(const struct common_chat_templates * tmpls, const std::string & variant) {
    if (!variant.empty()) {
        if (variant == "tool_use") {
@ -2067,10 +2087,7 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
        return common_chat_params_init_kimi_k2(tmpl, params);
    }

-    // LFM2 format detection: template uses <|tool_list_start|>[...]<|tool_list_end|> around the tool list
-    // and <|tool_call_start|>[...]<|tool_call_end|> around each tool call
-    if (src.find("<|tool_list_start|>") != std::string::npos &&
-        src.find("<|tool_list_end|>") != std::string::npos) {
+    if (is_lfm2_template(src)) {
        LOG_DBG("Using specialized template: LFM2\n");
        return common_chat_params_init_lfm2(tmpl, params);
    }
@ -2379,4 +2396,3 @@ std::map<std::string, bool> common_chat_templates_get_caps(const common_chat_tem
    GGML_ASSERT(chat_templates->template_default != nullptr);
    return chat_templates->template_default->caps.to_map();
 }
-
--- a/common/chat.h
+++ b/common/chat.h
@ -274,3 +274,11 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
        const common_chat_template &          tmpl,
        const std::string &                   src,
        autoparser::generation_params & params);
+
+// specialized per-task preset: default prompt texts for one task (see common_chat_get_asr_prompt)
+struct common_chat_prompt_preset {
+    std::string system; // system message text; an empty string means no system message is added
+    std::string user;   // user message text, used when the request carries no prompt of its own
+};
+
+common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates);
--- a/common/common.h
+++ b/common/common.h
@ -747,6 +747,11 @@ inline bool string_starts_with(std::string_view str, std::string_view prefix) {
           str.compare(0, prefix.size(), prefix) == 0;
 }

+// single-character prefix test; remove when moving to c++20 (std::string_view::starts_with)
+inline bool string_starts_with(std::string_view str, char prefix) {
+    return str.size() > 0 && str[0] == prefix;
+}
+
 // remove when moving to c++20
 inline bool string_ends_with(std::string_view str, std::string_view suffix) {
    return str.size() >= suffix.size() &&
--- a/common/jinja/caps.cpp
+++ b/common/jinja/caps.cpp
@ -1,4 +1,3 @@
-#include "log.h"
 #include "value.h"
 #include "runtime.h"
 #include "caps.h"
--- a/common/jinja/runtime.h
+++ b/common/jinja/runtime.h
@ -106,10 +106,16 @@ struct statement {
    size_t pos; // position in source, for debugging
    virtual ~statement() = default;
    virtual std::string type() const { return "Statement"; }
+
    // execute_impl must be overridden by derived classes
-    virtual value execute_impl(context &) { throw std::runtime_error("cannot exec " + type()); }
+    virtual value execute_impl(context &) { throw_exec_error(); }
    // execute is the public method to execute a statement with error handling
    value execute(context &);
+
+private:
+    [[noreturn]] void throw_exec_error() const { // raised by the base execute_impl when it is not overridden
+        throw std::runtime_error(std::string("cannot exec ").append(type()));
+    }
 };

 // Type Checking Utilities
@ -143,7 +149,7 @@ struct program : public statement {
    program() = default;
    explicit program(statements && body) : body(std::move(body)) {}
    std::string type() const override { return "Program"; }
-    value execute_impl(context &) override {
+    [[noreturn]] value execute_impl(context &) override {
        throw std::runtime_error("Cannot execute program directly, use jinja::runtime instead");
    }
 };
@ -195,7 +201,7 @@ struct break_statement : public statement {
        }
    };

-    value execute_impl(context &) override {
+    [[noreturn]] value execute_impl(context &) override {
        throw break_statement::signal();
    }
 };
@ -209,7 +215,7 @@ struct continue_statement : public statement {
        }
    };

-    value execute_impl(context &) override {
+    [[noreturn]] value execute_impl(context &) override {
        throw continue_statement::signal();
    }
 };
@ -509,7 +515,7 @@ struct slice_expression : public expression {
        chk_type<expression>(this->step_expr);
    }
    std::string type() const override { return "SliceExpression"; }
-    value execute_impl(context &) override {
+    [[noreturn]] value execute_impl(context &) override {
        throw std::runtime_error("must be handled by MemberExpression");
    }
 };
--- a/common/jinja/value.cpp
+++ b/common/jinja/value.cpp
@ -590,6 +590,10 @@ static bool string_endswith(const std::string & str, const std::string & suffix)
    return str.compare(str.length() - suffix.length(), suffix.length(), suffix) == 0;
 }

+[[noreturn]] static value string_join_not_implemented(const func_args &) { // placeholder for the string "join" builtin
+    throw not_implemented_exception("String join builtin not implemented"); // always throws; arguments are ignored
+}
+
 const func_builtins & value_string_t::get_builtins() const {
    static const func_builtins builtins = {
        {"default", default_value},
@ -851,9 +855,7 @@ const func_builtins & value_string_t::get_builtins() const {
            res->val_str.mark_input_based_on(val_input->as_string());
            return res;
        }},
-        {"join", [](const func_args &) -> value {
-            throw not_implemented_exception("String join builtin not implemented");
-        }},
+        {"join", string_join_not_implemented},
    };
    return builtins;
 }
@ -884,6 +886,9 @@ const func_builtins & value_bool_t::get_builtins() const {
    return builtins;
 }

+[[noreturn]] static value array_unique_not_implemented(const func_args &) { // placeholder for the array "unique" builtin
+    throw not_implemented_exception("Array unique builtin not implemented"); // always throws; arguments are ignored
+}

 const func_builtins & value_array_t::get_builtins() const {
    static const func_builtins builtins = {
@ -1084,13 +1089,14 @@ const func_builtins & value_array_t::get_builtins() const {
            std::reverse(arr.begin(), arr.end());
            return is_val<value_tuple>(val) ? mk_val<value_tuple>(std::move(arr)) : mk_val<value_array>(std::move(arr));
        }},
-        {"unique", [](const func_args &) -> value {
-            throw not_implemented_exception("Array unique builtin not implemented");
-        }},
+        {"unique", array_unique_not_implemented},
    };
    return builtins;
 }

+[[noreturn]] static value object_join_not_implemented(const func_args &) { // placeholder for the object "join" builtin
+    throw not_implemented_exception("object join not implemented"); // always throws; arguments are ignored
+}

 const func_builtins & value_object_t::get_builtins() const {
    if (!has_builtins) {
@ -1183,9 +1189,7 @@ const func_builtins & value_object_t::get_builtins() const {
            });
            return result;
        }},
-        {"join", [](const func_args &) -> value {
-            throw not_implemented_exception("object join not implemented");
-        }},
+        {"join", object_join_not_implemented},
    };
    return builtins;
 }
--- a/common/jinja/value.h
+++ b/common/jinja/value.h
@ -129,27 +129,25 @@ struct value_t {
    // Note: only for debugging and error reporting purposes
    virtual std::string type() const { return ""; }

-    virtual int64_t as_int() const { throw std::runtime_error(type() + " is not an int value"); }
-    virtual double as_float() const { throw std::runtime_error(type() + " is not a float value"); }
-    virtual string as_string() const { throw std::runtime_error(type() + " is not a string value"); }
-    virtual bool as_bool() const { throw std::runtime_error(type() + " is not a bool value"); }
-    virtual const std::vector<value> & as_array() const { throw std::runtime_error(type() + " is not an array value"); }
-    virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw std::runtime_error(type() + " is not an object value"); }
-    virtual value invoke(const func_args &) const { throw std::runtime_error(type() + " is not a function value"); }
+    virtual int64_t as_int() const { throw_type_error("is not an int value"); }
+    virtual double as_float() const { throw_type_error("is not a float value"); }
+    virtual string as_string() const { throw_type_error("is not a string value"); }
+    virtual bool as_bool() const { throw_type_error("is not a bool value"); }
+    virtual const std::vector<value> & as_array() const { throw_type_error("is not an array value"); }
+    virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw_type_error("is not an object value"); }
+    virtual value invoke(const func_args &) const { throw_type_error("is not a function value"); }
    virtual bool is_none() const { return false; }
    virtual bool is_undefined() const { return false; }
-    virtual const func_builtins & get_builtins() const {
-        throw std::runtime_error("No builtins available for type " + type());
-    }
+    virtual const func_builtins & get_builtins() const { throw_type_error("has no builtins"); }

-    virtual bool has_key(const value &) { throw std::runtime_error(type() + " is not an object value"); }
-    virtual void insert(const value & /* key */, const value & /* val */) { throw std::runtime_error(type() + " is not an object value"); }
-    virtual value & at(const value & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
-    virtual value & at(const value & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
-    virtual value & at(const std::string & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
-    virtual value & at(const std::string & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
-    virtual value & at(int64_t /* idx */, value & /* default_val */) { throw std::runtime_error(type() + " is not an array value"); }
-    virtual value & at(int64_t /* idx */) { throw std::runtime_error(type() + " is not an array value"); }
+    virtual bool has_key(const value &) { throw_type_error("is not an object value"); }
+    virtual void insert(const value & /* key */, const value & /* val */) { throw_type_error("is not an object value"); }
+    virtual value & at(const value & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
+    virtual value & at(const value & /* key */) { throw_type_error("is not an object value"); }
+    virtual value & at(const std::string & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
+    virtual value & at(const std::string & /* key */) { throw_type_error("is not an object value"); }
+    virtual value & at(int64_t /* idx */, value & /* default_val */) { throw_type_error("is not an array value"); }
+    virtual value & at(int64_t /* idx */) { throw_type_error("is not an array value"); }

    virtual bool is_numeric() const { return false; }
    virtual bool is_hashable() const { return false; }
@ -163,6 +161,11 @@ struct value_t {
    // Note: only for debugging purposes
    virtual std::string as_repr() const { return as_string().str(); }

+private:
+    [[noreturn]] void throw_type_error(const char * complaint) const { // message: "<type()> <complaint>"
+        throw std::runtime_error(type().append(" ").append(complaint));
+    }
+
 protected:
    virtual bool equivalent(const value_t &) const = 0;
    virtual bool nonequal(const value_t & other) const { return !equivalent(other); }
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@ -746,7 +746,12 @@ class ModelBase:

        if (not quant_algo or not quant_layers) and quant_config_file.is_file():
            with open(quant_config_file, "r", encoding="utf-8") as f:
-                quant_config = json.load(f).get("quantization") or {}
+                hf_quant_config = json.load(f)
+                quant_config = hf_quant_config.get("quantization") or {}
+                producer = hf_quant_config.get("producer") or {}
+                producer_name = (producer.get("name") or "").lower()
+                if quant_method is None:
+                    self.hparams.setdefault("quantization_config", {})["quant_method"] = producer_name
                quant_algo = quant_config.get("quant_algo", quant_algo)
                quant_layers = quant_config.get("quantized_layers", quant_layers) or {}

--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@ -3608,6 +3608,30 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph *                cgraph,
        return true;
    }

+    if (ops.size() == 2 && ops.begin()[0] == GGML_OP_UNARY && ops.begin()[1] == GGML_OP_SQR
+     && unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_RELU) {
+        const ggml_tensor * unary = cgraph->nodes[node_idx];
+        const ggml_tensor * sqr   = cgraph->nodes[node_idx+1];
+
+        if (ggml_get_unary_op(unary) != GGML_UNARY_OP_RELU) {
+            return false;
+        }
+
+        if (unary->type != GGML_TYPE_F32 && unary->type != GGML_TYPE_F16) {
+            return false;
+        }
+
+        if (unary->type != sqr->type) {
+            return false;
+        }
+
+        if (!ggml_is_contiguous(unary->src[0])) {
+            return false;
+        }
+
+        return true;
+    }
+
    if (ops.size() == 3 && ops.begin()[0] == GGML_OP_SCALE && ops.begin()[1] == GGML_OP_UNARY && ops.begin()[2] == GGML_OP_SCALE
     && unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_TANH) {
        const ggml_tensor *scale  = cgraph->nodes[node_idx];
@ -4116,6 +4140,12 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
                        continue;
                    }

+                    if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_SQR }, { GGML_UNARY_OP_RELU })) {
+                        ggml_cuda_op_relu_sqr(*cuda_ctx, node, cgraph->nodes[i+1]);
+                        i++;
+                        continue;
+                    }
+
                    if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
                        i += 2;
                        ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node);
--- a/ggml/src/ggml-cuda/unary.cu
+++ b/ggml/src/ggml-cuda/unary.cu
@ -65,6 +65,11 @@ static __device__ __forceinline__ float op_sqr(float x) {
    return x * x;
 }

+// relu followed by sqr as a single element-wise op: max(x, 0)^2
+static __device__ __forceinline__ float op_relu_sqr(float x) {
+    return op_sqr(fmaxf(x, 0.0f));
+}
+
 static __device__ __forceinline__ float op_sqrt(float x) {
    return sqrtf(x);
 }
@ -615,3 +620,21 @@ void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary
            GGML_ABORT("Unsupported unary op for fused unary+mul");
    }
 }
+
+/* fused relu + sqr: reads relu_node->src[0] and writes sqr(relu(x)) straight into sqr_node->data */
+
+void ggml_cuda_op_relu_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * relu_node, ggml_tensor * sqr_node) {
+    const ggml_tensor * src0 = relu_node->src[0];
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == sqr_node->type);
+
+    const int n_elem = ggml_nelements(src0);
+    if (src0->type != GGML_TYPE_F16) {
+        unary_cuda<op_relu_sqr>((const float *)src0->data, (float *)sqr_node->data, n_elem, stream);
+    } else {
+        unary_cuda<op_relu_sqr>((const half *)src0->data, (half *)sqr_node->data, n_elem, stream);
+    }
+}
--- a/ggml/src/ggml-cuda/unary.cuh
+++ b/ggml/src/ggml-cuda/unary.cuh
@ -91,6 +91,8 @@ void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

 void ggml_cuda_op_unary_mul(ggml_backend_cuda_context & ctx, ggml_tensor * unary_node, ggml_tensor * mul_node);

+void ggml_cuda_op_relu_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * relu_node, ggml_tensor * sqr_node);
+
 __device__ __forceinline__ float ggml_cuda_op_silu_single(float x) {
    return x / (1.0f + expf(-x));
 }
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@ -820,7 +820,7 @@ ggml_metal_device_t ggml_metal_device_init(int device) {
            }

            // print MTL GPU family:
-            GGML_LOG_INFO("%s: GPU name:   %s\n", __func__, dev->props.name);
+            GGML_LOG_INFO("%s: GPU name:   %s (%s)\n", __func__, dev->props.name, dev->props.desc);

            // determine max supported GPU family
            // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
@ -937,13 +937,13 @@ void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
 }

 struct ggml_metal_event {
-    void * obj; // id<MTLEvent>
+    void * obj; // id<MTLSharedEvent>

    atomic_int value;
 };

 void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
-    id<MTLEvent> event = (id<MTLEvent>)ev->obj;
+    id<MTLSharedEvent> event = (id<MTLSharedEvent>)ev->obj;

    id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;

@ -951,7 +951,7 @@ void ggml_metal_event_encode_signal(ggml_metal_event_t ev, ggml_metal_cmd_buf_t
 }

 void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cmd_buf_raw) {
-    id<MTLEvent> event = (id<MTLEvent>)ev->obj;
+    id<MTLSharedEvent> event = (id<MTLSharedEvent>)ev->obj;

    id<MTLCommandBuffer> cmd_buf = (id<MTLCommandBuffer>) cmd_buf_raw;

@ -959,7 +959,7 @@ void ggml_metal_event_encode_wait(ggml_metal_event_t ev, ggml_metal_cmd_buf_t cm
 }

 ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
-    id<MTLEvent> event = [dev->mtl_device newEvent];
+    id<MTLSharedEvent> event = [dev->mtl_device newSharedEvent];

    ggml_metal_event_t ev = calloc(1, sizeof(struct ggml_metal_event));

@ -970,7 +970,7 @@ ggml_metal_event_t ggml_metal_device_event_init(ggml_metal_device_t dev) {
 }

 void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev) {
-    id<MTLEvent> event = ev->obj;
+    id<MTLSharedEvent> event = ev->obj;
    [event release];

    free(ev);
@ -979,14 +979,13 @@ void ggml_metal_device_event_free(ggml_metal_device_t dev, ggml_metal_event_t ev
 }

 void ggml_metal_device_event_synchronize(ggml_metal_device_t dev, ggml_metal_event_t ev) {
-    @autoreleasepool {
-        id<MTLEvent> event = ev->obj;
-
-        id<MTLCommandBuffer> cmd_buf = [dev->mtl_queue commandBuffer];
-        [cmd_buf encodeWaitForEvent:event value:atomic_load_explicit(&ev->value, memory_order_relaxed)];
-        [cmd_buf commit];
-        [cmd_buf waitUntilCompleted];
+    id<MTLSharedEvent> event = ev->obj;
+    const bool res = [event waitUntilSignaledValue:atomic_load_explicit(&ev->value, memory_order_relaxed) timeoutMS:60000];
+    if (!res) {
+        GGML_ABORT("%s: failed to wait for event\n", __func__);
    }
+
+    GGML_UNUSED(dev);
 }

 void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@ -7672,7 +7672,7 @@ size_t ggml_quantize_chunk(
               int64_t   nrows,
               int64_t   n_per_row,
           const float * imatrix) {
-    const int64_t n = (int64_t) nrows * n_per_row;
+    const int64_t n = nrows * n_per_row;

    if (ggml_quantize_requires_imatrix(type)) {
        GGML_ASSERT(imatrix != NULL);
@ -7689,21 +7689,21 @@ size_t ggml_quantize_chunk(
    size_t result = 0;

    switch (type) {
-        case GGML_TYPE_Q1_0:    result = quantize_q1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_0:    result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_1:    result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q5_0:    result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q5_1:    result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q8_0:    result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_MXFP4:   result = quantize_mxfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_NVFP4:   result = quantize_nvfp4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q2_K:    result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q3_K:    result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q4_K:    result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q5_K:    result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_Q6_K:    result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_TQ1_0:   result = quantize_tq1_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case GGML_TYPE_TQ2_0:   result = quantize_tq2_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q1_0:    result = quantize_q1_0   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q4_0:    result = quantize_q4_0   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q4_1:    result = quantize_q4_1   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q5_0:    result = quantize_q5_0   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q5_1:    result = quantize_q5_1   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q8_0:    result = quantize_q8_0   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_MXFP4:   result = quantize_mxfp4  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_NVFP4:   result = quantize_nvfp4  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q2_K:    result = quantize_q2_K   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q3_K:    result = quantize_q3_K   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q4_K:    result = quantize_q4_K   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q5_K:    result = quantize_q5_K   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_Q6_K:    result = quantize_q6_K   (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_TQ1_0:   result = quantize_tq1_0  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
+        case GGML_TYPE_TQ2_0:   result = quantize_tq2_0  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ2_XS:  result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
        case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
@ -7768,10 +7768,10 @@ struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) {
 }

 bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) {
-    if (p0->n_threads      != p1->n_threads  )    return false;
-    if (p0->prio           != p1->prio       )    return false;
-    if (p0->poll           != p1->poll       )    return false;
-    if (p0->strict_cpu     != p1->strict_cpu )    return false;
+    if (p0->n_threads  != p1->n_threads  ) return false;
+    if (p0->prio       != p1->prio       ) return false;
+    if (p0->poll       != p1->poll       ) return false;
+    if (p0->strict_cpu != p1->strict_cpu ) return false;
    return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
 }

--- a/tools/server/server-chat.cpp
+++ b/tools/server/server-chat.cpp
@ -281,6 +281,42 @@ json server_chat_convert_responses_to_chatcmpl(const json & response_body) {
    return chatcmpl_body;
 }

+// Neutralizes the "cch" stamp of an "x-anthropic-billing-header" system prompt, in place.
+// Any prompt that does not start with that header is left untouched.
+//
+// Such a prompt is a claude message carrying a "cch=ef01a" style attribute. The stamp is a
+// whitebox end-to-end integrity hint that changes on each request. It is not meaningful as
+// system prompt data, particularly to llama.cpp, but while it is present the prefix cache
+// cannot match anything past it. Overwriting it with "fffff" keeps the prefix stable.
+//
+// Reference: https://github.com/ggml-org/llama.cpp/pull/21793
+// Example header:
+// ```
+// x-anthropic-billing-header: cc_version=2.1.101.e51; cc_entrypoint=cli; cch=a5145;You are Claude Code, Anthropic's official CLI for Claude.
+//                                                                            ^^^^^
+// ```
+static void normalize_anthropic_billing_header(std::string & system_text) {
+    static const char header_prefix[] = "x-anthropic-billing-header:";
+    static const char cch_key[]       = "cch=";
+    const size_t      cch_length      = 5; // stamp characters expected between "cch=" and ';'
+    if (system_text.rfind(header_prefix, 0) != 0) {
+        return; // not a billing header
+    }
+
+    const size_t index_cch = system_text.find(cch_key, strlen(header_prefix));
+    if (index_cch == std::string::npos) {
+        return; // header without a cch attribute: nothing to normalize
+    }
+
+    // rewrite only when the stamp has the expected shape: cch_length characters followed by ';'
+    const size_t index_replace = index_cch + strlen(cch_key);
+    if (index_replace + cch_length < system_text.length() && system_text[index_replace + cch_length] == ';') {
+        system_text.replace(index_replace, cch_length, cch_length, 'f');
+    } else {
+        LOG_ERR("anthropic string not as expected: %s\n", system_text.c_str());
+    }
+}
+
 json server_chat_convert_anthropic_to_oai(const json & body) {
    json oai_body;

@ -292,10 +328,13 @@ json server_chat_convert_anthropic_to_oai(const json & body) {

        if (system_param.is_string()) {
            system_content = system_param.get<std::string>();
+            normalize_anthropic_billing_header(system_content);
        } else if (system_param.is_array()) {
            for (const auto & block : system_param) {
                if (json_value(block, "type", std::string()) == "text") {
-                    system_content += json_value(block, "text", std::string());
+                    auto system_text = json_value(block, "text", std::string());
+                    normalize_anthropic_billing_header(system_text);
+                    system_content += system_text;
                }
            }
        }
@ -475,7 +514,7 @@ json server_chat_convert_anthropic_to_oai(const json & body) {
    }

    // Pass through common params
-    for (const auto & key : {"temperature", "top_p", "top_k", "stream"}) {
+    for (const auto & key : {"temperature", "top_p", "top_k", "stream", "chat_template_kwargs"}) {
        if (body.contains(key)) {
            oai_body[key] = body.at(key);
        }
@ -535,6 +574,7 @@ json server_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {

 json convert_transcriptions_to_chatcmpl(
        const json & inp_body,
+        const common_chat_templates * tmpls,
        const std::map<std::string, raw_buffer> & in_files,
        std::vector<raw_buffer> & out_files) {
    // TODO @ngxson : this function may need to be improved in the future
@ -548,27 +588,29 @@ json convert_transcriptions_to_chatcmpl(
    }

    // handle input data
-    std::string prompt = json_value(inp_body, "prompt", std::string());
-    std::string language = json_value(inp_body, "language", std::string());
+    std::string prompt          = json_value(inp_body, "prompt", std::string());
+    std::string language        = json_value(inp_body, "language", std::string());
    std::string response_format = json_value(inp_body, "response_format", std::string("json"));
    if (response_format != "json") {
        throw std::invalid_argument("Only 'json' response_format is supported for transcription");
    }
+    const common_chat_prompt_preset preset = common_chat_get_asr_prompt(tmpls);
    if (prompt.empty()) {
-        prompt = "Transcribe audio to text";
+        prompt = preset.user;
    }
    if (!language.empty()) {
        prompt += string_format(" (language: %s)", language.c_str());
    }
    prompt += get_media_marker();

+    json messages = json::array();
+    if (!preset.system.empty()) {
+        messages.push_back({{"role", "system"}, {"content", preset.system}});
+    }
+    messages.push_back({{"role", "user"}, {"content", prompt}});
+
    json chatcmpl_body = inp_body; // copy all fields
-    chatcmpl_body["messages"] = json::array({
-        {
-            {"role", "user"},
-            {"content", prompt},
-        },
-    });
+    chatcmpl_body["messages"] = messages;

    // because input from form-data, everything is string, we need to correct the types here
    std::string stream = json_value(inp_body, "stream", std::string("false"));
--- a/tools/server/server-chat.h
+++ b/tools/server/server-chat.h
@@ -18,6 +18,7 @@ json server_chat_convert_anthropic_to_oai(const json & body);
 // convert OpenAI transcriptions API format to OpenAI Chat Completions API format
 json convert_transcriptions_to_chatcmpl(
    const json & body,
+    const common_chat_templates * tmpls,
    const std::map<std::string, raw_buffer> & in_files,
    std::vector<raw_buffer> & out_files);

--- a/tools/server/server-common.cpp
+++ b/tools/server/server-common.cpp
@@ -947,7 +947,9 @@ json oaicompat_chat_params_parse(
        json response_format      = json_value(body, "response_format", json::object());
        std::string response_type = json_value(response_format, "type", std::string());
        if (response_type == "json_object") {
-            json_schema = json_value(response_format, "schema", json::object());
+            if (response_format.contains("schema") || json_schema.empty()) {
+                json_schema = json_value(response_format, "schema", json::object());
+            }
        } else if (response_type == "json_schema") {
            auto schema_wrapper = json_value(response_format, "json_schema", json::object());
            json_schema = json_value(schema_wrapper, "schema", json::object());
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -675,6 +675,10 @@ private:

    int32_t n_ctx; // total context for all clients / slots

+    // set to llama_model_n_swa(model)
+    // if swa_full is enabled, this is set to 0 to simulate a non-SWA model
+    int32_t n_swa;
+
    // slots / clients
    std::vector<server_slot> slots;

@@ -719,7 +723,7 @@ private:
            return;
        }
        SLT_INF(slot, "%s", "saving idle slot to prompt cache\n");
-        SLT_DBG(slot, "%s", "__TEST_TAG_CLEAR_IDLE_SLOT__\n");
+        SLT_DBG(slot, "%s", "__TEST_TAG_CACHE_IDLE_SLOT__\n");
        slot.prompt_save(*prompt_cache);
        slot.prompt_clear(false);
        prompt_cache->update();
@@ -854,6 +858,8 @@ private:
            }
        }

+        n_swa = params_base.swa_full ? 0 : llama_model_n_swa(model);
+
        // Necessary similarity of prompt for slot selection
        slot_prompt_similarity = params_base.slot_prompt_similarity;

@@ -996,7 +1002,7 @@ private:
                params_base.cache_idle_slots = false;
            } else {
                SRV_INF("%s: idle slots will be saved to prompt cache and cleared upon starting a new task\n", __func__);
-                SRV_DBG("%s", "__TEST_TAG_CLEAR_IDLE_ENABLED__\n");
+                SRV_DBG("%s", "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__\n");
            }
        }

@@ -2415,9 +2421,6 @@ private:

                            llama_pos pos_next = slot.prompt.tokens.pos_next(n_past);

-                            // note: when n_swa == 0, the model does not use SWA
-                            const auto n_swa = std::max(0, llama_model_n_swa(model));
-
                            // the largest pos_min required for a checkpoint to be useful
                            const auto pos_min_thold = std::max(0, pos_next - n_swa);

@@ -2589,10 +2592,10 @@ private:
                    // make a checkpoint of the parts of the memory that cannot be rolled back.
                    // checkpoints are created only if:
                    // - the model does not support partial sequence removal
-                    // - the model uses SWA and we are not using `swa_full`
+                    // - the model uses SWA (and we are not using `swa_full`)
                    do_checkpoint = do_checkpoint && (
                            (slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) ||
-                            (llama_model_n_swa(model) > 0 && !params_base.swa_full));
+                            (n_swa > 0));

                    bool has_mtmd = false;

@@ -3807,6 +3810,7 @@ void server_routes::init_routes() {
        std::vector<raw_buffer> files;
        json body = convert_transcriptions_to_chatcmpl(
            json::parse(req.body),
+            meta->chat_params.tmpls.get(),
            req.files,
            files);
        SRV_DBG("%s\n", "Request converted: OpenAI Transcriptions -> OpenAI Chat Completions");
--- a/tools/server/server-task.cpp
+++ b/tools/server/server-task.cpp
@@ -270,6 +270,7 @@ task_params server_task::params_from_json_cmpl(
    params.n_indent         = json_value(data,       "n_indent",           defaults.n_indent);
    params.n_keep           = json_value(data,       "n_keep",             defaults.n_keep);
    params.n_discard        = json_value(data,       "n_discard",          defaults.n_discard);
+    params.n_discard        = std::max(0, params.n_discard);
    params.n_cmpl           = json_value(data,       "n_cmpl",             json_value(data, "n", 1));
    params.n_cache_reuse    = json_value(data,       "n_cache_reuse",      defaults.n_cache_reuse);
    //params.t_max_prompt_ms  = json_value(data,       "t_max_prompt_ms",    defaults.t_max_prompt_ms); // TODO: implement
--- a/tools/server/tests/unit/test_kv_keep_only_active.py
+++ b/tools/server/tests/unit/test_kv_keep_only_active.py
@@ -48,7 +48,7 @@ def test_clear_and_restore():
    log = LogReader(server.log_path)

    # verify feature is enabled
-    assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" in log.drain()
+    assert "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__" in log.drain()

    res = server.make_request("POST", "/completion", data={
        "prompt": LONG_PROMPT,
@@ -59,7 +59,7 @@ def test_clear_and_restore():
    original_prompt_n = res.body["timings"]["prompt_n"]

    # Slot 0 is the only slot with KV — should NOT be cleared
-    assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
+    assert "__TEST_TAG_CACHE_IDLE_SLOT__" not in log.drain()

    # Launching slot 1 clears idle slot 0
    res = server.make_request("POST", "/completion", data={
@@ -68,7 +68,7 @@ def test_clear_and_restore():
        "cache_prompt": True,
    })
    assert res.status_code == 200
-    assert "__TEST_TAG_CLEAR_IDLE_SLOT__" in log.drain()
+    assert "__TEST_TAG_CACHE_IDLE_SLOT__" in log.drain()

    # Re-send same prompt — should restore from cache-ram
    res = server.make_request("POST", "/completion", data={
@@ -86,7 +86,7 @@ def test_clear_and_restore():
        "cache_prompt": True,
    })
    assert res.status_code == 200
-    assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
+    assert "__TEST_TAG_CACHE_IDLE_SLOT__" not in log.drain()


 def test_disabled_with_flag():
@@ -96,7 +96,7 @@ def test_disabled_with_flag():
    log = LogReader(server.log_path)

    # Feature should not be enabled
-    assert "__TEST_TAG_CLEAR_IDLE_ENABLED__" not in log.drain()
+    assert "__TEST_TAG_CACHE_IDLE_SLOTS_ENABLED__" not in log.drain()

    res = server.make_request("POST", "/completion", data={
        "prompt": LONG_PROMPT,
@@ -112,4 +112,4 @@ def test_disabled_with_flag():
        "cache_prompt": True,
    })
    assert res.status_code == 200
-    assert "__TEST_TAG_CLEAR_IDLE_SLOT__" not in log.drain()
+    assert "__TEST_TAG_CACHE_IDLE_SLOT__" not in log.drain()
--- a/vendor/cpp-httplib/CMakeLists.txt
+++ b/vendor/cpp-httplib/CMakeLists.txt
@@ -81,7 +81,7 @@ if (LLAMA_BUILD_BORINGSSL)
    target_link_libraries(${TARGET} PUBLIC ssl crypto)

 elseif (LLAMA_BUILD_LIBRESSL)
-    set(LIBRESSL_VERSION "4.2.1" CACHE STRING "LibreSSL version")
+    set(LIBRESSL_VERSION "4.3.1" CACHE STRING "LibreSSL version")

    message(STATUS "Fetching LibreSSL version ${LIBRESSL_VERSION}")

@@ -161,12 +161,24 @@ if(LLAMA_BUILD_BORINGSSL OR LLAMA_BUILD_LIBRESSL)
        if(LLAMA_BUILD_BORINGSSL)
            target_compile_options(fipsmodule PRIVATE /w)
        endif()
+        if(LLAMA_BUILD_LIBRESSL)
+            target_compile_options(ssl_obj PRIVATE /w)
+            target_compile_options(bs_obj PRIVATE /w)
+            target_compile_options(compat_obj PRIVATE /w)
+            target_compile_options(crypto_obj PRIVATE /w)
+        endif()
    else()
        target_compile_options(ssl PRIVATE -w)
        target_compile_options(crypto PRIVATE -w)
        if(LLAMA_BUILD_BORINGSSL)
            target_compile_options(fipsmodule PRIVATE -w)
        endif()
+        if(LLAMA_BUILD_LIBRESSL)
+            target_compile_options(ssl_obj PRIVATE -w)
+            target_compile_options(bs_obj PRIVATE -w)
+            target_compile_options(compat_obj PRIVATE -w)
+            target_compile_options(crypto_obj PRIVATE -w)
+        endif()
    endif()
 endif()