Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/cpu.Dockerfile
#	.devops/cuda.Dockerfile
#	.github/ISSUE_TEMPLATE/010-bug-compilation.yml
#	.github/ISSUE_TEMPLATE/011-bug-results.yml
#	.github/labeler.yml
#	.github/workflows/build.yml
#	.github/workflows/release.yml
#	CODEOWNERS
#	README.md
#	docs/build-s390x.md
#	docs/ops.md
#	examples/eval-callback/eval-callback.cpp
#	ggml/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-opencl/CMakeLists.txt
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-opencl/kernels/transpose.cl
#	tests/test-backend-ops.cpp
#	tests/test-chat.cpp
#	tests/test-opt.cpp
Commit d876898476 by Concedo, 2025-08-16 12:39:25 +08:00
31 changed files with 10043 additions and 77 deletions


@ -29,7 +29,7 @@ jobs:
      uses: actions/checkout@v4
    - name: ccache
-     uses: hendrikmuhs/ccache-action@v1.2.16
+     uses: ggml-org/ccache-action@v1.2.16
      with:
        key: copilot-setup-steps
        evict-old-files: 1d


@ -296,6 +296,7 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
}
if (!msg.reasoning_content.empty()) {
jmsg["reasoning_content"] = msg.reasoning_content;
+ jmsg["thinking"] = msg.reasoning_content; // gpt-oss
}
if (!msg.tool_name.empty()) {
jmsg["name"] = msg.tool_name;
@ -472,11 +473,12 @@ std::string common_chat_format_single(
return ss.str();
}
- std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
+ std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja, const std::map<std::string, std::string> & chat_template_kwargs) {
common_chat_templates_inputs inputs;
inputs.use_jinja = use_jinja;
inputs.add_bos = tmpls->add_bos;
inputs.add_eos = tmpls->add_eos;
+ inputs.chat_template_kwargs = chat_template_kwargs;
auto add_simple_msg = [&](auto role, auto content) {
common_chat_msg msg;
msg.role = role;
@ -1338,16 +1340,164 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
data.prompt = prompt;
data.format = COMMON_CHAT_FORMAT_GPT_OSS;
- // TODO: support tool calls in GPT-OSS?
+ // These special tokens are required to parse properly, so we include them
+ // even if parse_tool_calls is false.
data.preserved_tokens = {
"<|channel|>",
"<|constrain|>",
"<|message|>",
"<|start|>",
"<|end|>",
};
if (inputs.tools.is_array() && !inputs.tools.empty()) {
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
// tool calls can appear in commentary or analysis channels
auto channel = builder.add_rule("channel", "\"<|channel|>\" ( \"commentary\" | \"analysis\" )");
std::vector<std::string> tool_rules_recipient_in_role;
std::vector<std::string> tool_rules_recipient_in_channel;
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
std::string name = function.at("name");
auto parameters = function.at("parameters");
builder.resolve_refs(parameters);
tool_rules_recipient_in_role.push_back(
builder.add_rule(name + "-call",
"\"" + name + "\"" + channel + " \" <|constrain|>json\"? \"<|message|>\" " +
builder.add_schema(name + "-args", parameters)
)
);
tool_rules_recipient_in_channel.push_back(
builder.add_rule(name + "-call",
"\"" + name + "\"" + " \" <|constrain|>json\"? \"<|message|>\" " +
builder.add_schema(name + "-args", parameters)
)
);
});
auto recipient_in_role = builder.add_rule("recipient_in_role",
"\"<|start|>assistant\"? \" to=functions.\" ( " +
string_join(tool_rules_recipient_in_role, " | ") + " )"
);
auto recipient_in_channel = builder.add_rule("recipient_in_channel",
channel + " \" to=functions.\" ( " +
string_join(tool_rules_recipient_in_channel, " | ") + " )"
);
builder.add_rule("root", recipient_in_role + " | " + recipient_in_channel);
// Trigger on tool calls that appear in the commentary channel
data.grammar_triggers.push_back({
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
"<\\|channel\\|>(commentary|analysis) to"
});
// Trigger tool calls that appear in the role section, either at the
// start or in the middle.
data.grammar_triggers.push_back({
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
"^ to"
});
data.grammar_triggers.push_back({
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
"<\\|start\\|>assistant to"
});
});
}
return data;
}
static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
- // TODO @ngxson : this won't work with --special enabled, we should fix that
- builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>");
- if (!builder.syntax().parse_tool_calls) {
- builder.add_content(builder.consume_rest());
- return;
+ static const std::string constraint = "(?: (<\\|constrain\\|>)?([a-zA-Z0-9_-]+))";
+ static const std::string recipient("(?: to=functions\\.([^<\\s]+))");
+ static const common_regex start_regex("<\\|start\\|>assistant");
+ static const common_regex analysis_regex("<\\|channel\\|>analysis");
static const common_regex final_regex("<\\|channel\\|>final" + constraint + "?");
static const common_regex preamble_regex("<\\|channel\\|>commentary");
static const common_regex tool_call1_regex(recipient + "<\\|channel\\|>(analysis|commentary)" + constraint + "?");
static const common_regex tool_call2_regex("<\\|channel\\|>(analysis|commentary)" + recipient + constraint + "?");
auto consume_end = [&](bool include_end = false) {
if (auto res = builder.try_find_literal("<|end|>")) {
return res->prelude + (include_end ? builder.str(res->groups[0]) : "");
}
return builder.consume_rest();
};
auto handle_tool_call = [&](const std::string & name) {
if (auto args = builder.try_consume_json_with_dumped_args({{}})) {
if (builder.syntax().parse_tool_calls) {
if (!builder.add_tool_call(name, "", args->value) || args->is_partial) {
throw common_chat_msg_partial_exception("incomplete tool call");
}
} else if (args->is_partial) {
throw common_chat_msg_partial_exception("incomplete tool call");
}
}
};
auto regex_match = [](const common_regex & regex, const std::string & input) -> std::optional<common_regex_match> {
auto match = regex.search(input, 0, true);
if (match.type == COMMON_REGEX_MATCH_TYPE_FULL) {
return match;
}
return std::nullopt;
};
do {
auto header_start_pos = builder.pos();
auto content_start = builder.try_find_literal("<|message|>");
if (!content_start) {
throw common_chat_msg_partial_exception("incomplete header");
}
auto header = content_start->prelude;
if (auto match = regex_match(tool_call1_regex, header)) {
auto group = match->groups[1];
auto name = header.substr(group.begin, group.end - group.begin);
handle_tool_call(name);
continue;
}
if (auto match = regex_match(tool_call2_regex, header)) {
auto group = match->groups[2];
auto name = header.substr(group.begin, group.end - group.begin);
handle_tool_call(name);
continue;
}
if (regex_match(analysis_regex, header)) {
builder.move_to(header_start_pos);
if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
builder.add_content(consume_end(true));
} else {
builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|end|>");
}
continue;
}
if(regex_match(final_regex, header) || regex_match(preamble_regex, header)) {
builder.add_content(consume_end());
continue;
}
// Possibly a malformed message, attempt to recover by rolling
// back to pick up the next <|start|>
LOG_DBG("%s: unknown header from message: %s\n", __func__, header.c_str());
builder.move_to(header_start_pos);
} while (builder.try_find_regex(start_regex, std::string::npos, false));
auto remaining = builder.consume_rest();
if (!remaining.empty()) {
LOG_DBG("%s: content after last message: %s\n", __func__, remaining.c_str());
}
}
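For reference, here is an illustrative gpt-oss style raw message of the kind the header regexes above are written to recognize: an analysis channel, a commentary-channel tool call, and a final channel. This is a hypothetical example, not taken from this diff; the tool name and arguments are made up.

<|channel|>analysis<|message|>The user asks about the weather, so a tool call is needed.<|end|>
<|start|>assistant<|channel|>commentary to=functions.get_weather <|constrain|>json<|message|>{"location": "Paris"}
<|start|>assistant<|channel|>final<|message|>Here is the current weather in Paris.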
@ -1911,8 +2061,8 @@ static common_chat_params common_chat_templates_apply_jinja(
params.enable_thinking = inputs.enable_thinking;
params.grammar = inputs.grammar;
params.now = inputs.now;
- params.add_bos = inputs.add_bos;
- params.add_eos = inputs.add_eos;
+ params.add_bos = tmpls->add_bos;
+ params.add_eos = tmpls->add_eos;
params.extra_context = json::object();
for (auto el : inputs.chat_template_kwargs) {


@ -187,7 +187,8 @@ std::string common_chat_format_single(
// Returns an example of formatted chat
std::string common_chat_format_example(
const struct common_chat_templates * tmpls,
- bool use_jinja);
+ bool use_jinja,
+ const std::map<std::string, std::string> & chat_template_kwargs);
const char* common_chat_format_name(common_chat_format format);
const char* common_reasoning_format_name(common_reasoning_format format);

docs/ops/zDNN.csv (new file, 8134 lines added; diff not shown because it is too large)

ggml/include/ggml-zdnn.h (new file, 16 lines added)

@ -0,0 +1,16 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
GGML_BACKEND_API ggml_backend_t ggml_backend_zdnn_init(void);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
#ifdef __cplusplus
}
#endif


@ -49,6 +49,10 @@
#include "ggml-webgpu.h" #include "ggml-webgpu.h"
#endif #endif
#ifdef GGML_USE_ZDNN
#include "ggml-zdnn.h"
#endif
#ifdef GGML_USE_OPENCL #ifdef GGML_USE_OPENCL
#include "ggml-opencl.h" #include "ggml-opencl.h"
#endif #endif
@ -180,6 +184,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_WEBGPU
register_backend(ggml_backend_webgpu_reg());
#endif
+ #ifdef GGML_USE_ZDNN
+ register_backend(ggml_backend_zdnn_reg());
+ #endif
#ifdef GGML_USE_OPENCL
register_backend(ggml_backend_opencl_reg());
#endif


@ -31,8 +31,8 @@ static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __
dequantize_kernel(vx, ib, iqs, v);
const int64_t iy0 = ((i03*ne02 + i02)*ne01 + i01)*ne00 + iybs + iqs;
- y[iy0 + 0] = float(v.x);
- y[iy0 + y_offset] = float(v.y);
+ y[iy0 + 0] = ggml_cuda_cast<dst_t>(v.x);
+ y[iy0 + y_offset] = ggml_cuda_cast<dst_t>(v.y);
}
template <bool need_check>
@ -630,7 +630,7 @@ static __global__ void convert_unary(
const int64_t ix = i03*s03 + i02*s02 + i01*s01 + i00;
const int64_t iy = ((i03*ne02 + i02)*ne01 + i01)*ne00 + i00;
- y[iy] = float(x[ix]);
+ y[iy] = ggml_cuda_cast<dst_t>(x[ix]);
}
template <typename src_t, typename dst_t>


@ -29,3 +29,16 @@ typedef to_t_nc_cuda_t<nv_bfloat16> to_bf16_nc_cuda_t;
to_fp32_nc_cuda_t ggml_get_to_fp32_nc_cuda(ggml_type type);
to_fp16_nc_cuda_t ggml_get_to_fp16_nc_cuda(ggml_type type);
to_bf16_nc_cuda_t ggml_get_to_bf16_nc_cuda(ggml_type type);
template<typename dst_t, typename src_t>
__host__ __device__ inline dst_t ggml_cuda_cast(src_t x) {
if constexpr (std::is_same_v<dst_t, src_t>) {
return x;
} else if constexpr(std::is_same_v<dst_t, nv_bfloat16>) {
return __float2bfloat16(float(x));
} else if constexpr(std::is_same_v<src_t, nv_bfloat16>) {
return __bfloat162float(x);
} else {
return float(x);
}
}
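A side note on the helper above: conversions between identical types pass through untouched, bf16 goes through the dedicated intrinsics, and every other pair is routed through an intermediate float. A minimal host-side sketch of the same compile-time dispatch, assuming plain C++ without the CUDA and bf16 specifics (hypothetical, for illustration only):

#include <type_traits>

// Sketch: identical types pass through, everything else converts via float.
template <typename dst_t, typename src_t>
dst_t cast_via_float(src_t x) {
    if constexpr (std::is_same_v<dst_t, src_t>) {
        return x;
    } else {
        return static_cast<dst_t>(static_cast<float>(x));
    }
}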


@ -1,15 +1,7 @@
#pragma once
#include "ggml-common.h"
+ #include "convert.cuh"
- template<typename src_t, typename dst_t>
- static __device__ __forceinline__ void convert_flt(const src_t * src, dst_t * dst) {
- if constexpr (std::is_same_v<src_t, dst_t>) {
- *dst = *src;
- } else {
- *dst = float(*src);
- }
- }
static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) {
if (x <= val[0]) return 0;
@ -221,5 +213,5 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
template<typename src_t, typename dst_t>
static __device__ void cpy_1_flt(const char * cxi, char * cdsti) {
- convert_flt((const src_t *)cxi, (dst_t *)cdsti);
+ *(dst_t *) cdsti = ggml_cuda_cast<dst_t>(*(const src_t *) cxi);
}


@ -539,11 +539,15 @@ static __global__ void flash_attn_mask_to_KV_max(
all_inf = warp_reduce_all(all_inf);
if (!all_inf) {
- KV_max_sj += FATTN_KQ_STRIDE;
break;
}
}
+ // If the break in the loop was not triggered, KV_max_sj is now -FATTN_KQ_STRIDE.
+ // If the break was triggered it's the lower edge of the tile with the first non-masked values.
+ // In either case, walk back the decrementation by FATTN_KQ_STRIDE.
+ KV_max_sj += FATTN_KQ_STRIDE;
if (threadIdx.x != 0) {
return;
}


@ -1,5 +1,6 @@
#include "getrows.cuh" #include "getrows.cuh"
#include "dequantize.cuh" #include "dequantize.cuh"
#include "convert.cuh"
template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t> template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void k_get_rows( static __global__ void k_get_rows(
@ -34,8 +35,8 @@ static __global__ void k_get_rows(
dfloat2 v; dfloat2 v;
dequantize_kernel(src0_row, ib, iqs, v); dequantize_kernel(src0_row, ib, iqs, v);
dst_row[iybs + iqs + 0] = float(v.x); dst_row[iybs + iqs + 0] = ggml_cuda_cast<dst_t>(v.x);
dst_row[iybs + iqs + y_offset] = float(v.y); dst_row[iybs + iqs + y_offset] = ggml_cuda_cast<dst_t>(v.y);
} }
template<typename src0_t, typename dst_t> template<typename src0_t, typename dst_t>
@ -62,7 +63,7 @@ static __global__ void k_get_rows_float(
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03); const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);
dst_row[i00] = float(src0_row[i00]); dst_row[i00] = ggml_cuda_cast<dst_t>(src0_row[i00]);
} }
template<typename grad_t, typename dst_t> template<typename grad_t, typename dst_t>


@ -1,5 +1,6 @@
#include "ggml.h" #include "ggml.h"
#include "common.cuh" #include "common.cuh"
#include "convert.cuh"
#include "mmvf.cuh" #include "mmvf.cuh"
template <typename T, typename type_acc, int ncols_dst, int block_size> template <typename T, typename type_acc, int ncols_dst, int block_size>
@ -93,8 +94,8 @@ static __global__ void mul_mat_vec_f(
#pragma unroll #pragma unroll
for (int j = 0; j < ncols_dst; ++j) { for (int j = 0; j < ncols_dst; ++j) {
const float2 tmpy = y2[j*stride_col_y2 + col2]; const float2 tmpy = y2[j*stride_col_y2 + col2];
sumf[j] += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x; sumf[j] += ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[0]) * tmpy.x;
sumf[j] += float(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y; sumf[j] += ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]) * tmpy.y;
} }
} }
} else { } else {


@ -3,11 +3,6 @@
typedef void (*set_rows_kernel_t)(const char * src, char * dst);
- template<typename src_t, typename dst_t>
- __device__ __forceinline__ void set_rows_1(const src_t * src_f, dst_t * dst_f) {
- convert_flt(src_f, dst_f);
- }
// Generic quantized set_rows kernel template
template<typename block_type, int qk, void (*quantize_func)(const float*, block_type*)>
static __global__ void k_set_rows_quant(
@ -117,9 +112,7 @@ static __global__ void k_set_rows(
const src_t * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
dst_t * dst_row_ptr = dst + dst_row*s1 + i02*s2 + i03*s3;
- const src_t* src_elem = src0_row + i00;
- dst_t* dst_elem = dst_row_ptr + i00;
- set_rows_1(src_elem, dst_elem);
+ dst_row_ptr[i00] = ggml_cuda_cast<dst_t>(src0_row[i00]);
GGML_UNUSED(ne10);
GGML_UNUSED(ne13);


@ -4,7 +4,7 @@
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
- #include <hip/hip_bfloat16.h>
+ #include <hip/hip_bf16.h>
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
@ -135,7 +135,7 @@
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
- #if HIP_VERSION >= 70000000
+ #if HIP_VERSION >= 60500000
#define CUBLAS_COMPUTE_16F HIPBLAS_COMPUTE_16F
#define CUBLAS_COMPUTE_32F HIPBLAS_COMPUTE_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_COMPUTE_32F_FAST_16F
@ -147,7 +147,7 @@
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define cublasComputeType_t hipblasDatatype_t
#define cudaDataType_t hipblasDatatype_t
- #endif // HIP_VERSION >= 7000000
+ #endif // HIP_VERSION >= 6050000
#if !defined(__HIP_PLATFORM_AMD__)
#error "The HIP backend supports only AMD targets"
@ -179,8 +179,7 @@
#define RDNA4
#endif
- #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
- defined(__gfx1150__) || defined(__gfx1151__)
+ #if defined(__GFX11__)
#define RDNA3
#endif
@ -197,8 +196,8 @@
#define __has_builtin(x) 0
#endif
- typedef hip_bfloat16 nv_bfloat16;
- typedef short2 nv_bfloat162; // FIXME there is no 2x BF16 type being defined in bfloat16.h, ad-hoc compilation fix
+ typedef __hip_bfloat16 nv_bfloat16;
+ typedef __hip_bfloat162 nv_bfloat162;
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));


@ -0,0 +1,189 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define QK_MXFP4 32
typedef struct {
uchar e; // E8M0
uchar qs[QK_MXFP4/2];
} block_mxfp4;
constant static float kvalues_mxfp4_f[16] = {
0, .5f, 1.f, 1.5f, 2.f, 3.f, 4.f, 6.f, -0, -.5f, -1.f, -1.5f, -2.f, -3.f, -4.f, -6.f
};
static inline float e8m0_to_fp32(uchar x) {
int bits;
if (x == 0) {
bits = 0x00400000;
} else {
bits = (uint) x << 23;
}
return as_float(bits);
}
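The E8M0 value above is just a biased FP32 exponent: shifting the byte into the exponent field yields 2^(x - 127), and x == 0 is mapped to the subnormal pattern 0x00400000 (2^-127) instead of decoding to zero. A host-side sketch of the same decode, assuming plain C++ and not part of this kernel:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Hypothetical host-side equivalent of e8m0_to_fp32() above.
static float e8m0_to_fp32_host(uint8_t x) {
    uint32_t bits = (x == 0) ? 0x00400000u : (uint32_t) x << 23;
    float f;
    std::memcpy(&f, &bits, sizeof(f)); // same reinterpretation as as_float()
    return f;
}

int main() {
    // 127 -> 2^0 = 1, 128 -> 2^1 = 2, 0 -> 2^-127
    std::printf("%g %g %g\n", e8m0_to_fp32_host(127), e8m0_to_fp32_host(128), e8m0_to_fp32_host(0));
}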
#ifdef INTEL_GPU
#define N_R0_MXFP4 2 // number of rows each subgroup works on
#define N_SG_MXFP4 2 // number of subgroups in a work group
#define N_SIMDWIDTH 16 // subgroup size
#elif defined (ADRENO_GPU)
#define N_R0_MXFP4 2
#define N_SG_MXFP4 2
#define N_SIMDWIDTH 64
#endif
inline void mul_mv_mxfp4_f32(
global char * src0,
global char * src1,
global char * dst,
int ne00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne12,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
int ne1,
int r2,
int r3,
local char * shmem
) {
local float * shmem_f32 = (local float *) shmem;
int nb = ne00/QK_MXFP4;
int r0 = get_group_id(0);
int r1 = get_group_id(1);
int im = 0;
int first_row = (r0 * N_SG_MXFP4 + get_sub_group_id()) * N_R0_MXFP4;
uint i12 = im%ne12;
uint i13 = im/ne12;
ulong offset_src0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global block_mxfp4 * x = (global block_mxfp4 *) (src0 + offset_src0);
global float * y = (global float *) (src1 + offset_src1);
const short ix = get_sub_group_local_id()/2; // 0...15
const short it = get_sub_group_local_id()%2; // 0 or 1
shmem_f32[get_sub_group_local_id()] = kvalues_mxfp4_f[get_sub_group_local_id()%16];
barrier(CLK_LOCAL_MEM_FENCE);
float4 yl[4];
float sumf[N_R0_MXFP4] = {0.f};
global float * yb = y + ix * QK_MXFP4 + it * 8;
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
global float4 * y4 = (global float4 *)yb;
yl[0] = y4[0];
yl[1] = y4[4];
yl[2] = y4[1];
yl[3] = y4[5];
for (short row = 0; row < N_R0_MXFP4; row++) {
global block_mxfp4 * xb = x + row*nb + ib;
global uchar * q2 = (global uchar *)(xb->qs + 8*it);
float4 acc1 = yl[0]*(float4)(shmem_f32[q2[0] & 0x0F], shmem_f32[q2[1] & 0x0F], shmem_f32[q2[2] & 0x0F], shmem_f32[q2[3] & 0x0F]);
float4 acc2 = yl[1]*(float4)(shmem_f32[q2[0] >> 4 ], shmem_f32[q2[1] >> 4 ], shmem_f32[q2[2] >> 4 ], shmem_f32[q2[3] >> 4 ]);
float4 acc3 = yl[2]*(float4)(shmem_f32[q2[4] & 0x0F], shmem_f32[q2[5] & 0x0F], shmem_f32[q2[6] & 0x0F], shmem_f32[q2[7] & 0x0F]);
float4 acc4 = yl[3]*(float4)(shmem_f32[q2[4] >> 4 ], shmem_f32[q2[5] >> 4 ], shmem_f32[q2[6] >> 4 ], shmem_f32[q2[7] >> 4 ]);
acc1 = (acc1 + acc3) + (acc2 + acc4);
sumf[row] += e8m0_to_fp32(xb->e) * ((acc1.s0 + acc1.s1) + (acc1.s2 + acc1.s3));
}
yb += (N_SIMDWIDTH/2) * QK_MXFP4;
}
global float * dst_f32 = (global float *) dst + (ulong)im*ne0*ne1 + (ulong)r1*ne0;
for (int row = 0; row < N_R0_MXFP4 && first_row + row < ne0; ++row) {
float sum_all = sub_group_reduce_add(sumf[row]);
if (get_sub_group_local_id() == 0) {
dst_f32[first_row + row] = sum_all;
}
}
}
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_id_mxfp4_f32(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global char * src2,
ulong offset2,
global char * dst,
ulong offsetd,
int ne00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne11,
int ne12,
ulong nb11,
ulong nb12,
ulong nb13,
int ne20,
int ne21,
ulong nb21,
int ne0,
int ne1,
int r2,
int r3,
local char * shmem
) {
src0 = (global char *)((global char *)src0 + offset0);
src1 = (global char *)((global char *)src1 + offset1);
src2 = (global char *)((global char *)src2 + offset2);
dst = (global char *)((global char *)dst + offsetd);
const int iid1 = get_group_id(2)/ne20;
const int idx = get_group_id(2)%ne20;
int i02 = ((global int *) (src2 + iid1*nb21))[idx];
int i11 = idx % ne11;
int i12 = iid1;
int i1 = idx;
int i2 = i12;
global char * src0_cur = src0 + i02*nb02;
global char * src1_cur = src1 + i11*nb11 + i12*nb12;
global char * dst_cur = dst + (i1*ne0 + i2*ne1*ne0)*sizeof(float);
mul_mv_mxfp4_f32(src0_cur, src1_cur, dst_cur,
ne00, nb01, nb02, nb03, ne12, nb11, nb12, nb13, ne0, ne1, r2, r3, shmem);
}


@ -0,0 +1,144 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif
#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif
#define QK_MXFP4 32
typedef struct {
uchar e; // E8M0
uchar qs[QK_MXFP4/2];
} block_mxfp4;
constant static float kvalues_mxfp4_f[16] = {
0, .5f, 1.f, 1.5f, 2.f, 3.f, 4.f, 6.f, -0, -.5f, -1.f, -1.5f, -2.f, -3.f, -4.f, -6.f
};
static inline float e8m0_to_fp32(uchar x) {
int bits;
if (x == 0) {
bits = 0x00400000;
} else {
bits = (uint) x << 23;
}
return as_float(bits);
}
#ifdef INTEL_GPU
#define N_R0_MXFP4 2 // number of rows each subgroup works on
#define N_SG_MXFP4 2 // number of subgroups in a work group
#define N_SIMDWIDTH 16 // subgroup size
#elif defined (ADRENO_GPU)
#define N_R0_MXFP4 2
#define N_SG_MXFP4 2
#define N_SIMDWIDTH 64
#endif
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_mxfp4_f32(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global char * dst,
ulong offsetd,
int ne00,
ulong nb01,
ulong nb02,
ulong nb03,
int ne12,
ulong nb11,
ulong nb12,
ulong nb13,
int ne0,
int ne1,
int r2,
int r3,
local char * shmem
) {
src0 = (global char*)((global char*)src0 + offset0);
src1 = (global char*)((global char*)src1 + offset1);
dst = (global char*)((global char*)dst + offsetd);
local float * shmem_f32 = (local float *) shmem;
int nb = ne00/QK_MXFP4;
int r0 = get_group_id(0);
int r1 = get_group_id(1);
int im = get_group_id(2);
int first_row = (r0 * N_SG_MXFP4 + get_sub_group_id()) * N_R0_MXFP4;
uint i12 = im%ne12;
uint i13 = im/ne12;
ulong offset_src0 = first_row*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
ulong offset_src1 = r1*nb11 + (i12 )*nb12 + (i13 )*nb13;
global block_mxfp4 * x = (global block_mxfp4 *) (src0 + offset_src0);
global float * y = (global float *) (src1 + offset_src1);
const short ix = get_sub_group_local_id()/2; // 0...15
const short it = get_sub_group_local_id()%2; // 0 or 1
shmem_f32[get_sub_group_local_id()] = kvalues_mxfp4_f[get_sub_group_local_id()%16];
barrier(CLK_LOCAL_MEM_FENCE);
float4 yl[4];
float sumf[N_R0_MXFP4] = {0.f};
global float * yb = y + ix * QK_MXFP4 + it * 8;
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
global float4 * y4 = (global float4 *)yb;
yl[0] = y4[0];
yl[1] = y4[4];
yl[2] = y4[1];
yl[3] = y4[5];
for (short row = 0; row < N_R0_MXFP4; row++) {
global block_mxfp4 * xb = x + row*nb + ib;
global uchar * q2 = (global uchar *)(xb->qs + 8*it);
float4 acc1 = yl[0]*(float4)(shmem_f32[q2[0] & 0x0F], shmem_f32[q2[1] & 0x0F], shmem_f32[q2[2] & 0x0F], shmem_f32[q2[3] & 0x0F]);
float4 acc2 = yl[1]*(float4)(shmem_f32[q2[0] >> 4 ], shmem_f32[q2[1] >> 4 ], shmem_f32[q2[2] >> 4 ], shmem_f32[q2[3] >> 4 ]);
float4 acc3 = yl[2]*(float4)(shmem_f32[q2[4] & 0x0F], shmem_f32[q2[5] & 0x0F], shmem_f32[q2[6] & 0x0F], shmem_f32[q2[7] & 0x0F]);
float4 acc4 = yl[3]*(float4)(shmem_f32[q2[4] >> 4 ], shmem_f32[q2[5] >> 4 ], shmem_f32[q2[6] >> 4 ], shmem_f32[q2[7] >> 4 ]);
acc1 = (acc1 + acc3) + (acc2 + acc4);
sumf[row] += e8m0_to_fp32(xb->e) * ((acc1.s0 + acc1.s1) + (acc1.s2 + acc1.s3));
}
yb += (N_SIMDWIDTH/2) * QK_MXFP4;
}
global float * dst_f32 = (global float *) dst + (ulong)im*ne0*ne1 + (ulong)r1*ne0;
for (int row = 0; row < N_R0_MXFP4 && first_row + row < ne0; ++row) {
float sum_all = sub_group_reduce_add(sumf[row]);
if (get_sub_group_local_id() == 0) {
dst_f32[first_row + row] = sum_all;
}
}
}


@ -1127,17 +1127,23 @@ class vk_perf_logger {
return;
}
if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
const uint64_t m = node->src[0]->ne[1];
- const uint64_t n = node->src[1]->ne[1];
+ const uint64_t n = node->ne[1];
const uint64_t k = node->src[1]->ne[0];
- std::string name = ggml_op_name(node->op);
- if (n == 1) {
- name += "_VEC m=" + std::to_string(m) + " k=" + std::to_string(k);
- } else {
- name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
- }
+ const uint64_t batch = node->src[1]->ne[2] * node->src[1]->ne[3];
+ std::string name = ggml_op_name(node->op);
+ if ((node->op == GGML_OP_MUL_MAT && n <= mul_mat_vec_max_cols) ||
+     (node->op == GGML_OP_MUL_MAT_ID && node->src[2]->ne[1] == 1)) {
+ name += "_VEC";
+ }
+ name += " ";
+ name += ggml_type_name(node->src[0]->type);
+ name += " m=" + std::to_string(m) + " n=" + std::to_string(n) + " k=" + std::to_string(k);
+ if (batch > 1) {
+ name += " batch=" + std::to_string(batch);
+ }
timings[name].push_back(time);
- flops[name].push_back(m * n * (k + (k - 1)));
+ flops[name].push_back(m * n * (k + (k - 1)) * batch);
return;
}
if (node->op == GGML_OP_CONV_2D) {
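For context on the flops figure in the logger: each of the m*n output elements of a matrix multiplication costs k multiplications and k - 1 additions, hence m * n * (k + (k - 1)) per batch element, now scaled by batch; for example, m = n = k = 4096 counts 4096 * 4096 * 8191, roughly 1.37e11 flops per batch element.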
@ -8384,7 +8390,7 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context& subctx, cons
(uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
src2 != nullptr, (uint32_t)src0->ne[2], s1, s2,
- sections[0], sections[1], sections[2], sections[3], backprop
+ { sections[0], sections[1], sections[2], sections[3] }, backprop
}, dryrun);
}
@ -8416,7 +8422,7 @@ static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx,
}
static void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGMAX, { (uint32_t)src0->ne[0], 0, 0.0f, 0.0f }, dryrun);
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGMAX, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f }, dryrun);
}
static void ggml_vk_count_equal(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@ -9651,7 +9657,6 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
default:
std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
GGML_ABORT("fatal error");
- return false;
}
vk_context compute_ctx;
@ -10936,7 +10941,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
default:
return false;
}
- break;
case GGML_OP_GLU:
switch (ggml_get_glu_op(op)) {
case GGML_GLU_OP_GEGLU:
@ -10952,7 +10956,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
default:
return false;
}
- break;
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
{
@ -11016,7 +11019,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
}
return true;
- } break;
+ }
case GGML_OP_FLASH_ATTN_EXT:
{
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
@ -11106,7 +11109,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
default:
return false;
}
- } break;
+ }
case GGML_OP_SET_ROWS:
{
switch (op->type) {
@ -11123,7 +11126,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
default:
return false;
}
- } break;
+ }
case GGML_OP_CONT:
case GGML_OP_CPY:
case GGML_OP_DUP:
@ -11175,7 +11178,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
return true;
}
return false;
- } break;
+ }
case GGML_OP_REPEAT:
return ggml_type_size(op->type) == sizeof(float) && ggml_type_size(op->src[0]->type) == sizeof(float);
case GGML_OP_REPEAT_BACK:


@ -5,6 +5,8 @@
#extension GL_EXT_control_flow_attributes : enable
+ #define FLT_MAX 3.402823466e+38F
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
@ -19,19 +21,26 @@ void main() {
const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
const uint col = gl_LocalInvocationID.x;
- if (col >= p.KX) {
+ if (row >= p.KY) {
return;
}
- A_TYPE amax = data_a[row*p.KX + col];
- tmp[col] = col;
+ A_TYPE amax = -FLT_MAX;
+ uint acol = col;
+ if (col < p.KX) {
+ amax = data_a[row*p.KX + col];
+ }
for (uint i = col + BLOCK_SIZE; i < p.KX; i += BLOCK_SIZE) {
A_TYPE val = data_a[row*p.KX + i];
if (val > amax) {
amax = val;
- tmp[col] = i;
+ acol = i;
}
}
+ tmp[col] = acol;
tmpmax[col] = amax;
barrier();


@ -0,0 +1,36 @@
if (DEFINED ZDNN_ROOT)
message(STATUS "zdnn: using ZDNN_ROOT override: ${ZDNN_ROOT}")
set(ZDNN_HINT "${ZDNN_ROOT}")
else()
set(ZDNN_HINT "")
endif()
find_path(ZDNN_INCLUDE
NAMES zdnn.h
HINTS ${ZDNN_HINT} /usr /usr/local
PATH_SUFFIXES include)
if (ZDNN_INCLUDE)
message(STATUS "zdnn: found include: ${ZDNN_INCLUDE}")
else()
message(FATAL_ERROR "zdnn: include directory not found, please set ZDNN_ROOT to the proper path if necessary")
endif()
find_library(ZDNN_LIB
NAMES zdnn
HINTS ${ZDNN_HINT} /usr /usr/local
PATH_SUFFIXES lib lib64)
if (ZDNN_LIB)
message(STATUS "zdnn: found library: ${ZDNN_LIB}")
else()
message(FATAL_ERROR "zdnn: library not found, please set ZDNN_ROOT to the proper path if necessary")
endif()
file(GLOB GGML_SOURCES_ZDNN "*.c" "*.cpp")
file(GLOB GGML_HEADERS_ZDNN "*.h" "*.hpp")
ggml_add_backend_library(ggml-zdnn ${GGML_HEADERS_ZDNN} ${GGML_SOURCES_ZDNN})
target_link_libraries(ggml-zdnn PRIVATE ${ZDNN_LIB})
target_include_directories(ggml-zdnn PRIVATE ${ZDNN_INCLUDE})
target_link_directories(ggml-zdnn PRIVATE ${ZDNN_LIB})
target_compile_definitions(ggml-zdnn PRIVATE GGML_USE_ZDNN)


@ -0,0 +1,97 @@
#ifndef GGML_ZDNN_IMPL
#define GGML_ZDNN_IMPL
#include "zdnn.h"
#include "ggml.h"
#include "ggml-zdnn.h"
#include <vector>
#include <memory>
#include <vecintrin.h>
#define GGML_ZDNN_NAME "zDNN"
#define GGML_ZDNN_VERSION ZDNN_VERNUM
#define vec_neg(a) (-(a)) // Vector Negate
#define vec_add(a, b) ((a) + (b)) // Vector Add
#define vec_sub(a, b) ((a) - (b)) // Vector Subtract
#define vec_mul(a, b) ((a) * (b)) // Vector Multiply
#define vec_div(a, b) ((a) / (b)) // Vector Divide
#define vec_sl(a, b) ((a) << (b)) // Vector Shift Left
#define vec_sra(a, b) ((a) >> (b)) // Vector Shift Right Algebraic
#define vec_sr(a, b) ((a) >> (b)) // Vector Shift Right
#define vec_slo(a, b) vec_slb(a, (b) << 64) // Vector Shift Left by Octet
#define vec_sro(a, b) vec_srb(a, (b) << 64) // Vector Shift Right by Octet
#ifndef vec_and
#define vec_and(a, b) ((a) & (b)) // Vector AND
#endif
#ifndef vec_or
#define vec_or(a, b) ((a) | (b)) // Vector OR
#endif
#ifndef vec_xor
#define vec_xor(a, b) ((a) ^ (b)) // Vector XOR
#endif
typedef signed char char8x16_t __attribute__((vector_size(16)));
typedef unsigned char uchar8x16_t __attribute__((vector_size(16)));
typedef int8_t int8x16_t __attribute__((vector_size(16)));
typedef int16_t int16x8_t __attribute__((vector_size(16)));
typedef int32_t int32x4_t __attribute__((vector_size(16)));
typedef uint8_t uint8x16_t __attribute__((vector_size(16)));
typedef uint16_t uint16x8_t __attribute__((vector_size(16)));
typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
typedef float float32x4_t __attribute__((vector_size(16)));
typedef double double64x2_t __attribute__((vector_size(16)));
typedef signed long long long64x2_t __attribute__((vector_size(16)));
typedef unsigned long long ulong64x2_t __attribute__((vector_size(16)));
#define ZDNN_CHECK(stmt) \
do { \
zdnn_status status = (stmt); \
GGML_ASSERT(status == ZDNN_OK); \
} while (0);
struct ggml_backend_zdnn_device_context {
int zdnn_device;
int zdnn_device_ref_count;
bool has_parmblkformat_0;
bool has_parmblkformat_1;
size_t max_size;
char name[128];
};
struct ggml_backend_zdnn_context {
int device;
ggml_cgraph * gf;
};
struct ggml_backend_zdnn_buffer {
void * data;
size_t size;
zdnn_tensor_desc pre_tfm_desc;
zdnn_tensor_desc tfm_desc;
zdnn_ztensor ztensor;
char name[GGML_MAX_NAME];
};
struct ggml_backend_zdnn_buffer_context {
void * all_data;
size_t all_size;
bool owned;
int n_buffers;
std::vector<std::unique_ptr<ggml_backend_zdnn_buffer>> buffers;
};
#endif // GGML_ZDNN_IMPL


@ -0,0 +1,846 @@
#include "zdnn.h"
#include "ggml-zdnn.h"
#include "ggml-zdnn-impl.h"
#include "ggml-impl.h"
#include "ggml-backend-impl.h"
#include <vector>
#include <memory>
#include <csignal>
#include <unistd.h>
inline zdnn_data_types ggml_zdnn_type_mapping(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return FP32;
case GGML_TYPE_F16:
return FP16;
case GGML_TYPE_BF16:
return BFLOAT;
case GGML_TYPE_I8:
return INT8;
case GGML_TYPE_I32:
return INT32;
case GGML_TYPE_Q8_0:
return INT8;
default:
GGML_ABORT("%s: fatal: unable to determine zTensor data type",
__func__);
break;
}
}
inline void ggml_zdnn_create_tensor(zdnn_tensor_desc & pre_tfm_desc,
zdnn_tensor_desc & tfm_desc,
zdnn_ztensor & ztensor,
const ggml_tensor * src,
const int64_t * ne,
const zdnn_data_layouts layout) {
zdnn_init_pre_transformed_desc(
layout,
ggml_zdnn_type_mapping(src->type),
&pre_tfm_desc,
ne[3], ne[2], ne[1], ne[0]
);
ZDNN_CHECK(zdnn_generate_transformed_desc(&pre_tfm_desc, &tfm_desc));
ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&pre_tfm_desc, &tfm_desc, &ztensor));
}
inline void ggml_zdnn_load_tensor(zdnn_ztensor & ztensor,
void * buffer) {
ZDNN_CHECK(zdnn_transform_ztensor(&ztensor, buffer));
}
inline void ggml_zdnn_init_tensor(ggml_backend_zdnn_buffer * buffer, const ggml_tensor * tensor) {
switch (tensor->op) {
case GGML_OP_MUL_MAT:
{
zdnn_init_pre_transformed_desc(
ZDNN_2D,
ggml_zdnn_type_mapping(tensor->type),
&buffer->pre_tfm_desc,
tensor->ne[1], tensor->ne[0]
);
} break;
default:
{
// For 4D tensors, GGML uses NCHW layout. However, because zDNN
// automatically transforms everything to NHWC, we will use it
// directly to avoid the performance penalty changing the
// layout and reshaping the tensor.
zdnn_init_pre_transformed_desc(
ZDNN_NHWC,
ggml_zdnn_type_mapping(tensor->type),
&buffer->pre_tfm_desc,
tensor->ne[3], tensor->ne[2], tensor->ne[1], tensor->ne[0]
);
// TODO: Consider adding a ggml check.
// TODO: If tensor = 4D, use ZDNN_NCHW by default.
// TODO: If tensor = 2D, use ZDNN_NHWC by default.
} break;
}
ZDNN_CHECK(zdnn_generate_transformed_desc(&buffer->pre_tfm_desc, &buffer->tfm_desc));
ZDNN_CHECK(zdnn_init_ztensor_with_malloc(&buffer->pre_tfm_desc, &buffer->tfm_desc, &buffer->ztensor));
}
static void ggml_zdnn_mul_mat_op(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_TENSOR_BINARY_OP_LOCALS;
const enum ggml_type type = src0->type;
GGML_ASSERT(ne0 == ne01);
GGML_ASSERT(ne1 == ne11);
GGML_ASSERT(ne2 == ne12);
GGML_ASSERT(ne3 == ne13);
// we don't support permuted src0 or src1
GGML_ASSERT(nb00 == ggml_type_size(type));
GGML_ASSERT(nb10 == ggml_type_size(src1->type));
// dst cannot be transposed or permuted
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(nb0 <= nb1);
GGML_ASSERT(nb1 <= nb2);
GGML_ASSERT(nb2 <= nb3);
const ggml_tensor * weights = src0;
const ggml_tensor * inputs = src1;
ggml_tensor * output = dst;
ggml_backend_zdnn_buffer * weights_extra = (ggml_backend_zdnn_buffer *)weights->extra;
ggml_backend_zdnn_buffer * inputs_extra = (ggml_backend_zdnn_buffer *)inputs->extra;
ggml_backend_zdnn_buffer * output_extra = (ggml_backend_zdnn_buffer *)output->extra;
zdnn_tensor_desc ptd_bias, td_bias;
zdnn_ztensor zt_bias;
const int64_t weights_rows = ne01;
const int64_t weights_cols = ne00;
const int64_t inputs_rows = ne11;
const int64_t inputs_cols = ne10;
assert(inputs_cols == weights_cols);
const int64_t output_rows = ne1;
const int64_t output_cols = ne0;
const int64_t bias_dim [GGML_MAX_DIMS] = { 1, 1, 1, output_cols };
ggml_zdnn_create_tensor(ptd_bias, td_bias, zt_bias, output, bias_dim, ZDNN_1D);
void * bias_data = (void *)calloc(ne0, ggml_element_size(output));
if (weights_extra->ztensor.is_transformed == false) ggml_zdnn_load_tensor(weights_extra->ztensor, weights->data);
if (inputs_extra->ztensor.is_transformed == false) ggml_zdnn_load_tensor(inputs_extra->ztensor, inputs->data);
ggml_zdnn_load_tensor(zt_bias, bias_data);
// GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n",
// __func__, weights_extra->name,
// weights->ne[3], weights->ne[2], weights->ne[1], weights->ne[0],
// weights_extra->pre_tfm_desc.dim1,
// weights_extra->pre_tfm_desc.dim2,
// weights_extra->pre_tfm_desc.dim3,
// weights_extra->pre_tfm_desc.dim4);
// GGML_LOG_INFO("%s: tensor '%s' tensor dimensions: [%ld, %ld, %ld, %ld] pre_tfm_desc dimensions: [%ld, %ld, %ld, %ld]\n",
// __func__, inputs_extra->name,
// inputs->ne[3], inputs->ne[2], inputs->ne[1], inputs->ne[0],
// inputs_extra->pre_tfm_desc.dim1,
// inputs_extra->pre_tfm_desc.dim2,
// inputs_extra->pre_tfm_desc.dim3,
// inputs_extra->pre_tfm_desc.dim4);
GGML_ASSERT(weights_extra->pre_tfm_desc.dim1 == weights->ne[0] && "weights_extra->pre_tfm_desc.dim1 must match weights->ne[0]");
GGML_ASSERT(weights_extra->pre_tfm_desc.dim2 == weights->ne[1] && "weights_extra->pre_tfm_desc.dim2 must match weights->ne[1]");
GGML_ASSERT(inputs_extra->pre_tfm_desc.dim1 == inputs->ne[0] && "inputs_extra->pre_tfm_desc.dim1 must match inputs->ne[0]");
GGML_ASSERT(inputs_extra->pre_tfm_desc.dim2 == inputs->ne[1] && "inputs_extra->pre_tfm_desc.dim2 must match inputs->ne[1]");
ZDNN_CHECK(zdnn_matmul_transpose_op(&inputs_extra->ztensor, &weights_extra->ztensor, &zt_bias,
false, true, MATMUL_OP_ADDITION, &output_extra->ztensor));
// TODO: Remove in the future as we are currently DLF16 -> FP32 then in the next op, FP32 -> DLF16 again. Inefficient.
ZDNN_CHECK(zdnn_transform_origtensor(&output_extra->ztensor, output->data));
ZDNN_CHECK(zdnn_free_ztensor_buffer(&zt_bias));
free(bias_data);
}
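Reading of the call above (an interpretation, not stated in the diff): with weights of shape ne01 x ne00 and inputs of shape ne11 x ne10, where ne10 == ne00 is asserted, the transpose flags (false, true) make zdnn_matmul_transpose_op compute output = inputs * transpose(weights) + bias, with the bias tensor all zeros here, which matches GGML's dst = src1 * transpose(src0) convention for GGML_OP_MUL_MAT.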
static void ggml_zdnn_mul_mat_dispatch(ggml_backend_zdnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
bool use_mul_mat_vec =
(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F16)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
bool use_mul_mat_vec_q =
ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
bool use_mul_mat_q =
ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
// debug helpers
// GGML_LOG_INFO("%s: use_mul_mat_vec = %d\n", __func__, use_mul_mat_vec);
// GGML_LOG_INFO("%s: use_mul_mat_vec_q = %d\n", __func__, use_mul_mat_vec_q);
// GGML_LOG_INFO("%s: use_mul_mat_q = %d\n", __func__, use_mul_mat_q);
// GGML_LOG_INFO("%s: src0: %8d %8d %8d %8d\n", __func__, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
// GGML_LOG_INFO("%s: %8d %8d %8d %8d\n", __func__, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
// GGML_LOG_INFO("%s: src1: %8d %8d %8d %8d\n", __func__, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
// GGML_LOG_INFO("%s: %8d %8d %8d %8d\n", __func__, src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
// GGML_LOG_INFO("%s: src0 is contiguous %d, transposed %d, type = %s, name = %s\n", __func__, ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
// GGML_LOG_INFO("%s: src1 is contiguous %d, transposed %d, type = %s, name = %s\n", __func__, ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16
&& !ggml_is_transposed(src0) && !ggml_is_transposed(src1)
&& src1->ne[2] * src1->ne[3] > 1) {
// general KQ + KQV multi-batch
GGML_LOG_INFO("%s: using zdnn_mul_mat_batched for KQ + KQV multi-batch\n", __func__);
// ggml_zdnn_mul_mat_batched(ctx, src0, src1, dst);
} else if (use_mul_mat_vec) {
GGML_LOG_INFO("%s: using zdnn_op_mul_mat_vec for vector multiplication\n", __func__);
// ggml_zdnn_op_mul_mat(ctx, src0, src1, dst, ggml_zdnn_op_mul_mat_vec, nullptr);
} else if (use_mul_mat_vec_q) {
GGML_LOG_INFO("%s: using zdnn_op_mul_mat_vec_q for quantized vector multiplication\n", __func__);
// ggml_zdnn_op_mul_mat(ctx, src0, src1, dst, ggml_zdnn_op_mul_mat_vec_q, ggml_zdnn_quantize_row_q8_1);
} else if (use_mul_mat_q) {
GGML_LOG_INFO("%s: using zdnn_op_mul_mat_q for quantized matrix multiplication\n", __func__);
// ggml_zdnn_op_mul_mat(ctx, src0, src1, dst, ggml_zdnn_op_mul_mat_q, ggml_zdnn_quantize_mmq_q8_1);
} else {
// GGML_LOG_INFO("%s: using zdnn_op_mul_mat for general matrix multiplication\n", __func__);
ggml_zdnn_mul_mat_op(ctx, src0, src1, dst);
}
}
static bool ggml_zdnn_compute_forward(ggml_backend_zdnn_context * ctx, ggml_tensor * dst) {
switch (dst->op) {
case GGML_OP_MUL_MAT:
ggml_zdnn_mul_mat_dispatch(ctx, dst->src[0], dst->src[1], dst);
break;
default:
return false;
}
return true;
}
static enum ggml_status ggml_zdnn_graph_compute(ggml_backend_t backend, ggml_cgraph * gf) {
ggml_backend_zdnn_context * ctx = ( ggml_backend_zdnn_context *)backend->context;
ggml_backend_zdnn_device_context * ctx_dev = (ggml_backend_zdnn_device_context *)backend->device->context;
ctx->gf = gf;
for (int i = 0; i < gf->n_nodes; i++) {
ggml_tensor * node = gf->nodes[i];
if (ggml_is_empty(node)
|| node->op == GGML_OP_NONE
|| node->op == GGML_OP_RESHAPE
|| node->op == GGML_OP_VIEW
|| node->op == GGML_OP_PERMUTE
|| node->op == GGML_OP_TRANSPOSE) {
continue;
}
bool ok = ggml_zdnn_compute_forward(ctx, node);
if (!ok) {
GGML_LOG_ERROR("%s: unsupported op %s (%s)\n",
__func__, node->name, ggml_op_name(node->op));
}
GGML_ASSERT(ok);
}
return GGML_STATUS_SUCCESS;
}
static bool ggml_zdnn_supports_op(const ggml_backend_zdnn_device_context * ctx_dev, const ggml_tensor * op) {
switch (op->op) {
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_TRANSPOSE:
case GGML_OP_PERMUTE:
return true;
case GGML_OP_MUL_MAT:
{
const ggml_tensor * src0 = op->src[0];
const ggml_tensor * src1 = op->src[1];
const int64_t ne10 = src1->ne[0];
const int64_t ne0 = op->ne[0];
const int64_t ne1 = op->ne[1];
const int64_t max_batch = ctx_dev->max_size;
return ggml_is_matrix(src0) &&
ggml_is_matrix(src1) &&
ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
src0->view_src == nullptr && src1->view_src == nullptr &&
src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 &&
(ne0 <= max_batch && ne1 <= max_batch && ne10 <= max_batch);
} break;
default:
return false;
}
}
////////////////////////////////////////////////////////////////////////////////
//
// globals
//
// initialised in ggml_backend_zdnn_reg
static ggml_backend_reg g_ggml_backend_zdnn_reg;
static ggml_backend_device g_ggml_backend_zdnn_device;
static ggml_backend_zdnn_device_context g_ggml_ctx_dev_main = {
/* .zdnn_device = */ 0,
/* .zdnn_device_ref_count = */ 0,
/* .has_parmblkformat_0 = */ false,
/* .has_parmblkformat_1 = */ false,
/* .max_size = */ 0,
/* .name = */ "",
};
static int ggml_backend_zdnn_device_acq(ggml_backend_zdnn_device_context * ctx) {
assert(ctx != NULL);
if (ctx->zdnn_device == 0) {
ctx->zdnn_device = 1;
}
if (ctx->zdnn_device >= 1) {
ctx->has_parmblkformat_0 = zdnn_is_nnpa_parmblk_fmt_installed(1, NNPA_PARMBLKFORMAT_0);
ctx->has_parmblkformat_1 = zdnn_is_nnpa_parmblk_fmt_installed(1, NNPA_PARMBLKFORMAT_1);
ctx->max_size = zdnn_get_nnpa_max_dim_idx_size();
strncpy(ctx->name, GGML_ZDNN_NAME, sizeof(ctx->name) - 1);
}
ctx->zdnn_device_ref_count++;
return ctx->zdnn_device;
}
static void ggml_backend_zdnn_device_rel(ggml_backend_zdnn_device_context * ctx) {
assert(ctx != NULL);
assert(ctx->zdnn_device_ref_count > 0);
ctx->zdnn_device_ref_count--;
if (ctx->zdnn_device_ref_count == 0) {
if (ctx->zdnn_device >= 0) {
ctx->zdnn_device = 0;
}
}
}
static ggml_backend_zdnn_context * ggml_zdnn_init(ggml_backend_dev_t dev) {
GGML_LOG_INFO("%s: allocating\n", __func__);
GGML_LOG_INFO("%s: found 1 device\n", __func__);
#ifdef STATIC_LIB
zdnn_init();
#endif
ggml_backend_zdnn_context * ctx = new ggml_backend_zdnn_context();
ggml_backend_zdnn_device_context * ctx_dev = (ggml_backend_zdnn_device_context *)dev->context;
int device = 1;
GGML_LOG_INFO("%s: picking default device: %s\n", __func__, ctx_dev->name);
ctx->device = device;
GGML_LOG_INFO("%s: NNPA name: %s\n", __func__, ctx_dev->name);
GGML_LOG_INFO("%s: NNPA_PARMBLKFORMAT_0 = %s\n", __func__, ctx_dev->has_parmblkformat_0 ? "true" : "false");
GGML_LOG_INFO("%s: NNPA_PARMBLKFORMAT_1 = %s\n", __func__, ctx_dev->has_parmblkformat_1 ? "true" : "false");
ctx->gf = nullptr;
return ctx;
}
static void ggml_zdnn_free(ggml_backend_zdnn_context * ctx) {
GGML_LOG_INFO("%s: deallocating\n", __func__);
delete ctx;
}
//
// backend interface
//
static void ggml_backend_zdnn_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_zdnn_buffer_context * ctx = (ggml_backend_zdnn_buffer_context *)buffer->context;
for (int i = 0; i < ctx->n_buffers; i++) {
if (ctx->buffers[i]->ztensor.buffer != NULL && ctx->buffers[i]->ztensor.is_transformed) {
ZDNN_CHECK(zdnn_free_ztensor_buffer(&ctx->buffers[i]->ztensor));
}
}
delete ctx;
}
static void * ggml_backend_zdnn_buffer_get_base(ggml_backend_buffer_t buffer) {
ggml_backend_zdnn_buffer_context * ctx = (ggml_backend_zdnn_buffer_context *)buffer->context;
return ctx->all_data;
}
static enum ggml_status ggml_backend_zdnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
if (tensor->view_src != NULL) {
assert(tensor->view_src->buffer->buft == buffer->buft);
return GGML_STATUS_SUCCESS;
}
ggml_backend_zdnn_buffer_context * ctx = (ggml_backend_zdnn_buffer_context *)buffer->context;
const int64_t tsize = ggml_nbytes(tensor);
int buffer_idx = ctx->n_buffers;
std::unique_ptr<ggml_backend_zdnn_buffer> zdnn_buffer = std::make_unique<ggml_backend_zdnn_buffer>();
zdnn_buffer->data = tensor->data;
zdnn_buffer->size = tsize;
strncpy(zdnn_buffer->name, tensor->name, GGML_MAX_NAME - 1);
ggml_zdnn_init_tensor(zdnn_buffer.get(), tensor);
tensor->extra = zdnn_buffer.get();
ctx->buffers.push_back(std::move(zdnn_buffer));
ctx->n_buffers++;
// GGML_LOG_INFO("%s: initialised tensor '%s' in buffer %d, size = %8.2f MiB\n",
// __func__, tensor->name, buffer_idx, tsize);
return GGML_STATUS_SUCCESS;
}
static void ggml_backend_zdnn_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
memset((char *)tensor->data + offset, value, size);
GGML_UNUSED(buffer);
}
static void ggml_backend_zdnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
memcpy((char *)tensor->data + offset, data, size);
GGML_UNUSED(buffer);
}
static void ggml_backend_zdnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
memcpy(data, (const char *)tensor->data + offset, size);
GGML_UNUSED(buffer);
}
static void ggml_backend_zdnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
ggml_backend_zdnn_buffer_context * ctx = (ggml_backend_zdnn_buffer_context *)buffer->context;
memset(ctx->all_data, value, ctx->all_size);
}
static ggml_backend_buffer_i ggml_backend_zdnn_buffer_i = {
/* .free_buffer = */ ggml_backend_zdnn_buffer_free_buffer,
/* .get_base = */ ggml_backend_zdnn_buffer_get_base,
/* .init_tensor = */ ggml_backend_zdnn_buffer_init_tensor,
/* .memset_tensor = */ ggml_backend_zdnn_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_zdnn_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_zdnn_buffer_get_tensor,
/* .cpy_tensor = */ NULL,
/* .clear = */ ggml_backend_zdnn_buffer_clear,
/* .reset = */ NULL,
};
//
// default buffer type
//
static const char * ggml_backend_zdnn_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return GGML_ZDNN_NAME;
GGML_UNUSED(buft);
}
static ggml_backend_buffer_t ggml_backend_zdnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
ggml_backend_zdnn_buffer_context * ctx = new ggml_backend_zdnn_buffer_context();
const size_t size_page = sysconf(_SC_PAGESIZE);
size_t size_aligned = size;
if ((size_aligned % size_page) != 0) {
size_aligned += size_page - (size_aligned % size_page);
}
ggml_backend_zdnn_device_context * ctx_dev = (ggml_backend_zdnn_device_context *)buft->device->context;
GGML_ASSERT(ctx_dev->zdnn_device >= 0);
int device = ctx_dev->zdnn_device; GGML_UNUSED(device);
ctx->all_data = ggml_aligned_malloc(size_aligned);
ctx->all_size = size_aligned;
ctx->owned = true;
ctx->n_buffers = 1;
if (ctx->all_data != NULL) {
std::unique_ptr<ggml_backend_zdnn_buffer> zdnn_buffer = std::make_unique<ggml_backend_zdnn_buffer>();
zdnn_buffer->data = ctx->all_data;
zdnn_buffer->size = size_aligned;
ctx->buffers.push_back(std::move(zdnn_buffer));
}
if (size_aligned > 0 && (ctx->all_data == NULL)) {
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f\n",
__func__, size_aligned / 1024.0 / 1024.0);
delete ctx;
return NULL;
}
return ggml_backend_buffer_init(buft, ggml_backend_zdnn_buffer_i, ctx, size);
}
static size_t ggml_backend_zdnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return 256;
GGML_UNUSED(buft);
}
static bool ggml_backend_zdnn_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return true;
GGML_UNUSED(buft);
}
ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void) {
static ggml_backend_buffer_type ggml_backend_buffer_type_zdnn = {
/* .iface = */ {
/* .get_name = */ ggml_backend_zdnn_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_zdnn_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_zdnn_buffer_type_get_alignment,
/* .get_max_size = */ NULL,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .is_host = */ ggml_backend_zdnn_buffer_type_is_host,
},
/* .device = */ &g_ggml_backend_zdnn_device,
/* .context = */ NULL,
};
return &ggml_backend_buffer_type_zdnn;
}
static const char * ggml_backend_zdnn_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
return GGML_ZDNN_NAME "_Mapped";
GGML_UNUSED(buft);
}
static ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_from_ptr_type(void) {
static ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_zdnn = {
/* .iface = */ {
/* .get_name = */ ggml_backend_zdnn_buffer_from_ptr_type_get_name,
/* .alloc_buffer = */ ggml_backend_zdnn_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_zdnn_buffer_type_get_alignment,
/* .get_max_size = */ NULL,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .is_host = */ ggml_backend_zdnn_buffer_type_is_host,
},
/* .device = */ &g_ggml_backend_zdnn_device,
/* .context = */ NULL,
};
return &ggml_backend_buffer_from_ptr_type_zdnn;
}
//
// backend
//
static const char * ggml_backend_zdnn_name(ggml_backend_t backend) {
return GGML_ZDNN_NAME;
GGML_UNUSED(backend);
}
static void ggml_backend_zdnn_free(ggml_backend_t backend) {
ggml_backend_zdnn_context * ctx = (ggml_backend_zdnn_context *)backend->context;
ggml_zdnn_free(ctx);
free(backend);
}
static enum ggml_status ggml_backend_zdnn_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
return ggml_zdnn_graph_compute(backend, cgraph);
}
static ggml_backend_i ggml_backend_zdnn_i = {
/* .get_name = */ ggml_backend_zdnn_name,
/* .free = */ ggml_backend_zdnn_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_zdnn_graph_compute,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
static ggml_guid_t ggml_backend_zdnn_guid(void) {
static const char * guid_str = "IBM-ZDNN-ACCELER";
return reinterpret_cast<ggml_guid_t>((void *)guid_str);
}
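// Example usage (sketch, not part of this file): a host application would
// typically create the backend through the generic ggml-backend API and run a
// graph roughly like this, assuming a ggml_context `ctx` that owns the tensors
// and an already-built graph `gf`:
//
//   ggml_backend_t backend = ggml_backend_zdnn_init();
//   if (backend != NULL) {
//       ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
//       ggml_backend_graph_compute(backend, gf);
//       ggml_backend_buffer_free(buf);
//       ggml_backend_free(backend);
//   }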
// TODO: remove in the future
ggml_backend_t ggml_backend_zdnn_init(void) {
ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_zdnn_reg(), 0);
ggml_backend_zdnn_context * ctx = ggml_zdnn_init(dev);
if (ctx == NULL) {
GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
return NULL;
}
ggml_backend_t backend = (ggml_backend_t)malloc(sizeof(ggml_backend));
*backend = (ggml_backend) {
/* .guid = */ ggml_backend_zdnn_guid(),
/* .iface = */ ggml_backend_zdnn_i,
/* .device = */ dev,
/* .context = */ ctx,
};
return backend;
}
bool ggml_backend_is_zdnn(ggml_backend_t backend) {
return backend != NULL &&
ggml_guid_matches(backend->guid, ggml_backend_zdnn_guid());
GGML_UNUSED(backend);
}
//
// backend device
//
static const char * ggml_backend_zdnn_device_get_name(ggml_backend_dev_t dev) {
return GGML_ZDNN_NAME;
GGML_UNUSED(dev);
}
static const char * ggml_backend_zdnn_device_get_description(ggml_backend_dev_t dev) {
return "IBM Z Neural Network Processing Assist (NNPA)";
}
static void ggml_backend_zdnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
*free = 0;
*total = 0;
}
static enum ggml_backend_dev_type ggml_backend_zdnn_device_get_type(ggml_backend_dev_t dev) {
return GGML_BACKEND_DEVICE_TYPE_ACCEL;
GGML_UNUSED(dev);
}
static void ggml_backend_zdnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
props->name = ggml_backend_zdnn_device_get_name(dev);
props->description = ggml_backend_zdnn_device_get_description(dev);
props->type = ggml_backend_zdnn_device_get_type(dev);
ggml_backend_zdnn_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = (ggml_backend_dev_caps) {
/* .async = */ false,
/* .host_buffer = */ false,
/* .buffer_from_host_ptr = */ true,
/* .events = */ false,
};
}
static ggml_backend_t ggml_backend_zdnn_device_init(ggml_backend_dev_t dev, const char * params) {
ggml_backend_zdnn_context * ctx = ggml_zdnn_init(dev);
if (ctx == NULL) {
GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
return NULL;
}
ggml_backend_t backend = (ggml_backend *)malloc(sizeof(ggml_backend));
*backend = (ggml_backend) {
/* .guid = */ ggml_backend_zdnn_guid(),
/* .iface = */ ggml_backend_zdnn_i,
/* .device = */ dev,
/* .context = */ ctx,
};
return backend;
GGML_UNUSED(params);
}
static ggml_backend_buffer_type_t ggml_backend_zdnn_device_get_buffer_type(ggml_backend_dev_t dev) {
return ggml_backend_zdnn_buffer_type();
GGML_UNUSED(dev);
}
static ggml_backend_buffer_t ggml_backend_zdnn_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
ggml_backend_zdnn_buffer_context * ctx = new ggml_backend_zdnn_buffer_context();
ctx->all_data = ptr;
ctx->all_size = size;
ctx->owned = false;
ctx->n_buffers = 0;
const size_t size_page = sysconf(_SC_PAGESIZE);
// page-align the data ptr
{
const uintptr_t offs = (uintptr_t) ptr % size_page;
ptr = (void *)((char *)ptr - offs);
size += offs;
}
size_t size_aligned = size;
if ((size_aligned % size_page) != 0) {
size_aligned += size_page - (size_aligned % size_page);
}
ggml_backend_zdnn_device_context * ctx_dev = (ggml_backend_zdnn_device_context *)dev->context;
GGML_ASSERT(ctx_dev->zdnn_device >= 0);
int device = ctx_dev->zdnn_device; GGML_UNUSED(device);
std::unique_ptr<ggml_backend_zdnn_buffer> zdnn_buffer = std::make_unique<ggml_backend_zdnn_buffer>();
zdnn_buffer->data = ptr;
zdnn_buffer->size = size;
ctx->buffers.push_back(std::move(zdnn_buffer));
GGML_LOG_INFO("%s: allocated buffer, size = %8.2f MiB\n",
__func__, size_aligned / 1024.0 / 1024.0);
++ctx->n_buffers;
return ggml_backend_buffer_init(ggml_backend_zdnn_buffer_from_ptr_type(), ggml_backend_zdnn_buffer_i, ctx, size);
}
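// Example (sketch): this is the path behind the generic
// ggml_backend_dev_buffer_from_host_ptr() API, which wraps an existing host
// allocation (e.g. mmap-ed model weights) without copying it:
//
//   ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_zdnn_reg(), 0);
//   ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, ptr, size, size);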
static bool ggml_backend_zdnn_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
ggml_backend_zdnn_device_context * ctx_dev = (ggml_backend_zdnn_device_context *) dev->context;
return ggml_zdnn_supports_op(ctx_dev, op);
}
static bool ggml_backend_zdnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
return
buft->iface.get_name == ggml_backend_zdnn_buffer_type_get_name ||
buft->iface.get_name == ggml_backend_zdnn_buffer_from_ptr_type_get_name;
GGML_UNUSED(dev);
}
static ggml_backend_device_i ggml_backend_zdnn_device_i = {
/* .get_name = */ ggml_backend_zdnn_device_get_name,
/* .get_description = */ ggml_backend_zdnn_device_get_description,
/* .get_memory = */ ggml_backend_zdnn_device_get_memory,
/* .get_type = */ ggml_backend_zdnn_device_get_type,
/* .get_props = */ ggml_backend_zdnn_device_get_props,
/* .init_backend = */ ggml_backend_zdnn_device_init,
/* .get_buffer_type = */ ggml_backend_zdnn_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL,
/* .buffer_from_host_ptr = */ ggml_backend_zdnn_device_buffer_from_ptr,
/* .supports_op = */ ggml_backend_zdnn_device_supports_op,
/* .supports_buft = */ ggml_backend_zdnn_device_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
};
//
// backend registry
//
static const char * ggml_backend_zdnn_reg_get_name(ggml_backend_reg_t reg) {
return GGML_ZDNN_NAME;
GGML_UNUSED(reg);
}
static size_t ggml_backend_zdnn_reg_device_count(ggml_backend_reg_t reg) {
if (!zdnn_is_nnpa_installed()) {
return 0;
}
return 1;
GGML_UNUSED(reg);
}
static ggml_backend_dev_t ggml_backend_zdnn_reg_device_get(ggml_backend_reg_t reg, size_t index) {
GGML_ASSERT(index == 0);
return &g_ggml_backend_zdnn_device;
GGML_UNUSED(reg);
GGML_UNUSED(index);
}
static ggml_backend_feature g_ggml_backend_zdnn_features[] = {
{ "NNPA", zdnn_is_nnpa_installed() ? "1" : "0" },
{ "NNPA_PARMBLKFORMAT_0", zdnn_is_nnpa_parmblk_fmt_installed(1, NNPA_PARMBLKFORMAT_0) ? "1" : "0" },
{ "NNPA_PARMBLKFORMAT_1", zdnn_is_nnpa_parmblk_fmt_installed(1, NNPA_PARMBLKFORMAT_1) ? "1" : "0" },
{ NULL, NULL },
};
static ggml_backend_feature * ggml_backend_zdnn_get_features(ggml_backend_reg_t reg) {
return g_ggml_backend_zdnn_features;
GGML_UNUSED(reg);
}
static void * ggml_backend_zdnn_get_proc_address(ggml_backend_reg_t reg, const char * name) {
if (strcmp(name, "ggml_backend_get_features") == 0) {
return (void *) ggml_backend_zdnn_get_features;
}
return NULL;
GGML_UNUSED(reg);
}
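// Example (sketch): callers can query the feature list through the generic
// registry API:
//
//   ggml_backend_reg_t reg = ggml_backend_zdnn_reg();
//   auto * get_features = (ggml_backend_get_features_t)
//       ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
//   for (ggml_backend_feature * f = get_features(reg); f->name != NULL; f++) {
//       GGML_LOG_INFO("%s = %s\n", f->name, f->value);
//   }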
static ggml_backend_reg_i ggml_backend_zdnn_reg_i = {
/* .get_name = */ ggml_backend_zdnn_reg_get_name,
/* .get_device_count = */ ggml_backend_zdnn_reg_device_count,
/* .get_device = */ ggml_backend_zdnn_reg_device_get,
/* .get_proc_address = */ ggml_backend_zdnn_get_proc_address,
};
static void ggml_zdnn_cleanup(void) {
ggml_backend_zdnn_device_rel(&g_ggml_ctx_dev_main);
}
// TODO: make thread-safe
ggml_backend_reg_t ggml_backend_zdnn_reg(void) {
ggml_backend_zdnn_device_acq(&g_ggml_ctx_dev_main);
// register cleanup callback
atexit(ggml_zdnn_cleanup);
{
g_ggml_backend_zdnn_reg = (ggml_backend_reg) {
/* .api_version = */ GGML_ZDNN_VERSION,
/* .iface = */ ggml_backend_zdnn_reg_i,
/* .context = */ NULL,
};
g_ggml_backend_zdnn_device = (ggml_backend_device) {
/* .iface = */ ggml_backend_zdnn_device_i,
/* .reg = */ &g_ggml_backend_zdnn_reg,
/* .context = */ &g_ggml_ctx_dev_main,
};
return &g_ggml_backend_zdnn_reg;
}
}
GGML_BACKEND_DL_IMPL(ggml_backend_zdnn_reg)


@@ -0,0 +1,331 @@
{#-
In addition to the normal inputs of `messages` and `tools`, this template also accepts the
following kwargs:
- "builtin_tools": A list, can contain "browser" and/or "python".
- "model_identity": A string that optionally describes the model identity.
- "reasoning_effort": A string that describes the reasoning effort, defaults to "medium".
#}
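{#- Example (illustrative values only): a caller could supply
      {"model_identity": "You are a helpful assistant.",
       "reasoning_effort": "high",
       "builtin_tools": ["browser", "python"]}
    as template kwargs; any key left out falls back to the defaults applied further below. -#}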
{#- Tool Definition Rendering ============================================== #}
{%- macro render_typescript_type(param_spec, required_params, is_nullable=false) -%}
{%- if param_spec.type == "array" -%}
{%- if param_spec['items'] -%}
{%- if param_spec['items']['type'] == "string" -%}
{{- "string[]" }}
{%- elif param_spec['items']['type'] == "number" -%}
{{- "number[]" }}
{%- elif param_spec['items']['type'] == "integer" -%}
{{- "number[]" }}
{%- elif param_spec['items']['type'] == "boolean" -%}
{{- "boolean[]" }}
{%- else -%}
{%- set inner_type = render_typescript_type(param_spec['items'], required_params) -%}
{%- if inner_type == "object | object" or inner_type|length > 50 -%}
{{- "any[]" }}
{%- else -%}
{{- inner_type + "[]" }}
{%- endif -%}
{%- endif -%}
{%- if param_spec.nullable -%}
{{- " | null" }}
{%- endif -%}
{%- else -%}
{{- "any[]" }}
{%- if param_spec.nullable -%}
{{- " | null" }}
{%- endif -%}
{%- endif -%}
{%- elif param_spec.type is defined and param_spec.type is iterable and param_spec.type is not string and param_spec.type is not mapping and param_spec.type[0] is defined -%}
{#- Handle array of types like ["object", "object"] from Union[dict, list] #}
{%- if param_spec.type | length > 1 -%}
{{- param_spec.type | join(" | ") }}
{%- else -%}
{{- param_spec.type[0] }}
{%- endif -%}
{%- elif param_spec.oneOf -%}
{#- Handle oneOf schemas - check for complex unions and fallback to any #}
{%- set has_object_variants = false -%}
{%- for variant in param_spec.oneOf -%}
{%- if variant.type == "object" -%}
{%- set has_object_variants = true -%}
{%- endif -%}
{%- endfor -%}
{%- if has_object_variants and param_spec.oneOf|length > 1 -%}
{{- "any" }}
{%- else -%}
{%- for variant in param_spec.oneOf -%}
{{- render_typescript_type(variant, required_params) -}}
{%- if variant.description %}
{{- "// " + variant.description }}
{%- endif -%}
{%- if variant.default is defined %}
{{ "// default: " + variant.default|tojson }}
{%- endif -%}
{%- if not loop.last %}
{{- " | " }}
{% endif -%}
{%- endfor -%}
{%- endif -%}
{%- elif param_spec.type == "string" -%}
{%- if param_spec.enum -%}
{{- '"' + param_spec.enum|join('" | "') + '"' -}}
{%- else -%}
{{- "string" }}
{%- if param_spec.nullable %}
{{- " | null" }}
{%- endif -%}
{%- endif -%}
{%- elif param_spec.type == "number" -%}
{{- "number" }}
{%- elif param_spec.type == "integer" -%}
{{- "number" }}
{%- elif param_spec.type == "boolean" -%}
{{- "boolean" }}
{%- elif param_spec.type == "object" -%}
{%- if param_spec.properties -%}
{{- "{\n" }}
{%- for prop_name, prop_spec in param_spec.properties.items() -%}
{{- prop_name -}}
{%- if prop_name not in (param_spec.required or []) -%}
{{- "?" }}
{%- endif -%}
{{- ": " }}
{{ render_typescript_type(prop_spec, param_spec.required or []) }}
{%- if not loop.last -%}
{{-", " }}
{%- endif -%}
{%- endfor -%}
{{- "}" }}
{%- else -%}
{{- "object" }}
{%- endif -%}
{%- else -%}
{{- "any" }}
{%- endif -%}
{%- endmacro -%}
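{#- For illustration (not rendered): a param_spec of {"type": "string", "enum": ["celsius", "fahrenheit"]}
    renders as `"celsius" | "fahrenheit"`, {"type": "array", "items": {"type": "integer"}} renders as
    `number[]`, and anything unrecognized falls through to `any`. -#}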
{%- macro render_tool_namespace(namespace_name, tools) -%}
{{- "## " + namespace_name + "\n\n" }}
{{- "namespace " + namespace_name + " {\n\n" }}
{%- for tool in tools %}
{%- set tool = tool.function %}
{{- "// " + tool.description + "\n" }}
{{- "type "+ tool.name + " = " }}
{%- if tool.parameters and tool.parameters.properties %}
{{- "(_: {\n" }}
{%- for param_name, param_spec in tool.parameters.properties.items() %}
{%- if param_spec.description %}
{{- "// " + param_spec.description + "\n" }}
{%- endif %}
{{- param_name }}
{%- if param_name not in (tool.parameters.required or []) -%}
{{- "?" }}
{%- endif -%}
{{- ": " }}
{{- render_typescript_type(param_spec, tool.parameters.required or []) }}
{%- if param_spec.default is defined -%}
{%- if param_spec.enum %}
{{- ", // default: " + param_spec.default }}
{%- elif param_spec.oneOf %}
{{- "// default: " + param_spec.default }}
{%- else %}
{{- ", // default: " + param_spec.default|tojson }}
{%- endif -%}
{%- endif -%}
{%- if not loop.last %}
{{- ",\n" }}
{%- else %}
{{- ",\n" }}
{%- endif -%}
{%- endfor %}
{{- "}) => any;\n\n" }}
{%- else -%}
{{- "() => any;\n\n" }}
{%- endif -%}
{%- endfor %}
{{- "} // namespace " + namespace_name }}
{%- endmacro -%}
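{#- For illustration (hypothetical tool): a single function "get_weather" with one required string
    parameter "location" would render roughly as:
      ## functions

      namespace functions {

      // Get the current weather for a location.
      type get_weather = (_: {
      // City and country, e.g. "Paris, France".
      location: string,
      }) => any;

      } // namespace functions
-#}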
{%- macro render_builtin_tools(browser_tool, python_tool) -%}
{%- if browser_tool %}
{{- "## browser\n\n" }}
{{- "// Tool for browsing.\n" }}
{{- "// The `cursor` appears in brackets before each browsing display: `[{cursor}]`.\n" }}
{{- "// Cite information from the tool using the following format:\n" }}
{{- "// `【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.\n" }}
{{- "// Do not quote more than 10 words directly from the tool output.\n" }}
{{- "// sources=web (default: web)\n" }}
{{- "namespace browser {\n\n" }}
{{- "// Searches for information related to `query` and displays `topn` results.\n" }}
{{- "type search = (_: {\n" }}
{{- "query: string,\n" }}
{{- "topn?: number, // default: 10\n" }}
{{- "source?: string,\n" }}
{{- "}) => any;\n\n" }}
{{- "// Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.\n" }}
{{- "// Valid link ids are displayed with the formatting: `【{id}†.*】`.\n" }}
{{- "// If `cursor` is not provided, the most recent page is implied.\n" }}
{{- "// If `id` is a string, it is treated as a fully qualified URL associated with `source`.\n" }}
{{- "// If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available.\n" }}
{{- "// Use this function without `id` to scroll to a new location of an opened page.\n" }}
{{- "type open = (_: {\n" }}
{{- "id?: number | string, // default: -1\n" }}
{{- "cursor?: number, // default: -1\n" }}
{{- "loc?: number, // default: -1\n" }}
{{- "num_lines?: number, // default: -1\n" }}
{{- "view_source?: boolean, // default: false\n" }}
{{- "source?: string,\n" }}
{{- "}) => any;\n\n" }}
{{- "// Finds exact matches of `pattern` in the current page, or the page given by `cursor`.\n" }}
{{- "type find = (_: {\n" }}
{{- "pattern: string,\n" }}
{{- "cursor?: number, // default: -1\n" }}
{{- "}) => any;\n\n" }}
{{- "} // namespace browser\n\n" }}
{%- endif -%}
{%- if python_tool %}
{{- "## python\n\n" }}
{{- "Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).\n\n" }}
{{- "When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. The drive at '/mnt/data' can be used to save and persist user files. Internet access for this session is UNKNOWN. Depends on the cluster.\n\n" }}
{%- endif -%}
{%- endmacro -%}
{#- System Message Construction ============================================ #}
{%- macro build_system_message() -%}
{%- if model_identity is not defined %}
{%- set model_identity = "You are ChatGPT, a large language model trained by OpenAI." %}
{%- endif %}
{{- model_identity + "\n" }}
{{- "Knowledge cutoff: 2024-06\n" }}
{{- "Current date: " + strftime_now("%Y-%m-%d") + "\n\n" }}
{%- if reasoning_effort is not defined %}
{%- set reasoning_effort = "medium" %}
{%- endif %}
{{- "Reasoning: " + reasoning_effort + "\n\n" }}
{%- if builtin_tools %}
{{- "# Tools\n\n" }}
{%- set available_builtin_tools = namespace(browser=false, python=false) %}
{%- for tool in builtin_tools %}
{%- if tool == "browser" %}
{%- set available_builtin_tools.browser = true %}
{%- elif tool == "python" %}
{%- set available_builtin_tools.python = true %}
{%- endif %}
{%- endfor %}
{{- render_builtin_tools(available_builtin_tools.browser, available_builtin_tools.python) }}
{%- endif -%}
{{- "# Valid channels: analysis, commentary, final. Channel must be included for every message." }}
{%- if tools -%}
{{- "\nCalls to these tools must go to the commentary channel: 'functions'." }}
{%- endif -%}
{%- endmacro -%}
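{#- For illustration: with no kwargs and no builtin tools the system message renders roughly as:
      You are ChatGPT, a large language model trained by OpenAI.
      Knowledge cutoff: 2024-06
      Current date: <today's date>

      Reasoning: medium

      # Valid channels: analysis, commentary, final. Channel must be included for every message.
    with the note about the 'functions' channel appended when user tools are supplied. -#}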
{#- Main Template Logic ================================================= #}
{#- Set defaults #}
{#- Render system message #}
{{- "<|start|>system<|message|>" }}
{{- build_system_message() }}
{{- "<|end|>" }}
{#- Extract developer message #}
{%- if messages[0].role == "developer" or messages[0].role == "system" %}
{%- set developer_message = messages[0].content %}
{%- set loop_messages = messages[1:] %}
{%- else %}
{%- set developer_message = "" %}
{%- set loop_messages = messages %}
{%- endif %}
{#- Render developer message #}
{%- if developer_message or tools %}
{{- "<|start|>developer<|message|>" }}
{%- if developer_message %}
{{- "# Instructions\n\n" }}
{{- developer_message }}
{{- "\n\n" }}
{%- endif %}
{%- if tools -%}
{{- "# Tools\n\n" }}
{{- render_tool_namespace("functions", tools) }}
{%- endif -%}
{{- "<|end|>" }}
{%- endif %}
{#- Render messages #}
{%- set last_tool_call = namespace(name=none) %}
{%- for message in loop_messages -%}
{#- At this point only assistant/user/tool messages should remain #}
{%- if message.role == 'assistant' -%}
{#- Checks to ensure the messages are being passed in the format we expect #}
{%- if "content" in message %}
{%- if "<|channel|>analysis<|message|>" in message.content or "<|channel|>final<|message|>" in message.content %}
{{- raise_exception("You have passed a message containing <|channel|> tags in the content field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }}
{%- endif %}
{%- endif %}
{%- if "thinking" in message %}
{%- if "<|channel|>analysis<|message|>" in message.thinking or "<|channel|>final<|message|>" in message.thinking %}
{{- raise_exception("You have passed a message containing <|channel|> tags in the thinking field. Instead of doing this, you should pass analysis messages (the string between '<|message|>' and '<|end|>') in the 'thinking' field, and final messages (the string between '<|message|>' and '<|end|>') in the 'content' field.") }}
{%- endif %}
{%- endif %}
{%- if "tool_calls" in message %}
{#- We need very careful handling here - we want to drop the tool call analysis message if the model #}
{#- has output a later <|final|> message, but otherwise we want to retain it. This is the only case #}
{#- when we render CoT/analysis messages in inference. #}
{%- set future_final_message = namespace(found=false) %}
{%- for future_message in loop_messages[loop.index:] %}
{%- if future_message.role == 'assistant' and "tool_calls" not in future_message %}
{%- set future_final_message.found = true %}
{%- endif %}
{%- endfor %}
{#- We assume max 1 tool call per message, and so we infer the tool call name #}
{#- in "tool" messages from the most recent assistant tool call name #}
{%- set tool_call = message.tool_calls[0] %}
{%- if tool_call.function %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{%- if message.content and message.thinking %}
{{- raise_exception("Cannot pass both content and thinking in an assistant message with tool calls! Put the analysis message in one or the other, but not both.") }}
{%- elif message.content and not future_final_message.found %}
{{- "<|start|>assistant<|channel|>analysis<|message|>" + message.content + "<|end|>" }}
{%- elif message.thinking and not future_final_message.found %}
{{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }}
{%- endif %}
{{- "<|start|>assistant to=" }}
{{- "functions." + tool_call.name + "<|channel|>commentary " }}
{{- (tool_call.content_type if tool_call.content_type is defined else "json") + "<|message|>" }}
{{- tool_call.arguments|tojson }}
{{- "<|call|>" }}
{%- set last_tool_call.name = tool_call.name %}
{%- elif loop.last and not add_generation_prompt %}
{#- Only render the CoT if the final turn is an assistant turn and add_generation_prompt is false #}
{#- This is a situation that should only occur in training, never in inference. #}
{%- if "thinking" in message %}
{{- "<|start|>assistant<|channel|>analysis<|message|>" + message.thinking + "<|end|>" }}
{%- endif %}
{#- <|return|> indicates the end of generation, but <|end|> does not #}
{#- <|return|> should never be an input to the model, but we include it as the final token #}
{#- when training, so the model learns to emit it. #}
{{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|return|>" }}
{%- else %}
{#- CoT is dropped during all previous turns, so we never render it for inference #}
{{- "<|start|>assistant<|channel|>final<|message|>" + message.content + "<|end|>" }}
{%- set last_tool_call.name = none %}
{%- endif %}
{%- elif message.role == 'tool' -%}
{%- if last_tool_call.name is none %}
{{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
{%- endif %}
{{- "<|start|>functions." + last_tool_call.name }}
{{- " to=assistant<|channel|>commentary<|message|>" + message.content|tojson + "<|end|>" }}
{%- elif message.role == 'user' -%}
{{- "<|start|>user<|message|>" + message.content + "<|end|>" }}
{%- endif -%}
{%- endfor -%}
{#- Generation prompt #}
{%- if add_generation_prompt -%}
<|start|>assistant
{%- endif -%}
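{#- For illustration: a single user turn "Hello" with add_generation_prompt=true and no tools
    therefore produces roughly:
      <|start|>system<|message|>...system text...<|end|><|start|>user<|message|>Hello<|end|><|start|>assistant
-#}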


@@ -1100,6 +1100,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 switch (hparams.n_layer) {
+case 18: type = LLM_TYPE_537M; break;
 case 26: type = LLM_TYPE_1B; break;
 case 34: type = LLM_TYPE_4B; break;
 case 48: type = LLM_TYPE_12B; break;


@@ -39,6 +39,7 @@ enum llm_type {
 LLM_TYPE_410M,
 LLM_TYPE_450M,
 LLM_TYPE_475M,
+LLM_TYPE_537M,
 LLM_TYPE_700M,
 LLM_TYPE_770M,
 LLM_TYPE_780M,


@@ -2580,7 +2580,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 // @ngxson : quick hack for gpt-oss, always render these tokens
 for (const auto & t : token_to_id) {
-if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>") {
+if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
 id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
 }
 }
@@ -2627,6 +2627,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 if (has_return && has_call && has_end) {
 special_eog_ids.erase(end_id);
+id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
 LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
 }
 }


@@ -221,7 +221,7 @@ int main(int argc, char ** argv) {
 LOG_WRN("*** User-specified prompt will pre-start conversation, did you mean to set --system-prompt (-sys) instead?\n");
 }
-LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja).c_str());
+LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs).c_str());
 } else {
 LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
 }


@@ -108,7 +108,7 @@ struct mtmd_cli_context {
 }
 tmpls = common_chat_templates_init(model, params.chat_template);
-LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja).c_str());
+LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja, params.default_template_kwargs).c_str());
 init_vision_context(params);

Binary file not shown.


@@ -2053,7 +2053,7 @@ struct server_context {
 chat_templates = common_chat_templates_init(model, params_base.chat_template);
 try {
-common_chat_format_example(chat_templates.get(), params.use_jinja);
+common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs);
 } catch (const std::exception & e) {
 SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what());
 SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
@@ -5075,7 +5075,7 @@ int main(int argc, char ** argv) {
 // print sample chat example to make it clear which template is used
 LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
 common_chat_templates_source(ctx_server.chat_templates.get()),
-common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str());
+common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja, ctx_server.params_base.default_template_kwargs).c_str());
 ctx_server.queue_tasks.on_new_task([&ctx_server](server_task && task) {
 ctx_server.process_single_task(std::move(task));


@@ -62,8 +62,7 @@ export default function ChatMessage({
 return { content: msg.content };
 }
 const REGEX_THINK_OPEN = /<think>|<\|channel\|>analysis<\|message\|>/;
-const REGEX_THINK_CLOSE =
-  /<\/think>|<\|start\|>assistant<\|channel\|>final<\|message\|>/;
+const REGEX_THINK_CLOSE = /<\/think>|<\|end\|>/;
 let actualContent = '';
 let thought = '';
 let isThinking = false;


@@ -131,9 +131,7 @@ export function filterThoughtFromMsgs(messages: APIMessage[]) {
 content:
 msg.role === 'assistant'
 ? contentStr
-.split(
-  /<\/think>|<\|start\|>assistant<\|channel\|>final<\|message\|>/
-)
+.split(/<\/think>|<\|end\|>/)
 .at(-1)!
 .trim()
 : contentStr,