note: smartcache is broken for rnn currently

2026-07-10 01:18:32 +00:00 · 2026-03-15 11:31:47 +08:00 · 2026-03-15 11:31:47 +08:00 · f3d2f58fa8
commit f3d2f58fa8
parent b1c500ae2b 9f774e45ee
30 changed files with 2232 additions and 875 deletions
--- a/.devops/openvino.Dockerfile
+++ b/.devops/openvino.Dockerfile
@ -0,0 +1,138 @@
+ARG OPENVINO_VERSION_MAJOR=2026.0
+ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886
+ARG UBUNTU_VERSION=24.04
+
+# Optional proxy build arguments - empty by default
+ARG http_proxy=
+ARG https_proxy=
+
+## Build Image
+FROM ubuntu:${UBUNTU_VERSION} AS build
+
+# Pass proxy args to build stage
+ARG http_proxy
+ARG https_proxy
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        ca-certificates \
+        gnupg \
+        wget \
+        git \
+        cmake \
+        ninja-build \
+        build-essential \
+        libtbb12 \
+        libssl-dev \
+        ocl-icd-opencl-dev \
+        opencl-headers \
+        opencl-clhpp-headers \
+        intel-opencl-icd && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install OpenVINO for Ubuntu 24.04
+ARG OPENVINO_VERSION_MAJOR
+ARG OPENVINO_VERSION_FULL
+RUN mkdir -p /opt/intel && \
+    wget https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
+    tar -xf openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \
+    mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
+    cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \
+    echo "Y" | ./install_dependencies/install_openvino_dependencies.sh && \
+    cd - && \
+    ln -s /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} /opt/intel/openvino
+
+ENV OpenVINO_DIR=/opt/intel/openvino
+
+WORKDIR /app
+
+COPY . .
+
+# Build Stage
+RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \
+    cmake -B build/ReleaseOV -G Ninja \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DGGML_OPENVINO=ON && \
+    cmake --build build/ReleaseOV -j$(nproc)"
+
+# Copy all necessary libraries
+RUN mkdir -p /app/lib && \
+    find build/ReleaseOV -name '*.so*' -exec cp {} /app/lib \; && \
+    find ${OpenVINO_DIR}/runtime/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; 2>/dev/null || \
+    find ${OpenVINO_DIR}/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \;
+
+# Create runtime directories and copy binaries
+RUN mkdir -p /app/full \
+    && cp build/ReleaseOV/bin/* /app/full/ \
+    && cp *.py /app/full \
+    && cp -r gguf-py /app/full \
+    && cp -r requirements /app/full \
+    && cp requirements.txt /app/full \
+    && cp .devops/tools.sh /app/full/tools.sh
+
+## Base Runtime Image
+FROM ubuntu:${UBUNTU_VERSION} AS base
+
+# Pass proxy args to runtime stage
+ARG http_proxy
+ARG https_proxy
+
+RUN apt-get update \
+    && apt-get install -y libgomp1 libtbb12 curl\
+    && apt autoremove -y \
+    && apt clean -y \
+    && rm -rf /tmp/* /var/tmp/* \
+    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
+    && find /var/cache -type f -delete
+
+COPY --from=build /app/lib/ /app/
+
+### Full (all binaries)
+FROM base AS full
+
+ARG http_proxy
+ARG https_proxy
+
+COPY --from=build /app/full /app/
+
+WORKDIR /app
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    git \
+    python3 \
+    python3-venv \
+    python3-pip && \
+    python3 -m venv /ov-venv && \
+    /ov-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \
+    /ov-venv/bin/pip install --no-cache-dir -r requirements.txt && \
+    apt-get autoremove -y && \
+    apt-get clean && \
+    rm -rf /tmp/* /var/tmp/* && \
+    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
+    find /var/cache -type f -delete
+
+ENTRYPOINT ["/bin/bash", "-c", "source /ov-venv/bin/activate && exec /app/tools.sh \"$@\"", "--"]
+
+
+### Light, CLI only
+FROM base AS light
+
+COPY --from=build /app/full/llama-cli /app/
+
+WORKDIR /app
+
+ENTRYPOINT [ "/app/llama-cli" ]
+
+### Server, Server only
+FROM base AS server
+
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+COPY --from=build /app/full/llama-server /app/
+
+WORKDIR /app
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
--- a/common/chat-auto-parser-generator.cpp
+++ b/common/chat-auto-parser-generator.cpp
@ -3,6 +3,7 @@
 #include "chat.h"
 #include "common.h"
 #include "json-schema-to-grammar.h"
+#include "log.h"
 #include "nlohmann/json.hpp"

 #include <stdexcept>
@ -182,7 +183,10 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const
        case tool_format::TAG_WITH_TAGGED:
            return build_tool_parser_tag_tagged(ctx);
        default:
-            GGML_ABORT("Unable to create tool parser");
+            LOG_ERR("[ERROR] Template seems to support tool calls, but failed to determine tool format. Tool calling will not work properly. "
+                "Check for a fixed template for your model in the models/templates directory of your llama.cpp installation or "
+                "report an issue at https://github.com/ggml-org/llama.cpp/issues\n");
+            return ctx.p.eps();
    }
 }

--- a/ggml/include/ggml-openvino.h
+++ b/ggml/include/ggml-openvino.h
@ -0,0 +1,37 @@
+#pragma once
+
+#include "ggml-backend.h"
+
+#include <cstring>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define GGML_OPENVINO_NAME "OPENVINO"
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_openvino_init(int device);
+
+GGML_BACKEND_API bool ggml_backend_is_openvino(ggml_backend_t backend);
+
+GGML_BACKEND_API bool ggml_backend_buffer_is_openvino(ggml_backend_buffer_t buffer);
+
+GGML_BACKEND_API bool ggml_backend_buft_is_openvino(ggml_backend_buffer_type_t buft);
+
+GGML_BACKEND_API bool ggml_backend_buft_is_openvino_host(ggml_backend_buffer_type_t buft);
+
+GGML_BACKEND_API size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer);
+
+// device buffer
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_buffer_type(int device);
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_openvino_host_buffer_type(int device);
+
+GGML_BACKEND_API int ggml_backend_openvino_get_device_count(void);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_openvino_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@ -82,6 +82,10 @@
 #include "ggml-zendnn.h"
 #endif

+#ifdef GGML_USE_OPENVINO
+#include "ggml-openvino.h"
+#endif
+
 namespace fs = std::filesystem;

 static std::string path_str(const fs::path & path) {
@ -154,6 +158,9 @@ struct ggml_backend_registry {
 #ifdef GGML_USE_RPC
        register_backend(ggml_backend_rpc_reg());
 #endif
+#ifdef GGML_USE_OPENVINO
+        register_backend(ggml_backend_openvino_reg());
+#endif
 #ifdef GGML_USE_CPU
        register_backend(ggml_backend_cpu_reg());
 #endif
@ -558,6 +565,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) {
    ggml_backend_load_best("opencl", silent, dir_path);
    ggml_backend_load_best("hexagon", silent, dir_path);
    ggml_backend_load_best("musa", silent, dir_path);
+    ggml_backend_load_best("openvino", silent, dir_path);
    ggml_backend_load_best("cpu", silent, dir_path);
    // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend
    const char * backend_path = std::getenv("GGML_BACKEND_PATH");
--- a/ggml/src/ggml-cpu/arch-fallback.h
+++ b/ggml/src/ggml-cpu/arch-fallback.h
@ -175,13 +175,6 @@
 #define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0
 #elif defined(__riscv)
 // quants.c
-#define quantize_row_q8_K_generic quantize_row_q8_K
-#define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K
-#define ggml_vec_dot_iq2_xs_q8_K_generic ggml_vec_dot_iq2_xs_q8_K
-#define ggml_vec_dot_iq3_xxs_q8_K_generic ggml_vec_dot_iq3_xxs_q8_K
-#define ggml_vec_dot_iq4_nl_q8_0_generic ggml_vec_dot_iq4_nl_q8_0
-#define ggml_vec_dot_iq4_xs_q8_K_generic ggml_vec_dot_iq4_xs_q8_K
-#define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0
 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0
 // repack.cpp
 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4
--- a/ggml/src/ggml-cpu/arch/riscv/quants.c
+++ b/ggml/src/ggml-cpu/arch/riscv/quants.c
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@ -9624,7 +9624,7 @@ void ggml_compute_forward_win_unpart(
    }
 }

-//gmml_compute_forward_unary
+//ggml_compute_forward_unary

 void ggml_compute_forward_unary(
        const ggml_compute_params * params,
@ -10477,34 +10477,40 @@ static void ggml_compute_forward_gated_delta_net_one_chunk(
            const float beta_val = *(const float *)((const char *)src_beta->data + iv3 * nbb3 + t * nbb2 + iv1 * nbb1);
            const float * g_d    =  (const float *)((const char *)src_g->data    + iv3 * nbg3 + t * nbg2 + iv1 * nbg1);

+            // state is stored transposed: s_out[j*S_v + i] = S[i][j]
+            // so row j of s_out = column j of S (contiguous access)
+
            if (kda) {
+                // precompute exp(g) into delta scratch (reused below)
                for (int64_t i = 0; i < S_v; ++i) {
-                    ggml_vec_scale_f32(S_v, &s_out[i * S_v], expf(g_d[i]));
+                    delta[i] = expf(g_d[i]);
+                }
+                // S[i][:] *= exp(g[i]) => for each row j of M: M[j][i] *= exp(g[i])
+                for (int64_t j = 0; j < S_v; ++j) {
+                    ggml_vec_mul_f32(S_v, &s_out[j * S_v], &s_out[j * S_v], delta);
                }
            } else {
                ggml_vec_scale_f32(S_v * S_v, s_out, expf(g_d[0]));
            }

-            // delta[j] = sum_i S[j][i] * k[i]
-            memset(delta, 0, S_v * sizeof(float));
-            for (int64_t i = 0; i < S_v; ++i) {
-                ggml_vec_mad_f32(S_v, delta, &s_out[i * S_v], k_d[i]);
-            }
+            // delta[j] = sum_i S[i][j] * k[i] = dot(row j of M, k)
            for (int64_t j = 0; j < S_v; ++j) {
-                delta[j] = (v_d[j] - delta[j]) * beta_val;
+                float sum = 0.0f;
+                ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, k_d, 0, 1);
+                delta[j] = (v_d[j] - sum) * beta_val;
            }

-            // outer product: S[j][i] += k[i] * delta[j]
-            for (int64_t i = 0; i < S_v; ++i) {
-                ggml_vec_mad_f32(S_v, &s_out[i * S_v], delta, k_d[i]);
+            // outer product: S[i][j] += k[i] * delta[j] => M[j][i] += delta[j] * k[i]
+            for (int64_t j = 0; j < S_v; ++j) {
+                ggml_vec_mad_f32(S_v, &s_out[j * S_v], k_d, delta[j]);
            }

-            // attn_out[j] = sum_i S[j][i] * q[i]
-            memset(attn_data, 0, S_v * sizeof(float));
-            for (int64_t i = 0; i < S_v; ++i) {
-                ggml_vec_mad_f32(S_v, attn_data, &s_out[i * S_v], q_d[i]);
+            // attn_out[j] = sum_i S[i][j] * q[i] = dot(row j of M, q)
+            for (int64_t j = 0; j < S_v; ++j) {
+                float sum = 0.0f;
+                ggml_vec_dot_f32(S_v, &sum, 0, &s_out[j * S_v], 0, q_d, 0, 1);
+                attn_data[j] = sum * scale;
            }
-            ggml_vec_scale_f32(S_v, attn_data, scale);

            attn_data += S_v * H; // advance to next token
        }
--- a/ggml/src/ggml-cpu/simd-mappings.h
+++ b/ggml/src/ggml-cpu/simd-mappings.h
@ -479,13 +479,51 @@ do {                                                                  \

 // F16 AVX512

-// F16 AVX
+#if defined(__AVX512FP16__)
+
+#define GGML_F16_STEP 128
+#define GGML_F16_EPR  32
+
+#define GGML_F16x32              __m512h
+#define GGML_F16x32_ZERO         _mm512_setzero_ph()
+#define GGML_F16x32_SET1(x)      _mm512_set1_ph(__extension__(_Float16)(x))
+#define GGML_F16x32_LOAD(x)      _mm512_loadu_ph(x)
+#define GGML_F16x32_STORE(x, y)  _mm512_storeu_ph(x, y)
+#define GGML_F16x32_FMA(a, b, c) _mm512_fmadd_ph(b, c, a)
+#define GGML_F16x32_ADD          _mm512_add_ph
+#define GGML_F16x32_MUL          _mm512_mul_ph
+#define GGML_F16x32_REDUCE(res, x)                                     \
+do {                                                                   \
+    int offset = GGML_F16_ARR >> 1;                                    \
+    for (int i = 0; i < offset; ++i) {                                 \
+        x[i] = _mm512_add_ph(x[i], x[offset+i]);                       \
+    }                                                                  \
+    offset >>= 1;                                                      \
+    for (int i = 0; i < offset; ++i) {                                 \
+        x[i] = _mm512_add_ph(x[i], x[offset+i]);                       \
+    }                                                                  \
+    offset >>= 1;                                                      \
+    for (int i = 0; i < offset; ++i) {                                 \
+        x[i] = _mm512_add_ph(x[i], x[offset+i]);                       \
+    }                                                                  \
+    res = (ggml_float) _mm512_reduce_add_ph(x[0]);                     \
+} while (0)
+
+#define GGML_F16_VEC                GGML_F16x32
+#define GGML_F16_VEC_ZERO           GGML_F16x32_ZERO
+#define GGML_F16_VEC_SET1           GGML_F16x32_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F16x32_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x32_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F16x32_FMA
+#define GGML_F16_VEC_ADD            GGML_F16x32_ADD
+#define GGML_F16_VEC_MUL            GGML_F16x32_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F16x32_REDUCE
+
+#else // Fallback FP16 <-> FP32

 #define GGML_F16_STEP 64
 #define GGML_F16_EPR  16

-// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
-
 #define GGML_F32Cx16             __m512
 #define GGML_F32Cx16_ZERO        _mm512_setzero_ps()
 #define GGML_F32Cx16_SET1(x)     _mm512_set1_ps(x)
@ -525,6 +563,8 @@ do {                                                              \
 #define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL

 #define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE
+
+#endif // __AVX512FP16__
 #elif defined(__AVX__)

 #define GGML_SIMD
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@ -56,7 +56,8 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const
    const int tx = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x;  // transpose block offset
    const int ty = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y;

-    __shared__ float tile[CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1];
+    __shared__ float tile[2][CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1];
+    int cur_tile_buf = 0;

 #pragma unroll
    for (int i = 0; i < CUDA_CPY_BLOCK_NM; ++i) {
@ -70,7 +71,7 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const
            if(x < ne01 && y + j < ne00){
                const int row = threadIdx.y+j;
                const int col = threadIdx.x * sizeof(float)/sizeof(T);
-                T *tile2 = reinterpret_cast<T*>(tile[row]);
+                T *tile2 = reinterpret_cast<T*>(tile[cur_tile_buf][row]);
                tile2[col] = src[imat*n + (y+j)*ne01 + x];
            }
        }
@ -81,10 +82,12 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const
        for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
            if (ty + j < ne01 && tx < ne00) {
                const int col = (threadIdx.y+j)*sizeof(float)/sizeof(T);
-                const T *tile2 = reinterpret_cast<const T*>(tile[threadIdx.x]);
+                const T *tile2 = reinterpret_cast<const T*>(tile[cur_tile_buf][threadIdx.x]);
                dst[imat*n + (ty+j)*ne00 + tx] = tile2[col];
            }
        }
+
+        cur_tile_buf = (cur_tile_buf + 1) % 2;
    }

    GGML_UNUSED_VARS(ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11,
--- a/ggml/src/ggml-cuda/gated_delta_net.cu
+++ b/ggml/src/ggml-cuda/gated_delta_net.cu
@ -45,10 +45,11 @@ __global__ void gated_delta_net_cuda(const float * q,
    static_assert(S_v % warp_size == 0, "S_v must be a multiple of warp_size");
    constexpr int rows_per_lane = (S_v + warp_size - 1) / warp_size;
    float         s_shard[rows_per_lane];
+    // state is stored transposed: M[col][i] = S[i][col], row col is contiguous
 #pragma unroll
    for (int r = 0; r < rows_per_lane; r++) {
        const int i = r * warp_size + lane;
-        s_shard[r]  = curr_state[i * S_v + col];
+        s_shard[r]  = curr_state[col * S_v + i];
    }

    for (int t = 0; t < n_tokens; t++) {
@ -126,23 +127,14 @@ __global__ void gated_delta_net_cuda(const float * q,
        attn_data += S_v * H;
    }

-    // Write state back to global memory
+    // Write state back to global memory (transposed layout)
 #pragma unroll
    for (int r = 0; r < rows_per_lane; r++) {
        const int i          = r * warp_size + lane;
-        state[i * S_v + col] = s_shard[r];
+        state[col * S_v + i] = s_shard[r];
    }
 }

-static size_t calculate_smem(const int sv, int cc)
-{
-    size_t smem = 0;
-    if ((GGML_CUDA_CC_IS_AMD(cc) && !GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_RDNA4(cc)) || GGML_CUDA_CC_IS_MTHREADS(cc)) {
-        smem = sv * sv * sizeof(float);
-    }
-    return smem;
-}
-
 template <bool KDA>
 static void launch_gated_delta_net(
        const float * q_d, const float * k_d, const float * v_d,
@ -179,18 +171,14 @@ static void launch_gated_delta_net(
                sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
            break;
        case 64: {
-            constexpr int sv = 64;
-            size_t smem = calculate_smem(sv, cc);
-            gated_delta_net_cuda<sv, KDA><<<grid_dims, block_dims, smem, stream>>>(
+            gated_delta_net_cuda<64, KDA><<<grid_dims, block_dims, 0, stream>>>(
                q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
                n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
                sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
            break;
        }
        case 128: {
-            constexpr int sv = 128;
-            size_t smem = calculate_smem(sv, cc);
-            gated_delta_net_cuda<sv, KDA><<<grid_dims, block_dims, smem, stream>>>(
+            gated_delta_net_cuda<128, KDA><<<grid_dims, block_dims, 0, stream>>>(
                q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H,
                n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, sv3,
                sb1, sb2, sb3, neqk1_magic, rq3_magic, scale);
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@ -1242,6 +1242,34 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
    }
 }

+struct cublas_force_compute_type {
+    bool fp32 = false;
+    bool fp16 = false;
+};
+
+static const cublas_force_compute_type & ggml_cuda_cublas_get_force_compute_type() {
+    static const cublas_force_compute_type compute_type = [] {
+        cublas_force_compute_type result;
+
+        const bool ggml_cuda_force_cublas_compute_32f_env = getenv("GGML_CUDA_FORCE_CUBLAS_COMPUTE_32F") != nullptr;
+        const bool ggml_cuda_force_cublas_compute_16f_env = getenv("GGML_CUDA_FORCE_CUBLAS_COMPUTE_16F") != nullptr;
+
+        GGML_ASSERT(ggml_cuda_force_cublas_compute_16f_env == false || ggml_cuda_force_cublas_compute_32f_env == false);
+
+        if (ggml_cuda_force_cublas_compute_32f_env) {
+            GGML_LOG_INFO("Detected GGML_CUDA_FORCE_CUBLAS_COMPUTE_32F\n");
+            result.fp32 = true;
+        } else if (ggml_cuda_force_cublas_compute_16f_env) {
+            GGML_LOG_INFO("Detected GGML_CUDA_FORCE_CUBLAS_COMPUTE_16F\n");
+            result.fp16 = true;
+        }
+
+        return result;
+    }();
+
+    return compute_type;
+}
+
 static void ggml_cuda_op_mul_mat_cublas(
    ggml_backend_cuda_context & ctx,
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
@ -1324,7 +1352,13 @@ static void ggml_cuda_op_mul_mat_cublas(

        CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));

-        if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
+        const auto & force_compute_type = ggml_cuda_cublas_get_force_compute_type();
+
+        if (!force_compute_type.fp16 && (GGML_CUDA_CC_IS_CDNA(cc)
+                                        || GGML_CUDA_CC_IS_RDNA4(cc)
+                                        || cc == GGML_CUDA_CC_VOLTA
+                                        || force_compute_type.fp32))
+        {
            const float alpha = 1.0f;
            const float beta = 0.0f;
            CUBLAS_CHECK(
@ -1923,10 +1957,23 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
    cudaDataType_t cu_data_type_b = traits::data_type;
    const void * alpha = traits::get_alpha();
    const void * beta = traits::get_beta();
-    const float alpha_f32 = 1.0f;
-    const float beta_f32 = 0.0f;

-    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+    const auto & force_compute_type = ggml_cuda_cublas_get_force_compute_type();
+
+    int id = ggml_cuda_get_device();
+    const int cc = ggml_cuda_info().devices[id].cc;
+    static constexpr bool is_src0_type_f16 = src0_type == GGML_TYPE_F16;
+
+    // bf16 and fp32 are already being computed in fp32 (ensure it using static_assert),
+    // so checking necessity of forced fp32 only for fp16 src0_type
+    static_assert(is_src0_type_f16 || traits::compute_type == CUBLAS_COMPUTE_32F);
+
+    const bool need_compute_32f = is_src0_type_f16 && !force_compute_type.fp16 && (GGML_CUDA_CC_IS_CDNA(cc)
+                                                                                  || GGML_CUDA_CC_IS_RDNA4(cc)
+                                                                                  || cc == GGML_CUDA_CC_VOLTA
+                                                                                  || force_compute_type.fp32);
+
+    if (dst->op_params[0] == GGML_PREC_DEFAULT && !need_compute_32f) {
        if constexpr (src0_type == GGML_TYPE_F32) {
            dst_t = (char *) dst_ddf;  // Direct F32 output
        } else {
@ -1936,18 +1983,10 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
        }
    } else {
        dst_t = (char *) dst_ddf;
-        cu_compute_type = CUBLAS_COMPUTE_32F;
-        cu_data_type = CUDA_R_32F;
-        alpha = &alpha_f32;
-        beta = &beta_f32;
-    }
-
-    int id = ggml_cuda_get_device();
-    const int cc = ggml_cuda_info().devices[id].cc;
-    if (GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
-        cu_compute_type = CUBLAS_COMPUTE_32F;
-        alpha = &alpha_f32;
-        beta = &beta_f32;
+        cu_compute_type = batched_mul_mat_traits<GGML_TYPE_F32>::compute_type;
+        cu_data_type = batched_mul_mat_traits<GGML_TYPE_F32>::data_type;
+        alpha = batched_mul_mat_traits<GGML_TYPE_F32>::get_alpha();
+        beta = batched_mul_mat_traits<GGML_TYPE_F32>::get_beta();
    }

    GGML_ASSERT(ne12 % ne02 == 0);
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@ -2469,13 +2469,14 @@ kernel void kernel_gated_delta_net_impl(

    const float scale = 1.0f / sqrt((float)S_v);

-    device const float * s_ptr = (device const float *) (s) + (i23*args.ne21 + i21)*S_v*S_v + i20;
+    // state is stored transposed: M[i20][is] = S[is][i20], so row i20 is contiguous
+    device const float * s_ptr = (device const float *) (s) + (i23*args.ne21 + i21)*S_v*S_v + i20*S_v;

    float ls[NSG];

    FOR_UNROLL (short j = 0; j < NSG; j++) {
        const short is = tx*NSG + j;
-        ls[j] = s_ptr[is*S_v];
+        ls[j] = s_ptr[is];
    }

    device float * dst_attn = (device float *) (dst) + (i23*args.ne22*args.ne21 + i21)*S_v + i20;
@ -2536,11 +2537,11 @@ kernel void kernel_gated_delta_net_impl(
        g_ptr += args.ne21*G;
    }

-    device float * dst_state = (device float *) (dst) + args.ne23*args.ne22*args.ne21*S_v + (i23*args.ne21 + i21)*S_v*S_v + i20;
+    device float * dst_state = (device float *) (dst) + args.ne23*args.ne22*args.ne21*S_v + (i23*args.ne21 + i21)*S_v*S_v + i20*S_v;

    FOR_UNROLL (short j = 0; j < NSG; j++) {
        const short is = tx*NSG + j;
-        dst_state[is*S_v] = ls[j];
+        dst_state[is] = ls[j];
    }

 #undef S_v
--- a/ggml/src/ggml-opencl/kernels/cumsum.cl
+++ b/ggml/src/ggml-opencl/kernels/cumsum.cl
@ -1,139 +0,0 @@
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-#ifdef cl_intel_required_subgroup_size
-#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
-#define INTEL_GPU 1
-#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
-#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
-#elif defined(cl_qcom_reqd_sub_group_size)
-#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
-#define ADRENO_GPU 1
-#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
-#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
-#endif
-
-// max workgroup size is usually 1024, this covers various subgroups sizes
-#define MAX_SUBGROUPS 128
-
-#ifdef INTEL_GPU
-REQD_SUBGROUP_SIZE_32
-#elif defined (ADRENO_GPU)
-REQD_SUBGROUP_SIZE_64
-#endif
-kernel void kernel_cumsum_blk(
-        global char * src0,
-        ulong offset0,
-        global char * tmp,
-        global char * dst,
-        ulong offsetd,
-        int   ne00,
-        int   ne01,
-        int   ne02,
-        int   ne03,
-        ulong nb00,
-        ulong nb01,
-        ulong nb02,
-        ulong nb03,
-        uint net0,
-        uint net1,
-        uint net2
-) {
-    src0 = src0 + offset0;
-    dst  = dst + offsetd;
-
-    const int i3 = get_group_id(2);
-    const int i2 = get_group_id(1);
-    const int i1 = get_group_id(0);
-
-    const int nth = get_local_size(0);
-    const int tid = get_local_id(0);
-
-    const uint sg_size = get_sub_group_size();
-    const uint sg_id = get_sub_group_id();
-    const uint sg_lid = get_sub_group_local_id();
-
-    const int ib = i1 / ne01;
-    const int i00 = ib * nth;
-    const int i01 = i1 % ne01;
-    const int i02 = i2;
-    const int i03 = i3;
-
-    global const float * src0_row = (global const float *)(src0 + i03*nb03 + i02*nb02 + i01*nb01);
-    global       float * tmp_row  = (global float *)tmp + net0 * i01 + net0 * net1 * i02 + net0 * net1 * net2 * i03;
-    global       float * dst_row  = (global float *)dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    __local float partial[MAX_SUBGROUPS];
-
-    float v = 0.0f;
-    if (i00 + tid < ne00) {
-        v = src0_row[i00 + tid];
-    }
-
-    float s = sub_group_scan_inclusive_add(v);
-    if (sg_lid == sg_size - 1) {
-        partial[sg_id] = s;
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    // NB: subgroup size should be larger than number of subgroups
-    // assuming max workgroup size of 1024, subgroup size should be >= 32
-    if (sg_id == 0) {
-        float x = 0.0f;
-        if (sg_lid < get_num_sub_groups()) {
-            x = partial[sg_lid];
-        }
-        float ex = sub_group_scan_exclusive_add(x);
-        if (sg_lid < get_num_sub_groups()) {
-            partial[sg_lid] = ex;
-        }
-    }
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    s += partial[sg_id];
-
-    if (i00 + tid < ne00) {
-        dst_row[i00 + tid] = s;
-    }
-    if (ne00 > nth && tid == nth - 1) {
-        tmp_row[ib] = s;
-    }
-}
-
-kernel void kernel_cumsum_add(
-        global char * tmp,
-        global char * dst,
-        ulong offsetd,
-        int   ne00,
-        int   ne01,
-        int   ne02,
-        int   ne03,
-        uint nbt0,
-        uint nbt1,
-        uint nbt2,
-        uint nbt3
-) {
-    dst  = dst + offsetd;
-
-    const int i3 = get_group_id(2);
-    const int i2 = get_group_id(1);
-    const int i1 = get_group_id(0);
-
-    const int nth = get_local_size(0);
-    const int tid = get_local_id(0);
-
-    const int ib = i1 / ne01;
-    if (ib == 0) {
-        return;
-    }
-    const int i00 = ib * nth;
-    const int i01 = i1 % ne01;
-    const int i02 = i2;
-    const int i03 = i3;
-
-    global float * tmp_row  = (global float *)(tmp + nbt1 * i01 + nbt2 * i02 + nbt3 * i03);
-    global float * dst_row  = (global float *)dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
-
-    if (i00 + tid < ne00) {
-        dst_row[i00 + tid] += tmp_row[ib - 1];
-    }
-}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/gated_delta_net.comp
@ -44,7 +44,7 @@ void main() {

    FLOAT_TYPE state[S_V];
    [[unroll]] for (uint i = 0; i < S_V; i++) {
-        state[i] = FLOAT_TYPE(data_state[state_base + i * S_V + col]);
+        state[i] = FLOAT_TYPE(data_state[state_base + col * S_V + i]);
    }

    uint attn_off = (seq_id * n_tokens * H + head_id) * S_V;
@ -123,6 +123,6 @@ void main() {
    }

    [[unroll]] for (uint i = 0; i < S_V; i++) {
-        data_dst[s_off + state_base + i * S_V + col] = state[i];
+        data_dst[s_off + state_base + col * S_V + i] = state[i];
    }
 }
--- a/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/repeat.wgsl
@ -1,67 +0,0 @@
-enable f16;
-
-struct Params {
-    ne: u32,
-
-    offset_src0: u32,
-    offset_dst: u32,
-
-    stride_src0_0: u32,
-    stride_src0_1: u32,
-    stride_src0_2: u32,
-    stride_src0_3: u32,
-
-    a_ne0: u32,
-    a_ne1: u32,
-    a_ne2: u32,
-    a_ne3: u32,
-
-    ne0: u32,
-    ne1: u32,
-    ne2: u32,
-};
-
-#ifdef TYPE_F32
-#define DataType f32
-#endif
-#ifdef TYPE_I32
-#define DataType i32
-#endif
-#ifdef TYPE_I16
-// same size (16-bit) is sufficient for repeat
-#define DataType f16
-#endif
-
-@group(0) @binding(0)
-var<storage, read_write> src0: array<DataType>;
-
-@group(0) @binding(1)
-var<storage, read_write> dst: array<DataType>;
-
-@group(0) @binding(2)
-var<uniform> params: Params;
-
-@compute @workgroup_size(WG_SIZE)
-fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    if (gid.x < params.ne) {
-        var i = gid.x;
-        let i3 = i / (params.ne2 * params.ne1 * params.ne0);
-        i = i % (params.ne2 * params.ne1 * params.ne0);
-        let i2 = i / (params.ne1 * params.ne0);
-        i = i % (params.ne1 * params.ne0);
-        let i1 = i / params.ne0;
-        let i0 = i % params.ne0;
-
-        let a_i0 = i0 % params.a_ne0;
-        let a_i1 = i1 % params.a_ne1;
-        let a_i2 = i2 % params.a_ne2;
-        let a_i3 = i3 % params.a_ne3;
-
-        let a_index = a_i0 * params.stride_src0_0 +
-                           a_i1 * params.stride_src0_1 +
-                           a_i2 * params.stride_src0_2 +
-                           a_i3 * params.stride_src0_3;
-
-        dst[params.offset_dst + gid.x] = src0[params.offset_src0 + a_index];
-    }
-}
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@ -521,7 +521,12 @@ void llama_context::sched_reserve() {

        if (cparams.fused_gdn_ch) {
            // more than one token in the batch per sequence in order to take the chunked path
-            auto * gf = graph_reserve(16*n_seqs, n_seqs, n_outputs, mctx.get(), true);
+            // note: n_outputs must match n_tokens for embedding models with mean/rank pooling,
+            // because build_pooling creates inp_mean with shape [n_tokens, n_seqs] and multiplies
+            // it with t_embd which is reduced to [n_outputs, ...] via out_ids. if n_outputs != n_tokens,
+            // the ggml_mul_mat assertion fails. this matches the pp reservation below (line ~553).
+            const uint32_t n_tokens_ch = 16*n_seqs;
+            auto * gf = graph_reserve(n_tokens_ch, n_seqs, n_tokens_ch, mctx.get(), true);
            if (!gf) {
                throw std::runtime_error("failed to reserve graph for fused Gated Delta Net check (chunked)");
            }
--- a/src/models/delta-net-base.cpp
+++ b/src/models/delta-net-base.cpp
@ -225,9 +225,8 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
    ggml_tensor * kg_t = ggml_cont(ctx0, ggml_transpose(ctx0, kg));
    cb(kg_t, "key_gdiff_t", il);

-    ggml_tensor * s_t = ggml_transpose(ctx0, s);
-    s_t = ggml_cont_4d(ctx0, s_t, S_v, S_v, 1, H_v * n_seqs);
-    cb(s_t, "dnet_add_ch_state", il);
+    s = ggml_reshape_4d(ctx0, s, S_v, S_v, 1, H_v * n_seqs);
+    cb(s, "dnet_add_ch_state", il);

    // [CS, S_v, n_chunks, H_v * n_seqs]
    ggml_tensor * v_t = ggml_cont(ctx0, ggml_transpose(ctx0, v));
@ -240,7 +239,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
        ggml_tensor * ch_kg_t    = get_slice_2d(ctx0, kg_t,    chunk); // [ CS, S_k, 1, H_v * n_seqs]

        // [CS, S_v, 1, H_v * n_seqs]
-        ggml_tensor * v_t_p = ggml_mul_mat(ctx0, ch_k_cd, s_t);
+        ggml_tensor * v_t_p = ggml_mul_mat(ctx0, ch_k_cd, s);
        cb(v_t_p, "v_prime", il);

        // [CS, S_v, 1, H_v * n_seqs]
@ -252,7 +251,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
        cb(v_attn, "v_attn", il);

        // [S_v, CS, 1, H_v * n_seqs]
-        ggml_tensor * attn_inter = ggml_mul_mat(ctx0, s_t, ch_q_g_exp);
+        ggml_tensor * attn_inter = ggml_mul_mat(ctx0, s, ch_q_g_exp);
        cb(attn_inter, "attn_inter", il);

        // [S_v, CS, 1, H_v * n_seqs]
@ -268,13 +267,11 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
        // last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
        ggml_tensor * ch_g_last_exp_t = get_slice_2d(ctx0, g_last_exp_t, chunk);

-        s_t = ggml_mul(ctx0, s_t, ch_g_last_exp_t);
-        s_t = ggml_add(ctx0, s_t, kgv);
-        cb(s_t, "dnet_add_ch_state", il);
+        s = ggml_mul(ctx0, s, ch_g_last_exp_t);
+        s = ggml_add(ctx0, s, kgv);
+        cb(s, "dnet_add_ch_state", il);
    }

-    s_t = ggml_reshape_4d(ctx0, s_t, S_v, S_v, H_v, n_seqs);
-
    // truncate padded tokens
    ggml_tensor * o = ggml_view_4d(ctx0, v,
            S_v, n_tokens, H_v, n_seqs,
@ -282,7 +279,7 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
            ggml_row_size(v->type, S_v * CS * n_chunks),
            ggml_row_size(v->type, S_v * CS * n_chunks * H_v), 0);
    o = ggml_permute  (ctx0, o, 0, 2, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
-    s = ggml_transpose(ctx0, s_t);
+    s = ggml_reshape_4d(ctx0, s, S_v, S_v, H_v, n_seqs);
    cb(s, "output_state", il);

    return {o, s};
@ -341,11 +338,9 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
    g = ggml_exp(ctx0, g);
    s = ggml_mul(ctx0, s, g);

-    ggml_tensor * s_t = ggml_cont(ctx0, ggml_transpose(ctx0, s));
-
    // [1, S_v, H_v, n_seqs]
    ggml_tensor * sk;
-    sk = ggml_mul     (ctx0, s_t, k);
+    sk = ggml_mul     (ctx0, s, k);
    sk = ggml_sum_rows(ctx0, sk);

    // [S_v, 1, H_v, n_seqs]
@ -362,15 +357,14 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_delta_net_base::build_delta_ne
    k  = ggml_repeat(ctx0, k, s);
    kd = ggml_mul   (ctx0, k, d_t);

-    s_t = ggml_add(ctx0, s_t, kd);
+    s = ggml_add(ctx0, s, kd);

-    cb(s_t, "dnet_add_ar_state", il);
+    cb(s, "dnet_add_ar_state", il);

-    ggml_tensor * s_q = ggml_mul     (ctx0, s_t, q);
+    ggml_tensor * s_q = ggml_mul     (ctx0, s, q);
    ggml_tensor * o   = ggml_sum_rows(ctx0, s_q);

    o = ggml_permute  (ctx0, o, 2, 0, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
-    s = ggml_transpose(ctx0, s_t);           // [S_v, S_v, H_v, n_seqs]

    return {o, s};
 }
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@ -579,10 +579,9 @@ static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
    }
 }

-void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value);
-
 //
 // API used internally with mtmd
 //

 projector_type clip_get_projector_type(const struct clip_ctx * ctx);
+void clip_set_debug_output_embeddings(struct clip_ctx * ctx, bool debug);
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@ -209,6 +209,8 @@ struct clip_ctx {
    clip_flash_attn_type flash_attn_type = CLIP_FLASH_ATTN_TYPE_AUTO;
    bool is_allocated = false;

+    bool debug_output_embeddings = false;
+
    clip_ctx(clip_context_params & ctx_params) {
        flash_attn_type = ctx_params.flash_attn_type;
 	    backend_cpu = ggml_backend_cpu_init(); //always has CPU backend
@ -255,6 +257,8 @@ struct clip_ctx {
        if (ctx_params.cb_eval != nullptr) {
            ggml_backend_sched_set_eval_callback(sched.get(), ctx_params.cb_eval, ctx_params.cb_eval_user_data);
        }
+
+        debug_output_embeddings = std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr;
    }

    ~clip_ctx() {
@ -2277,8 +2281,6 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
            // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
            // we can remove this check when we implement audio support for Gemma 3N
            skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
-
-            // clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
        }

        if (loader.has_audio && !skip_audio) {
@ -4179,7 +4181,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    }

    // Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set
-    if (std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr) {
+    if (ctx->debug_output_embeddings) {
        const int64_t n_embd = embeddings->ne[0];
        const int64_t n_tokens = embeddings->ne[1];
        std::vector<float> emb_data(n_embd * n_tokens);
@ -4586,14 +4588,7 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
 //
 // API for debugging
 //
-void clip_debug_encode(clip_ctx * ctx, int h, int w, float fill_value) {
-    clip_image_f32 img;
-    img.nx = w;
-    img.ny = h;
-    img.buf.resize(h * w * 3);
-    for (int i = 0; i < h * w * 3; i++) {
-        img.buf[i] = static_cast<float>(fill_value);
-    }
-    clip_image_encode(ctx, 1, &img, nullptr);
-    GGML_ASSERT(img.buf.empty() && "expected, always stop here");
+
+void clip_set_debug_output_embeddings(clip_ctx * ctx, bool enable) {
+    ctx->debug_output_embeddings = enable;
 }
--- a/tools/mtmd/debug/mtmd-debug.cpp
+++ b/tools/mtmd/debug/mtmd-debug.cpp
@ -0,0 +1,229 @@
+#include "mtmd-debug.h"
+
+#include "arg.h"
+#include "debug.h"
+#include "log.h"
+#include "common.h"
+#include "llama.h"
+#include "ggml.h"
+#include "mtmd.h"
+#include "mtmd-helper.h"
+
+#include <vector>
+#include <cmath>
+#include <limits.h>
+#include <cinttypes>
+#include <clocale>
+
+// INTERNAL TOOL FOR DEBUGGING PURPOSES ONLY
+// NOT INTENDED FOR PUBLIC USE
+
+static void show_additional_info(int /*argc*/, char ** argv) {
+    LOG(
+        "Internal debugging tool for mtmd; See mtmd-debug.md for the pytorch equivalent code\n"
+        "Note: we repurpose some args from other examples, they will have different meaning here\n"
+        "\n"
+        "Usage: %s -m <model> --mmproj <mmproj> -p <mode> -n <size> --image <image> --audio <audio>\n"
+        "\n"
+        "    -n <size>: number of pixels per edge for image (always square image), or number of samples for audio\n"
+        "\n"
+        "    -p \"encode\" (debugging encode pass, default case):\n"
+        "        --image can be:\n"
+        "          \"white\", \"black\", \"gray\": filled 1.0f, 0.0f and 0.5f respectively\n"
+        "          \"cb\": checkerboard pattern, alternate 1.0f and 0.0f\n"
+        "        --audio can be:\n"
+        "          \"one\", \"zero\", \"half\": filled 1.0f, 0.0f and 0.5f respectively\n"
+        "          \"1010\": checkerboard pattern, alternate 1.0f and 0.0f\n"
+        "\n"
+        "    -p \"preproc\" (debugging preprocessing pass):\n"
+        "        --image can be:\n"
+        "          \"white\", \"black\", \"gray\": filled image with respective colors\n"
+        "          \"cb\": checkerboard pattern\n"
+        "        --audio can be:\n"
+        "          \"one\", \"zero\", \"half\": filled 1.0f, 0.0f and 0.5f respectively\n"
+        "          \"440\": sine wave with 440 Hz frequency\n"
+        "\n",
+        argv[0]
+    );
+}
+
+int main(int argc, char ** argv) {
+    std::setlocale(LC_NUMERIC, "C");
+
+    ggml_time_init();
+
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MTMD, show_additional_info)) {
+        return 1;
+    }
+
+    common_init();
+    mtmd_helper_log_set(common_log_default_callback, nullptr);
+
+    if (params.mmproj.path.empty()) {
+        show_additional_info(argc, argv);
+        LOG_ERR("ERR: Missing --mmproj argument\n");
+        return 1;
+    }
+
+    LOG_INF("%s: loading model: %s\n", __func__, params.model.path.c_str());
+
+    mtmd::context_ptr ctx_mtmd;
+    common_init_result_ptr llama_init;
+    base_callback_data cb_data;
+
+    llama_init = common_init_from_params(params);
+    {
+        auto * model = llama_init->model();
+        const char * clip_path = params.mmproj.path.c_str();
+        mtmd_context_params mparams = mtmd_context_params_default();
+        mparams.use_gpu          = params.mmproj_use_gpu;
+        mparams.print_timings    = true;
+        mparams.n_threads        = params.cpuparams.n_threads;
+        mparams.flash_attn_type  = params.flash_attn_type;
+        mparams.warmup           = params.warmup;
+        mparams.image_min_tokens = params.image_min_tokens;
+        mparams.image_max_tokens = params.image_max_tokens;
+        {
+            // always enable debug callback
+            mparams.cb_eval_user_data = &cb_data;
+            mparams.cb_eval = common_debug_cb_eval<false>;
+        }
+        ctx_mtmd.reset(mtmd_init_from_file(clip_path, model, mparams));
+        if (!ctx_mtmd.get()) {
+            LOG_ERR("Failed to load vision model from %s\n", clip_path);
+            exit(1);
+        }
+    }
+
+    std::string input;
+    int32_t inp_size = params.n_predict;
+    if (params.image.empty()) {
+        LOG_ERR("ERR: At least one of --image or --audio must be specified\n");
+        return 1;
+    }
+    if (inp_size <= 0) {
+        LOG_ERR("ERR: Invalid size specified with -n, must be greater than 0\n");
+        return 1;
+    }
+    input = params.image[0];
+
+    if (params.prompt.empty() || params.prompt == "encode") {
+        std::vector<std::vector<float>> image;
+        std::vector<float> samples;
+
+        if (input == "black") {
+            for (int i = 0; i < inp_size; ++i) {
+                auto row = std::vector<float>(inp_size * 3, 0.0f);
+                image.push_back(row);
+            }
+        } else if (input == "white") {
+            for (int i = 0; i < inp_size; ++i) {
+                auto row = std::vector<float>(inp_size * 3, 1.0f);
+                image.push_back(row);
+            }
+        } else if (input == "gray") {
+            for (int i = 0; i < inp_size; ++i) {
+                auto row = std::vector<float>(inp_size * 3, 0.5f);
+                image.push_back(row);
+            }
+        } else if (input == "cb") {
+            for (int i = 0; i < inp_size; ++i) {
+                auto row = std::vector<float>(inp_size * 3, 0.0f);
+                image.push_back(row);
+            }
+            for (int y = 0; y < inp_size; ++y) {
+                for (int x = 0; x < inp_size; ++x) {
+                    float v = ((x + y) % 2) ? 0.0f : 1.0f;
+                    image[y][x * 3 + 0] = v;
+                    image[y][x * 3 + 1] = v;
+                    image[y][x * 3 + 2] = v;
+                }
+            }
+        } else if (input == "one") {
+            samples = std::vector<float>(inp_size, 1.0f);
+        } else if (input == "zero") {
+            samples = std::vector<float>(inp_size, 0.0f);
+        } else if (input == "half") {
+            samples = std::vector<float>(inp_size, 0.5f);
+        } else if (input == "1010") {
+            samples.resize(inp_size);
+            for (int i = 0; i < inp_size; ++i) {
+                samples[i] = (i % 2) ? 0.0f : 1.0f;
+            }
+        } else {
+            LOG_ERR("ERR: Invalid input specified with --image/--audio\n");
+            show_additional_info(argc, argv);
+            return 1;
+        }
+
+        // run encode pass
+        LOG_INF("Running encode pass for input type: %s\n", input.c_str());
+        if (samples.size() > 0) {
+            LOG_INF("Input audio with %zu samples, type: %s\n", samples.size(), input.c_str());
+            mtmd_debug_encode_audio(ctx_mtmd.get(), samples);
+        } else {
+            LOG_INF("Input image with dimensions %d x %d, type: %s\n", inp_size, inp_size, input.c_str());
+            mtmd_debug_encode_image(ctx_mtmd.get(), image);
+        }
+
+    } else if (params.prompt == "preproc") {
+        std::vector<uint8_t> rgb_values;
+        std::vector<float> pcm_samples;
+
+        if (input == "black") {
+            rgb_values = std::vector<uint8_t>(inp_size * inp_size * 3, 0);
+        } else if (input == "white") {
+            rgb_values = std::vector<uint8_t>(inp_size * inp_size * 3, 255);
+        } else if (input == "gray") {
+            rgb_values = std::vector<uint8_t>(inp_size * inp_size * 3, 128);
+        } else if (input == "cb") {
+            rgb_values.resize(inp_size * inp_size * 3);
+            for (int y = 0; y < inp_size; ++y) {
+                for (int x = 0; x < inp_size; ++x) {
+                    uint8_t v = ((x + y) % 2) ? 0 : 255;
+                    rgb_values[(y * inp_size + x) * 3 + 0] = v;
+                    rgb_values[(y * inp_size + x) * 3 + 1] = v;
+                    rgb_values[(y * inp_size + x) * 3 + 2] = v;
+                }
+            }
+        } else if (input == "one") {
+            pcm_samples = std::vector<float>(inp_size, 1.0f);
+        } else if (input == "zero") {
+            pcm_samples = std::vector<float>(inp_size, 0.0f);
+        } else if (input == "half") {
+            pcm_samples = std::vector<float>(inp_size, 0.5f);
+        } else if (input == "440") {
+            pcm_samples.resize(inp_size);
+            float freq = 440.0f;
+            float sample_rate = mtmd_get_audio_sample_rate(ctx_mtmd.get());
+            float pi = 3.14159265f;
+            for (int i = 0; i < inp_size; ++i) {
+                pcm_samples[i] = sinf(2 * pi * freq * i / sample_rate);
+            }
+        } else {
+            LOG_ERR("ERR: Invalid input specified with --image/--audio\n");
+            show_additional_info(argc, argv);
+            return 1;
+        }
+
+        // run preprocessing pass
+        LOG_INF("Running preprocessing pass for input type: %s\n", input.c_str());
+        if (pcm_samples.size() > 0) {
+            LOG_INF("Input audio with %zu samples, type: %s\n", pcm_samples.size(), input.c_str());
+            mtmd_debug_preprocess_audio(ctx_mtmd.get(), pcm_samples);
+        } else {
+            LOG_INF("Input image with dimensions %d x %d, type: %s\n", inp_size, inp_size, input.c_str());
+            mtmd_debug_preprocess_image(ctx_mtmd.get(), rgb_values, inp_size, inp_size);
+        }
+
+    } else {
+        LOG_ERR("ERR: Invalid mode specified with -p\n");
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+    return 0;
+}
+
--- a/tools/mtmd/debug/mtmd-debug.h
+++ b/tools/mtmd/debug/mtmd-debug.h
@ -0,0 +1,17 @@
+#pragma once
+
+#include "mtmd.h"
+
+#include <vector>
+
+// INTERNAL HEADER FOR DEBUGGING PURPOSES ONLY
+// NOT INTENDED FOR PUBLIC USE
+// Do not raise issues related to this debugging API
+
+// encode take the pre-processed f32 values, print the intermidiate values via cb_eval callback
+MTMD_API void mtmd_debug_encode_image(mtmd_context * ctx, const std::vector<std::vector<float>> & image);
+MTMD_API void mtmd_debug_encode_audio(mtmd_context * ctx, const std::vector<float> & input); // will be broadcasted to fit n_mel
+
+// preprocess take the raw input values
+MTMD_API void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector<uint8_t> & rgb_values, int nx, int ny);
+MTMD_API void mtmd_debug_preprocess_audio(mtmd_context * ctx, const std::vector<float> & pcm_samples);
--- a/tools/mtmd/debug/mtmd-debug.md
+++ b/tools/mtmd/debug/mtmd-debug.md
@ -0,0 +1,25 @@
+# mtmd-debug
+
+## Debugging encode pass
+
+Example of debugging an input gray image (raw, not preprocessed):
+
+```py
+from transformers import AutoModel
+
+model = AutoModel.from_pretrained(...)
+
+def test_vision():
+  img_size = 896 # number of patches per side
+  pixel_values = torch.zeros(1, 3, img_size, img_size) + 0.5 # gray image
+  with torch.no_grad():
+    outputs = model.model.get_image_features(pixel_values=pixel_values)
+  print("last_hidden_state shape:", outputs.last_hidden_state.shape)
+  print("last_hidden_state:", outputs.last_hidden_state)
+
+test_vision()
+```
+
+## Debugging preprocess pass
+
+(TODO)
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@ -470,12 +470,12 @@ static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int
 mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) {
    if (audio_helpers::is_audio_file((const char *)buf, len)) {
        std::vector<float> pcmf32;
-        int bitrate = mtmd_get_audio_bitrate(ctx);
-        if (bitrate < 0) {
+        const int sample_rate = mtmd_get_audio_sample_rate(ctx);
+        if (sample_rate < 0) {
            LOG_ERR("This model does not support audio input\n");
            return nullptr;
        }
-        if (!audio_helpers::decode_audio_from_buf(buf, len, bitrate, pcmf32)) {
+        if (!audio_helpers::decode_audio_from_buf(buf, len, sample_rate, pcmf32)) {
            LOG_ERR("Unable to read WAV audio file from buffer\n");
            return nullptr;
        }
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@ -2,6 +2,7 @@
 #include "clip-impl.h"
 #include "mtmd.h"
 #include "mtmd-audio.h"
+#include "debug/mtmd-debug.h"

 #include "llama.h"

@ -912,7 +913,7 @@ bool mtmd_support_audio(mtmd_context * ctx) {
    return ctx->ctx_a != nullptr;
 }

-int mtmd_get_audio_bitrate(mtmd_context * ctx) {
+int mtmd_get_audio_sample_rate(mtmd_context * ctx) {
    if (!ctx->ctx_a) {
        return -1;
    }
@ -1157,3 +1158,104 @@ void mtmd_log_set(ggml_log_callback log_callback, void * user_data) {
    g_logger_state.log_callback = log_callback ? log_callback : clip_log_callback_default;
    g_logger_state.log_callback_user_data = user_data;
 }
+
+//
+// Debugging API (NOT intended for public use)
+//
+
+static void mtmd_debug_encode_impl(mtmd_context * ctx, clip_ctx * ctx_clip, clip_image_f32 & image) {
+    clip_set_debug_output_embeddings(ctx_clip, true);
+    int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
+    int n_tokens = clip_n_output_tokens(ctx_clip, &image);
+    std::vector<float> embd_output(n_tokens * n_mmproj_embd, 0.0f);
+    bool ok = clip_image_encode(
+        ctx_clip,
+        ctx->n_threads,
+        &image,
+        embd_output.data());
+    if (!ok) {
+        LOG_ERR("%s: failed to encode image\n", __func__);
+    }
+}
+
+void mtmd_debug_encode_image(mtmd_context * ctx, const std::vector<std::vector<float>> & image) {
+    if (!ctx->ctx_v) {
+        LOG_ERR("%s: model does not support vision input\n", __func__);
+        return;
+    }
+    clip_image_f32 inp_image;
+    inp_image.nx = image.size();
+    inp_image.ny = inp_image.nx;
+    inp_image.buf.reserve(inp_image.nx * inp_image.ny);
+    for (const auto & row : image) {
+        inp_image.buf.insert(inp_image.buf.end(), row.begin(), row.end());
+    }
+    LOG_INF("%s: created input image with nx=%d, ny=%d\n", __func__, inp_image.nx, inp_image.ny);
+    mtmd_debug_encode_impl(ctx, ctx->ctx_v, inp_image);
+}
+
+void mtmd_debug_encode_audio(mtmd_context * ctx, const std::vector<float> & input) {
+    if (!ctx->ctx_a) {
+        LOG_ERR("%s: model does not support audio input\n", __func__);
+        return;
+    }
+    int n_mel = clip_get_hparams(ctx->ctx_a)->n_mel_bins;
+    clip_image_f32 inp_audio;
+    inp_audio.nx = input.size();
+    inp_audio.ny = n_mel;
+    inp_audio.buf.resize(input.size() * n_mel);
+    for (size_t i = 0; i < input.size(); i++) {
+        for (int j = 0; j < n_mel; j++) {
+            inp_audio.buf[j * inp_audio.nx + i] = input[i];
+        }
+    }
+    LOG_INF("%s: created input audio with nx=%d, ny=%d\n", __func__, inp_audio.nx, inp_audio.ny);
+    mtmd_debug_encode_impl(ctx, ctx->ctx_a, inp_audio);
+}
+
+void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector<uint8_t> & rgb_values, int nx, int ny) {
+    if (!ctx->ctx_v) {
+        LOG_ERR("%s: model does not support vision input\n", __func__);
+        return;
+    }
+    clip_image_u8 img_u8;
+    img_u8.nx = nx;
+    img_u8.ny = ny;
+    img_u8.buf = rgb_values;
+    clip_image_f32_batch batch_f32;
+    bool ok = clip_image_preprocess(ctx->ctx_v, &img_u8, &batch_f32);
+    if (!ok) {
+        LOG_ERR("%s: failed to preprocess image\n", __func__);
+        return;
+    }
+    LOG_INF("%s: preprocessed image to batch_f32 with %d entries\n", __func__, (int)batch_f32.entries.size());
+    for (size_t i = 0; i < batch_f32.entries.size(); i++) {
+        LOG_INF("%s: entry %zu has nx=%d, ny=%d\n", __func__, i, batch_f32.entries[i]->nx, batch_f32.entries[i]->ny);
+        // TODO: better way to dump entry content?
+    }
+}
+
+void mtmd_debug_preprocess_audio(mtmd_context * ctx, const std::vector<float> & samples) {
+    if (!ctx->ctx_a) {
+        LOG_ERR("%s: model does not support audio input\n", __func__);
+        return;
+    }
+    std::vector<mtmd_audio_mel> mel_spec_chunks;
+    bool ok = ctx->audio_preproc->preprocess(samples.data(), samples.size(), mel_spec_chunks);
+    if (!ok) {
+        LOG_ERR("%s: failed to preprocess audio\n", __func__);
+        return;
+    }
+    LOG_INF("%s: preprocessed audio to %zu mel spec chunks\n", __func__, mel_spec_chunks.size());
+    for (size_t i = 0; i < mel_spec_chunks.size(); i++) {
+        LOG_INF("%s: mel spec chunk %zu has n_len=%d, n_mel=%d\n", __func__, i, mel_spec_chunks[i].n_len, mel_spec_chunks[i].n_mel);
+
+        // dump mel entries: data is stored as [n_mel][n_len] (mel-major)
+        const auto & mel = mel_spec_chunks[i];
+        for (int m = 0; m < mel.n_mel; m++) {
+            for (int t = 0; t < mel.n_len; t++) {
+                LOG_INF("mel[%zu][m=%d][t=%d] = %f\n", i, m, t, mel.data[m * mel.n_len + t]);
+            }
+        }
+    }
+}
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@ -125,9 +125,9 @@ MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
 // whether the current model supports audio input
 MTMD_API bool mtmd_support_audio(mtmd_context * ctx);

-// get audio bitrate in Hz, for example 16000 for Whisper
+// get audio sample rate in Hz, for example 16000 for Whisper
 // return -1 if audio is not supported
-MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
+MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx);

 // mtmd_bitmap
 //
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@ -1189,6 +1189,9 @@ private:
            ? SLOT_STATE_WAIT_OTHER // wait for the parent to process prompt
            : SLOT_STATE_STARTED;

+        // reset server kill-switch counter
+        n_empty_consecutive = 0;
+
        SLT_INF(slot, "processing task, is_child = %d\n", slot.task->is_child());
        return true;
    }
--- a/tools/server/tests/unit/test_embedding.py
+++ b/tools/server/tests/unit/test_embedding.py
@ -101,6 +101,40 @@ def test_embedding_mixed_input(input, is_multi_prompt: bool):
        assert len(data[0]['embedding']) > 1


+def test_embedding_pooling_mean():
+    global server
+    server.pooling = 'mean'
+    server.start()
+    res = server.make_request("POST", "/v1/embeddings", data={
+        "input": "I believe the meaning of life is",
+    })
+    assert res.status_code == 200
+    assert len(res.body['data']) == 1
+    assert 'embedding' in res.body['data'][0]
+    assert len(res.body['data'][0]['embedding']) > 1
+
+    # make sure embedding vector is normalized
+    assert abs(sum([x ** 2 for x in res.body['data'][0]['embedding']]) - 1) < EPSILON
+
+
+def test_embedding_pooling_mean_multiple():
+    global server
+    server.pooling = 'mean'
+    server.start()
+    res = server.make_request("POST", "/v1/embeddings", data={
+        "input": [
+            "I believe the meaning of life is",
+            "Write a joke about AI",
+            "This is a test",
+        ],
+    })
+    assert res.status_code == 200
+    assert len(res.body['data']) == 3
+    for d in res.body['data']:
+        assert 'embedding' in d
+        assert len(d['embedding']) > 1
+
+
 def test_embedding_pooling_none():
    global server
    server.pooling = 'none'
--- a/tools/server/webui/src/lib/components/app/dialogs/DialogCodePreview.svelte
+++ b/tools/server/webui/src/lib/components/app/dialogs/DialogCodePreview.svelte
@ -37,7 +37,7 @@
 			<iframe
 				bind:this={iframeRef}
 				title="Preview {language}"
-				sandbox="allow-scripts allow-same-origin"
+				sandbox="allow-scripts"
 				class="code-preview-iframe"
 			></iframe>

--- a/vendor/cpp-httplib/httplib.cpp
+++ b/vendor/cpp-httplib/httplib.cpp
@ -1995,9 +1995,9 @@ int getaddrinfo_with_timeout(const char *node, const char *service,
    memcpy((*current)->ai_addr, sockaddr_ptr, sockaddr_len);

    // Set port if service is specified
-    if (service && strlen(service) > 0) {
-      int port = atoi(service);
-      if (port > 0) {
+    if (service && *service) {
+      int port = 0;
+      if (parse_port(service, strlen(service), port)) {
        if (sockaddr_ptr->sa_family == AF_INET) {
          reinterpret_cast<struct sockaddr_in *>((*current)->ai_addr)
              ->sin_port = htons(static_cast<uint16_t>(port));
@ -3016,6 +3016,16 @@ bool read_headers(Stream &strm, Headers &headers) {
    header_count++;
  }

+  // RFC 9110 Section 8.6: Reject requests with multiple Content-Length
+  // headers that have different values to prevent request smuggling.
+  auto cl_range = headers.equal_range("Content-Length");
+  if (cl_range.first != cl_range.second) {
+    const auto &first_val = cl_range.first->second;
+    for (auto it = std::next(cl_range.first); it != cl_range.second; ++it) {
+      if (it->second != first_val) { return false; }
+    }
+  }
+
  return true;
 }

@ -7522,6 +7532,10 @@ bool Server::listen_internal() {
      detail::set_socket_opt_time(sock, SOL_SOCKET, SO_SNDTIMEO,
                                  write_timeout_sec_, write_timeout_usec_);

+      if (tcp_nodelay_) {
+        detail::set_socket_opt(sock, IPPROTO_TCP, TCP_NODELAY, 1);
+      }
+
      if (!task_queue->enqueue(
              [this, sock]() { process_and_close_socket(sock); })) {
        output_error_log(Error::ResourceExhaustion, nullptr);
@ -8911,7 +8925,7 @@ bool ClientImpl::redirect(Request &req, Response &res, Error &error) {

  auto next_port = port_;
  if (!port_str.empty()) {
-    next_port = std::stoi(port_str);
+    if (!detail::parse_port(port_str, next_port)) { return false; }
  } else if (!next_scheme.empty()) {
    next_port = next_scheme == "https" ? 443 : 80;
  }
@ -8962,18 +8976,10 @@ bool ClientImpl::create_redirect_client(
    // Setup basic client configuration first
    setup_redirect_client(redirect_client);

-    // SSL-specific configuration for proxy environments
-    if (!proxy_host_.empty() && proxy_port_ != -1) {
-      // Critical: Disable SSL verification for proxy environments
-      redirect_client.enable_server_certificate_verification(false);
-      redirect_client.enable_server_hostname_verification(false);
-    } else {
-      // For direct SSL connections, copy SSL verification settings
-      redirect_client.enable_server_certificate_verification(
-          server_certificate_verification_);
-      redirect_client.enable_server_hostname_verification(
-          server_hostname_verification_);
-    }
+    redirect_client.enable_server_certificate_verification(
+        server_certificate_verification_);
+    redirect_client.enable_server_hostname_verification(
+        server_hostname_verification_);

    // Transfer CA certificate to redirect client
    if (!ca_cert_pem_.empty()) {
@ -10690,7 +10696,8 @@ Client::Client(const std::string &scheme_host_port,
    if (host.empty()) { host = m[3].str(); }

    auto port_str = m[4].str();
-    auto port = !port_str.empty() ? std::stoi(port_str) : (is_ssl ? 443 : 80);
+    auto port = is_ssl ? 443 : 80;
+    if (!port_str.empty() && !detail::parse_port(port_str, port)) { return; }

    if (is_ssl) {
 #ifdef CPPHTTPLIB_SSL_ENABLED
@ -16103,7 +16110,8 @@ WebSocketClient::WebSocketClient(
    if (host_.empty()) { host_ = m[3].str(); }

    auto port_str = m[4].str();
-    port_ = !port_str.empty() ? std::stoi(port_str) : (is_ssl ? 443 : 80);
+    port_ = is_ssl ? 443 : 80;
+    if (!port_str.empty() && !detail::parse_port(port_str, port_)) { return; }

    path_ = m[5].str();

--- a/vendor/cpp-httplib/httplib.h
+++ b/vendor/cpp-httplib/httplib.h
@ -8,8 +8,8 @@
 #ifndef CPPHTTPLIB_HTTPLIB_H
 #define CPPHTTPLIB_HTTPLIB_H

-#define CPPHTTPLIB_VERSION "0.37.1"
-#define CPPHTTPLIB_VERSION_NUM "0x002501"
+#define CPPHTTPLIB_VERSION "0.37.2"
+#define CPPHTTPLIB_VERSION_NUM "0x002502"

 #ifdef _WIN32
 #if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00
@ -689,6 +689,18 @@ inline from_chars_result<double> from_chars(const char *first, const char *last,
  return {first + (endptr - s.c_str()), std::errc{}};
 }

+inline bool parse_port(const char *s, size_t len, int &port) {
+  int val = 0;
+  auto r = from_chars(s, s + len, val);
+  if (r.ec != std::errc{} || val < 1 || val > 65535) { return false; }
+  port = val;
+  return true;
+}
+
+inline bool parse_port(const std::string &s, int &port) {
+  return parse_port(s.data(), s.size(), port);
+}
+
 } // namespace detail

 enum SSLVerifierResponse {