Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.devops/openvino.Dockerfile
#	.github/workflows/build-self-hosted.yml
#	.github/workflows/build.yml
#	common/chat.cpp
#	docs/backend/OPENVINO.md
#	examples/speculative-simple/speculative-simple.cpp
#	ggml/src/ggml-hexagon/ggml-hexagon.cpp
#	ggml/src/ggml-hexagon/htp/CMakeLists.txt
#	ggml/src/ggml-hexagon/htp/htp-ctx.h
#	ggml/src/ggml-hexagon/htp/htp-ops.h
#	ggml/src/ggml-hexagon/htp/main.c
#	ggml/src/ggml-hexagon/libggml-htp.inf
#	ggml/src/ggml-openvino/ggml-decoder.cpp
#	ggml/src/ggml-openvino/ggml-openvino-extra.cpp
#	ggml/src/ggml-openvino/ggml-openvino.cpp
#	ggml/src/ggml-openvino/ggml-quants.cpp
#	ggml/src/ggml-openvino/openvino/op/rope.cpp
#	ggml/src/ggml-openvino/openvino/op_table.cpp
#	ggml/src/ggml-openvino/openvino/op_table.h
#	ggml/src/ggml-openvino/openvino/translate_session.cpp
#	ggml/src/ggml-openvino/openvino/utils.cpp
#	ggml/src/ggml-openvino/openvino/utils.h
#	ggml/src/ggml-openvino/utils.cpp
#	ggml/src/ggml-openvino/utils.h
#	ggml/src/ggml-sycl/common.hpp
#	ggml/src/ggml-sycl/convert.cpp
#	ggml/src/ggml-sycl/convert.hpp
#	ggml/src/ggml-sycl/gemm.hpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	ggml/src/ggml-sycl/set_rows.cpp
#	ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp
#	ggml/src/ggml-webgpu/ggml-webgpu.cpp
#	scripts/sync_vendor.py
#	tests/CMakeLists.txt
#	tests/test-chat.cpp
#	tools/cli/cli.cpp
#	tools/mtmd/CMakeLists.txt
#	tools/server/CMakeLists.txt
Commit 0755f27372 by Concedo, 2026-04-23 00:55:05 +08:00
42 changed files with 1531 additions and 3199 deletions


@@ -150,7 +150,7 @@
#define TN_TOK_BOI "v.boi"
#define TN_TOK_EOI "v.eoi"
// hunyuanocr / hunyuanvl (shared GGUF tensor names)
#define TN_MM_PRE_NORM "mm.pre_norm.%s"
#define TN_TOK_IMG_BEGIN "mm.image_begin"
#define TN_TOK_IMG_END "mm.image_end"
@@ -242,6 +242,15 @@
#define TN_STD_BIAS "v.std_bias"
#define TN_STD_SCALE "v.std_scale"
// yasa2
#define TN_YASA_PATCH_LN_W "v.patch_ln.weight"
#define TN_YASA_PATCH_LN_B "v.patch_ln.bias"
#define TN_YASA_BACKBONE_LN_W "v.backbone_ln.weight"
#define TN_YASA_BACKBONE_LN_B "v.backbone_ln.bias"
#define TN_YASA_POS_EMBD "v.vision_pos_embed"
#define TN_YASA_STAGE_DOWN_LN "v.stage.%d.down.ln.%s"
#define TN_YASA_STAGE_DOWN_CONV "v.stage.%d.down.conv.%s"
#define TN_YASA_STAGE_BLK "v.stage.%d.blk.%d.%s.%s"
// align x to upper multiple of n
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
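
As a quick aside (not part of the diff): CLIP_ALIGN rounds its first argument up to the next multiple of the second using integer division. A minimal standalone check:

// standalone sketch verifying CLIP_ALIGN's round-up behavior
#include <cassert>
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
int main() {
    assert(CLIP_ALIGN(100, 32) == 128); // 100 rounds up to the next multiple of 32
    assert(CLIP_ALIGN(128, 32) == 128); // already-aligned values pass through
    assert(CLIP_ALIGN(1, 8) == 8);
    return 0;
}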
@@ -290,9 +299,11 @@ enum projector_type {
PROJECTOR_TYPE_LFM2A,
PROJECTOR_TYPE_GLM4V,
PROJECTOR_TYPE_YOUTUVL,
PROJECTOR_TYPE_YASA2,
PROJECTOR_TYPE_KIMIK25,
PROJECTOR_TYPE_NEMOTRON_V2_VL,
PROJECTOR_TYPE_HUNYUANOCR,
PROJECTOR_TYPE_HUNYUANVL,
PROJECTOR_TYPE_UNKNOWN,
};
@@ -335,9 +346,11 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_LFM2A, "lfm2a"},
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
{ PROJECTOR_TYPE_YASA2, "yasa2"},
{ PROJECTOR_TYPE_KIMIK25, "kimik25"},
{ PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
{ PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"},
{ PROJECTOR_TYPE_HUNYUANVL, "hunyuanvl"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {


@@ -268,6 +268,27 @@ struct mobilenetv5_block {
ggml_tensor * attn_norm_w = nullptr;
};
struct yasa2_block {
ggml_tensor * dw_w = nullptr;
ggml_tensor * dw_b = nullptr;
ggml_tensor * ln_w = nullptr;
ggml_tensor * ln_b = nullptr;
ggml_tensor * pw1_w = nullptr;
ggml_tensor * pw1_b = nullptr;
ggml_tensor * grn_w = nullptr;
ggml_tensor * grn_b = nullptr;
ggml_tensor * pw2_w = nullptr;
ggml_tensor * pw2_b = nullptr;
};
struct yasa2_stage {
ggml_tensor * down_ln_w = nullptr;
ggml_tensor * down_ln_b = nullptr;
ggml_tensor * down_conv_w = nullptr;
ggml_tensor * down_conv_b = nullptr;
std::vector<yasa2_block> blocks;
};
struct clip_model {
clip_modality modality = CLIP_MODALITY_VISION;
projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -402,6 +423,15 @@ struct clip_model {
ggml_tensor * msfa_ffn_expand_bn = nullptr;
ggml_tensor * msfa_ffn_project_bn = nullptr;
// yasa2
ggml_tensor * yasa_patch_w = nullptr;
ggml_tensor * yasa_patch_b = nullptr;
ggml_tensor * yasa_patch_ln_w = nullptr;
ggml_tensor * yasa_patch_ln_b = nullptr;
ggml_tensor * yasa_backbone_ln_w = nullptr;
ggml_tensor * yasa_backbone_ln_b = nullptr;
ggml_tensor * yasa_vision_pos_embed = nullptr;
std::vector<yasa2_stage> yasa_stages;
// pixtral, glm4v
ggml_tensor * token_embd_img_break = nullptr;


@@ -76,6 +76,7 @@
#include "models/deepseekocr.cpp"
#include "models/mobilenetv5.cpp"
#include "models/youtuvl.cpp"
#include "models/yasa2.cpp"
struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL};
@@ -969,6 +970,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_HUNYUANVL:
{
builder = std::make_unique<clip_graph_hunyuanocr>(ctx, img);
} break;
@@ -1004,6 +1006,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_youtuvl>(ctx, img);
} break;
case PROJECTOR_TYPE_YASA2:
{
builder = std::make_unique<clip_graph_yasa2>(ctx, img);
} break;
default:
GGML_ABORT("missing cgraph builder");
}
@@ -1474,6 +1480,16 @@ struct clip_model_loader {
hparams.set_limit_image_tokens(1, 62500);
hparams.set_warmup_n_tokens(16*16); // avoid OOM on warmup
} break;
case PROJECTOR_TYPE_YASA2:
{
hparams.ffn_op = FFN_GELU_ERF;
log_ffn_op = "gelu_erf";
hparams.image_resize_algo = RESIZE_ALGO_BICUBIC;
// the reka model performs better when using resize_bicubic, which
// stretches the image to fit a fixed square size
hparams.image_resize_pad = false;
} break;
case PROJECTOR_TYPE_GLM4V:
{
hparams.rope_theta = 10000.0f;
@@ -1544,6 +1560,16 @@ struct clip_model_loader {
get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
hparams.set_warmup_n_tokens(28*28);
} break;
case PROJECTOR_TYPE_HUNYUANVL:
{
hparams.n_merge = 2;
hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
hparams.image_resize_pad = false;
hparams.ffn_op = FFN_GELU;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
hparams.set_limit_image_tokens(256, 16384);
hparams.set_warmup_n_tokens(32*32);
} break;
case PROJECTOR_TYPE_LFM2A:
{
// audio preprocessing params
@@ -1929,6 +1955,55 @@ struct clip_model_loader {
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); // merger.mlp.2
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_YASA2:
{
// reuse tensors already loaded by the common section
// (TN_PATCH_EMBD and TN_PATCH_BIAS have the same tensor names)
GGML_ASSERT(model.patch_embeddings_0 && "yasa2 requires v.patch_embd.weight");
model.yasa_patch_w = model.patch_embeddings_0;
model.yasa_patch_b = model.patch_bias;
model.yasa_patch_ln_w = get_tensor(TN_YASA_PATCH_LN_W, false);
model.yasa_patch_ln_b = get_tensor(TN_YASA_PATCH_LN_B, false);
model.yasa_backbone_ln_w = get_tensor(TN_YASA_BACKBONE_LN_W, false);
model.yasa_backbone_ln_b = get_tensor(TN_YASA_BACKBONE_LN_B, false);
model.yasa_vision_pos_embed = get_tensor(TN_YASA_POS_EMBD, false);
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
model.yasa_stages.clear();
for (int s = 0; ; ++s) {
yasa2_stage stage;
stage.down_ln_w = get_tensor(string_format(TN_YASA_STAGE_DOWN_LN, s, "weight"), false);
stage.down_ln_b = get_tensor(string_format(TN_YASA_STAGE_DOWN_LN, s, "bias"), false);
stage.down_conv_w = get_tensor(string_format(TN_YASA_STAGE_DOWN_CONV, s, "weight"), false);
stage.down_conv_b = get_tensor(string_format(TN_YASA_STAGE_DOWN_CONV, s, "bias"), false);
for (int bi = 0; ; ++bi) {
yasa2_block blk;
blk.dw_w = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "dw", "weight"), false);
if (!blk.dw_w) {
break;
}
blk.dw_b = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "dw", "bias"), false);
blk.ln_w = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "ln", "weight"), false);
blk.ln_b = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "ln", "bias"), false);
blk.pw1_w = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "pw1", "weight"), false);
blk.pw1_b = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "pw1", "bias"), false);
blk.grn_w = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "grn", "weight"), false);
blk.grn_b = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "grn", "bias"), false);
blk.pw2_w = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "pw2", "weight"), false);
blk.pw2_b = get_tensor(string_format(TN_YASA_STAGE_BLK, s, bi, "pw2", "bias"), false);
stage.blocks.push_back(blk);
}
if (!stage.down_conv_w && stage.blocks.empty()) {
break;
}
model.yasa_stages.push_back(std::move(stage));
}
} break;
case PROJECTOR_TYPE_GLM4V:
{
model.mm_fc_w = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
@@ -2249,6 +2324,7 @@ struct clip_model_loader {
model.mm_eoi = get_tensor(TN_TOK_EOI);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_HUNYUANVL:
{
// proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear)
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
@@ -3062,6 +3138,19 @@ void setup_init_vision_shim_kcpp(struct clip_ctx * ctx_v) {
img_end = "<|vision_end|>";
image_preproc = std::make_unique<mtmd_image_preprocessor_youtuvl>(ctx_v);
} break;
case PROJECTOR_TYPE_YASA2:
{
img_beg = "<image>";
img_end = "</image>";
// Currently only supports single-tile preprocessing: any input is downscaled
// to one image_size x image_size tile (64 output tokens via 8x8 adaptive avg
// pool).
// However, the model itself supports llava-uhd multi-tile tiling for high-res
// images. This will be implemented in a future PR (dispatch on has_pinpoints
// - see LDP/COGVLM branch above) that also emits image_grid_pinpoints in the
// conversion script.
image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
} break;
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA3NV:
{
@@ -3199,6 +3288,7 @@ void setup_init_vision_shim_kcpp(struct clip_ctx * ctx_v) {
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_HUNYUANVL:
{
// note: these use fullwidth ｜ (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
img_beg = "<hy_place▁holder▁no▁100>";
@@ -3287,6 +3377,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_PADDLEOCR:
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_HUNYUANVL:
case PROJECTOR_TYPE_YOUTUVL:
return (img->nx / params.patch_size) / 2;
case PROJECTOR_TYPE_STEP3VL:
@@ -3306,6 +3397,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_PADDLEOCR:
case PROJECTOR_TYPE_HUNYUANVL:
case PROJECTOR_TYPE_YOUTUVL:
return (img->ny / params.patch_size) / 2;
case PROJECTOR_TYPE_STEP3VL:
@@ -3333,6 +3425,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
{
// do nothing
} break;
case PROJECTOR_TYPE_YASA2:
{
n_patches = 64; // adaptive average pooling to 8x8 tokens
} break;
case PROJECTOR_TYPE_LDP:
case PROJECTOR_TYPE_LDPV2:
case PROJECTOR_TYPE_GLM_EDGE:
@@ -3493,6 +3589,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
n_patches = h * (h + 1) + 1;
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_HUNYUANVL:
{
int merge = ctx->model.hparams.n_merge;
int ow = (img->nx / patch_size) / merge;
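
For intuition, a standalone sketch of the merged-grid arithmetic with hypothetical numbers (the real image size, patch size, and merge factor come from the model's hparams):

#include <cstdio>
int main() {
    const int nx = 512, patch_size = 16, merge = 2; // hypothetical values
    const int ow = (nx / patch_size) / merge;       // 32 patches per row, merged 2x -> 16
    std::printf("tokens per row: %d, square grid tokens: %d\n", ow, ow * ow);
    return 0;
}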
@@ -3953,9 +4050,74 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
case PROJECTOR_TYPE_PHI4:
case PROJECTOR_TYPE_COGVLM:
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_YASA2:
{
// do nothing
} break;
case PROJECTOR_TYPE_HUNYUANVL:
{
// Compute the HunyuanVL 2D position embedding on CPU (with the
// custom sf=(target+0.1)/n_grid bilinear sampling that the
// reference implementation uses) and upload it to the graph
// input declared in clip_graph_hunyuanocr::build().
GGML_ASSERT(model.position_embeddings != nullptr);
ggml_tensor * src_t = model.position_embeddings;
const int64_t n_embd = src_t->ne[0];
const int64_t n_pos = src_t->ne[1]; // = n_grid * n_grid
const int n_grid = (int)std::lround(std::sqrt((double)n_pos));
GGML_ASSERT((int64_t)n_grid * n_grid == n_pos);
const int out_w = pos_w; // pw
const int out_h = pos_h; // ph
// Pull weight to host.
std::vector<float> src(n_embd * n_pos);
ggml_backend_tensor_get(src_t, src.data(), 0, ggml_nbytes(src_t));
// Output layout matches ggml_new_tensor_2d(F32, n_embd, out_h*out_w):
// ne[0] = n_embd (fastest), ne[1] = out_h*out_w
// dst[(y*out_w + x) * n_embd + c]
std::vector<float> dst((size_t)n_embd * out_h * out_w);
const float sx = (float)(out_w + 0.1f) / (float)n_grid;
const float sy = (float)(out_h + 0.1f) / (float)n_grid;
for (int y = 0; y < out_h; ++y) {
// Match ggml_compute_forward_upscale_f32 pixel-center
// convention (align_corners=False): src_y = (y+0.5)/sy - 0.5.
const float fy = ((float)y + 0.5f) / sy - 0.5f;
int y0 = (int)std::floor(fy);
int y1 = y0 + 1;
y0 = std::clamp(y0, 0, n_grid - 1);
y1 = std::clamp(y1, 0, n_grid - 1);
float wy1 = std::clamp(fy - (float)y0, 0.0f, 1.0f);
const float wy0 = 1.0f - wy1;
for (int x = 0; x < out_w; ++x) {
const float fx = ((float)x + 0.5f) / sx - 0.5f;
int x0 = (int)std::floor(fx);
int x1 = x0 + 1;
x0 = std::clamp(x0, 0, n_grid - 1);
x1 = std::clamp(x1, 0, n_grid - 1);
float wx1 = std::clamp(fx - (float)x0, 0.0f, 1.0f);
const float wx0 = 1.0f - wx1;
const float w00 = wy0 * wx0;
const float w01 = wy0 * wx1;
const float w10 = wy1 * wx0;
const float w11 = wy1 * wx1;
const float * s00 = &src[((size_t)y0 * n_grid + x0) * n_embd];
const float * s01 = &src[((size_t)y0 * n_grid + x1) * n_embd];
const float * s10 = &src[((size_t)y1 * n_grid + x0) * n_embd];
const float * s11 = &src[((size_t)y1 * n_grid + x1) * n_embd];
float * d = &dst[((size_t)y * out_w + x) * n_embd];
for (int c = 0; c < n_embd; ++c) {
d[c] = w00 * s00[c] + w01 * s01[c] + w10 * s10[c] + w11 * s11[c];
}
}
}
set_input_f32("hunyuanvl_pos_embd", dst);
} break;
case PROJECTOR_TYPE_LLAMA4:
{
// set the 2D positions
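
Back to the HunyuanVL interpolation above: a standalone sketch of its coordinate mapping, using illustrative values for n_grid and out_w (sf = (target + 0.1) / n_grid, src = (dst + 0.5) / sf - 0.5, clamped to the grid):

#include <algorithm>
#include <cmath>
#include <cstdio>
int main() {
    const int n_grid = 32; // side of the pretrained pos-embd grid (assumed)
    const int out_w  = 24; // hypothetical target width in patches
    const float sx = (out_w + 0.1f) / (float) n_grid;
    for (int x = 0; x < out_w; x += 8) {
        const float fx = ((float) x + 0.5f) / sx - 0.5f;
        const int   x0 = std::clamp((int) std::floor(fx), 0, n_grid - 1);
        std::printf("dst x=%2d -> src fx=%6.3f (left neighbor x0=%d)\n", x, fx, x0);
    }
    return 0;
}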
@@ -4376,8 +4538,10 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_KIMIVL:
case PROJECTOR_TYPE_PADDLEOCR:
case PROJECTOR_TYPE_KIMIK25:
case PROJECTOR_TYPE_YASA2:
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_HUNYUANVL:
return ctx->model.mm_model_proj->ne[1];
case PROJECTOR_TYPE_COGVLM:
return ctx->model.mm_4h_to_h_w->ne[1];


@@ -5,7 +5,21 @@ ggml_cgraph * clip_graph_hunyuanocr::build() {
const int pw = n_patches_x;
const int ph = n_patches_y;
// Position embedding interpolation.
// HunyuanVL needs scale factors sf=(target+0.1)/n_grid, which the standard
// ggml_interpolate cannot express. To avoid adding a new ggml op, the
// resize is computed on CPU in clip_image_batch_encode and uploaded here
// as a graph input (named "hunyuanvl_pos_embd").
// HunyuanOCR uses the same square layout and the standard ratio-based
// interpolation provided by resize_position_embeddings().
ggml_tensor * pos_embd = nullptr;
if (proj_type == PROJECTOR_TYPE_HUNYUANVL && model.position_embeddings) {
pos_embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ph * pw);
ggml_set_name(pos_embd, "hunyuanvl_pos_embd");
ggml_set_input(pos_embd);
} else {
pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR);
}
ggml_tensor * inp = build_inp();
ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr);
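
The named-graph-input pattern used here, as a general ggml sketch (a fragment, not the exact clip.cpp plumbing; ctx0, gf, n_embd, ph, pw, and host_values are assumed from surrounding code):

// at graph-build time: declare a named input tensor
ggml_tensor * pos = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, ph * pw);
ggml_set_name(pos, "hunyuanvl_pos_embd");
ggml_set_input(pos);
// before compute: look the tensor up by name and upload the CPU-computed values
ggml_tensor * t = ggml_graph_get_tensor(gf, "hunyuanvl_pos_embd");
ggml_backend_tensor_set(t, host_values.data(), 0, ggml_nbytes(t));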


@@ -43,6 +43,14 @@ struct clip_graph_youtuvl : clip_graph {
ggml_cgraph * build() override;
};
struct clip_graph_yasa2 : clip_graph {
clip_graph_yasa2(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
ggml_tensor * layer_norm_channels(ggml_tensor * inp, ggml_tensor * w, ggml_tensor * b, float eps = 1e-6f);
ggml_tensor * convnext_grn(ggml_tensor * inp, ggml_tensor * w, ggml_tensor * b);
};
struct clip_graph_minicpmv : clip_graph {
clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;

tools/mtmd/models/yasa2.cpp (new file, 191 lines)

@@ -0,0 +1,191 @@
// ABOUTME: Yasa2 vision encoder graph builder for ConvNeXt-based architecture.
// ABOUTME: Implements patch embedding, ConvNeXt stages with GRN, and adaptive pooling.
#include "models.h"
static ggml_tensor * add_channel_bias(
ggml_context * ctx0,
ggml_tensor * x_whcb,
ggml_tensor * b_c) {
if (!b_c) {
return x_whcb;
}
ggml_tensor * b4 = ggml_reshape_4d(ctx0, b_c, 1, 1, b_c->ne[0], 1);
return ggml_add(ctx0, x_whcb, b4);
}
static ggml_tensor * mul_channel_weight(
ggml_context * ctx0,
ggml_tensor * x_whcb,
ggml_tensor * w_c) {
if (!w_c) {
return x_whcb;
}
ggml_tensor * w4 = ggml_reshape_4d(ctx0, w_c, 1, 1, w_c->ne[0], 1);
return ggml_mul(ctx0, x_whcb, w4);
}
ggml_tensor * clip_graph_yasa2::layer_norm_channels(ggml_tensor * inp, ggml_tensor * w, ggml_tensor * b, float eps) {
// Match HF ConvNextLayerNorm(channels_first):
// u = mean_c(x), s = mean_c((x-u)^2), x = (x-u)/sqrt(s+eps)
// cast back to input dtype before affine.
ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3); // [W,H,C,B] -> [C,H,W,B]
cur = ggml_cont(ctx0, cur);
ggml_tensor * u = ggml_mean(ctx0, cur); // [1,H,W,B]
ggml_tensor * xm = ggml_sub(ctx0, cur, u); // [C,H,W,B]
ggml_tensor * s = ggml_mul(ctx0, xm, xm); // [C,H,W,B]
s = ggml_mean(ctx0, s); // [1,H,W,B]
s = ggml_clamp(ctx0, s, eps, 1e30f); // avoid div-by-zero in no-alloc warmup
s = ggml_sqrt(ctx0, s); // [1,H,W,B]
ggml_tensor * xhat = ggml_div(ctx0, xm, s); // [C,H,W,B]
xhat = ggml_permute(ctx0, xhat, 2, 1, 0, 3); // [W,H,C,B]
xhat = ggml_cont(ctx0, xhat);
xhat = mul_channel_weight(ctx0, xhat, w);
xhat = add_channel_bias(ctx0, xhat, b);
return xhat;
}
ggml_tensor * clip_graph_yasa2::convnext_grn(ggml_tensor * inp, ggml_tensor * w, ggml_tensor * b) {
// Exact ConvNeXtV2 GRN:
// Gx = ||x||_2 over spatial dims (W,H), Nx = Gx / (mean_c(Gx) + eps)
// y = w * (x * Nx) + b + x
const int64_t wdim = inp->ne[0];
const int64_t hdim = inp->ne[1];
const int64_t cdim = inp->ne[2];
const int64_t bdim = inp->ne[3];
// Keep GRN math in fp32 for stability; fp16/bf16 accumulation can drift.
ggml_tensor * sq = ggml_mul(ctx0, inp, inp);
ggml_tensor * sq_flat = ggml_reshape_4d(ctx0, sq, wdim * hdim, cdim, 1, bdim); // [WH,C,1,B]
ggml_tensor * gx = ggml_sum_rows(ctx0, sq_flat); // [1,C,1,B]
gx = ggml_sqrt(ctx0, gx); // [1,C,1,B]
ggml_tensor * gx_ch_first = ggml_permute(ctx0, gx, 1, 0, 2, 3); // [C,1,1,B]
gx_ch_first = ggml_cont(ctx0, gx_ch_first);
ggml_tensor * gx_mean = ggml_mean(ctx0, gx_ch_first); // [1,1,1,B]
gx_mean = ggml_clamp(ctx0, gx_mean, 1e-6f, 1e30f); // approx +eps, warmup-safe
ggml_tensor * nx = ggml_div(ctx0, gx, gx_mean); // [1,C,1,B]
nx = ggml_permute(ctx0, nx, 0, 2, 1, 3); // [1,1,C,B]
nx = ggml_cont(ctx0, nx);
ggml_tensor * xnx = ggml_mul(ctx0, inp, nx);
xnx = mul_channel_weight(ctx0, xnx, w);
xnx = add_channel_bias(ctx0, xnx, b);
return ggml_add(ctx0, inp, xnx);
}
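
For reference, the same GRN math as plain scalar loops (a standalone sketch over a channel-major [W,H,C] fp32 buffer, adding eps to the channel mean as in the formula above):

#include <cmath>
#include <cstddef>
#include <vector>
// y = w * (x * Nx) + b + x, with Gx = per-channel spatial L2 norm
// and Nx = Gx / (mean_c(Gx) + eps)
static void grn_ref(std::vector<float> & x, const float * w, const float * b,
                    int W, int H, int C, float eps = 1e-6f) {
    std::vector<float> gx(C, 0.0f);
    for (int c = 0; c < C; ++c) {
        for (int i = 0; i < W * H; ++i) {
            const float v = x[(std::size_t) c * W * H + i];
            gx[c] += v * v;
        }
        gx[c] = std::sqrt(gx[c]);
    }
    float mean = 0.0f;
    for (int c = 0; c < C; ++c) mean += gx[c];
    mean = mean / C + eps;
    for (int c = 0; c < C; ++c) {
        const float nx = gx[c] / mean;
        for (int i = 0; i < W * H; ++i) {
            float & v = x[(std::size_t) c * W * H + i];
            v = w[c] * (v * nx) + b[c] + v;
        }
    }
}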
ggml_cgraph * clip_graph_yasa2::build() {
ggml_tensor * cur = build_inp_raw();
// Patch embedding Conv2d(kernel=4, stride=4)
cur = ggml_conv_2d(ctx0, model.yasa_patch_w, cur, patch_size, patch_size, 0, 0, 1, 1);
cur = add_channel_bias(ctx0, cur, model.yasa_patch_b);
ggml_set_name(cur, "yasa2_patch_conv_out");
cb(cur, "yasa2_patch_conv_out", -1);
cur = layer_norm_channels(cur, model.yasa_patch_ln_w, model.yasa_patch_ln_b, eps);
ggml_set_name(cur, "yasa2_patch_ln_out");
cb(cur, "yasa2_patch_ln_out", -1);
// ConvNeXt stages
for (size_t s = 0; s < model.yasa_stages.size(); ++s) {
const auto & stage = model.yasa_stages[s];
if (stage.down_conv_w) {
cur = layer_norm_channels(cur, stage.down_ln_w, stage.down_ln_b, eps);
cur = ggml_conv_2d(ctx0, stage.down_conv_w, cur, 2, 2, 0, 0, 1, 1);
cur = add_channel_bias(ctx0, cur, stage.down_conv_b);
ggml_format_name(cur, "yasa2_stage%zu_down_out", s);
}
for (size_t bi = 0; bi < stage.blocks.size(); ++bi) {
const auto & blk = stage.blocks[bi];
ggml_tensor * res = cur;
ggml_tensor * x = ggml_conv_2d_dw(ctx0, blk.dw_w, cur, 1, 1, 3, 3, 1, 1);
x = add_channel_bias(ctx0, x, blk.dw_b);
x = layer_norm_channels(x, blk.ln_w, blk.ln_b, eps);
// pwconv1/pwconv2 are HF Linear layers over channels; implement via matmul on tokens.
const int64_t w = x->ne[0];
const int64_t h = x->ne[1];
const int64_t b = x->ne[3];
ggml_tensor * tok = ggml_reshape_3d(ctx0, x, w * h, x->ne[2], b); // [T,C,B]
tok = ggml_permute(ctx0, tok, 1, 0, 2, 3); // [C,T,B]
tok = ggml_cont(ctx0, tok);
tok = ggml_mul_mat(ctx0, blk.pw1_w, tok); // [4C,T,B]
if (blk.pw1_b) {
ggml_tensor * b1 = ggml_reshape_3d(ctx0, blk.pw1_b, blk.pw1_b->ne[0], 1, 1); // [4C,1,1]
tok = ggml_add(ctx0, tok, b1);
}
x = ggml_permute(ctx0, tok, 1, 0, 2, 3); // [T,4C,B]
x = ggml_cont(ctx0, x);
x = ggml_reshape_4d(ctx0, x, w, h, tok->ne[0], b); // [W,H,4C,B]
x = ggml_gelu_erf(ctx0, x);
x = convnext_grn(x, blk.grn_w, blk.grn_b);
tok = ggml_reshape_3d(ctx0, x, w * h, x->ne[2], b); // [T,4C,B]
tok = ggml_permute(ctx0, tok, 1, 0, 2, 3); // [4C,T,B]
tok = ggml_cont(ctx0, tok);
tok = ggml_mul_mat(ctx0, blk.pw2_w, tok); // [C,T,B]
if (blk.pw2_b) {
ggml_tensor * b2 = ggml_reshape_3d(ctx0, blk.pw2_b, blk.pw2_b->ne[0], 1, 1); // [C,1,1]
tok = ggml_add(ctx0, tok, b2);
}
x = ggml_permute(ctx0, tok, 1, 0, 2, 3); // [T,C,B]
x = ggml_cont(ctx0, x);
x = ggml_reshape_4d(ctx0, x, w, h, tok->ne[0], b); // [W,H,C,B]
cur = ggml_add(ctx0, res, x);
ggml_format_name(cur, "yasa2_stage%zu_blk%zu_out", s, bi);
}
}
// HF path adds vision position embeddings BEFORE adaptive pooling.
const int64_t pre_w = cur->ne[0];
const int64_t pre_h = cur->ne[1];
ggml_tensor * tokens_pre = ggml_reshape_3d(ctx0, cur, pre_w * pre_h, cur->ne[2], cur->ne[3]); // [T,C,B]
tokens_pre = ggml_permute(ctx0, tokens_pre, 1, 0, 2, 3); // [C,T,B]
tokens_pre = ggml_cont(ctx0, tokens_pre);
if (model.yasa_vision_pos_embed && tokens_pre->ne[1] == model.yasa_vision_pos_embed->ne[1]) {
const int64_t n_ch = model.yasa_vision_pos_embed->ne[0];
const int64_t n_tokens = model.yasa_vision_pos_embed->ne[1];
ggml_tensor * pos = ggml_reshape_3d(ctx0, model.yasa_vision_pos_embed, (int) n_ch, (int) n_tokens, 1);
tokens_pre = ggml_add(ctx0, tokens_pre, pos);
}
cur = ggml_permute(ctx0, tokens_pre, 1, 0, 2, 3); // [T,C,B]
cur = ggml_cont(ctx0, cur);
cur = ggml_reshape_4d(ctx0, cur, pre_w, pre_h, cur->ne[1], cur->ne[2]); // [W,H,C,B]
// AdaptiveAvgPool2d target is 8x8 for real inputs, but warmup can use tiny images.
const int pooled_w = std::min(8, (int) cur->ne[0]);
const int pooled_h = std::min(8, (int) cur->ne[1]);
const int kw = std::max(1, (int) cur->ne[0] / pooled_w);
const int kh = std::max(1, (int) cur->ne[1] / pooled_h);
cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kw, kh, kw, kh, 0, 0);
// [W,H,C,B] -> [C,T,B]
ggml_tensor * tokens = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2], cur->ne[3]);
tokens = ggml_permute(ctx0, tokens, 1, 0, 2, 3);
tokens = ggml_cont(ctx0, tokens);
cb(tokens, "yasa2_tokens", -1);
GGML_ASSERT(model.mm_0_w && model.mm_2_w);
ggml_tensor * embeddings = build_ffn(
tokens,
model.mm_0_w, model.mm_0_b,
nullptr, nullptr,
model.mm_2_w, model.mm_2_b,
FFN_GELU_ERF,
-1);
cb(embeddings, "yasa2_emb", -1);
ggml_build_forward_expand(gf, embeddings);
return gf;
}
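
The pooling arithmetic for a typical input (a standalone sketch; the 16x16 feature-map side is a hypothetical backbone output):

#include <algorithm>
#include <cstdio>
int main() {
    const int feat   = 16;                          // hypothetical backbone output side
    const int pooled = std::min(8, feat);           // 8
    const int k      = std::max(1, feat / pooled);  // kernel/stride 2
    std::printf("%dx%d pooled with k=%d -> %dx%d = %d tokens\n",
                feat, feat, k, feat / k, feat / k, (feat / k) * (feat / k));
    return 0;
}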


@@ -35,15 +35,23 @@ struct mtmd_bitmap {
// position indexing for decoder model
enum mtmd_pos_type {
MTMD_POS_TYPE_NORMAL, // number of positions equals to number of tokens
MTMD_POS_TYPE_MROPE, // qwen-vl mrope style, each image takes max(t,h,w) position indexes
MTMD_POS_TYPE_HUNYUANVL, // HunyuanVL mrope + BOI/EOI/newline layout with XD-RoPE dim-3
};
struct mtmd_image_tokens {
uint32_t nx; // number of tokens in x direction
uint32_t ny; // number of tokens in y direction
mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL;
uint32_t n_tokens() const { return nx * ny; }
uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt (used when pos == MTMD_POS_TYPE_HUNYUANVL)
uint32_t n_tokens() const {
if (pos == MTMD_POS_TYPE_HUNYUANVL) {
// [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
return (nx + 1) * ny + 2;
}
return nx * ny;
}
clip_image_f32_batch batch_f32; // preprocessed image patches
std::string id; // optional user-defined ID, useful for KV cache tracking
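
A worked example of the HunyuanVL count (a standalone sketch; the 16x16 post-merge grid is hypothetical):

#include <cstdio>
int main() {
    const unsigned nx = 16, ny = 16;        // hypothetical post-merge grid
    const unsigned n  = (nx + 1) * ny + 2;  // 17*16 + 2 = 274 tokens per image chunk
    std::printf("HunyuanVL image chunk tokens: %u\n", n);
    return 0;
}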
@@ -52,6 +60,7 @@
nx,
ny,
pos,
image_idx,
batch_f32.clone(),
id
};
@@ -186,6 +195,7 @@ struct mtmd_context {
auto decoder_rope_type = llama_model_rope_type(text_model);
switch (decoder_rope_type) {
case LLAMA_ROPE_TYPE_NONE:
case LLAMA_ROPE_TYPE_NORM:
case LLAMA_ROPE_TYPE_NEOX:
{
@@ -316,6 +326,19 @@ struct mtmd_context {
img_end = "<|vision_end|>";
image_preproc = std::make_unique<mtmd_image_preprocessor_youtuvl>(ctx_v);
} break;
case PROJECTOR_TYPE_YASA2:
{
img_beg = "<image>";
img_end = "</image>";
// Currently only supports single-tile preprocessing: any input is downscaled
// to one image_size x image_size tile (64 output tokens via 8x8 adaptive avg
// pool).
// However, the model itself supports llava-uhd multi-tile tiling for high-res
// images. This will be implemented in a future PR (dispatch on has_pinpoints
// - see LDP/COGVLM branch above) that also emits image_grid_pinpoints in the
// conversion script.
image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
} break;
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA3NV:
{
@@ -453,6 +476,7 @@ struct mtmd_context {
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_HUNYUANVL:
{
// note: these use fullwidth ｜ (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
img_beg = "<hy_place▁holder▁no▁100>";
@@ -598,6 +622,7 @@ struct mtmd_tokenizer {
const llama_vocab * vocab;
mtmd_input_chunks cur;
uint32_t n_images_added = 0; // 0-based index assigned to the next image chunk
mtmd_tokenizer(mtmd_context * ctx,
const mtmd_input_text * text,
@@ -806,6 +831,14 @@ struct mtmd_tokenizer {
image_tokens->ny = 1;
}
image_tokens->pos = ctx->pos_type;
// HunyuanVL wraps the image grid with BOI/EOI and adds one newline per row,
// and uses XD-RoPE (dim-3 = image index). Override the position type so that
// n_tokens() and mtmd_image_tokens_get_decoder_pos pick the HunyuanVL layout.
if (ctx->proj_type_v() == PROJECTOR_TYPE_HUNYUANVL) {
image_tokens->pos = MTMD_POS_TYPE_HUNYUANVL;
image_tokens->image_idx = n_images_added;
GGML_ASSERT(n_tokens == (size_t)image_tokens->n_tokens());
}
image_tokens->batch_f32 = std::move(batch_f32);
image_tokens->id = bitmap->id; // optional
@@ -826,6 +859,9 @@ struct mtmd_tokenizer {
add_text(ctx->img_end, true); // add image end token
}
// advance image-chunk counter so the next image gets the next XD-RoPE dim-3 slot
n_images_added++;
} else {
// handle audio
@@ -1273,6 +1309,38 @@ mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * ima
pos.y = pos_0 + i;
pos.z = pos_0 + i;
} break;
case MTMD_POS_TYPE_HUNYUANVL:
{
// HunyuanVL layout: [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI]
// Total = 1 + ny*(nx+1) + 1. BOI and EOI use sequential positions in every dim;
// content and row-newline tokens use (row, col) with XD-RoPE dim-3 = image_idx.
const uint32_t nx = image_tokens->nx;
const uint32_t n_total = image_tokens->n_tokens();
if (i == 0) {
// BOI
pos.t = pos_0 + i;
pos.x = pos_0 + i;
pos.y = pos_0 + i;
pos.z = pos_0 + i;
} else if (i == n_total - 1) {
// EOI
pos.t = pos_0 + i;
pos.x = pos_0 + i;
pos.y = pos_0 + i;
pos.z = pos_0 + i;
} else {
// content token at (row, col), or the trailing newline of a row (col == nx)
// section 0 = sequential, section 1 = w(col), section 2 = h(row), section 3 = image_count.
// set_position_mrope_2d writes .y -> section 1 and .x -> section 2
const uint32_t offset = (uint32_t)i - 1;
const uint32_t row = offset / (nx + 1);
const uint32_t col = offset % (nx + 1);
pos.t = pos_0 + i;
pos.x = row;
pos.y = col;
pos.z = image_tokens->image_idx;
}
} break;
default:
GGML_ABORT("invalid position type");
}
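
The resulting layout, enumerated for a tiny hypothetical 2x2 grid with pos_0 = 0 (a standalone sketch of the same mapping):

#include <cstdio>
int main() {
    const unsigned nx = 2, ny = 2, image_idx = 0;  // hypothetical values
    const unsigned n_total = (nx + 1) * ny + 2;    // 8 tokens incl. BOI/EOI
    for (unsigned i = 0; i < n_total; ++i) {
        if (i == 0 || i == n_total - 1) {
            std::printf("i=%u BOI/EOI t=x=y=z=%u\n", i, i);
        } else {
            const unsigned off = i - 1;
            const unsigned row = off / (nx + 1);
            const unsigned col = off % (nx + 1);   // col == nx marks the row newline
            std::printf("i=%u t=%u x(row)=%u y(col)=%u z=%u%s\n",
                        i, i, row, col, image_idx, col == nx ? " <- newline" : "");
        }
    }
    return 0;
}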
@@ -1289,6 +1357,10 @@ llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
return std::max(image_tokens->nx, image_tokens->ny);
case MTMD_POS_TYPE_NORMAL:
return image_tokens->n_tokens();
case MTMD_POS_TYPE_HUNYUANVL:
// HunyuanVL: the sequential (dim-0) position advances by the full token count
// (includes BOI/EOI and row newline tokens), not by max(nx, ny)
return image_tokens->n_tokens();
default:
GGML_ABORT("invalid position type");
}


@@ -91,6 +91,7 @@ add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0"
add_test_vision "ggml-org/DeepSeek-OCR-GGUF:Q8_0" -p "Free OCR." --chat-template deepseek-ocr
add_test_vision "ggml-org/dots.ocr-GGUF:Q8_0" -p "OCR"
add_test_vision "ggml-org/HunyuanOCR-GGUF:Q8_0" -p "OCR"
add_test_vision "ggml-org/HunyuanVL-4B-GGUF:Q8_0"
add_test_vision "ggml-org/gemma-4-E2B-it-GGUF:Q8_0" --jinja
add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"

tools/server/server-chat.cpp (new file, 588 lines)

@@ -0,0 +1,588 @@
#include "server-chat.h"
#include "server-common.h"
#include <sstream>
json server_chat_convert_responses_to_chatcmpl(const json & response_body) {
if (!response_body.contains("input")) {
throw std::invalid_argument("'input' is required");
}
if (!json_value(response_body, "previous_response_id", std::string{}).empty()) {
throw std::invalid_argument("llama.cpp does not support 'previous_response_id'.");
}
const json input_value = response_body.at("input");
json chatcmpl_body = response_body;
chatcmpl_body.erase("input");
std::vector<json> chatcmpl_messages;
if (response_body.contains("instructions")) {
chatcmpl_messages.push_back({
{"role", "system"},
{"content", json_value(response_body, "instructions", std::string())},
});
chatcmpl_body.erase("instructions");
}
if (input_value.is_string()) {
// #responses_create-input-text_input
chatcmpl_messages.push_back({
{"role", "user"},
{"content", input_value},
});
} else if (input_value.is_array()) {
// #responses_create-input-input_item_list
static auto exists_and_is_array = [](const json & j, const char * key) -> bool {
return j.contains(key) && j.at(key).is_array();
};
static auto exists_and_is_string = [](const json & j, const char * key) -> bool {
return j.contains(key) && j.at(key).is_string();
};
for (json item : input_value) {
bool merge_prev = !chatcmpl_messages.empty() && chatcmpl_messages.back().value("role", "") == "assistant";
if (exists_and_is_string(item, "content")) {
// #responses_create-input-input_item_list-input_message-content-text_input
// Only "Input message" contains item["content"]::string
// After converting item["content"]::string to item["content"]::array,
// we can treat "Input message" as sum of "Item-Input message" and "Item-Output message"
item["content"] = json::array({
json {
{"text", item.at("content")},
{"type", "input_text"}
}
});
}
if (exists_and_is_array(item, "content") &&
exists_and_is_string(item, "role") &&
(item.at("role") == "user" ||
item.at("role") == "system" ||
item.at("role") == "developer")
) {
// #responses_create-input-input_item_list-item-input_message
std::vector<json> chatcmpl_content;
for (const json & input_item : item.at("content")) {
const std::string type = json_value(input_item, "type", std::string());
if (type == "input_text") {
if (!input_item.contains("text")) {
throw std::invalid_argument("'Input text' requires 'text'");
}
chatcmpl_content.push_back({
{"text", input_item.at("text")},
{"type", "text"},
});
} else if (type == "input_image") {
// While `detail` is marked as required,
// it has default value("auto") and can be omitted.
if (!input_item.contains("image_url")) {
throw std::invalid_argument("'image_url' is required");
}
chatcmpl_content.push_back({
{"image_url", json {
{"url", input_item.at("image_url")}
}},
{"type", "image_url"},
});
} else if (type == "input_file") {
throw std::invalid_argument("'input_file' is not supported by llamacpp at this moment");
} else {
throw std::invalid_argument("'type' must be one of 'input_text', 'input_image', or 'input_file'");
}
}
if (item.contains("type")) {
item.erase("type");
}
if (item.contains("status")) {
item.erase("status");
}
item["content"] = chatcmpl_content;
chatcmpl_messages.push_back(item);
} else if (exists_and_is_string(item, "role") &&
item.at("role") == "assistant" &&
exists_and_is_string(item, "type") &&
item.at("type") == "message"
) {
// #responses_create-input-input_item_list-item-output_message
auto chatcmpl_content = json::array();
// Handle both string content and array content
if (item.contains("content") && item.at("content").is_string()) {
// String content - convert to text content part
chatcmpl_content.push_back({
{"text", item.at("content")},
{"type", "text"},
});
} else if (exists_and_is_array(item, "content")) {
// Array content - process each item
for (const auto & output_text : item.at("content")) {
const std::string type = json_value(output_text, "type", std::string());
if (type == "output_text" || type == "input_text") {
// Accept both output_text and input_text (string content gets converted to input_text)
if (!exists_and_is_string(output_text, "text")) {
throw std::invalid_argument("'Output text' requires 'text'");
}
chatcmpl_content.push_back({
{"text", output_text.at("text")},
{"type", "text"},
});
} else if (type == "refusal") {
if (!exists_and_is_string(output_text, "refusal")) {
throw std::invalid_argument("'Refusal' requires 'refusal'");
}
chatcmpl_content.push_back({
{"refusal", output_text.at("refusal")},
{"type", "refusal"},
});
} else {
throw std::invalid_argument("'type' must be one of 'output_text' or 'refusal'");
}
}
}
if (merge_prev) {
auto & prev_msg = chatcmpl_messages.back();
if (!exists_and_is_array(prev_msg, "content")) {
prev_msg["content"] = json::array();
}
auto & prev_content = prev_msg["content"];
prev_content.insert(prev_content.end(), chatcmpl_content.begin(), chatcmpl_content.end());
} else {
item.erase("status");
item.erase("type");
item["content"] = chatcmpl_content;
chatcmpl_messages.push_back(item);
}
} else if (exists_and_is_string(item, "arguments") &&
exists_and_is_string(item, "call_id") &&
exists_and_is_string(item, "name") &&
exists_and_is_string(item, "type") &&
item.at("type") == "function_call"
) {
// #responses_create-input-input_item_list-item-function_tool_call
json tool_call = {
{"function", json {
{"arguments", item.at("arguments")},
{"name", item.at("name")},
}},
{"id", item.at("call_id")},
{"type", "function"},
};
if (merge_prev) {
auto & prev_msg = chatcmpl_messages.back();
if (!exists_and_is_array(prev_msg, "tool_calls")) {
prev_msg["tool_calls"] = json::array();
}
prev_msg["tool_calls"].push_back(tool_call);
} else {
chatcmpl_messages.push_back(json {
{"role", "assistant"},
{"tool_calls", json::array({tool_call})}
});
}
} else if (exists_and_is_string(item, "call_id") &&
(exists_and_is_string(item, "output") || exists_and_is_array(item, "output")) &&
exists_and_is_string(item, "type") &&
item.at("type") == "function_call_output"
) {
// #responses_create-input-input_item_list-item-function_tool_call_output
if (item.at("output").is_string()) {
chatcmpl_messages.push_back(json {
{"content", item.at("output")},
{"role", "tool"},
{"tool_call_id", item.at("call_id")},
});
} else {
json chatcmpl_outputs = item.at("output");
for (json & chatcmpl_output : chatcmpl_outputs) {
if (!chatcmpl_output.contains("type") || chatcmpl_output.at("type") != "input_text") {
throw std::invalid_argument("Output of tool call should be 'Input text'");
}
chatcmpl_output["type"] = "text";
}
chatcmpl_messages.push_back(json {
{"content", chatcmpl_outputs},
{"role", "tool"},
{"tool_call_id", item.at("call_id")},
});
}
} else if (exists_and_is_array(item, "summary") &&
exists_and_is_string(item, "type") &&
item.at("type") == "reasoning") {
// #responses_create-input-input_item_list-item-reasoning
if (!exists_and_is_array(item, "content")) {
throw std::invalid_argument("item['content'] is not an array");
}
if (item.at("content").empty()) {
throw std::invalid_argument("item['content'] is empty");
}
if (!exists_and_is_string(item.at("content")[0], "text")) {
throw std::invalid_argument("item['content']['text'] is not a string");
}
if (merge_prev) {
auto & prev_msg = chatcmpl_messages.back();
prev_msg["reasoning_content"] = item.at("content")[0].at("text");
} else {
chatcmpl_messages.push_back(json {
{"role", "assistant"},
{"content", json::array()},
{"reasoning_content", item.at("content")[0].at("text")},
});
}
} else {
throw std::invalid_argument("Cannot determine type of 'item'");
}
}
} else {
throw std::invalid_argument("'input' must be a string or array of objects");
}
chatcmpl_body["messages"] = chatcmpl_messages;
if (response_body.contains("tools")) {
if (!response_body.at("tools").is_array()) {
throw std::invalid_argument("'tools' must be an array of objects");
}
std::vector<json> chatcmpl_tools;
for (json resp_tool : response_body.at("tools")) {
json chatcmpl_tool;
if (json_value(resp_tool, "type", std::string()) != "function") {
throw std::invalid_argument("'type' of tool must be 'function'");
}
resp_tool.erase("type");
chatcmpl_tool["type"] = "function";
if (!resp_tool.contains("strict")) {
resp_tool["strict"] = true;
}
chatcmpl_tool["function"] = resp_tool;
chatcmpl_tools.push_back(chatcmpl_tool);
}
chatcmpl_body.erase("tools");
chatcmpl_body["tools"] = chatcmpl_tools;
}
if (response_body.contains("max_output_tokens")) {
chatcmpl_body.erase("max_output_tokens");
chatcmpl_body["max_tokens"] = response_body["max_output_tokens"];
}
return chatcmpl_body;
}
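
A minimal usage sketch (a fragment; the request body is hypothetical and json is the nlohmann alias used in this file):

json req = json::parse(R"({
    "instructions": "You are terse.",
    "input": "Say hi",
    "max_output_tokens": 64
})");
json body = server_chat_convert_responses_to_chatcmpl(req);
// body["messages"] == [ {"role":"system","content":"You are terse."},
//                       {"role":"user","content":"Say hi"} ]
// body["max_tokens"] == 64; "input", "instructions", "max_output_tokens" are gone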
json server_chat_convert_anthropic_to_oai(const json & body) {
json oai_body;
// Convert system prompt
json oai_messages = json::array();
auto system_param = json_value(body, "system", json());
if (!system_param.is_null()) {
std::string system_content;
if (system_param.is_string()) {
system_content = system_param.get<std::string>();
} else if (system_param.is_array()) {
for (const auto & block : system_param) {
if (json_value(block, "type", std::string()) == "text") {
system_content += json_value(block, "text", std::string());
}
}
}
oai_messages.push_back({
{"role", "system"},
{"content", system_content}
});
}
// Convert messages
if (!body.contains("messages")) {
throw std::runtime_error("'messages' is required");
}
const json & messages = body.at("messages");
if (messages.is_array()) {
for (const auto & msg : messages) {
std::string role = json_value(msg, "role", std::string());
if (!msg.contains("content")) {
if (role == "assistant") {
continue;
}
oai_messages.push_back(msg);
continue;
}
const json & content = msg.at("content");
if (content.is_string()) {
oai_messages.push_back(msg);
continue;
}
if (!content.is_array()) {
oai_messages.push_back(msg);
continue;
}
json tool_calls = json::array();
json converted_content = json::array();
json tool_results = json::array();
std::string reasoning_content;
bool has_tool_calls = false;
for (const auto & block : content) {
std::string type = json_value(block, "type", std::string());
if (type == "text") {
converted_content.push_back(block);
} else if (type == "thinking") {
reasoning_content += json_value(block, "thinking", std::string());
} else if (type == "image") {
json source = json_value(block, "source", json::object());
std::string source_type = json_value(source, "type", std::string());
if (source_type == "base64") {
std::string media_type = json_value(source, "media_type", std::string("image/jpeg"));
std::string data = json_value(source, "data", std::string());
std::ostringstream ss;
ss << "data:" << media_type << ";base64," << data;
converted_content.push_back({
{"type", "image_url"},
{"image_url", {
{"url", ss.str()}
}}
});
} else if (source_type == "url") {
std::string url = json_value(source, "url", std::string());
converted_content.push_back({
{"type", "image_url"},
{"image_url", {
{"url", url}
}}
});
}
} else if (type == "tool_use") {
tool_calls.push_back({
{"id", json_value(block, "id", std::string())},
{"type", "function"},
{"function", {
{"name", json_value(block, "name", std::string())},
{"arguments", json_value(block, "input", json::object()).dump()}
}}
});
has_tool_calls = true;
} else if (type == "tool_result") {
std::string tool_use_id = json_value(block, "tool_use_id", std::string());
auto result_content = json_value(block, "content", json());
std::string result_text;
if (result_content.is_string()) {
result_text = result_content.get<std::string>();
} else if (result_content.is_array()) {
for (const auto & c : result_content) {
if (json_value(c, "type", std::string()) == "text") {
result_text += json_value(c, "text", std::string());
}
}
}
tool_results.push_back({
{"role", "tool"},
{"tool_call_id", tool_use_id},
{"content", result_text}
});
}
}
if (!converted_content.empty() || has_tool_calls || !reasoning_content.empty()) {
json new_msg = {{"role", role}};
if (!converted_content.empty()) {
new_msg["content"] = converted_content;
} else if (has_tool_calls || !reasoning_content.empty()) {
new_msg["content"] = "";
}
if (!tool_calls.empty()) {
new_msg["tool_calls"] = tool_calls;
}
if (!reasoning_content.empty()) {
new_msg["reasoning_content"] = reasoning_content;
}
oai_messages.push_back(new_msg);
}
for (const auto & tool_msg : tool_results) {
oai_messages.push_back(tool_msg);
}
}
}
oai_body["messages"] = oai_messages;
// Convert tools
if (body.contains("tools")) {
const json & tools = body.at("tools");
if (tools.is_array()) {
json oai_tools = json::array();
for (const auto & tool : tools) {
oai_tools.push_back({
{"type", "function"},
{"function", {
{"name", json_value(tool, "name", std::string())},
{"description", json_value(tool, "description", std::string())},
{"parameters", tool.contains("input_schema") ? tool.at("input_schema") : json::object()}
}}
});
}
oai_body["tools"] = oai_tools;
}
}
// Convert tool_choice
if (body.contains("tool_choice")) {
const json & tc = body.at("tool_choice");
if (tc.is_object()) {
std::string type = json_value(tc, "type", std::string());
if (type == "auto") {
oai_body["tool_choice"] = "auto";
} else if (type == "any" || type == "tool") {
oai_body["tool_choice"] = "required";
}
}
}
// Convert stop_sequences to stop
if (body.contains("stop_sequences")) {
oai_body["stop"] = body.at("stop_sequences");
}
// Handle max_tokens (required in Anthropic, but we're permissive)
if (body.contains("max_tokens")) {
oai_body["max_tokens"] = body.at("max_tokens");
} else {
oai_body["max_tokens"] = 4096;
}
// Pass through common params
for (const auto & key : {"temperature", "top_p", "top_k", "stream"}) {
if (body.contains(key)) {
oai_body[key] = body.at(key);
}
}
// Handle Anthropic-specific thinking param
if (body.contains("thinking")) {
json thinking = json_value(body, "thinking", json::object());
std::string thinking_type = json_value(thinking, "type", std::string());
if (thinking_type == "enabled") {
int budget_tokens = json_value(thinking, "budget_tokens", 10000);
oai_body["thinking_budget_tokens"] = budget_tokens;
}
}
// Handle Anthropic-specific metadata param
if (body.contains("metadata")) {
json metadata = json_value(body, "metadata", json::object());
std::string user_id = json_value(metadata, "user_id", std::string());
if (!user_id.empty()) {
oai_body["__metadata_user_id"] = user_id;
}
}
return oai_body;
}
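
And a corresponding sketch for the Anthropic path (a fragment; the request body and image URL are hypothetical):

json req = json::parse(R"({
    "system": "Be brief.",
    "messages": [ { "role": "user", "content": [
        { "type": "text",  "text": "What is in this image?" },
        { "type": "image", "source": { "type": "url", "url": "https://example.com/cat.png" } }
    ] } ],
    "max_tokens": 128
})");
json body = server_chat_convert_anthropic_to_oai(req);
// body["messages"][0] == {"role":"system","content":"Be brief."}
// body["messages"][1]["content"] holds the text part plus an
// {"type":"image_url","image_url":{"url":...}} part; max_tokens passes through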
json server_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
json delta = json::object();
if (!diff.reasoning_content_delta.empty()) {
delta["reasoning_content"] = diff.reasoning_content_delta;
}
if (!diff.content_delta.empty()) {
delta["content"] = diff.content_delta;
}
if (diff.tool_call_index != std::string::npos) {
json tool_call;
tool_call["index"] = diff.tool_call_index;
if (!diff.tool_call_delta.id.empty()) {
tool_call["id"] = diff.tool_call_delta.id;
tool_call["type"] = "function";
}
if (!diff.tool_call_delta.name.empty() || !diff.tool_call_delta.arguments.empty()) {
json function = json::object();
if (!diff.tool_call_delta.name.empty()) {
function["name"] = diff.tool_call_delta.name;
}
if (!diff.tool_call_delta.arguments.empty()) {
function["arguments"] = diff.tool_call_delta.arguments;
}
tool_call["function"] = function;
}
delta["tool_calls"] = json::array({ tool_call });
}
return delta;
}
json convert_transcriptions_to_chatcmpl(
const json & inp_body,
const std::map<std::string, raw_buffer> & in_files,
std::vector<raw_buffer> & out_files) {
// TODO @ngxson : this function may need to be improved in the future
// handle input files
out_files.clear();
auto it = in_files.find("file");
if (it != in_files.end()) {
out_files.push_back(it->second);
} else {
throw std::invalid_argument("No input file found for transcription");
}
// handle input data
std::string prompt = json_value(inp_body, "prompt", std::string());
std::string language = json_value(inp_body, "language", std::string());
std::string response_format = json_value(inp_body, "response_format", std::string("json"));
if (response_format != "json") {
throw std::invalid_argument("Only 'json' response_format is supported for transcription");
}
if (prompt.empty()) {
prompt = "Transcribe audio to text";
}
if (!language.empty()) {
prompt += string_format(" (language: %s)", language.c_str());
}
prompt += get_media_marker();
json chatcmpl_body = inp_body; // copy all fields
chatcmpl_body["messages"] = json::array({
{
{"role", "user"},
{"content", prompt},
},
});
// because input from form-data, everything is string, we need to correct the types here
std::string stream = json_value(inp_body, "stream", std::string("false"));
chatcmpl_body["stream"] = stream == "true";
if (inp_body.contains("max_tokens")) {
std::string inp = inp_body["max_tokens"].get<std::string>();
chatcmpl_body["max_tokens"] = std::stoul(inp);
}
if (inp_body.contains("temperature")) {
std::string inp = inp_body["temperature"].get<std::string>();
chatcmpl_body["temperature"] = std::stof(inp);
}
return chatcmpl_body;
}
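
A usage sketch for the transcription path (a fragment; note that multipart form-data delivers every field as a string, hence the conversions above):

std::map<std::string, raw_buffer> files = { { "file", raw_buffer{} } }; // audio bytes go here
std::vector<raw_buffer> out_files;
json inp = { { "language", "en" }, { "temperature", "0.2" }, { "stream", "true" } };
json body = convert_transcriptions_to_chatcmpl(inp, files, out_files);
// body["stream"] == true, body["temperature"] == 0.2f, and the single user message is
// "Transcribe audio to text (language: en)" followed by the media marker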

tools/server/server-chat.h (new file, 24 lines)

@@ -0,0 +1,24 @@
// Chat conversion functions for server (Responses API, Anthropic API, OAI streaming diffs)
#pragma once
#include "chat.h"
#include "server-common.h"
#include <nlohmann/json_fwd.hpp>
using json = nlohmann::ordered_json;
// Convert OpenAI Responses API format to OpenAI Chat Completions API format
json server_chat_convert_responses_to_chatcmpl(const json & body);
// Convert Anthropic Messages API format to OpenAI Chat Completions API format
json server_chat_convert_anthropic_to_oai(const json & body);
// convert OpenAI transcriptions API format to OpenAI Chat Completions API format
json convert_transcriptions_to_chatcmpl(
const json & body,
const std::map<std::string, raw_buffer> & in_files,
std::vector<raw_buffer> & out_files);
json server_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff);


@@ -1027,6 +1027,8 @@ json oaicompat_chat_params_parse(
}
}
auto caps = common_chat_templates_get_caps(opt.tmpls.get());
common_chat_templates_inputs inputs;
inputs.messages = common_chat_msgs_parse_oaicompat(messages);
inputs.tools = common_chat_tools_parse_oaicompat(tools);
@@ -1034,7 +1036,7 @@
inputs.json_schema = json_schema.is_null() ? "" : json_schema.dump();
inputs.grammar = grammar;
inputs.use_jinja = opt.use_jinja;
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", caps["supports_parallel_tool_calls"]);
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
inputs.reasoning_format = opt.reasoning_format;
if (body.contains("reasoning_format")) {
@@ -1164,573 +1166,6 @@ json oaicompat_chat_params_parse(
return llama_params;
}
json convert_responses_to_chatcmpl(const json & response_body) {
if (!response_body.contains("input")) {
throw std::invalid_argument("'input' is required");
}
if (!json_value(response_body, "previous_response_id", std::string{}).empty()) {
throw std::invalid_argument("llama.cpp does not support 'previous_response_id'.");
}
const json input_value = response_body.at("input");
json chatcmpl_body = response_body;
chatcmpl_body.erase("input");
std::vector<json> chatcmpl_messages;
if (response_body.contains("instructions")) {
chatcmpl_messages.push_back({
{"role", "system"},
{"content", json_value(response_body, "instructions", std::string())},
});
chatcmpl_body.erase("instructions");
}
if (input_value.is_string()) {
// #responses_create-input-text_input
chatcmpl_messages.push_back({
{"role", "user"},
{"content", input_value},
});
} else if (input_value.is_array()) {
// #responses_create-input-input_item_list
static auto exists_and_is_array = [](const json & j, const char * key) -> bool {
return j.contains(key) && j.at(key).is_array();
};
static auto exists_and_is_string = [](const json & j, const char * key) -> bool {
return j.contains(key) && j.at(key).is_string();
};
for (json item : input_value) {
bool merge_prev = !chatcmpl_messages.empty() && chatcmpl_messages.back().value("role", "") == "assistant";
if (exists_and_is_string(item, "content")) {
// #responses_create-input-input_item_list-input_message-content-text_input
// Only "Input message" contains item["content"]::string
// After converting item["content"]::string to item["content"]::array,
// we can treat "Input message" as sum of "Item-Input message" and "Item-Output message"
item["content"] = json::array({
json {
{"text", item.at("content")},
{"type", "input_text"}
}
});
}
if (exists_and_is_array(item, "content") &&
exists_and_is_string(item, "role") &&
(item.at("role") == "user" ||
item.at("role") == "system" ||
item.at("role") == "developer")
) {
// #responses_create-input-input_item_list-item-input_message
std::vector<json> chatcmpl_content;
for (const json & input_item : item.at("content")) {
const std::string type = json_value(input_item, "type", std::string());
if (type == "input_text") {
if (!input_item.contains("text")) {
throw std::invalid_argument("'Input text' requires 'text'");
}
chatcmpl_content.push_back({
{"text", input_item.at("text")},
{"type", "text"},
});
} else if (type == "input_image") {
// While `detail` is marked as required,
// it has default value("auto") and can be omitted.
if (!input_item.contains("image_url")) {
throw std::invalid_argument("'image_url' is required");
}
chatcmpl_content.push_back({
{"image_url", json {
{"url", input_item.at("image_url")}
}},
{"type", "image_url"},
});
} else if (type == "input_file") {
throw std::invalid_argument("'input_file' is not supported by llamacpp at this moment");
// if (input_item.contains("file_url")) {
// // chat completion API does not support file_url
// throw std::invalid_argument("'file_url' is not supported");
// }
// if (!input_item.contains("file_data") || !input_item.contains("filename")) {
// throw std::invalid_argument("Both 'file_data' and 'filename' are required");
// }
// chatcmpl_content.push_back({
// {"file", json {
// {"file_data", input_item.at("file_data")},
// {"filename", input_item.at("filename")},
// }},
// {"type", "file"},
// });
} else {
throw std::invalid_argument("'type' must be one of 'input_text', 'input_image', or 'input_file'");
}
}
if (item.contains("type")) {
item.erase("type");
}
if (item.contains("status")) {
item.erase("status");
}
item["content"] = chatcmpl_content;
chatcmpl_messages.push_back(item);
} else if (exists_and_is_array(item, "content") &&
exists_and_is_string(item, "role") &&
item.at("role") == "assistant" &&
// exists_and_is_string(item, "status") &&
// (item.at("status") == "in_progress" ||
// item.at("status") == "completed" ||
// item.at("status") == "incomplete") &&
// item["status"] not sent by codex-cli
exists_and_is_string(item, "type") &&
item.at("type") == "message"
) {
// #responses_create-input-input_item_list-item-output_message
auto chatcmpl_content = json::array();
for (const auto & output_text : item.at("content")) {
const std::string type = json_value(output_text, "type", std::string());
if (type == "output_text") {
if (!exists_and_is_string(output_text, "text")) {
throw std::invalid_argument("'Output text' requires 'text'");
// Ignore annotations and logprobs for now
chatcmpl_content.push_back({
{"text", output_text.at("text")},
{"type", "text"},
});
}
} else if (type == "refusal") {
if (!exists_and_is_string(output_text, "refusal")) {
throw std::invalid_argument("'Refusal' requires 'refusal'");
// Ignore annotations and logprobs for now
chatcmpl_content.push_back({
{"refusal", output_text.at("refusal")},
{"type", "refusal"},
});
}
} else {
throw std::invalid_argument("'type' must be one of 'output_text' or 'refusal'");
}
}
if (merge_prev) {
auto & prev_msg = chatcmpl_messages.back();
if (!exists_and_is_array(prev_msg, "content")) {
prev_msg["content"] = json::array();
}
auto & prev_content = prev_msg["content"];
prev_content.insert(prev_content.end(), chatcmpl_content.begin(), chatcmpl_content.end());
} else {
item.erase("status");
item.erase("type");
item["content"] = chatcmpl_content;
chatcmpl_messages.push_back(item);
}
} else if (exists_and_is_string(item, "arguments") &&
exists_and_is_string(item, "call_id") &&
exists_and_is_string(item, "name") &&
exists_and_is_string(item, "type") &&
item.at("type") == "function_call"
) {
// #responses_create-input-input_item_list-item-function_tool_call
json tool_call = {
{"function", json {
{"arguments", item.at("arguments")},
{"name", item.at("name")},
}},
{"id", item.at("call_id")},
{"type", "function"},
};
if (merge_prev) {
auto & prev_msg = chatcmpl_messages.back();
if (!exists_and_is_array(prev_msg, "tool_calls")) {
prev_msg["tool_calls"] = json::array();
}
prev_msg["tool_calls"].push_back(tool_call);
} else {
chatcmpl_messages.push_back(json {
{"role", "assistant"},
{"tool_calls", json::array({tool_call})}
});
}
} else if (exists_and_is_string(item, "call_id") &&
(exists_and_is_string(item, "output") || exists_and_is_array(item, "output")) &&
exists_and_is_string(item, "type") &&
item.at("type") == "function_call_output"
) {
// #responses_create-input-input_item_list-item-function_tool_call_output
if (item.at("output").is_string()) {
chatcmpl_messages.push_back(json {
{"content", item.at("output")},
{"role", "tool"},
{"tool_call_id", item.at("call_id")},
});
} else {
json chatcmpl_outputs = item.at("output");
for (json & chatcmpl_output : chatcmpl_outputs) {
if (!chatcmpl_output.contains("type") || chatcmpl_output.at("type") != "input_text") {
throw std::invalid_argument("Output of tool call should be 'Input text'");
}
chatcmpl_output["type"] = "text";
}
chatcmpl_messages.push_back(json {
{"content", chatcmpl_outputs},
{"role", "tool"},
{"tool_call_id", item.at("call_id")},
});
}
} else if (// exists_and_is_string(item, "id") &&
// item["id"] not sent by codex-cli
exists_and_is_array(item, "summary") &&
exists_and_is_string(item, "type") &&
item.at("type") == "reasoning") {
// #responses_create-input-input_item_list-item-reasoning
if (!exists_and_is_array(item, "content")) {
throw std::invalid_argument("item['content'] is not an array");
}
if (item.at("content").empty()) {
throw std::invalid_argument("item['content'] is empty");
}
if (!exists_and_is_string(item.at("content")[0], "text")) {
throw std::invalid_argument("item['content']['text'] is not a string");
}
if (merge_prev) {
auto & prev_msg = chatcmpl_messages.back();
prev_msg["reasoning_content"] = item.at("content")[0].at("text");
} else {
chatcmpl_messages.push_back(json {
{"role", "assistant"},
{"content", json::array()},
{"reasoning_content", item.at("content")[0].at("text")},
});
}
} else {
throw std::invalid_argument("Cannot determine type of 'item'");
}
}
} else {
throw std::invalid_argument("'input' must be a string or array of objects");
}
chatcmpl_body["messages"] = chatcmpl_messages;
if (response_body.contains("tools")) {
if (!response_body.at("tools").is_array()) {
throw std::invalid_argument("'tools' must be an array of objects");
}
std::vector<json> chatcmpl_tools;
for (json resp_tool : response_body.at("tools")) {
json chatcmpl_tool;
if (json_value(resp_tool, "type", std::string()) != "function") {
throw std::invalid_argument("'type' of tool must be 'function'");
}
resp_tool.erase("type");
chatcmpl_tool["type"] = "function";
if (!resp_tool.contains("strict")) {
resp_tool["strict"] = true;
}
chatcmpl_tool["function"] = resp_tool;
chatcmpl_tools.push_back(chatcmpl_tool);
}
chatcmpl_body.erase("tools");
chatcmpl_body["tools"] = chatcmpl_tools;
}
if (response_body.contains("max_output_tokens")) {
chatcmpl_body.erase("max_output_tokens");
chatcmpl_body["max_tokens"] = response_body["max_output_tokens"];
}
return chatcmpl_body;
}
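// A minimal sketch (illustrative only, not part of this change) of the mapping the
// function above performs for a "function_call" input item; the literal values
// ("call_1", "get_weather", ...) are assumptions for illustration.
//
//   Responses API input item:
//     {"type": "function_call", "call_id": "call_1",
//      "name": "get_weather", "arguments": "{\"city\":\"Paris\"}"}
//
//   resulting Chat Completions message:
//     {"role": "assistant",
//      "tool_calls": [{"id": "call_1", "type": "function",
//                      "function": {"name": "get_weather",
//                                   "arguments": "{\"city\":\"Paris\"}"}}]}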
json convert_transcriptions_to_chatcmpl(
const json & inp_body,
const std::map<std::string, raw_buffer> & in_files,
std::vector<raw_buffer> & out_files) {
// TODO @ngxson : this function may need to be improved in the future
// handle input files
out_files.clear();
auto it = in_files.find("file");
if (it != in_files.end()) {
out_files.push_back(it->second);
} else {
throw std::invalid_argument("No input file found for transcription");
}
// handle input data
std::string prompt = json_value(inp_body, "prompt", std::string());
std::string language = json_value(inp_body, "language", std::string());
std::string response_format = json_value(inp_body, "response_format", std::string("json"));
if (response_format != "json") {
throw std::invalid_argument("Only 'json' response_format is supported for transcription");
}
if (prompt.empty()) {
prompt = "Transcribe audio to text";
}
if (!language.empty()) {
prompt += string_format(" (language: %s)", language.c_str());
}
prompt += get_media_marker();
json chatcmpl_body = inp_body; // copy all fields
chatcmpl_body["messages"] = json::array({
{
{"role", "user"},
{"content", prompt},
},
});
// because the input comes from form-data, everything is a string, so we need to correct the types here
std::string stream = json_value(inp_body, "stream", std::string("false"));
chatcmpl_body["stream"] = stream == "true";
if (inp_body.contains("max_tokens")) {
std::string inp = inp_body["max_tokens"].get<std::string>();
chatcmpl_body["max_tokens"] = std::stoul(inp);
}
if (inp_body.contains("temperature")) {
std::string inp = inp_body["temperature"].get<std::string>();
chatcmpl_body["temperature"] = std::stof(inp);
}
return chatcmpl_body;
}
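// A minimal sketch (illustrative only, not part of this change) of the form-data
// type coercion done above; the field values are assumptions for illustration.
//
//   multipart form fields (always strings):
//     stream="true", temperature="0.8", max_tokens="128"
//
//   resulting chatcmpl body fields (correctly typed):
//     {"stream": true, "temperature": 0.8, "max_tokens": 128}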
json convert_anthropic_to_oai(const json & body) {
json oai_body;
// Convert system prompt
json oai_messages = json::array();
auto system_param = json_value(body, "system", json());
if (!system_param.is_null()) {
std::string system_content;
if (system_param.is_string()) {
system_content = system_param.get<std::string>();
} else if (system_param.is_array()) {
for (const auto & block : system_param) {
if (json_value(block, "type", std::string()) == "text") {
system_content += json_value(block, "text", std::string());
}
}
}
oai_messages.push_back({
{"role", "system"},
{"content", system_content}
});
}
// Convert messages
if (!body.contains("messages")) {
throw std::runtime_error("'messages' is required");
}
const json & messages = body.at("messages");
if (messages.is_array()) {
for (const auto & msg : messages) {
std::string role = json_value(msg, "role", std::string());
if (!msg.contains("content")) {
if (role == "assistant") {
continue;
}
oai_messages.push_back(msg);
continue;
}
const json & content = msg.at("content");
if (content.is_string()) {
oai_messages.push_back(msg);
continue;
}
if (!content.is_array()) {
oai_messages.push_back(msg);
continue;
}
json tool_calls = json::array();
json converted_content = json::array();
json tool_results = json::array();
std::string reasoning_content;
bool has_tool_calls = false;
for (const auto & block : content) {
std::string type = json_value(block, "type", std::string());
if (type == "text") {
converted_content.push_back(block);
} else if (type == "thinking") {
reasoning_content += json_value(block, "thinking", std::string());
} else if (type == "image") {
json source = json_value(block, "source", json::object());
std::string source_type = json_value(source, "type", std::string());
if (source_type == "base64") {
std::string media_type = json_value(source, "media_type", std::string("image/jpeg"));
std::string data = json_value(source, "data", std::string());
std::ostringstream ss;
ss << "data:" << media_type << ";base64," << data;
converted_content.push_back({
{"type", "image_url"},
{"image_url", {
{"url", ss.str()}
}}
});
} else if (source_type == "url") {
std::string url = json_value(source, "url", std::string());
converted_content.push_back({
{"type", "image_url"},
{"image_url", {
{"url", url}
}}
});
}
} else if (type == "tool_use") {
tool_calls.push_back({
{"id", json_value(block, "id", std::string())},
{"type", "function"},
{"function", {
{"name", json_value(block, "name", std::string())},
{"arguments", json_value(block, "input", json::object()).dump()}
}}
});
has_tool_calls = true;
} else if (type == "tool_result") {
std::string tool_use_id = json_value(block, "tool_use_id", std::string());
auto result_content = json_value(block, "content", json());
std::string result_text;
if (result_content.is_string()) {
result_text = result_content.get<std::string>();
} else if (result_content.is_array()) {
for (const auto & c : result_content) {
if (json_value(c, "type", std::string()) == "text") {
result_text += json_value(c, "text", std::string());
}
}
}
tool_results.push_back({
{"role", "tool"},
{"tool_call_id", tool_use_id},
{"content", result_text}
});
}
}
if (!converted_content.empty() || has_tool_calls || !reasoning_content.empty()) {
json new_msg = {{"role", role}};
if (!converted_content.empty()) {
new_msg["content"] = converted_content;
} else if (has_tool_calls || !reasoning_content.empty()) {
new_msg["content"] = "";
}
if (!tool_calls.empty()) {
new_msg["tool_calls"] = tool_calls;
}
if (!reasoning_content.empty()) {
new_msg["reasoning_content"] = reasoning_content;
}
oai_messages.push_back(new_msg);
}
for (const auto & tool_msg : tool_results) {
oai_messages.push_back(tool_msg);
}
}
}
oai_body["messages"] = oai_messages;
// Convert tools
if (body.contains("tools")) {
const json & tools = body.at("tools");
if (tools.is_array()) {
json oai_tools = json::array();
for (const auto & tool : tools) {
oai_tools.push_back({
{"type", "function"},
{"function", {
{"name", json_value(tool, "name", std::string())},
{"description", json_value(tool, "description", std::string())},
{"parameters", tool.contains("input_schema") ? tool.at("input_schema") : json::object()}
}}
});
}
oai_body["tools"] = oai_tools;
}
}
// Convert tool_choice
if (body.contains("tool_choice")) {
const json & tc = body.at("tool_choice");
if (tc.is_object()) {
std::string type = json_value(tc, "type", std::string());
if (type == "auto") {
oai_body["tool_choice"] = "auto";
} else if (type == "any" || type == "tool") {
oai_body["tool_choice"] = "required";
}
}
}
// Convert stop_sequences to stop
if (body.contains("stop_sequences")) {
oai_body["stop"] = body.at("stop_sequences");
}
// Handle max_tokens (required in Anthropic, but we're permissive)
if (body.contains("max_tokens")) {
oai_body["max_tokens"] = body.at("max_tokens");
} else {
oai_body["max_tokens"] = 4096;
}
// Pass through common params
for (const auto & key : {"temperature", "top_p", "top_k", "stream"}) {
if (body.contains(key)) {
oai_body[key] = body.at(key);
}
}
// Handle Anthropic-specific thinking param
if (body.contains("thinking")) {
json thinking = json_value(body, "thinking", json::object());
std::string thinking_type = json_value(thinking, "type", std::string());
if (thinking_type == "enabled") {
int budget_tokens = json_value(thinking, "budget_tokens", 10000);
oai_body["thinking_budget_tokens"] = budget_tokens;
}
}
// Handle Anthropic-specific metadata param
if (body.contains("metadata")) {
json metadata = json_value(body, "metadata", json::object());
std::string user_id = json_value(metadata, "user_id", std::string());
if (!user_id.empty()) {
oai_body["__metadata_user_id"] = user_id;
}
}
return oai_body;
}
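// A minimal sketch (illustrative only, not part of this change) of two of the
// block conversions above; the literal values are assumptions for illustration.
//
//   Anthropic "tool_use" content block:
//     {"type": "tool_use", "id": "toolu_1", "name": "get_weather",
//      "input": {"city": "Paris"}}
//   becomes an OAI tool call ("input" is serialized to a JSON string):
//     {"id": "toolu_1", "type": "function",
//      "function": {"name": "get_weather", "arguments": "{\"city\":\"Paris\"}"}}
//
//   Anthropic base64 image source:
//     {"type": "image", "source": {"type": "base64",
//      "media_type": "image/png", "data": "<BASE64>"}}
//   becomes an OAI image_url content part:
//     {"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}}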
json format_embeddings_response_oaicompat(
const json & request,
const std::string & model_name,

View file

@ -307,18 +307,6 @@ json oaicompat_chat_params_parse(
const server_chat_params & opt,
std::vector<raw_buffer> & out_files);
// convert OpenAI Responses API format to OpenAI Chat Completions API format
json convert_responses_to_chatcmpl(const json & body);
// convert OpenAI transcriptions API format to OpenAI Chat Completions API format
json convert_transcriptions_to_chatcmpl(
const json & body,
const std::map<std::string, raw_buffer> & in_files,
std::vector<raw_buffer> & out_files);
// convert Anthropic Messages API format to OpenAI Chat Completions API format
json convert_anthropic_to_oai(const json & body);
// TODO: move it to server-task.cpp
json format_embeddings_response_oaicompat(
const json & request,

View file

@ -1,5 +1,6 @@
#include "server-context.h"
#include "server-chat.h"
#include "server-common.h"
#include "server-http.h"
#include "server-task.h"
@ -1044,8 +1045,8 @@ private:
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
/* enable_thinking */ enable_thinking,
/* reasoning_budget */ params_base.reasoning_budget,
/* reasoning_budget_msg */ params_base.reasoning_budget_message,
/* reasoning_budget */ params_base.sampling.reasoning_budget_tokens,
/* reasoning_budget_msg */ params_base.sampling.reasoning_budget_message,
/* media_path */ params_base.media_path,
/* force_pure_content */ params_base.force_pure_content_parser
};
@ -2960,7 +2961,13 @@ private:
// verify and try to accept the draft
{
common_sampler_ptr smpl_save(common_sampler_clone(slot.smpl.get()));
const bool use_ckpt = slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
// only save the sampler state if we use checkpoints
common_sampler_ptr smpl_save;
if (use_ckpt) {
smpl_save.reset(common_sampler_clone(slot.smpl.get()));
}
GGML_ASSERT(slot.spec_i_batch.size() == n_draft + 1);
auto accepted = common_sampler_sample_and_accept_n(slot.smpl.get(), slot.ctx, slot.spec_i_batch, slot.spec_draft);
@ -2972,7 +2979,7 @@ private:
// check for partial draft acceptance
if (accepted.size() < slot.spec_draft.size() + 1) {
if (slot.ctx_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL) {
if (use_ckpt) {
// partial acceptance is not supported by the context -> truncate the draft and restore the state
slot.spec_draft = std::move(accepted);
@ -3774,7 +3781,7 @@ void server_routes::init_routes() {
this->post_responses_oai = [this](const server_http_req & req) {
auto res = create_response();
std::vector<raw_buffer> files;
json body = convert_responses_to_chatcmpl(json::parse(req.body));
json body = server_chat_convert_responses_to_chatcmpl(json::parse(req.body));
SRV_DBG("%s\n", "Request converted: OpenAI Responses -> OpenAI Chat Completions");
SRV_DBG("converted request: %s\n", body.dump().c_str());
json body_parsed = oaicompat_chat_params_parse(
@ -3819,7 +3826,7 @@ void server_routes::init_routes() {
this->post_anthropic_messages = [this](const server_http_req & req) {
auto res = create_response();
std::vector<raw_buffer> files;
json body = convert_anthropic_to_oai(json::parse(req.body));
json body = server_chat_convert_anthropic_to_oai(json::parse(req.body));
SRV_DBG("%s\n", "Request converted: Anthropic -> OpenAI Chat Completions");
SRV_DBG("converted request: %s\n", body.dump().c_str());
json body_parsed = oaicompat_chat_params_parse(
@ -3837,7 +3844,7 @@ void server_routes::init_routes() {
this->post_anthropic_count_tokens = [this](const server_http_req & req) {
auto res = create_response();
std::vector<raw_buffer> files;
json body = convert_anthropic_to_oai(json::parse(req.body));
json body = server_chat_convert_anthropic_to_oai(json::parse(req.body));
SRV_DBG("%s\n", "Request converted: Anthropic -> OpenAI Chat Completions");
SRV_DBG("converted request: %s\n", body.dump().c_str());
json body_parsed = oaicompat_chat_params_parse(

View file

@ -712,6 +712,11 @@ void server_models::unload(const std::string & name) {
if (it->second.meta.is_running()) {
SRV_INF("stopping model instance name=%s\n", name.c_str());
stopping_models.insert(name);
if (it->second.meta.status == SERVER_MODEL_STATUS_LOADING) {
// special case: if the model is still loading, unloading means force-killing it
SRV_WRN("model name=%s is still loading, force-killing\n", name.c_str());
subprocess_terminate(it->second.subproc.get());
}
cv_stop.notify_all();
// status change will be handled by the managing thread
} else {

View file

@ -1,6 +1,7 @@
#include "server-task.h"
#include "build-info.h"
#include "server-chat.h"
#include "chat.h"
#include "common.h"
#include "json-schema-to-grammar.h"
@ -873,7 +874,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
json {
{"finish_reason", nullptr},
{"index", index},
{"delta", common_chat_msg_diff_to_json_oaicompat(diff)},
{"delta", server_chat_msg_diff_to_json_oaicompat(diff)},
},
})},
{"created", t},
@ -1110,7 +1111,7 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() {
json server_task_result_cmpl_final::to_json_oaicompat_asr() {
json event = json {
{"type", "transcript.text.done"},
{"text", content},
{"text", oaicompat_msg.content},
{"usage", json {
{"type", "tokens"},
{"input_tokens", n_prompt_tokens},
@ -1522,7 +1523,7 @@ json server_task_result_cmpl_partial::to_json_oaicompat_chat() {
}
for (const auto & diff : oaicompat_msg_diffs) {
add_delta(common_chat_msg_diff_to_json_oaicompat(diff));
add_delta(server_chat_msg_diff_to_json_oaicompat(diff));
}
if (!deltas.empty()) {