Merge commit '4aced7a631' into concedo_experimental

# Conflicts:
# .devops/cann.Dockerfile
# .devops/cpu.Dockerfile
# .devops/cuda.Dockerfile
# .devops/intel.Dockerfile
# .devops/musa.Dockerfile
# .devops/rocm.Dockerfile
# .devops/tools.sh
# .devops/vulkan.Dockerfile
# .github/workflows/build.yml
# .github/workflows/release.yml
# .gitignore
# docs/ops.md
# docs/ops/SYCL.csv
# examples/batched/batched.cpp
# examples/eval-callback/eval-callback.cpp
# examples/gen-docs/gen-docs.cpp
# examples/lookahead/lookahead.cpp
# examples/lookup/lookup-create.cpp
# examples/lookup/lookup-stats.cpp
# examples/lookup/lookup.cpp
# examples/model-conversion/scripts/causal/compare-logits.py
# examples/model-conversion/scripts/causal/run-org-model.py
# examples/model-conversion/scripts/utils/check-nmse.py
# examples/parallel/parallel.cpp
# examples/retrieval/retrieval.cpp
# examples/save-load-state/save-load-state.cpp
# examples/speculative-simple/speculative-simple.cpp
# examples/speculative/speculative.cpp
# examples/training/finetune.cpp
# ggml/CMakeLists.txt
# ggml/src/ggml-cann/ggml-cann.cpp
# ggml/src/ggml-cpu/repack.cpp
# ggml/src/ggml-sycl/common.hpp
# ggml/src/ggml-sycl/convert.cpp
# ggml/src/ggml-sycl/dequantize.hpp
# ggml/src/ggml-sycl/dpct/helper.hpp
# ggml/src/ggml-sycl/element_wise.cpp
# ggml/src/ggml-sycl/element_wise.hpp
# ggml/src/ggml-sycl/ggml-sycl.cpp
# ggml/src/ggml-sycl/mmvq.cpp
# ggml/src/ggml-sycl/pad.cpp
# ggml/src/ggml-sycl/ssm_conv.cpp
# ggml/src/ggml-sycl/vecdotq.hpp
# pyrightconfig.json
# scripts/sync-ggml.last
# tests/test-arg-parser.cpp
# tests/test-backend-ops.cpp
# tools/cvector-generator/cvector-generator.cpp
# tools/imatrix/imatrix.cpp
# tools/mtmd/CMakeLists.txt
# tools/mtmd/clip.cpp
# tools/perplexity/perplexity.cpp
# tools/server/README.md
2026-05-22 03:10:03 +00:00 · 2025-12-16 23:14:12 +08:00 · 2025-12-16 23:14:12 +08:00 · 050a5b1f52
commit 050a5b1f52
parent e88bf41fdc 4aced7a631
72 changed files with 4190 additions and 3086 deletions
--- a/.devops/s390x.Dockerfile
+++ b/.devops/s390x.Dockerfile
@@ -1,126 +0,0 @@
-ARG GCC_VERSION=15.2.0
-ARG UBUNTU_VERSION=24.04
-
-### Build Llama.cpp stage
-FROM gcc:${GCC_VERSION} AS build
-
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
-    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
-    apt update -y && \
-    apt upgrade -y && \
-    apt install -y --no-install-recommends \
-        git cmake ccache ninja-build \
-        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
-        libopenblas-dev libcurl4-openssl-dev && \
-    rm -rf /var/lib/apt/lists/*
-
-WORKDIR /app
-COPY . .
-
-RUN --mount=type=cache,target=/root/.ccache \
-    --mount=type=cache,target=/app/build \
-    cmake -S . -B build -G Ninja \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-        -DLLAMA_BUILD_TESTS=OFF \
-        -DGGML_NATIVE=OFF \
-        -DGGML_BACKEND_DL=ON \
-        -DGGML_CPU_ALL_VARIANTS=ON \
-        -DGGML_BLAS=ON \
-        -DGGML_BLAS_VENDOR=OpenBLAS && \
-    cmake --build build --config Release -j $(nproc) && \
-    cmake --install build --prefix /opt/llama.cpp
-
-COPY *.py             /opt/llama.cpp/bin
-COPY .devops/tools.sh /opt/llama.cpp/bin
-
-COPY gguf-py          /opt/llama.cpp/gguf-py
-COPY requirements.txt /opt/llama.cpp/gguf-py
-COPY requirements     /opt/llama.cpp/gguf-py/requirements
-
-
-### Collect all llama.cpp binaries, libraries and distro libraries
-FROM scratch AS collector
-
-# Copy llama.cpp binaries and libraries
-COPY --from=build /opt/llama.cpp/bin     /llama.cpp/bin
-COPY --from=build /opt/llama.cpp/lib     /llama.cpp/lib
-COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py
-
-
-### Base image
-FROM ubuntu:${UBUNTU_VERSION} AS base
-
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
-    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
-    apt update -y && \
-    apt install -y --no-install-recommends \
-        # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
-        # See: https://github.com/ggml-org/llama.cpp/pull/15915#issuecomment-3317166506
-        curl libgomp1 libopenblas-dev && \
-    apt autoremove -y && \
-    apt clean -y && \
-    rm -rf /tmp/* /var/tmp/* && \
-    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
-    find /var/cache -type f -delete
-
-# Copy llama.cpp libraries
-COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu
-
-
-### Full
-FROM base AS full
-
-ENV PATH="/root/.cargo/bin:${PATH}"
-WORKDIR /app
-
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
-    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
-    apt update -y && \
-    apt install -y \
-        git cmake libjpeg-dev \
-        python3 python3-pip python3-dev && \
-    apt autoremove -y && \
-    apt clean -y && \
-    rm -rf /tmp/* /var/tmp/* && \
-    find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \
-    find /var/cache -type f -delete
-
-RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
-
-COPY --from=collector /llama.cpp/bin /app
-COPY --from=collector /llama.cpp/gguf-py /app/gguf-py
-
-RUN pip install --no-cache-dir --break-system-packages \
-        -r /app/gguf-py/requirements.txt
-
-ENTRYPOINT [ "/app/tools.sh" ]
-
-
-### CLI Only
-FROM base AS light
-
-WORKDIR /llama.cpp/bin
-
-# Copy llama.cpp binaries and libraries
-COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
-COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin
-
-ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]
-
-
-### Server
-FROM base AS server
-
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-WORKDIR /llama.cpp/bin
-
-# Copy llama.cpp binaries and libraries
-COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
-COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin
-
-EXPOSE 8080
-
-ENTRYPOINT [ "/llama.cpp/bin/llama-server" ]
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -107,6 +107,16 @@ bool common_arg::is_exclude(enum llama_example ex) {

 bool common_arg::get_value_from_env(std::string & output) const {
    if (env == nullptr) return false;
+    if (!args_neg.empty()) {
+        // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+        std::string neg_env = env;
+        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+        char * neg_value = std::getenv(neg_env.c_str());
+        if (neg_value) {
+            output = "0"; // falsey
+            return true;
+        }
+    }
    char * value = std::getenv(env);
    if (value) {
        output = value;
@@ -116,6 +126,14 @@ bool common_arg::get_value_from_env(std::string & output) const {
 }

 bool common_arg::has_value_from_env() const {
+    if (env != nullptr && !args_neg.empty()) {
+        // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+        std::string neg_env = env;
+        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+        if (std::getenv(neg_env.c_str())) {
+            return true;
+        }
+    }
    return env != nullptr && std::getenv(env);
 }

@@ -153,9 +171,10 @@ std::string common_arg::to_string() const {
    std::string leading_spaces(n_leading_spaces, ' ');

    std::ostringstream ss;
-    for (const auto arg : args) {
-        if (arg == args.front()) {
-            if (args.size() == 1) {
+    auto all_args = get_args(); // also contains args_neg
+    for (const auto & arg : all_args) {
+        if (arg == all_args.front()) {
+            if (all_args.size() == 1) {
                ss << arg;
            } else {
                // first arg is usually abbreviation, we need padding to make it more beautiful
@@ -164,7 +183,7 @@ std::string common_arg::to_string() const {
                ss << tmp << spaces;
            }
        } else {
-            ss << arg << (arg != args.back() ? ", " : "");
+            ss << arg << (arg != all_args.back() ? ", " : "");
        }
    }
    if (value_hint) ss << " " << value_hint;
@@ -183,6 +202,31 @@ std::string common_arg::to_string() const {
    return ss.str();
 }

+std::vector<std::string> common_arg::get_args() const {
+    std::vector<std::string> result;
+    for (const auto & arg : args) {
+        result.push_back(std::string(arg));
+    }
+    for (const auto & arg : args_neg) {
+        result.push_back(std::string(arg));
+    }
+    return result;
+}
+
+std::vector<std::string> common_arg::get_env() const {
+    std::vector<std::string> result;
+    if (env) {
+        result.push_back(std::string(env));
+    }
+    if (!args_neg.empty() && env) {
+        // for compatibility, we need to add LLAMA_ARG_NO_ variant
+        std::string neg_env = env;
+        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+        result.push_back(neg_env);
+    }
+    return result;
+}
+
 //
 // utils
 //
@@ -318,6 +362,16 @@ static std::string get_all_kv_cache_types() {
    return msg.str();
 }

+static bool parse_bool_value(const std::string & value) {
+    if (is_truthy(value)) {
+        return true;
+    } else if (is_falsey(value)) {
+        return false;
+    } else {
+        throw std::invalid_argument("invalid boolean value");
+    }
+}
+
 //
 // CLI argument parsing functions
 //
@@ -325,10 +379,13 @@ static std::string get_all_kv_cache_types() {
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
    common_params & params = ctx_arg.params;

-    std::unordered_map<std::string, common_arg *> arg_to_options;
+    std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
    for (auto & opt : ctx_arg.options) {
        for (const auto & arg : opt.args) {
-            arg_to_options[arg] = &opt;
+            arg_to_options[arg] = {&opt, /* is_positive */ true};
+        }
+        for (const auto & arg : opt.args_neg) {
+            arg_to_options[arg] = {&opt, /* is_positive */ false};
        }
    }

@@ -337,12 +394,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        std::string value;
        if (opt.get_value_from_env(value)) {
            try {
-                if (opt.handler_void && (value == "1" || value == "true")) {
+                if (opt.handler_void && is_truthy(value)) {
                    opt.handler_void(params);
                }
                if (opt.handler_int) {
                    opt.handler_int(params, std::stoi(value));
                }
+                if (opt.handler_bool) {
+                    opt.handler_bool(params, parse_bool_value(value));
+                }
                if (opt.handler_string) {
                    opt.handler_string(params, value);
                    continue;
@@ -371,7 +431,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        if (arg_to_options.find(arg) == arg_to_options.end()) {
            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
        }
-        auto opt = *arg_to_options[arg];
+        auto & tmp = arg_to_options[arg];
+        auto opt = *tmp.first;
+        bool is_positive = tmp.second;
        if (opt.has_value_from_env()) {
            fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
        }
@@ -380,6 +442,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
                opt.handler_void(params);
                continue;
            }
+            if (opt.handler_bool) {
+                opt.handler_bool(params, is_positive);
+                continue;
+            }

            // arg with single value
            check_arg(i);
@@ -404,7 +470,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
            throw std::invalid_argument(string_format(
                "error while handling argument \"%s\": %s\n\n"
                "usage:\n%s\n\nto show complete usage, run with -h",
-                arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
+                arg.c_str(), e.what(), opt.to_string().c_str()));
        }
    }

@@ -440,7 +506,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context

    // model is required (except for server)
    // TODO @ngxson : maybe show a list of available models in CLI in this case
-    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage) {
+    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage && !params.completion) {
        throw std::invalid_argument("error: --model is required\n");
    }

@@ -575,6 +641,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
        "llama-batched-bench",
        "llama-bench",
        "llama-cli",
+        "llama-completion",
        "llama-convert-llama2c-to-ggml",
        "llama-cvector-generator",
        "llama-embedding",
@@ -659,7 +726,7 @@ static void add_rpc_devices(const std::string & servers) {
    }
 }

-bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
+bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map) {
    common_params dummy_params;
    common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr);

@@ -668,6 +735,9 @@ bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<comm
        for (const auto & arg : opt.args) {
            arg_to_options[arg] = &opt;
        }
+        for (const auto & arg : opt.args_neg) {
+            arg_to_options[arg] = &opt;
+        }
    }

    // TODO @ngxson : find a way to deduplicate this code
@@ -752,11 +822,11 @@ static std::string list_builtin_chat_templates() {
 }

 bool common_arg_utils::is_truthy(const std::string & value) {
-    return value == "on" || value == "enabled" || value == "1";
+    return value == "on" || value == "enabled" || value == "true" || value == "1";
 }

 bool common_arg_utils::is_falsey(const std::string & value) {
-    return value == "off" || value == "disabled" || value == "0";
+    return value == "off" || value == "disabled" || value == "false" || value == "0";
 }

 bool common_arg_utils::is_autoy(const std::string & value) {
@@ -841,10 +911,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ));
    add_opt(common_arg(
+        {"--display-prompt"},
        {"--no-display-prompt"},
-        string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
-        [](common_params & params) {
-            params.display_prompt = false;
+        string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.display_prompt = value;
        }
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
    add_opt(common_arg(
@@ -1057,18 +1128,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.kv_unified = true;
        }
    ).set_env("LLAMA_ARG_KV_UNIFIED"));
-    add_opt(common_arg(
-        {"--no-context-shift"},
-        string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
-        [](common_params & params) {
-            params.ctx_shift = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
    add_opt(common_arg(
        {"--context-shift"},
-        string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.ctx_shift = true;
+        {"--no-context-shift"},
+        string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.ctx_shift = value;
        }
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
    add_opt(common_arg(
@@ -1108,20 +1173,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
    add_opt(common_arg(
+        {"--perf"},
        {"--no-perf"},
-        string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
-        [](common_params & params) {
-            params.no_perf = true;
-            params.sampling.no_perf = true;
+        string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.no_perf = !value;
+            params.sampling.no_perf = !value;
        }
-    ).set_env("LLAMA_ARG_NO_PERF"));
+    ).set_env("LLAMA_ARG_PERF"));
    add_opt(common_arg(
+        {"--show-timings"},
        {"--no-show-timings"},
-        string_format("disable timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
-        [](common_params & params) {
-            params.show_timings = false;
+        string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.show_timings = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_NO_SHOW_TIMINGS"));
+    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
    add_opt(common_arg(
        {"-f", "--file"}, "FNAME",
        "a file containing the prompt (default: none)",
@@ -1173,16 +1240,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_excludes({LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-e", "--escape"},
-        string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
-        [](common_params & params) {
-            params.escape = true;
-        }
-    ));
-    add_opt(common_arg(
        {"--no-escape"},
-        "do not process escape sequences",
-        [](common_params & params) {
-            params.escape = false;
+        string_format("whether to process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.escape = value;
        }
    ));
    add_opt(common_arg(
@@ -1229,19 +1290,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"-cnv", "--conversation"},
-        "run in conversation mode:\n"
+        {"-no-cnv", "--no-conversation"},
+        "whether to run in conversation mode:\n"
        "- does not print special tokens and suffix/prefix\n"
        "- interactive mode is also enabled\n"
        "(default: auto enabled if chat template is available)",
-        [](common_params & params) {
-            params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
-        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
-    add_opt(common_arg(
-        {"-no-cnv", "--no-conversation"},
-        "force disable conversation mode (default: false)",
-        [](common_params & params) {
-            params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
+        [](common_params & params, bool value) {
+            params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED;
        }
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
    add_opt(common_arg(
@@ -1299,10 +1354,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
    add_opt(common_arg(
+        {"--warmup"},
        {"--no-warmup"},
-        "skip warming up the model with an empty run",
-        [](common_params & params) {
-            params.warmup = false;
+        string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.warmup = value;
        }
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
    add_opt(common_arg(
@@ -1361,7 +1417,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.sampling.top_k = value;
            params.sampling.user_sampling_config |= common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_TOP_K;
        }
-    ).set_sparam());
+    ).set_sparam().set_env("LLAMA_ARG_TOP_K"));
    add_opt(common_arg(
        {"--top-p"}, "N",
        string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
@@ -1704,19 +1760,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
    add_opt(common_arg(
+        {"-kvo", "--kv-offload"},
        {"-nkvo", "--no-kv-offload"},
-        "disable KV offload",
-        [](common_params & params) {
-            params.no_kv_offload = true;
+        string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"),
+        [](common_params & params, bool value) {
+            params.no_kv_offload = !value;
        }
-    ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    ).set_env("LLAMA_ARG_KV_OFFLOAD"));
    add_opt(common_arg(
+        {"--repack"},
        {"-nr", "--no-repack"},
-        "disable weight repacking",
-        [](common_params & params) {
-            params.no_extra_bufts = true;
+        string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"),
+        [](common_params & params, bool value) {
+            params.no_extra_bufts = !value;
        }
-    ).set_env("LLAMA_ARG_NO_REPACK"));
+    ).set_env("LLAMA_ARG_REPACK"));
    add_opt(common_arg(
        {"--no-host"},
        "bypass host buffer allowing extra buffers to be used",
@@ -1845,18 +1903,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_PARALLEL}));
    add_opt(common_arg(
        {"-cb", "--cont-batching"},
-        string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.cont_batching = true;
+        {"-nocb", "--no-cont-batching"},
+        string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.cont_batching = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
-    add_opt(common_arg(
-        {"-nocb", "--no-cont-batching"},
-        "disable continuous batching",
-        [](common_params & params) {
-            params.cont_batching = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
    add_opt(common_arg(
        {"-mm", "--mmproj"}, "FILE",
        "path to a multimodal projector file. see tools/mtmd/README.md\n"
@@ -1873,19 +1925,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
    add_opt(common_arg(
-        {"--no-mmproj"},
-        "explicitly disable multimodal projector, useful when using -hf",
-        [](common_params & params) {
-            params.no_mmproj = true;
+        {"--mmproj-auto"},
+        {"--no-mmproj", "--no-mmproj-auto"},
+        string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? "disabled" : "enabled"),
+        [](common_params & params, bool value) {
+            params.no_mmproj = !value;
        }
-    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
    add_opt(common_arg(
+        {"--mmproj-offload"},
        {"--no-mmproj-offload"},
-        "do not offload multimodal projector to GPU",
-        [](common_params & params) {
-            params.mmproj_use_gpu = false;
+        string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.mmproj_use_gpu = value;
        }
-    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
    add_opt(common_arg(
        {"--image", "--audio"}, "FILE",
        "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
@@ -1925,12 +1979,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_env("LLAMA_ARG_MLOCK"));
    add_opt(common_arg(
+        {"--mmap"},
        {"--no-mmap"},
-        "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
-        [](common_params & params) {
-            params.use_mmap = false;
+        string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.use_mmap = value;
        }
-    ).set_env("LLAMA_ARG_NO_MMAP"));
+    ).set_env("LLAMA_ARG_MMAP"));
    add_opt(common_arg(
        {"--numa"}, "TYPE",
        "attempt optimizations that help on some NUMA systems\n"
@@ -2118,10 +2173,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ));
    add_opt(common_arg(
+        {"--op-offload"},
        {"--no-op-offload"},
-        string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
-        [](common_params & params) {
-            params.no_op_offload = true;
+        string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"),
+        [](common_params & params, bool value) {
+            params.no_op_offload = !value;
        }
    ));
    add_opt(common_arg(
@@ -2317,10 +2373,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
+        {"--ppl"},
        {"--no-ppl"},
-        string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
-        [](common_params & params) {
-            params.compute_ppl = false;
+        string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.compute_ppl = value;
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
@@ -2439,12 +2496,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
    add_opt(common_arg(
+        {"--webui"},
        {"--no-webui"},
-        string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.webui = false;
+        string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.webui = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
    add_opt(common_arg(
        {"--embedding", "--embeddings"},
        string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@@ -2549,18 +2607,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
    add_opt(common_arg(
        {"--slots"},
-        string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.endpoint_slots = true;
+        {"--no-slots"},
+        string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.endpoint_slots = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
-    add_opt(common_arg(
-        {"--no-slots"},
-        "disables slots monitoring endpoint",
-        [](common_params & params) {
-            params.endpoint_slots = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
    add_opt(common_arg(
        {"--slot-save-path"}, "PATH",
        "path to save slot kv cache (default: disabled)",
@@ -2611,26 +2663,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
    add_opt(common_arg(
+        {"--models-autoload"},
        {"--no-models-autoload"},
-        "disables automatic loading of models (default: enabled)",
-        [](common_params & params) {
-            params.models_autoload = false;
+        string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.models_autoload = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
    add_opt(common_arg(
        {"--jinja"},
-        string_format("use jinja template for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.use_jinja = true;
+        {"--no-jinja"},
+        string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.use_jinja = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
-    add_opt(common_arg(
-        {"--no-jinja"},
-        string_format("disable jinja template for chat (default: %s)", params.use_jinja ? "disabled" : "enabled"),
-        [](common_params & params) {
-            params.use_jinja = false;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
    add_opt(common_arg(
        {"--reasoning-format"}, "FORMAT",
        "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
@ -2675,15 +2722,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
    add_opt(common_arg(
+        {"--prefill-assistant"},
        {"--no-prefill-assistant"},
        string_format(
            "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
            "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
        ),
-        [](common_params & params) {
-            params.prefill_assistant = false;
+        [](common_params & params, bool value) {
+            params.prefill_assistant = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
    add_opt(common_arg(
        {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
        string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
--- a/common/arg.h
+++ b/common/arg.h
@ -16,6 +16,7 @@ struct common_arg {
    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
    std::set<enum llama_example> excludes = {};
    std::vector<const char *> args;
+    std::vector<const char *> args_neg;  // for negated args like --no-xxx
    const char * value_hint   = nullptr; // help text or example for arg value
    const char * value_hint_2 = nullptr; // for second arg value
    const char * env          = nullptr;
@ -25,6 +26,7 @@ struct common_arg {
    void (*handler_string) (common_params & params, const std::string &) = nullptr;
    void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
    void (*handler_int)    (common_params & params, int) = nullptr;
+    void (*handler_bool)   (common_params & params, bool) = nullptr;

    common_arg() = default;

@ -48,6 +50,13 @@ struct common_arg {
        void (*handler)(common_params & params)
    ) : args(args), help(help), handler_void(handler) {}

+    common_arg(
+        const std::initializer_list<const char *> & args,
+        const std::initializer_list<const char *> & args_neg,
+        const std::string & help,
+        void (*handler)(common_params & params, bool)
+    ) : args(args), args_neg(args_neg), help(help), handler_bool(handler) {}
+
    // support 2 values for arg
    common_arg(
        const std::initializer_list<const char *> & args,
@ -80,6 +89,10 @@ struct common_arg {
        }
        return strcmp(args[0], other.args[0]) == 0;
    }
+
+    // get all args and env vars (including negated args/env)
+    std::vector<std::string> get_args() const;
+    std::vector<std::string> get_env() const;
 };

 namespace common_arg_utils {
@ -102,7 +115,7 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e

 // parse input arguments from CLI into a map
 // TODO: support repeated args in the future
-bool common_params_parse(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
+bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);

 // initialize argument parser context - used by test-arg-parser and preset
 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
--- a/common/common.cpp
+++ b/common/common.cpp
@ -1021,31 +1021,40 @@ bool tty_can_use_colors() {
 // Model utils
 //

-static inline void common_init_sampler_from_model(
+// TODO: move to common/sampling
+static void common_init_sampler_from_model(
    const llama_model * model,
    common_params_sampling & sparams) {

    const uint64_t config = sparams.user_sampling_config;

    auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) {
-        if (config & user_config) return;
+        if (config & user_config) {
+            return;
+        }

        char buf[64] = {0};
        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
            char * end = nullptr;
            int32_t v = strtol(buf, &end, 10);
-            if (end && end != buf) dst = v;
+            if (end && end != buf) {
+                dst = v;
+            }
        }
    };

    auto get_float = [&](const char * key, float & dst, uint64_t user_config) {
-        if (config & user_config) return;
+        if (config & user_config) {
+            return;
+        }

        char buf[128] = {0};
        if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) {
            char * end = nullptr;
            float v = strtof(buf, &end);
-            if (end && end != buf) dst = v;
+            if (end && end != buf) {
+                dst = v;
+            }
        }
    };

@ -1073,31 +1082,122 @@ static inline void common_init_sampler_from_model(
    get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA),    sparams.mirostat_eta,    common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA);
 }

-struct common_init_result common_init_from_params(common_params & params) {
-    common_init_result iparams;
-    auto mparams = common_model_params_to_llama(params);
+struct common_init_result::impl {
+    impl() = default;
+    ~impl() = default;
+
+    llama_model_ptr   model;
+    llama_context_ptr context;
+
+    std::vector<llama_adapter_lora_ptr> lora;
+
+    std::vector<common_sampler_ptr> samplers;
+};
+
+common_init_result::common_init_result(common_params & params) :
+    pimpl(new impl{}) {
+    const auto mparams = common_model_params_to_llama(params);

    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
    if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
-            __func__, params.model.path.c_str());
-        return iparams;
+        return;
    }

-    common_init_sampler_from_model(model, params.sampling);
+    pimpl->model.reset(model);

    const llama_vocab * vocab = llama_model_get_vocab(model);

+    // updates params.sampling
+    // TODO: fix naming
+    common_init_sampler_from_model(model, params.sampling);
+
    auto cparams = common_context_params_to_llama(params);

+    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
+        params.sampling.ignore_eos = false;
+    }
+
+    // initialize once
+    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+        if (llama_vocab_is_eog(vocab, i)) {
+            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY);
+            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
+        }
+    }
+
+    if (params.sampling.ignore_eos) {
+        // add EOG biases to the active set of logit biases
+        params.sampling.logit_bias.insert(
+                params.sampling.logit_bias.end(),
+                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
+    }
+
+    //if (params.sampling.penalty_last_n == -1) {
+    //    LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    params.sampling.penalty_last_n = llama_n_ctx(lctx);
+    //}
+
+    //if (params.sampling.dry_penalty_last_n == -1) {
+    //    LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
+    //    params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
+    //}
+
+    pimpl->samplers.resize(cparams.n_seq_max);
+
+    for (int i = 0; i < (int) cparams.n_seq_max; ++i) {
+        pimpl->samplers[i].reset(common_sampler_init(model, params.sampling));
+    }
+
    llama_context * lctx = llama_init_from_model(model, cparams);
    if (lctx == NULL) {
        LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
-            __func__, params.model.path.c_str());
-        llama_model_free(model);
-        return iparams;
+                __func__, params.model.path.c_str());
+        return;
    }

+    pimpl->context.reset(lctx);
+}
+
+llama_model * common_init_result::model() {
+    return pimpl->model.get();
+}
+
+llama_context * common_init_result::context() {
+    return pimpl->context.get();
+}
+
+common_sampler * common_init_result::sampler(llama_seq_id seq_id) {
+    return pimpl->samplers[seq_id].get();
+}
+
+std::vector<llama_adapter_lora_ptr> & common_init_result::lora() {
+    return pimpl->lora;
+}
+
+void common_init_result::free_context() {
+    pimpl->context.reset();
+}
+
+common_init_result_ptr common_init_from_params(common_params & params) {
+    common_init_result_ptr res(new common_init_result(params));
+
+    llama_model * model = res->model();
+    if (model == NULL) {
+        LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
+            __func__, params.model.path.c_str());
+        return res;
+    }
+
+    llama_context * lctx = res->context();
+    if (lctx == NULL) {
+        LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n",
+            __func__, params.model.path.c_str());
+        return res;
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
    if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) {
        LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
        params.ctx_shift = false;
@ -1109,10 +1209,7 @@ struct common_init_result common_init_from_params(common_params & params) {

        const auto cvec = common_control_vector_load(params.control_vectors);
        if (cvec.n_embd == -1) {
-            llama_free(lctx);
-            llama_model_free(model);
-
-            return iparams;
+            return res;
        }

        int err = llama_apply_adapter_cvec(
@ -1123,10 +1220,7 @@ struct common_init_result common_init_from_params(common_params & params) {
                params.control_vector_layer_start,
                params.control_vector_layer_end);
        if (err) {
-            llama_free(lctx);
-            llama_model_free(model);
-
-            return iparams;
+            return res;
        }
    }

@ -1150,10 +1244,7 @@ struct common_init_result common_init_from_params(common_params & params) {
        }

        if (!ok) {
-            llama_free(lctx);
-            llama_model_free(model);
-
-            return iparams;
+            return res;
        }
    }

@ -1163,9 +1254,7 @@ struct common_init_result common_init_from_params(common_params & params) {
        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
        if (lora == nullptr) {
            LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
-            llama_free(lctx);
-            llama_model_free(model);
-            return iparams;
+            return res;
        }

        char buf[1024];
@ -1174,43 +1263,13 @@ struct common_init_result common_init_from_params(common_params & params) {
        la.task_name = buf;
        llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf));
        la.prompt_prefix = buf;
-        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
+        res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters
    }

    if (!params.lora_init_without_apply) {
        common_set_adapter_lora(lctx, params.lora_adapters);
    }

-    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
-        params.sampling.ignore_eos = false;
-    }
-
-    // initialize once
-    for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
-        if (llama_vocab_is_eog(vocab, i)) {
-            LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
-            params.sampling.logit_bias_eog.push_back({i, -INFINITY});
-        }
-    }
-
-    if (params.sampling.ignore_eos) {
-        // add EOG biases to the active set of logit biases
-        params.sampling.logit_bias.insert(
-                params.sampling.logit_bias.end(),
-                params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end());
-    }
-
-    if (params.sampling.penalty_last_n == -1) {
-        LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
-        params.sampling.penalty_last_n = llama_n_ctx(lctx);
-    }
-
-    if (params.sampling.dry_penalty_last_n == -1) {
-        LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx));
-        params.sampling.dry_penalty_last_n = llama_n_ctx(lctx);
-    }
-
    if (params.warmup) {
        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

@ -1249,12 +1308,11 @@ struct common_init_result common_init_from_params(common_params & params) {
        llama_set_warmup(lctx, false);
    }

-    iparams.model.reset(model);
-    iparams.context.reset(lctx);
-
-    return iparams;
+    return res;
 }

+common_init_result::~common_init_result() = default;
+
 std::string get_model_endpoint() {
    const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
    // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
@ -1263,7 +1321,9 @@ std::string get_model_endpoint() {
    std::string model_endpoint = "https://huggingface.co/";
    if (endpoint_env) {
        model_endpoint = endpoint_env;
-        if (model_endpoint.back() != '/') model_endpoint += '/';
+        if (model_endpoint.back() != '/') {
+            model_endpoint += '/';
+        }
    }
    return model_endpoint;
 }
--- a/common/common.h
+++ b/common/common.h
@ -191,7 +191,6 @@ struct common_params_sampling {

    std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"};     // default sequence breakers for DRY

-
    std::vector<enum common_sampler_type> samplers = {
        COMMON_SAMPLER_TYPE_PENALTIES,
        COMMON_SAMPLER_TYPE_DRY,
@ -212,6 +211,10 @@ struct common_params_sampling {
    std::vector<llama_logit_bias> logit_bias;     // logit biases to apply
    std::vector<llama_logit_bias> logit_bias_eog; // pre-calculated logit biases for EOG tokens

+    bool has_logit_bias() const {
+        return !logit_bias.empty();
+    }
+
    // print the parameters into a string
    std::string print() const;
 };
@ -665,15 +668,29 @@ bool tty_can_use_colors();
 // Model utils
 //

-// note: defines object's lifetime
-struct common_init_result {
-    llama_model_ptr   model;
-    llama_context_ptr context;
+struct common_sampler;

-    std::vector<llama_adapter_lora_ptr> lora;
+// note: defines the lifetimes of the model, context, samplers, etc.
+struct common_init_result {
+    common_init_result(common_params & params);
+    ~common_init_result();
+
+    llama_model * model();
+    llama_context * context();
+    common_sampler * sampler(llama_seq_id seq_id);
+
+    std::vector<llama_adapter_lora_ptr> & lora();
+
+    void free_context();
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
 };

-struct common_init_result     common_init_from_params(common_params & params);
+using common_init_result_ptr = std::unique_ptr<common_init_result>;
+
+common_init_result_ptr common_init_from_params(common_params & params);

 struct llama_model_params     common_model_params_to_llama  (      common_params & params);
 struct llama_context_params   common_context_params_to_llama(const common_params & params);
--- a/common/preset.cpp
+++ b/common/preset.cpp
@ -23,8 +23,14 @@ std::vector<std::string> common_preset::to_args() const {
        if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
            // flag option, no value
            if (common_arg_utils::is_falsey(value)) {
-                // skip the flag
-                args.pop_back();
+                // use negative arg if available
+                if (!opt.args_neg.empty()) {
+                    args.back() = opt.args_neg.back();
+                } else {
+                    // otherwise, skip the flag
+                    // TODO: maybe throw an error instead?
+                    args.pop_back();
+                }
            }
        }
        if (opt.value_hint != nullptr) {
@ -141,16 +147,31 @@ static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_
 static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) {
    std::map<std::string, common_arg> mapping;
    for (const auto & opt : ctx_params.options) {
-        if (opt.env != nullptr) {
-            mapping[opt.env] = opt;
+        for (const auto & env : opt.get_env()) {
+            mapping[env] = opt;
        }
-        for (const auto & arg : opt.args) {
+        for (const auto & arg : opt.get_args()) {
            mapping[rm_leading_dashes(arg)] = opt;
        }
    }
    return mapping;
 }

+static bool is_bool_arg(const common_arg & arg) {
+    return !arg.args_neg.empty();
+}
+
+static std::string parse_bool_arg(const common_arg & arg, const std::string & key, const std::string & value) {
+    // if this is a negated arg, we need to reverse the value
+    for (const auto & neg_arg : arg.args_neg) {
+        if (rm_leading_dashes(neg_arg) == key) {
+            return common_arg_utils::is_truthy(value) ? "false" : "true";
+        }
+    }
+    // otherwise, not negated
+    return value;
+}
+
 common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) {
    common_presets out;
    auto key_to_opt = get_map_key_opt(ctx_params);
@ -167,8 +188,13 @@ common_presets common_presets_load(const std::string & path, common_params_conte
        for (const auto & [key, value] : section.second) {
            LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str());
            if (key_to_opt.find(key) != key_to_opt.end()) {
-                preset.options[key_to_opt[key]] = value;
-                LOG_DBG("accepted option: %s = %s\n", key.c_str(), value.c_str());
+                auto & opt = key_to_opt[key];
+                if (is_bool_arg(opt)) {
+                    preset.options[opt] = parse_bool_arg(opt, key, value);
+                } else {
+                    preset.options[opt] = value;
+                }
+                LOG_DBG("accepted option: %s = %s\n", key.c_str(), preset.options[opt].c_str());
            } else {
                // TODO: maybe warn about unknown key?
            }
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@ -104,9 +104,10 @@ struct ring_buffer {
 struct common_sampler {
    common_params_sampling params;

-    struct llama_sampler * grmr;
    struct llama_sampler * chain;

+    bool grammar;
+
    ring_buffer<llama_token> prev;

    std::vector<llama_token_data> cur;
@ -116,7 +117,6 @@ struct common_sampler {
    void reset() {
        prev.clear();

-        llama_sampler_reset(grmr);
        llama_sampler_reset(chain);
    }

@ -167,10 +167,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co

    lparams.no_perf = params.no_perf;

-    struct llama_sampler * grmr;
+    llama_sampler * chain = llama_sampler_chain_init(lparams);
+
+    bool grammar = false;
+    std::vector<llama_sampler *> samplers;
+
    if (params.grammar.compare(0, 11, "%llguidance") == 0) {
 #ifdef LLAMA_USE_LLGUIDANCE
-        grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str());
+        samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()));
+        grammar = true;
 #else
        GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled");
 #endif // LLAMA_USE_LLGUIDANCE
@ -217,30 +222,23 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
            trigger_patterns_c.push_back(regex.c_str());
        }

-        grmr = params.grammar_lazy
-             ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
-                                                        trigger_patterns_c.data(), trigger_patterns_c.size(),
-                                                        trigger_tokens.data(), trigger_tokens.size())
-             :      llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
-        if (!grmr) {
-            return nullptr;
+        if (!params.grammar.empty()) {
+             if (params.grammar_lazy) {
+                 samplers.push_back(
+                         llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root",
+                             trigger_patterns_c.data(), trigger_patterns_c.size(),
+                             trigger_tokens.data(),     trigger_tokens.size()));
+             } else {
+                 samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"));
+             }
+
+             grammar = true;
        }
    }

-    auto * result = new common_sampler {
-        /* .params = */ params,
-        /* .grmr   = */ grmr,
-        /* .chain  = */ llama_sampler_chain_init(lparams),
-        /* .prev   = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
-        /* .cur    = */ {},
-        /* .cur_p  = */ {},
-    };
-
-    llama_sampler_chain_add(result->chain,
-            llama_sampler_init_logit_bias(
-                llama_vocab_n_tokens(vocab),
-                params.logit_bias.size(),
-                params.logit_bias.data()));
+    if (params.has_logit_bias()) {
+        samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data()));
+    }

    if (params.mirostat == 0) {
        for (const auto & cnstr : params.samplers) {
@ -253,58 +251,70 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                            c_breakers.push_back(str.c_str());
                        }

-                        llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                        samplers.push_back(llama_sampler_init_dry    (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                    }
                    break;
                case COMMON_SAMPLER_TYPE_TOP_K:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_k       (params.top_k));
+                    samplers.push_back(llama_sampler_init_top_k      (params.top_k));
                    break;
                case COMMON_SAMPLER_TYPE_TOP_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_p       (params.top_p, params.min_keep));
+                    samplers.push_back(llama_sampler_init_top_p      (params.top_p, params.min_keep));
                    break;
                case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
+                    samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma));
                    break;
                case COMMON_SAMPLER_TYPE_MIN_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_min_p       (params.min_p, params.min_keep));
+                    samplers.push_back(llama_sampler_init_min_p      (params.min_p, params.min_keep));
                    break;
                case COMMON_SAMPLER_TYPE_XTC:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_xtc         (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
+                    samplers.push_back(llama_sampler_init_xtc        (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
                    break;
                case COMMON_SAMPLER_TYPE_TYPICAL_P:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_typical     (params.typ_p, params.min_keep));
+                    samplers.push_back(llama_sampler_init_typical    (params.typ_p, params.min_keep));
                    break;
                case COMMON_SAMPLER_TYPE_TEMPERATURE:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext    (params.temp, params.dynatemp_range, params.dynatemp_exponent));
+                    samplers.push_back(llama_sampler_init_temp_ext   (params.temp, params.dynatemp_range, params.dynatemp_exponent));
                    break;
                case COMMON_SAMPLER_TYPE_INFILL:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_infill      (vocab));
+                    samplers.push_back(llama_sampler_init_infill     (vocab));
                    break;
                case COMMON_SAMPLER_TYPE_PENALTIES:
-                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties   (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                    samplers.push_back(llama_sampler_init_penalties  (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
                    break;
                default:
                    GGML_ASSERT(false && "unknown sampler type");
            }
        }
-        llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
+
+        samplers.push_back(llama_sampler_init_dist(params.seed));
    } else if (params.mirostat == 1) {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+        samplers.push_back(llama_sampler_init_temp(params.temp));
+        samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
    } else if (params.mirostat == 2) {
-        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
+        samplers.push_back(llama_sampler_init_temp(params.temp));
+        samplers.push_back(llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
    } else {
        GGML_ASSERT(false && "unknown mirostat version");
    }

+    for (auto * smpl : samplers) {
+        llama_sampler_chain_add(chain, smpl);
+    }
+
+    auto * result = new common_sampler {
+        /* .params  = */ params,
+        /* .chain   = */ chain,
+        /* .grammar = */ grammar,
+        /* .prev    = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
+        /* .cur     = */ {},
+        /* .cur_p   = */ {},
+    };
+
    return result;
 }

 void common_sampler_free(struct common_sampler * gsmpl) {
    if (gsmpl) {
-        llama_sampler_free(gsmpl->grmr);
-
        llama_sampler_free(gsmpl->chain);

        delete gsmpl;
@ -314,11 +324,24 @@ void common_sampler_free(struct common_sampler * gsmpl) {
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
    const auto tm = gsmpl->tm();

-    if (accept_grammar) {
-        llama_sampler_accept(gsmpl->grmr, token);
-    }
+    if (gsmpl->grammar) {
+        const int n_smpl = llama_sampler_chain_n(gsmpl->chain);

-    llama_sampler_accept(gsmpl->chain, token);
+        for (int i = 0; i < n_smpl; i++) {
+            auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
+
+            // the grammar sampler is always the first one
+            if (i == 0) {
+                if (accept_grammar) {
+                    llama_sampler_accept(smpl, token);
+                }
+            } else {
+                llama_sampler_accept(smpl, token);
+            }
+        }
+    } else {
+        llama_sampler_accept(gsmpl->chain, token);
+    }

    gsmpl->prev.push_back(token);
 }
@ -329,12 +352,12 @@ void common_sampler_reset(struct common_sampler * gsmpl) {

 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
    return new common_sampler {
-        /* .params = */ gsmpl->params,
-        /* .grmr   = */ llama_sampler_clone(gsmpl->grmr),
-        /* .chain  = */ llama_sampler_clone(gsmpl->chain),
-        /* .prev   = */ gsmpl->prev,
-        /* .cur    = */ gsmpl->cur,
-        /* .cur_p  = */ gsmpl->cur_p,
+        /* .params  = */ gsmpl->params,
+        /* .chain   = */ llama_sampler_clone(gsmpl->chain),
+        /* .grammar = */ gsmpl->grammar,
+        /* .prev    = */ gsmpl->prev,
+        /* .cur     = */ gsmpl->cur,
+        /* .cur_p   = */ gsmpl->cur_p,
    };
 }

@ -383,58 +406,33 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
    }
 }

-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) {
+    return gsmpl->chain;
+}
+
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) {
    llama_synchronize(ctx);

    // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
    const auto tm = gsmpl->tm();

-    gsmpl->set_logits(ctx, idx);
+    llama_token id = LLAMA_TOKEN_NULL;

-    auto & grmr  = gsmpl->grmr;
    auto & chain = gsmpl->chain;
    auto & cur_p = gsmpl->cur_p; // initialized by set_logits

-    if (grammar_first) {
-        llama_sampler_apply(grmr, &cur_p);
-    }
+    gsmpl->set_logits(ctx, idx);

    llama_sampler_apply(chain, &cur_p);

    GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");

-    const llama_token id = cur_p.data[cur_p.selected].id;
+    id = cur_p.data[cur_p.selected].id;

-    if (grammar_first) {
-        return id;
-    }
-
-    // check if it the sampled token fits the grammar
-    {
-        llama_token_data       single_token_data       = { id, 1.0f, 0.0f };
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false };
-
-        llama_sampler_apply(grmr, &single_token_data_array);
-
-        const bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-        if (is_valid) {
-            return id;
-        }
-    }
-
-    // resampling:
-    // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
-    gsmpl->set_logits(ctx, idx);
-
-    llama_sampler_apply(grmr,  &cur_p);
-    llama_sampler_apply(chain, &cur_p);
-
-    GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration");
-
-    return cur_p.data[cur_p.selected].id;
+    return id;
 }

-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft) {
    GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");

    std::vector<llama_token> result;
@ -442,7 +440,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample

    size_t i = 0;
    for (; i < draft.size(); i++) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);

        common_sampler_accept(gsmpl, id, true);

@ -454,7 +452,7 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
    }

    if (i == draft.size()) {
-        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]);

        common_sampler_accept(gsmpl, id, true);

@ -464,13 +462,13 @@ std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sample
    return result;
 }

-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) {
    std::vector<int> idxs(draft.size() + 1);
    for (size_t i = 0; i < idxs.size(); ++i) {
        idxs[i] = i;
    }

-    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft);
 }

 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
@ -515,7 +513,8 @@ std::string common_sampler_print(const struct common_sampler * gsmpl) {

    for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
        const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i);
-        result += std::string("-> ") + llama_sampler_name(smpl) + " ";
+        result += std::string("-> ");
+        result += std::string(llama_sampler_name(smpl)) + " ";
    }

    return result;
--- a/common/sampling.h
+++ b/common/sampling.h
@ -48,6 +48,8 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 // arguments can be nullptr to skip printing
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);

+struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl);
+
 // extended sampling implementation:
 //
 // - set logits
@ -55,10 +57,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 // - check if the token fits the grammar (if any)
 // - if not: resample by first applying the grammar constraints and then sampling again (slower path)
 //
-// if grammar_first is true, the grammar is applied before the samplers (slower)
-// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
-//
-llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx);

 // generalized version of common_sampler_sample
 //
@ -76,10 +75,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
 //
 // returns at least 1 token, up to idxs.size()
 //
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft);

 // assume idxs == [ 0, 1, 2, ..., draft.size() ]
-std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft);

 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);

@ -107,3 +106,9 @@ std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std:

 llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
                const char * grammar_kind, const char * grammar_data);
+
+struct common_sampler_deleter {
+    void operator()(common_sampler * s) { common_sampler_free(s); }
+};
+
+typedef std::unique_ptr<common_sampler, common_sampler_deleter> common_sampler_ptr;
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@ -315,7 +315,7 @@ llama_tokens common_speculative_gen_draft(
    for (int i = 0; i < params.n_draft; ++i) {
        common_batch_clear(batch);

-        common_sampler_sample(smpl, ctx_dft, 0, true);
+        common_sampler_sample(smpl, ctx_dft, 0);

        const auto * cur_p = common_sampler_get_candidates(smpl, true);

--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@ -136,11 +136,19 @@ class ModelBase:
        self.remote_hf_model_id = remote_hf_model_id
        self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
        self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams
+        self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {}
        self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id)
        self.metadata_override = metadata_override
        self.model_name = model_name
        self.dir_model_card = dir_model  # overridden in convert_lora_to_gguf.py

+        # Ensure "rope_theta" and "rope_type" are mirrored in rope_parameters
+        if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
+            if "rope_theta" not in self.rope_parameters and (rope_theta := self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)) is not None:
+                self.rope_parameters["rope_theta"] = rope_theta
+            if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
+                self.rope_parameters["rope_type"] = rope_type
+
        # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
        if self.ftype == gguf.LlamaFileType.GUESSED:
            # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
@ -705,6 +713,9 @@ class ModelBase:
        if "llm_config" in config:
            # rename for InternVL
            config["text_config"] = config["llm_config"]
+        if "lm_config" in config:
+            # rename for GlmASR
+            config["text_config"] = config["lm_config"]
        if "thinker_config" in config:
            # rename for Qwen2.5-Omni
            config["text_config"] = config["thinker_config"]["text_config"]
@ -795,7 +806,7 @@ class TextModel(ModelBase):
    def set_gguf_parameters(self):
        self.gguf_writer.add_block_count(self.block_count)

-        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None:
+        if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length", "max_sequence_length", "model_max_length"], optional=True)) is not None:
            self.gguf_writer.add_context_length(n_ctx)
            logger.info(f"gguf: context length = {n_ctx}")

@ -815,7 +826,42 @@ class TextModel(ModelBase):
            self.gguf_writer.add_head_count_kv(n_head_kv)
            logger.info(f"gguf: key-value head count = {n_head_kv}")

-        if (rope_theta := self.hparams.get("rope_theta")) is not None:
+        rope_params = self.rope_parameters.get("full_attention", self.rope_parameters)
+        if (rope_type := rope_params.get("rope_type")) is not None:
+            rope_factor = rope_params.get("factor")
+            rope_gguf_type = gguf.RopeScalingType.NONE
+            if rope_type == "linear" and rope_factor is not None:
+                rope_gguf_type = gguf.RopeScalingType.LINEAR
+                self.gguf_writer.add_rope_scaling_type(rope_gguf_type)
+                self.gguf_writer.add_rope_scaling_factor(rope_factor)
+            elif rope_type == "yarn" and rope_factor is not None:
+                rope_gguf_type = gguf.RopeScalingType.YARN
+                self.gguf_writer.add_rope_scaling_type(rope_gguf_type)
+                self.gguf_writer.add_rope_scaling_factor(rope_factor)
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"])
+                if (yarn_ext_factor := rope_params.get("extrapolation_factor")) is not None:
+                    self.gguf_writer.add_rope_scaling_yarn_ext_factor(yarn_ext_factor)
+                if (yarn_attn_factor := rope_params.get("attention_factor", rope_params.get("attn_factor"))) is not None:
+                    self.gguf_writer.add_rope_scaling_yarn_attn_factor(yarn_attn_factor)
+                if (yarn_beta_fast := rope_params.get("beta_fast")) is not None:
+                    self.gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_beta_fast)
+                if (yarn_beta_slow := rope_params.get("beta_slow")) is not None:
+                    self.gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_beta_slow)
+                # self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
+            elif rope_type == "su" or rope_type == "longrope":
+                rope_gguf_type = gguf.RopeScalingType.LONGROPE
+                self.gguf_writer.add_rope_scaling_type(rope_gguf_type)
+            elif rope_type == "dynamic":
+                # HunYuan, handled in model class
+                pass
+            elif rope_type.lower() == "llama3":
+                # Handled in generate_extra_tensors
+                pass
+            else:
+                logger.warning(f"Unknown RoPE type: {rope_type}")
+            logger.info(f"gguf: rope scaling type = {rope_gguf_type.name}")
+
+        if (rope_theta := rope_params.get("rope_theta")) is not None:
            self.gguf_writer.add_rope_freq_base(rope_theta)
            logger.info(f"gguf: rope theta = {rope_theta}")
        if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None:
@ -1486,6 +1532,21 @@ class TextModel(ModelBase):
                raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported")
            self.gguf_writer.add_pooling_type(pooling_type)

+    def _set_vocab_glmedge(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"])
+        special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)
+
    def _set_vocab_interns1(self):
        tokens: list[str] = []
        toktypes: list[int] = []
@ -1615,7 +1676,7 @@ class MmprojModel(ModelBase):
    preprocessor_config: dict[str, Any]
    global_config: dict[str, Any]

-    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
+    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers"]

    has_vision_encoder: bool = True # by default
    has_audio_encoder: bool = False
@ -1691,7 +1752,8 @@ class MmprojModel(ModelBase):
        return self.global_config.get(config_name)

    def get_audio_config(self) -> dict[str, Any] | None:
-        return self.global_config.get("audio_config")
+        mm_config_key = "whisper_config" if "whisper_config" in self.hparams else "audio_config"
+        return self.global_config.get(mm_config_key)

    def set_type(self):
        self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
@ -1966,34 +2028,10 @@ class BaichuanModel(TextModel):
        self._set_vocab_sentencepiece()

    def set_gguf_parameters(self):
-        head_count = self.hparams["num_attention_heads"]
-        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-
-        ctx_length = 0
-        if "max_sequence_length" in self.hparams:
-            ctx_length = self.hparams["max_sequence_length"]
-        elif "max_position_embeddings" in self.hparams:
-            ctx_length = self.hparams["max_position_embeddings"]
-        elif "model_max_length" in self.hparams:
-            ctx_length = self.hparams["model_max_length"]
-        else:
-            raise ValueError("gguf: can not find ctx length parameter.")
+        super().set_gguf_parameters()

        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
-        self.gguf_writer.add_context_length(ctx_length)
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count(head_count)
-        self.gguf_writer.add_head_count_kv(head_count_kv)
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        head_count = self.hparams["num_attention_heads"]
@ -2089,34 +2127,10 @@ class XverseModel(TextModel):
        special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
-        head_count = self.hparams["num_attention_heads"]
-        head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-
-        ctx_length = 0
-        if "max_sequence_length" in self.hparams:
-            ctx_length = self.hparams["max_sequence_length"]
-        elif "max_position_embeddings" in self.hparams:
-            ctx_length = self.hparams["max_position_embeddings"]
-        elif "model_max_length" in self.hparams:
-            ctx_length = self.hparams["model_max_length"]
-        else:
-            raise ValueError("gguf: can not find ctx length parameter.")
+        super().set_gguf_parameters()

        self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
-        self.gguf_writer.add_context_length(ctx_length)
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count(head_count)
-        self.gguf_writer.add_head_count_kv(head_count_kv)
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused
@ -2377,8 +2391,13 @@ class LlamaModel(TextModel):
        # fix for SmolVLM2, missing `num_attention_heads` in config.json
        if self.hf_arch == "VLlama3ForCausalLM":
            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
+        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
+        self.origin_hf_arch = hparams.get('architectures', [None])[0]

    def set_vocab(self):
+        if self.origin_hf_arch == "GlmasrModel":
+            return self._set_vocab_glmedge()
+
        if self.is_mistral_format:
            return self._set_vocab_mistral()

@ -2430,11 +2449,6 @@ class LlamaModel(TextModel):
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
        self.gguf_writer.add_rope_dimension_count(rope_dim)

-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-
    @staticmethod
    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
        if n_head_kv is not None and n_head != n_head_kv:
@ -2454,6 +2468,7 @@ class LlamaModel(TextModel):
            "vision_language_adapter.",
            "patch_merger.",
            "pre_mm_projector_norm",
+            "audio_encoder.",
        ]

        is_multimodal_tensor = "vision_tower" in name \
@ -2518,16 +2533,16 @@ class LlamaModel(TextModel):
        return [(self.map_tensor_name(name), data_torch)]

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
-            if rope_scaling.get("rope_type", '').lower() == "llama3":
-                base = self.hparams.get("rope_theta", 10000.0)
+        if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
+            if rope_params.get("rope_type", '').lower() == "llama3":
+                base = rope_params.get("rope_theta", 10000.0)
                if (dim := self.hparams.get("head_dim")) is None:
                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

-                factor = rope_scaling.get("factor", 8.0)
-                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
-                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                factor = rope_params.get("factor", 8.0)
+                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
@ -2564,11 +2579,6 @@ class ArceeModel(LlamaModel):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self._try_set_pooling_type()
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])


@ModelBase.register("AfmoeForCausalLM")
@ -2851,17 +2861,11 @@ class Mistral3Model(LlamaModel):

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
-        rope_params = self.hparams.get("rope_parameters")
+        rope_params = self.rope_parameters
        if self.hparams.get("model_type") == "ministral3":
-            assert rope_params is not None, "ministral3 must have 'rope_parameters' config"
+            assert rope_params, "ministral3 must have 'rope_parameters' config"
            assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'"
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_params["factor"])
-            self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_params["beta_fast"])
-            self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_params["beta_slow"])
            self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"])
-            self.gguf_writer.add_rope_freq_base(rope_params["rope_theta"])
            self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
@ -2958,7 +2962,7 @@ class DeciModel(TextModel):
            assert self.block_count == len(self._num_kv_heads)
            assert self.block_count == len(self._num_heads)
            assert self.block_count == len(self._ffn_dims)
-            if (rope_theta := self.hparams.get("rope_theta")) is not None:
+            if (rope_theta := self.rope_parameters.get("rope_theta")) is not None:
                self.gguf_writer.add_rope_freq_base(rope_theta)
            self.gguf_writer.add_head_count_kv(self._num_kv_heads)
            self.gguf_writer.add_head_count(self._num_heads)
@ -2983,11 +2987,6 @@ class DeciModel(TextModel):
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
        self.gguf_writer.add_rope_dimension_count(rope_dim)

-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-
    @staticmethod
    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
        if n_head_kv is not None and n_head != n_head_kv:
@ -3016,16 +3015,16 @@ class DeciModel(TextModel):
        return [(self.map_tensor_name(name), data_torch)]

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
-            if rope_scaling.get("rope_type", '').lower() == "llama3":
-                base = self.hparams.get("rope_theta", 10000.0)
+        if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
+            if rope_params.get("rope_type", '').lower() == "llama3":
+                base = rope_params.get("rope_theta", 10000.0)
                if (dim := self.hparams.get("head_dim")) is None:
                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

-                factor = rope_scaling.get("factor", 8.0)
-                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
-                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                factor = rope_params.get("factor", 8.0)
+                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
@ -3279,10 +3278,6 @@ class MiniCPMModel(TextModel):
        logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"]
        self.gguf_writer.add_logit_scale(logit_scale)
        logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}")
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "longrope":
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE)
-            logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}")

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
@ -3402,17 +3397,6 @@ class QwenModel(TextModel):
    def set_vocab(self):
        self._set_vocab_qwen()

-    def set_gguf_parameters(self):
-        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
-        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
-        self.gguf_writer.add_file_type(self.ftype)
-

@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration")
 class Qwen2Model(TextModel):
@ -3427,11 +3411,6 @@ class Qwen2Model(TextModel):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self._try_set_pooling_type()
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        if self.hf_arch == "Qwen2Model":
@ -3499,12 +3478,6 @@ class DreamModel(TextModel):

        # Dream models use non-causal attention for diffusion
        self.gguf_writer.add_causal_attention(False)
-        # Handle RoPE scaling similar to Qwen2
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

        # Add Dream-specific parameters
        mask_token_id = self.hparams.get("mask_token_id")
@ -4048,13 +4021,6 @@ class Qwen2MoeModel(TextModel):
        if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None:
            self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size)
            logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}")
-        # YaRN is not enabled by default
-        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

    _experts: list[dict[str, Tensor]] | None = None

@ -4656,7 +4622,7 @@ class Phi3MiniModel(TextModel):
        self.gguf_writer.add_head_count_kv(n_head_kv)
        self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
        self.gguf_writer.add_rope_dimension_count(rope_dims)
-        self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
+        self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters)["rope_theta"])
        self.gguf_writer.add_file_type(self.ftype)
        sliding_window = self.hparams.get("sliding_window")
        # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models
@ -4932,7 +4898,7 @@ class Plamo2Model(TextModel):
        self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128))
        self.gguf_writer.add_block_count(self.block_count)
        self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
-        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))
+        self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("rope_theta", 10000))

        # Mamba parameters
        self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
@ -5130,21 +5096,6 @@ class InternLM2Model(TextModel):

        special_vocab.add_to_gguf(self.gguf_writer)

-    def set_gguf_parameters(self):
-        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
-        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
-        self.gguf_writer.add_file_type(self.ftype)
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        num_heads = self.hparams["num_attention_heads"]
        num_kv_heads = self.hparams["num_key_value_heads"]
@ -5221,11 +5172,6 @@ class InternLM3Model(TextModel):
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
        self.gguf_writer.add_rope_dimension_count(rope_dim)

-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        n_head = self.hparams["num_attention_heads"]
        n_kv_head = self.hparams.get("num_key_value_heads")
@ -5588,7 +5534,6 @@ class NomicBertModel(BertModel):

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
-        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
        if self.is_moe:
            self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"])
            self.gguf_writer.add_expert_count(self.hparams["num_experts"])
@ -5711,8 +5656,6 @@ class XLMRobertaModel(BertModel):
        super().set_gguf_parameters()

        # jina-embeddings-v3
-        if rotary_emb_base := self.hparams.get("rotary_emb_base"):
-            self.gguf_writer.add_rope_freq_base(rotary_emb_base)
        lora_alpha = self.hparams.get("lora_alpha")
        if lora_prompt_prefixes := self.hparams.get("task_instructions"):
            assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys())
@ -5840,19 +5783,16 @@ class Gemma3Model(TextModel):
            self._set_vocab_gpt2()

    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
        hparams = self.hparams

        # some default values are not specified in the hparams
        self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072))
-        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8))
        self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6))
        self.gguf_writer.add_key_length(hparams.get("head_dim", 256))
        self.gguf_writer.add_value_length(hparams.get("head_dim", 256))
-        self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers
+        self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters).get("rope_theta", 1_000_000.0)) # for global layers
        # attn_logit_softcapping is removed in Gemma3
        assert hparams.get("attn_logit_softcapping") is None
        if (final_logit_softcap := hparams.get("final_logit_softcapping")):
@ -5860,19 +5800,6 @@ class Gemma3Model(TextModel):
        if hparams.get("sliding_window_pattern") != 1:
            self.gguf_writer.add_sliding_window(hparams["sliding_window"])
        self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4))
-        if hparams.get("rope_scaling") is not None:
-            rope_scaling = hparams["rope_scaling"]
-            if rope_scaling["rope_type"] == "linear":
-                # important: this rope_scaling is only applied for global layers, and not used by 1B model
-                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-                self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            elif rope_scaling["rope_type"] == "yarn":
-                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-                self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-                self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
-                self.gguf_writer.add_rope_scaling_yarn_ext_factor(rope_scaling["extrapolation_factor"])
-                self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_scaling["beta_fast"])
-                self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_scaling["beta_slow"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused
@ -6776,13 +6703,6 @@ class Olmo2Model(TextModel):
    def set_gguf_parameters(self):
        super().set_gguf_parameters()

-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
-
        if "sliding_window" in self.hparams:
            self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])

@ -7281,12 +7201,11 @@ class DeepseekV2Model(TextModel):

        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
-            self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"])
+        if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None:
+            # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+            # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
+            # ref https://github.com/ggml-org/llama.cpp/pull/17945
+            self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_mscale_all)

    _experts: list[dict[str, Tensor]] | None = None

@ -7894,11 +7813,6 @@ class Glm4Model(TextModel):
        if (rope_dim := self.hparams.get("head_dim")) is None:
            rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        if name.startswith("model.visual."): # ignore visual part of Glm4v
@ -8236,50 +8150,26 @@ class ExaoneModel(TextModel):
    model_arch = gguf.MODEL_ARCH.EXAONE

    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
        hparams = self.hparams

        assert (hparams["activation_function"] == "silu")

-        max_position_embeddings = hparams["max_position_embeddings"]
-        embed_dim = hparams["hidden_size"]
-        num_heads = hparams["num_attention_heads"]
-        num_kv_heads = hparams.get("num_key_value_heads", num_heads)
-        layer_norm_eps = hparams["layer_norm_epsilon"]
-        intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim
-        # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0
-        # attention_dropout_rate = hparams["attention_dropout"]
-        # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0
-        # embed_dropout_rate = hparams["embed_dropout"]
-        self.gguf_writer.add_embedding_length(embed_dim)
-        self.gguf_writer.add_head_count(num_heads)
-        self.gguf_writer.add_head_count_kv(num_kv_heads)
-        self.gguf_writer.add_context_length(max_position_embeddings)
-        self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps)
-        self.gguf_writer.add_feed_forward_length(intermediate_size)
-        self.gguf_writer.add_block_count(self.block_count)
-        self.gguf_writer.add_file_type(self.ftype)
-
-        if (rope_theta := self.hparams.get("rope_theta")) is not None:
-            self.gguf_writer.add_rope_freq_base(rope_theta)
        rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True)
        rotary_factor = rotary_factor if rotary_factor is not None else 1.0
        self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
-            if rope_scaling.get("rope_type", '').lower() == "llama3":
-                base = self.hparams.get("rope_theta", 10000.0)
+        if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
+            if rope_params.get("rope_type", '').lower() == "llama3":
+                base = self.rope_parameters.get("rope_theta", 10000.0)
                if (dim := self.hparams.get("head_dim")) is None:
                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

-                factor = rope_scaling.get("factor", 8.0)
-                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
-                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                factor = rope_params.get("factor", 8.0)
+                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
@ -8334,22 +8224,17 @@ class Exaone4Model(TextModel):
                if len(sliding_window_pattern) == hparams["num_hidden_layers"]:
                    self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)

-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-
    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
-        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
-            if rope_scaling.get("rope_type", '').lower() == "llama3":
-                base = self.hparams.get("rope_theta", 10_000.0)
+        if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters):
+            if rope_params.get("rope_type", '').lower() == "llama3":
+                base = rope_params.get("rope_theta", 10_000.0)
                if (dim := self.hparams.get("head_dim")) is None:
                    dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

-                factor = rope_scaling.get("factor", 16.0)
-                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
-                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                factor = rope_params.get("factor", 16.0)
+                low_freq_factor = rope_params.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_params.get("high_freq_factor", 4.0)
                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)

                low_freq_wavelen = old_context_len / low_freq_factor
@ -8660,13 +8545,6 @@ class BailingMoeModel(TextModel):
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

        self.gguf_writer.add_rope_dimension_count(rope_dim)
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
-        else:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
@ -8773,13 +8651,6 @@ class BailingMoeV2Model(TextModel):
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

        self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)))
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
-        else:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
@ -8858,13 +8729,6 @@ class GroveMoeModel(TextModel):
        self.gguf_writer.add_experts_per_group(2)
        # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376
        self.gguf_writer.add_expert_group_scale(0.05)
-        # YaRN is not enabled by default
-        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

    _experts: list[dict[str, Tensor]] | None = None
    _chunk_experts: list[dict[str, Tensor]] | None = None
@ -9007,6 +8871,63 @@ class UltravoxModel(TextModel):
        raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument")


+@ModelBase.register("GlmasrModel")
+class GlmASRWhisperEncoderModel(MmprojModel):  # mmproj converter registered for "GlmasrModel": audio encoder only, language-model tensors are skipped
+    has_vision_encoder = False
+    has_audio_encoder = True
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams:  # only when BOTH generic keys are absent: fill them from the encoder-specific keys
+            self.hparams["hidden_size"] = self.hparams["d_model"]
+            self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"]
+            self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"]
+
+    def set_gguf_parameters(self):  # writes the audio-specific metadata on top of what the base class writes
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLMA)
+        self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"])
+        self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))  # falls back to 1e-5 when the config has no layer_norm_eps
+        self.gguf_writer.add_audio_stack_factor(self.global_config["merge_factor"])  # read from global_config, not from hparams
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        if ".conv" in name and ".weight" in name:  # conv weights are forced to F16; everything else defers to the base class
+            return gguf.GGMLQuantizationType.F16
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:  # returns the (mapped name, tensor) pairs to emit; [] drops the tensor
+        del bid  # unused
+
+        if name.startswith("model.") or name.startswith("lm_head."):
+            # skip language model tensors
+            return []
+
+        if name.startswith("audio_encoder.whisper."):  # whisper.* tensors are renamed under the audio_tower. prefix
+            name = name.replace("audio_encoder.whisper.","audio_tower.")
+        if "audio_encoder.layer_norm." in name or "audio_encoder.proj." in name:  # rewritten to adapting.* so the adapter branch below also handles them
+            name = name.replace("audio_encoder.", "audio_encoder.adapting.")
+
+        if name.startswith("audio_encoder.audio_bos_eos_token."):  # one tensor carries both tokens: index 0 is emitted as boi, index 1 as eoi
+            return [(self.map_tensor_name("model.vision.boi"), data_torch[0]), (self.map_tensor_name("model.vision.eoi"), data_torch[1])]
+
+        if name.startswith("audio_encoder.adapting."):  # adapter tensors are renamed to the multi-modal projector naming
+            name = name.replace("audio_encoder.adapting.","audio.multi_modal_projector.")
+            if ".layer_norm." in name:
+                name = name.replace(".layer_norm.", ".ln_pre.")
+            if ".0." in name:
+                name = name.replace(".0.", ".linear_1.")
+            if ".2." in name:
+                name = name.replace(".2.", ".linear_2.")
+            if ".proj." in name:  # proj tensors (including those rerouted above) are dropped from the output
+                return []
+
+        if "conv1.bias" in name or "conv2.bias" in name:
+            # append a trailing singleton dimension to the conv1 and conv2 biases (unsqueeze, not a transpose)
+            data_torch = data_torch.unsqueeze(-1)
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
@ModelBase.register("Qwen2AudioForConditionalGeneration")
 class WhisperEncoderModel(MmprojModel):
    has_vision_encoder = False # no vision encoder
@ -9174,7 +9095,7 @@ class FalconH1Model(Mamba2Model):
        assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}"

        # Add any other Falcon Mamba2 specific configuration
-        self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
+        self.gguf_writer.add_rope_freq_base(self.rope_parameters["rope_theta"])


@ModelBase.register("HunYuanMoEV1ForCausalLM")
@ -9252,12 +9173,11 @@ class HunYuanMoEModel(TextModel):
        self.gguf_writer.add_expert_shared_count(moe_shared_expert[0])

        # Rope
-        rope_scaling = hparams.get("rope_scaling", {})
-        if rope_scaling.get("type") == "dynamic":
+        if self.rope_parameters.get("rope_type") == "dynamic":
            # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
            # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
-            alpha = rope_scaling.get("alpha", 1000)
-            base = hparams.get("rope_theta", 10000.0)
+            alpha = self.rope_parameters.get("alpha", 1000)
+            base = self.rope_parameters.get("rope_theta", 10000.0)
            dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128
            scaled_base = base * (alpha ** (dim / (dim - 2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251
            self.gguf_writer.add_rope_freq_base(scaled_base)
@ -9452,12 +9372,11 @@ class HunYuanModel(TextModel):
        hparams = self.hparams

        # Rope
-        rope_scaling = hparams.get("rope_scaling", {})
-        if rope_scaling.get("type") == "dynamic":
+        if self.rope_parameters.get("rope_type") == "dynamic":
            # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
            # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
-            alpha = rope_scaling.get("alpha", 50)
-            base = hparams.get("rope_theta", 10000.0)
+            alpha = self.rope_parameters.get("alpha", 50)
+            base = self.rope_parameters.get("rope_theta", 10000.0)
            dim = hparams["head_dim"]
            scaled_base = base * (alpha ** (dim / (dim - 2)))
            self.gguf_writer.add_rope_freq_base(scaled_base)
@ -9608,13 +9527,6 @@ class GptOssModel(TextModel):
        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
        self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size"])

-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
-        assert rope_type == "yarn", f"GPT-OSS only supports yarn rope scaling, got {rope_type}"
-        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-        self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-        self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling.get("original_max_position_embeddings", 4096))
-

@ModelBase.register("Lfm2ForCausalLM", "LFM2ForCausalLM")
 class LFM2Model(TextModel):
@ -9787,13 +9699,6 @@ class SmallThinkerModel(TextModel):
            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
        else:
            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
-        # YaRN is not enabled by default
-        # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts
-        rope_scaling = self.hparams.get("rope_scaling") or {}
-        if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
-            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
-            self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])

        sliding_window_layout = self.hparams.get("sliding_window_layout")
        if sliding_window_layout:
@ -10041,6 +9946,10 @@ class MistralMoeModel(DeepseekV2Model):
        MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
        yarn_params = self.hparams["yarn"]
        self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"])
+
+        # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+        # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
+        # ref https://github.com/ggml-org/llama.cpp/pull/17945
        self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -131,10 +131,10 @@ int main(int argc, char ** argv) {
    llama_numa_init(params.numa);

    // load the model
-    common_init_result llama_init = common_init_from_params(params);
+    auto llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model.get();
-    llama_context * ctx = llama_init.context.get();
+    auto * model = llama_init->model();
+    auto * ctx = llama_init->context();

    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n", __func__);
--- a/examples/model-conversion/scripts/utils/init.py
+++ b/examples/model-conversion/scripts/utils/init.py
--- a/examples/model-conversion/scripts/utils/common.py
+++ b/examples/model-conversion/scripts/utils/common.py
@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+
+def get_model_name_from_env_path(env_path_name):  # returns the basename of the path held in env var `env_path_name`, minus a ".gguf" suffix; exits with status 1 on error
+    model_path = os.getenv(env_path_name)
+    if not model_path:  # an unset variable and an empty string are both treated as missing
+        print(f"Error: {env_path_name} environment variable not set")
+        sys.exit(1)
+
+    if not os.path.exists(model_path):  # os.path.exists accepts directories as well as files
+        print(f"Error: Model file not found: {model_path}")
+        sys.exit(1)
+
+    name = os.path.basename(os.path.normpath(model_path))  # normpath strips a trailing separator, so "dir/" yields "dir"
+    if name.endswith(".gguf"):
+        name = name[:-5]  # 5 == len(".gguf")
+
+    return name
--- a/ggml/include/ggml-cpu.h
+++ b/ggml/include/ggml-cpu.h
@ -99,6 +99,7 @@ extern "C" {
    GGML_BACKEND_API int ggml_cpu_has_sme        (void);
    // other
    GGML_BACKEND_API int ggml_cpu_has_riscv_v    (void);
+    GGML_BACKEND_API int ggml_cpu_get_rvv_vlen   (void);  // risc-v vector length in bytes
    GGML_BACKEND_API int ggml_cpu_has_vsx        (void);
    GGML_BACKEND_API int ggml_cpu_has_vxe        (void);
    GGML_BACKEND_API int ggml_cpu_has_wasm_simd  (void);
--- a/ggml/src/ggml-cpu/arch/arm/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp
@ -24,6 +24,7 @@

 #define UNUSED GGML_UNUSED

+#if defined(__aarch64__) && defined(__ARM_NEON) && (defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_FEATURE_DOTPROD))
 static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
                                             int16x8_t *     out_mins,
                                             int8_t *        out_scales) {
@ -46,6 +47,7 @@ static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in,
    scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4);
    memcpy(out_scales, scales_u32, 8);
 }
+#endif

 void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(QK8_0 == 32);
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@ -85,6 +85,11 @@ struct ggml_arm_arch_features_type {
 } ggml_arm_arch_features = { 0 };
 #endif

+#if defined(__riscv)
+struct ggml_riscv_arch_features_type {
+    int rvv_vlen;  // written by ggml_init_riscv_arch_features(); keeps its zero initializer when RVV intrinsics are unavailable
+} ggml_riscv_arch_features = { 0 };
+#endif

 #if defined(_WIN32)

@ -708,6 +713,15 @@ static void ggml_init_arm_arch_features(void) {}
 #endif
 #endif // __ARM_ARCH

+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+#include <riscv_vector.h>
+static void ggml_init_riscv_arch_features(void) {
+    ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb();  // record the vector length in bytes for later ggml_cpu_get_rvv_vlen() calls
+}
+#else
+static void ggml_init_riscv_arch_features(void) {}  // no RVV intrinsics: nothing to detect, rvv_vlen is left untouched
+#endif
+
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
    GGML_ASSERT(!ggml_get_no_alloc(ctx));

@ -4325,6 +4339,14 @@ int ggml_cpu_has_riscv_v(void) {
 #endif
 }

+int ggml_cpu_get_rvv_vlen(void) {
+#if defined(__riscv) && defined(__riscv_v_intrinsic)
+    return ggml_riscv_arch_features.rvv_vlen;  // value stored by ggml_init_riscv_arch_features(); still 0 if that has not run yet
+#else
+    return 0;  // built without RVV intrinsics: report no vector length
+#endif
+}
+
 int ggml_cpu_has_f16c(void) {
 #if defined(__F16C__)
    return 1;
@ -4494,6 +4516,10 @@ void ggml_cpu_init(void) {
        ggml_cl_init();
 #endif

+#if defined(__riscv)
+        ggml_init_riscv_arch_features();
+#endif
+
        is_first_call = false;
    }

--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@ -583,6 +583,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
        if (ggml_cpu_has_riscv_v()) {
            features.push_back({ "RISCV_V", "1" });
        }
+        if (ggml_cpu_get_rvv_vlen() > 0) {
+            static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen());
+            features.push_back({ "RVV_VLEN", rvv_vlen.c_str() });
+        }
        if (ggml_cpu_has_vsx()) {
            features.push_back({ "VSX", "1" });
        }
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@ -2056,7 +2056,8 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons
 #endif

    if (cur->type == GGML_TYPE_Q4_0) {
-        if ((ggml_cpu_has_avx2() && permit_repack) || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
+        if ((ggml_cpu_has_avx2() && permit_repack) || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
+            || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
            if (cur->ne[1] % 8 == 0) {
                return &q4_0_8x8_q8_0;
            }
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@ -642,8 +642,8 @@ static __global__ void flash_attn_stream_k_fixup(
    const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa;
    const int iter_j = (ne01 + (ncols1    - 1)) / ncols1;

-    const int kbc0      = (bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
-    const int kbc0_stop = (bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc0      = int64_t(bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc0_stop = int64_t(bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;

    const bool did_not_have_any_data   = kbc0 == kbc0_stop;
    const bool wrote_beginning_of_tile = kbc0 % iter_k == 0;
@ -679,7 +679,7 @@ static __global__ void flash_attn_stream_k_fixup(
    int bidx = bidx0 - 1;
    int kbc_stop = kbc0;
    while(true) {
-        const int kbc = bidx*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+        const int kbc = int64_t(bidx)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
        if (kbc == kbc_stop) { // Did not have any data.
            bidx--;
            kbc_stop = kbc;
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@ -1380,8 +1380,8 @@ static __global__ void flash_attn_ext_f16(
    const int iter_j = (ne01.z + (ncols1    - 1)) / ncols1;

    // kbc == k block continuous, current index in continuous ijk space.
-    int       kbc      = (blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
-    const int kbc_stop = (blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    int       kbc      = int64_t(blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;
+    const int kbc_stop = int64_t(blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x;

    // If the seams of 2 CUDA blocks fall within an output tile their results need to be combined.
    // For this we need to track both the block that starts the tile (needs_fixup) and the block that finishes the tile (is_fixup).
@ -1401,7 +1401,7 @@ static __global__ void flash_attn_ext_f16(
        const float2 * Q_f2   = (const float2 *) (Q + nb03*sequence + nb02* head0);
        const half2  * K_h2   = (const half2  *) (K + nb13*sequence + nb12*(head0 / gqa_ratio));
        const half   * mask_h = ncols2 == 1 && !mask ? nullptr :
-            (const half  *) (mask + nb33*(sequence % ne33));
+            (const half *) (mask + nb33*(sequence % ne33));
        float2       * dstk   = ((float2 *) dst) + (sequence*ne01.z*ne02 + head0) * (DV/2);

        const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio));
--- a/ggml/src/ggml-sycl/add-id.cpp
+++ b/ggml/src/ggml-sycl/add-id.cpp
@ -0,0 +1,77 @@
+#include <sycl/sycl.hpp>
+#include "common.hpp"
+#include "add-id.hpp"
+
+// Device kernel: for the row addressed by this work-group, writes
+//   dst_row[i0] = src0_row[i0] + src1_row[i0]   for i0 in [0, ne0)
+// where the src1 row is chosen by an int32 index read from src2.
+//
+//   src0, src1, dst : float data, addressed through the byte strides below
+//   src2            : int32 indices, element (i1, i2) at i1*sizeof(int32_t) + i2*nb21
+//   ne0, ne1        : dst extents; dst rows are taken to be densely packed
+//   nb01, nb02      : byte strides of src0 along i1 / i2
+//   nb11            : byte stride between src1 rows
+//   nb21            : byte stride of src2 along i2
+//   item_ct1        : group(2) supplies i1, group(1) supplies i2
+static void add_id_kernel(
+    const float* src0,
+    const float* src1,
+    const int32_t* src2,
+    float* dst,
+    int64_t ne0,
+    int64_t ne1,
+    size_t nb01,
+    size_t nb02,
+    size_t nb11,
+    size_t nb21,
+    sycl::nd_item<3> item_ct1) {
+  const int64_t row = item_ct1.get_group(2);
+  const int64_t plane = item_ct1.get_group(1);
+
+  // Look up which src1 row is paired with this (row, plane).
+  const char* ids_bytes = (const char*)src2;
+  const int src1_row_idx =
+      *(const int32_t*)(ids_bytes + row * sizeof(int32_t) + plane * nb21);
+
+  // dst strides are derived from its extents rather than passed in.
+  const size_t dst_row_bytes = ne0 * sizeof(float);
+  const size_t dst_plane_bytes = ne1 * dst_row_bytes;
+
+  const float* a = (const float*)((const char*)src0 + row * nb01 + plane * nb02);
+  const float* b = (const float*)((const char*)src1 + src1_row_idx * nb11);
+  float* out = (float*)((char*)dst + row * dst_row_bytes + plane * dst_plane_bytes);
+
+  // Each work-item starts at its local id and advances by the local range, so
+  // the group covers the whole row whatever its size.
+  const int64_t first = item_ct1.get_local_id(2);
+  const int64_t step = item_ct1.get_local_range(2);
+  for (int64_t col = first; col < ne0; col += step) {
+    out[col] = a[col] + b[col];
+  }
+}
+
+// Host-side launcher for add_id_kernel.
+//
+// Takes its operands from dst->src[0..2] (F32, F32, I32) and writes F32 results
+// into dst. The innermost dimension of every operand must be contiguous. One
+// work-group is launched per (i1, i2) pair with i1 < ne01 and i2 < ne02; the
+// work-items of a group share one row.
+void ggml_sycl_add_id(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
+  const ggml_tensor* src0 = dst->src[0];
+  const ggml_tensor* src1 = dst->src[1];
+  const ggml_tensor* src2 = dst->src[2];
+
+  GGML_TENSOR_TERNARY_OP_LOCALS
+
+  GGML_ASSERT(dst->type == GGML_TYPE_F32);
+  GGML_ASSERT(src0->type == GGML_TYPE_F32);
+  GGML_ASSERT(src1->type == GGML_TYPE_F32);
+  GGML_ASSERT(src2->type == GGML_TYPE_I32);
+
+  GGML_ASSERT(nb00 == sizeof(float));
+  GGML_ASSERT(nb10 == sizeof(float));
+  GGML_ASSERT(nb20 == sizeof(int32_t));
+
+  const float* src0_d = (const float*)src0->data;
+  const float* src1_d = (const float*)src1->data;
+  const int32_t* src2_d = (const int32_t*)src2->data;
+  float* dst_d = (float*)dst->data;
+
+  auto stream = ctx.stream();
+
+  // Work-group size: at most 768, at most one work-item per column, and never
+  // more than the device supports (a local range above the device limit makes
+  // the launch fail). The clamp happens in 64 bits before narrowing so a very
+  // large ne00 cannot be mangled by the cast. The kernel strides by the local
+  // range, so any size >= 1 still covers the full row.
+  const int64_t max_wg =
+      (int64_t)stream->get_device()
+          .get_info<sycl::info::device::max_work_group_size>();
+  const int threads =
+      (int)std::min<int64_t>(std::min<int64_t>(ne00, 768), max_wg);  // cols
+  stream->parallel_for(
+      sycl::nd_range<3>(
+          sycl::range<3>(1, ne02, ne01) * sycl::range<3>(1, 1, threads),
+          sycl::range<3>(1, 1, threads)),
+      [=](sycl::nd_item<3> item_ct1) {
+        add_id_kernel(
+            src0_d,
+            src1_d,
+            src2_d,
+            dst_d,
+            ne0,
+            ne1,
+            nb01,
+            nb02,
+            nb11,
+            nb21,
+            item_ct1);
+      });
+}
--- a/ggml/src/ggml-sycl/add-id.hpp
+++ b/ggml/src/ggml-sycl/add-id.hpp
@ -0,0 +1,8 @@
+#ifndef GGML_SYCL_ADD_ID_HPP
+#define GGML_SYCL_ADD_ID_HPP
+
+#include "common.hpp"
+
+// SYCL implementation of the add-id op for `dst`.
+// Operands are read from dst->src[0], dst->src[1] (both F32) and dst->src[2]
+// (I32); the result is F32. For every row of src0, the I32 value at the same
+// (i1, i2) position in src2 selects the row of src1 that is added to it.
+// The work is submitted on ctx.stream().
+void ggml_sycl_add_id(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif // GGML_SYCL_ADD_ID_HPP
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@ -675,6 +675,7 @@ struct vk_device_struct {
    vk_pipeline pipeline_cos_f32;
    vk_pipeline pipeline_log[2];
    vk_pipeline pipeline_tri[2];
+    vk_pipeline pipeline_diag[2];
    vk_pipeline pipeline_clamp_f32;
    vk_pipeline pipeline_pad_f32;
    vk_pipeline pipeline_roll_f32;
@ -738,6 +739,11 @@ struct vk_device_struct {
    vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
    vk_pipeline pipeline_soft_max_f32_wg512, pipeline_soft_max_f32_f16_wg512;
    vk_pipeline pipeline_soft_max_back_f32;
+
+    vk_pipeline pipeline_soft_max_large1_f32, pipeline_soft_max_large1_f32_f16;
+    vk_pipeline pipeline_soft_max_large2_f32, pipeline_soft_max_large2_f32_f16;
+    vk_pipeline pipeline_soft_max_large3_f32, pipeline_soft_max_large3_f32_f16;
+
    vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16, pipeline_rope_norm_f32_f16;
    vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16, pipeline_rope_neox_f32_f16;
    vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16;
@ -773,7 +779,8 @@ struct vk_device_struct {

    vk_pipeline pipeline_flash_attn_split_k_reduce;

-    vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT];
+    // [2] is for whether to take n_experts from spec constant (0) or push constant (1)
+    vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT][2];

    std::vector<vk_pipeline_ref> all_pipelines;

@ -1165,6 +1172,7 @@ static_assert(sizeof(vk_op_multi_add_push_constants) <= 256);

 struct vk_op_topk_moe_push_constants {
    uint32_t n_rows;
+    uint32_t n_experts_push;
    uint32_t n_expert_used;
    float clamp_min;
    float clamp_max;
@ -3746,6 +3754,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_XS],  "get_rows_iq4_xs",  get_rows_iq4_xs_len,  get_rows_iq4_xs_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL],  "get_rows_iq4_nl",  get_rows_iq4_nl_len,  get_rows_iq4_nl_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_MXFP4],   "get_rows_mxfp4",   get_rows_mxfp4_len,   get_rows_mxfp4_data,   "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_I32],     "get_rows_i32",     get_rows_i32_len,     get_rows_i32_data,     "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32",  get_rows_f32_f32_len,  get_rows_f32_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32",  get_rows_f16_f32_len,  get_rows_f16_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
@ -3933,6 +3942,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_tri[0], "tri_f32", tri_f32_len, tri_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_tri[1], "tri_f16", tri_f16_len, tri_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

+    ggml_vk_create_pipeline(device, device->pipeline_diag[0], "diag_f32", diag_f32_len, diag_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_diag[1], "diag_f16", diag_f16_len, diag_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
+
    ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

    ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_pad_push_constants), {512, 1, 1}, {}, 1);
@ -4012,6 +4024,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1);
    ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1, true);

+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32,     "soft_max_large1_f32",     soft_max_large1_f32_len,     soft_max_large1_f32_data,     "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32,     "soft_max_large2_f32",     soft_max_large2_f32_len,     soft_max_large2_f32_data,     "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32,     "soft_max_large3_f32",     soft_max_large3_f32_len,     soft_max_large3_f32_data,     "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32_f16, "soft_max_large1_f32_f16", soft_max_large1_f32_f16_len, soft_max_large1_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32_f16, "soft_max_large2_f32_f16", soft_max_large2_f32_f16_len, soft_max_large2_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
+    ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32_f16, "soft_max_large3_f32_f16", soft_max_large3_f32_f16_len, soft_max_large3_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true);
+
    ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32, "rope_multi_f32", rope_multi_f32_len, rope_multi_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
@ -4220,10 +4239,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f16_f32, "conv2d_dw_whcn_f16_f32", conv2d_dw_whcn_f16_f32_len, conv2d_dw_whcn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);

-    for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
-        ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX],      "topk_moe_f32_early_softmax_"+std::to_string(i),       topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0}, 1, true, true, device->subgroup_size);
-        ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM], "topk_moe_f32_early_softmax_norm"+std::to_string(i),   topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0}, 1, true, true, device->subgroup_size);
-        ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX],       "topk_moe_f32_late_softmax"+std::to_string(i),         topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1}, 1, true, true, device->subgroup_size);
+    for (uint32_t use_push = 0; use_push < 2; ++use_push) {
+        for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
+            ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX][use_push],      "topk_moe_f32_early_softmax_"+std::to_string(i),       topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0, use_push}, 1, true, true, device->subgroup_size);
+            ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM][use_push], "topk_moe_f32_early_softmax_norm"+std::to_string(i),   topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0, use_push}, 1, true, true, device->subgroup_size);
+            ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX][use_push],       "topk_moe_f32_late_softmax"+std::to_string(i),         topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1, use_push}, 1, true, true, device->subgroup_size);
+        }
    }

    for (auto &c : compiles) {
@ -8304,6 +8325,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
    switch (op) {
    case GGML_OP_GET_ROWS:
        GGML_ASSERT(src1->type == GGML_TYPE_I32);
+        if (src0->type == GGML_TYPE_I32) {
+            // i32 src only supports i32 result
+            GGML_ASSERT(dst->type == GGML_TYPE_I32);
+            return ctx->device->pipeline_get_rows[src0->type];
+        }
        if (dst->type == GGML_TYPE_F16) {
            return ctx->device->pipeline_get_rows[src0->type];
        }
@ -8430,6 +8456,12 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
            return ctx->device->pipeline_tri[dst->type == GGML_TYPE_F16];
        }
        return nullptr;
+    case GGML_OP_DIAG:
+        if (src0->type == dst->type &&
+            (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) {
+            return ctx->device->pipeline_diag[dst->type == GGML_TYPE_F16];
+        }
+        return nullptr;
    case GGML_OP_CLAMP:
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_clamp_f32;
@ -8584,7 +8616,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
            uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0])));
            GGML_ASSERT(idx < num_topk_moe_pipelines);
            topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops);
-            return ctx->device->pipeline_topk_moe[idx][mode];
+            // use n_experts from push constant if it's not equal to the power of two spec constant
+            bool use_push = dst->ne[0] != (1u << idx);
+            return ctx->device->pipeline_topk_moe[idx][mode][use_push];
        }

        if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) {
@ -9121,6 +9155,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
    case GGML_OP_COS:
    case GGML_OP_LOG:
    case GGML_OP_TRI:
+    case GGML_OP_DIAG:
    case GGML_OP_CLAMP:
    case GGML_OP_PAD:
    case GGML_OP_ROLL:
@ -9808,6 +9843,12 @@ static void ggml_vk_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const
    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TRI, std::move(p));
 }

+// Issues GGML_OP_DIAG for `dst` through the generic ggml_vk_op_f32 path with
+// src0 as the only input. Unlike ggml_vk_clamp, the push constants are
+// initialised with an explicit element count, ggml_nelements(dst)
+// (presumably what the shader sees as p.ne - confirm against
+// vk_op_unary_push_constants_init).
+static void ggml_vk_diag(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+    vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
+
+    ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG, std::move(p));
+}
+
 static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
    vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
    p.param1 = ggml_get_op_params_f32(dst, 0);
@ -10141,7 +10182,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

-    ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, {
+    vk_op_soft_max_push_constants pc {
        ncols,
        src1 != nullptr ? nrows_y : (uint32_t)0,
        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],
@ -10152,7 +10193,55 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx,
        n_head_log2,
        nrows_x,
        src2 != nullptr
-    });
+    };
+
+    if (ncols <= 16384) {
+        ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, std::move(pc));
+    } else {
+
+        vk_subbuffer buf_a = ggml_vk_tensor_subbuffer(ctx, src0);
+        vk_subbuffer buf_b = src1 ? ggml_vk_tensor_subbuffer(ctx, src1) : buf_a;
+        vk_subbuffer buf_c = src2 ? ggml_vk_tensor_subbuffer(ctx, src2) : buf_a;
+        vk_subbuffer buf_d = ggml_vk_tensor_subbuffer(ctx, dst);
+
+        uint32_t elems_per_wg = 128 * 4;
+        uint32_t num_wgs = CEIL_DIV(ncols, elems_per_wg);
+        size_t tmp_size = num_wgs * nrows_x * sizeof(float);
+
+        if (ctx->prealloc_size_x < tmp_size) {
+            ctx->prealloc_size_x = tmp_size;
+            ggml_vk_preallocate_buffers(ctx, subctx);
+        }
+        if (ctx->prealloc_size_y < tmp_size) {
+            ctx->prealloc_size_y = tmp_size;
+            ggml_vk_preallocate_buffers(ctx, subctx);
+        }
+        if (ctx->prealloc_x_need_sync || ctx->prealloc_y_need_sync) {
+            ggml_vk_sync_buffers(ctx, subctx);
+        }
+
+        vk_subbuffer buf_x = { ctx->prealloc_x, 0, tmp_size };
+        vk_subbuffer buf_y = { ctx->prealloc_y, 0, tmp_size };
+
+        std::array<uint32_t, 3> elements = { num_wgs, nrows_x, 1 };
+
+        vk_pipeline pipeline1 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large1_f32_f16 : ctx->device->pipeline_soft_max_large1_f32;
+        vk_pipeline pipeline2 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large2_f32_f16 : ctx->device->pipeline_soft_max_large2_f32;
+        vk_pipeline pipeline3 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large3_f32_f16 : ctx->device->pipeline_soft_max_large3_f32;
+
+        ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1);
+        ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1);
+        ggml_pipeline_request_descriptor_sets(ctx, pipeline3, 1);
+
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
+        ggml_vk_sync_buffers(ctx, subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
+        ggml_vk_sync_buffers(ctx, subctx);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline3, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements);
+
+        ctx->prealloc_x_need_sync = true;
+        ctx->prealloc_y_need_sync = true;
+    }
 }

 static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@ -10188,6 +10277,7 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx,

    vk_op_topk_moe_push_constants pc {};
    pc.n_rows = n_rows;
+    pc.n_experts_push = n_experts;
    pc.n_expert_used = n_expert_used;
    if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) {
        ggml_tensor * clamp = cgraph->nodes[node_idx + 7];
@ -11887,6 +11977,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
    case GGML_OP_TRI:
        ggml_vk_tri(ctx, compute_ctx, src0, node);

+        break;
+    case GGML_OP_DIAG:
+        ggml_vk_diag(ctx, compute_ctx, src0, node);
+
        break;
    case GGML_OP_CLAMP:
        ggml_vk_clamp(ctx, compute_ctx, src0, node);
@ -12862,8 +12956,7 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc
    }

    const int n_expert = softmax->ne[0];
-    // n_expert must be a power of 2
-    if (!is_pow2(n_expert) || n_expert > (1 << (num_topk_moe_pipelines-1))) {
+    if (n_expert > (1 << (num_topk_moe_pipelines-1))) {
        return false;
    }

@ -13907,6 +14000,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                    case GGML_TYPE_IQ4_XS:
                    case GGML_TYPE_IQ4_NL:
                    case GGML_TYPE_MXFP4:
+                    case GGML_TYPE_I32:
                        return true;
                    default:
                        return false;
@ -14031,6 +14125,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
            return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_LOG:
        case GGML_OP_TRI:
+        case GGML_OP_DIAG:
            return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
                   op->type == op->src[0]->type;
        case GGML_OP_ARGSORT:
@ -14621,6 +14716,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
            tensor_clone = ggml_log(ggml_ctx, src_clone[0]);
        } else if (tensor->op == GGML_OP_TRI) {
            tensor_clone = ggml_tri(ggml_ctx, src_clone[0], ggml_get_op_params_i32(tensor, 0));
+        } else if (tensor->op == GGML_OP_DIAG) {
+            tensor_clone = ggml_diag(ggml_ctx, src_clone[0]);
        } else if (tensor->op == GGML_OP_CLAMP) {
            const float * params = (const float *)tensor->op_params;
            tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]);
--- a/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp
@ -0,0 +1,29 @@
+#version 450
+
+#include "rte.glsl"
+#include "types.glsl"
+#include "generic_unary_head.glsl"
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+// One invocation per destination element: elements whose two innermost
+// coordinates are equal (i10 == i11) receive the source value at column i10 of
+// source row 0; every other element is written as zero.
+void main() {
+    const uint flat = get_idx();
+    if (flat >= p.ne) {
+        return;
+    }
+
+    // Split the flat destination index into (i13, i12, i11, i10), outermost
+    // first, subtracting each coordinate's contribution as it is found.
+    uint rem = flat;
+    const uint i13 = fastdiv(rem, p.ne1_012mp, p.ne1_012L);
+    rem -= i13 * p.ne12*p.ne11*p.ne10;
+    const uint i12 = fastdiv(rem, p.ne1_01mp, p.ne1_01L);
+    rem -= i12*p.ne11*p.ne10;
+    const uint i11 = fastdiv(rem, p.ne1_0mp, p.ne1_0L);
+    const uint i10 = rem - i11*p.ne10;
+
+    // Only touch the source on the diagonal; the nb01 term is omitted because
+    // the source row index is always 0.
+    float val = 0.0;
+    if (i10 == i11) {
+        val = float(data_a[get_aoffset() + i13*p.nb03 + i12*p.nb02 + i10*p.nb00]);
+    }
+
+    data_d[get_doffset() + dst_idx(flat)] = D_TYPE(val);
+}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp
@ -256,6 +256,9 @@ void main() {
        barrier();
    }

+    // prevent race on tmpsh
+    barrier();
+
    // reduce across threads

    [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp
@ -302,6 +302,9 @@ void main() {
        barrier();
    }

+    // prevent race on tmpsh
+    barrier();
+
    // reduce across threads

    float rowmaxf[rows_per_thread], eMf[rows_per_thread], Moldf[rows_per_thread];
--- a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp
@ -26,9 +26,9 @@ void main() {
            const uint d_offset = get_doffset() + i10*p.nb21 + i11*p.nb22 + i12*p.nb23;

 #if defined(DATA_A_BF16)
-            FLOAT_TYPE v = FLOAT_TYPE(bf16_to_fp32(data_a[a_offset + i00]));
+            TEMP_TYPE v = TEMP_TYPE(bf16_to_fp32(data_a[a_offset + i00]));
 #else
-            FLOAT_TYPE v = FLOAT_TYPE(data_a[a_offset + i00]);
+            TEMP_TYPE v = TEMP_TYPE(data_a[a_offset + i00]);
 #endif
 #ifndef OPTIMIZATION_ERROR_WORKAROUND
            data_d[d_offset + i00] = D_TYPE(v);
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp
@ -7,34 +7,50 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

 FLOAT_TYPE temp[NUM_COLS][NUM_ROWS];

-void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
-    const uint y_idx = i * QUANT_K + 32 * ib32;
-
-    uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
-    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
-        const float d = float(data_a[ibi].d);
-        const uint qh = data_a[ibi].qh[ib32];
-        const float dl = d * float(2 * bitfieldExtract(qh, 12, 3) + 1);
-        const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA;
-
+// Adds to temp[j][n], for every column j < NUM_COLS and row n < num_rows, the
+// contribution of the 32 values starting at i * QUANT_K + 32 * ib32.
+// The B operand is loaded once per (j, l) as two vec4 (8 values) and reused for
+// all rows; data_a is re-read per row inside the innermost loop.
+void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i,
+                     const uint num_blocks_per_row, const uint first_row, const uint num_rows) {
+    const uint y_idx_base = i * QUANT_K + 32 * ib32;
+    [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
+        // data_b_v4 is indexed in units of 4 elements, hence the division.
+        const uint base_b_idx = (j * p.batch_stride_b + b_offset + y_idx_base) / 4;
        [[unroll]] for (uint l = 0; l < 4; ++l) {
-            const uint qs = data_a[ibi].qs[4 * ib32 + l];
-            const uint idxhi = bitfieldExtract(qh, 3 * int(l), 3);
-            const int16_t grid = int16_t(iq1s_grid[qs | (idxhi << 8)]);
+            const vec4 b_val_0 = vec4(data_b_v4[base_b_idx + 2 * l]);
+            const vec4 b_val_1 = vec4(data_b_v4[base_b_idx + 2 * l + 1]);

-            [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) {
-                vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]);
-                vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]);
+            // Block index into data_a for the first row; restarted for every
+            // (j, l) pair and advanced by one row's worth of blocks per n.
+            uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i;
+
+            [[unroll]] for (uint n = 0; n < num_rows; ++n) {
+                const float d = float(data_a[ibi].d);
+                const uint qh = data_a[ibi].qh[ib32];
+
+                // Bits 12..14 of qh scale d; bit 15 picks the sign of the delta.
+                const float dl = d * float(2 * bitfieldExtract(qh, 12, 3) + 1);
+                const uint qs = data_a[ibi].qs[4 * ib32 + l];
+                const uint idxhi = bitfieldExtract(qh, 3 * int(l), 3);
+                // NOTE(review): the replaced code held this as int16_t. Confirm that
+                // bitfieldExtract on a uint16_t still selects the signed overload,
+                // otherwise the 2-bit fields below stop sign-extending.
+                const uint16_t grid = uint16_t(iq1s_grid[qs | (idxhi << 8)]);
+
+                const float delta_val = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA;
+                const vec4 delta_v = vec4(delta_val);
+                // Eight 2-bit fields of grid: the low four pair with b_val_0,
+                // the high four with b_val_1.
+                const vec4 fbits0 = vec4(
+                    float(bitfieldExtract(grid, 0, 2)),
+                    float(bitfieldExtract(grid, 2, 2)),
+                    float(bitfieldExtract(grid, 4, 2)),
+                    float(bitfieldExtract(grid, 6, 2))
+                );
+                const vec4 fbits1 = vec4(
+                    float(bitfieldExtract(grid, 8, 2)),
+                    float(bitfieldExtract(grid, 10, 2)),
+                    float(bitfieldExtract(grid, 12, 2)),
+                    float(bitfieldExtract(grid, 14, 2))
+                );
+
+                // Per-lane products, then a dot with ones to sum the four lanes.
+                vec4 sum_v = fma(b_val_0, fbits0 + delta_v, vec4(0.0));
+                sum_v      = fma(b_val_1, fbits1 + delta_v, sum_v);
+                FLOAT_TYPE sum = dot(sum_v, vec4(1.0));

-                FLOAT_TYPE sum = FLOAT_TYPE(0.0);
-                [[unroll]] for (int k = 0; k < 4; ++k) {
-                    sum = fma(FLOAT_TYPE(b0[k]), bitfieldExtract(grid, 2 * k, 2) + delta,
-                          fma(FLOAT_TYPE(b4[k]), bitfieldExtract(grid, 8 + 2 * k, 2) + delta, sum));
-                }
                temp[j][n] = fma(dl, sum, temp[j][n]);
+                ibi += num_blocks_per_row;
            }
        }
-        ibi += num_blocks_per_row;
    }
 }

--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl
@ -244,17 +244,20 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
            const uint iqs = idx % 128;                 // 0..127

            const uint n = iqs / 64;                    // 0,1
-            const uint b = (iqs % 64) / 32;             // 0,1
+            const uint b = ((iqs % 64) / 32) * 4;       // 0,4
            const uint is_b = (iqs % 16) / 8;           // 0,1
            const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6
            const uint is = 8 * n + qhshift + is_b;     // 0..15
-            const uint qsi = n * 64 + (iqs % 32) * 2;   // 0,2,4..126
-            const uint qhi = n * 32 + (iqs % 16) * 2;   // 0,2,4..62
+            const uint qsi = n * 32 + (iqs % 32);       // 0..63
+            const uint qhi = n * 16 + (iqs % 16);       // 0..31

            const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]);

-            buf_a[buf_idx] = FLOAT_TYPE_VEC2(dscale * float(int8_t(((data_a[ib].ql[qsi    ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi    ] >> qhshift) & 3) << 4)) - 32),
-                                             dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
+            const uint ql = (uint(data_a_packed16[ib].ql[qsi]) >> b) & 0x0F0F;
+            const uint qh = (uint(data_a_packed16[ib].qh[qhi]) >> qhshift) & 0x0303;
+            const vec2 q = (vec2(unpack8(ql | (qh << 4)).xy) - 32) * dscale;
+
+            buf_a[buf_idx] = FLOAT_TYPE_VEC2(q.x, q.y);
 #elif defined(DATA_A_IQ1_S)
            const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
            const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
--- a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp
@ -0,0 +1,62 @@
+#version 450
+
+#include "soft_max_large_common.glsl"
+
+// Pass 1 of the multi-pass softmax: each work-group takes the maximum of
+// a*scale + slope*b over its own slice of BLOCK_SIZE * num_iters columns of row
+// `rowx` and stores that partial maximum in data_m, one slot per work-group
+// per row.
+void main() {
+    const uint tid = gl_LocalInvocationID.x;
+    const uint rowx = gl_WorkGroupID.y;
+    const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters;
+
+    // Decompose the row index into (i03, i02, i01).
+    const uint32_t i03 = rowx / (p.ne01 * p.ne02);
+    const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01;
+    const uint32_t i01 = rowx % p.ne01;
+
+    // Start of the matching data_b row; the i02/i03 coordinates wrap modulo
+    // ne12/ne13. Left at 0 when KY is 0.
+    uint rowy_start = 0;
+    if (p.KY > 0) {
+        rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13;
+    }
+
+    // rowx comes from the work-group id, so the whole group exits together
+    // and the barriers below stay uniform.
+    if (rowx >= p.nrows_x) {
+        return;
+    }
+
+    const float slope = get_slope(rowx);
+
+    // Seed with -infinity (bit pattern 0xFF800000), or with data_c[i02] when
+    // has_sinks is set.
+    FLOAT_TYPE local_max = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02];
+
+    [[unroll]] for (uint it = 0; it < num_iters; ++it) {
+        const uint col = wg_start + it * BLOCK_SIZE + tid;
+
+        // Columns past KX contribute nothing and are never loaded.
+        if (col < p.KX) {
+            const FLOAT_TYPE a = data_a[rowx * p.KX + col];
+
+            FLOAT_TYPE b = FLOAT_TYPE(0);
+            if (p.KY > 0) {
+                b = data_b[rowy_start + col];
+            }
+
+            const FLOAT_TYPE v = a * p.scale + slope * b;
+            local_max = max(local_max, v);
+        }
+    }
+
+    // Halving reduction over vals[]; a barrier separates every step.
+    vals[tid] = local_max;
+    barrier();
+    [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            vals[tid] = max(vals[tid], vals[tid + s]);
+        }
+        barrier();
+    }
+
+    // Invocation 0 publishes this work-group's partial maximum.
+    if (tid == 0) {
+        const FLOAT_TYPE wg_max = vals[0];
+        data_m[rowx * gl_NumWorkGroups.x + gl_WorkGroupID.x] = wg_max;
+    }
+}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp
@ -0,0 +1,79 @@
+#version 450
+
+#include "soft_max_large_common.glsl"
+
+void main() { // reduces the per-workgroup maxima in data_m to the row maximum, stores exp(x - max) in data_d and this slice's partial sum in data_s
+    const uint tid = gl_LocalInvocationID.x;
+    const uint rowx = gl_WorkGroupID.y; // row handled by this workgroup
+    const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters; // first column of this workgroup's slice
+
+    const uint32_t i03 = rowx / (p.ne01 * p.ne02); // decompose the flat row index into (i01, i02, i03)
+    const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01;
+    const uint32_t i01 = rowx % p.ne01;
+
+    uint rowy_start = 0; // offset of the matching row in data_b; stays 0 when p.KY == 0
+    if (p.KY > 0) {
+        rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13;
+    }
+
+    if (rowx >= p.nrows_x) {
+        return;
+    }
+
+    float slope = get_slope(rowx);
+
+    // Find max: each thread folds a BLOCK_SIZE-strided subset of this row's entries in data_m
+    FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02];
+
+    [[unroll]] for (uint i = 0; i < gl_NumWorkGroups.x; i += BLOCK_SIZE) {
+        if (i + tid < gl_NumWorkGroups.x) {
+            max_val = max(max_val, data_m[rowx * gl_NumWorkGroups.x + i + tid]);
+        }
+    }
+
+    // reduce across the workgroup
+    vals[tid] = max_val;
+    barrier();
+    [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            vals[tid] = max(vals[tid], vals[tid + s]); // fold into the running partial in vals[tid]; using the thread's initial max_val here would discard earlier steps
+        }
+        barrier();
+    }
+
+    max_val = vals[0];
+    barrier(); // every thread must have read vals[0] before vals is reused for the sum below
+
+    FLOAT_TYPE sum = FLOAT_TYPE(0.0f);
+
+    // Compute sum{exp(x - max)}
+    [[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
+        const uint col = col0 + tid;
+
+        if (col >= p.KX) {
+            break;
+        }
+
+        // compute exp(a*scale+b*slope), add it to sum
+        const uint i = rowx * p.KX + col;
+        FLOAT_TYPE val;
+        val = exp(FLOAT_TYPE(data_a[i]) * p.scale + (p.KY > 0 ? slope * FLOAT_TYPE(data_b[rowy_start + col]) : FLOAT_TYPE(0.0f)) - max_val);
+        sum += val;
+        data_d[i] = D_TYPE(val); // unnormalized value; soft_max_large3.comp scales it in place
+    }
+
+    // reduce across the workgroup
+    vals[tid] = sum;
+    barrier();
+    [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            vals[tid] += vals[tid + s];
+        }
+        barrier();
+    }
+
+    if (tid == 0) { // a single thread publishes the partial sum of this slice
+        sum = vals[0];
+        data_s[rowx * gl_NumWorkGroups.x + gl_WorkGroupID.x] = sum;
+    }
+}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp
@ -0,0 +1,65 @
+#version 450
+
+#include "soft_max_large_common.glsl"
+
+shared FLOAT_TYPE sumsh[BLOCK_SIZE];
+
+void main() { // combines the partial sums in data_s (plus the sink term, if any) and scales this slice of data_d by 1/sum
+    const uint tid = gl_LocalInvocationID.x;
+    const uint rowx = gl_WorkGroupID.y; // row handled by this workgroup
+    const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters; // first column of this workgroup's slice
+
+    const uint32_t i03 = rowx / (p.ne01 * p.ne02); // decompose the flat row index into (i01, i02, i03)
+    const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01;
+    const uint32_t i01 = rowx % p.ne01;
+
+    uint rowy_start = 0;
+    if (p.KY > 0) {
+        rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13;
+    }
+
+    if (rowx >= p.nrows_x) {
+        return;
+    }
+
+    FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02]; // same seed as soft_max_large2.comp so both derive the same maximum
+    FLOAT_TYPE sum = FLOAT_TYPE(0.0f);
+
+    [[unroll]] for (uint i = 0; i < gl_NumWorkGroups.x; i += BLOCK_SIZE) {
+        if (i + tid < gl_NumWorkGroups.x) {
+            max_val = max(max_val, data_m[rowx * gl_NumWorkGroups.x + i + tid]);
+            sum += data_s[rowx * gl_NumWorkGroups.x + i + tid];
+        }
+    }
+
+    // reduce across the workgroup (maximum in vals, sum in sumsh)
+    vals[tid] = max_val;
+    sumsh[tid] = sum;
+    barrier();
+    [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            vals[tid] = max(vals[tid], vals[tid + s]); // fold into the running partial in vals[tid]; using the thread's initial max_val here would discard earlier steps
+            sumsh[tid] += sumsh[tid + s];
+        }
+        barrier();
+    }
+
+    max_val = vals[0];
+    sum = sumsh[0];
+
+    if (p.has_sinks != 0) { // the sink contributes to the denominator only; max_val must match the one used for exp() in soft_max_large2.comp
+        sum += FLOAT_TYPE(exp(FLOAT_TYPE(data_c[i02]) - max_val));
+    }
+
+    FLOAT_TYPE rcpdivisor = 1.0/sum;
+
+    [[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) {
+        const uint col = col0 + tid;
+
+        if (col >= p.KX) {
+            continue;
+        }
+
+        data_d[rowx*p.KX + col] *= D_TYPE(rcpdivisor); // normalize the value stored by soft_max_large2.comp
+    }
+}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl
@ -0,0 +1,53 @@
+#extension GL_EXT_control_flow_attributes : enable
+
+layout (push_constant) uniform parameter
+{
+    uint KX; // number of columns per row; data_a/data_d are indexed as rowx * KX + col
+    uint KY; // data_b is only read when KY > 0
+    uint ne00; // not referenced by get_slope or the soft_max_large*.comp shaders
+    uint ne01; // ne01/ne02 split the flat row index into (i01, i02, i03)
+    uint ne02;
+    uint ne12; // i02 and i03 are wrapped modulo ne12/ne13 when addressing data_b
+    uint ne13;
+    uint nb11; // multipliers for i01/i02/i03 in the data_b row offset
+    uint nb12;
+    uint nb13;
+    float scale; // multiplies the data_a value
+    float max_bias; // ALiBi slope is computed only when > 0
+    float m0; // slope base for heads below n_head_log2
+    float m1; // slope base for the remaining heads
+    uint n_head_log2;
+    uint nrows_x; // workgroups with rowx >= nrows_x exit immediately
+    uint has_sinks; // non-zero: data_c[i02] seeds the maximum and adds a term to the sum
+} p;
+
+#include "types.glsl"
+
+layout(constant_id = 0) const uint BLOCK_SIZE = 128; // workgroup size in x and length of the shared reduction array
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+layout(constant_id = 1) const uint num_iters = 4; // BLOCK_SIZE-wide column chunks processed per workgroup
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; // input values
+layout (binding = 1) readonly buffer Y {B_TYPE data_b[];}; // optional addend, multiplied by the slope (read when KY > 0)
+layout (binding = 2) readonly buffer Z {float data_c[];}; // per-i02 sink values (read when has_sinks != 0)
+layout (binding = 3) buffer D {D_TYPE data_d[];}; // output; written by soft_max_large2.comp, scaled in place by soft_max_large3.comp
+layout (binding = 4) buffer M {float data_m[];}; // per-(row, workgroup) maxima, written by soft_max_large1.comp
+layout (binding = 5) buffer S {float data_s[];}; // per-(row, workgroup) partial sums, written by soft_max_large2.comp
+
+shared FLOAT_TYPE vals[BLOCK_SIZE]; // scratch for the workgroup reductions
+
+// Slope applied to the data_b term of row rowx; 1.0 unless ALiBi is enabled (p.max_bias > 0).
+float get_slope(uint rowx) {
+    if (!(p.max_bias > 0.0f)) {
+        return 1.0f;
+    }
+
+    const uint head = (rowx / p.ne01) % p.ne02; // head index
+    const bool low_half = head < p.n_head_log2;
+
+    // heads below n_head_log2 use m0^(head + 1), the others m1^(2*(head - n_head_log2) + 1)
+    const float slope_base  = low_half ? p.m0 : p.m1;
+    const uint  slope_power = low_half ? head + 1 : 2*(head - p.n_head_log2) + 1;
+
+    return pow(slope_base, slope_power);
+}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp
@ -10,6 +10,7 @@
 layout (push_constant) uniform parameter
 {
    uint n_rows;
+    uint n_experts_push;
    uint n_expert_used;
    float clamp_min;
    float clamp_max;
@ -18,11 +19,16 @@ layout (push_constant) uniform parameter
 layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in;

 layout(constant_id = 0) const uint WARP_SIZE = 32;
-layout(constant_id = 1) const uint n_experts = 512;
+layout(constant_id = 1) const uint n_experts_spec = 512;
 layout(constant_id = 2) const bool with_norm = true;
 layout(constant_id = 3) const bool late_softmax = false;
+layout(constant_id = 4) const bool nexperts_use_push = false;

-const uint experts_per_thread = (n_experts > WARP_SIZE) ? n_experts / WARP_SIZE : 1;
+uint n_experts = nexperts_use_push ? n_experts_push : n_experts_spec;
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+
+const uint experts_per_thread = CEIL_DIV(n_experts_spec, WARP_SIZE);

 layout (binding = 0, std430) readonly buffer Logits {float logits[];};
 layout (binding = 1, std430) writeonly buffer Weights {float weights[];};
@ -94,7 +100,7 @@ void main() {
    }

    if (!late_softmax) {
-        softmax_warp_inplace(wt, n_experts, lane, false);
+        softmax_warp_inplace(wt, n_experts, lane, nexperts_use_push);
    }

    // at this point, each thread holds a portion of softmax,
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@ -721,13 +721,15 @@ void process_shaders() {
        shader = (tname == "f32" || tname == "f16" || tname == "bf16") ? "get_rows.comp" : "get_rows_quant.comp";

        if (tname == "f16") {
-            string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}));
+            string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}));
        } else {
-            string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}}));
+            string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}}));
        }
-        string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}}));
+        string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}}));
    }

+    string_to_spv("get_rows_i32", "get_rows.comp", {{"TEMP_TYPE", "uint"}, {"A_TYPE", "uint"}, {"B_TYPE", "int"}, {"D_TYPE", "uint"}});
+
    string_to_spv("mul_mat_vec_p021_f16_f32_subgroup_add", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}});
    string_to_spv("mul_mat_vec_p021_f16_f32",              "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}});
    string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}});
@ -871,6 +873,8 @@ void process_shaders() {

    string_to_spv("tri_f16",        "tri.comp",         {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
    string_to_spv("tri_f32",        "tri.comp",         {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
+    string_to_spv("diag_f16",       "diag.comp",        {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
+    string_to_spv("diag_f32",       "diag.comp",        {{"A_TYPE", "float"},       {"D_TYPE", "float"}});

    string_to_spv("softplus_f16",   "softplus.comp",    {{"A_TYPE", "float16_t"},   {"D_TYPE", "float16_t"}});
    string_to_spv("softplus_f32",   "softplus.comp",    {{"A_TYPE", "float"},       {"D_TYPE", "float"}});
@ -916,6 +920,13 @@ void process_shaders() {
    string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
    string_to_spv("soft_max_back_f32", "soft_max_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));

+    string_to_spv("soft_max_large1_f32", "soft_max_large1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("soft_max_large2_f32", "soft_max_large2.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("soft_max_large3_f32", "soft_max_large3.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
+    string_to_spv("soft_max_large1_f32_f16", "soft_max_large1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
+    string_to_spv("soft_max_large2_f32_f16", "soft_max_large2.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
+    string_to_spv("soft_max_large3_f32_f16", "soft_max_large3.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
+
    string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}});
    string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}});
    string_to_spv("rope_norm_f16_rte", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}});
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@ -3320,6 +3320,7 @@ class VisionProjectorType:
    ULTRAVOX = "ultravox"
    INTERNVL = "internvl"
    QWEN2A = "qwen2a" # audio
+    GLMA = "glma" # audio
    QWEN25O = "qwen2.5o" # omni
    VOXTRAL = "voxtral"
    LFM2 = "lfm2"
--- a/scripts/compare-logprobs.py
+++ b/scripts/compare-logprobs.py
@ -0,0 +1,281 @@
+import argparse
+import requests
+import json
+from pathlib import Path
+import logging
+
+logger = logging.getLogger("compare-logprobs")
+logging.basicConfig(level=logging.INFO)
+
+
+DESCRIPTION = """
+Compare logits between llama.cpp and another inference engine using OpenAI-compatible server endpoints.
+
+Unlike compare-logits.py, it allows dumping logits from a hosted API endpoint. Useful when it's not possible to run both models locally.
+
+Example usage:
+    Step 1: Dump logits from two different servers
+        python scripts/compare-logprobs.py dump logits_llama.log http://localhost:8080/v1/completions
+        python scripts/compare-logprobs.py dump logits_other.log http://other-engine:8000/v1/completions
+
+        (optionally, you can add --api-key <key> if the endpoint requires authentication)
+
+    Step 2: Compare the dumped logits
+        python scripts/compare-logprobs.py compare logits_llama.log logits_other.log report.md
+"""
+
+
+def generate_input_prompt(length: int) -> list[str]:
+    CORPUS = """
+    You are an advanced AI assistant capable of using tools to gather information, perform calculations, or execute tasks. Always think step by step before responding. If a user's query requires external data, computation, or actions beyond your internal knowledge, use the appropriate tools via function calls.
+
+    ### Tool Call Format:
+    When you need to use a tool, output the call in this exact XML format. Include the opening and closing tags. Do not escape arguments; they will be parsed as plain text.
+
+    You can make multiple calls in one go by placing them one after another.
+    """
+    words = [w.strip() for w in CORPUS.strip().split(" ")]
+    words = [w for w in words if len(w) > 0]  # filter out empty strings
+    while len(words) < length:
+        words += words
+    return words[:length]
+
+
+def dump_logits(
+    endpoint: str,
+    output_path: Path,
+    input_words: list[str],
+    pattern: list[tuple[bool, int]],
+    api_key=None,
+):
+    logger.info(f"Dumping logits to {output_path} from endpoint {endpoint}...")
+    words = input_words
+    curr_text = ""
+    n_total = sum(n for get, n in pattern if get)
+    n_done = 0
+    i_cur = 0
+    i_total = len(words)
+    with output_path.open("w") as f:
+        for get, n in pattern:
+            if not get:
+                # skip n words
+                for i in range(n):
+                    curr_text += words.pop(0) + " "
+                    i_cur += 1
+                continue
+            # get n words
+            for i in range(n):
+                curr_text += words.pop(0) + " "
+                payload = {
+                    "prompt": curr_text.strip(),
+                    "temperature": 0.0,
+                    "top_k": 1,
+                    "max_tokens": 1,
+                    "logprobs": 1,
+                    "stream": False,
+                }
+                response = requests.post(
+                    endpoint,
+                    json=payload,
+                    headers={"Authorization": f"Bearer {api_key}"} if api_key else {},
+                )
+                response.raise_for_status()
+                data = response.json()
+                data["__index"] = i_cur  # add index for easier debugging later
+                data = json.dumps(data)
+                f.write(f"{data}\n")
+                n_done += 1
+                i_cur += 1
+                logger.info(
+                    f"\n\n{data}\n\n[Step: {n_done}/{n_total} | Word: {i_cur}/{i_total}]"
+                )
+    logger.info(f"Logits dumped to {output_path}")
+
+
+def get_token_logprobs(data: dict):
+    logprobs = data["choices"][0]["logprobs"]
+    if "content" in logprobs:
+        # llama.cpp case
+        top = logprobs["content"][0]["top_logprobs"][0]
+        return top["token"], top["logprob"]
+    else:
+        # vllm case
+        tokens = logprobs["tokens"]
+        token_logprobs = logprobs["token_logprobs"]
+        return tokens[0], token_logprobs[0]
+
+
+def clean_text(text: str) -> str:
+    return (
+        "'"
+        + text.replace("\n", "\\n")
+        .replace("\t", "\\t")
+        .replace("\r", "\\r")
+        .replace("|", "\\|")
+        + "'"
+    )
+
+
+def compare_logits(input1: Path, input2: Path, output_path: Path):
+    with input1.open("r") as f1, input2.open("r") as f2, output_path.open("w") as fout:
+        lines1 = f1.readlines()
+        lines2 = f2.readlines()
+
+        tab_header = [
+            "idx",
+            input1.name,
+            "logprob_1",
+            input2.name,
+            "logprob_2",
+            "diff (abs)",
+        ]
+        tab_entries = []
+        tab_max_widths = [len(h) for h in tab_header]
+
+        assert len(lines1) == len(
+            lines2
+        ), "Input files must have the same number of lines."
+
+        fout.write("# Logits Comparison Report\n\n")
+        for i, (line1, line2) in enumerate(zip(lines1, lines2)):
+            if not line1.strip() or not line2.strip():
+                continue  # skip empty lines
+
+            data1 = json.loads(line1)
+            data2 = json.loads(line2)
+
+            idx1 = data1.get("__index", -1)
+            idx2 = data2.get("__index", -1)
+            if idx1 != idx2:
+                logger.warning(
+                    f"Warning: Mismatched indices at line {i}: {idx1} vs {idx2}"
+                )
+
+            token1, logprob1 = get_token_logprobs(data1)
+            token2, logprob2 = get_token_logprobs(data2)
+
+            token1 = clean_text(token1)
+            token2 = clean_text(token2)
+            abs_diff = abs(logprob1 - logprob2)
+
+            tab_entries.append(
+                (
+                    str(idx1 + 1),
+                    token1,
+                    f"{logprob1:.4f}",
+                    token2,
+                    f"{logprob2:.4f}",
+                    f"{(abs_diff):.4f}",
+                )
+            )
+
+        for i in range(len(tab_entries)):
+            for j in range(len(tab_header)):
+                tab_max_widths[j] = max(tab_max_widths[j], len(tab_entries[i][j]))
+
+        output = ""
+        for j in range(len(tab_header)):
+            output += f"| {tab_header[j]:<{tab_max_widths[j]}} "
+        output += "|\n"
+        for j in range(len(tab_header)):
+            output += f"|{'-' * (tab_max_widths[j] + 2)}"
+        output += "|\n"
+        for entry in tab_entries:
+            for j in range(len(tab_header)):
+                output += f"| {entry[j]:<{tab_max_widths[j]}} "
+            output += "|\n"
+
+        logger.info("\n" + output)
+        fout.write(output)
+        logger.info(f"Report written to {output_path}")
+
+
+def parse_pattern(pattern: str) -> list[tuple[bool, int]]:
+    parts = pattern.split(",")
+    result = []
+    for i, part in enumerate(parts):
+        n = int(part)
+        if i % 2 == 0:
+            result.append((True, n))  # get n words
+        else:
+            result.append((False, n))  # skip n words
+    return result
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description=DESCRIPTION, formatter_class=argparse.RawTextHelpFormatter
+    )
+    subparsers = parser.add_subparsers(
+        dest="verb", required=True, help="action to perform"
+    )
+
+    # dump subcommand
+    parser_dump = subparsers.add_parser("dump", help="dump logits from an endpoint")
+    parser_dump.add_argument(
+        "output", type=Path, help="output path for dumped logits (.log)"
+    )
+    parser_dump.add_argument(
+        "endpoint", type=str, help="OAI-compat /completions endpoint"
+    )
+    parser_dump.add_argument(
+        "--api-key",
+        type=str,
+        default=None,
+        help="API key for authentication (if required)",
+    )
+    parser_dump.add_argument(
+        "--file",
+        type=Path,
+        default=None,
+        help="File containing prompt to use instead of the default",
+    )
+    parser_dump.add_argument(
+        "--pattern",
+        type=str,
+        default="10,1000,10,4000,10",
+        help="Pattern n_get,n_skip,... where n_get is number of words to get and n_skip is number of words to skip (num of words, NOT num of tokens)",
+    )
+
+    # compare subcommand
+    parser_compare = subparsers.add_parser(
+        "compare", help="compare two dumped logits files"
+    )
+    parser_compare.add_argument("input1", type=Path, help="first input file (.log)")
+    parser_compare.add_argument("input2", type=Path, help="second input file (.log)")
+    parser_compare.add_argument(
+        "output", type=Path, help="output path for comparison report (.md)"
+    )
+
+    try:
+        return parser.parse_args()
+    except Exception as e:
+        parser.print_help()
+        raise e
+
+
+def main():
+    args = parse_args()
+
+    if args.verb == "dump":
+        pattern = parse_pattern(args.pattern)
+        input_length = sum(n for _, n in pattern)
+        input_words = generate_input_prompt(input_length)
+        if args.file is not None:
+            with args.file.open("r") as f:
+                input_words = f.read().strip().split(" ")
+                if input_length < sum(n for _, n in pattern):
+                    raise ValueError(
+                        f"Input file has only {input_length} words, but pattern requires at least {input_length} words."
+                    )
+                input_length = len(input_words)
+        logger.info(f"Using {input_length} words")
+        dump_logits(args.endpoint, args.output, input_words, pattern, args.api_key)
+    elif args.verb == "compare":
+        compare_logits(args.input1, args.input2, args.output)
+    else:
+        raise ValueError(f"Unknown verb: {args.verb}")
+
+
+if __name__ == "__main__":
+    main()
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@ -9,6 +9,7 @@
 #include "llama-model.h"

 #include <cinttypes>
+#include <cmath>
 #include <cstring>
 #include <limits>
 #include <stdexcept>
@ -75,6 +76,43 @@ llama_context::llama_context(
        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
    }

+    if (cparams.yarn_ext_factor != 0) {
+        static auto get_mscale = [](float scale, float mscale) {
+            return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
+        };
+
+        const float factor = 1.0f / cparams.rope_freq_scale;
+
+        // ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348
+        if (hparams.rope_yarn_log_mul != 0.0f) {
+            // note: here we assume `mscale == 1.0f`
+            // TODO: start reading the actual value of mscale and handle the case where it is not 1.0f
+                  float mscale          = 1.0f;
+            const float mscale_all_dims = hparams.rope_yarn_log_mul;
+
+            // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+            // special-case DEEPSEEK v2:
+            // https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43
+            if (model.arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) {
+                mscale = mscale_all_dims;
+            }
+
+            cparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
+
+            LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n",
+                    __func__, cparams.yarn_attn_factor, mscale, mscale_all_dims);
+        } else {
+            cparams.yarn_attn_factor = get_mscale(factor, 1.0f);
+        }
+
+        // when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor:
+        // https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544
+        //
+        // ref: https://github.com/ggml-org/llama.cpp/discussions/7416
+        //      https://github.com/ggml-org/llama.cpp/pull/17945
+        cparams.yarn_attn_factor *= 1.0f / (1.0f + 0.1f * logf(factor));
+    }
+
    cparams.yarn_attn_factor *= hparams.rope_attn_factor;

    if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
@ -1328,6 +1366,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
            // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
            LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif
+            synchronize();
            buf_output = nullptr;
            logits = nullptr;
            embd = nullptr;
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@ -78,7 +78,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
        for (int i = 0; i < n_tokens; ++i) {
            const float pos = ubatch->pos[i];
            attn_scale_data[i] = std::log(
-                std::floor((pos + 1.0f) / n_attn_temp_floor_scale) + 1.0
+                std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0
            ) * f_attn_temp_scale + 1.0;
        }

@ -1203,7 +1203,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }

 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale, hparams.f_attn_temp_offset);

    auto & cur = inp->attn_scale;

--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@ -132,8 +132,8 @@ public:
 // temperature tuning, used by llama4
 class llm_graph_input_attn_temp : public llm_graph_input_i {
 public:
-    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
-        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale, float f_attn_temp_offset)
+        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale), f_attn_temp_offset(f_attn_temp_offset) {}
    virtual ~llm_graph_input_attn_temp() = default;

    void set_input(const llama_ubatch * ubatch) override;
@ -142,6 +142,7 @@ public:

    const uint32_t n_attn_temp_floor_scale;
    const float    f_attn_temp_scale;
+    const float    f_attn_temp_offset;
 };

 class llm_graph_input_pos_bucket : public llm_graph_input_i {
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@ -1,6 +1,7 @@
 #include "llama-hparams.h"

 #include "ggml.h"
+
 #include <cassert>

 void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@ -107,6 +107,7 @@ struct llama_hparams {
    float    rope_freq_base_train_swa;
    float    rope_freq_scale_train;
    float    rope_freq_scale_train_swa;
+
    uint32_t n_ctx_orig_yarn;
    float    rope_yarn_log_mul = 0.0f;

@ -164,6 +165,7 @@ struct llama_hparams {
    uint32_t n_no_rope_layer_step    = 4;
    uint32_t n_attn_temp_floor_scale = 0;
    float    f_attn_temp_scale       = 0.0f;
+    float    f_attn_temp_offset      = 0.0f; // offset position index

    // gemma3n altup
    uint32_t n_altup      = 4; // altup_num_inputs
@ -270,4 +272,3 @@ struct llama_hparams {
 };

 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
-
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@ -1369,9 +1369,10 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
                      float   freq_scale) const {
    const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;

-    const auto & yarn_ext_factor = cparams.yarn_ext_factor;
-    const auto & yarn_beta_fast  = cparams.yarn_beta_fast;
-    const auto & yarn_beta_slow  = cparams.yarn_beta_slow;
+    const auto & yarn_ext_factor  = cparams.yarn_ext_factor;
+    const auto & yarn_beta_fast   = cparams.yarn_beta_fast;
+    const auto & yarn_beta_slow   = cparams.yarn_beta_slow;
+    const auto & yarn_attn_factor = cparams.yarn_attn_factor;

    const auto & n_rot     = hparams.n_rot;
    const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
@ -1382,12 +1383,6 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
                                ? LLAMA_ROPE_TYPE_NEOX
                                : hparams.rope_type;

-    // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
-    // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-    const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2
-                                    ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale))
-                                    : cparams.yarn_attn_factor;
-
    ggml_tensor * tmp;

    if (ggml_is_quantized(cur->type)) {
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@ -773,6 +773,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    hparams.n_swa                   = 8192;
                    hparams.n_attn_temp_floor_scale = 8192;
                    hparams.f_attn_temp_scale       = 0.1f;
+                    hparams.f_attn_temp_offset      = 1.0f;
                    hparams.set_swa_pattern(4);   // pattern: 3 chunked - 1 full
                }

@ -1740,12 +1741,19 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    // that have no expert_gating_func model parameter set
                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
                }
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false);
+
+                if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) {
+                    // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+                    // cancel the factor from the convert script
+                    hparams.rope_yarn_log_mul /= 0.1f;
+                }

                // (optional) temperature tuning - used by mistral-large
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE,  hparams.f_attn_temp_scale,       false);
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false);

+                hparams.f_attn_temp_offset = 0.0f;
+
                switch (hparams.n_layer) {
                    case 27: type = LLM_TYPE_16B; break;
                    case 60: type = LLM_TYPE_236B; break;
@ -2372,9 +2380,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false);

-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST,   hparams.yarn_beta_fast, false);
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW,   hparams.yarn_beta_slow, false);
-                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,     hparams.rope_yarn_log_mul, false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast,    false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow,    false);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,   hparams.rope_yarn_log_mul, 0.0f);
+
+                hparams.f_attn_temp_offset = 0.0f;

                // TODO: maybe add n_attn_temp_floor_scale as a separate KV?
                if (hparams.f_attn_temp_scale != 0.0f) {
@ -2384,18 +2394,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    }
                }

-                // TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f
-                //       but may need further verification with other values
-                if (hparams.rope_yarn_log_mul != 0.0f) {
-                    float factor = 1.0f / hparams.rope_freq_scale_train;
-                    float mscale = 1.0f;
-                    float mscale_all_dims = hparams.rope_yarn_log_mul;
-                    static auto get_mscale = [](float scale, float mscale) {
-                        return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f);
-                    };
-                    hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims);
-                }
-
                switch (hparams.n_layer) {
                    case 26: type = LLM_TYPE_3B; break;
                    case 34: type = LLM_TYPE_8B; break;
@ -6965,6 +6963,7 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: freq_base_train  = %.1f\n",   __func__, hparams.rope_freq_base_train);
        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n",     __func__, hparams.rope_freq_scale_train);
        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn  = %u\n",     __func__, hparams.n_ctx_orig_yarn);
+        LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n",   __func__, hparams.rope_yarn_log_mul);
        LLAMA_LOG_INFO("%s: rope_finetuned   = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
        // MRoPE (Multi-axis Rotary Position Embedding) sections
        if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) {
@ -7028,7 +7027,6 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n",   __func__, hparams.expert_weights_scale);
        LLAMA_LOG_INFO("%s: expert_weights_norm  = %d\n",     __func__, hparams.expert_weights_norm);
        LLAMA_LOG_INFO("%s: expert_gating_func   = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
-        LLAMA_LOG_INFO("%s: rope_yarn_log_mul    = %.4f\n",   __func__, hparams.rope_yarn_log_mul);
    }

    if (arch == LLM_ARCH_QWEN2MOE) {
--- a/src/models/deepseek2.cpp
+++ b/src/models/deepseek2.cpp
@ -1,7 +1,5 @@
 #include "models.h"

-
-
 llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
    // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
@ -20,9 +18,15 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr

    // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
    // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-    const float mscale      = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
-    const float kq_scale    = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));
-    const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
+    // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
+
+    // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor
+    GGML_ASSERT(ext_factor >= 0.0f);
+    const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));
+
+    // use the original attn_factor to pre-scale the kq_scale
+    const float mscale   = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+    const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k));

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/tools/completion/completion.cpp
+++ b/tools/completion/completion.cpp
@ -142,13 +142,15 @@ int main(int argc, char ** argv) {

    // load the model and apply lora adapter, if any
    LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
-    common_init_result llama_init = common_init_from_params(params);

-    model = llama_init.model.get();
-    ctx = llama_init.context.get();
+    auto llama_init = common_init_from_params(params);

-    if (model == NULL) {
-        LOG_ERR("%s: error: unable to load model\n", __func__);
+    ctx   = llama_init->context();
+    model = llama_init->model();
+    smpl  = llama_init->sampler(0);
+
+    if (ctx == NULL) {
+        LOG_ERR("%s: error: unable to create context\n", __func__);
        return 1;
    }

@ -475,12 +477,6 @@ int main(int argc, char ** argv) {
        }
    }

-    smpl = common_sampler_init(model, sparams);
-    if (!smpl) {
-        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
-        return 1;
-    }
-
    LOG_INF("sampler seed: %u\n",     common_sampler_get_seed(smpl));
    LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
    LOG_INF("sampler chain: %s\n",    common_sampler_print(smpl).c_str());
@ -994,8 +990,6 @@ int main(int argc, char ** argv) {
    LOG("\n\n");
    common_perf_print(ctx, smpl);

-    common_sampler_free(smpl);
-
    llama_backend_free();

    ggml_threadpool_free_fn(threadpool);
--- a/tools/mtmd/clip-graph.h
+++ b/tools/mtmd/clip-graph.h
@ -0,0 +1,119 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-cpp.h"
+#include "clip.h"
+#include "clip-impl.h"
+#include "clip-model.h"
+
+#include <vector>
+#include <functional>
+
+struct clip_graph { // base for model-specific graph builders; build() is pure virtual
+    const clip_model & model; // reference member: the referenced model must outlive this object
+    const clip_hparams & hparams;
+    projector_type proj_type;
+
+    // we only support single image per batch
+    const clip_image_f32 & img;
+
+    const int patch_size;
+    const int n_patches_x;
+    const int n_patches_y;
+    const int n_patches; // NOTE(review): presumably n_patches_x * n_patches_y - confirm in the constructor
+    const int n_embd;
+    const int n_head;
+    const int d_head;
+    const int n_layer;
+    const int n_mmproj_embd;
+    const float eps;
+    const float kq_scale;
+    const clip_flash_attn_type flash_attn_type;
+
+    // for debugging
+    const bool debug_graph;
+    std::vector<ggml_tensor *> & debug_print_tensors;
+
+    ggml_context_ptr ctx0_ptr; // NOTE(review): presumably owns the context that ctx0 points to - confirm in the constructor
+    ggml_context * ctx0;
+    ggml_cgraph * gf; // the graph that the build() implementations expand into and return
+
+    clip_graph(clip_ctx * ctx, const clip_image_f32 & img);
+
+    virtual ~clip_graph() = default;
+    virtual ggml_cgraph * build() = 0; // implemented once per model (e.g. clip_graph_cogvlm::build)
+
+    //
+    // utility functions
+    //
+    void cb(ggml_tensor * cur0, const char * name, int il) const;
+
+    // siglip2 naflex
+    ggml_tensor * resize_position_embeddings();
+
+    // build vision transformer (ViT) cgraph
+    // this function should cover most of the models
+    // if your model has specific features, you should probably duplicate this function
+    ggml_tensor * build_vit(
+                ggml_tensor * inp,
+                int64_t n_pos,
+                norm_type norm_t,
+                ffn_op_type ffn_t,
+                ggml_tensor * learned_pos_embd,
+                std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos);
+
+    // build the input after conv2d (inp_raw --> patches)
+    // returns tensor with shape [n_embd, n_patches]
+    ggml_tensor * build_inp();
+
+    ggml_tensor * build_inp_raw(int channels = 3);
+
+    ggml_tensor * build_norm(
+            ggml_tensor * cur,
+            ggml_tensor * mw,
+            ggml_tensor * mb,
+            norm_type type,
+            float norm_eps,
+            int il) const;
+
+    ggml_tensor * build_ffn(
+            ggml_tensor * cur,
+            ggml_tensor * up,
+            ggml_tensor * up_b,
+            ggml_tensor * gate,
+            ggml_tensor * gate_b,
+            ggml_tensor * down,
+            ggml_tensor * down_b,
+            ffn_op_type type_op,
+            int il) const;
+
+    ggml_tensor * build_attn(
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur,
+            ggml_tensor * k_cur,
+            ggml_tensor * v_cur,
+            ggml_tensor * kq_mask,
+            float kq_scale,
+            int il) const;
+
+    // implementation of the 2D RoPE without adding a new op in ggml
+    // this is not efficient (uses double the memory), but works on all backends
+    // TODO: there was a more efficient implementation which relied on ggml_view and ggml_rope_ext_inplace, but the in-place rope does not work well with non-contiguous tensors; we should fix that and revert to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065
+    ggml_tensor * build_rope_2d(
+        ggml_context * ctx0,
+        ggml_tensor * cur,
+        ggml_tensor * pos_a, // first half
+        ggml_tensor * pos_b, // second half
+        const float freq_base,
+        const bool interleave_freq
+    );
+
+    // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
+    // support dynamic resolution
+    ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor);
+
+    // Generic function to stack frames for audio processing
+    // Abstracts out the StackAudioFrames logic used by ultravox
+    ggml_tensor * build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed);
+};
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@ -1,3 +1,5 @@
+#pragma once
+
 #include "ggml.h"
 #include "gguf.h"
 #include "clip.h"
@ -13,6 +15,8 @@

 // Internal header for clip.cpp

+#define MTMD_INTERNAL_HEADER
+
 #define KEY_FTYPE               "general.file_type"
 #define KEY_NAME                "general.name"
 #define KEY_DESCRIPTION         "general.description"
@ -132,6 +136,10 @@
 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))

+// forward declaration
+// TODO: improve this later
+struct clip_ctx;
+
 enum projector_type {
    PROJECTOR_TYPE_MLP,
    PROJECTOR_TYPE_MLP_NORM,
@ -149,6 +157,7 @@ enum projector_type {
    PROJECTOR_TYPE_INTERNVL,
    PROJECTOR_TYPE_LLAMA4,
    PROJECTOR_TYPE_QWEN2A,
+    PROJECTOR_TYPE_GLMA,
    PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
    PROJECTOR_TYPE_VOXTRAL,
    PROJECTOR_TYPE_LFM2,
@ -175,6 +184,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_INTERNVL,  "internvl"},
    { PROJECTOR_TYPE_LLAMA4,    "llama4"},
    { PROJECTOR_TYPE_QWEN2A,    "qwen2a"},
+    { PROJECTOR_TYPE_GLMA,      "glma"},
    { PROJECTOR_TYPE_QWEN25O,   "qwen2.5o"},
    { PROJECTOR_TYPE_VOXTRAL,   "voxtral"},
    { PROJECTOR_TYPE_LFM2,      "lfm2"},
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@ -0,0 +1,280 @@
+#pragma once
+
+#include "ggml.h"
+#include "clip.h"
+#include "clip-impl.h"
+
+#include <vector>
+#include <unordered_set>
+#include <cstdint>
+#include <cmath>
+
+enum ffn_op_type { // FFN activation selector: stored in clip_hparams::ffn_op and passed to build_ffn as type_op
+    FFN_GELU,
+    FFN_GELU_ERF,
+    FFN_SILU,
+    FFN_GELU_QUICK,
+};
+
+enum norm_type { // normalization selector passed to build_norm / build_vit
+    NORM_TYPE_NORMAL, // layer norm
+    NORM_TYPE_RMS, // RMS norm
+};
+
+enum patch_merge_type { // value type of clip_hparams::mm_patch_merge_type
+    PATCH_MERGE_FLAT, // the default in clip_hparams
+    PATCH_MERGE_SPATIAL_UNPAD,
+};
+
+struct clip_hparams { // encoder hyper-parameters; every member has a default so a default-constructed object is fully determinate
+    int32_t image_size = 0;
+    int32_t patch_size = 0;
+    int32_t n_embd = 0;
+    int32_t n_ff = 0;
+    int32_t projection_dim = 0;
+    int32_t n_head = 0;
+    int32_t n_layer = 0;
+    // idefics3
+    int32_t image_longest_edge = 0;
+    int32_t image_min_pixels = -1;
+    int32_t image_max_pixels = -1;
+    int32_t n_merge = 0; // number of patch merges **per-side**
+
+    float image_mean[3] = {0.0f, 0.0f, 0.0f}; // zeroed so the values are never indeterminate; presumably overwritten at load time - confirm in the loader
+    float image_std[3]  = {0.0f, 0.0f, 0.0f}; // same as image_mean
+
+    // for models using dynamic image size, we need to have a smaller image size to warmup
+    // otherwise, user will get OOM every time they load the model
+    int32_t warmup_image_size = 0;
+    int32_t warmup_audio_size = 3000;
+
+    ffn_op_type ffn_op = FFN_GELU;
+
+    patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
+
+    float eps = 1e-6;
+    float rope_theta = 0.0;
+
+    std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
+    int32_t image_crop_resolution = 0; // previously had no default (indeterminate until assigned)
+    std::unordered_set<int32_t> vision_feature_layer;
+    int32_t attn_window_size = 0;
+    int32_t n_wa_pattern = 0;
+
+    // audio
+    int32_t n_mel_bins = 0; // whisper preprocessor
+    int32_t proj_stack_factor = 0; // ultravox
+
+    // legacy
+    bool has_llava_projector = false;
+    int minicpmv_version = 0;
+    int32_t minicpmv_query_num = 0;         // MiniCPM-V query number
+
+    // custom value provided by user, can be undefined if not set
+    int32_t custom_image_min_tokens = -1;
+    int32_t custom_image_max_tokens = -1;
+
+    void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) { // sets image_min/max_pixels and warmup_image_size; custom_* values override the arguments when > 0
+        const int cur_merge = n_merge == 0 ? 1 : n_merge; // n_merge == 0 is treated as "no merge"
+        const int patch_area = patch_size * patch_size * cur_merge * cur_merge; // pixels covered by one token after merging
+        image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area;
+        image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area;
+        warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels)); // side length of a square with image_max_pixels pixels (truncated)
+    }
+
+    void set_warmup_n_tokens(int n_tokens) { // sets warmup_image_size for a square n x n token grid; asserts n_tokens is a perfect square
+        int n_tok_per_side = static_cast<int>(std::sqrt(n_tokens));
+        GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
+        const int cur_merge = n_merge == 0 ? 1 : n_merge;
+        warmup_image_size = n_tok_per_side * patch_size * cur_merge;
+        // TODO: support warmup size for custom token numbers
+    }
+};
+
+struct clip_layer { // weights of one encoder layer; every pointer defaults to nullptr
+    // attention
+    ggml_tensor * k_w = nullptr;
+    ggml_tensor * k_b = nullptr;
+    ggml_tensor * q_w = nullptr;
+    ggml_tensor * q_b = nullptr;
+    ggml_tensor * v_w = nullptr;
+    ggml_tensor * v_b = nullptr;
+    ggml_tensor * qkv_w = nullptr; // fused QKV weight (used by clip_graph_cogvlm::build)
+    ggml_tensor * qkv_b = nullptr;
+
+    ggml_tensor * o_w = nullptr;
+    ggml_tensor * o_b = nullptr;
+
+    ggml_tensor * k_norm = nullptr;
+    ggml_tensor * q_norm = nullptr;
+
+    // layernorm 1
+    ggml_tensor * ln_1_w = nullptr;
+    ggml_tensor * ln_1_b = nullptr;
+
+    ggml_tensor * ff_up_w = nullptr;
+    ggml_tensor * ff_up_b = nullptr;
+    ggml_tensor * ff_gate_w = nullptr;
+    ggml_tensor * ff_gate_b = nullptr;
+    ggml_tensor * ff_down_w = nullptr;
+    ggml_tensor * ff_down_b = nullptr;
+
+    // layernorm 2
+    ggml_tensor * ln_2_w = nullptr;
+    ggml_tensor * ln_2_b = nullptr;
+
+    // layer scale (no bias)
+    ggml_tensor * ls_1_w = nullptr;
+    ggml_tensor * ls_2_w = nullptr;
+
+    // qwen3vl deepstack merger
+    ggml_tensor * deepstack_norm_w = nullptr;
+    ggml_tensor * deepstack_norm_b = nullptr;
+    ggml_tensor * deepstack_fc1_w = nullptr;
+    ggml_tensor * deepstack_fc1_b = nullptr;
+    ggml_tensor * deepstack_fc2_w = nullptr;
+    ggml_tensor * deepstack_fc2_b = nullptr;
+
+    bool has_deepstack() const { // a layer counts as deepstack when its fc1 weight is set
+        return deepstack_fc1_w != nullptr;
+    }
+};
+
+struct clip_model { // all weights of one encoder plus its projector; every tensor pointer defaults to nullptr
+    clip_modality modality = CLIP_MODALITY_VISION;
+    projector_type proj_type = PROJECTOR_TYPE_MLP;
+    clip_hparams hparams;
+
+    // embeddings
+    ggml_tensor * class_embedding = nullptr;
+    ggml_tensor * patch_embeddings_0 = nullptr;
+    ggml_tensor * patch_embeddings_1 = nullptr;  // second Conv2D kernel when we decouple Conv3D along temporal dimension (Qwen2VL)
+    ggml_tensor * patch_bias = nullptr;
+    ggml_tensor * position_embeddings = nullptr;
+
+    ggml_tensor * pre_ln_w = nullptr;
+    ggml_tensor * pre_ln_b = nullptr;
+
+    std::vector<clip_layer> layers;
+
+    int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer
+
+    ggml_tensor * post_ln_w = nullptr; // previously had no default; nullptr keeps null checks well-defined before loading
+    ggml_tensor * post_ln_b = nullptr;
+
+    ggml_tensor * projection = nullptr; // TODO: rename it to fc (fully connected layer)
+    ggml_tensor * mm_fc_w = nullptr;
+    ggml_tensor * mm_fc_b = nullptr;
+
+    // LLaVA projection
+    ggml_tensor * mm_input_norm_w = nullptr;
+    ggml_tensor * mm_input_norm_b = nullptr;
+    ggml_tensor * mm_0_w = nullptr;
+    ggml_tensor * mm_0_b = nullptr;
+    ggml_tensor * mm_2_w = nullptr;
+    ggml_tensor * mm_2_b = nullptr;
+
+    ggml_tensor * image_newline = nullptr;
+
+    // Yi type models with mlp+normalization projection
+    ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4
+    ggml_tensor * mm_1_b = nullptr;
+    ggml_tensor * mm_3_w = nullptr;
+    ggml_tensor * mm_3_b = nullptr;
+    ggml_tensor * mm_4_w = nullptr;
+    ggml_tensor * mm_4_b = nullptr;
+
+    // GLMV-Edge projection
+    ggml_tensor * mm_model_adapter_conv_w = nullptr;
+    ggml_tensor * mm_model_adapter_conv_b = nullptr;
+
+    // MobileVLM projection
+    ggml_tensor * mm_model_mlp_1_w = nullptr;
+    ggml_tensor * mm_model_mlp_1_b = nullptr;
+    ggml_tensor * mm_model_mlp_3_w = nullptr;
+    ggml_tensor * mm_model_mlp_3_b = nullptr;
+    ggml_tensor * mm_model_block_1_block_0_0_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_0_1_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_0_1_b = nullptr;
+    ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr;
+    ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr;
+    ggml_tensor * mm_model_block_1_block_2_0_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_2_1_w = nullptr;
+    ggml_tensor * mm_model_block_1_block_2_1_b = nullptr;
+    ggml_tensor * mm_model_block_2_block_0_0_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_0_1_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_0_1_b = nullptr;
+    ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr;
+    ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr;
+    ggml_tensor * mm_model_block_2_block_2_0_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_2_1_w = nullptr;
+    ggml_tensor * mm_model_block_2_block_2_1_b = nullptr;
+
+    // MobileVLM_V2 projection
+    ggml_tensor * mm_model_mlp_0_w = nullptr;
+    ggml_tensor * mm_model_mlp_0_b = nullptr;
+    ggml_tensor * mm_model_mlp_2_w = nullptr;
+    ggml_tensor * mm_model_mlp_2_b = nullptr;
+    ggml_tensor * mm_model_peg_0_w = nullptr;
+    ggml_tensor * mm_model_peg_0_b = nullptr;
+
+    // MINICPMV projection
+    ggml_tensor * mm_model_pos_embed_k = nullptr;
+    ggml_tensor * mm_model_query = nullptr;
+    ggml_tensor * mm_model_proj = nullptr;
+    ggml_tensor * mm_model_kv_proj = nullptr;
+    ggml_tensor * mm_model_attn_q_w = nullptr;
+    ggml_tensor * mm_model_attn_q_b = nullptr;
+    ggml_tensor * mm_model_attn_k_w = nullptr;
+    ggml_tensor * mm_model_attn_k_b = nullptr;
+    ggml_tensor * mm_model_attn_v_w = nullptr;
+    ggml_tensor * mm_model_attn_v_b = nullptr;
+    ggml_tensor * mm_model_attn_o_w = nullptr;
+    ggml_tensor * mm_model_attn_o_b = nullptr;
+    ggml_tensor * mm_model_ln_q_w = nullptr;
+    ggml_tensor * mm_model_ln_q_b = nullptr;
+    ggml_tensor * mm_model_ln_kv_w = nullptr;
+    ggml_tensor * mm_model_ln_kv_b = nullptr;
+    ggml_tensor * mm_model_ln_post_w = nullptr;
+    ggml_tensor * mm_model_ln_post_b = nullptr;
+
+    // gemma3
+    ggml_tensor * mm_input_proj_w = nullptr;
+    ggml_tensor * mm_soft_emb_norm_w = nullptr;
+
+    // pixtral
+    ggml_tensor * token_embd_img_break = nullptr;
+    ggml_tensor * mm_patch_merger_w = nullptr;
+
+    // ultravox / whisper encoder
+    ggml_tensor * conv1d_1_w = nullptr;
+    ggml_tensor * conv1d_1_b = nullptr;
+    ggml_tensor * conv1d_2_w = nullptr;
+    ggml_tensor * conv1d_2_b = nullptr;
+    ggml_tensor * mm_norm_pre_w = nullptr;
+    ggml_tensor * mm_norm_pre_b = nullptr;
+    ggml_tensor * mm_norm_mid_w = nullptr;
+
+    // cogvlm
+    ggml_tensor * mm_post_fc_norm_w = nullptr;
+    ggml_tensor * mm_post_fc_norm_b = nullptr;
+    ggml_tensor * mm_h_to_4h_w = nullptr;
+    ggml_tensor * mm_gate_w = nullptr;
+    ggml_tensor * mm_4h_to_h_w = nullptr;
+    ggml_tensor * mm_boi = nullptr;
+    ggml_tensor * mm_eoi = nullptr;
+
+    bool audio_has_avgpool() const { // true for the QWEN2A and VOXTRAL projector types
+        return proj_type == PROJECTOR_TYPE_QWEN2A
+            || proj_type == PROJECTOR_TYPE_VOXTRAL;
+    }
+
+    bool audio_has_stack_frames() const { // true for the ULTRAVOX and VOXTRAL projector types
+        return proj_type == PROJECTOR_TYPE_ULTRAVOX
+            || proj_type == PROJECTOR_TYPE_VOXTRAL;
+    }
+};
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@ -7,6 +7,8 @@

 // !!! Internal header, to be used by mtmd only !!!

+#define MTMD_INTERNAL_HEADER
+
 struct clip_ctx;

 struct clip_image_size {
--- a/tools/mtmd/models/cogvlm.cpp
+++ b/tools/mtmd/models/cogvlm.cpp
@ -0,0 +1,98 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_cogvlm::build() { // builds the CogVLM graph: encoder layers (norm applied after attn/ffn), then the projector with boi/eoi added
+    GGML_ASSERT(model.class_embedding != nullptr);
+    GGML_ASSERT(model.position_embeddings != nullptr);
+
+    const int n_pos = n_patches + 1; // +1 for [CLS]
+
+    // build input and concatenate class embedding
+    ggml_tensor * inp = build_inp();
+    inp = ggml_concat(ctx0, inp, model.class_embedding, 1); // class embedding goes after the patches
+
+    inp = ggml_add(ctx0, inp, model.position_embeddings);
+    cb(inp, "inp_pos", -1);
+
+    ggml_tensor * inpL = inp;
+
+    for (int il = 0; il < n_layer; il++) {
+        auto & layer = model.layers[il];
+        ggml_tensor * cur = inpL;
+
+        cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); // fused QKV projection
+
+        cur = ggml_add(ctx0, cur, layer.qkv_b);
+
+        ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), // Q/K/V are views into the fused result
+            cur->nb[1], 0);
+        ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
+            cur->nb[1], n_embd * sizeof(float)); // K starts n_embd floats into each row
+        ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float),
+            cur->nb[1], 2 * n_embd * sizeof(float)); // V starts 2*n_embd floats into each row
+
+        cb(Qcur, "Qcur", il);
+        cb(Kcur, "Kcur", il);
+        cb(Vcur, "Vcur", il);
+
+        cur = build_attn(layer.o_w, layer.o_b,
+            Qcur, Kcur, Vcur, nullptr, kq_scale, il); // nullptr: no kq_mask
+        cb(cur, "attn_out", il);
+
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+        cb(cur, "attn_post_norm", il);
+
+        cur = ggml_add(ctx0, cur, inpL); // residual connection
+        inpL = cur;
+
+        cur = build_ffn(cur,
+            layer.ff_up_w, layer.ff_up_b,
+            layer.ff_gate_w, layer.ff_gate_b,
+            layer.ff_down_w, layer.ff_down_b,
+            hparams.ffn_op, il);
+
+        cb(cur, "ffn_out", il);
+
+        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+        cb(cur, "ffn_post_norm", il);
+
+        cur = ggml_add(ctx0, cur, inpL); // residual connection
+        cb(cur, "layer_out", il);
+        inpL = cur;
+
+    }
+
+    // remove CLS token (like build_llama4 does)
+    ggml_tensor * cur = ggml_view_2d(ctx0, inpL,
+        n_embd, n_patches,
+        ggml_row_size(inpL->type, n_embd), 0); // CLS was concatenated last, so the first n_patches rows are the patches
+
+    // Multiply with mm_model_proj
+    cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
+
+    // Apply layernorm, weight, bias
+    cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
+
+    // Apply GELU
+    cur = ggml_gelu_inplace(ctx0, cur);
+
+    // Branch 1: multiply with mm_h_to_4h_w
+    ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur);
+
+    // Branch 2: multiply with mm_gate_w
+    ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur);
+
+    // Apply silu
+    gate = ggml_swiglu_split(ctx0, gate, h_to_4h); // combines both branches into one tensor
+
+    // Apply mm_4h_to_h_w
+    cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate);
+
+    // Concatenate with boi and eoi
+    cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
+    cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
+
+    // build the graph
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
--- a/tools/mtmd/models/internvl.cpp
+++ b/tools/mtmd/models/internvl.cpp
@ -0,0 +1,69 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_internvl::build() { // builds the InternVL graph: ViT with a CLS token, pixel shuffle, then a norm + GELU MLP projector
+    GGML_ASSERT(model.class_embedding != nullptr);
+    GGML_ASSERT(model.position_embeddings != nullptr);
+
+    const int n_pos = n_patches + 1; // +1 for the CLS token added below
+    ggml_tensor * inp = build_inp();
+
+    // add CLS token
+    inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+
+    // The larger models use a different ViT, which uses RMS norm instead of layer norm
+    // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
+    norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
+        ? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B)
+        : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models)
+
+    ggml_tensor * cur = build_vit(
+                            inp, n_pos,
+                            norm_t,
+                            hparams.ffn_op,
+                            model.position_embeddings,
+                            nullptr); // no add_pos callback
+
+    // remove CLS token
+    cur = ggml_view_2d(ctx0, cur,
+        n_embd, n_patches,
+        ggml_row_size(cur->type, n_embd), 0); // CLS was concatenated last, so the first n_patches rows are the patches
+
+    // pixel shuffle
+    {
+        const int scale_factor = model.hparams.n_merge;
+        const int bsz    = 1; // batch size, always 1 for now since we don't support batching
+        const int height = n_patches_y;
+        const int width  = n_patches_x;
+        GGML_ASSERT(scale_factor > 0); // scale_factor is used as a divisor below
+        cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz);
+        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+        cur = ggml_cont_4d(ctx0, cur,
+            n_embd * scale_factor * scale_factor,
+            height / scale_factor,
+            width / scale_factor,
+            bsz);
+        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+        // flatten to 2D
+        cur = ggml_cont_2d(ctx0, cur,
+            n_embd * scale_factor * scale_factor,
+            cur->ne[1] * cur->ne[2]);
+    }
+
+    // projector (always using GELU activation)
+    {
+        // projector LayerNorm uses pytorch's default eps = 1e-5
+        // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79
+        cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1);
+        cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr, // no gate weight or bias
+            model.mm_3_w, model.mm_3_b,
+            FFN_GELU,
+            -1);
+    }
+
+    // build the graph
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
--- a/tools/mtmd/models/kimivl.cpp
+++ b/tools/mtmd/models/kimivl.cpp
@ -0,0 +1,63 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_kimivl::build() { // builds the Kimi-VL graph: ViT with 2D RoPE, patch merger, then a norm + GELU MLP projector
+    // 2D input positions
+    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+    ggml_set_name(pos_h, "pos_h");
+    ggml_set_input(pos_h);
+
+    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+    ggml_set_name(pos_w, "pos_w");
+    ggml_set_input(pos_w);
+
+    ggml_tensor * learned_pos_embd = resize_position_embeddings();
+
+    // build ViT with 2D position embeddings
+    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+        // first half is X axis and second half is Y axis
+        return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
+    };
+
+    ggml_tensor * inp = build_inp();
+    ggml_tensor * cur = build_vit(
+                            inp, n_patches,
+                            NORM_TYPE_NORMAL,
+                            hparams.ffn_op,
+                            learned_pos_embd,
+                            add_pos);
+
+    cb(cur, "vit_out", -1);
+
+    {
+        // patch_merger
+        const int scale_factor = model.hparams.n_merge;
+        cur = build_patch_merge_permute(cur, scale_factor);
+
+        // projection norm
+        int proj_inp_dim = cur->ne[0]; // merged row width, restored after the norm
+        cur = ggml_view_2d(ctx0, cur,
+            n_embd, cur->ne[1] * scale_factor * scale_factor,
+            ggml_row_size(cur->type, n_embd), 0); // view as n_embd-wide rows so the norm runs on rows of n_embd
+        cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
+        cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
+        cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
+        cur = ggml_view_2d(ctx0, cur,
+            proj_inp_dim, cur->ne[1] / scale_factor / scale_factor,
+            ggml_row_size(cur->type, proj_inp_dim), 0); // back to the merged row width
+        cb(cur, "proj_inp_normed", -1);
+
+        // projection mlp
+        cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr, // no gate weight or bias
+            model.mm_2_w, model.mm_2_b,
+            FFN_GELU,
+            -1);
+        cb(cur, "proj_out", -1);
+    }
+
+    // build the graph
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
--- a/tools/mtmd/models/llama4.cpp
+++ b/tools/mtmd/models/llama4.cpp
@ -0,0 +1,96 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_llama4::build() { // builds the Llama 4 graph: unfold-conv patches + CLS, ViT with 2D RoPE, pixel shuffle, MLP adapter, projector
+    GGML_ASSERT(model.class_embedding != nullptr);
+    GGML_ASSERT(model.position_embeddings != nullptr);
+
+    const int n_pos = n_patches + 1; // +1 for [CLS]
+
+    // 2D input positions
+    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+    ggml_set_name(pos_h, "pos_h");
+    ggml_set_input(pos_h);
+
+    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+    ggml_set_name(pos_w, "pos_w");
+    ggml_set_input(pos_w);
+
+    ggml_tensor * inp = build_inp_raw();
+
+    // Llama4UnfoldConvolution
+    {
+        ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
+                                                patch_size, patch_size, 3, n_embd); // reshaped copy is only passed to im2col
+        inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type);
+        inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp); // the matmul uses the original (un-reshaped) weight
+        inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
+        cb(inp, "patch_conv", -1);
+    }
+
+    // add CLS token
+    inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+
+    // build ViT with 2D position embeddings
+    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+        // first half is X axis and second half is Y axis
+        // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312
+        // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441
+        return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
+    };
+    ggml_tensor * cur = build_vit(
+                            inp, n_pos,
+                            NORM_TYPE_NORMAL,
+                            hparams.ffn_op,
+                            model.position_embeddings,
+                            add_pos);
+
+    // remove CLS token
+    cur = ggml_view_2d(ctx0, cur,
+        n_embd, n_patches,
+        ggml_row_size(cur->type, n_embd), 0); // CLS was concatenated last, so the first n_patches rows are the patches
+
+    // pixel shuffle
+    // based on Llama4VisionPixelShuffleMLP
+    // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
+    {
+        const int scale_factor = model.hparams.n_merge;
+        const int bsz = 1; // batch size, always 1 for now since we don't support batching
+        GGML_ASSERT(scale_factor > 0); // scale_factor is used as a divisor below
+        GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
+        cur = ggml_reshape_4d(ctx0, cur,
+            n_embd * scale_factor,
+            n_patches_x / scale_factor,
+            n_patches_y,
+            bsz);
+        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+        cur = ggml_cont_4d(ctx0, cur,
+            n_embd * scale_factor * scale_factor,
+            n_patches_x / scale_factor,
+            n_patches_y / scale_factor,
+            bsz);
+        //cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+        // flatten to 2D
+        cur = ggml_cont_2d(ctx0, cur,
+            n_embd * scale_factor * scale_factor,
+            n_patches / scale_factor / scale_factor);
+        cb(cur, "pixel_shuffle", -1);
+    }
+
+    // based on Llama4VisionMLP2 (always uses GELU activation, no bias)
+    {
+        cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur);
+        cur = ggml_gelu(ctx0, cur);
+        cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
+        cur = ggml_gelu(ctx0, cur);
+        cb(cur, "adapter_mlp", -1);
+    }
+
+    // Llama4MultiModalProjector
+    cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
+    cb(cur, "projected", -1);
+
+    // build the graph
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
--- a/tools/mtmd/models/llava.cpp
+++ b/tools/mtmd/models/llava.cpp
@ -0,0 +1,374 @@
+#include "models.h"
+
+// builds the encoder graph plus the projector chosen by proj_type; this graph is used by llava, granite and glm
+// the layer loop is written out here because of embedding_stack (used by granite), so we cannot reuse build_vit
+ggml_cgraph * clip_graph_llava::build() {
+    const int batch_size = 1; // only referenced by the glm projector reshape below
+    const int n_pos = n_patches + (model.class_embedding ? 1 : 0); // one extra position when a class embedding is present
+
+    GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");
+
+    // Work out the exclusive upper bound of the layer loop below, based on hparams and projector type
+    int max_feature_layer = n_layer;
+    {
+        // Default bound is n_layer - 1, so the loop stops after the second to last layer; this is the default for models that have a llava projector
+        int il_last = hparams.n_layer - 1;
+        int deepest_feature_layer = -1;
+
+        if (proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+            il_last += 1; // bound becomes n_layer, so no layer is skipped for these projector types
+        }
+
+        // If we set explicit vision feature layers, only go up to the deepest one
+        // NOTE: only used by granite-vision models for now
+        for (const auto & feature_layer : hparams.vision_feature_layer) {
+            if (feature_layer > deepest_feature_layer) {
+                deepest_feature_layer = feature_layer;
+            }
+        }
+        max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer;
+    }
+
+    ggml_tensor * inp = build_inp();
+
+    // concat patch_embeddings and class_embedding (class embedding goes second, along dim 1)
+    if (model.class_embedding) {
+        inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+    }
+
+    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); // graph input: row indices into model.position_embeddings
+    ggml_set_name(positions, "positions");
+    ggml_set_input(positions);
+
+    inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions));
+
+    ggml_tensor * inpL = inp;
+
+    // pre-layernorm (only when the model provides pre_ln_w)
+    if (model.pre_ln_w) {
+        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
+        cb(inpL, "pre_ln", -1);
+    }
+
+    std::vector<ggml_tensor *> embedding_stack;
+    const auto & vision_feature_layer = hparams.vision_feature_layer;
+
+    // loop over layers, stopping before max_feature_layer
+    for (int il = 0; il < max_feature_layer; il++) {
+        auto & layer = model.layers[il];
+        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+        // If this is an embedding feature layer, save the input of this layer (i.e. the previous layer's output).
+        // NOTE: 0 index here refers to the input to the encoder.
+        if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
+            embedding_stack.push_back(cur);
+        }
+
+        // layernorm1
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+        cb(cur, "layer_inp_normed", il);
+
+        // self-attention
+        {
+            ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+            if (layer.q_b) {
+                Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+            }
+
+            ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+            if (layer.k_b) {
+                Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+            }
+
+            ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+            if (layer.v_b) {
+                Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+            }
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            cur = build_attn(layer.o_w, layer.o_b,
+                Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+
+        // re-add the layer input, i.e., residual
+        cur = ggml_add(ctx0, cur, inpL);
+
+        inpL = cur; // inpL = residual, cur = hidden_states
+
+        cb(cur, "ffn_inp", il);
+
+        // layernorm2
+        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+        cb(cur, "ffn_inp_normed", il);
+
+        // ffn
+        cur = build_ffn(cur,
+            layer.ff_up_w, layer.ff_up_b,
+            layer.ff_gate_w, layer.ff_gate_b,
+            layer.ff_down_w, layer.ff_down_b,
+            hparams.ffn_op, il);
+
+        cb(cur, "ffn_out", il);
+
+        // residual 2
+        cur = ggml_add(ctx0, inpL, cur);
+        cb(cur, "layer_out", il);
+
+        inpL = cur;
+    }
+
+    // post-layernorm (only when the model provides post_ln_w)
+    if (model.post_ln_w) {
+        inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1);
+    }
+
+    ggml_tensor * embeddings = inpL;
+
+    // process vision feature layers (used by granite)
+    {
+        // the output of the last executed layer is itself a requested feature layer
+        if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) {
+            embedding_stack.push_back(inpL);
+        }
+
+        // If feature layers are explicitly set, stack them along dim 0 (if we have multiple)
+        if (!embedding_stack.empty()) {
+            embeddings = embedding_stack[0];
+            for (size_t i = 1; i < embedding_stack.size(); i++) {
+                embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0);
+            }
+        }
+    }
+
+    // llava projector (also used by granite)
+    if (hparams.has_llava_projector) {
+        embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
+
+        ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); // graph input: indices of the rows of embeddings to keep
+        ggml_set_name(patches, "patches");
+        ggml_set_input(patches);
+
+        // e.g. shape [1, 576, 1024]
+        // ne is whcn, e.g. ne = [1024, 576, 1, 1]
+        embeddings = ggml_get_rows(ctx0, embeddings, patches);
+
+        // print_tensor_info(embeddings, "embeddings");
+
+        // pick the projector implementation by proj_type
+        if (proj_type == PROJECTOR_TYPE_MLP) {
+            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+
+            embeddings = ggml_gelu(ctx0, embeddings);
+            if (model.mm_2_w) {
+                embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
+                embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+            }
+        }
+        else if (proj_type == PROJECTOR_TYPE_MLP_NORM) {
+            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+            // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
+            // First LayerNorm
+            embeddings = ggml_norm(ctx0, embeddings, eps);
+            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
+                                model.mm_1_b);
+
+            // GELU activation
+            embeddings = ggml_gelu(ctx0, embeddings);
+
+            // Second linear layer
+            embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
+            embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
+
+            // Second LayerNorm
+            embeddings = ggml_norm(ctx0, embeddings, eps);
+            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
+                                model.mm_4_b);
+        }
+        else if (proj_type == PROJECTOR_TYPE_LDP) {
+            // MobileVLM projector
+            int n_patch = 24; // grid side is hard-coded (24*24 = 576 rows); NOTE(review): confirm this holds for every supported input size
+            ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
+            mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
+            mlp_1 = ggml_gelu(ctx0, mlp_1);
+            ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
+            mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
+            // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
+
+            // block 1
+            ggml_tensor * block_1 = nullptr;
+            {
+                // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
+                mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3);
+                mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
+                // stride = 1, padding = 1, bias is nullptr
+                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
+
+                // layer norm
+                // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
+                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
+                block_1 = ggml_norm(ctx0, block_1, eps);
+                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+
+                // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                // hardswish
+                ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
+
+                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
+                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                // pointwise conv
+                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
+                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
+                block_1 = ggml_relu(ctx0, block_1);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
+                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
+                block_1 = ggml_hardsigmoid(ctx0, block_1);
+                // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
+                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
+                block_1 = ggml_mul(ctx0, block_1_hw, block_1);
+
+                int w = block_1->ne[0], h = block_1->ne[1];
+                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
+
+                // block_1 shape = [1, 24*24, 2048], ne = [2048, 24*24, 1]
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
+                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
+
+                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
+                block_1 = ggml_norm(ctx0, block_1, eps);
+                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+                // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
+                // residual
+                block_1 = ggml_add(ctx0, mlp_3, block_1);
+            }
+
+            // block_2
+            {
+                // stride = 2
+                block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
+
+                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
+                // layer norm
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
+                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
+                block_1 = ggml_norm(ctx0, block_1, eps);
+                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
+                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
+                // hardswish
+                ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
+
+                // NOTE(review): kernel and stride both span the full w x h plane; not sure these parameters are right for global average pooling
+                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
+                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                // pointwise conv
+                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
+                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
+                block_1 = ggml_relu(ctx0, block_1);
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
+                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
+                block_1 = ggml_hardsigmoid(ctx0, block_1);
+
+                // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
+                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
+                block_1 = ggml_mul(ctx0, block_1_hw, block_1);
+
+                int w = block_1->ne[0], h = block_1->ne[1];
+                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
+                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
+                // block_1 shape = [1, 12*12, 2048], ne = [2048, 12*12, 1]
+                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
+                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
+
+
+                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
+                block_1 = ggml_norm(ctx0, block_1, eps);
+                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
+                block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
+                // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
+            }
+            embeddings = block_1;
+        }
+        else if (proj_type == PROJECTOR_TYPE_LDPV2)
+        {
+            int n_patch = 24; // grid side is hard-coded (24*24 = 576 rows); NOTE(review): confirm this holds for every supported input size
+            ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+            mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
+            mlp_0 = ggml_gelu(ctx0, mlp_0);
+            ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
+            mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
+            // mlp_2 ne = [2048, 576, 1, 1]
+            // AVG Pool Layer 2*2, strides = 2
+            mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3);
+            // mlp_2 ne = [576, 2048, 1, 1]
+            mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
+            // mlp_2 ne [24, 24, 2048, 1]
+            mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
+            // weight ne = [3, 3, 2048, 1]
+            ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+            peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
+            peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
+            mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3));
+            peg_0 = ggml_add(ctx0, peg_0, mlp_2); // residual: conv output plus the pooled input
+            peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
+            embeddings = peg_0;
+        }
+        else {
+            GGML_ABORT("fatal error");
+        }
+    }
+
+    // glm projector (taken only when has_llava_projector is false)
+    else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+        size_t gridsz = (size_t)sqrt(embeddings->ne[1]); // side of the token grid; presumably ne[1] is a perfect square — TODO confirm
+        embeddings = ggml_permute(ctx0,embeddings,1,0,2,3);
+        embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]);
+        embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1);
+        embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size);
+        embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3));
+        embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
+        // GLU
+        {
+            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+            embeddings = ggml_norm(ctx0, embeddings, eps);
+            embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
+            embeddings = ggml_gelu_inplace(ctx0, embeddings);
+            ggml_tensor * x = embeddings; // both branches below start from the same activated tensor
+            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
+            x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
+            embeddings = ggml_swiglu_split(ctx0, embeddings, x);
+            embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
+        }
+        // arrangement of BOI/EOI token embeddings
+        // note: these embeddings are not present in text model, hence we cannot process them as text tokens
+        // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53
+        {
+            embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI
+            embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI
+        }
+    }
+
+    else {
+        GGML_ABORT("llava: unknown projector type");
+    }
+
+    // build the graph
+    ggml_build_forward_expand(gf, embeddings);
+
+    return gf;
+}
--- a/tools/mtmd/models/minicpmv.cpp
+++ b/tools/mtmd/models/minicpmv.cpp
@ -0,0 +1,114 @@
+#include "models.h"
+
+// Builds the MiniCPM-V graph: the ViT encoder (through build_vit) followed by the
+// resampler projector, which attends from model.mm_model_query over the ViT output.
+ggml_cgraph * clip_graph_minicpmv::build() {
+    GGML_ASSERT(model.class_embedding == nullptr);
+
+    const int n_tokens = n_patches;     // number of ViT positions
+    const int proj_dim = n_mmproj_embd; // width of the resampler
+
+    // names a freshly created tensor and flags it as a graph input
+    const auto mark_input = [](ggml_tensor * t, const char * name) {
+        ggml_set_name(t, name);
+        ggml_set_input(t);
+        return t;
+    };
+
+    // inputs for the projector's position embeddings (not for ViT)
+    // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70
+    // base frequency omega
+    ggml_tensor * freq = mark_input(
+        ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, proj_dim / 4), "omega");
+
+    // 2D input positions, stored as floats because they feed sin/cos below
+    ggml_tensor * row_pos = mark_input(
+        ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_tokens), "pos_h");
+    ggml_tensor * col_pos = mark_input(
+        ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_tokens), "pos_w");
+
+    // row indices selecting the learned position embeddings used by the ViT
+    ggml_tensor * pos_idx = mark_input(
+        ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens), "positions");
+    ggml_tensor * vit_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, pos_idx);
+
+    // ViT encoder
+    ggml_tensor * vit_in = build_inp();
+    ggml_tensor * cur    = build_vit(vit_in, n_tokens, NORM_TYPE_NORMAL, hparams.ffn_op, vit_pos_embd, nullptr);
+
+    // resampler projector (it is just another transformer)
+    ggml_tensor * kv_in = ggml_mul_mat(ctx0, model.mm_model_kv_proj, cur);
+
+    // norm
+    ggml_tensor * query = build_norm(model.mm_model_query, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
+    ggml_tensor * value = build_norm(kv_in, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1);
+
+    // sinusoidal position embedding: outer product of positions and frequencies,
+    // then [sin(x), cos(x), sin(y), cos(y)] concatenated on the first dim
+    ggml_tensor * freq_rows = ggml_repeat_4d(ctx0, freq, freq->ne[0], n_tokens, 1, 1); // n_tokens rows
+    ggml_tensor * angle_x   = ggml_mul(ctx0, freq_rows, col_pos);
+    ggml_tensor * angle_y   = ggml_mul(ctx0, freq_rows, row_pos);
+
+    ggml_tensor * sin_x = ggml_sin(ctx0, angle_x);
+    ggml_tensor * cos_x = ggml_cos(ctx0, angle_x);
+    ggml_tensor * enc_x = ggml_concat(ctx0, sin_x, cos_x, 0);
+
+    ggml_tensor * sin_y = ggml_sin(ctx0, angle_y);
+    ggml_tensor * cos_y = ggml_cos(ctx0, angle_y);
+    ggml_tensor * enc_y = ggml_concat(ctx0, sin_y, cos_y, 0);
+
+    ggml_tensor * sincos_pos = ggml_concat(ctx0, enc_x, enc_y, 0);
+
+    // key = value + position embedding
+    ggml_tensor * key = ggml_add(ctx0, value, sincos_pos);
+
+    // attention
+    const int   head_dim   = 128;
+    const int   head_count = proj_dim / head_dim;
+    const int   n_query    = hparams.minicpmv_query_num; // number of resampler queries, taken from hparams
+    const float attn_scale = 1.0f/ sqrtf(float(head_dim));
+
+    ggml_tensor * Q = ggml_mul_mat(ctx0, model.mm_model_attn_q_w, query);
+    Q = ggml_add(ctx0, Q, model.mm_model_attn_q_b);
+
+    ggml_tensor * K = ggml_mul_mat(ctx0, model.mm_model_attn_k_w, key);
+    K = ggml_add(ctx0, K, model.mm_model_attn_k_b);
+
+    ggml_tensor * V = ggml_mul_mat(ctx0, model.mm_model_attn_v_w, value);
+    V = ggml_add(ctx0, V, model.mm_model_attn_v_b);
+
+    Q = ggml_reshape_3d(ctx0, Q, head_dim, head_count, n_query);
+    K = ggml_reshape_3d(ctx0, K, head_dim, head_count, n_tokens);
+    V = ggml_reshape_3d(ctx0, V, head_dim, head_count, n_tokens);
+
+    cb(Q, "resampler_Q", -1);
+    cb(K, "resampler_K", -1);
+    cb(V, "resampler_V", -1);
+
+    cur = build_attn(
+        model.mm_model_attn_o_w,
+        model.mm_model_attn_o_b,
+        Q, K, V, nullptr, attn_scale, -1);
+    cb(cur, "resampler_attn_out", -1);
+
+    // layernorm
+    cur = build_norm(cur, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1);
+
+    // projection
+    cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
+
+    // build the graph
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@ -0,0 +1,58 @@
+#pragma once
+
+#include "../clip-graph.h"
+
+// clip_graph subclass for "siglip": hands ctx/img to the base constructor and
+// overrides build(), which is only declared in this header.
+struct clip_graph_siglip : public clip_graph {
+    clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img)
+        : clip_graph(ctx, img) {}
+
+    ggml_cgraph * build() override;
+};
+
+// clip_graph subclass for "pixtral": hands ctx/img to the base constructor and
+// overrides build(), which is only declared in this header.
+struct clip_graph_pixtral : public clip_graph {
+    clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img)
+        : clip_graph(ctx, img) {}
+
+    ggml_cgraph * build() override;
+};
+
+// clip_graph subclass for "qwen2vl": hands ctx/img to the base constructor and
+// overrides build(), which is only declared in this header.
+struct clip_graph_qwen2vl : public clip_graph {
+    clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img)
+        : clip_graph(ctx, img) {}
+
+    ggml_cgraph * build() override;
+};
+
+// clip_graph subclass for "qwen3vl": hands ctx/img to the base constructor and
+// overrides build(), which is only declared in this header.
+struct clip_graph_qwen3vl : public clip_graph {
+    clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img)
+        : clip_graph(ctx, img) {}
+
+    ggml_cgraph * build() override;
+};
+
+// clip_graph subclass for "minicpmv": hands ctx/img to the base constructor and
+// overrides build(), which is only declared in this header.
+struct clip_graph_minicpmv : public clip_graph {
+    clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img)
+        : clip_graph(ctx, img) {}
+
+    ggml_cgraph * build() override;
+};
+
+// clip_graph subclass for "internvl": hands ctx/img to the base constructor and
+// overrides build(), which is only declared in this header.
+struct clip_graph_internvl : public clip_graph {
+    clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img)
+        : clip_graph(ctx, img) {}
+
+    ggml_cgraph * build() override;
+};
+
+// clip_graph subclass for "llama4": hands ctx/img to the base constructor and
+// overrides build(), which is only declared in this header.
+struct clip_graph_llama4 : public clip_graph {
+    clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img)
+        : clip_graph(ctx, img) {}
+
+    ggml_cgraph * build() override;
+};
+
+// clip_graph subclass for "kimivl": hands ctx/img to the base constructor and
+// overrides build(), which is only declared in this header.
+struct clip_graph_kimivl : public clip_graph {
+    clip_graph_kimivl(clip_ctx * ctx, const clip_image_f32 & img)
+        : clip_graph(ctx, img) {}
+
+    ggml_cgraph * build() override;
+};
+
+// clip_graph subclass for "cogvlm": hands ctx/img to the base constructor and
+// overrides build(), which is only declared in this header.
+struct clip_graph_cogvlm : public clip_graph {
+    clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img)
+        : clip_graph(ctx, img) {}
+
+    ggml_cgraph * build() override;
+};
+
+// clip_graph subclass for "llava": hands ctx/img to the base constructor and
+// overrides build(), which is only declared in this header.
+struct clip_graph_llava : public clip_graph {
+    clip_graph_llava(clip_ctx * ctx, const clip_image_f32 & img)
+        : clip_graph(ctx, img) {}
+
+    ggml_cgraph * build() override;
+};
+
+// clip_graph subclass for "whisper_enc": hands ctx/img to the base constructor and
+// overrides build(), which is only declared in this header.
+struct clip_graph_whisper_enc : public clip_graph {
+    clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img)
+        : clip_graph(ctx, img) {}
+
+    ggml_cgraph * build() override;
+};
--- a/tools/mtmd/models/pixtral.cpp
+++ b/tools/mtmd/models/pixtral.cpp
@ -0,0 +1,86 @@
+#include "models.h"
+
+// Builds the Pixtral vision graph: ViT with 2D RoPE, the optional patch merger
+// (present when model.mm_patch_merger_w is set), the GELU MLP projector, and the
+// optional [IMG_BREAK] row separators (when model.token_embd_img_break is set).
+// Returns gf after expanding it with the final tensor.
+ggml_cgraph * clip_graph_pixtral::build() {
+    const int n_merge = hparams.n_merge;
+
+    // 2D input positions
+    ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+    ggml_set_name(pos_h, "pos_h");
+    ggml_set_input(pos_h);
+
+    ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
+    ggml_set_name(pos_w, "pos_w");
+    ggml_set_input(pos_w);
+
+    // position hook handed to build_vit: applies 2D RoPE using pos_h/pos_w
+    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+        return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true);
+    };
+
+    ggml_tensor * inp = build_inp();
+    ggml_tensor * cur = build_vit(
+                            inp, n_patches,
+                            NORM_TYPE_RMS,
+                            hparams.ffn_op,
+                            nullptr, // no learned pos embd
+                            add_pos);
+
+    // mistral small 3.1 patch merger
+    // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67
+    if (model.mm_patch_merger_w) {
+        GGML_ASSERT(n_merge > 0);
+
+        cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w);
+
+        // reshape image tokens to 2D grid
+        cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y);
+        cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd]
+        cur = ggml_cont(ctx0, cur);
+
+        // torch.nn.functional.unfold is just an im2col under the hood
+        // we just need a dummy kernel to make it work
+        ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0);
+        cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type);
+
+        // project to n_embd
+        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
+        cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur);
+    }
+
+    // LlavaMultiModalProjector (always using GELU activation)
+    {
+        cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr,
+            model.mm_2_w, model.mm_2_b,
+            FFN_GELU,
+            -1);
+    }
+
+    // arrangement of the [IMG_BREAK] token
+    if (model.token_embd_img_break) {
+        // not efficient, but works
+        // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows]
+        // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
+        // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows]
+
+        const int p_y             = n_merge > 0 ? n_patches_y / n_merge : n_patches_y;
+        const int p_x             = n_merge > 0 ? n_patches_x / n_merge : n_patches_x;
+        const int p_total         = p_x * p_y;
+        const int n_embd_text     = cur->ne[0];
+        const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row
+
+        ggml_tensor * tmp = ggml_reshape_3d(ctx0, cur, n_embd_text, p_x, p_y);
+
+        // One [IMG_BREAK] embedding per row, produced by broadcasting the embedding itself.
+        // The previous approach scaled a never-written tensor by 0.0 and added the embedding;
+        // 0 * NaN and 0 * Inf are NaN, so stale buffer contents could leak into the output.
+        ggml_tensor * brk = model.token_embd_img_break;
+        if (brk->type != tmp->type) {
+            // keep the element type the old scale+add path produced (that of tmp)
+            brk = ggml_cast(ctx0, brk, tmp->type);
+        }
+        ggml_tensor * tok = ggml_repeat_4d(ctx0, brk, n_embd_text, 1, p_y, 1);
+
+        tmp = ggml_concat(ctx0, tmp, tok, 1);
+        // flatten back to 2D; the view stops one token early to drop the break after the last row
+        cur = ggml_view_2d(ctx0, tmp,
+            n_embd_text, n_tokens_output,
+            ggml_row_size(tmp->type, n_embd_text), 0);
+    }
+
+    // build the graph
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
--- a/tools/mtmd/models/qwen2vl.cpp
+++ b/tools/mtmd/models/qwen2vl.cpp
@ -0,0 +1,183 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_qwen2vl::build() { // vision graph shared by Qwen2-VL and Qwen2.5-VL (see norm_t)
+    GGML_ASSERT(model.patch_bias == nullptr);
+    GGML_ASSERT(model.class_embedding == nullptr);
+
+    const int batch_size       = 1;
+    const bool use_window_attn = hparams.n_wa_pattern > 0;
+    const int n_wa_pattern     = hparams.n_wa_pattern;
+    const int n_pos            = n_patches;
+    const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
+
+    norm_type norm_t = proj_type == PROJECTOR_TYPE_QWEN25VL
+        ? NORM_TYPE_RMS // qwen 2.5 vl
+        : NORM_TYPE_NORMAL; // qwen 2 vl
+
+    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
+
+    ggml_tensor * inp_raw = build_inp_raw();
+    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+
+    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
+    GGML_ASSERT(img.ny % (patch_size * 2) == 0);
+
+    // second patch-embedding conv: its output is summed with the first one, then the result is rearranged
+    {
+        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+        inp = ggml_add(ctx0, inp, inp_1);
+
+        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3);  // [w, h, c, b] -> [c, w, h, b]
+        inp = ggml_cont_4d(
+            ctx0, inp,
+            n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
+        inp = ggml_reshape_4d(
+            ctx0, inp,
+            n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
+        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
+        inp = ggml_cont_3d(
+            ctx0, inp,
+            n_embd, n_patches_x * n_patches_y, batch_size);
+    }
+
+    ggml_tensor * inpL           = inp;
+    ggml_tensor * window_mask    = nullptr;
+    ggml_tensor * window_idx     = nullptr;
+    ggml_tensor * inv_window_idx = nullptr;
+
+    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
+    ggml_set_name(positions, "positions");
+    ggml_set_input(positions);
+
+    // pre-layernorm
+    if (model.pre_ln_w) {
+        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
+    }
+
+    if (use_window_attn) {
+        // graph input: one row index per group of 4 patches, applied below with ggml_get_rows
+        inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
+        ggml_set_name(inv_window_idx, "inv_window_idx");
+        ggml_set_input(inv_window_idx);
+        // mask for window attention
+        window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
+        ggml_set_name(window_mask, "window_mask");
+        ggml_set_input(window_mask);
+
+        // if flash attn is used, the mask is cast to f16 (no padding is applied at this point)
+        if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
+            window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
+        }
+
+        // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
+        GGML_ASSERT(batch_size == 1);
+        inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
+        inpL = ggml_get_rows(ctx0, inpL, inv_window_idx);
+        inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size);
+    }
+
+    // loop over layers
+    for (int il = 0; il < n_layer; il++) {
+        const auto & layer = model.layers[il];
+        const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true; // every n_wa_pattern-th layer runs without the window mask
+
+        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+        // layernorm1
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
+        cb(cur, "ln1", il);
+
+        // self-attention
+        {
+            ggml_tensor * Qcur = ggml_add(ctx0,
+                ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
+            ggml_tensor * Kcur = ggml_add(ctx0,
+                ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
+            ggml_tensor * Vcur = ggml_add(ctx0,
+                ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
+
+            Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches);
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            // apply M-RoPE to Q and K only (V is left unrotated)
+            Qcur = ggml_rope_multi(
+                ctx0, Qcur, positions, nullptr,
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+            Kcur = ggml_rope_multi(
+                ctx0, Kcur, positions, nullptr,
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+
+            cb(Qcur, "Qcur_rope", il);
+            cb(Kcur, "Kcur_rope", il);
+
+            ggml_tensor * attn_mask = full_attn ? nullptr : window_mask;
+
+            cur = build_attn(layer.o_w, layer.o_b,
+                Qcur, Kcur, Vcur, attn_mask, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+
+        // re-add the layer input, e.g., residual
+        cur = ggml_add(ctx0, cur, inpL);
+
+        inpL = cur; // inpL = residual, cur = hidden_states
+
+        cb(cur, "ffn_inp", il);
+
+        // layernorm2
+        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
+        cb(cur, "ffn_inp_normed", il);
+
+        // ffn
+        cur = build_ffn(cur,
+            layer.ff_up_w, layer.ff_up_b,
+            layer.ff_gate_w, layer.ff_gate_b,
+            layer.ff_down_w, layer.ff_down_b,
+            hparams.ffn_op, il);
+
+        cb(cur, "ffn_out", il);
+
+        // residual 2
+        cur = ggml_add(ctx0, inpL, cur);
+        cb(cur, "layer_out", il);
+
+        inpL = cur;
+    }
+
+    // post-layernorm
+    if (model.post_ln_w) {
+        inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
+    }
+
+    // multimodal projection: 4 consecutive patch embeddings are viewed as one row before the 2-layer GELU MLP
+    ggml_tensor * embeddings = inpL;
+    embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
+    embeddings = build_ffn(embeddings,
+                        model.mm_0_w, model.mm_0_b,
+                        nullptr, nullptr,
+                        model.mm_1_w, model.mm_1_b,
+                        FFN_GELU,
+                        -1);
+
+    if (use_window_attn) {
+        window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4);
+        ggml_set_name(window_idx, "window_idx");
+        ggml_set_input(window_idx);
+
+        // projected rows are re-selected by window_idx; presumably this undoes the inv_window_idx reordering (confirm against the caller that fills both inputs)
+        GGML_ASSERT(batch_size == 1);
+        embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4);
+        embeddings = ggml_get_rows(ctx0, embeddings, window_idx);
+        embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size);
+    }
+
+    // build the graph
+    ggml_build_forward_expand(gf, embeddings);
+
+    return gf;
+}
--- a/tools/mtmd/models/qwen3vl.cpp
+++ b/tools/mtmd/models/qwen3vl.cpp
@ -0,0 +1,191 @@
+#include "models.h"
+
+// Builds the Qwen3-VL vision graph: two-conv patch embedding, resized learned position embeddings,
+// M-RoPE transformer layers, then the projector output concatenated with the deepstack features.
+ggml_cgraph * clip_graph_qwen3vl::build() {
+    GGML_ASSERT(model.patch_bias != nullptr);
+    GGML_ASSERT(model.position_embeddings != nullptr);
+    GGML_ASSERT(model.class_embedding == nullptr);
+
+    const int batch_size       = 1;
+    const int n_pos            = n_patches;
+    const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
+
+    norm_type norm_t = NORM_TYPE_NORMAL;
+    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
+
+    ggml_tensor * inp_raw = build_inp_raw();
+    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+
+    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
+    GGML_ASSERT(img.ny % (patch_size * 2) == 0);
+
+    // second conv dimension
+    {
+        auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
+        inp = ggml_add(ctx0, inp, inp_1);
+
+        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3);  // [w, h, c, b] -> [c, w, h, b]
+        inp = ggml_cont_4d(
+            ctx0, inp,
+            n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
+        inp = ggml_reshape_4d(
+            ctx0, inp,
+            n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
+        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
+        inp = ggml_cont_3d(
+            ctx0, inp,
+            n_embd, n_patches_x * n_patches_y, batch_size);
+    }
+
+    // add patch bias (its presence is asserted at the top of this function)
+    inp = ggml_add(ctx0, inp, model.patch_bias);
+    cb(inp, "patch_bias", -1);
+
+    // calculate absolute position embedding and apply (same rearrangement as the patch embeddings above)
+    ggml_tensor * learned_pos_embd = resize_position_embeddings();
+    learned_pos_embd = ggml_cont_4d(
+        ctx0, learned_pos_embd,
+        n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
+    learned_pos_embd = ggml_reshape_4d(
+        ctx0, learned_pos_embd,
+        n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
+    learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
+    learned_pos_embd = ggml_cont_3d(
+        ctx0, learned_pos_embd,
+        n_embd, n_patches_x * n_patches_y, batch_size);
+    inp = ggml_add(ctx0, inp, learned_pos_embd);
+    cb(inp, "inp_pos_emb", -1);
+
+    ggml_tensor * inpL = inp;
+
+    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
+    ggml_set_name(positions, "positions");
+    ggml_set_input(positions);
+
+    // pre-layernorm
+    if (model.pre_ln_w) {
+        inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1);
+    }
+
+    // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size]
+    ggml_tensor * deepstack_features = nullptr;
+    const int merge_factor = hparams.n_merge > 0 ? hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl
+
+    // loop over layers
+    for (int il = 0; il < n_layer; il++) {
+        auto & layer = model.layers[il];
+
+        ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+        // layernorm1
+        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il);
+        cb(cur, "ln1", il);
+
+        // self-attention
+        {
+            cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
+            cur = ggml_add(ctx0, cur, layer.qkv_b);
+
+            ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                    /* nb1    */ ggml_row_size(cur->type, d_head),
+                    /* nb2    */ cur->nb[1],
+                    /* offset */ 0);
+
+            ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                    /* nb1    */ ggml_row_size(cur->type, d_head),
+                    /* nb2    */ cur->nb[1],
+                    /* offset */ ggml_row_size(cur->type, n_embd));
+
+            ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
+                    /* nb1    */ ggml_row_size(cur->type, d_head),
+                    /* nb2    */ cur->nb[1],
+                    /* offset */ ggml_row_size(cur->type, 2 * n_embd));
+
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+            cb(Vcur, "Vcur", il);
+
+            // apply M-RoPE
+            Qcur = ggml_rope_multi(
+                ctx0, Qcur, positions, nullptr,
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+            Kcur = ggml_rope_multi(
+                ctx0, Kcur, positions, nullptr,
+                d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1);
+
+            cb(Qcur, "Qcur_rope", il);
+            cb(Kcur, "Kcur_rope", il);
+
+            cur = build_attn(layer.o_w, layer.o_b,
+                Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
+        }
+
+        // re-add the layer input, e.g., residual
+        cur = ggml_add(ctx0, cur, inpL);
+
+        inpL = cur; // inpL = residual, cur = hidden_states
+
+        cb(cur, "ffn_inp", il);
+
+        // layernorm2
+        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il);
+        cb(cur, "ffn_inp_normed", il);
+
+        // ffn
+        cur = build_ffn(cur,
+            layer.ff_up_w, layer.ff_up_b,
+            layer.ff_gate_w, layer.ff_gate_b,
+            layer.ff_down_w, layer.ff_down_b,
+            hparams.ffn_op, il);
+
+        cb(cur, "ffn_out", il);
+
+        // residual 2
+        cur = ggml_add(ctx0, inpL, cur);
+        cb(cur, "layer_out", il);
+
+        if (layer.has_deepstack()) {
+            ggml_tensor * feat = ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size);
+            feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, norm_t, eps, il);
+            feat = build_ffn(feat,
+                layer.deepstack_fc1_w, layer.deepstack_fc1_b,
+                nullptr, nullptr,
+                layer.deepstack_fc2_w, layer.deepstack_fc2_b,
+                ffn_op_type::FFN_GELU, il);
+
+            if(!deepstack_features) {
+                deepstack_features = feat;
+            } else {
+                // concat along the feature dimension
+                deepstack_features = ggml_concat(ctx0, deepstack_features, feat, 0);
+            }
+        }
+
+        inpL = cur;
+    }
+
+    // post-layernorm
+    if (model.post_ln_w) {
+        inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
+    }
+
+    // multimodal projection; uses the same merge_factor as the deepstack branch so both have matching token counts
+    ggml_tensor * embeddings = inpL;
+    embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * merge_factor, n_pos / merge_factor, batch_size);
+    embeddings = build_ffn(embeddings,
+        model.mm_0_w, model.mm_0_b,
+        nullptr, nullptr,
+        model.mm_1_w, model.mm_1_b,
+        ffn_op_type::FFN_GELU, -1);
+
+    if (deepstack_features) { // stays null when no layer has deepstack weights; never hand a null tensor to ggml_concat
+        embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0); // concat along the feature dimension
+    }
+
+    // build the graph
+    ggml_build_forward_expand(gf, embeddings);
+
+    return gf;
+}
--- a/tools/mtmd/models/siglip.cpp
+++ b/tools/mtmd/models/siglip.cpp
@ -0,0 +1,81 @@
+#include "models.h"
+
+// Builds the SigLIP-based vision graph: the shared ViT trunk followed by the projector selected by
+// proj_type (Gemma3, Idefics3, LFM2 or Janus-Pro); any other projector type aborts.
+ggml_cgraph * clip_graph_siglip::build() {
+    ggml_tensor * inp = build_inp();
+
+    ggml_tensor * learned_pos_embd = model.position_embeddings;
+    if (proj_type == PROJECTOR_TYPE_LFM2) {
+        learned_pos_embd = resize_position_embeddings();
+    }
+
+    ggml_tensor * cur = build_vit(
+                            inp, n_patches,
+                            NORM_TYPE_NORMAL,
+                            hparams.ffn_op,
+                            learned_pos_embd,
+                            nullptr);
+
+    if (proj_type == PROJECTOR_TYPE_GEMMA3) {
+        const int batch_size = 1;
+        GGML_ASSERT(n_patches_x == n_patches_y);
+        const int patches_per_image = n_patches_x;
+        const int kernel_size = hparams.n_merge;
+
+        cur = ggml_transpose(ctx0, cur);
+        cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size);
+
+        // doing a pool2d to reduce the number of output tokens
+        cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0);
+        // flatten the pooled grid into ne[0] * ne[1] tokens (the real count, not ne[0] squared)
+        cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[1], n_embd, batch_size);
+        cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+
+        // apply norm before projection
+        cur = ggml_rms_norm(ctx0, cur, eps);
+        cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
+
+        // apply projection
+        cur = ggml_mul_mat(ctx0,
+            ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)),
+            cur);
+    } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
+        // pixel_shuffle
+        // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
+        const int scale_factor = model.hparams.n_merge;
+        cur = build_patch_merge_permute(cur, scale_factor);
+        cur = ggml_mul_mat(ctx0, model.projection, cur);
+    } else if (proj_type == PROJECTOR_TYPE_LFM2) {
+        // pixel unshuffle block
+        const int scale_factor = model.hparams.n_merge;
+        cur = build_patch_merge_permute(cur, scale_factor);
+
+        // projection
+        cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm
+        cur = ggml_mul(ctx0, cur, model.mm_input_norm_w);
+        cur = ggml_add(ctx0, cur, model.mm_input_norm_b);
+
+        cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr,
+            model.mm_2_w, model.mm_2_b,
+            FFN_GELU,
+            -1);
+    } else if (proj_type == PROJECTOR_TYPE_JANUS_PRO) {
+        // ungated two-layer projector using the encoder's own activation (hparams.ffn_op)
+        cur = build_ffn(cur,
+            model.mm_0_w, model.mm_0_b,
+            nullptr, nullptr,
+            model.mm_1_w, model.mm_1_b,
+            hparams.ffn_op,
+            -1);
+    } else {
+        GGML_ABORT("SigLIP: Unsupported projector type");
+    }
+
+    // build the graph
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
--- a/tools/mtmd/models/whisper-enc.cpp
+++ b/tools/mtmd/models/whisper-enc.cpp
@ -0,0 +1,106 @@
+#include "models.h"
+
+// Builds the Whisper-style audio encoder graph: two conv1d + GELU layers, the ViT trunk fed with the
+// first n_pos position embeddings, optional frame stacking, then the projector selected by proj_type.
+ggml_cgraph * clip_graph_whisper_enc::build() {
+    const int n_frames = img.nx;
+    const int n_pos    = n_frames / 2;
+    GGML_ASSERT(model.position_embeddings && model.position_embeddings->ne[1] >= n_pos); // check the pointer before reading its shape
+
+    ggml_tensor * inp = build_inp_raw(1);
+
+    // conv1d block
+    {
+        // convolution + gelu
+        ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1);
+        cur = ggml_add(ctx0, cur, model.conv1d_1_b);
+
+        cur = ggml_gelu_erf(ctx0, cur);
+
+        cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1);
+        cur = ggml_add(ctx0, cur, model.conv1d_2_b);
+
+        cur = ggml_gelu_erf(ctx0, cur);
+        // transpose
+        inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+        cb(inp, "after_conv1d", -1);
+    }
+
+    // sanity check (only check one layer, but it should be the same for all)
+    GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b);
+    GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b);
+    GGML_ASSERT(model.layers[0].q_b);
+    GGML_ASSERT(model.layers[0].v_b);
+    GGML_ASSERT(!model.layers[0].k_b); // no bias for k
+
+    ggml_tensor * pos_embd_selected = ggml_view_2d(
+        ctx0, model.position_embeddings,
+        model.position_embeddings->ne[0], n_pos,
+        model.position_embeddings->nb[1], 0
+    );
+    ggml_tensor * cur = build_vit(
+                            inp, n_pos,
+                            NORM_TYPE_NORMAL,
+                            hparams.ffn_op,
+                            pos_embd_selected,
+                            nullptr);
+
+    cb(cur, "after_transformer", -1);
+
+    if (model.audio_has_stack_frames()) {
+        // StackAudioFrames
+        // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
+        cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
+        cb(cur, "after_stacked", -1);
+    }
+
+    if (proj_type == PROJECTOR_TYPE_ULTRAVOX) {
+        // UltravoxProjector
+        // pre-norm
+        cur = ggml_rms_norm(ctx0, cur, 1e-6);
+        cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
+
+        // ffn in
+        cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
+
+        // swiglu
+        // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
+        cur = ggml_swiglu_swapped(ctx0, cur);
+
+        // mid-norm
+        cur = ggml_rms_norm(ctx0, cur, 1e-6);
+        cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
+
+        // ffn out
+        cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+    } else if (proj_type == PROJECTOR_TYPE_QWEN2A) {
+        // projector
+        cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
+        cur = ggml_add(ctx0, cur, model.mm_fc_b);
+    } else if (proj_type == PROJECTOR_TYPE_VOXTRAL) {
+        // projector
+        cur = build_ffn(cur,
+            model.mm_1_w, model.mm_1_b,
+            nullptr, nullptr,
+            model.mm_2_w, model.mm_2_b,
+            FFN_GELU_ERF,
+            -1);
+    } else if (proj_type == PROJECTOR_TYPE_GLMA) {
+        // pre-norm, frame stacking, MLP, then prepend model.mm_boi and append model.mm_eoi along dim 1
+        cur = ggml_norm(ctx0, cur, hparams.eps);
+        cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
+        cur = ggml_add(ctx0, cur, model.mm_norm_pre_b);
+        cur = build_stack(cur, hparams.proj_stack_factor, n_embd);
+        cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_2_w, model.mm_2_b, hparams.ffn_op, 0);
+        cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
+        cur = ggml_concat(ctx0, cur, model.mm_eoi, 1);
+    } else {
+        GGML_ABORT("%s: unknown projector type", __func__);
+    }
+
+    cb(cur, "projected", -1);
+
+    ggml_build_forward_expand(gf, cur);
+
+    return gf;
+}
--- a/tools/mtmd/mtmd-audio.h
+++ b/tools/mtmd/mtmd-audio.h
@ -6,6 +6,8 @@
 #include <vector>
 #include <string>

+#define MTMD_INTERNAL_HEADER
+
 #define WHISPER_ASSERT GGML_ASSERT

 #define WHISPER_SAMPLE_RATE 16000
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@ -65,7 +65,7 @@ static void sigint_handler(int signo) {

 struct mtmd_cli_context {
    mtmd::context_ptr ctx_vision;
-    common_init_result llama_init;
+    common_init_result_ptr llama_init;

    llama_model       * model;
    llama_context     * lctx;
@ -89,8 +89,8 @@ struct mtmd_cli_context {
    llama_pos n_past = 0;

    mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) {
-        model = llama_init.model.get();
-        lctx = llama_init.context.get();
+        model = llama_init->model();
+        lctx = llama_init->context();
        vocab = llama_model_get_vocab(model);
        smpl = common_sampler_init(model, params.sampling);
        n_threads = params.cpuparams.n_threads;
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@ -32,6 +32,10 @@
 // #define STB_IMAGE_IMPLEMENTATION
 #include "stb/stb_image.h"

+#ifdef MTMD_INTERNAL_HEADER
+#error "mtmd-helper is a public library outside of mtmd. it must not include internal headers"
+#endif
+
 //
 // internal logging functions
 //
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@ -22,6 +22,11 @@
 *          Issues related to API usage may receive lower priority support.
 *
 * For the usage, see an example in mtmd-cli.cpp
+ *
+ * For contributors:
+ * - Make sure the C API is aligned with the libllama C API (as in llama.h)
+ * - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
+ * - Keep the API minimal, do not expose internal details unless necessary
 */

 #ifdef LLAMA_SHARED
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@ -153,7 +153,7 @@ struct server_slot {
    // sampling
    json json_schema;

-    struct common_sampler * smpl = nullptr;
+    common_sampler_ptr smpl;

    llama_token sampled; // in speculative mode, this is the last accepted token
    llama_tokens drafted;
@ -510,8 +510,8 @@ struct server_context_impl {
    common_params params_base;

    // note: keep these alive - they determine the lifetime of the model, context, etc.
-    common_init_result llama_init;
-    common_init_result llama_init_dft;
+    common_init_result_ptr llama_init;
+    common_init_result_ptr llama_init_dft;

    llama_model * model = nullptr;
    llama_context * ctx = nullptr;
@ -557,9 +557,6 @@ struct server_context_impl {

        // Clear any sampling context
        for (server_slot & slot : slots) {
-            common_sampler_free(slot.smpl);
-            slot.smpl = nullptr;
-
            llama_free(slot.ctx_dft);
            slot.ctx_dft = nullptr;

@ -580,8 +577,8 @@ struct server_context_impl {

        llama_init = common_init_from_params(params_base);

-        model = llama_init.model.get();
-        ctx   = llama_init.context.get();
+        model = llama_init->model();
+        ctx   = llama_init->context();

        if (model == nullptr) {
            SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str());
@ -613,25 +610,25 @@ struct server_context_impl {

            llama_init_dft = common_init_from_params(params_dft);

-            model_dft = llama_init_dft.model.get();
+            model_dft = llama_init_dft->model();

            if (model_dft == nullptr) {
                SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str());
                return false;
            }

-            vocab_dft_compatible = common_speculative_are_compatible(ctx, llama_init_dft.context.get());
+            vocab_dft_compatible = common_speculative_are_compatible(ctx, llama_init_dft->context());
            if (!vocab_dft_compatible) {
                SRV_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str());
            }

-            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get());
+            const int n_ctx_dft = llama_n_ctx(llama_init_dft->context());

            cparams_dft = common_context_params_to_llama(params_dft);
            cparams_dft.n_batch = n_ctx_dft;

            // the context is not needed - we will create one for each slot
-            llama_init_dft.context.reset();
+            llama_init_dft->free_context();
        }

        chat_templates = common_chat_templates_init(model, params_base.chat_template);
@ -1051,18 +1048,15 @@ struct server_context_impl {

        // initialize samplers
        {
-            if (slot.smpl != nullptr) {
-                common_sampler_free(slot.smpl);
-            }
+            slot.smpl.reset(common_sampler_init(model, task.params.sampling));

-            slot.smpl = common_sampler_init(model, task.params.sampling);
            if (slot.smpl == nullptr) {
                // for now, the only error that may happen here is invalid grammar
                send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
                return false;
            }

-            SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl).c_str());
+            SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str());
        }

        // initialize draft batch
@ -1216,11 +1210,10 @@ struct server_context_impl {
    }

    void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) const {
-        size_t n_probs = slot.task->params.sampling.n_probs;
-        size_t n_vocab = llama_vocab_n_tokens(vocab);
+        const size_t n_probs = slot.task->params.sampling.n_probs;

        if (post_sampling) {
-            const auto * cur_p = common_sampler_get_candidates(slot.smpl, true);
+            const auto * cur_p = common_sampler_get_candidates(slot.smpl.get(), true);
            const size_t max_probs = cur_p->size;

            // set probability for sampled token
@ -1245,7 +1238,7 @@ struct server_context_impl {
            std::vector<llama_token_data> cur = get_token_probabilities(ctx, idx);

            // set probability for sampled token
-            for (size_t i = 0; i < n_vocab; i++) {
+            for (size_t i = 0; i < cur.size(); i++) {
                // set probability for sampled token
                if (cur[i].id == result.tok) {
                    result.prob = cur[i].p;
@ -1255,7 +1248,7 @@ struct server_context_impl {

            // set probability for top n_probs tokens
            result.probs.reserve(n_probs);
-            for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) {
+            for (size_t i = 0; i < std::min(cur.size(), n_probs); i++) {
                result.probs.push_back({
                    cur[i].id,
                    common_token_to_piece(ctx, cur[i].id, special),
@ -2301,13 +2294,13 @@ struct server_context_impl {

                        GGML_ASSERT(batch.n_tokens > 0);

-                        common_sampler_reset(slot.smpl);
+                        common_sampler_reset(slot.smpl.get());

                        // Process all prompt tokens through sampler system
                        for (int i = 0; i < slot.task->n_tokens(); ++i) {
                            llama_token id = input_tokens[i];
                            if (id != LLAMA_TOKEN_NULL) {
-                                common_sampler_accept(slot.smpl, id, false);
+                                common_sampler_accept(slot.smpl.get(), id, false);
                            }
                        }

@ -2525,11 +2518,11 @@ struct server_context_impl {

                const int tok_idx = slot.i_batch - i;

-                llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx);
+                llama_token id = common_sampler_sample(slot.smpl.get(), ctx, tok_idx);

                slot.i_batch = -1;

-                common_sampler_accept(slot.smpl, id, true);
+                common_sampler_accept(slot.smpl.get(), id, true);

                slot.n_decoded += 1;

@ -2570,7 +2563,7 @@ struct server_context_impl {
                size_t n_draft = slot.drafted.size();

                // the accepted tokens from the speculation
-                const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, slot.i_batch_dft, slot.drafted);
+                const auto ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted);
                slot.i_batch_dft.clear();
                slot.drafted.clear();

--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@ -16,6 +16,7 @@
 #include <atomic>
 #include <chrono>
 #include <queue>
+#include <filesystem>

 #ifdef _WIN32
 #include <winsock2.h>
@ -171,7 +172,7 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para
    }

    // read base args from router's argv
-    common_params_parse(argc, argv, LLAMA_EXAMPLE_SERVER, base_args);
+    common_params_to_map(argc, argv, LLAMA_EXAMPLE_SERVER, base_args);

    // remove any router-controlled args from base_args
    for (const auto & cargs : control_args) {
--- a/tools/server/tests/unit/test_compat_anthropic.py
+++ b/tools/server/tests/unit/test_compat_anthropic.py
@ -684,7 +684,7 @@ def test_anthropic_streaming_content_block_indices():
    # Request that might produce both text and tool use
    res = server.make_stream_request("POST", "/v1/messages", data={
        "model": "test",
-        "max_tokens": 200,
+        "max_tokens": 400,
        "stream": True,
        "tools": [{
            "name": "test_tool",
--- a/tools/tts/tts.cpp
+++ b/tools/tts/tts.cpp
@ -568,10 +568,10 @@ int main(int argc, char ** argv) {
    llama_context * ctx_ttc = NULL;
    llama_context * ctx_cts = NULL;

-    common_init_result llama_init_ttc = common_init_from_params(params);
+    auto llama_init_ttc = common_init_from_params(params);

-    model_ttc = llama_init_ttc.model.get();
-    ctx_ttc   = llama_init_ttc.context.get();
+    model_ttc = llama_init_ttc->model();
+    ctx_ttc   = llama_init_ttc->context();

    if (model_ttc == nullptr || ctx_ttc == nullptr) {
        return ENOENT;
@ -583,10 +583,10 @@ int main(int argc, char ** argv) {
    params.embedding = true;
    params.n_ubatch = params.n_batch;

-    common_init_result llama_init_cts = common_init_from_params(params);
+    auto llama_init_cts = common_init_from_params(params);

-    model_cts = llama_init_cts.model.get();
-    ctx_cts   = llama_init_cts.context.get();
+    model_cts = llama_init_cts->model();
+    ctx_cts   = llama_init_cts->context();

    if (model_cts == nullptr || ctx_cts == nullptr) {
        return ENOENT;
--- a/vendor/cpp-httplib/CMakeLists.txt
+++ b/vendor/cpp-httplib/CMakeLists.txt
@ -11,8 +11,9 @@ endif()
 target_link_libraries  (${TARGET} PRIVATE Threads::Threads)

 if (WIN32 AND NOT MSVC)
-    target_link_libraries(${TARGET} PUBLIC ws2_32)
+    target_link_libraries(${TARGET} PRIVATE ws2_32)
 endif()
+
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 target_compile_definitions(${TARGET} PRIVATE