diff --git a/.devops/s390x.Dockerfile b/.devops/s390x.Dockerfile deleted file mode 100644 index b7c945768..000000000 --- a/.devops/s390x.Dockerfile +++ /dev/null @@ -1,126 +0,0 @@ -ARG GCC_VERSION=15.2.0 -ARG UBUNTU_VERSION=24.04 - -### Build Llama.cpp stage -FROM gcc:${GCC_VERSION} AS build - -RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ - --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \ - apt update -y && \ - apt upgrade -y && \ - apt install -y --no-install-recommends \ - git cmake ccache ninja-build \ - # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster. - libopenblas-dev libcurl4-openssl-dev && \ - rm -rf /var/lib/apt/lists/* - -WORKDIR /app -COPY . . - -RUN --mount=type=cache,target=/root/.ccache \ - --mount=type=cache,target=/app/build \ - cmake -S . -B build -G Ninja \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_C_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DLLAMA_BUILD_TESTS=OFF \ - -DGGML_NATIVE=OFF \ - -DGGML_BACKEND_DL=ON \ - -DGGML_CPU_ALL_VARIANTS=ON \ - -DGGML_BLAS=ON \ - -DGGML_BLAS_VENDOR=OpenBLAS && \ - cmake --build build --config Release -j $(nproc) && \ - cmake --install build --prefix /opt/llama.cpp - -COPY *.py /opt/llama.cpp/bin -COPY .devops/tools.sh /opt/llama.cpp/bin - -COPY gguf-py /opt/llama.cpp/gguf-py -COPY requirements.txt /opt/llama.cpp/gguf-py -COPY requirements /opt/llama.cpp/gguf-py/requirements - - -### Collect all llama.cpp binaries, libraries and distro libraries -FROM scratch AS collector - -# Copy llama.cpp binaries and libraries -COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin -COPY --from=build /opt/llama.cpp/lib /llama.cpp/lib -COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py - - -### Base image -FROM ubuntu:${UBUNTU_VERSION} AS base - -RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ - --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \ - apt update -y && \ - apt install -y --no-install-recommends \ - # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster. - # See: https://github.com/ggml-org/llama.cpp/pull/15915#issuecomment-3317166506 - curl libgomp1 libopenblas-dev && \ - apt autoremove -y && \ - apt clean -y && \ - rm -rf /tmp/* /var/tmp/* && \ - find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \ - find /var/cache -type f -delete - -# Copy llama.cpp libraries -COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu - - -### Full -FROM base AS full - -ENV PATH="/root/.cargo/bin:${PATH}" -WORKDIR /app - -RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ - --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \ - apt update -y && \ - apt install -y \ - git cmake libjpeg-dev \ - python3 python3-pip python3-dev && \ - apt autoremove -y && \ - apt clean -y && \ - rm -rf /tmp/* /var/tmp/* && \ - find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \ - find /var/cache -type f -delete - -RUN curl https://sh.rustup.rs -sSf | bash -s -- -y - -COPY --from=collector /llama.cpp/bin /app -COPY --from=collector /llama.cpp/gguf-py /app/gguf-py - -RUN pip install --no-cache-dir --break-system-packages \ - -r /app/gguf-py/requirements.txt - -ENTRYPOINT [ "/app/tools.sh" ] - - -### CLI Only -FROM base AS light - -WORKDIR /llama.cpp/bin - -# Copy llama.cpp binaries and libraries -COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin -COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin - -ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ] - - -### Server -FROM base AS server - -ENV LLAMA_ARG_HOST=0.0.0.0 - -WORKDIR /llama.cpp/bin - -# Copy llama.cpp binaries and libraries -COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin -COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin - -EXPOSE 8080 - -ENTRYPOINT [ "/llama.cpp/bin/llama-server" ] diff --git a/common/arg.cpp b/common/arg.cpp index 52a254a35..5204b7f41 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -107,6 +107,16 @@ bool common_arg::is_exclude(enum llama_example ex) { bool common_arg::get_value_from_env(std::string & output) const { if (env == nullptr) return false; + if (!args_neg.empty()) { + // for compatibility, we need to check LLAMA_ARG_NO_ env as well + std::string neg_env = env; + string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_"); + char * neg_value = std::getenv(neg_env.c_str()); + if (neg_value) { + output = "0"; // falsey + return true; + } + } char * value = std::getenv(env); if (value) { output = value; @@ -116,6 +126,14 @@ bool common_arg::get_value_from_env(std::string & output) const { } bool common_arg::has_value_from_env() const { + if (env != nullptr && !args_neg.empty()) { + // for compatibility, we need to check LLAMA_ARG_NO_ env as well + std::string neg_env = env; + string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_"); + if (std::getenv(neg_env.c_str())) { + return true; + } + } return env != nullptr && std::getenv(env); } @@ -153,9 +171,10 @@ std::string common_arg::to_string() const { std::string leading_spaces(n_leading_spaces, ' '); std::ostringstream ss; - for (const auto arg : args) { - if (arg == args.front()) { - if (args.size() == 1) { + auto all_args = get_args(); // also contains args_neg + for (const auto & arg : all_args) { + if (arg == all_args.front()) { + if (all_args.size() == 1) { ss << arg; } else { // first arg is usually abbreviation, we need padding to make it more beautiful @@ -164,7 +183,7 @@ std::string common_arg::to_string() const { ss << tmp << spaces; } } else { - ss << arg << (arg != args.back() ? ", " : ""); + ss << arg << (arg != all_args.back() ? ", " : ""); } } if (value_hint) ss << " " << value_hint; @@ -183,6 +202,31 @@ std::string common_arg::to_string() const { return ss.str(); } +std::vector common_arg::get_args() const { + std::vector result; + for (const auto & arg : args) { + result.push_back(std::string(arg)); + } + for (const auto & arg : args_neg) { + result.push_back(std::string(arg)); + } + return result; +} + +std::vector common_arg::get_env() const { + std::vector result; + if (env) { + result.push_back(std::string(env)); + } + if (!args_neg.empty() && env) { + // for compatibility, we need to add LLAMA_ARG_NO_ variant + std::string neg_env = env; + string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_"); + result.push_back(neg_env); + } + return result; +} + // // utils // @@ -318,6 +362,16 @@ static std::string get_all_kv_cache_types() { return msg.str(); } +static bool parse_bool_value(const std::string & value) { + if (is_truthy(value)) { + return true; + } else if (is_falsey(value)) { + return false; + } else { + throw std::invalid_argument("invalid boolean value"); + } +} + // // CLI argument parsing functions // @@ -325,10 +379,13 @@ static std::string get_all_kv_cache_types() { static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) { common_params & params = ctx_arg.params; - std::unordered_map arg_to_options; + std::unordered_map> arg_to_options; for (auto & opt : ctx_arg.options) { for (const auto & arg : opt.args) { - arg_to_options[arg] = &opt; + arg_to_options[arg] = {&opt, /* is_positive */ true}; + } + for (const auto & arg : opt.args_neg) { + arg_to_options[arg] = {&opt, /* is_positive */ false}; } } @@ -337,12 +394,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context std::string value; if (opt.get_value_from_env(value)) { try { - if (opt.handler_void && (value == "1" || value == "true")) { + if (opt.handler_void && is_truthy(value)) { opt.handler_void(params); } if (opt.handler_int) { opt.handler_int(params, std::stoi(value)); } + if (opt.handler_bool) { + opt.handler_bool(params, parse_bool_value(value)); + } if (opt.handler_string) { opt.handler_string(params, value); continue; @@ -371,7 +431,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context if (arg_to_options.find(arg) == arg_to_options.end()) { throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); } - auto opt = *arg_to_options[arg]; + auto & tmp = arg_to_options[arg]; + auto opt = *tmp.first; + bool is_positive = tmp.second; if (opt.has_value_from_env()) { fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); } @@ -380,6 +442,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context opt.handler_void(params); continue; } + if (opt.handler_bool) { + opt.handler_bool(params, is_positive); + continue; + } // arg with single value check_arg(i); @@ -404,7 +470,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context throw std::invalid_argument(string_format( "error while handling argument \"%s\": %s\n\n" "usage:\n%s\n\nto show complete usage, run with -h", - arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); + arg.c_str(), e.what(), opt.to_string().c_str())); } } @@ -440,7 +506,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // model is required (except for server) // TODO @ngxson : maybe show a list of available models in CLI in this case - if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage) { + if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage && !params.completion) { throw std::invalid_argument("error: --model is required\n"); } @@ -575,6 +641,7 @@ static void common_params_print_completion(common_params_context & ctx_arg) { "llama-batched-bench", "llama-bench", "llama-cli", + "llama-completion", "llama-convert-llama2c-to-ggml", "llama-cvector-generator", "llama-embedding", @@ -659,7 +726,7 @@ static void add_rpc_devices(const std::string & servers) { } } -bool common_params_parse(int argc, char ** argv, llama_example ex, std::map & out_map) { +bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map & out_map) { common_params dummy_params; common_params_context ctx_arg = common_params_parser_init(dummy_params, ex, nullptr); @@ -668,6 +735,9 @@ bool common_params_parse(int argc, char ** argv, llama_example ex, std::map examples = {LLAMA_EXAMPLE_COMMON}; std::set excludes = {}; std::vector args; + std::vector args_neg; // for negated args like --no-xxx const char * value_hint = nullptr; // help text or example for arg value const char * value_hint_2 = nullptr; // for second arg value const char * env = nullptr; @@ -25,6 +26,7 @@ struct common_arg { void (*handler_string) (common_params & params, const std::string &) = nullptr; void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr; void (*handler_int) (common_params & params, int) = nullptr; + void (*handler_bool) (common_params & params, bool) = nullptr; common_arg() = default; @@ -48,6 +50,13 @@ struct common_arg { void (*handler)(common_params & params) ) : args(args), help(help), handler_void(handler) {} + common_arg( + const std::initializer_list & args, + const std::initializer_list & args_neg, + const std::string & help, + void (*handler)(common_params & params, bool) + ) : args(args), args_neg(args_neg), help(help), handler_bool(handler) {} + // support 2 values for arg common_arg( const std::initializer_list & args, @@ -80,6 +89,10 @@ struct common_arg { } return strcmp(args[0], other.args[0]) == 0; } + + // get all args and env vars (including negated args/env) + std::vector get_args() const; + std::vector get_env() const; }; namespace common_arg_utils { @@ -102,7 +115,7 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e // parse input arguments from CLI into a map // TODO: support repeated args in the future -bool common_params_parse(int argc, char ** argv, llama_example ex, std::map & out_map); +bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map & out_map); // initialize argument parser context - used by test-arg-parser and preset common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); diff --git a/common/common.cpp b/common/common.cpp index f21e4da2c..6149f02c9 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1021,31 +1021,40 @@ bool tty_can_use_colors() { // Model utils // -static inline void common_init_sampler_from_model( +// TODO: move to common/sampling +static void common_init_sampler_from_model( const llama_model * model, common_params_sampling & sparams) { const uint64_t config = sparams.user_sampling_config; auto get_int32 = [&](const char * key, int32_t & dst, uint64_t user_config) { - if (config & user_config) return; + if (config & user_config) { + return; + } char buf[64] = {0}; if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) { char * end = nullptr; int32_t v = strtol(buf, &end, 10); - if (end && end != buf) dst = v; + if (end && end != buf) { + dst = v; + } } }; auto get_float = [&](const char * key, float & dst, uint64_t user_config) { - if (config & user_config) return; + if (config & user_config) { + return; + } char buf[128] = {0}; if (llama_model_meta_val_str(model, key, buf, sizeof(buf)) > 0) { char * end = nullptr; float v = strtof(buf, &end); - if (end && end != buf) dst = v; + if (end && end != buf) { + dst = v; + } } }; @@ -1073,31 +1082,122 @@ static inline void common_init_sampler_from_model( get_float(llama_model_meta_key_str(LLAMA_MODEL_META_KEY_SAMPLING_MIROSTAT_ETA), sparams.mirostat_eta, common_params_sampling_config::COMMON_PARAMS_SAMPLING_CONFIG_MIROSTAT_ETA); } -struct common_init_result common_init_from_params(common_params & params) { - common_init_result iparams; - auto mparams = common_model_params_to_llama(params); +struct common_init_result::impl { + impl() = default; + ~impl() = default; + + llama_model_ptr model; + llama_context_ptr context; + + std::vector lora; + + std::vector samplers; +}; + +common_init_result::common_init_result(common_params & params) : + pimpl(new impl{}) { + const auto mparams = common_model_params_to_llama(params); llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); if (model == NULL) { - LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n", - __func__, params.model.path.c_str()); - return iparams; + return; } - common_init_sampler_from_model(model, params.sampling); + pimpl->model.reset(model); const llama_vocab * vocab = llama_model_get_vocab(model); + // updates params.sampling + // TODO: fix naming + common_init_sampler_from_model(model, params.sampling); + auto cparams = common_context_params_to_llama(params); + if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) { + LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__); + params.sampling.ignore_eos = false; + } + + // initialize once + for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) { + if (llama_vocab_is_eog(vocab, i)) { + LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(vocab, i).c_str(), -INFINITY); + params.sampling.logit_bias_eog.push_back({i, -INFINITY}); + } + } + + if (params.sampling.ignore_eos) { + // add EOG biases to the active set of logit biases + params.sampling.logit_bias.insert( + params.sampling.logit_bias.end(), + params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end()); + } + + //if (params.sampling.penalty_last_n == -1) { + // LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx)); + // params.sampling.penalty_last_n = llama_n_ctx(lctx); + //} + + //if (params.sampling.dry_penalty_last_n == -1) { + // LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx)); + // params.sampling.dry_penalty_last_n = llama_n_ctx(lctx); + //} + + pimpl->samplers.resize(cparams.n_seq_max); + + for (int i = 0; i < (int) cparams.n_seq_max; ++i) { + pimpl->samplers[i].reset(common_sampler_init(model, params.sampling)); + } + llama_context * lctx = llama_init_from_model(model, cparams); if (lctx == NULL) { LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n", - __func__, params.model.path.c_str()); - llama_model_free(model); - return iparams; + __func__, params.model.path.c_str()); + return; } + pimpl->context.reset(lctx); +} + +llama_model * common_init_result::model() { + return pimpl->model.get(); +} + +llama_context * common_init_result::context() { + return pimpl->context.get(); +} + +common_sampler * common_init_result::sampler(llama_seq_id seq_id) { + return pimpl->samplers[seq_id].get(); +} + +std::vector & common_init_result::lora() { + return pimpl->lora; +} + +void common_init_result::free_context() { + pimpl->context.reset(); +} + +common_init_result_ptr common_init_from_params(common_params & params) { + common_init_result_ptr res(new common_init_result(params)); + + llama_model * model = res->model(); + if (model == NULL) { + LOG_ERR("%s: failed to load model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n", + __func__, params.model.path.c_str()); + return res; + } + + llama_context * lctx = res->context(); + if (lctx == NULL) { + LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n", + __func__, params.model.path.c_str()); + return res; + } + + const llama_vocab * vocab = llama_model_get_vocab(model); + if (params.ctx_shift && !llama_memory_can_shift(llama_get_memory(lctx))) { LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__); params.ctx_shift = false; @@ -1109,10 +1209,7 @@ struct common_init_result common_init_from_params(common_params & params) { const auto cvec = common_control_vector_load(params.control_vectors); if (cvec.n_embd == -1) { - llama_free(lctx); - llama_model_free(model); - - return iparams; + return res; } int err = llama_apply_adapter_cvec( @@ -1123,10 +1220,7 @@ struct common_init_result common_init_from_params(common_params & params) { params.control_vector_layer_start, params.control_vector_layer_end); if (err) { - llama_free(lctx); - llama_model_free(model); - - return iparams; + return res; } } @@ -1150,10 +1244,7 @@ struct common_init_result common_init_from_params(common_params & params) { } if (!ok) { - llama_free(lctx); - llama_model_free(model); - - return iparams; + return res; } } @@ -1163,9 +1254,7 @@ struct common_init_result common_init_from_params(common_params & params) { lora.reset(llama_adapter_lora_init(model, la.path.c_str())); if (lora == nullptr) { LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); - llama_free(lctx); - llama_model_free(model); - return iparams; + return res; } char buf[1024]; @@ -1174,43 +1263,13 @@ struct common_init_result common_init_from_params(common_params & params) { la.task_name = buf; llama_adapter_meta_val_str(la.ptr, "adapter.lora.prompt_prefix", buf, sizeof(buf)); la.prompt_prefix = buf; - iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters + res->lora().emplace_back(std::move(lora)); // copy to list of loaded adapters } if (!params.lora_init_without_apply) { common_set_adapter_lora(lctx, params.lora_adapters); } - if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) { - LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__); - params.sampling.ignore_eos = false; - } - - // initialize once - for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) { - if (llama_vocab_is_eog(vocab, i)) { - LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY); - params.sampling.logit_bias_eog.push_back({i, -INFINITY}); - } - } - - if (params.sampling.ignore_eos) { - // add EOG biases to the active set of logit biases - params.sampling.logit_bias.insert( - params.sampling.logit_bias.end(), - params.sampling.logit_bias_eog.begin(), params.sampling.logit_bias_eog.end()); - } - - if (params.sampling.penalty_last_n == -1) { - LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx)); - params.sampling.penalty_last_n = llama_n_ctx(lctx); - } - - if (params.sampling.dry_penalty_last_n == -1) { - LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx)); - params.sampling.dry_penalty_last_n = llama_n_ctx(lctx); - } - if (params.warmup) { LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); @@ -1249,12 +1308,11 @@ struct common_init_result common_init_from_params(common_params & params) { llama_set_warmup(lctx, false); } - iparams.model.reset(model); - iparams.context.reset(lctx); - - return iparams; + return res; } +common_init_result::~common_init_result() = default; + std::string get_model_endpoint() { const char * model_endpoint_env = getenv("MODEL_ENDPOINT"); // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility. @@ -1263,7 +1321,9 @@ std::string get_model_endpoint() { std::string model_endpoint = "https://huggingface.co/"; if (endpoint_env) { model_endpoint = endpoint_env; - if (model_endpoint.back() != '/') model_endpoint += '/'; + if (model_endpoint.back() != '/') { + model_endpoint += '/'; + } } return model_endpoint; } diff --git a/common/common.h b/common/common.h index c2c48a2b0..5e13f0d83 100644 --- a/common/common.h +++ b/common/common.h @@ -191,7 +191,6 @@ struct common_params_sampling { std::vector dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY - std::vector samplers = { COMMON_SAMPLER_TYPE_PENALTIES, COMMON_SAMPLER_TYPE_DRY, @@ -212,6 +211,10 @@ struct common_params_sampling { std::vector logit_bias; // logit biases to apply std::vector logit_bias_eog; // pre-calculated logit biases for EOG tokens + bool has_logit_bias() const { + return !logit_bias.empty(); + } + // print the parameters into a string std::string print() const; }; @@ -665,15 +668,29 @@ bool tty_can_use_colors(); // Model utils // -// note: defines object's lifetime -struct common_init_result { - llama_model_ptr model; - llama_context_ptr context; +struct common_sampler; - std::vector lora; +// note: defines the model, context, samplers, ets. lifetimes +struct common_init_result { + common_init_result(common_params & params); + ~common_init_result(); + + llama_model * model(); + llama_context * context(); + common_sampler * sampler(llama_seq_id seq_id); + + std::vector & lora(); + + void free_context(); + +private: + struct impl; + std::unique_ptr pimpl; }; -struct common_init_result common_init_from_params(common_params & params); +using common_init_result_ptr = std::unique_ptr; + +common_init_result_ptr common_init_from_params(common_params & params); struct llama_model_params common_model_params_to_llama ( common_params & params); struct llama_context_params common_context_params_to_llama(const common_params & params); diff --git a/common/preset.cpp b/common/preset.cpp index 09ac171b7..60746aad5 100644 --- a/common/preset.cpp +++ b/common/preset.cpp @@ -23,8 +23,14 @@ std::vector common_preset::to_args() const { if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) { // flag option, no value if (common_arg_utils::is_falsey(value)) { - // skip the flag - args.pop_back(); + // use negative arg if available + if (!opt.args_neg.empty()) { + args.back() = opt.args_neg.back(); + } else { + // otherwise, skip the flag + // TODO: maybe throw an error instead? + args.pop_back(); + } } } if (opt.value_hint != nullptr) { @@ -141,16 +147,31 @@ static std::map> parse_ini_from_ static std::map get_map_key_opt(common_params_context & ctx_params) { std::map mapping; for (const auto & opt : ctx_params.options) { - if (opt.env != nullptr) { - mapping[opt.env] = opt; + for (const auto & env : opt.get_env()) { + mapping[env] = opt; } - for (const auto & arg : opt.args) { + for (const auto & arg : opt.get_args()) { mapping[rm_leading_dashes(arg)] = opt; } } return mapping; } +static bool is_bool_arg(const common_arg & arg) { + return !arg.args_neg.empty(); +} + +static std::string parse_bool_arg(const common_arg & arg, const std::string & key, const std::string & value) { + // if this is a negated arg, we need to reverse the value + for (const auto & neg_arg : arg.args_neg) { + if (rm_leading_dashes(neg_arg) == key) { + return common_arg_utils::is_truthy(value) ? "false" : "true"; + } + } + // otherwise, not negated + return value; +} + common_presets common_presets_load(const std::string & path, common_params_context & ctx_params) { common_presets out; auto key_to_opt = get_map_key_opt(ctx_params); @@ -167,8 +188,13 @@ common_presets common_presets_load(const std::string & path, common_params_conte for (const auto & [key, value] : section.second) { LOG_DBG("option: %s = %s\n", key.c_str(), value.c_str()); if (key_to_opt.find(key) != key_to_opt.end()) { - preset.options[key_to_opt[key]] = value; - LOG_DBG("accepted option: %s = %s\n", key.c_str(), value.c_str()); + auto & opt = key_to_opt[key]; + if (is_bool_arg(opt)) { + preset.options[opt] = parse_bool_arg(opt, key, value); + } else { + preset.options[opt] = value; + } + LOG_DBG("accepted option: %s = %s\n", key.c_str(), preset.options[opt].c_str()); } else { // TODO: maybe warn about unknown key? } diff --git a/common/sampling.cpp b/common/sampling.cpp index 7a6b7be1e..6935d84e2 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -104,9 +104,10 @@ struct ring_buffer { struct common_sampler { common_params_sampling params; - struct llama_sampler * grmr; struct llama_sampler * chain; + bool grammar; + ring_buffer prev; std::vector cur; @@ -116,7 +117,6 @@ struct common_sampler { void reset() { prev.clear(); - llama_sampler_reset(grmr); llama_sampler_reset(chain); } @@ -167,10 +167,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co lparams.no_perf = params.no_perf; - struct llama_sampler * grmr; + llama_sampler * chain = llama_sampler_chain_init(lparams); + + bool grammar = false; + std::vector samplers; + if (params.grammar.compare(0, 11, "%llguidance") == 0) { #ifdef LLAMA_USE_LLGUIDANCE - grmr = llama_sampler_init_llg(vocab, "lark", params.grammar.c_str()); + samplers.push_back(llama_sampler_init_llg(vocab, "lark", params.grammar.c_str())); + grammar = true; #else GGML_ABORT("llguidance (cmake -DLLAMA_LLGUIDANCE=ON) is not enabled"); #endif // LLAMA_USE_LLGUIDANCE @@ -217,30 +222,23 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co trigger_patterns_c.push_back(regex.c_str()); } - grmr = params.grammar_lazy - ? llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root", - trigger_patterns_c.data(), trigger_patterns_c.size(), - trigger_tokens.data(), trigger_tokens.size()) - : llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"); - if (!grmr) { - return nullptr; + if (!params.grammar.empty()) { + if (params.grammar_lazy) { + samplers.push_back( + llama_sampler_init_grammar_lazy_patterns(vocab, params.grammar.c_str(), "root", + trigger_patterns_c.data(), trigger_patterns_c.size(), + trigger_tokens.data(), trigger_tokens.size())); + } else { + samplers.push_back(llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root")); + } + + grammar = true; } } - auto * result = new common_sampler { - /* .params = */ params, - /* .grmr = */ grmr, - /* .chain = */ llama_sampler_chain_init(lparams), - /* .prev = */ ring_buffer(std::max(32, params.n_prev)), - /* .cur = */ {}, - /* .cur_p = */ {}, - }; - - llama_sampler_chain_add(result->chain, - llama_sampler_init_logit_bias( - llama_vocab_n_tokens(vocab), - params.logit_bias.size(), - params.logit_bias.data())); + if (params.has_logit_bias()) { + samplers.push_back(llama_sampler_init_logit_bias(llama_vocab_n_tokens(vocab), params.logit_bias.size(), params.logit_bias.data())); + } if (params.mirostat == 0) { for (const auto & cnstr : params.samplers) { @@ -253,58 +251,70 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co c_breakers.push_back(str.c_str()); } - llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); + samplers.push_back(llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); } break; case COMMON_SAMPLER_TYPE_TOP_K: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); + samplers.push_back(llama_sampler_init_top_k (params.top_k)); break; case COMMON_SAMPLER_TYPE_TOP_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep)); + samplers.push_back(llama_sampler_init_top_p (params.top_p, params.min_keep)); break; case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: - llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma)); + samplers.push_back(llama_sampler_init_top_n_sigma(params.top_n_sigma)); break; case COMMON_SAMPLER_TYPE_MIN_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep)); + samplers.push_back(llama_sampler_init_min_p (params.min_p, params.min_keep)); break; case COMMON_SAMPLER_TYPE_XTC: - llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); + samplers.push_back(llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed)); break; case COMMON_SAMPLER_TYPE_TYPICAL_P: - llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep)); + samplers.push_back(llama_sampler_init_typical (params.typ_p, params.min_keep)); break; case COMMON_SAMPLER_TYPE_TEMPERATURE: - llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); + samplers.push_back(llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent)); break; case COMMON_SAMPLER_TYPE_INFILL: - llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab)); + samplers.push_back(llama_sampler_init_infill (vocab)); break; case COMMON_SAMPLER_TYPE_PENALTIES: - llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); + samplers.push_back(llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); break; default: GGML_ASSERT(false && "unknown sampler type"); } } - llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed)); + + samplers.push_back(llama_sampler_init_dist(params.seed)); } else if (params.mirostat == 1) { - llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); - llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); + samplers.push_back(llama_sampler_init_temp(params.temp)); + samplers.push_back(llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100)); } else if (params.mirostat == 2) { - llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp)); - llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); + samplers.push_back(llama_sampler_init_temp(params.temp)); + samplers.push_back(llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta)); } else { GGML_ASSERT(false && "unknown mirostat version"); } + for (auto * smpl : samplers) { + llama_sampler_chain_add(chain, smpl); + } + + auto * result = new common_sampler { + /* .params = */ params, + /* .chain = */ chain, + /* .grammar = */ grammar, + /* .prev = */ ring_buffer(std::max(32, params.n_prev)), + /* .cur = */ {}, + /* .cur_p = */ {}, + }; + return result; } void common_sampler_free(struct common_sampler * gsmpl) { if (gsmpl) { - llama_sampler_free(gsmpl->grmr); - llama_sampler_free(gsmpl->chain); delete gsmpl; @@ -314,11 +324,24 @@ void common_sampler_free(struct common_sampler * gsmpl) { void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) { const auto tm = gsmpl->tm(); - if (accept_grammar) { - llama_sampler_accept(gsmpl->grmr, token); - } + if (gsmpl->grammar) { + const int n_smpl = llama_sampler_chain_n(gsmpl->chain); - llama_sampler_accept(gsmpl->chain, token); + for (int i = 0; i < n_smpl; i++) { + auto * smpl = llama_sampler_chain_get(gsmpl->chain, i); + + // the grammar sampler is always the first one + if (i == 0) { + if (accept_grammar) { + llama_sampler_accept(smpl, token); + } + } else { + llama_sampler_accept(smpl, token); + } + } + } else { + llama_sampler_accept(gsmpl->chain, token); + } gsmpl->prev.push_back(token); } @@ -329,12 +352,12 @@ void common_sampler_reset(struct common_sampler * gsmpl) { struct common_sampler * common_sampler_clone(common_sampler * gsmpl) { return new common_sampler { - /* .params = */ gsmpl->params, - /* .grmr = */ llama_sampler_clone(gsmpl->grmr), - /* .chain = */ llama_sampler_clone(gsmpl->chain), - /* .prev = */ gsmpl->prev, - /* .cur = */ gsmpl->cur, - /* .cur_p = */ gsmpl->cur_p, + /* .params = */ gsmpl->params, + /* .chain = */ llama_sampler_clone(gsmpl->chain), + /* .grammar = */ gsmpl->grammar, + /* .prev = */ gsmpl->prev, + /* .cur = */ gsmpl->cur, + /* .cur_p = */ gsmpl->cur_p, }; } @@ -383,58 +406,33 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam } } -llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) { +struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl) { + return gsmpl->chain; +} + +llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx) { llama_synchronize(ctx); // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations const auto tm = gsmpl->tm(); - gsmpl->set_logits(ctx, idx); + llama_token id = LLAMA_TOKEN_NULL; - auto & grmr = gsmpl->grmr; auto & chain = gsmpl->chain; auto & cur_p = gsmpl->cur_p; // initialized by set_logits - if (grammar_first) { - llama_sampler_apply(grmr, &cur_p); - } + gsmpl->set_logits(ctx, idx); llama_sampler_apply(chain, &cur_p); GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration"); - const llama_token id = cur_p.data[cur_p.selected].id; + id = cur_p.data[cur_p.selected].id; - if (grammar_first) { - return id; - } - - // check if it the sampled token fits the grammar - { - llama_token_data single_token_data = { id, 1.0f, 0.0f }; - llama_token_data_array single_token_data_array = { &single_token_data, 1, -1, false }; - - llama_sampler_apply(grmr, &single_token_data_array); - - const bool is_valid = single_token_data_array.data[0].logit != -INFINITY; - if (is_valid) { - return id; - } - } - - // resampling: - // if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain - gsmpl->set_logits(ctx, idx); - - llama_sampler_apply(grmr, &cur_p); - llama_sampler_apply(chain, &cur_p); - - GGML_ASSERT(cur_p.selected != -1 && "no selected token during re-sampling - check your sampling configuration"); - - return cur_p.data[cur_p.selected].id; + return id; } -std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft, bool grammar_first) { +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft) { GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1"); std::vector result; @@ -442,7 +440,7 @@ std::vector common_sampler_sample_and_accept_n(struct common_sample size_t i = 0; for (; i < draft.size(); i++) { - const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first); + const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]); common_sampler_accept(gsmpl, id, true); @@ -454,7 +452,7 @@ std::vector common_sampler_sample_and_accept_n(struct common_sample } if (i == draft.size()) { - const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first); + const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i]); common_sampler_accept(gsmpl, id, true); @@ -464,13 +462,13 @@ std::vector common_sampler_sample_and_accept_n(struct common_sample return result; } -std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) { +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft) { std::vector idxs(draft.size() + 1); for (size_t i = 0; i < idxs.size(); ++i) { idxs[i] = i; } - return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first); + return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft); } uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) { @@ -515,7 +513,8 @@ std::string common_sampler_print(const struct common_sampler * gsmpl) { for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) { const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i); - result += std::string("-> ") + llama_sampler_name(smpl) + " "; + result += std::string("-> "); + result += std::string(llama_sampler_name(smpl)) + " "; } return result; diff --git a/common/sampling.h b/common/sampling.h index e198eecda..ace5d3d02 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -48,6 +48,8 @@ struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl); // arguments can be nullptr to skip printing void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl); +struct llama_sampler * common_sampler_get(const struct common_sampler * gsmpl); + // extended sampling implementation: // // - set logits @@ -55,10 +57,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam // - check if the token fits the grammar (if any) // - if not: resample by first applying the grammar constraints and then sampling again (slower path) // -// if grammar_first is true, the grammar is applied before the samplers (slower) -// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar -// -llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); +llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx); // generalized version of common_sampler_sample // @@ -76,10 +75,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co // // returns at least 1 token, up to idxs.size() // -std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft, bool grammar_first = false); +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector & idxs, const llama_tokens & draft); // assume idxs == [ 0, 1, 2, ..., draft.size() ] -std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false); +std::vector common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft); uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl); @@ -107,3 +106,9 @@ std::vector common_sampler_types_from_chars(const std: llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * grammar_kind, const char * grammar_data); + +struct common_sampler_deleter { + void operator()(common_sampler * s) { common_sampler_free(s); } +}; + +typedef std::unique_ptr common_sampler_ptr; diff --git a/common/speculative.cpp b/common/speculative.cpp index 3e83b0964..1e12383ae 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -315,7 +315,7 @@ llama_tokens common_speculative_gen_draft( for (int i = 0; i < params.n_draft; ++i) { common_batch_clear(batch); - common_sampler_sample(smpl, ctx_dft, 0, true); + common_sampler_sample(smpl, ctx_dft, 0); const auto * cur_p = common_sampler_get_candidates(smpl, true); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 867bc9053..ee02cdd91 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -136,11 +136,19 @@ class ModelBase: self.remote_hf_model_id = remote_hf_model_id self.sentence_transformers_dense_modules = sentence_transformers_dense_modules self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams + self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {} self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id) self.metadata_override = metadata_override self.model_name = model_name self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py + # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters + if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters: + if "rope_theta" not in self.rope_parameters and (rope_theta := self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)) is not None: + self.rope_parameters["rope_theta"] = rope_theta + if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None: + self.rope_parameters["rope_type"] = rope_type + # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type if self.ftype == gguf.LlamaFileType.GUESSED: # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. @@ -705,6 +713,9 @@ class ModelBase: if "llm_config" in config: # rename for InternVL config["text_config"] = config["llm_config"] + if "lm_config" in config: + # rename for GlmASR + config["text_config"] = config["lm_config"] if "thinker_config" in config: # rename for Qwen2.5-Omni config["text_config"] = config["thinker_config"]["text_config"] @@ -795,7 +806,7 @@ class TextModel(ModelBase): def set_gguf_parameters(self): self.gguf_writer.add_block_count(self.block_count) - if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None: + if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length", "max_sequence_length", "model_max_length"], optional=True)) is not None: self.gguf_writer.add_context_length(n_ctx) logger.info(f"gguf: context length = {n_ctx}") @@ -815,7 +826,42 @@ class TextModel(ModelBase): self.gguf_writer.add_head_count_kv(n_head_kv) logger.info(f"gguf: key-value head count = {n_head_kv}") - if (rope_theta := self.hparams.get("rope_theta")) is not None: + rope_params = self.rope_parameters.get("full_attention", self.rope_parameters) + if (rope_type := rope_params.get("rope_type")) is not None: + rope_factor = rope_params.get("factor") + rope_gguf_type = gguf.RopeScalingType.NONE + if rope_type == "linear" and rope_factor is not None: + rope_gguf_type = gguf.RopeScalingType.LINEAR + self.gguf_writer.add_rope_scaling_type(rope_gguf_type) + self.gguf_writer.add_rope_scaling_factor(rope_factor) + elif rope_type == "yarn" and rope_factor is not None: + rope_gguf_type = gguf.RopeScalingType.YARN + self.gguf_writer.add_rope_scaling_type(rope_gguf_type) + self.gguf_writer.add_rope_scaling_factor(rope_factor) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"]) + if (yarn_ext_factor := rope_params.get("extrapolation_factor")) is not None: + self.gguf_writer.add_rope_scaling_yarn_ext_factor(yarn_ext_factor) + if (yarn_attn_factor := rope_params.get("attention_factor", rope_params.get("attn_factor"))) is not None: + self.gguf_writer.add_rope_scaling_yarn_attn_factor(yarn_attn_factor) + if (yarn_beta_fast := rope_params.get("beta_fast")) is not None: + self.gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_beta_fast) + if (yarn_beta_slow := rope_params.get("beta_slow")) is not None: + self.gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_beta_slow) + # self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"]) + elif rope_type == "su" or rope_type == "longrope": + rope_gguf_type = gguf.RopeScalingType.LONGROPE + self.gguf_writer.add_rope_scaling_type(rope_gguf_type) + elif rope_type == "dynamic": + # HunYuan, handled in model class + pass + elif rope_type.lower() == "llama3": + # Handled in generate_extra_tensors + pass + else: + logger.warning(f"Unknown RoPE type: {rope_type}") + logger.info(f"gguf: rope scaling type = {rope_gguf_type.name}") + + if (rope_theta := rope_params.get("rope_theta")) is not None: self.gguf_writer.add_rope_freq_base(rope_theta) logger.info(f"gguf: rope theta = {rope_theta}") if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None: @@ -1486,6 +1532,21 @@ class TextModel(ModelBase): raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported") self.gguf_writer.add_pooling_type(pooling_type) + def _set_vocab_glmedge(self): + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) + special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) + special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) + special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"]) + special_vocab.add_to_gguf(self.gguf_writer) + def _set_vocab_interns1(self): tokens: list[str] = [] toktypes: list[int] = [] @@ -1615,7 +1676,7 @@ class MmprojModel(ModelBase): preprocessor_config: dict[str, Any] global_config: dict[str, Any] - n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"] + n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers"] has_vision_encoder: bool = True # by default has_audio_encoder: bool = False @@ -1691,7 +1752,8 @@ class MmprojModel(ModelBase): return self.global_config.get(config_name) def get_audio_config(self) -> dict[str, Any] | None: - return self.global_config.get("audio_config") + mm_config_key = "whisper_config" if "whisper_config" in self.hparams else "audio_config" + return self.global_config.get(mm_config_key) def set_type(self): self.gguf_writer.add_type(gguf.GGUFType.MMPROJ) @@ -1966,34 +2028,10 @@ class BaichuanModel(TextModel): self._set_vocab_sentencepiece() def set_gguf_parameters(self): - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - ctx_length = 0 - if "max_sequence_length" in self.hparams: - ctx_length = self.hparams["max_sequence_length"] - elif "max_position_embeddings" in self.hparams: - ctx_length = self.hparams["max_position_embeddings"] - elif "model_max_length" in self.hparams: - ctx_length = self.hparams["model_max_length"] - else: - raise ValueError("gguf: can not find ctx length parameter.") + super().set_gguf_parameters() self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_context_length(ctx_length) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: head_count = self.hparams["num_attention_heads"] @@ -2089,34 +2127,10 @@ class XverseModel(TextModel): special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): - head_count = self.hparams["num_attention_heads"] - head_count_kv = self.hparams.get("num_key_value_heads", head_count) - - ctx_length = 0 - if "max_sequence_length" in self.hparams: - ctx_length = self.hparams["max_sequence_length"] - elif "max_position_embeddings" in self.hparams: - ctx_length = self.hparams["max_position_embeddings"] - elif "model_max_length" in self.hparams: - ctx_length = self.hparams["model_max_length"] - else: - raise ValueError("gguf: can not find ctx length parameter.") + super().set_gguf_parameters() self.gguf_writer.add_tensor_data_layout("Meta AI original pth") - self.gguf_writer.add_context_length(ctx_length) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(head_count) - self.gguf_writer.add_head_count_kv(head_count_kv) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) - - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -2377,8 +2391,13 @@ class LlamaModel(TextModel): # fix for SmolVLM2, missing `num_attention_heads` in config.json if self.hf_arch == "VLlama3ForCausalLM": self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) + hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) + self.origin_hf_arch = hparams.get('architectures', [None])[0] def set_vocab(self): + if self.origin_hf_arch == "GlmasrModel": + return self._set_vocab_glmedge() + if self.is_mistral_format: return self._set_vocab_mistral() @@ -2430,11 +2449,6 @@ class LlamaModel(TextModel): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): if n_head_kv is not None and n_head != n_head_kv: @@ -2454,6 +2468,7 @@ class LlamaModel(TextModel): "vision_language_adapter.", "patch_merger.", "pre_mm_projector_norm", + "audio_encoder.", ] is_multimodal_tensor = "vision_tower" in name \ @@ -2518,16 +2533,16 @@ class LlamaModel(TextModel): return [(self.map_tensor_name(name), data_torch)] def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", '').lower() == "llama3": - base = self.hparams.get("rope_theta", 10000.0) + if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): + if rope_params.get("rope_type", '').lower() == "llama3": + base = rope_params.get("rope_theta", 10000.0) if (dim := self.hparams.get("head_dim")) is None: dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - factor = rope_scaling.get("factor", 8.0) - low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) - high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + factor = rope_params.get("factor", 8.0) + low_freq_factor = rope_params.get("low_freq_factor", 1.0) + high_freq_factor = rope_params.get("high_freq_factor", 4.0) old_context_len = self.hparams.get("original_max_position_embeddings", 8192) low_freq_wavelen = old_context_len / low_freq_factor @@ -2564,11 +2579,6 @@ class ArceeModel(LlamaModel): def set_gguf_parameters(self): super().set_gguf_parameters() self._try_set_pooling_type() - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) @ModelBase.register("AfmoeForCausalLM") @@ -2851,17 +2861,11 @@ class Mistral3Model(LlamaModel): def set_gguf_parameters(self): super().set_gguf_parameters() - rope_params = self.hparams.get("rope_parameters") + rope_params = self.rope_parameters if self.hparams.get("model_type") == "ministral3": - assert rope_params is not None, "ministral3 must have 'rope_parameters' config" + assert rope_params, "ministral3 must have 'rope_parameters' config" assert rope_params["rope_type"] == "yarn", "ministral3 rope_type must be 'yarn'" - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_params["factor"]) - self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_params["beta_fast"]) - self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_params["beta_slow"]) self.gguf_writer.add_rope_scaling_yarn_log_mul(rope_params["mscale_all_dim"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_params["original_max_position_embeddings"]) - self.gguf_writer.add_rope_freq_base(rope_params["rope_theta"]) self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): @@ -2958,7 +2962,7 @@ class DeciModel(TextModel): assert self.block_count == len(self._num_kv_heads) assert self.block_count == len(self._num_heads) assert self.block_count == len(self._ffn_dims) - if (rope_theta := self.hparams.get("rope_theta")) is not None: + if (rope_theta := self.rope_parameters.get("rope_theta")) is not None: self.gguf_writer.add_rope_freq_base(rope_theta) self.gguf_writer.add_head_count_kv(self._num_kv_heads) self.gguf_writer.add_head_count(self._num_heads) @@ -2983,11 +2987,6 @@ class DeciModel(TextModel): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): if n_head_kv is not None and n_head != n_head_kv: @@ -3016,16 +3015,16 @@ class DeciModel(TextModel): return [(self.map_tensor_name(name), data_torch)] def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", '').lower() == "llama3": - base = self.hparams.get("rope_theta", 10000.0) + if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): + if rope_params.get("rope_type", '').lower() == "llama3": + base = rope_params.get("rope_theta", 10000.0) if (dim := self.hparams.get("head_dim")) is None: dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - factor = rope_scaling.get("factor", 8.0) - low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) - high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + factor = rope_params.get("factor", 8.0) + low_freq_factor = rope_params.get("low_freq_factor", 1.0) + high_freq_factor = rope_params.get("high_freq_factor", 4.0) old_context_len = self.hparams.get("original_max_position_embeddings", 8192) low_freq_wavelen = old_context_len / low_freq_factor @@ -3279,10 +3278,6 @@ class MiniCPMModel(TextModel): logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] self.gguf_writer.add_logit_scale(logit_scale) logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}") - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "longrope": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE) - logger.info(f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}") def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] @@ -3402,17 +3397,6 @@ class QwenModel(TextModel): def set_vocab(self): self._set_vocab_qwen() - def set_gguf_parameters(self): - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - self.gguf_writer.add_file_type(self.ftype) - @ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration") class Qwen2Model(TextModel): @@ -3427,11 +3411,6 @@ class Qwen2Model(TextModel): def set_gguf_parameters(self): super().set_gguf_parameters() self._try_set_pooling_type() - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if self.hf_arch == "Qwen2Model": @@ -3499,12 +3478,6 @@ class DreamModel(TextModel): # Dream models use non-causal attention for diffusion self.gguf_writer.add_causal_attention(False) - # Handle RoPE scaling similar to Qwen2 - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) # Add Dream-specific parameters mask_token_id = self.hparams.get("mask_token_id") @@ -4048,13 +4021,6 @@ class Qwen2MoeModel(TextModel): if (shared_expert_intermediate_size := self.hparams.get('shared_expert_intermediate_size')) is not None: self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size) logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}") - # YaRN is not enabled by default - # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) _experts: list[dict[str, Tensor]] | None = None @@ -4656,7 +4622,7 @@ class Phi3MiniModel(TextModel): self.gguf_writer.add_head_count_kv(n_head_kv) self.gguf_writer.add_layer_norm_rms_eps(rms_eps) self.gguf_writer.add_rope_dimension_count(rope_dims) - self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) + self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters)["rope_theta"]) self.gguf_writer.add_file_type(self.ftype) sliding_window = self.hparams.get("sliding_window") # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models @@ -4932,7 +4898,7 @@ class Plamo2Model(TextModel): self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128)) self.gguf_writer.add_block_count(self.block_count) self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06)) - self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000)) + self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("rope_theta", 10000)) # Mamba parameters self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64)) @@ -5130,21 +5096,6 @@ class InternLM2Model(TextModel): special_vocab.add_to_gguf(self.gguf_writer) - def set_gguf_parameters(self): - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - self.gguf_writer.add_file_type(self.ftype) - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: num_heads = self.hparams["num_attention_heads"] num_kv_heads = self.hparams["num_key_value_heads"] @@ -5221,11 +5172,6 @@ class InternLM3Model(TextModel): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") @@ -5588,7 +5534,6 @@ class NomicBertModel(BertModel): def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) if self.is_moe: self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"]) self.gguf_writer.add_expert_count(self.hparams["num_experts"]) @@ -5711,8 +5656,6 @@ class XLMRobertaModel(BertModel): super().set_gguf_parameters() # jina-embeddings-v3 - if rotary_emb_base := self.hparams.get("rotary_emb_base"): - self.gguf_writer.add_rope_freq_base(rotary_emb_base) lora_alpha = self.hparams.get("lora_alpha") if lora_prompt_prefixes := self.hparams.get("task_instructions"): assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys()) @@ -5840,19 +5783,16 @@ class Gemma3Model(TextModel): self._set_vocab_gpt2() def set_gguf_parameters(self): + super().set_gguf_parameters() hparams = self.hparams # some default values are not specified in the hparams self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 131072)) - self.gguf_writer.add_embedding_length(hparams["hidden_size"]) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 8)) self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-6)) self.gguf_writer.add_key_length(hparams.get("head_dim", 256)) self.gguf_writer.add_value_length(hparams.get("head_dim", 256)) - self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1_000_000.0)) # for global layers + self.gguf_writer.add_rope_freq_base(self.rope_parameters.get("full_attention", self.rope_parameters).get("rope_theta", 1_000_000.0)) # for global layers # attn_logit_softcapping is removed in Gemma3 assert hparams.get("attn_logit_softcapping") is None if (final_logit_softcap := hparams.get("final_logit_softcapping")): @@ -5860,19 +5800,6 @@ class Gemma3Model(TextModel): if hparams.get("sliding_window_pattern") != 1: self.gguf_writer.add_sliding_window(hparams["sliding_window"]) self.gguf_writer.add_head_count_kv(hparams.get("num_key_value_heads", 4)) - if hparams.get("rope_scaling") is not None: - rope_scaling = hparams["rope_scaling"] - if rope_scaling["rope_type"] == "linear": - # important: this rope_scaling is only applied for global layers, and not used by 1B model - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - elif rope_scaling["rope_type"] == "yarn": - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) - self.gguf_writer.add_rope_scaling_yarn_ext_factor(rope_scaling["extrapolation_factor"]) - self.gguf_writer.add_rope_scaling_yarn_beta_fast(rope_scaling["beta_fast"]) - self.gguf_writer.add_rope_scaling_yarn_beta_slow(rope_scaling["beta_slow"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -6776,13 +6703,6 @@ class Olmo2Model(TextModel): def set_gguf_parameters(self): super().set_gguf_parameters() - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) - if "sliding_window" in self.hparams: self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) @@ -7281,12 +7201,11 @@ class DeepseekV2Model(TextModel): self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) - self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_scaling["mscale_all_dim"]) + if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None: + # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul + # ref https://github.com/ggml-org/llama.cpp/pull/17945 + self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_mscale_all) _experts: list[dict[str, Tensor]] | None = None @@ -7894,11 +7813,6 @@ class Glm4Model(TextModel): if (rope_dim := self.hparams.get("head_dim")) is None: rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if name.startswith("model.visual."): # ignore visual part of Glm4v @@ -8236,50 +8150,26 @@ class ExaoneModel(TextModel): model_arch = gguf.MODEL_ARCH.EXAONE def set_gguf_parameters(self): + super().set_gguf_parameters() hparams = self.hparams assert (hparams["activation_function"] == "silu") - max_position_embeddings = hparams["max_position_embeddings"] - embed_dim = hparams["hidden_size"] - num_heads = hparams["num_attention_heads"] - num_kv_heads = hparams.get("num_key_value_heads", num_heads) - layer_norm_eps = hparams["layer_norm_epsilon"] - intermediate_size = hparams["intermediate_size"] if "intermediate_size" in hparams else 4 * embed_dim - # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0 - # attention_dropout_rate = hparams["attention_dropout"] - # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0 - # embed_dropout_rate = hparams["embed_dropout"] - self.gguf_writer.add_embedding_length(embed_dim) - self.gguf_writer.add_head_count(num_heads) - self.gguf_writer.add_head_count_kv(num_kv_heads) - self.gguf_writer.add_context_length(max_position_embeddings) - self.gguf_writer.add_layer_norm_rms_eps(layer_norm_eps) - self.gguf_writer.add_feed_forward_length(intermediate_size) - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_file_type(self.ftype) - - if (rope_theta := self.hparams.get("rope_theta")) is not None: - self.gguf_writer.add_rope_freq_base(rope_theta) rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True) rotary_factor = rotary_factor if rotary_factor is not None else 1.0 self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", '').lower() == "llama3": - base = self.hparams.get("rope_theta", 10000.0) + if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): + if rope_params.get("rope_type", '').lower() == "llama3": + base = self.rope_parameters.get("rope_theta", 10000.0) if (dim := self.hparams.get("head_dim")) is None: dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - factor = rope_scaling.get("factor", 8.0) - low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) - high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + factor = rope_params.get("factor", 8.0) + low_freq_factor = rope_params.get("low_freq_factor", 1.0) + high_freq_factor = rope_params.get("high_freq_factor", 4.0) old_context_len = self.hparams.get("original_max_position_embeddings", 8192) low_freq_wavelen = old_context_len / low_freq_factor @@ -8334,22 +8224,17 @@ class Exaone4Model(TextModel): if len(sliding_window_pattern) == hparams["num_hidden_layers"]: self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): - if rope_scaling.get("rope_type", '').lower() == "llama3": - base = self.hparams.get("rope_theta", 10_000.0) + if rope_params := self.rope_parameters.get("full_attention", self.rope_parameters): + if rope_params.get("rope_type", '').lower() == "llama3": + base = rope_params.get("rope_theta", 10_000.0) if (dim := self.hparams.get("head_dim")) is None: dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) - factor = rope_scaling.get("factor", 16.0) - low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) - high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + factor = rope_params.get("factor", 16.0) + low_freq_factor = rope_params.get("low_freq_factor", 1.0) + high_freq_factor = rope_params.get("high_freq_factor", 4.0) old_context_len = self.hparams.get("original_max_position_embeddings", 8192) low_freq_wavelen = old_context_len / low_freq_factor @@ -8660,13 +8545,6 @@ class BailingMoeModel(TextModel): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(rope_dim) - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) - else: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) self.gguf_writer.add_vocab_size(hparams["vocab_size"]) self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) @@ -8773,13 +8651,6 @@ class BailingMoeV2Model(TextModel): rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) - else: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) self.gguf_writer.add_vocab_size(hparams["vocab_size"]) self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) @@ -8858,13 +8729,6 @@ class GroveMoeModel(TextModel): self.gguf_writer.add_experts_per_group(2) # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376 self.gguf_writer.add_expert_group_scale(0.05) - # YaRN is not enabled by default - # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) _experts: list[dict[str, Tensor]] | None = None _chunk_experts: list[dict[str, Tensor]] | None = None @@ -9007,6 +8871,63 @@ class UltravoxModel(TextModel): raise NotImplementedError("Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument") +@ModelBase.register("GlmasrModel") +class GlmASRWhisperEncoderModel(MmprojModel): + has_vision_encoder = False + has_audio_encoder = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if "hidden_size" not in self.hparams and "intermediate_size" not in self.hparams: + self.hparams["hidden_size"] = self.hparams["d_model"] + self.hparams["intermediate_size"] = self.hparams["encoder_ffn_dim"] + self.hparams["num_attention_heads"] = self.hparams["encoder_attention_heads"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLMA) + self.gguf_writer.add_audio_num_mel_bins(self.hparams["num_mel_bins"]) + self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) + self.gguf_writer.add_audio_stack_factor(self.global_config["merge_factor"]) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + if ".conv" in name and ".weight" in name: + return gguf.GGMLQuantizationType.F16 + return super().tensor_force_quant(name, new_name, bid, n_dims) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if name.startswith("model.") or name.startswith("lm_head."): + # skip language model tensors + return [] + + if name.startswith("audio_encoder.whisper."): + name = name.replace("audio_encoder.whisper.","audio_tower.") + if "audio_encoder.layer_norm." in name or "audio_encoder.proj." in name: + name = name.replace("audio_encoder.", "audio_encoder.adapting.") + + if name.startswith("audio_encoder.audio_bos_eos_token."): + return [(self.map_tensor_name("model.vision.boi"), data_torch[0]), (self.map_tensor_name("model.vision.eoi"), data_torch[1])] + + if name.startswith("audio_encoder.adapting."): + name = name.replace("audio_encoder.adapting.","audio.multi_modal_projector.") + if ".layer_norm." in name: + name = name.replace(".layer_norm.", ".ln_pre.") + if ".0." in name: + name = name.replace(".0.", ".linear_1.") + if ".2." in name: + name = name.replace(".2.", ".linear_2.") + if ".proj." in name: + return [] + + if "conv1.bias" in name or "conv2.bias" in name: + # transpose conv1 and conv2 bias + data_torch = data_torch.unsqueeze(-1) + + return [(self.map_tensor_name(name), data_torch)] + + @ModelBase.register("Qwen2AudioForConditionalGeneration") class WhisperEncoderModel(MmprojModel): has_vision_encoder = False # no vision encoder @@ -9174,7 +9095,7 @@ class FalconH1Model(Mamba2Model): assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}" # Add any other Falcon Mamba2 specific configuration - self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) + self.gguf_writer.add_rope_freq_base(self.rope_parameters["rope_theta"]) @ModelBase.register("HunYuanMoEV1ForCausalLM") @@ -9252,12 +9173,11 @@ class HunYuanMoEModel(TextModel): self.gguf_writer.add_expert_shared_count(moe_shared_expert[0]) # Rope - rope_scaling = hparams.get("rope_scaling", {}) - if rope_scaling.get("type") == "dynamic": + if self.rope_parameters.get("rope_type") == "dynamic": # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf) - alpha = rope_scaling.get("alpha", 1000) - base = hparams.get("rope_theta", 10000.0) + alpha = self.rope_parameters.get("alpha", 1000) + base = self.rope_parameters.get("rope_theta", 10000.0) dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128 scaled_base = base * (alpha ** (dim / (dim - 2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251 self.gguf_writer.add_rope_freq_base(scaled_base) @@ -9452,12 +9372,11 @@ class HunYuanModel(TextModel): hparams = self.hparams # Rope - rope_scaling = hparams.get("rope_scaling", {}) - if rope_scaling.get("type") == "dynamic": + if self.rope_parameters.get("rope_type") == "dynamic": # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf) - alpha = rope_scaling.get("alpha", 50) - base = hparams.get("rope_theta", 10000.0) + alpha = self.rope_parameters.get("alpha", 50) + base = self.rope_parameters.get("rope_theta", 10000.0) dim = hparams["head_dim"] scaled_base = base * (alpha ** (dim / (dim - 2))) self.gguf_writer.add_rope_freq_base(scaled_base) @@ -9608,13 +9527,6 @@ class GptOssModel(TextModel): self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size"]) - rope_scaling = self.hparams.get("rope_scaling") or {} - rope_type = rope_scaling.get("rope_type", rope_scaling.get("type")) - assert rope_type == "yarn", f"GPT-OSS only supports yarn rope scaling, got {rope_type}" - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling.get("original_max_position_embeddings", 4096)) - @ModelBase.register("Lfm2ForCausalLM", "LFM2ForCausalLM") class LFM2Model(TextModel): @@ -9787,13 +9699,6 @@ class SmallThinkerModel(TextModel): self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) else: self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - # YaRN is not enabled by default - # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) sliding_window_layout = self.hparams.get("sliding_window_layout") if sliding_window_layout: @@ -10041,6 +9946,10 @@ class MistralMoeModel(DeepseekV2Model): MistralModel.set_mistral_config(self.gguf_writer, self.hparams) yarn_params = self.hparams["yarn"] self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"]) + + # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul + # ref https://github.com/ggml-org/llama.cpp/pull/17945 self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1 def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index fe91b308c..81111e81b 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -131,10 +131,10 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // load the model - common_init_result llama_init = common_init_from_params(params); + auto llama_init = common_init_from_params(params); - llama_model * model = llama_init.model.get(); - llama_context * ctx = llama_init.context.get(); + auto * model = llama_init->model(); + auto * ctx = llama_init->context(); if (model == NULL) { LOG_ERR("%s: unable to load model\n", __func__); diff --git a/examples/model-conversion/scripts/utils/__init__.py b/examples/model-conversion/scripts/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/model-conversion/scripts/utils/common.py b/examples/model-conversion/scripts/utils/common.py new file mode 100644 index 000000000..945f9a1a1 --- /dev/null +++ b/examples/model-conversion/scripts/utils/common.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 + +import os +import sys + +def get_model_name_from_env_path(env_path_name): + model_path = os.getenv(env_path_name) + if not model_path: + print(f"Error: {env_path_name} environment variable not set") + sys.exit(1) + + if not os.path.exists(model_path): + print(f"Error: Model file not found: {model_path}") + sys.exit(1) + + name = os.path.basename(os.path.normpath(model_path)) + if name.endswith(".gguf"): + name = name[:-5] + + return name diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index 9edd48513..4f3b99c8d 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -99,6 +99,7 @@ extern "C" { GGML_BACKEND_API int ggml_cpu_has_sme (void); // other GGML_BACKEND_API int ggml_cpu_has_riscv_v (void); + GGML_BACKEND_API int ggml_cpu_get_rvv_vlen (void); // risc-v vector length in bytes GGML_BACKEND_API int ggml_cpu_has_vsx (void); GGML_BACKEND_API int ggml_cpu_has_vxe (void); GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void); diff --git a/ggml/src/ggml-cpu/arch/arm/repack.cpp b/ggml/src/ggml-cpu/arch/arm/repack.cpp index 683ed8d2d..fb7f074a8 100644 --- a/ggml/src/ggml-cpu/arch/arm/repack.cpp +++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp @@ -24,6 +24,7 @@ #define UNUSED GGML_UNUSED +#if defined(__aarch64__) && defined(__ARM_NEON) && (defined(__ARM_FEATURE_MATMUL_INT8) || defined(__ARM_FEATURE_DOTPROD)) static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in, int16x8_t * out_mins, int8_t * out_scales) { @@ -46,6 +47,7 @@ static inline void decode_q4_Kx8_scales_mins(const uint8_t * scales_in, scales_u32[1] = (sm[2] & kmask2) | (((sm[0] >> 6) & kmask3) << 4); memcpy(out_scales, scales_u32, 8); } +#endif void ggml_quantize_mat_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { assert(QK8_0 == 32); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 2f3205fb9..fc7543ac9 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -85,6 +85,11 @@ struct ggml_arm_arch_features_type { } ggml_arm_arch_features = { 0 }; #endif +#if defined(__riscv) +struct ggml_riscv_arch_features_type { + int rvv_vlen; +} ggml_riscv_arch_features = { 0 }; +#endif #if defined(_WIN32) @@ -708,6 +713,15 @@ static void ggml_init_arm_arch_features(void) {} #endif #endif // __ARM_ARCH +#if defined(__riscv) && defined(__riscv_v_intrinsic) +#include +static void ggml_init_riscv_arch_features(void) { + ggml_riscv_arch_features.rvv_vlen = __riscv_vlenb(); +} +#else +static void ggml_init_riscv_arch_features(void) {} +#endif + struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) { GGML_ASSERT(!ggml_get_no_alloc(ctx)); @@ -4325,6 +4339,14 @@ int ggml_cpu_has_riscv_v(void) { #endif } +int ggml_cpu_get_rvv_vlen(void) { +#if defined(__riscv) && defined(__riscv_v_intrinsic) + return ggml_riscv_arch_features.rvv_vlen; +#else + return 0; +#endif +} + int ggml_cpu_has_f16c(void) { #if defined(__F16C__) return 1; @@ -4494,6 +4516,10 @@ void ggml_cpu_init(void) { ggml_cl_init(); #endif +#if defined(__riscv) + ggml_init_riscv_arch_features(); +#endif + is_first_call = false; } diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 0320bf4dd..db325f515 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -583,6 +583,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_riscv_v()) { features.push_back({ "RISCV_V", "1" }); } + if (ggml_cpu_get_rvv_vlen() > 0) { + static std::string rvv_vlen = std::to_string(ggml_cpu_get_rvv_vlen()); + features.push_back({ "RVV_VLEN", rvv_vlen.c_str() }); + } if (ggml_cpu_has_vsx()) { features.push_back({ "VSX", "1" }); } diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 9726bda58..26747b838 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -2056,7 +2056,8 @@ static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(cons #endif if (cur->type == GGML_TYPE_Q4_0) { - if ((ggml_cpu_has_avx2() && permit_repack) || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) { + if ((ggml_cpu_has_avx2() && permit_repack) || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) + || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) { if (cur->ne[1] % 8 == 0) { return &q4_0_8x8_q8_0; } diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index 2750117aa..8dc82a9d3 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -642,8 +642,8 @@ static __global__ void flash_attn_stream_k_fixup( const int iter_k = (ne11 + (nbatch_fa - 1)) / nbatch_fa; const int iter_j = (ne01 + (ncols1 - 1)) / ncols1; - const int kbc0 = (bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; - const int kbc0_stop = (bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; + const int kbc0 = int64_t(bidx0 + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; + const int kbc0_stop = int64_t(bidx0 + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; const bool did_not_have_any_data = kbc0 == kbc0_stop; const bool wrote_beginning_of_tile = kbc0 % iter_k == 0; @@ -679,7 +679,7 @@ static __global__ void flash_attn_stream_k_fixup( int bidx = bidx0 - 1; int kbc_stop = kbc0; while(true) { - const int kbc = bidx*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; + const int kbc = int64_t(bidx)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; if (kbc == kbc_stop) { // Did not have any data. bidx--; kbc_stop = kbc; diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh index d51537f7d..7bd1044c1 100644 --- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh @@ -1380,8 +1380,8 @@ static __global__ void flash_attn_ext_f16( const int iter_j = (ne01.z + (ncols1 - 1)) / ncols1; // kbc == k block continuous, current index in continuous ijk space. - int kbc = (blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; - const int kbc_stop = (blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; + int kbc = int64_t(blockIdx.x + 0)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; + const int kbc_stop = int64_t(blockIdx.x + 1)*(iter_k*iter_j*(ne02/ncols2)*ne03) / gridDim.x; // If the seams of 2 CUDA blocks fall within an output tile their results need to be combined. // For this we need to track both the block that starts the tile (needs_fixup) and the block that finishes the tile (is_fixup). @@ -1401,7 +1401,7 @@ static __global__ void flash_attn_ext_f16( const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02* head0); const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio)); const half * mask_h = ncols2 == 1 && !mask ? nullptr : - (const half *) (mask + nb33*(sequence % ne33)); + (const half *) (mask + nb33*(sequence % ne33)); float2 * dstk = ((float2 *) dst) + (sequence*ne01.z*ne02 + head0) * (DV/2); const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio)); diff --git a/ggml/src/ggml-sycl/add-id.cpp b/ggml/src/ggml-sycl/add-id.cpp new file mode 100644 index 000000000..00c073cf9 --- /dev/null +++ b/ggml/src/ggml-sycl/add-id.cpp @@ -0,0 +1,77 @@ +#include +#include "common.hpp" +#include "add-id.hpp" + +static void add_id_kernel( + const float* src0, + const float* src1, + const int32_t* src2, + float* dst, + int64_t ne0, + int64_t ne1, + size_t nb01, + size_t nb02, + size_t nb11, + size_t nb21, + sycl::nd_item<3> item_ct1) { + const int64_t i1 = item_ct1.get_group(2); + const int64_t i2 = item_ct1.get_group(1); + + const int i11 = + *(const int32_t*)((const char*)src2 + i1 * sizeof(int32_t) + i2 * nb21); + + const size_t nb1 = ne0 * sizeof(float); + const size_t nb2 = ne1 * nb1; + + float* dst_row = (float*)((char*)dst + i1 * nb1 + i2 * nb2); + const float* src0_row = + (const float*)((const char*)src0 + i1 * nb01 + i2 * nb02); + const float* src1_row = (const float*)((const char*)src1 + i11 * nb11); + + for (int64_t i0 = item_ct1.get_local_id(2); i0 < ne0; + i0 += item_ct1.get_local_range(2)) { + dst_row[i0] = src0_row[i0] + src1_row[i0]; + } +} + +void ggml_sycl_add_id(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { + const ggml_tensor* src0 = dst->src[0]; + const ggml_tensor* src1 = dst->src[1]; + const ggml_tensor* src2 = dst->src[2]; + + GGML_TENSOR_TERNARY_OP_LOCALS + + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(src2->type == GGML_TYPE_I32); + + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + GGML_ASSERT(nb20 == sizeof(int32_t)); + + const float* src0_d = (const float*)src0->data; + const float* src1_d = (const float*)src1->data; + const int32_t* src2_d = (const int32_t*)src2->data; + float* dst_d = (float*)dst->data; + + int threads = std::min((int)ne00, 768); // cols + ctx.stream()->parallel_for( + sycl::nd_range<3>( + sycl::range<3>(1, ne02, ne01) * sycl::range<3>(1, 1, threads), + sycl::range<3>(1, 1, threads)), + [=](sycl::nd_item<3> item_ct1) { + add_id_kernel( + src0_d, + src1_d, + src2_d, + dst_d, + ne0, + ne1, + nb01, + nb02, + nb11, + nb21, + item_ct1); + }); +} diff --git a/ggml/src/ggml-sycl/add-id.hpp b/ggml/src/ggml-sycl/add-id.hpp new file mode 100644 index 000000000..e1b09ee8c --- /dev/null +++ b/ggml/src/ggml-sycl/add-id.hpp @@ -0,0 +1,8 @@ +#ifndef GGML_SYCL_ADD_ID_HPP +#define GGML_SYCL_ADD_ID_HPP + +#include "common.hpp" + +void ggml_sycl_add_id(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_ADD_ID_HPP diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 006b77071..0ed98ec0e 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -675,6 +675,7 @@ struct vk_device_struct { vk_pipeline pipeline_cos_f32; vk_pipeline pipeline_log[2]; vk_pipeline pipeline_tri[2]; + vk_pipeline pipeline_diag[2]; vk_pipeline pipeline_clamp_f32; vk_pipeline pipeline_pad_f32; vk_pipeline pipeline_roll_f32; @@ -738,6 +739,11 @@ struct vk_device_struct { vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16; vk_pipeline pipeline_soft_max_f32_wg512, pipeline_soft_max_f32_f16_wg512; vk_pipeline pipeline_soft_max_back_f32; + + vk_pipeline pipeline_soft_max_large1_f32, pipeline_soft_max_large1_f32_f16; + vk_pipeline pipeline_soft_max_large2_f32, pipeline_soft_max_large2_f32_f16; + vk_pipeline pipeline_soft_max_large3_f32, pipeline_soft_max_large3_f32_f16; + vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16, pipeline_rope_norm_f32_f16; vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16, pipeline_rope_neox_f32_f16; vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16; @@ -773,7 +779,8 @@ struct vk_device_struct { vk_pipeline pipeline_flash_attn_split_k_reduce; - vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT]; + // [2] is for whether to take n_experts from spec constant (0) or push constant (1) + vk_pipeline pipeline_topk_moe[num_topk_moe_pipelines][TOPK_MOE_COUNT][2]; std::vector all_pipelines; @@ -1165,6 +1172,7 @@ static_assert(sizeof(vk_op_multi_add_push_constants) <= 256); struct vk_op_topk_moe_push_constants { uint32_t n_rows; + uint32_t n_experts_push; uint32_t n_expert_used; float clamp_min; float clamp_max; @@ -3746,6 +3754,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_XS], "get_rows_iq4_xs", get_rows_iq4_xs_len, get_rows_iq4_xs_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl", get_rows_iq4_nl_len, get_rows_iq4_nl_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_MXFP4], "get_rows_mxfp4", get_rows_mxfp4_len, get_rows_mxfp4_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows[GGML_TYPE_I32], "get_rows_i32", get_rows_i32_len, get_rows_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); @@ -3933,6 +3942,9 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_tri[0], "tri_f32", tri_f32_len, tri_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_tri[1], "tri_f16", tri_f16_len, tri_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_diag[0], "diag_f32", diag_f32_len, diag_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_diag[1], "diag_f16", diag_f16_len, diag_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_pad_push_constants), {512, 1, 1}, {}, 1); @@ -4012,6 +4024,13 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_soft_max_f32_f16_wg512, "soft_max_f32_f16_wg512", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 512 }, 1); ggml_vk_create_pipeline(device, device->pipeline_soft_max_back_f32, "soft_max_back_f32", soft_max_back_f32_len, soft_max_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32, "soft_max_large1_f32", soft_max_large1_f32_len, soft_max_large1_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32, "soft_max_large2_f32", soft_max_large2_f32_len, soft_max_large2_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32, "soft_max_large3_f32", soft_max_large3_f32_len, soft_max_large3_f32_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large1_f32_f16, "soft_max_large1_f32_f16", soft_max_large1_f32_f16_len, soft_max_large1_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large2_f32_f16, "soft_max_large2_f32_f16", soft_max_large2_f32_f16_len, soft_max_large2_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_soft_max_large3_f32_f16, "soft_max_large3_f32_f16", soft_max_large3_f32_f16_len, soft_max_large3_f32_f16_data, "main", 6, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, { 128, 4 }, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rope_multi_f32, "rope_multi_f32", rope_multi_f32_len, rope_multi_f32_data, "main", 5, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); @@ -4220,10 +4239,12 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f16_f32, "conv2d_dw_whcn_f16_f32", conv2d_dw_whcn_f16_f32_len, conv2d_dw_whcn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); - for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) { - ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); - ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); - ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); + for (uint32_t use_push = 0; use_push < 2; ++use_push) { + for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) { + ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX][use_push], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); + ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM][use_push], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); + ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX][use_push], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<subgroup_size); + } } for (auto &c : compiles) { @@ -8304,6 +8325,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const switch (op) { case GGML_OP_GET_ROWS: GGML_ASSERT(src1->type == GGML_TYPE_I32); + if (src0->type == GGML_TYPE_I32) { + // i32 src only supports i32 result + GGML_ASSERT(dst->type == GGML_TYPE_I32); + return ctx->device->pipeline_get_rows[src0->type]; + } if (dst->type == GGML_TYPE_F16) { return ctx->device->pipeline_get_rows[src0->type]; } @@ -8430,6 +8456,12 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_tri[dst->type == GGML_TYPE_F16]; } return nullptr; + case GGML_OP_DIAG: + if (src0->type == dst->type && + (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) { + return ctx->device->pipeline_diag[dst->type == GGML_TYPE_F16]; + } + return nullptr; case GGML_OP_CLAMP: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_clamp_f32; @@ -8584,7 +8616,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const uint32_t idx = (uint32_t)ceilf(log2f(float(dst->ne[0]))); GGML_ASSERT(idx < num_topk_moe_pipelines); topk_moe_mode mode = ggml_vk_num_additional_ops_to_topk_moe_mode(ctx->num_additional_fused_ops); - return ctx->device->pipeline_topk_moe[idx][mode]; + // use n_experts from push constant if it's not equal to the power of two spec constant + bool use_push = dst->ne[0] != (1u << idx); + return ctx->device->pipeline_topk_moe[idx][mode][use_push]; } if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) { @@ -9121,6 +9155,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co case GGML_OP_COS: case GGML_OP_LOG: case GGML_OP_TRI: + case GGML_OP_DIAG: case GGML_OP_CLAMP: case GGML_OP_PAD: case GGML_OP_ROLL: @@ -9808,6 +9843,12 @@ static void ggml_vk_tri(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_TRI, std::move(p)); } +static void ggml_vk_diag(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst)); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_DIAG, std::move(p)); +} + static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); p.param1 = ggml_get_op_params_f32(dst, 0); @@ -10141,7 +10182,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - ggml_vk_op_f32(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, { + vk_op_soft_max_push_constants pc { ncols, src1 != nullptr ? nrows_y : (uint32_t)0, (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], @@ -10152,7 +10193,55 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context& subctx, n_head_log2, nrows_x, src2 != nullptr - }); + }; + + if (ncols <= 16384) { + ggml_vk_op_f32(ctx, subctx, src0, src1, src2, nullptr, dst, GGML_OP_SOFT_MAX, std::move(pc)); + } else { + + vk_subbuffer buf_a = ggml_vk_tensor_subbuffer(ctx, src0); + vk_subbuffer buf_b = src1 ? ggml_vk_tensor_subbuffer(ctx, src1) : buf_a; + vk_subbuffer buf_c = src2 ? ggml_vk_tensor_subbuffer(ctx, src2) : buf_a; + vk_subbuffer buf_d = ggml_vk_tensor_subbuffer(ctx, dst); + + uint32_t elems_per_wg = 128 * 4; + uint32_t num_wgs = CEIL_DIV(ncols, elems_per_wg); + size_t tmp_size = num_wgs * nrows_x * sizeof(float); + + if (ctx->prealloc_size_x < tmp_size) { + ctx->prealloc_size_x = tmp_size; + ggml_vk_preallocate_buffers(ctx, subctx); + } + if (ctx->prealloc_size_y < tmp_size) { + ctx->prealloc_size_y = tmp_size; + ggml_vk_preallocate_buffers(ctx, subctx); + } + if (ctx->prealloc_x_need_sync || ctx->prealloc_y_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + + vk_subbuffer buf_x = { ctx->prealloc_x, 0, tmp_size }; + vk_subbuffer buf_y = { ctx->prealloc_y, 0, tmp_size }; + + std::array elements = { num_wgs, nrows_x, 1 }; + + vk_pipeline pipeline1 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large1_f32_f16 : ctx->device->pipeline_soft_max_large1_f32; + vk_pipeline pipeline2 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large2_f32_f16 : ctx->device->pipeline_soft_max_large2_f32; + vk_pipeline pipeline3 = src1 && src1->type == GGML_TYPE_F16 ? ctx->device->pipeline_soft_max_large3_f32_f16 : ctx->device->pipeline_soft_max_large3_f32; + + ggml_pipeline_request_descriptor_sets(ctx, pipeline1, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline2, 1); + ggml_pipeline_request_descriptor_sets(ctx, pipeline3, 1); + + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline1, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements); + ggml_vk_sync_buffers(ctx, subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline2, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements); + ggml_vk_sync_buffers(ctx, subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline3, { buf_a, buf_b, buf_c, buf_d, buf_x, buf_y }, pc, elements); + + ctx->prealloc_x_need_sync = true; + ctx->prealloc_y_need_sync = true; + } } static void ggml_vk_soft_max_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -10188,6 +10277,7 @@ static void ggml_vk_topk_moe(ggml_backend_vk_context * ctx, vk_context& subctx, vk_op_topk_moe_push_constants pc {}; pc.n_rows = n_rows; + pc.n_experts_push = n_experts; pc.n_expert_used = n_expert_used; if (mode == TOPK_MOE_EARLY_SOFTMAX_NORM) { ggml_tensor * clamp = cgraph->nodes[node_idx + 7]; @@ -11887,6 +11977,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_TRI: ggml_vk_tri(ctx, compute_ctx, src0, node); + break; + case GGML_OP_DIAG: + ggml_vk_diag(ctx, compute_ctx, src0, node); + break; case GGML_OP_CLAMP: ggml_vk_clamp(ctx, compute_ctx, src0, node); @@ -12862,8 +12956,7 @@ static bool ggml_vk_can_fuse_topk_moe(ggml_backend_vk_context * ctx, const struc } const int n_expert = softmax->ne[0]; - // n_expert must be a power of 2 - if (!is_pow2(n_expert) || n_expert > (1 << (num_topk_moe_pipelines-1))) { + if (n_expert > (1 << (num_topk_moe_pipelines-1))) { return false; } @@ -13907,6 +14000,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_NL: case GGML_TYPE_MXFP4: + case GGML_TYPE_I32: return true; default: return false; @@ -14031,6 +14125,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; case GGML_OP_LOG: case GGML_OP_TRI: + case GGML_OP_DIAG: return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && op->type == op->src[0]->type; case GGML_OP_ARGSORT: @@ -14621,6 +14716,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * tensor_clone = ggml_log(ggml_ctx, src_clone[0]); } else if (tensor->op == GGML_OP_TRI) { tensor_clone = ggml_tri(ggml_ctx, src_clone[0], ggml_get_op_params_i32(tensor, 0)); + } else if (tensor->op == GGML_OP_DIAG) { + tensor_clone = ggml_diag(ggml_ctx, src_clone[0]); } else if (tensor->op == GGML_OP_CLAMP) { const float * params = (const float *)tensor->op_params; tensor_clone = ggml_clamp(ggml_ctx, src_clone[0], params[0], params[1]); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp b/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp new file mode 100644 index 000000000..cd3f42f49 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp @@ -0,0 +1,29 @@ +#version 450 + +#include "rte.glsl" +#include "types.glsl" +#include "generic_unary_head.glsl" + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +void main() { + const uint idx = get_idx(); + + if (idx >= p.ne) { + return; + } + + const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); + const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; + const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L); + const uint i12_offset = i12*p.ne11*p.ne10; + const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L); + const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; + + if (i10 == i11) { + const float val = float(data_a[get_aoffset() + i13*p.nb03 + i12*p.nb02 + 0*p.nb01 + i10*p.nb00]); + data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val); + } else { + data_d[get_doffset() + dst_idx(idx)] = D_TYPE(0); + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp index 4bef48b00..0379e5d50 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp @@ -256,6 +256,9 @@ void main() { barrier(); } + // prevent race on tmpsh + barrier(); + // reduce across threads [[unroll]] for (uint32_t r = 0; r < Br; ++r) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp index cd82e4abf..c995ab140 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp @@ -302,6 +302,9 @@ void main() { barrier(); } + // prevent race on tmpsh + barrier(); + // reduce across threads float rowmaxf[rows_per_thread], eMf[rows_per_thread], Moldf[rows_per_thread]; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp index 76d83041c..e88bdd057 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp @@ -26,9 +26,9 @@ void main() { const uint d_offset = get_doffset() + i10*p.nb21 + i11*p.nb22 + i12*p.nb23; #if defined(DATA_A_BF16) - FLOAT_TYPE v = FLOAT_TYPE(bf16_to_fp32(data_a[a_offset + i00])); + TEMP_TYPE v = TEMP_TYPE(bf16_to_fp32(data_a[a_offset + i00])); #else - FLOAT_TYPE v = FLOAT_TYPE(data_a[a_offset + i00]); + TEMP_TYPE v = TEMP_TYPE(data_a[a_offset + i00]); #endif #ifndef OPTIMIZATION_ERROR_WORKAROUND data_d[d_offset + i00] = D_TYPE(v); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp index 0b74b3321..c5f5e9cbb 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp @@ -7,34 +7,50 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; FLOAT_TYPE temp[NUM_COLS][NUM_ROWS]; -void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i, const uint num_blocks_per_row, const uint first_row, const uint num_rows) { - const uint y_idx = i * QUANT_K + 32 * ib32; - - uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i; - [[unroll]] for (uint n = 0; n < num_rows; ++n) { - const float d = float(data_a[ibi].d); - const uint qh = data_a[ibi].qh[ib32]; - const float dl = d * float(2 * bitfieldExtract(qh, 12, 3) + 1); - const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA; - +void calc_superblock(const uint a_offset, const uint b_offset, const uint ib32, const uint i, + const uint num_blocks_per_row, const uint first_row, const uint num_rows) { + const uint y_idx_base = i * QUANT_K + 32 * ib32; + [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { + const uint base_b_idx = (j * p.batch_stride_b + b_offset + y_idx_base) / 4; [[unroll]] for (uint l = 0; l < 4; ++l) { - const uint qs = data_a[ibi].qs[4 * ib32 + l]; - const uint idxhi = bitfieldExtract(qh, 3 * int(l), 3); - const int16_t grid = int16_t(iq1s_grid[qs | (idxhi << 8)]); + const vec4 b_val_0 = vec4(data_b_v4[base_b_idx + 2 * l]); + const vec4 b_val_1 = vec4(data_b_v4[base_b_idx + 2 * l + 1]); - [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { - vec4 b0 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 0]); - vec4 b4 = vec4(data_b_v4[(j*p.batch_stride_b + b_offset + y_idx) / 4 + 2*l + 1]); + // index for data_a + uint ibi = a_offset / QUANT_K + first_row * num_blocks_per_row + i; + + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + const float d = float(data_a[ibi].d); + const uint qh = data_a[ibi].qh[ib32]; + + const float dl = d * float(2 * bitfieldExtract(qh, 12, 3) + 1); + const uint qs = data_a[ibi].qs[4 * ib32 + l]; + const uint idxhi = bitfieldExtract(qh, 3 * int(l), 3); + const uint16_t grid = uint16_t(iq1s_grid[qs | (idxhi << 8)]); + + const float delta_val = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA; + const vec4 delta_v = vec4(delta_val); + const vec4 fbits0 = vec4( + float(bitfieldExtract(grid, 0, 2)), + float(bitfieldExtract(grid, 2, 2)), + float(bitfieldExtract(grid, 4, 2)), + float(bitfieldExtract(grid, 6, 2)) + ); + const vec4 fbits1 = vec4( + float(bitfieldExtract(grid, 8, 2)), + float(bitfieldExtract(grid, 10, 2)), + float(bitfieldExtract(grid, 12, 2)), + float(bitfieldExtract(grid, 14, 2)) + ); + + vec4 sum_v = fma(b_val_0, fbits0 + delta_v, vec4(0.0)); + sum_v = fma(b_val_1, fbits1 + delta_v, sum_v); + FLOAT_TYPE sum = dot(sum_v, vec4(1.0)); - FLOAT_TYPE sum = FLOAT_TYPE(0.0); - [[unroll]] for (int k = 0; k < 4; ++k) { - sum = fma(FLOAT_TYPE(b0[k]), bitfieldExtract(grid, 2 * k, 2) + delta, - fma(FLOAT_TYPE(b4[k]), bitfieldExtract(grid, 8 + 2 * k, 2) + delta, sum)); - } temp[j][n] = fma(dl, sum, temp[j][n]); + ibi += num_blocks_per_row; } } - ibi += num_blocks_per_row; } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl index ee5ded2e8..58ede0440 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl @@ -244,17 +244,20 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin const uint iqs = idx % 128; // 0..127 const uint n = iqs / 64; // 0,1 - const uint b = (iqs % 64) / 32; // 0,1 + const uint b = ((iqs % 64) / 32) * 4; // 0,4 const uint is_b = (iqs % 16) / 8; // 0,1 const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6 const uint is = 8 * n + qhshift + is_b; // 0..15 - const uint qsi = n * 64 + (iqs % 32) * 2; // 0,2,4..126 - const uint qhi = n * 32 + (iqs % 16) * 2; // 0,2,4..62 + const uint qsi = n * 32 + (iqs % 32); // 0..63 + const uint qhi = n * 16 + (iqs % 16); // 0..31 const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]); - buf_a[buf_idx] = FLOAT_TYPE_VEC2(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32), - dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32)); + const uint ql = (uint(data_a_packed16[ib].ql[qsi]) >> b) & 0x0F0F; + const uint qh = (uint(data_a_packed16[ib].qh[qhi]) >> qhshift) & 0x0303; + const vec2 q = (vec2(unpack8(ql | (qh << 4)).xy) - 32) * dscale; + + buf_a[buf_idx] = FLOAT_TYPE_VEC2(q.x, q.y); #elif defined(DATA_A_IQ1_S) const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp new file mode 100644 index 000000000..39c466391 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large1.comp @@ -0,0 +1,62 @@ +#version 450 + +#include "soft_max_large_common.glsl" + +void main() { + const uint tid = gl_LocalInvocationID.x; + const uint rowx = gl_WorkGroupID.y; + const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters; + + const uint32_t i03 = rowx / (p.ne01 * p.ne02); + const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01; + const uint32_t i01 = rowx % p.ne01; + + uint rowy_start = 0; + if (p.KY > 0) { + rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13; + } + + if (rowx >= p.nrows_x) { + return; + } + + float slope = get_slope(rowx); + + // Find max + FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02]; + + [[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) { + const uint col = col0 + tid; + + FLOAT_TYPE a = FLOAT_TYPE(0); + if (col < p.KX) { + a = data_a[rowx * p.KX + col]; + } + + FLOAT_TYPE b = FLOAT_TYPE(0); + if (p.KY > 0 && col < p.KX) { + b = data_b[rowy_start + col]; + } + + FLOAT_TYPE v = a * p.scale + slope * b; + + if (col < p.KX) { + max_val = max(max_val, v); + } + } + + // reduce across the workgroup + vals[tid] = max_val; + barrier(); + [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) { + vals[tid] = max(vals[tid], vals[tid + s]); + } + barrier(); + } + + if (tid == 0) { + max_val = vals[0]; + data_m[rowx * gl_NumWorkGroups.x + gl_WorkGroupID.x] = max_val; + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp new file mode 100644 index 000000000..69524f5f7 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large2.comp @@ -0,0 +1,79 @@ +#version 450 + +#include "soft_max_large_common.glsl" + +void main() { + const uint tid = gl_LocalInvocationID.x; + const uint rowx = gl_WorkGroupID.y; + const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters; + + const uint32_t i03 = rowx / (p.ne01 * p.ne02); + const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01; + const uint32_t i01 = rowx % p.ne01; + + uint rowy_start = 0; + if (p.KY > 0) { + rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13; + } + + if (rowx >= p.nrows_x) { + return; + } + + float slope = get_slope(rowx); + + // Find max + FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02]; + + [[unroll]] for (uint i = 0; i < gl_NumWorkGroups.x; i += BLOCK_SIZE) { + if (i + tid < gl_NumWorkGroups.x) { + max_val = max(max_val, data_m[rowx * gl_NumWorkGroups.x + i + tid]); + } + } + + // reduce across the workgroup + vals[tid] = max_val; + barrier(); + [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) { + vals[tid] = max(max_val, vals[tid + s]); + } + barrier(); + } + + max_val = vals[0]; + barrier(); + + FLOAT_TYPE sum = FLOAT_TYPE(0.0f); + + // Compute sum{exp(x - max)} + [[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) { + const uint col = col0 + tid; + + if (col >= p.KX) { + break; + } + + // compute exp(a*scale+b*slope), add it to sum + const uint i = rowx * p.KX + col; + FLOAT_TYPE val; + val = exp(FLOAT_TYPE(data_a[i]) * p.scale + (p.KY > 0 ? slope * FLOAT_TYPE(data_b[rowy_start + col]) : FLOAT_TYPE(0.0f)) - max_val); + sum += val; + data_d[i] = D_TYPE(val); + } + + // reduce across the workgroup + vals[tid] = sum; + barrier(); + [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) { + vals[tid] += vals[tid + s]; + } + barrier(); + } + + if (tid == 0) { + sum = vals[0]; + data_s[rowx * gl_NumWorkGroups.x + gl_WorkGroupID.x] = sum; + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp new file mode 100644 index 000000000..06efd7d9f --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large3.comp @@ -0,0 +1,65 @@ +#version 450 + +#include "soft_max_large_common.glsl" + +shared FLOAT_TYPE sumsh[BLOCK_SIZE]; + +void main() { + const uint tid = gl_LocalInvocationID.x; + const uint rowx = gl_WorkGroupID.y; + const uint wg_start = gl_WorkGroupID.x * BLOCK_SIZE * num_iters; + + const uint32_t i03 = rowx / (p.ne01 * p.ne02); + const uint32_t i02 = (rowx - i03 * p.ne01 * p.ne02) / p.ne01; + const uint32_t i01 = rowx % p.ne01; + + uint rowy_start = 0; + if (p.KY > 0) { + rowy_start = i01 * p.nb11 + (i02 % p.ne12) * p.nb12 + (i03 % p.ne13) * p.nb13; + } + + if (rowx >= p.nrows_x) { + return; + } + + FLOAT_TYPE max_val = p.has_sinks == 0 ? uintBitsToFloat(0xFF800000) : data_c[i02]; + FLOAT_TYPE sum = FLOAT_TYPE(0.0f); + + [[unroll]] for (uint i = 0; i < gl_NumWorkGroups.x; i += BLOCK_SIZE) { + if (i + tid < gl_NumWorkGroups.x) { + max_val = max(max_val, data_m[rowx * gl_NumWorkGroups.x + i + tid]); + sum += data_s[rowx * gl_NumWorkGroups.x + i + tid]; + } + } + + // reduce across the workgroup + vals[tid] = max_val; + sumsh[tid] = sum; + barrier(); + [[unroll]] for (uint s = BLOCK_SIZE / 2; s > 0; s >>= 1) { + if (tid < s) { + vals[tid] = max(max_val, vals[tid + s]); + sumsh[tid] += sumsh[tid + s]; + } + barrier(); + } + + max_val = vals[0]; + sum = sumsh[0]; + + if (p.has_sinks != 0) { + sum += FLOAT_TYPE(exp(FLOAT_TYPE(data_c[i02]) - max_val)); + } + + FLOAT_TYPE rcpdivisor = 1.0/sum; + + [[unroll]] for (uint col0 = wg_start, idx = 0; idx < num_iters; col0 += BLOCK_SIZE, ++idx) { + const uint col = col0 + tid; + + if (col >= p.KX) { + continue; + } + + data_d[rowx*p.KX + col] *= D_TYPE(rcpdivisor); + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl new file mode 100644 index 000000000..6636d1f8d --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_large_common.glsl @@ -0,0 +1,53 @@ +#extension GL_EXT_control_flow_attributes : enable + +layout (push_constant) uniform parameter +{ + uint KX; + uint KY; + uint ne00; + uint ne01; + uint ne02; + uint ne12; + uint ne13; + uint nb11; + uint nb12; + uint nb13; + float scale; + float max_bias; + float m0; + float m1; + uint n_head_log2; + uint nrows_x; + uint has_sinks; +} p; + +#include "types.glsl" + +layout(constant_id = 0) const uint BLOCK_SIZE = 128; +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; +layout(constant_id = 1) const uint num_iters = 4; + +layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; +layout (binding = 1) readonly buffer Y {B_TYPE data_b[];}; +layout (binding = 2) readonly buffer Z {float data_c[];}; +layout (binding = 3) buffer D {D_TYPE data_d[];}; +layout (binding = 4) buffer M {float data_m[];}; +layout (binding = 5) buffer S {float data_s[];}; + +shared FLOAT_TYPE vals[BLOCK_SIZE]; + +float get_slope(uint rowx) { + float slope = 1.0f; + + // ALiBi + if (p.max_bias > 0.0f) { + const uint h = (rowx / p.ne01) % p.ne02; // head index + + const float base = h < p.n_head_log2 ? p.m0 : p.m1; + const uint exp = h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1; + + slope = pow(base, exp); + } + + return slope; +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp b/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp index 5cd0785d2..b83a2b9d2 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp @@ -10,6 +10,7 @@ layout (push_constant) uniform parameter { uint n_rows; + uint n_experts_push; uint n_expert_used; float clamp_min; float clamp_max; @@ -18,11 +19,16 @@ layout (push_constant) uniform parameter layout(local_size_x_id = 0, local_size_y = 4, local_size_z = 1) in; layout(constant_id = 0) const uint WARP_SIZE = 32; -layout(constant_id = 1) const uint n_experts = 512; +layout(constant_id = 1) const uint n_experts_spec = 512; layout(constant_id = 2) const bool with_norm = true; layout(constant_id = 3) const bool late_softmax = false; +layout(constant_id = 4) const bool nexperts_use_push = false; -const uint experts_per_thread = (n_experts > WARP_SIZE) ? n_experts / WARP_SIZE : 1; +uint n_experts = nexperts_use_push ? n_experts_push : n_experts_spec; + +#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) + +const uint experts_per_thread = CEIL_DIV(n_experts_spec, WARP_SIZE); layout (binding = 0, std430) readonly buffer Logits {float logits[];}; layout (binding = 1, std430) writeonly buffer Weights {float weights[];}; @@ -94,7 +100,7 @@ void main() { } if (!late_softmax) { - softmax_warp_inplace(wt, n_experts, lane, false); + softmax_warp_inplace(wt, n_experts, lane, nexperts_use_push); } // at this point, each thread holds a portion of softmax, diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 269f32f57..83470fa1c 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -721,13 +721,15 @@ void process_shaders() { shader = (tname == "f32" || tname == "f16" || tname == "bf16") ? "get_rows.comp" : "get_rows_quant.comp"; if (tname == "f16") { - string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}})); + string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}})); } else { - string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}})); + string_to_spv("get_rows_" + tname, shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}})); } - string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}})); + string_to_spv("get_rows_" + tname + "_f32", shader, merge_maps(base_dict, {{"TEMP_TYPE", "FLOAT_TYPE"}, {data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}})); } + string_to_spv("get_rows_i32", "get_rows.comp", {{"TEMP_TYPE", "uint"}, {"A_TYPE", "uint"}, {"B_TYPE", "int"}, {"D_TYPE", "uint"}}); + string_to_spv("mul_mat_vec_p021_f16_f32_subgroup_add", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}, {"USE_SUBGROUP_ADD", "1"}}); string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}); string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"A_TYPE_VEC4", "f16vec4"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}); @@ -871,6 +873,8 @@ void process_shaders() { string_to_spv("tri_f16", "tri.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("tri_f32", "tri.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("diag_f16", "diag.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); + string_to_spv("diag_f32", "diag.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("softplus_f16", "softplus.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}}); string_to_spv("softplus_f32", "softplus.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); @@ -916,6 +920,13 @@ void process_shaders() { string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}})); string_to_spv("soft_max_back_f32", "soft_max_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("soft_max_large1_f32", "soft_max_large1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("soft_max_large2_f32", "soft_max_large2.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("soft_max_large3_f32", "soft_max_large3.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("soft_max_large1_f32_f16", "soft_max_large1.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}})); + string_to_spv("soft_max_large2_f32_f16", "soft_max_large2.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}})); + string_to_spv("soft_max_large3_f32_f16", "soft_max_large3.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}})); + string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"ROPE_D_TYPE", "float"}}); string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}}); string_to_spv("rope_norm_f16_rte", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"ROPE_D_TYPE", "float16_t"}, {"RTE16", "1"}}); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 2b8489c59..8ef4a23a1 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -3320,6 +3320,7 @@ class VisionProjectorType: ULTRAVOX = "ultravox" INTERNVL = "internvl" QWEN2A = "qwen2a" # audio + GLMA = "glma" # audio QWEN25O = "qwen2.5o" # omni VOXTRAL = "voxtral" LFM2 = "lfm2" diff --git a/scripts/compare-logprobs.py b/scripts/compare-logprobs.py new file mode 100644 index 000000000..63861dd9a --- /dev/null +++ b/scripts/compare-logprobs.py @@ -0,0 +1,281 @@ +import argparse +import requests +import json +from pathlib import Path +import logging + +logger = logging.getLogger("compare-logprobs") +logging.basicConfig(level=logging.INFO) + + +DESCRIPTION = """ +Compare logits between llama.cpp and another inference engine using OpenAI-compatible server endpoints. + +Unlike compare-logits.py, it allows dumping logits from a hosted API endpoint. Useful when it's not possible to run both models locally. + +Example usage: + Step 1: Dump logits from two different servers + python scripts/compare-logprobs.py dump logits_llama.log http://localhost:8080/v1/completions + python scripts/compare-logprobs.py dump logits_other.log http://other-engine:8000/v1/completions + + (optionally, you can add --api-key if the endpoint requires authentication) + + Step 2: Compare the dumped logits + python scripts/compare-logprobs.py compare logits_llama.log logits_other.log report.md +""" + + +def generate_input_prompt(length: int) -> list[str]: + CORPUS = """ + You are an advanced AI assistant capable of using tools to gather information, perform calculations, or execute tasks. Always think step by step before responding. If a user's query requires external data, computation, or actions beyond your internal knowledge, use the appropriate tools via function calls. + + ### Tool Call Format: + When you need to use a tool, output the call in this exact XML format. Include the opening and closing tags. Do not escape arguments; they will be parsed as plain text. + + You can make multiple calls in one go by placing them one after another. + """ + words = [w.strip() for w in CORPUS.strip().split(" ")] + words = [w for w in words if len(w) > 0] # filter out empty strings + while len(words) < length: + words += words + return words[:length] + + +def dump_logits( + endpoint: str, + output_path: Path, + input_words: list[str], + pattern: list[tuple[bool, int]], + api_key=None, +): + logger.info(f"Dumping logits to {output_path} from endpoint {endpoint}...") + words = input_words + curr_text = "" + n_total = sum(n for get, n in pattern if get) + n_done = 0 + i_cur = 0 + i_total = len(words) + with output_path.open("w") as f: + for get, n in pattern: + if not get: + # skip n words + for i in range(n): + curr_text += words.pop(0) + " " + i_cur += 1 + continue + # get n words + for i in range(n): + curr_text += words.pop(0) + " " + payload = { + "prompt": curr_text.strip(), + "temperature": 0.0, + "top_k": 1, + "max_tokens": 1, + "logprobs": 1, + "stream": False, + } + response = requests.post( + endpoint, + json=payload, + headers={"Authorization": f"Bearer {api_key}"} if api_key else {}, + ) + response.raise_for_status() + data = response.json() + data["__index"] = i_cur # add index for easier debugging later + data = json.dumps(data) + f.write(f"{data}\n") + n_done += 1 + i_cur += 1 + logger.info( + f"\n\n{data}\n\n[Step: {n_done}/{n_total} | Word: {i_cur}/{i_total}]" + ) + logger.info(f"Logits dumped to {output_path}") + + +def get_token_logprobs(data: dict): + logprobs = data["choices"][0]["logprobs"] + if "content" in logprobs: + # llama.cpp case + top = logprobs["content"][0]["top_logprobs"][0] + return top["token"], top["logprob"] + else: + # vllm case + tokens = logprobs["tokens"] + token_logprobs = logprobs["token_logprobs"] + return tokens[0], token_logprobs[0] + + +def clean_text(text: str) -> str: + return ( + "'" + + text.replace("\n", "\\n") + .replace("\t", "\\t") + .replace("\r", "\\r") + .replace("|", "\\|") + + "'" + ) + + +def compare_logits(input1: Path, input2: Path, output_path: Path): + with input1.open("r") as f1, input2.open("r") as f2, output_path.open("w") as fout: + lines1 = f1.readlines() + lines2 = f2.readlines() + + tab_header = [ + "idx", + input1.name, + "logprob_1", + input2.name, + "logprob_2", + "diff (abs)", + ] + tab_entries = [] + tab_max_widths = [len(h) for h in tab_header] + + assert len(lines1) == len( + lines2 + ), "Input files must have the same number of lines." + + fout.write("# Logits Comparison Report\n\n") + for i, (line1, line2) in enumerate(zip(lines1, lines2)): + if not line1.strip() or not line2.strip(): + continue # skip empty lines + + data1 = json.loads(line1) + data2 = json.loads(line2) + + idx1 = data1.get("__index", -1) + idx2 = data2.get("__index", -1) + if idx1 != idx2: + logger.warning( + f"Warning: Mismatched indices at line {i}: {idx1} vs {idx2}" + ) + + token1, logprob1 = get_token_logprobs(data1) + token2, logprob2 = get_token_logprobs(data2) + + token1 = clean_text(token1) + token2 = clean_text(token2) + abs_diff = abs(logprob1 - logprob2) + + tab_entries.append( + ( + str(idx1 + 1), + token1, + f"{logprob1:.4f}", + token2, + f"{logprob2:.4f}", + f"{(abs_diff):.4f}", + ) + ) + + for i in range(len(tab_entries)): + for j in range(len(tab_header)): + tab_max_widths[j] = max(tab_max_widths[j], len(tab_entries[i][j])) + + output = "" + for j in range(len(tab_header)): + output += f"| {tab_header[j]:<{tab_max_widths[j]}} " + output += "|\n" + for j in range(len(tab_header)): + output += f"|{'-' * (tab_max_widths[j] + 2)}" + output += "|\n" + for entry in tab_entries: + for j in range(len(tab_header)): + output += f"| {entry[j]:<{tab_max_widths[j]}} " + output += "|\n" + + logger.info("\n" + output) + fout.write(output) + logger.info(f"Report written to {output_path}") + + +def parse_pattern(pattern: str) -> list[tuple[bool, int]]: + parts = pattern.split(",") + result = [] + for i, part in enumerate(parts): + n = int(part) + if i % 2 == 0: + result.append((True, n)) # get n words + else: + result.append((False, n)) # skip n words + return result + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=DESCRIPTION, formatter_class=argparse.RawTextHelpFormatter + ) + subparsers = parser.add_subparsers( + dest="verb", required=True, help="action to perform" + ) + + # dump subcommand + parser_dump = subparsers.add_parser("dump", help="dump logits from an endpoint") + parser_dump.add_argument( + "output", type=Path, help="output path for dumped logits (.log)" + ) + parser_dump.add_argument( + "endpoint", type=str, help="OAI-compat /completions endpoint" + ) + parser_dump.add_argument( + "--api-key", + type=str, + default=None, + help="API key for authentication (if required)", + ) + parser_dump.add_argument( + "--file", + type=Path, + default=None, + help="File containing prompt to use instead of the default", + ) + parser_dump.add_argument( + "--pattern", + type=str, + default="10,1000,10,4000,10", + help="Pattern n_get,n_skip,... where n_get is number of words to get and n_skip is number of words to skip (num of words, NOT num of tokens)", + ) + + # compare subcommand + parser_compare = subparsers.add_parser( + "compare", help="compare two dumped logits files" + ) + parser_compare.add_argument("input1", type=Path, help="first input file (.log)") + parser_compare.add_argument("input2", type=Path, help="second input file (.log)") + parser_compare.add_argument( + "output", type=Path, help="output path for comparison report (.md)" + ) + + try: + return parser.parse_args() + except Exception as e: + parser.print_help() + raise e + + +def main(): + args = parse_args() + + if args.verb == "dump": + pattern = parse_pattern(args.pattern) + input_length = sum(n for _, n in pattern) + input_words = generate_input_prompt(input_length) + if args.file is not None: + with args.file.open("r") as f: + input_words = f.read().strip().split(" ") + if input_length < sum(n for _, n in pattern): + raise ValueError( + f"Input file has only {input_length} words, but pattern requires at least {input_length} words." + ) + input_length = len(input_words) + logger.info(f"Using {input_length} words") + dump_logits(args.endpoint, args.output, input_words, pattern, args.api_key) + elif args.verb == "compare": + compare_logits(args.input1, args.input2, args.output) + else: + raise ValueError(f"Unknown verb: {args.verb}") + + +if __name__ == "__main__": + main() diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f186b17ee..ca495f3e5 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -9,6 +9,7 @@ #include "llama-model.h" #include +#include #include #include #include @@ -75,6 +76,43 @@ llama_context::llama_context( cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; } + if (cparams.yarn_ext_factor != 0) { + static auto get_mscale = [](float scale, float mscale) { + return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f); + }; + + const float factor = 1.0f / cparams.rope_freq_scale; + + // ref: https://github.com/huggingface/transformers/blob/6d00f6b0a5679c36510f203e4226e36f517c3032/src/transformers/modeling_rope_utils.py#L336-L348 + if (hparams.rope_yarn_log_mul != 0.0f) { + // note: here we assume `mscale == 1.0f` + // TODO: start reading the actual value of mscale and handle the case where it is not 1.0f + float mscale = 1.0f; + const float mscale_all_dims = hparams.rope_yarn_log_mul; + + // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + // special-case DEEPSEEK v2: + // https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat/blob/main/config.json#L42-L43 + if (model.arch == LLM_ARCH_DEEPSEEK2 && mscale_all_dims != 1.0f) { + mscale = mscale_all_dims; + } + + cparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims); + + LLAMA_LOG_WARN("%s: setting new yarn_attn_factor = %.4f (mscale == %.1f, mscale_all_dim = %.1f)\n", + __func__, cparams.yarn_attn_factor, mscale, mscale_all_dims); + } else { + cparams.yarn_attn_factor = get_mscale(factor, 1.0f); + } + + // when YARN is applied with yarn_ext_factor != 0.0f, we need to cancel this factor: + // https://github.com/ggml-org/llama.cpp/blob/a81a569577cc38b32558958b048228150be63eae/ggml/src/ggml-cpu/ops.cpp#L5541-L5544 + // + // ref: https://github.com/ggml-org/llama.cpp/discussions/7416 + // https://github.com/ggml-org/llama.cpp/pull/17945 + cparams.yarn_attn_factor *= 1.0f / (1.0f + 0.1f * logf(factor)); + } + cparams.yarn_attn_factor *= hparams.rope_attn_factor; if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { @@ -1328,6 +1366,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); #endif + synchronize(); buf_output = nullptr; logits = nullptr; embd = nullptr; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 6cf9a883a..8909bbfb9 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -78,7 +78,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) { for (int i = 0; i < n_tokens; ++i) { const float pos = ubatch->pos[i]; attn_scale_data[i] = std::log( - std::floor((pos + 1.0f) / n_attn_temp_floor_scale) + 1.0 + std::floor((pos + f_attn_temp_offset) / n_attn_temp_floor_scale) + 1.0 ) * f_attn_temp_scale + 1.0; } @@ -1203,7 +1203,7 @@ ggml_tensor * llm_graph_context::build_inp_pos() const { } ggml_tensor * llm_graph_context::build_inp_attn_scale() const { - auto inp = std::make_unique(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale); + auto inp = std::make_unique(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale, hparams.f_attn_temp_offset); auto & cur = inp->attn_scale; diff --git a/src/llama-graph.h b/src/llama-graph.h index d0c3934f6..e9d387bd7 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -132,8 +132,8 @@ public: // temperature tuning, used by llama4 class llm_graph_input_attn_temp : public llm_graph_input_i { public: - llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) - : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {} + llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale, float f_attn_temp_offset) + : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale), f_attn_temp_offset(f_attn_temp_offset) {} virtual ~llm_graph_input_attn_temp() = default; void set_input(const llama_ubatch * ubatch) override; @@ -142,6 +142,7 @@ public: const uint32_t n_attn_temp_floor_scale; const float f_attn_temp_scale; + const float f_attn_temp_offset; }; class llm_graph_input_pos_bucket : public llm_graph_input_i { diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 8cdbaf69f..96c9598c2 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -1,6 +1,7 @@ #include "llama-hparams.h" #include "ggml.h" + #include void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) { diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 6eff334a5..a467c64a1 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -107,6 +107,7 @@ struct llama_hparams { float rope_freq_base_train_swa; float rope_freq_scale_train; float rope_freq_scale_train_swa; + uint32_t n_ctx_orig_yarn; float rope_yarn_log_mul = 0.0f; @@ -164,6 +165,7 @@ struct llama_hparams { uint32_t n_no_rope_layer_step = 4; uint32_t n_attn_temp_floor_scale = 0; float f_attn_temp_scale = 0.0f; + float f_attn_temp_offset = 0.0f; // offset position index // gemma3n altup uint32_t n_altup = 4; // altup_num_inputs @@ -270,4 +272,3 @@ struct llama_hparams { }; static_assert(std::is_trivially_copyable::value, "llama_hparams must be trivially copyable"); - diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 0390a7ade..91923ce6e 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1369,9 +1369,10 @@ ggml_tensor * llama_kv_cache::build_rope_shift( float freq_scale) const { const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; - const auto & yarn_ext_factor = cparams.yarn_ext_factor; - const auto & yarn_beta_fast = cparams.yarn_beta_fast; - const auto & yarn_beta_slow = cparams.yarn_beta_slow; + const auto & yarn_ext_factor = cparams.yarn_ext_factor; + const auto & yarn_beta_fast = cparams.yarn_beta_fast; + const auto & yarn_beta_slow = cparams.yarn_beta_slow; + const auto & yarn_attn_factor = cparams.yarn_attn_factor; const auto & n_rot = hparams.n_rot; const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE @@ -1382,12 +1383,6 @@ ggml_tensor * llama_kv_cache::build_rope_shift( ? LLAMA_ROPE_TYPE_NEOX : hparams.rope_type; - // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly. - // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. - const float yarn_attn_factor = model.arch == LLM_ARCH_DEEPSEEK2 - ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) - : cparams.yarn_attn_factor; - ggml_tensor * tmp; if (ggml_is_quantized(cur->type)) { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f515ad70a..1b2b12c0c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -773,6 +773,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.n_swa = 8192; hparams.n_attn_temp_floor_scale = 8192; hparams.f_attn_temp_scale = 0.1f; + hparams.f_attn_temp_offset = 1.0f; hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full } @@ -1740,12 +1741,19 @@ void llama_model::load_hparams(llama_model_loader & ml) { // that have no expert_gating_func model parameter set hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX; } - ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false); + + if (ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f)) { + // [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + // cancel the factor from the convert script + hparams.rope_yarn_log_mul /= 0.1f; + } // (optional) temperature tuning - used by mistral-large ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false); ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_LENGTH, hparams.n_attn_temp_floor_scale, false); + hparams.f_attn_temp_offset = 0.0f; + switch (hparams.n_layer) { case 27: type = LLM_TYPE_16B; break; case 60: type = LLM_TYPE_236B; break; @@ -2372,9 +2380,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_TEMPERATURE_SCALE, hparams.f_attn_temp_scale, false); - ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false); - ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false); - ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, false); + ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false); + ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false); + ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul, 0.0f); + + hparams.f_attn_temp_offset = 0.0f; // TODO: maybe add n_attn_temp_floor_scale as a separate KV? if (hparams.f_attn_temp_scale != 0.0f) { @@ -2384,18 +2394,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { } } - // TODO: this seems to be correct with the case of mscale == mscale_all_dims == 1.0f - // but may need further verification with other values - if (hparams.rope_yarn_log_mul != 0.0f) { - float factor = 1.0f / hparams.rope_freq_scale_train; - float mscale = 1.0f; - float mscale_all_dims = hparams.rope_yarn_log_mul; - static auto get_mscale = [](float scale, float mscale) { - return scale <= 1.0f ? 1.0f : (0.1f * mscale * logf(scale) + 1.0f); - }; - hparams.yarn_attn_factor = get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dims); - } - switch (hparams.n_layer) { case 26: type = LLM_TYPE_3B; break; case 34: type = LLM_TYPE_8B; break; @@ -6965,6 +6963,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn); + LLAMA_LOG_INFO("%s: rope_yarn_log_mul= %.4f\n", __func__, hparams.rope_yarn_log_mul); LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); // MRoPE (Multi-axis Rotary Position Embedding) sections if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) { @@ -7028,7 +7027,6 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func)); - LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul); } if (arch == LLM_ARCH_QWEN2MOE) { diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index dbaa8297b..49382874b 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -1,7 +1,5 @@ #include "models.h" - - llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B @@ -20,9 +18,15 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly. // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. - const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale)); - const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k)); - const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)); + // And also: https://github.com/ggml-org/llama.cpp/pull/17945 [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] + + // first cancel the adjustment from llama_hparams::yarn_attn_factor_adjust to get the original attn_factor + GGML_ASSERT(ext_factor >= 0.0f); + const float attn_factor_org = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale)); + + // use the original attn_factor to pre-scale the kq_scale + const float mscale = attn_factor_org * (1.0f + 0.1f * hparams.rope_yarn_log_mul * logf(1.0f / freq_scale)); + const float kq_scale = 1.0f * mscale * mscale / sqrtf(float(n_embd_head_k)); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/tools/completion/completion.cpp b/tools/completion/completion.cpp index 87954dd76..b89fb54fb 100644 --- a/tools/completion/completion.cpp +++ b/tools/completion/completion.cpp @@ -142,13 +142,15 @@ int main(int argc, char ** argv) { // load the model and apply lora adapter, if any LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); - common_init_result llama_init = common_init_from_params(params); - model = llama_init.model.get(); - ctx = llama_init.context.get(); + auto llama_init = common_init_from_params(params); - if (model == NULL) { - LOG_ERR("%s: error: unable to load model\n", __func__); + ctx = llama_init->context(); + model = llama_init->model(); + smpl = llama_init->sampler(0); + + if (ctx == NULL) { + LOG_ERR("%s: error: unable to create context\n", __func__); return 1; } @@ -475,12 +477,6 @@ int main(int argc, char ** argv) { } } - smpl = common_sampler_init(model, sparams); - if (!smpl) { - LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); - return 1; - } - LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl)); LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str()); @@ -994,8 +990,6 @@ int main(int argc, char ** argv) { LOG("\n\n"); common_perf_print(ctx, smpl); - common_sampler_free(smpl); - llama_backend_free(); ggml_threadpool_free_fn(threadpool); diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h new file mode 100644 index 000000000..17f90e8aa --- /dev/null +++ b/tools/mtmd/clip-graph.h @@ -0,0 +1,119 @@ +#pragma once + +#include "ggml.h" +#include "ggml-cpp.h" +#include "clip.h" +#include "clip-impl.h" +#include "clip-model.h" + +#include +#include + +struct clip_graph { + const clip_model & model; + const clip_hparams & hparams; + projector_type proj_type; + + // we only support single image per batch + const clip_image_f32 & img; + + const int patch_size; + const int n_patches_x; + const int n_patches_y; + const int n_patches; + const int n_embd; + const int n_head; + const int d_head; + const int n_layer; + const int n_mmproj_embd; + const float eps; + const float kq_scale; + const clip_flash_attn_type flash_attn_type; + + // for debugging + const bool debug_graph; + std::vector & debug_print_tensors; + + ggml_context_ptr ctx0_ptr; + ggml_context * ctx0; + ggml_cgraph * gf; + + clip_graph(clip_ctx * ctx, const clip_image_f32 & img); + + virtual ~clip_graph() = default; + virtual ggml_cgraph * build() = 0; + + // + // utility functions + // + void cb(ggml_tensor * cur0, const char * name, int il) const; + + // siglip2 naflex + ggml_tensor * resize_position_embeddings(); + + // build vision transformer (ViT) cgraph + // this function should cover most of the models + // if your model has specific features, you should probably duplicate this function + ggml_tensor * build_vit( + ggml_tensor * inp, + int64_t n_pos, + norm_type norm_t, + ffn_op_type ffn_t, + ggml_tensor * learned_pos_embd, + std::function add_pos); + + // build the input after conv2d (inp_raw --> patches) + // returns tensor with shape [n_embd, n_patches] + ggml_tensor * build_inp(); + + ggml_tensor * build_inp_raw(int channels = 3); + + ggml_tensor * build_norm( + ggml_tensor * cur, + ggml_tensor * mw, + ggml_tensor * mb, + norm_type type, + float norm_eps, + int il) const; + + ggml_tensor * build_ffn( + ggml_tensor * cur, + ggml_tensor * up, + ggml_tensor * up_b, + ggml_tensor * gate, + ggml_tensor * gate_b, + ggml_tensor * down, + ggml_tensor * down_b, + ffn_op_type type_op, + int il) const; + + ggml_tensor * build_attn( + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale, + int il) const; + + // implementation of the 2D RoPE without adding a new op in ggml + // this is not efficient (use double the memory), but works on all backends + // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 + ggml_tensor * build_rope_2d( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * pos_a, // first half + ggml_tensor * pos_b, // second half + const float freq_base, + const bool interleave_freq + ); + + // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL) + // support dynamic resolution + ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor); + + // Generic function to stack frames for audio processing + // Abstracts out the StackAudioFrames logic used by ultravox + ggml_tensor * build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed); +}; diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index cd47865bf..9ef0d301e 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -1,3 +1,5 @@ +#pragma once + #include "ggml.h" #include "gguf.h" #include "clip.h" @@ -13,6 +15,8 @@ // Internal header for clip.cpp +#define MTMD_INTERNAL_HEADER + #define KEY_FTYPE "general.file_type" #define KEY_NAME "general.name" #define KEY_DESCRIPTION "general.description" @@ -132,6 +136,10 @@ // align x to upper multiple of n #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n)) +// forward declaration +// TODO: improve this later +struct clip_ctx; + enum projector_type { PROJECTOR_TYPE_MLP, PROJECTOR_TYPE_MLP_NORM, @@ -149,6 +157,7 @@ enum projector_type { PROJECTOR_TYPE_INTERNVL, PROJECTOR_TYPE_LLAMA4, PROJECTOR_TYPE_QWEN2A, + PROJECTOR_TYPE_GLMA, PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx PROJECTOR_TYPE_VOXTRAL, PROJECTOR_TYPE_LFM2, @@ -175,6 +184,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_INTERNVL, "internvl"}, { PROJECTOR_TYPE_LLAMA4, "llama4"}, { PROJECTOR_TYPE_QWEN2A, "qwen2a"}, + { PROJECTOR_TYPE_GLMA, "glma"}, { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"}, { PROJECTOR_TYPE_VOXTRAL, "voxtral"}, { PROJECTOR_TYPE_LFM2, "lfm2"}, diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h new file mode 100644 index 000000000..bdd4f63e0 --- /dev/null +++ b/tools/mtmd/clip-model.h @@ -0,0 +1,280 @@ +#pragma once + +#include "ggml.h" +#include "clip.h" +#include "clip-impl.h" + +#include +#include +#include +#include + +enum ffn_op_type { + FFN_GELU, + FFN_GELU_ERF, + FFN_SILU, + FFN_GELU_QUICK, +}; + +enum norm_type { + NORM_TYPE_NORMAL, + NORM_TYPE_RMS, +}; + +enum patch_merge_type { + PATCH_MERGE_FLAT, + PATCH_MERGE_SPATIAL_UNPAD, +}; + +struct clip_hparams { + int32_t image_size = 0; + int32_t patch_size = 0; + int32_t n_embd = 0; + int32_t n_ff = 0; + int32_t projection_dim = 0; + int32_t n_head = 0; + int32_t n_layer = 0; + // idefics3 + int32_t image_longest_edge = 0; + int32_t image_min_pixels = -1; + int32_t image_max_pixels = -1; + int32_t n_merge = 0; // number of patch merges **per-side** + + float image_mean[3]; + float image_std[3]; + + // for models using dynamic image size, we need to have a smaller image size to warmup + // otherwise, user will get OOM everytime they load the model + int32_t warmup_image_size = 0; + int32_t warmup_audio_size = 3000; + + ffn_op_type ffn_op = FFN_GELU; + + patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT; + + float eps = 1e-6; + float rope_theta = 0.0; + + std::vector image_res_candidates; // for llava-uhd style models + int32_t image_crop_resolution; + std::unordered_set vision_feature_layer; + int32_t attn_window_size = 0; + int32_t n_wa_pattern = 0; + + // audio + int32_t n_mel_bins = 0; // whisper preprocessor + int32_t proj_stack_factor = 0; // ultravox + + // legacy + bool has_llava_projector = false; + int minicpmv_version = 0; + int32_t minicpmv_query_num = 0; // MiniCPM-V query number + + // custom value provided by user, can be undefined if not set + int32_t custom_image_min_tokens = -1; + int32_t custom_image_max_tokens = -1; + + void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) { + const int cur_merge = n_merge == 0 ? 1 : n_merge; + const int patch_area = patch_size * patch_size * cur_merge * cur_merge; + image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area; + image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area; + warmup_image_size = static_cast(std::sqrt(image_max_pixels)); + } + + void set_warmup_n_tokens(int n_tokens) { + int n_tok_per_side = static_cast(std::sqrt(n_tokens)); + GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n"); + const int cur_merge = n_merge == 0 ? 1 : n_merge; + warmup_image_size = n_tok_per_side * patch_size * cur_merge; + // TODO: support warmup size for custom token numbers + } +}; + +struct clip_layer { + // attention + ggml_tensor * k_w = nullptr; + ggml_tensor * k_b = nullptr; + ggml_tensor * q_w = nullptr; + ggml_tensor * q_b = nullptr; + ggml_tensor * v_w = nullptr; + ggml_tensor * v_b = nullptr; + ggml_tensor * qkv_w = nullptr; + ggml_tensor * qkv_b = nullptr; + + ggml_tensor * o_w = nullptr; + ggml_tensor * o_b = nullptr; + + ggml_tensor * k_norm = nullptr; + ggml_tensor * q_norm = nullptr; + + // layernorm 1 + ggml_tensor * ln_1_w = nullptr; + ggml_tensor * ln_1_b = nullptr; + + ggml_tensor * ff_up_w = nullptr; + ggml_tensor * ff_up_b = nullptr; + ggml_tensor * ff_gate_w = nullptr; + ggml_tensor * ff_gate_b = nullptr; + ggml_tensor * ff_down_w = nullptr; + ggml_tensor * ff_down_b = nullptr; + + // layernorm 2 + ggml_tensor * ln_2_w = nullptr; + ggml_tensor * ln_2_b = nullptr; + + // layer scale (no bias) + ggml_tensor * ls_1_w = nullptr; + ggml_tensor * ls_2_w = nullptr; + + // qwen3vl deepstack merger + ggml_tensor * deepstack_norm_w = nullptr; + ggml_tensor * deepstack_norm_b = nullptr; + ggml_tensor * deepstack_fc1_w = nullptr; + ggml_tensor * deepstack_fc1_b = nullptr; + ggml_tensor * deepstack_fc2_w = nullptr; + ggml_tensor * deepstack_fc2_b = nullptr; + + bool has_deepstack() const { + return deepstack_fc1_w != nullptr; + } +}; + +struct clip_model { + clip_modality modality = CLIP_MODALITY_VISION; + projector_type proj_type = PROJECTOR_TYPE_MLP; + clip_hparams hparams; + + // embeddings + ggml_tensor * class_embedding = nullptr; + ggml_tensor * patch_embeddings_0 = nullptr; + ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) + ggml_tensor * patch_bias = nullptr; + ggml_tensor * position_embeddings = nullptr; + + ggml_tensor * pre_ln_w = nullptr; + ggml_tensor * pre_ln_b = nullptr; + + std::vector layers; + + int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer + + ggml_tensor * post_ln_w; + ggml_tensor * post_ln_b; + + ggml_tensor * projection; // TODO: rename it to fc (fully connected layer) + ggml_tensor * mm_fc_w; + ggml_tensor * mm_fc_b; + + // LLaVA projection + ggml_tensor * mm_input_norm_w = nullptr; + ggml_tensor * mm_input_norm_b = nullptr; + ggml_tensor * mm_0_w = nullptr; + ggml_tensor * mm_0_b = nullptr; + ggml_tensor * mm_2_w = nullptr; + ggml_tensor * mm_2_b = nullptr; + + ggml_tensor * image_newline = nullptr; + + // Yi type models with mlp+normalization projection + ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4 + ggml_tensor * mm_1_b = nullptr; + ggml_tensor * mm_3_w = nullptr; + ggml_tensor * mm_3_b = nullptr; + ggml_tensor * mm_4_w = nullptr; + ggml_tensor * mm_4_b = nullptr; + + // GLMV-Edge projection + ggml_tensor * mm_model_adapter_conv_w = nullptr; + ggml_tensor * mm_model_adapter_conv_b = nullptr; + + // MobileVLM projection + ggml_tensor * mm_model_mlp_1_w = nullptr; + ggml_tensor * mm_model_mlp_1_b = nullptr; + ggml_tensor * mm_model_mlp_3_w = nullptr; + ggml_tensor * mm_model_mlp_3_b = nullptr; + ggml_tensor * mm_model_block_1_block_0_0_w = nullptr; + ggml_tensor * mm_model_block_1_block_0_1_w = nullptr; + ggml_tensor * mm_model_block_1_block_0_1_b = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr; + ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr; + ggml_tensor * mm_model_block_1_block_2_0_w = nullptr; + ggml_tensor * mm_model_block_1_block_2_1_w = nullptr; + ggml_tensor * mm_model_block_1_block_2_1_b = nullptr; + ggml_tensor * mm_model_block_2_block_0_0_w = nullptr; + ggml_tensor * mm_model_block_2_block_0_1_w = nullptr; + ggml_tensor * mm_model_block_2_block_0_1_b = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr; + ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr; + ggml_tensor * mm_model_block_2_block_2_0_w = nullptr; + ggml_tensor * mm_model_block_2_block_2_1_w = nullptr; + ggml_tensor * mm_model_block_2_block_2_1_b = nullptr; + + // MobileVLM_V2 projection + ggml_tensor * mm_model_mlp_0_w = nullptr; + ggml_tensor * mm_model_mlp_0_b = nullptr; + ggml_tensor * mm_model_mlp_2_w = nullptr; + ggml_tensor * mm_model_mlp_2_b = nullptr; + ggml_tensor * mm_model_peg_0_w = nullptr; + ggml_tensor * mm_model_peg_0_b = nullptr; + + // MINICPMV projection + ggml_tensor * mm_model_pos_embed_k = nullptr; + ggml_tensor * mm_model_query = nullptr; + ggml_tensor * mm_model_proj = nullptr; + ggml_tensor * mm_model_kv_proj = nullptr; + ggml_tensor * mm_model_attn_q_w = nullptr; + ggml_tensor * mm_model_attn_q_b = nullptr; + ggml_tensor * mm_model_attn_k_w = nullptr; + ggml_tensor * mm_model_attn_k_b = nullptr; + ggml_tensor * mm_model_attn_v_w = nullptr; + ggml_tensor * mm_model_attn_v_b = nullptr; + ggml_tensor * mm_model_attn_o_w = nullptr; + ggml_tensor * mm_model_attn_o_b = nullptr; + ggml_tensor * mm_model_ln_q_w = nullptr; + ggml_tensor * mm_model_ln_q_b = nullptr; + ggml_tensor * mm_model_ln_kv_w = nullptr; + ggml_tensor * mm_model_ln_kv_b = nullptr; + ggml_tensor * mm_model_ln_post_w = nullptr; + ggml_tensor * mm_model_ln_post_b = nullptr; + + // gemma3 + ggml_tensor * mm_input_proj_w = nullptr; + ggml_tensor * mm_soft_emb_norm_w = nullptr; + + // pixtral + ggml_tensor * token_embd_img_break = nullptr; + ggml_tensor * mm_patch_merger_w = nullptr; + + // ultravox / whisper encoder + ggml_tensor * conv1d_1_w = nullptr; + ggml_tensor * conv1d_1_b = nullptr; + ggml_tensor * conv1d_2_w = nullptr; + ggml_tensor * conv1d_2_b = nullptr; + ggml_tensor * mm_norm_pre_w = nullptr; + ggml_tensor * mm_norm_pre_b = nullptr; + ggml_tensor * mm_norm_mid_w = nullptr; + + // cogvlm + ggml_tensor * mm_post_fc_norm_w = nullptr; + ggml_tensor * mm_post_fc_norm_b = nullptr; + ggml_tensor * mm_h_to_4h_w = nullptr; + ggml_tensor * mm_gate_w = nullptr; + ggml_tensor * mm_4h_to_h_w = nullptr; + ggml_tensor * mm_boi = nullptr; + ggml_tensor * mm_eoi = nullptr; + + bool audio_has_avgpool() const { + return proj_type == PROJECTOR_TYPE_QWEN2A + || proj_type == PROJECTOR_TYPE_VOXTRAL; + } + + bool audio_has_stack_frames() const { + return proj_type == PROJECTOR_TYPE_ULTRAVOX + || proj_type == PROJECTOR_TYPE_VOXTRAL; + } +}; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 03ebcfa12..3ae02ec63 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1,9 +1,9 @@ -// NOTE: This is modified from clip.cpp only for LLaVA, -// so there might be still unnecessary artifacts hanging around -// I'll gradually clean and extend it -// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch #include "clip.h" #include "clip-impl.h" +#include "clip-model.h" +#include "clip-graph.h" +#include "models/models.h" + #include "ggml.h" #include "ggml-cpp.h" #include "ggml-cpu.h" @@ -46,20 +46,23 @@ #include #include +#include "clip-model.h" +#include "clip-graph.h" +#include "models/models.h" +#include "models/cogvlm.cpp" +#include "models/internvl.cpp" +#include "models/kimivl.cpp" +#include "models/llama4.cpp" +#include "models/llava.cpp" +#include "models/minicpmv.cpp" +#include "models/pixtral.cpp" +#include "models/qwen2vl.cpp" +#include "models/qwen3vl.cpp" +#include "models/siglip.cpp" +#include "models/whisper-enc.cpp" + struct clip_logger_state g_logger_state = {clip_log_callback_default, NULL}; -enum ffn_op_type { - FFN_GELU, - FFN_GELU_ERF, - FFN_SILU, - FFN_GELU_QUICK, -}; - -enum norm_type { - NORM_TYPE_NORMAL, - NORM_TYPE_RMS, -}; - //#define CLIP_DEBUG_FUNCTIONS #ifdef CLIP_DEBUG_FUNCTIONS @@ -171,267 +174,6 @@ static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u #endif -// -// clip layers -// - -enum patch_merge_type { - PATCH_MERGE_FLAT, - PATCH_MERGE_SPATIAL_UNPAD, -}; - -struct clip_hparams { - int32_t image_size = 0; - int32_t patch_size = 0; - int32_t n_embd = 0; - int32_t n_ff = 0; - int32_t projection_dim = 0; - int32_t n_head = 0; - int32_t n_layer = 0; - // idefics3 - int32_t image_longest_edge = 0; - int32_t image_min_pixels = -1; - int32_t image_max_pixels = -1; - int32_t n_merge = 0; // number of patch merges **per-side** - - float image_mean[3]; - float image_std[3]; - - // for models using dynamic image size, we need to have a smaller image size to warmup - // otherwise, user will get OOM everytime they load the model - int32_t warmup_image_size = 0; - int32_t warmup_audio_size = 3000; - - ffn_op_type ffn_op = FFN_GELU; - - patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT; - - float eps = 1e-6; - float rope_theta = 0.0; - - std::vector image_res_candidates; // for llava-uhd style models - int32_t image_crop_resolution; - std::unordered_set vision_feature_layer; - int32_t attn_window_size = 0; - int32_t n_wa_pattern = 0; - - // audio - int32_t n_mel_bins = 0; // whisper preprocessor - int32_t proj_stack_factor = 0; // ultravox - - // legacy - bool has_llava_projector = false; - int minicpmv_version = 0; - int32_t minicpmv_query_num = 0; // MiniCPM-V query number - - // custom value provided by user, can be undefined if not set - int32_t custom_image_min_tokens = -1; - int32_t custom_image_max_tokens = -1; - - void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) { - const int cur_merge = n_merge == 0 ? 1 : n_merge; - const int patch_area = patch_size * patch_size * cur_merge * cur_merge; - image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area; - image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area; - warmup_image_size = static_cast(std::sqrt(image_max_pixels)); - } - - void set_warmup_n_tokens(int n_tokens) { - int n_tok_per_side = static_cast(std::sqrt(n_tokens)); - GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n"); - const int cur_merge = n_merge == 0 ? 1 : n_merge; - warmup_image_size = n_tok_per_side * patch_size * cur_merge; - // TODO: support warmup size for custom token numbers - } -}; - -struct clip_layer { - // attention - ggml_tensor * k_w = nullptr; - ggml_tensor * k_b = nullptr; - ggml_tensor * q_w = nullptr; - ggml_tensor * q_b = nullptr; - ggml_tensor * v_w = nullptr; - ggml_tensor * v_b = nullptr; - ggml_tensor * qkv_w = nullptr; - ggml_tensor * qkv_b = nullptr; - - ggml_tensor * o_w = nullptr; - ggml_tensor * o_b = nullptr; - - ggml_tensor * k_norm = nullptr; - ggml_tensor * q_norm = nullptr; - - // layernorm 1 - ggml_tensor * ln_1_w = nullptr; - ggml_tensor * ln_1_b = nullptr; - - ggml_tensor * ff_up_w = nullptr; - ggml_tensor * ff_up_b = nullptr; - ggml_tensor * ff_gate_w = nullptr; - ggml_tensor * ff_gate_b = nullptr; - ggml_tensor * ff_down_w = nullptr; - ggml_tensor * ff_down_b = nullptr; - - // layernorm 2 - ggml_tensor * ln_2_w = nullptr; - ggml_tensor * ln_2_b = nullptr; - - // layer scale (no bias) - ggml_tensor * ls_1_w = nullptr; - ggml_tensor * ls_2_w = nullptr; - - // qwen3vl deepstack merger - ggml_tensor * deepstack_norm_w = nullptr; - ggml_tensor * deepstack_norm_b = nullptr; - ggml_tensor * deepstack_fc1_w = nullptr; - ggml_tensor * deepstack_fc1_b = nullptr; - ggml_tensor * deepstack_fc2_w = nullptr; - ggml_tensor * deepstack_fc2_b = nullptr; - - bool has_deepstack() const { - return deepstack_fc1_w != nullptr; - } -}; - -struct clip_model { - clip_modality modality = CLIP_MODALITY_VISION; - projector_type proj_type = PROJECTOR_TYPE_MLP; - clip_hparams hparams; - - // embeddings - ggml_tensor * class_embedding = nullptr; - ggml_tensor * patch_embeddings_0 = nullptr; - ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) - ggml_tensor * patch_bias = nullptr; - ggml_tensor * position_embeddings = nullptr; - - ggml_tensor * pre_ln_w = nullptr; - ggml_tensor * pre_ln_b = nullptr; - - std::vector layers; - - int32_t n_deepstack_layers = 0; // used by Qwen3-VL, calculated from clip_layer - - ggml_tensor * post_ln_w; - ggml_tensor * post_ln_b; - - ggml_tensor * projection; // TODO: rename it to fc (fully connected layer) - ggml_tensor * mm_fc_w; - ggml_tensor * mm_fc_b; - - // LLaVA projection - ggml_tensor * mm_input_norm_w = nullptr; - ggml_tensor * mm_input_norm_b = nullptr; - ggml_tensor * mm_0_w = nullptr; - ggml_tensor * mm_0_b = nullptr; - ggml_tensor * mm_2_w = nullptr; - ggml_tensor * mm_2_b = nullptr; - - ggml_tensor * image_newline = nullptr; - - // Yi type models with mlp+normalization projection - ggml_tensor * mm_1_w = nullptr; // Yi type models have 0, 1, 3, 4 - ggml_tensor * mm_1_b = nullptr; - ggml_tensor * mm_3_w = nullptr; - ggml_tensor * mm_3_b = nullptr; - ggml_tensor * mm_4_w = nullptr; - ggml_tensor * mm_4_b = nullptr; - - // GLMV-Edge projection - ggml_tensor * mm_model_adapter_conv_w = nullptr; - ggml_tensor * mm_model_adapter_conv_b = nullptr; - - // MobileVLM projection - ggml_tensor * mm_model_mlp_1_w = nullptr; - ggml_tensor * mm_model_mlp_1_b = nullptr; - ggml_tensor * mm_model_mlp_3_w = nullptr; - ggml_tensor * mm_model_mlp_3_b = nullptr; - ggml_tensor * mm_model_block_1_block_0_0_w = nullptr; - ggml_tensor * mm_model_block_1_block_0_1_w = nullptr; - ggml_tensor * mm_model_block_1_block_0_1_b = nullptr; - ggml_tensor * mm_model_block_1_block_1_fc1_w = nullptr; - ggml_tensor * mm_model_block_1_block_1_fc1_b = nullptr; - ggml_tensor * mm_model_block_1_block_1_fc2_w = nullptr; - ggml_tensor * mm_model_block_1_block_1_fc2_b = nullptr; - ggml_tensor * mm_model_block_1_block_2_0_w = nullptr; - ggml_tensor * mm_model_block_1_block_2_1_w = nullptr; - ggml_tensor * mm_model_block_1_block_2_1_b = nullptr; - ggml_tensor * mm_model_block_2_block_0_0_w = nullptr; - ggml_tensor * mm_model_block_2_block_0_1_w = nullptr; - ggml_tensor * mm_model_block_2_block_0_1_b = nullptr; - ggml_tensor * mm_model_block_2_block_1_fc1_w = nullptr; - ggml_tensor * mm_model_block_2_block_1_fc1_b = nullptr; - ggml_tensor * mm_model_block_2_block_1_fc2_w = nullptr; - ggml_tensor * mm_model_block_2_block_1_fc2_b = nullptr; - ggml_tensor * mm_model_block_2_block_2_0_w = nullptr; - ggml_tensor * mm_model_block_2_block_2_1_w = nullptr; - ggml_tensor * mm_model_block_2_block_2_1_b = nullptr; - - // MobileVLM_V2 projection - ggml_tensor * mm_model_mlp_0_w = nullptr; - ggml_tensor * mm_model_mlp_0_b = nullptr; - ggml_tensor * mm_model_mlp_2_w = nullptr; - ggml_tensor * mm_model_mlp_2_b = nullptr; - ggml_tensor * mm_model_peg_0_w = nullptr; - ggml_tensor * mm_model_peg_0_b = nullptr; - - // MINICPMV projection - ggml_tensor * mm_model_pos_embed_k = nullptr; - ggml_tensor * mm_model_query = nullptr; - ggml_tensor * mm_model_proj = nullptr; - ggml_tensor * mm_model_kv_proj = nullptr; - ggml_tensor * mm_model_attn_q_w = nullptr; - ggml_tensor * mm_model_attn_q_b = nullptr; - ggml_tensor * mm_model_attn_k_w = nullptr; - ggml_tensor * mm_model_attn_k_b = nullptr; - ggml_tensor * mm_model_attn_v_w = nullptr; - ggml_tensor * mm_model_attn_v_b = nullptr; - ggml_tensor * mm_model_attn_o_w = nullptr; - ggml_tensor * mm_model_attn_o_b = nullptr; - ggml_tensor * mm_model_ln_q_w = nullptr; - ggml_tensor * mm_model_ln_q_b = nullptr; - ggml_tensor * mm_model_ln_kv_w = nullptr; - ggml_tensor * mm_model_ln_kv_b = nullptr; - ggml_tensor * mm_model_ln_post_w = nullptr; - ggml_tensor * mm_model_ln_post_b = nullptr; - - // gemma3 - ggml_tensor * mm_input_proj_w = nullptr; - ggml_tensor * mm_soft_emb_norm_w = nullptr; - - // pixtral - ggml_tensor * token_embd_img_break = nullptr; - ggml_tensor * mm_patch_merger_w = nullptr; - - // ultravox / whisper encoder - ggml_tensor * conv1d_1_w = nullptr; - ggml_tensor * conv1d_1_b = nullptr; - ggml_tensor * conv1d_2_w = nullptr; - ggml_tensor * conv1d_2_b = nullptr; - ggml_tensor * mm_norm_pre_w = nullptr; - ggml_tensor * mm_norm_mid_w = nullptr; - - // cogvlm - ggml_tensor * mm_post_fc_norm_w = nullptr; - ggml_tensor * mm_post_fc_norm_b = nullptr; - ggml_tensor * mm_h_to_4h_w = nullptr; - ggml_tensor * mm_gate_w = nullptr; - ggml_tensor * mm_4h_to_h_w = nullptr; - ggml_tensor * mm_boi = nullptr; - ggml_tensor * mm_eoi = nullptr; - - bool audio_has_avgpool() const { - return proj_type == PROJECTOR_TYPE_QWEN2A - || proj_type == PROJECTOR_TYPE_VOXTRAL; - } - - bool audio_has_stack_frames() const { - return proj_type == PROJECTOR_TYPE_ULTRAVOX - || proj_type == PROJECTOR_TYPE_VOXTRAL; - } -}; - bool enable_gpu_clip = true; void set_clip_uses_gpu(bool usegpu) { @@ -519,2081 +261,640 @@ struct clip_ctx { } }; -struct clip_graph { - clip_ctx * ctx; - const clip_model & model; - const clip_hparams & hparams; +// +// clip_graph +// - // we only support single image per batch - const clip_image_f32 & img; +clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) : + model(ctx->model), + hparams(model.hparams), + proj_type(ctx->proj_type()), + img(img), + patch_size(hparams.patch_size), + n_patches_x(img.nx / patch_size), + n_patches_y(img.ny / patch_size), + n_patches(n_patches_x * n_patches_y), + n_embd(hparams.n_embd), + n_head(hparams.n_head), + d_head(n_embd / n_head), + n_layer(hparams.n_layer), + n_mmproj_embd(clip_n_mmproj_embd(ctx)), + eps(hparams.eps), + kq_scale(1.0f / sqrtf((float)d_head)), + flash_attn_type(ctx->flash_attn_type), + debug_graph(ctx->debug_graph), + debug_print_tensors(ctx->debug_print_tensors) { + struct ggml_init_params params = { + /*.mem_size =*/ ctx->buf_compute_meta.size(), + /*.mem_buffer =*/ ctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + ctx0_ptr.reset(ggml_init(params)); + ctx0 = ctx0_ptr.get(); + gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false); +} - const int patch_size; - const int n_patches_x; - const int n_patches_y; - const int n_patches; - const int n_embd; - const int n_head; - const int d_head; - const int n_layer; - const float eps; - const float kq_scale; - - ggml_context_ptr ctx0_ptr; - ggml_context * ctx0; - ggml_cgraph * gf; - - clip_graph(clip_ctx * ctx, const clip_image_f32 & img) : - ctx(ctx), - model(ctx->model), - hparams(model.hparams), - img(img), - patch_size(hparams.patch_size), - n_patches_x(img.nx / patch_size), - n_patches_y(img.ny / patch_size), - n_patches(n_patches_x * n_patches_y), - n_embd(hparams.n_embd), - n_head(hparams.n_head), - d_head(n_embd / n_head), - n_layer(hparams.n_layer), - eps(hparams.eps), - kq_scale(1.0f / sqrtf((float)d_head)) { - struct ggml_init_params params = { - /*.mem_size =*/ ctx->buf_compute_meta.size(), - /*.mem_buffer =*/ ctx->buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - ctx0_ptr.reset(ggml_init(params)); - ctx0 = ctx0_ptr.get(); - gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false); - } - - ggml_cgraph * build_siglip() { - ggml_tensor * inp = build_inp(); - - ggml_tensor * learned_pos_embd = model.position_embeddings; - if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) { - learned_pos_embd = resize_position_embeddings(); - } - - ggml_tensor * cur = build_vit( - inp, n_patches, - NORM_TYPE_NORMAL, - hparams.ffn_op, - learned_pos_embd, - nullptr); - - if (ctx->proj_type() == PROJECTOR_TYPE_GEMMA3) { - const int batch_size = 1; - GGML_ASSERT(n_patches_x == n_patches_y); - const int patches_per_image = n_patches_x; - const int kernel_size = hparams.n_merge; - - cur = ggml_transpose(ctx0, cur); - cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size); - - // doing a pool2d to reduce the number of output tokens - cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); - cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size); - cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - - // apply norm before projection - cur = ggml_rms_norm(ctx0, cur, eps); - cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w); - - // apply projection - cur = ggml_mul_mat(ctx0, - ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), - cur); - - } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) { - // pixel_shuffle - // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 - const int scale_factor = model.hparams.n_merge; - cur = build_patch_merge_permute(cur, scale_factor); - cur = ggml_mul_mat(ctx0, model.projection, cur); - - } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2) { - // pixel unshuffle block - const int scale_factor = model.hparams.n_merge; - cur = build_patch_merge_permute(cur, scale_factor); - - // projection - cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm - cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); - cur = ggml_add(ctx0, cur, model.mm_input_norm_b); - - cur = build_ffn(cur, - model.mm_1_w, model.mm_1_b, - nullptr, nullptr, - model.mm_2_w, model.mm_2_b, - FFN_GELU, - -1); - - } else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) { - cur = build_ffn(cur, - model.mm_0_w, model.mm_0_b, - nullptr, nullptr, - model.mm_1_w, model.mm_1_b, - hparams.ffn_op, - -1); - - } else { - GGML_ABORT("SigLIP: Unsupported projector type"); - } - - // build the graph +void clip_graph::cb(ggml_tensor * cur0, const char * name, int il) const { + if (debug_graph) { + ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0)); + std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name; + ggml_set_name(cur, cur_name.c_str()); + ggml_set_output(cur); ggml_build_forward_expand(gf, cur); + debug_print_tensors.push_back(cur); + } +} - return gf; +// siglip2 naflex +ggml_tensor * clip_graph::resize_position_embeddings() { + ggml_tensor * pos_embd = model.position_embeddings; + const int height = img.ny / patch_size; + const int width = img.nx / patch_size; + const uint32_t mode = GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS; + const int n_per_side = (int)std::sqrt(pos_embd->ne[1]); + + GGML_ASSERT(pos_embd); + + if (height == n_per_side && width == n_per_side) { + return pos_embd; } - ggml_cgraph * build_pixtral() { - const int n_merge = hparams.n_merge; + pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_per_side, n_per_side); // -> (n_embd, n_per_side, n_per_side) + pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3); // -> (n_per_side, n_per_side, n_embd) + pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, mode); // -> (width, height, n_embd) + pos_embd = ggml_permute(ctx0, pos_embd, 1, 2, 0, 3); // -> (n_embd, width, height) + pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height); // -> (n_embd, width * height) - // 2D input positions - ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); - ggml_set_name(pos_h, "pos_h"); - ggml_set_input(pos_h); + return pos_embd; +} - ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); - ggml_set_name(pos_w, "pos_w"); - ggml_set_input(pos_w); - - auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { - return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true); - }; - - ggml_tensor * inp = build_inp(); - ggml_tensor * cur = build_vit( - inp, n_patches, - NORM_TYPE_RMS, - hparams.ffn_op, - nullptr, // no learned pos embd - add_pos); - - // mistral small 3.1 patch merger - // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67 - if (model.mm_patch_merger_w) { - GGML_ASSERT(hparams.n_merge > 0); - - cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w); - - // reshape image tokens to 2D grid - cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y); - cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd] - cur = ggml_cont(ctx0, cur); - - // torch.nn.functional.unfold is just an im2col under the hood - // we just need a dummy kernel to make it work - ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0); - cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type); - - // project to n_embd - cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); - cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur); - } - - // LlavaMultiModalProjector (always using GELU activation) - { - cur = build_ffn(cur, - model.mm_1_w, model.mm_1_b, - nullptr, nullptr, - model.mm_2_w, model.mm_2_b, - FFN_GELU, - -1); - } - - // arrangement of the [IMG_BREAK] token - if (model.token_embd_img_break) { - // not efficient, but works - // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows] - // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension - // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows] - - const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y; - const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x; - const int p_total = p_x * p_y; - const int n_embd_text = cur->ne[0]; - const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row - - ggml_tensor * tmp = ggml_reshape_3d(ctx0, cur, n_embd_text, p_x, p_y); - ggml_tensor * tok = ggml_new_tensor_3d(ctx0, tmp->type, n_embd_text, 1, p_y); - tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor - tok = ggml_add(ctx0, tok, model.token_embd_img_break); - tmp = ggml_concat(ctx0, tmp, tok, 1); - cur = ggml_view_2d(ctx0, tmp, - n_embd_text, n_tokens_output, - ggml_row_size(tmp->type, n_embd_text), 0); - } - - // build the graph - ggml_build_forward_expand(gf, cur); - - return gf; - } - - // Qwen2VL and Qwen2.5VL use M-RoPE - ggml_cgraph * build_qwen2vl() { - GGML_ASSERT(model.patch_bias == nullptr); - GGML_ASSERT(model.class_embedding == nullptr); - - const int batch_size = 1; - const bool use_window_attn = hparams.n_wa_pattern > 0; - const int n_wa_pattern = hparams.n_wa_pattern; - const int n_pos = n_patches; - const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position - - norm_type norm_t = ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL - ? NORM_TYPE_RMS // qwen 2.5 vl - : NORM_TYPE_NORMAL; // qwen 2 vl - - int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; - - ggml_tensor * inp_raw = build_inp_raw(); - ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); - - // second conv dimension - { - auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_add(ctx0, inp, inp_1); - - inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] - inp = ggml_cont_4d( - ctx0, inp, - n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); - inp = ggml_reshape_4d( - ctx0, inp, - n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); - inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); - inp = ggml_cont_3d( - ctx0, inp, - n_embd, n_patches_x * n_patches_y, batch_size); - } - - ggml_tensor * inpL = inp; - ggml_tensor * window_mask = nullptr; - ggml_tensor * window_idx = nullptr; - ggml_tensor * inv_window_idx = nullptr; - - ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); - - // pre-layernorm - if (model.pre_ln_w) { - inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); - } - - if (use_window_attn) { - // handle window attention inputs - inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); - ggml_set_name(inv_window_idx, "inv_window_idx"); - ggml_set_input(inv_window_idx); - // mask for window attention - window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos); - ggml_set_name(window_mask, "window_mask"); - ggml_set_input(window_mask); - - // if flash attn is used, we need to pad the mask and cast to f16 - if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { - window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16); - } - - // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size] - GGML_ASSERT(batch_size == 1); - inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4); - inpL = ggml_get_rows(ctx0, inpL, inv_window_idx); - inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size); - } - - // loop over layers - for (int il = 0; il < n_layer; il++) { - const auto & layer = model.layers[il]; - const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true; - - ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states - - // layernorm1 - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); - cb(cur, "ln1", il); - - // self-attention - { - ggml_tensor * Qcur = ggml_add(ctx0, - ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b); - ggml_tensor * Kcur = ggml_add(ctx0, - ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b); - ggml_tensor * Vcur = ggml_add(ctx0, - ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b); - - Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches); - Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches); - Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - // apply M-RoPE - Qcur = ggml_rope_multi( - ctx0, Qcur, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - Kcur = ggml_rope_multi( - ctx0, Kcur, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - - cb(Qcur, "Qcur_rope", il); - cb(Kcur, "Kcur_rope", il); - - ggml_tensor * attn_mask = full_attn ? nullptr : window_mask; - - cur = build_attn(layer.o_w, layer.o_b, - Qcur, Kcur, Vcur, attn_mask, kq_scale, il); - cb(cur, "attn_out", il); - } - - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, inpL); - - inpL = cur; // inpL = residual, cur = hidden_states - - cb(cur, "ffn_inp", il); - - // layernorm2 - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); - cb(cur, "ffn_inp_normed", il); - - // ffn - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - hparams.ffn_op, il); - - cb(cur, "ffn_out", il); - - // residual 2 - cur = ggml_add(ctx0, inpL, cur); - cb(cur, "layer_out", il); - - inpL = cur; - } - - // post-layernorm - if (model.post_ln_w) { - inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); - } - - // multimodal projection - ggml_tensor * embeddings = inpL; - embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); - embeddings = build_ffn(embeddings, - model.mm_0_w, model.mm_0_b, - nullptr, nullptr, - model.mm_1_w, model.mm_1_b, - FFN_GELU, - -1); - - if (use_window_attn) { - window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); - ggml_set_name(window_idx, "window_idx"); - ggml_set_input(window_idx); - - // embeddings shape: [n_embd, n_patches_x * n_patches_y, batch_size] - GGML_ASSERT(batch_size == 1); - embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4); - embeddings = ggml_get_rows(ctx0, embeddings, window_idx); - embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size); - } - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - return gf; - } - - // Qwen3VL - ggml_cgraph * build_qwen3vl() { - GGML_ASSERT(model.patch_bias != nullptr); - GGML_ASSERT(model.position_embeddings != nullptr); - GGML_ASSERT(model.class_embedding == nullptr); - - const int batch_size = 1; - const int n_pos = n_patches; - const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position - - norm_type norm_t = NORM_TYPE_NORMAL; - - int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; - - ggml_tensor * inp_raw = build_inp_raw(); - ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - - GGML_ASSERT(img.nx % (patch_size * 2) == 0); - GGML_ASSERT(img.ny % (patch_size * 2) == 0); - - // second conv dimension - { - auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_add(ctx0, inp, inp_1); - - inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] - inp = ggml_cont_4d( - ctx0, inp, - n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); - inp = ggml_reshape_4d( - ctx0, inp, - n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); - inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); - inp = ggml_cont_3d( - ctx0, inp, - n_embd, n_patches_x * n_patches_y, batch_size); - } - - // add patch bias - if (model.patch_bias != nullptr) { - inp = ggml_add(ctx0, inp, model.patch_bias); - cb(inp, "patch_bias", -1); - } - - // calculate absolute position embedding and apply - ggml_tensor * learned_pos_embd = resize_position_embeddings(); - learned_pos_embd = ggml_cont_4d( - ctx0, learned_pos_embd, - n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); - learned_pos_embd = ggml_reshape_4d( - ctx0, learned_pos_embd, - n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); - learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3); - learned_pos_embd = ggml_cont_3d( - ctx0, learned_pos_embd, - n_embd, n_patches_x * n_patches_y, batch_size); +// build vision transformer (ViT) cgraph +// this function should cover most of the models +// if your model has specific features, you should probably duplicate this function +ggml_tensor * clip_graph::build_vit( + ggml_tensor * inp, + int64_t n_pos, + norm_type norm_t, + ffn_op_type ffn_t, + ggml_tensor * learned_pos_embd, + std::function add_pos + ) { + if (learned_pos_embd) { inp = ggml_add(ctx0, inp, learned_pos_embd); - cb(inp, "inp_pos_emb", -1); + cb(inp, "pos_embed", -1); + } - ggml_tensor * inpL = inp; + ggml_tensor * inpL = inp; - ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + cb(inpL, "pre_ln", -1); + } - // pre-layernorm - if (model.pre_ln_w) { - inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); - } + // loop over layers + for (int il = 0; il < n_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states - // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size] - ggml_tensor * deepstack_features = nullptr; - const int merge_factor = hparams.n_merge > 0 ? hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "layer_inp_normed", il); - // loop over layers - for (int il = 0; il < n_layer; il++) { - auto & layer = model.layers[il]; - - ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states - - // layernorm1 - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); - cb(cur, "ln1", il); - - // self-attention - { + // self-attention + { + ggml_tensor * Qcur = nullptr; + ggml_tensor * Kcur = nullptr; + ggml_tensor * Vcur = nullptr; + if (layer.qkv_w != nullptr) { + // fused qkv cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); - cur = ggml_add(ctx0, cur, layer.qkv_b); - - ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ 0); - - ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ ggml_row_size(cur->type, n_embd)); - - ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ ggml_row_size(cur->type, 2 * n_embd)); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - // apply M-RoPE - Qcur = ggml_rope_multi( - ctx0, Qcur, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - Kcur = ggml_rope_multi( - ctx0, Kcur, positions, nullptr, - d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); - - cb(Qcur, "Qcur_rope", il); - cb(Kcur, "Kcur_rope", il); - - cur = build_attn(layer.o_w, layer.o_b, - Qcur, Kcur, Vcur, nullptr, kq_scale, il); - cb(cur, "attn_out", il); - } - - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, inpL); - - inpL = cur; // inpL = residual, cur = hidden_states - - cb(cur, "ffn_inp", il); - - // layernorm2 - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); - cb(cur, "ffn_inp_normed", il); - - // ffn - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - hparams.ffn_op, il); - - cb(cur, "ffn_out", il); - - // residual 2 - cur = ggml_add(ctx0, inpL, cur); - cb(cur, "layer_out", il); - - if (layer.has_deepstack()) { - ggml_tensor * feat = ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size); - feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, norm_t, eps, il); - feat = build_ffn(feat, - layer.deepstack_fc1_w, layer.deepstack_fc1_b, - nullptr, nullptr, - layer.deepstack_fc2_w, layer.deepstack_fc2_b, - ffn_op_type::FFN_GELU, il); - - if(!deepstack_features) { - deepstack_features = feat; - } else { - // concat along the feature dimension - deepstack_features = ggml_concat(ctx0, deepstack_features, feat, 0); + if (layer.qkv_b != nullptr) { + cur = ggml_add(ctx0, cur, layer.qkv_b); } - } - inpL = cur; - } + Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ 0); - // post-layernorm - if (model.post_ln_w) { - inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); - } + Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, n_embd)); - // multimodal projection - ggml_tensor * embeddings = inpL; - embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, 2 * n_embd)); - embeddings = build_ffn(embeddings, - model.mm_0_w, model.mm_0_b, - nullptr, nullptr, - model.mm_1_w, model.mm_1_b, - ffn_op_type::FFN_GELU, -1); + // TODO: q/k norm requires row size == n_embd, while here it's d_head + // we can add support in the future if needed + GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr); - embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0); // concat along the feature dimension - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - return gf; - } - - ggml_cgraph * build_minicpmv() { - GGML_ASSERT(model.class_embedding == nullptr); - const int n_pos = n_patches; - const int n_embd_proj = clip_n_mmproj_embd(ctx); - - // position embeddings for the projector (not for ViT) - // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70 - // base frequency omega - ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4); - ggml_set_name(omega, "omega"); - ggml_set_input(omega); - - // 2D input positions (using float for sinusoidal embeddings) - ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos); - ggml_set_name(pos_h, "pos_h"); - ggml_set_input(pos_h); - ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos); - ggml_set_name(pos_w, "pos_w"); - ggml_set_input(pos_w); - - // for selecting learned pos embd, used by ViT - struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); - - ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions); - - ggml_tensor * inp = build_inp(); - ggml_tensor * embeddings = build_vit( - inp, n_pos, - NORM_TYPE_NORMAL, - hparams.ffn_op, - learned_pos_embd, - nullptr); - - // resampler projector (it is just another transformer) - - ggml_tensor * q = model.mm_model_query; - ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); - - // norm - q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1); - v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1); - - // calculate sinusoidal pos embd - ggml_tensor * pos_embed = nullptr; - { - // outer product - ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows - ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w); - ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h); - // sin and cos - ggml_tensor * pos_embd_x = ggml_concat( - ctx0, - ggml_sin(ctx0, theta_x), - ggml_cos(ctx0, theta_x), - 0 // concat on first dim - ); - ggml_tensor * pos_embd_y = ggml_concat( - ctx0, - ggml_sin(ctx0, theta_y), - ggml_cos(ctx0, theta_y), - 0 // concat on first dim - ); - pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0); - } - - // k = v + pos_embed - ggml_tensor * k = ggml_add(ctx0, v, pos_embed); - - // attention - { - const int d_head = 128; - int n_head = n_embd_proj/d_head; - // Use actual config value if available, otherwise fall back to hardcoded values - int num_query = ctx->model.hparams.minicpmv_query_num; - ggml_tensor * Q = ggml_add(ctx0, - ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), - model.mm_model_attn_q_b); - ggml_tensor * K = ggml_add(ctx0, - ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), - model.mm_model_attn_k_b); - ggml_tensor * V = ggml_add(ctx0, - ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), - model.mm_model_attn_v_b); - - Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query); - K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos); - V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos); - - cb(Q, "resampler_Q", -1); - cb(K, "resampler_K", -1); - cb(V, "resampler_V", -1); - - float resampler_kq_scale = 1.0f/ sqrtf(float(d_head)); - embeddings = build_attn( - model.mm_model_attn_o_w, - model.mm_model_attn_o_b, - Q, K, V, nullptr, resampler_kq_scale, -1); - cb(embeddings, "resampler_attn_out", -1); - } - // layernorm - embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1); - - // projection - embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - return gf; - } - - ggml_cgraph * build_internvl() { - GGML_ASSERT(model.class_embedding != nullptr); - GGML_ASSERT(model.position_embeddings != nullptr); - - const int n_pos = n_patches + 1; - ggml_tensor * inp = build_inp(); - - // add CLS token - inp = ggml_concat(ctx0, inp, model.class_embedding, 1); - - // The larger models use a different ViT, which uses RMS norm instead of layer norm - // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188 - norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45) - ? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B) - : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models) - - ggml_tensor * cur = build_vit( - inp, n_pos, - norm_t, - hparams.ffn_op, - model.position_embeddings, - nullptr); - - // remove CLS token - cur = ggml_view_2d(ctx0, cur, - n_embd, n_patches, - ggml_row_size(cur->type, n_embd), 0); - - // pixel shuffle - { - const int scale_factor = model.hparams.n_merge; - const int bsz = 1; // batch size, always 1 for now since we don't support batching - const int height = n_patches_y; - const int width = n_patches_x; - GGML_ASSERT(scale_factor > 0); - cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - cur = ggml_cont_4d(ctx0, cur, - n_embd * scale_factor * scale_factor, - height / scale_factor, - width / scale_factor, - bsz); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - // flatten to 2D - cur = ggml_cont_2d(ctx0, cur, - n_embd * scale_factor * scale_factor, - cur->ne[1] * cur->ne[2]); - } - - // projector (always using GELU activation) - { - // projector LayerNorm uses pytorch's default eps = 1e-5 - // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79 - cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1); - cur = build_ffn(cur, - model.mm_1_w, model.mm_1_b, - nullptr, nullptr, - model.mm_3_w, model.mm_3_b, - FFN_GELU, - -1); - } - - // build the graph - ggml_build_forward_expand(gf, cur); - - return gf; - } - - ggml_cgraph * build_llama4() { - GGML_ASSERT(model.class_embedding != nullptr); - GGML_ASSERT(model.position_embeddings != nullptr); - - const int n_pos = n_patches + 1; // +1 for [CLS] - - // 2D input positions - ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); - ggml_set_name(pos_h, "pos_h"); - ggml_set_input(pos_h); - - ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); - ggml_set_name(pos_w, "pos_w"); - ggml_set_input(pos_w); - - ggml_tensor * inp = build_inp_raw(); - - // Llama4UnfoldConvolution - { - ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0, - patch_size, patch_size, 3, n_embd); - inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type); - inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp); - inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches); - cb(inp, "patch_conv", -1); - } - - // add CLS token - inp = ggml_concat(ctx0, inp, model.class_embedding, 1); - - // build ViT with 2D position embeddings - auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { - // first half is X axis and second half is Y axis - // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312 - // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441 - return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); - }; - ggml_tensor * cur = build_vit( - inp, n_pos, - NORM_TYPE_NORMAL, - hparams.ffn_op, - model.position_embeddings, - add_pos); - - // remove CLS token - cur = ggml_view_2d(ctx0, cur, - n_embd, n_patches, - ggml_row_size(cur->type, n_embd), 0); - - // pixel shuffle - // based on Llama4VisionPixelShuffleMLP - // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151 - { - const int scale_factor = model.hparams.n_merge; - const int bsz = 1; // batch size, always 1 for now since we don't support batching - GGML_ASSERT(scale_factor > 0); - GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images - cur = ggml_reshape_4d(ctx0, cur, - n_embd * scale_factor, - n_patches_x / scale_factor, - n_patches_y, - bsz); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - cur = ggml_cont_4d(ctx0, cur, - n_embd * scale_factor * scale_factor, - n_patches_x / scale_factor, - n_patches_y / scale_factor, - bsz); - //cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - // flatten to 2D - cur = ggml_cont_2d(ctx0, cur, - n_embd * scale_factor * scale_factor, - n_patches / scale_factor / scale_factor); - cb(cur, "pixel_shuffle", -1); - } - - // based on Llama4VisionMLP2 (always uses GELU activation, no bias) - { - cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur); - cur = ggml_gelu(ctx0, cur); - cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur); - cur = ggml_gelu(ctx0, cur); - cb(cur, "adapter_mlp", -1); - } - - // Llama4MultiModalProjector - cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); - cb(cur, "projected", -1); - - // build the graph - ggml_build_forward_expand(gf, cur); - - return gf; - } - - ggml_cgraph * build_kimivl() { - // 2D input positions - ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); - ggml_set_name(pos_h, "pos_h"); - ggml_set_input(pos_h); - - ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); - ggml_set_name(pos_w, "pos_w"); - ggml_set_input(pos_w); - - ggml_tensor * learned_pos_embd = resize_position_embeddings(); - - // build ViT with 2D position embeddings - auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { - // first half is X axis and second half is Y axis - return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); - }; - - ggml_tensor * inp = build_inp(); - ggml_tensor * cur = build_vit( - inp, n_patches, - NORM_TYPE_NORMAL, - hparams.ffn_op, - learned_pos_embd, - add_pos); - - cb(cur, "vit_out", -1); - - { - // patch_merger - const int scale_factor = model.hparams.n_merge; - cur = build_patch_merge_permute(cur, scale_factor); - - // projection norm - int proj_inp_dim = cur->ne[0]; - cur = ggml_view_2d(ctx0, cur, - n_embd, cur->ne[1] * scale_factor * scale_factor, - ggml_row_size(cur->type, n_embd), 0); - cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm - cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); - cur = ggml_add(ctx0, cur, model.mm_input_norm_b); - cur = ggml_view_2d(ctx0, cur, - proj_inp_dim, cur->ne[1] / scale_factor / scale_factor, - ggml_row_size(cur->type, proj_inp_dim), 0); - cb(cur, "proj_inp_normed", -1); - - // projection mlp - cur = build_ffn(cur, - model.mm_1_w, model.mm_1_b, - nullptr, nullptr, - model.mm_2_w, model.mm_2_b, - FFN_GELU, - -1); - cb(cur, "proj_out", -1); - } - - // build the graph - ggml_build_forward_expand(gf, cur); - - return gf; - } - - // this graph is used by llava, granite and glm - // due to having embedding_stack (used by granite), we cannot reuse build_vit - ggml_cgraph * build_llava() { - const int batch_size = 1; - const int n_pos = n_patches + (model.class_embedding ? 1 : 0); - - GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported"); - - // Calculate the deepest feature layer based on hparams and projector type - int max_feature_layer = n_layer; - { - // Get the index of the second to last layer; this is the default for models that have a llava projector - int il_last = hparams.n_layer - 1; - int deepest_feature_layer = -1; - - if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV || ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) { - il_last += 1; - } - - // If we set explicit vision feature layers, only go up to the deepest one - // NOTE: only used by granite-vision models for now - for (const auto & feature_layer : hparams.vision_feature_layer) { - if (feature_layer > deepest_feature_layer) { - deepest_feature_layer = feature_layer; - } - } - max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer; - } - - ggml_tensor * inp = build_inp(); - - // concat class_embeddings and patch_embeddings - if (model.class_embedding) { - inp = ggml_concat(ctx0, inp, model.class_embedding, 1); - } - - ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); - ggml_set_name(positions, "positions"); - ggml_set_input(positions); - - inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions)); - - ggml_tensor * inpL = inp; - - // pre-layernorm - if (model.pre_ln_w) { - inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1); - cb(inpL, "pre_ln", -1); - } - - std::vector embedding_stack; - const auto & vision_feature_layer = hparams.vision_feature_layer; - - // loop over layers - for (int il = 0; il < max_feature_layer; il++) { - auto & layer = model.layers[il]; - ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states - - // If this is an embedding feature layer, save the output. - // NOTE: 0 index here refers to the input to the encoder. - if (vision_feature_layer.find(il) != vision_feature_layer.end()) { - embedding_stack.push_back(cur); - } - - // layernorm1 - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); - cb(cur, "layer_inp_normed", il); - - // self-attention - { - ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); + } else { + // separate q, k, v + Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); if (layer.q_b) { Qcur = ggml_add(ctx0, Qcur, layer.q_b); } - ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); + Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); if (layer.k_b) { Kcur = ggml_add(ctx0, Kcur, layer.k_b); } - ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); + Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); if (layer.v_b) { Vcur = ggml_add(ctx0, Vcur, layer.v_b); } + if (layer.q_norm) { + Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il); + cb(Qcur, "Qcur_norm", il); + } + + if (layer.k_norm) { + Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il); + cb(Kcur, "Kcur_norm", il); + } + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - cur = build_attn(layer.o_w, layer.o_b, - Qcur, Kcur, Vcur, nullptr, kq_scale, il); - cb(cur, "attn_out", il); } - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, inpL); - - inpL = cur; // inpL = residual, cur = hidden_states - - cb(cur, "ffn_inp", il); - - // layernorm2 - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); - cb(cur, "ffn_inp_normed", il); - - // ffn - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - hparams.ffn_op, il); - - cb(cur, "ffn_out", il); - - // residual 2 - cur = ggml_add(ctx0, inpL, cur); - cb(cur, "layer_out", il); - - inpL = cur; - } - - // post-layernorm - if (model.post_ln_w) { - inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1); - } - - ggml_tensor * embeddings = inpL; - - // process vision feature layers (used by granite) - { - // final layer is a vision feature layer - if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) { - embedding_stack.push_back(inpL); - } - - // If feature layers are explicitly set, stack them (if we have multiple) - if (!embedding_stack.empty()) { - embeddings = embedding_stack[0]; - for (size_t i = 1; i < embedding_stack.size(); i++) { - embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0); - } - } - } - - // llava projector (also used by granite) - if (ctx->model.hparams.has_llava_projector) { - embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); - - ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); - ggml_set_name(patches, "patches"); - ggml_set_input(patches); - - // shape [1, 576, 1024] - // ne is whcn, ne = [1024, 576, 1, 1] - embeddings = ggml_get_rows(ctx0, embeddings, patches); - - // print_tensor_info(embeddings, "embeddings"); - - // llava projector - if (ctx->proj_type() == PROJECTOR_TYPE_MLP) { - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); - - embeddings = ggml_gelu(ctx0, embeddings); - if (model.mm_2_w) { - embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); - } - } - else if (ctx->proj_type() == PROJECTOR_TYPE_MLP_NORM) { - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); - // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); - // First LayerNorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w), - model.mm_1_b); - - // GELU activation - embeddings = ggml_gelu(ctx0, embeddings); - - // Second linear layer - embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings); - embeddings = ggml_add(ctx0, embeddings, model.mm_3_b); - - // Second LayerNorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w), - model.mm_4_b); - } - else if (ctx->proj_type() == PROJECTOR_TYPE_LDP) { - // MobileVLM projector - int n_patch = 24; - ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings); - mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b); - mlp_1 = ggml_gelu(ctx0, mlp_1); - ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); - mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); - // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1] - - // block 1 - ggml_tensor * block_1 = nullptr; - { - // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] - mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3); - mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); - // stride = 1, padding = 1, bias is nullptr - block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); - - // layer norm - // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); - // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); - - // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - // hardswish - ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); - - block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); - // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] - // pointwise conv - block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b); - block_1 = ggml_relu(ctx0, block_1); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b); - block_1 = ggml_hardsigmoid(ctx0, block_1); - // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1] - block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); - block_1 = ggml_mul(ctx0, block_1_hw, block_1); - - int w = block_1->ne[0], h = block_1->ne[1]; - block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); - - // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1); - block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); - - // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); - // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] - // residual - block_1 = ggml_add(ctx0, mlp_3, block_1); - } - - // block_2 - { - // stride = 2 - block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); - - // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] - // layer norm - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); - // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); - // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] - // hardswish - ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); - - // not sure the parameters is right for globalAvgPooling - block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); - // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] - // pointwise conv - block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b); - block_1 = ggml_relu(ctx0, block_1); - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); - block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); - block_1 = ggml_hardsigmoid(ctx0, block_1); - - // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] - block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); - block_1 = ggml_mul(ctx0, block_1_hw, block_1); - - int w = block_1->ne[0], h = block_1->ne[1]; - block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); - block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); - // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] - block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1); - block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); - - - // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] - block_1 = ggml_norm(ctx0, block_1, eps); - block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); - block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); - // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] - } - embeddings = block_1; - } - else if (ctx->proj_type() == PROJECTOR_TYPE_LDPV2) - { - int n_patch = 24; - ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); - mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); - mlp_0 = ggml_gelu(ctx0, mlp_0); - ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0); - mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); - // mlp_2 ne = [2048, 576, 1, 1] - // // AVG Pool Layer 2*2, strides = 2 - mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3); - // mlp_2 ne = [576, 2048, 1, 1] - mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); - // mlp_2 ne [24, 24, 2048, 1] - mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); - // weight ne = [3, 3, 2048, 1] - ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); - peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); - peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); - mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); - peg_0 = ggml_add(ctx0, peg_0, mlp_2); - peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); - embeddings = peg_0; - } - else { - GGML_ABORT("fatal error"); - } - } - - // glm projector - else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE) { - size_t gridsz = (size_t)sqrt(embeddings->ne[1]); - embeddings = ggml_permute(ctx0,embeddings,1,0,2,3); - embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); - embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); - embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); - embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); - embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); - // GLU - { - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); - embeddings = ggml_gelu_inplace(ctx0, embeddings); - ggml_tensor * x = embeddings; - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); - x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); - embeddings = ggml_swiglu_split(ctx0, embeddings, x); - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); - } - // arrangement of BOI/EOI token embeddings - // note: these embeddings are not present in text model, hence we cannot process them as text tokens - // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53 - { - embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI - embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI - } - } - - else { - GGML_ABORT("llava: unknown projector type"); - } - - // build the graph - ggml_build_forward_expand(gf, embeddings); - - return gf; - } - // whisper encoder with custom projector - ggml_cgraph * build_whisper_enc() { - const int n_frames = img.nx; - const int n_pos = n_frames / 2; - GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos); - - ggml_tensor * inp = build_inp_raw(1); - - // conv1d block - { - // convolution + gelu - ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1); - cur = ggml_add(ctx0, cur, model.conv1d_1_b); - - cur = ggml_gelu_erf(ctx0, cur); - - cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1); - cur = ggml_add(ctx0, cur, model.conv1d_2_b); - - cur = ggml_gelu_erf(ctx0, cur); - // transpose - inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); - cb(inp, "after_conv1d", -1); - } - - // sanity check (only check one layer, but it should be the same for all) - GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b); - GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b); - GGML_ASSERT(model.layers[0].q_b); - GGML_ASSERT(model.layers[0].v_b); - GGML_ASSERT(!model.layers[0].k_b); // no bias for k - GGML_ASSERT(model.post_ln_w && model.post_ln_b); - - ggml_tensor * pos_embd_selected = ggml_view_2d( - ctx0, model.position_embeddings, - model.position_embeddings->ne[0], n_pos, - model.position_embeddings->nb[1], 0 - ); - ggml_tensor * cur = build_vit( - inp, n_pos, - NORM_TYPE_NORMAL, - hparams.ffn_op, - pos_embd_selected, - nullptr); - - cb(cur, "after_transformer", -1); - - if (model.audio_has_stack_frames()) { - // StackAudioFrames - // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py - int64_t stride = n_embd * hparams.proj_stack_factor; - int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride); - int64_t pad = padded_len - ggml_nelements(cur); - if (pad > 0) { - cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0); - cur = ggml_pad(ctx0, cur, pad, 0, 0, 0); - } - cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride, - ggml_row_size(cur->type, stride), 0); - cb(cur, "after_stacked", -1); - } - - if (ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX) { - // UltravoxProjector - // pre-norm - cur = ggml_rms_norm(ctx0, cur, 1e-6); - cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w); - - // ffn in - cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); - - // swiglu - // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half - cur = ggml_swiglu_swapped(ctx0, cur); - - // mid-norm - cur = ggml_rms_norm(ctx0, cur, 1e-6); - cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w); - - // ffn out - cur = ggml_mul_mat(ctx0, model.mm_2_w, cur); - - } else if (ctx->proj_type() == PROJECTOR_TYPE_QWEN2A) { - // projector - cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur); - cur = ggml_add(ctx0, cur, model.mm_fc_b); - - } else if (ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL) { - // projector - cur = build_ffn(cur, - model.mm_1_w, model.mm_1_b, - nullptr, nullptr, - model.mm_2_w, model.mm_2_b, - FFN_GELU_ERF, - -1); - - } else { - GGML_ABORT("%s: unknown projector type", __func__); - } - - cb(cur, "projected", -1); - - ggml_build_forward_expand(gf, cur); - - return gf; - } - - // cogvlm vision encoder - ggml_cgraph * build_cogvlm() { - GGML_ASSERT(model.class_embedding != nullptr); - GGML_ASSERT(model.position_embeddings != nullptr); - - const int n_pos = n_patches + 1; // +1 for [CLS] - - // build input and concatenate class embedding - ggml_tensor * inp = build_inp(); - inp = ggml_concat(ctx0, inp, model.class_embedding, 1); - - inp = ggml_add(ctx0, inp, model.position_embeddings); - cb(inp, "inp_pos", -1); - - ggml_tensor * inpL = inp; - - for (int il = 0; il < n_layer; il++) { - auto & layer = model.layers[il]; - ggml_tensor * cur = inpL; - - cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); - - cur = ggml_add(ctx0, cur, layer.qkv_b); - - ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), - cur->nb[1], 0); - ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), - cur->nb[1], n_embd * sizeof(float)); - ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), - cur->nb[1], 2 * n_embd * sizeof(float)); - cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); + if (add_pos) { + Qcur = add_pos(Qcur, layer); + Kcur = add_pos(Kcur, layer); + cb(Qcur, "Qcur_pos", il); + cb(Kcur, "Kcur_pos", il); + } + cur = build_attn(layer.o_w, layer.o_b, Qcur, Kcur, Vcur, nullptr, kq_scale, il); cb(cur, "attn_out", il); - - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); - cb(cur, "attn_post_norm", il); - - cur = ggml_add(ctx0, cur, inpL); - inpL = cur; - - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - hparams.ffn_op, il); - - cb(cur, "ffn_out", il); - - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); - cb(cur, "ffn_post_norm", il); - - cur = ggml_add(ctx0, cur, inpL); - cb(cur, "layer_out", il); - inpL = cur; - } - // remove CLS token (like build_llama4 does) - ggml_tensor * cur = ggml_view_2d(ctx0, inpL, - n_embd, n_patches, - ggml_row_size(inpL->type, n_embd), 0); + if (layer.ls_1_w) { + cur = ggml_mul(ctx0, cur, layer.ls_1_w); + cb(cur, "attn_out_scaled", il); + } - // Multiply with mm_model_proj - cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); - // Apply layernorm, weight, bias - cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); + inpL = cur; // inpL = residual, cur = hidden_states - // Apply GELU - cur = ggml_gelu_inplace(ctx0, cur); + cb(cur, "ffn_inp", il); - // Branch 1: multiply with mm_h_to_4h_w - ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur); + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); - // Branch 2: multiply with mm_gate_w - ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur); + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + ffn_t, il); - // Apply silu - gate = ggml_swiglu_split(ctx0, gate, h_to_4h); + cb(cur, "ffn_out", il); - // Apply mm_4h_to_h_w - cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate); + if (layer.ls_2_w) { + cur = ggml_mul(ctx0, cur, layer.ls_2_w); + cb(cur, "ffn_out_scaled", il); + } - // Concatenate with boi and eoi - cur = ggml_concat(ctx0, model.mm_boi, cur, 1); - cur = ggml_concat(ctx0, cur, model.mm_eoi, 1); + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); - // build the graph - ggml_build_forward_expand(gf, cur); - - return gf; + inpL = cur; } -private: - // - // utility functions - // - - void cb(ggml_tensor * cur0, const char * name, int il) const { - if (ctx->debug_graph) { - ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0)); - std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name; - ggml_set_name(cur, cur_name.c_str()); - ggml_set_output(cur); - ggml_build_forward_expand(gf, cur); - ctx->debug_print_tensors.push_back(cur); - } + if (model.audio_has_avgpool()) { + ggml_tensor * cur = inpL; + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont(ctx0, cur); + cur = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, 2, 2, 0); + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont(ctx0, cur); + inpL = cur; } - // siglip2 naflex - ggml_tensor * resize_position_embeddings() { - ggml_tensor * pos_embd = model.position_embeddings; - const int height = img.ny / patch_size; - const int width = img.nx / patch_size; - const uint32_t mode = GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS; - const int n_per_side = (int)std::sqrt(pos_embd->ne[1]); - - GGML_ASSERT(pos_embd); - - if (height == n_per_side && width == n_per_side) { - return pos_embd; - } - - pos_embd = ggml_reshape_3d(ctx0, pos_embd, n_embd, n_per_side, n_per_side); // -> (n_embd, n_per_side, n_per_side) - pos_embd = ggml_permute(ctx0, pos_embd, 2, 0, 1, 3); // -> (n_per_side, n_per_side, n_embd) - pos_embd = ggml_interpolate(ctx0, pos_embd, width, height, n_embd, 1, mode); // -> (width, height, n_embd) - pos_embd = ggml_permute(ctx0, pos_embd, 1, 2, 0, 3); // -> (n_embd, width, height) - pos_embd = ggml_cont_2d(ctx0, pos_embd, n_embd, width * height); // -> (n_embd, width * height) - - return pos_embd; + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1); } + return inpL; +} - // build vision transformer (ViT) cgraph - // this function should cover most of the models - // if your model has specific features, you should probably duplicate this function - ggml_tensor * build_vit( - ggml_tensor * inp, - int64_t n_pos, - norm_type norm_t, - ffn_op_type ffn_t, - ggml_tensor * learned_pos_embd, - std::function add_pos - ) { - if (learned_pos_embd) { - inp = ggml_add(ctx0, inp, learned_pos_embd); - cb(inp, "pos_embed", -1); - } - - ggml_tensor * inpL = inp; - - // pre-layernorm - if (model.pre_ln_w) { - inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); - cb(inpL, "pre_ln", -1); - } - - // loop over layers - for (int il = 0; il < n_layer; il++) { - auto & layer = model.layers[il]; - ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states - - // layernorm1 - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); - cb(cur, "layer_inp_normed", il); - - // self-attention - { - ggml_tensor * Qcur = nullptr; - ggml_tensor * Kcur = nullptr; - ggml_tensor * Vcur = nullptr; - if (layer.qkv_w != nullptr) { - // fused qkv - cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); - if (layer.qkv_b != nullptr) { - cur = ggml_add(ctx0, cur, layer.qkv_b); - } - - Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ 0); - - Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ ggml_row_size(cur->type, n_embd)); - - Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, - /* nb1 */ ggml_row_size(cur->type, d_head), - /* nb2 */ cur->nb[1], - /* offset */ ggml_row_size(cur->type, 2 * n_embd)); - - // TODO: q/k norm requires row size == n_embd, while here it's d_head - // we can add support in the future if needed - GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr); - - } else { - // separate q, k, v - Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); - if (layer.q_b) { - Qcur = ggml_add(ctx0, Qcur, layer.q_b); - } - - Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); - if (layer.k_b) { - Kcur = ggml_add(ctx0, Kcur, layer.k_b); - } - - Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); - if (layer.v_b) { - Vcur = ggml_add(ctx0, Vcur, layer.v_b); - } - - if (layer.q_norm) { - Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il); - cb(Qcur, "Qcur_norm", il); - } - - if (layer.k_norm) { - Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il); - cb(Kcur, "Kcur_norm", il); - } - - Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); - Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); - Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); - } - - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); - - if (add_pos) { - Qcur = add_pos(Qcur, layer); - Kcur = add_pos(Kcur, layer); - cb(Qcur, "Qcur_pos", il); - cb(Kcur, "Kcur_pos", il); - } - - cur = build_attn(layer.o_w, layer.o_b, - Qcur, Kcur, Vcur, nullptr, kq_scale, il); - cb(cur, "attn_out", il); - } - - if (layer.ls_1_w) { - cur = ggml_mul(ctx0, cur, layer.ls_1_w); - cb(cur, "attn_out_scaled", il); - } - - // re-add the layer input, e.g., residual - cur = ggml_add(ctx0, cur, inpL); - - inpL = cur; // inpL = residual, cur = hidden_states - - cb(cur, "ffn_inp", il); - - // layernorm2 - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); - cb(cur, "ffn_inp_normed", il); - - // ffn - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - ffn_t, il); - - cb(cur, "ffn_out", il); - - if (layer.ls_2_w) { - cur = ggml_mul(ctx0, cur, layer.ls_2_w); - cb(cur, "ffn_out_scaled", il); - } - - // residual 2 - cur = ggml_add(ctx0, inpL, cur); - cb(cur, "layer_out", il); - - inpL = cur; - } - - if (ctx->model.audio_has_avgpool()) { - ggml_tensor * cur = inpL; - cur = ggml_transpose(ctx0, cur); - cur = ggml_cont(ctx0, cur); - cur = ggml_pool_1d(ctx0, cur, GGML_OP_POOL_AVG, 2, 2, 0); - cur = ggml_transpose(ctx0, cur); - cur = ggml_cont(ctx0, cur); - inpL = cur; - } - - // post-layernorm - if (model.post_ln_w) { - inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, -1); - } - return inpL; +// build the input after conv2d (inp_raw --> patches) +// returns tensor with shape [n_embd, n_patches] +ggml_tensor * clip_graph::build_inp() { + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd); + inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); + if (model.patch_bias) { + inp = ggml_add(ctx0, inp, model.patch_bias); + cb(inp, "patch_bias", -1); } + return inp; +} - // build the input after conv2d (inp_raw --> patches) - // returns tensor with shape [n_embd, n_patches] - ggml_tensor * build_inp() { - ggml_tensor * inp_raw = build_inp_raw(); - ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd); - inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); - if (model.patch_bias) { - inp = ggml_add(ctx0, inp, model.patch_bias); - cb(inp, "patch_bias", -1); - } - return inp; - } +ggml_tensor * clip_graph::build_inp_raw(int channels) { + ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + return inp_raw; +} - ggml_tensor * build_inp_raw(int channels = 3) { - ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels); - ggml_set_name(inp_raw, "inp_raw"); - ggml_set_input(inp_raw); - return inp_raw; - } - - ggml_tensor * build_norm( - ggml_tensor * cur, - ggml_tensor * mw, - ggml_tensor * mb, - norm_type type, - float norm_eps, - int il) const { - - cur = type == NORM_TYPE_RMS - ? ggml_rms_norm(ctx0, cur, norm_eps) - : ggml_norm(ctx0, cur, norm_eps); - - if (mw || mb) { - cb(cur, "norm", il); - } - - if (mw) { - cur = ggml_mul(ctx0, cur, mw); - if (mb) { - cb(cur, "norm_w", il); - } - } - - if (mb) { - cur = ggml_add(ctx0, cur, mb); - } - - return cur; - } - - ggml_tensor * build_ffn( - ggml_tensor * cur, - ggml_tensor * up, - ggml_tensor * up_b, - ggml_tensor * gate, - ggml_tensor * gate_b, - ggml_tensor * down, - ggml_tensor * down_b, - ffn_op_type type_op, - int il) const { - - ggml_tensor * tmp = up ? ggml_mul_mat(ctx0, up, cur) : cur; - cb(tmp, "ffn_up", il); - - if (up_b) { - tmp = ggml_add(ctx0, tmp, up_b); - cb(tmp, "ffn_up_b", il); - } - - if (gate) { - cur = ggml_mul_mat(ctx0, gate, cur); - cb(cur, "ffn_gate", il); - - if (gate_b) { - cur = ggml_add(ctx0, cur, gate_b); - cb(cur, "ffn_gate_b", il); - } - } else { - cur = tmp; - } - - // we only support parallel ffn for now - switch (type_op) { - case FFN_SILU: - if (gate) { - cur = ggml_swiglu_split(ctx0, cur, tmp); - cb(cur, "ffn_swiglu", il); - } else { - cur = ggml_silu(ctx0, cur); - cb(cur, "ffn_silu", il); - } break; - case FFN_GELU: - if (gate) { - cur = ggml_geglu_split(ctx0, cur, tmp); - cb(cur, "ffn_geglu", il); - } else { - cur = ggml_gelu(ctx0, cur); - cb(cur, "ffn_gelu", il); - } break; - case FFN_GELU_ERF: - if (gate) { - cur = ggml_geglu_erf_split(ctx0, cur, tmp); - cb(cur, "ffn_geglu_erf", il); - } else { - cur = ggml_gelu_erf(ctx0, cur); - cb(cur, "ffn_gelu_erf", il); - } break; - case FFN_GELU_QUICK: - if (gate) { - cur = ggml_geglu_quick_split(ctx0, cur, tmp); - cb(cur, "ffn_geglu_quick", il); - } else { - cur = ggml_gelu_quick(ctx0, cur); - cb(cur, "ffn_gelu_quick", il); - } break; - } - - if (down) { - cur = ggml_mul_mat(ctx0, down, cur); - } - - if (down_b) { - cb(cur, "ffn_down", il); - } - - if (down_b) { - cur = ggml_add(ctx0, cur, down_b); - } - - return cur; - } - - ggml_tensor * build_attn( - ggml_tensor * wo, - ggml_tensor * wo_b, - ggml_tensor * q_cur, - ggml_tensor * k_cur, - ggml_tensor * v_cur, - ggml_tensor * kq_mask, - float kq_scale, - int il) const { - // these nodes are added to the graph together so that they are not reordered - // by doing so, the number of splits in the graph is reduced - ggml_build_forward_expand(gf, q_cur); - ggml_build_forward_expand(gf, k_cur); - ggml_build_forward_expand(gf, v_cur); - - ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); - //cb(q, "q", il); - - ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); - //cb(k, "k", il); - - ggml_tensor * cur; - - if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { - ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); - - k = ggml_cast(ctx0, k, GGML_TYPE_F16); - v = ggml_cast(ctx0, v, GGML_TYPE_F16); - - cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, 0.0f, 0.0f); - ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); - - cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]); - - } else { - ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3); - v = ggml_cont(ctx0, v); - - const auto n_tokens = q->ne[1]; - const auto n_head = q->ne[2]; - - ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); - // F32 may not needed for vision encoders? - // ggml_mul_mat_set_prec(kq, GGML_PREC_F32); - - kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f); - - ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); - cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3); - cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens); - } - - cb(cur, "kqv_out", il); - - if (wo) { - cur = ggml_mul_mat(ctx0, wo, cur); - } - - if (wo_b) { - cur = ggml_add(ctx0, cur, wo_b); - } - - return cur; - } - - // implementation of the 2D RoPE without adding a new op in ggml - // this is not efficient (use double the memory), but works on all backends - // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 - static ggml_tensor * build_rope_2d( - ggml_context * ctx0, +ggml_tensor * clip_graph::build_norm( ggml_tensor * cur, - ggml_tensor * pos_a, // first half - ggml_tensor * pos_b, // second half - const float freq_base, - const bool interleave_freq - ) { - const int64_t n_dim = cur->ne[0]; - const int64_t n_head = cur->ne[1]; - const int64_t n_pos = cur->ne[2]; + ggml_tensor * mw, + ggml_tensor * mb, + norm_type type, + float norm_eps, + int il) const { - // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos) - // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3 - // first half of cur will use 1e-0, 1e-2 (even) - // second half of cur will use 1e-1, 1e-3 (odd) - // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even - // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2) - // then for the second half, we use freq_scale to shift the inv_freq - // ^ why? replace (2i) with (2i+1) in the above equation - const float freq_scale_odd = interleave_freq - ? std::pow(freq_base, (float)-2/n_dim) - : 1.0; + cur = type == NORM_TYPE_RMS + ? ggml_rms_norm(ctx0, cur, norm_eps) + : ggml_norm(ctx0, cur, norm_eps); - // first half - ggml_tensor * first; - { - first = ggml_view_3d(ctx0, cur, - n_dim/2, n_head, n_pos, - ggml_row_size(cur->type, n_dim), - ggml_row_size(cur->type, n_dim*n_head), - 0); - first = ggml_rope_ext( - ctx0, - first, - pos_a, // positions - nullptr, // freq factors - n_dim/2, // n_dims - 0, 0, freq_base, - 1.0f, 0.0f, 1.0f, 0.0f, 0.0f - ); + if (mw || mb) { + cb(cur, "norm", il); + } + + if (mw) { + cur = ggml_mul(ctx0, cur, mw); + if (mb) { + cb(cur, "norm_w", il); } + } - // second half - ggml_tensor * second; - { - second = ggml_view_3d(ctx0, cur, - n_dim/2, n_head, n_pos, - ggml_row_size(cur->type, n_dim), - ggml_row_size(cur->type, n_dim*n_head), - n_dim/2 * ggml_element_size(cur)); - second = ggml_rope_ext( - ctx0, - second, - pos_b, // positions - nullptr, // freq factors - n_dim/2, // n_dims - 0, 0, freq_base, - freq_scale_odd, - 0.0f, 1.0f, 0.0f, 0.0f - ); + if (mb) { + cur = ggml_add(ctx0, cur, mb); + } + + return cur; +} + +ggml_tensor * clip_graph::build_ffn( + ggml_tensor * cur, + ggml_tensor * up, + ggml_tensor * up_b, + ggml_tensor * gate, + ggml_tensor * gate_b, + ggml_tensor * down, + ggml_tensor * down_b, + ffn_op_type type_op, + int il) const { + + ggml_tensor * tmp = up ? ggml_mul_mat(ctx0, up, cur) : cur; + cb(tmp, "ffn_up", il); + + if (up_b) { + tmp = ggml_add(ctx0, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } + + if (gate) { + cur = ggml_mul_mat(ctx0, gate, cur); + cb(cur, "ffn_gate", il); + + if (gate_b) { + cur = ggml_add(ctx0, cur, gate_b); + cb(cur, "ffn_gate_b", il); } + } else { + cur = tmp; + } - cur = ggml_concat(ctx0, first, second, 0); + // we only support parallel ffn for now + switch (type_op) { + case FFN_SILU: + if (gate) { + cur = ggml_swiglu_split(ctx0, cur, tmp); + cb(cur, "ffn_swiglu", il); + } else { + cur = ggml_silu(ctx0, cur); + cb(cur, "ffn_silu", il); + } break; + case FFN_GELU: + if (gate) { + cur = ggml_geglu_split(ctx0, cur, tmp); + cb(cur, "ffn_geglu", il); + } else { + cur = ggml_gelu(ctx0, cur); + cb(cur, "ffn_gelu", il); + } break; + case FFN_GELU_ERF: + if (gate) { + cur = ggml_geglu_erf_split(ctx0, cur, tmp); + cb(cur, "ffn_geglu_erf", il); + } else { + cur = ggml_gelu_erf(ctx0, cur); + cb(cur, "ffn_gelu_erf", il); + } break; + case FFN_GELU_QUICK: + if (gate) { + cur = ggml_geglu_quick_split(ctx0, cur, tmp); + cb(cur, "ffn_geglu_quick", il); + } else { + cur = ggml_gelu_quick(ctx0, cur); + cb(cur, "ffn_gelu_quick", il); + } break; + } + + if (down) { + cur = ggml_mul_mat(ctx0, down, cur); + } + + if (down_b) { + cb(cur, "ffn_down", il); + } + + if (down_b) { + cur = ggml_add(ctx0, cur, down_b); + } + + return cur; +} + +ggml_tensor * clip_graph::build_attn( + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale, + int il) const { + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(gf, q_cur); + ggml_build_forward_expand(gf, k_cur); + ggml_build_forward_expand(gf, v_cur); + + ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); + + ggml_tensor * k = ggml_permute(ctx0, k_cur, 0, 2, 1, 3); + //cb(k, "k", il); + + ggml_tensor * cur; + + if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); + + k = ggml_cast(ctx0, k, GGML_TYPE_F16); + v = ggml_cast(ctx0, v, GGML_TYPE_F16); + + cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, 0.0f, 0.0f); + ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]); + + } else { + ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3); + v = ggml_cont(ctx0, v); + + const auto n_tokens = q->ne[1]; + const auto n_head = q->ne[2]; + + ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + // F32 may not needed for vision encoders? + // ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + + kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f); + + ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3); + cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens); + } + + cb(cur, "kqv_out", il); + + if (wo) { + cur = ggml_mul_mat(ctx0, wo, cur); + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + + return cur; +} + +// implementation of the 2D RoPE without adding a new op in ggml +// this is not efficient (use double the memory), but works on all backends +// TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 +ggml_tensor * clip_graph::build_rope_2d( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * pos_a, // first half + ggml_tensor * pos_b, // second half + const float freq_base, + const bool interleave_freq +) { + const int64_t n_dim = cur->ne[0]; + const int64_t n_head = cur->ne[1]; + const int64_t n_pos = cur->ne[2]; + + // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos) + // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3 + // first half of cur will use 1e-0, 1e-2 (even) + // second half of cur will use 1e-1, 1e-3 (odd) + // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even + // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2) + // then for the second half, we use freq_scale to shift the inv_freq + // ^ why? replace (2i) with (2i+1) in the above equation + const float freq_scale_odd = interleave_freq + ? std::pow(freq_base, (float)-2/n_dim) + : 1.0; + + // first half + ggml_tensor * first; + { + first = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + ggml_row_size(cur->type, n_dim), + ggml_row_size(cur->type, n_dim*n_head), + 0); + first = ggml_rope_ext( + ctx0, + first, + pos_a, // positions + nullptr, // freq factors + n_dim/2, // n_dims + 0, 0, freq_base, + 1.0f, 0.0f, 1.0f, 0.0f, 0.0f + ); + } + + // second half + ggml_tensor * second; + { + second = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + ggml_row_size(cur->type, n_dim), + ggml_row_size(cur->type, n_dim*n_head), + n_dim/2 * ggml_element_size(cur)); + second = ggml_rope_ext( + ctx0, + second, + pos_b, // positions + nullptr, // freq factors + n_dim/2, // n_dims + 0, 0, freq_base, + freq_scale_odd, + 0.0f, 1.0f, 0.0f, 0.0f + ); + } + + cur = ggml_concat(ctx0, first, second, 0); + return cur; +} + +// Generic function to stack frames for audio processing +// Abstracts out the StackAudioFrames logic used by ultravox +ggml_tensor * clip_graph::build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed) { + if (stack_factor <= 1) { return cur; } - // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL) - // support dynamic resolution - ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor) { - GGML_ASSERT(scale_factor > 1); + int64_t total_elements = ggml_nelements(cur); + int64_t stride = n_embed * stack_factor; - const int n_embd = cur->ne[0]; - int width = img.nx / patch_size; - int height = img.ny / patch_size; + // Calculate padded length + int64_t padded_len = GGML_PAD(total_elements, stride); + int64_t pad = padded_len - total_elements; - // pad width and height to factor - const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width; - const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height; - cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height); - if (pad_width || pad_height) { - cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0); - width += pad_width; - height += pad_height; - } - - // unshuffle h - cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - - // unshuffle w - cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor); - cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); - - cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); - cb(cur, "pixel_shuffle", -1); - - return cur; + if (pad > 0) { + // Pad the tensor to make it divisible by stride + cur = ggml_view_1d(ctx0, cur, total_elements, 0); + cur = ggml_pad(ctx0, cur, pad, 0, 0, 0); } -}; + // Reshape to [stride, padded_len / stride] + cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride, + ggml_row_size(cur->type, stride), 0); + return cur; +} + +// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL) +// support dynamic resolution +ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale_factor) { + GGML_ASSERT(scale_factor > 1); + + const int n_embd = cur->ne[0]; + int width = img.nx / patch_size; + int height = img.ny / patch_size; + + // pad width and height to factor + const int64_t pad_width = CLIP_ALIGN(width, scale_factor) - width; + const int64_t pad_height = CLIP_ALIGN(height, scale_factor) - height; + cur = ggml_reshape_3d(ctx0, cur, n_embd, width, height); + if (pad_width || pad_height) { + cur = ggml_pad(ctx0, cur, 0, pad_width, pad_height, 0); + width += pad_width; + height += pad_height; + } + + // unshuffle h + cur = ggml_reshape_3d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + + // unshuffle w + cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, height / scale_factor, width / scale_factor); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + + cur = ggml_cont_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); + cb(cur, "pixel_shuffle", -1); + + return cur; +} static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) { GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported"); - clip_graph graph(ctx, *imgs.entries[0]); - ggml_cgraph * res; + const clip_image_f32 & img = *imgs.entries[0]; + std::unique_ptr builder; switch (ctx->proj_type()) { case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_LFM2: + case PROJECTOR_TYPE_JANUS_PRO: { - res = graph.build_siglip(); + builder = std::make_unique(ctx, img); } break; case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_LIGHTONOCR: { - res = graph.build_pixtral(); + builder = std::make_unique(ctx, img); } break; case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: { - res = graph.build_qwen2vl(); + builder = std::make_unique(ctx, img); } break; case PROJECTOR_TYPE_QWEN3VL: { - res = graph.build_qwen3vl(); + builder = std::make_unique(ctx, img); } break; case PROJECTOR_TYPE_MINICPMV: { - res = graph.build_minicpmv(); + builder = std::make_unique(ctx, img); } break; case PROJECTOR_TYPE_INTERNVL: { - res = graph.build_internvl(); + builder = std::make_unique(ctx, img); } break; case PROJECTOR_TYPE_LLAMA4: { - res = graph.build_llama4(); + builder = std::make_unique(ctx, img); } break; case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_VOXTRAL: case PROJECTOR_TYPE_QWEN2A: + case PROJECTOR_TYPE_GLMA: { - res = graph.build_whisper_enc(); + builder = std::make_unique(ctx, img); } break; case PROJECTOR_TYPE_KIMIVL: { - res = graph.build_kimivl(); - } break; - case PROJECTOR_TYPE_JANUS_PRO: - { - res = graph.build_siglip(); + builder = std::make_unique(ctx, img); } break; case PROJECTOR_TYPE_COGVLM: { - res = graph.build_cogvlm(); + builder = std::make_unique(ctx, img); + } break; + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + case PROJECTOR_TYPE_LDP: + case PROJECTOR_TYPE_LDPV2: + case PROJECTOR_TYPE_GLM_EDGE: + { + builder = std::make_unique(ctx, img); } break; default: - { - res = graph.build_llava(); - } break; + GGML_ABORT("missing cgraph builder"); } - return res; + + return builder->build(); } +// +// clip_model_loader +// + struct clip_model_loader { ggml_context_ptr ctx_meta; gguf_context_ptr ctx_gguf; @@ -2933,10 +1234,12 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_QWEN2A: + case PROJECTOR_TYPE_GLMA: case PROJECTOR_TYPE_VOXTRAL: { bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX || - model.proj_type == PROJECTOR_TYPE_VOXTRAL; + model.proj_type == PROJECTOR_TYPE_VOXTRAL || + model.proj_type == PROJECTOR_TYPE_GLMA; get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack); if (hparams.n_mel_bins != 128) { throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__)); @@ -3307,6 +1610,21 @@ struct clip_model_loader { model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight")); model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias")); } break; + case PROJECTOR_TYPE_GLMA: + { + model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); + model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias")); + model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight")); + model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias")); + model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight")); + model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias")); + model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias")); + model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight")); + model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias")); + model.mm_boi = get_tensor(string_format(TN_TOK_BOI, "weight")); + model.mm_eoi = get_tensor(string_format(TN_TOK_EOI, "weight")); + } break; case PROJECTOR_TYPE_LLAMA4: { model.mm_model_proj = get_tensor(TN_MM_PROJECTOR); @@ -4152,7 +2470,14 @@ struct llava_uhd { clip_image_size refined_size; // size of image right before slicing (must be multiple of slice size) clip_image_size grid_size; // grid_size.width * grid_size.height = number of slices std::vector slices; + + img_tool::resize_algo interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR; + bool padding_overview = false; // if true, refine image will be padded to the grid size (e.g. llava-1.6) + std::array pad_color_overview = {0, 0, 0}; + + img_tool::resize_algo interpolation_refined = img_tool::RESIZE_ALGO_BICUBIC; bool padding_refined = false; // if true, refine image will be padded to the grid size (e.g. llava-1.6) + std::array pad_color_refined = {0, 0, 0}; }; static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) { @@ -4179,10 +2504,11 @@ struct llava_uhd { auto refine_size = llava_uhd::select_best_resolution( original_size, ctx->model.hparams.image_res_candidates); - res.overview_size = clip_image_size{slice_size, slice_size}; - res.refined_size = refine_size; - res.grid_size = clip_image_size{0, 0}; - res.padding_refined = true; + res.overview_size = clip_image_size{slice_size, slice_size}; + res.refined_size = refine_size; + res.grid_size = clip_image_size{0, 0}; + res.padding_refined = true; + res.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR; // preserve old behavior when padding LOG_DBG("%s: using pinpoints for slicing\n", __func__); LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n", @@ -4261,12 +2587,13 @@ struct llava_uhd { static std::vector slice_image(const clip_image_u8 * img, const slice_instructions & inst) { std::vector output; - img_tool::resize_algo interpolation = img_tool::RESIZE_ALGO_BILINEAR; // TODO: make it configurable // resize to overview size clip_image_u8_ptr resized_img(clip_image_u8_init()); - img_tool::resize(*img, *resized_img, inst.overview_size, interpolation); + img_tool::resize(*img, *resized_img, inst.overview_size, inst.interpolation_overview, + inst.padding_overview, inst.pad_color_overview); output.push_back(std::move(resized_img)); + if (inst.slices.empty()) { // no slices, just return the resized image return output; @@ -4274,13 +2601,8 @@ struct llava_uhd { // resize to refined size clip_image_u8_ptr refined_img(clip_image_u8_init()); - if (inst.padding_refined) { - img_tool::resize(*img, *refined_img, inst.refined_size, interpolation); - } else { - // only algo bicubic preserves the ratio; old models rely on this behavior - // TODO: do we need to support other algos here? - img_tool::resize(*img, *refined_img, inst.refined_size, img_tool::RESIZE_ALGO_BICUBIC, false); - } + img_tool::resize(*img, *refined_img, inst.refined_size, inst.interpolation_refined, + inst.padding_refined, inst.pad_color_refined); // create slices for (const auto & slice : inst.slices) { @@ -4813,6 +3135,16 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im n_patches /= 2; } } break; + case PROJECTOR_TYPE_GLMA: + { + n_patches = img->nx; + // whisper downscales input token by half after conv1d + n_patches /= 2; + // reshape by merge_factor + n_patches /= ctx->model.hparams.proj_stack_factor; + // for BOI and EOI token embeddings + n_patches += 2; + } break; case PROJECTOR_TYPE_COGVLM: { n_patches += 2; // for BOI and EOI token embeddings @@ -5148,6 +3480,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_IDEFICS3: case PROJECTOR_TYPE_INTERNVL: case PROJECTOR_TYPE_QWEN2A: + case PROJECTOR_TYPE_GLMA: case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_VOXTRAL: @@ -5455,6 +3788,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return ctx->model.mm_model_proj->ne[1]; case PROJECTOR_TYPE_QWEN2A: return ctx->model.mm_fc_w->ne[1]; + case PROJECTOR_TYPE_GLMA: + return ctx->model.mm_2_w->ne[1]; case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_KIMIVL: return ctx->model.mm_2_w->ne[1]; @@ -5505,6 +3840,7 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) { bool clip_has_whisper_encoder(const struct clip_ctx * ctx) { return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A + || ctx->proj_type() == PROJECTOR_TYPE_GLMA || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL; } diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index d8801c459..3fd2f9cd0 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -7,6 +7,8 @@ // !!! Internal header, to be used by mtmd only !!! +#define MTMD_INTERNAL_HEADER + struct clip_ctx; struct clip_image_size { diff --git a/tools/mtmd/models/cogvlm.cpp b/tools/mtmd/models/cogvlm.cpp new file mode 100644 index 000000000..d5b739c68 --- /dev/null +++ b/tools/mtmd/models/cogvlm.cpp @@ -0,0 +1,98 @@ +#include "models.h" + +ggml_cgraph * clip_graph_cogvlm::build() { + GGML_ASSERT(model.class_embedding != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + const int n_pos = n_patches + 1; // +1 for [CLS] + + // build input and concatenate class embedding + ggml_tensor * inp = build_inp(); + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + + inp = ggml_add(ctx0, inp, model.position_embeddings); + cb(inp, "inp_pos", -1); + + ggml_tensor * inpL = inp; + + for (int il = 0; il < n_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; + + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + + cur = ggml_add(ctx0, cur, layer.qkv_b); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), + cur->nb[1], 0); + ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), + cur->nb[1], n_embd * sizeof(float)); + ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), + cur->nb[1], 2 * n_embd * sizeof(float)); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "attn_post_norm", il); + + cur = ggml_add(ctx0, cur, inpL); + inpL = cur; + + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "ffn_post_norm", il); + + cur = ggml_add(ctx0, cur, inpL); + cb(cur, "layer_out", il); + inpL = cur; + + } + + // remove CLS token (like build_llama4 does) + ggml_tensor * cur = ggml_view_2d(ctx0, inpL, + n_embd, n_patches, + ggml_row_size(inpL->type, n_embd), 0); + + // Multiply with mm_model_proj + cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); + + // Apply layernorm, weight, bias + cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); + + // Apply GELU + cur = ggml_gelu_inplace(ctx0, cur); + + // Branch 1: multiply with mm_h_to_4h_w + ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur); + + // Branch 2: multiply with mm_gate_w + ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur); + + // Apply silu + gate = ggml_swiglu_split(ctx0, gate, h_to_4h); + + // Apply mm_4h_to_h_w + cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate); + + // Concatenate with boi and eoi + cur = ggml_concat(ctx0, model.mm_boi, cur, 1); + cur = ggml_concat(ctx0, cur, model.mm_eoi, 1); + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/models/internvl.cpp b/tools/mtmd/models/internvl.cpp new file mode 100644 index 000000000..9aded3b97 --- /dev/null +++ b/tools/mtmd/models/internvl.cpp @@ -0,0 +1,69 @@ +#include "models.h" + +ggml_cgraph * clip_graph_internvl::build() { + GGML_ASSERT(model.class_embedding != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + const int n_pos = n_patches + 1; + ggml_tensor * inp = build_inp(); + + // add CLS token + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + + // The larger models use a different ViT, which uses RMS norm instead of layer norm + // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188 + norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45) + ? NORM_TYPE_RMS // 6B ViT (Used by InternVL 2.5/3 - 26B, 38B, 78B) + : NORM_TYPE_NORMAL; // 300M ViT (Used by all smaller InternVL models) + + ggml_tensor * cur = build_vit( + inp, n_pos, + norm_t, + hparams.ffn_op, + model.position_embeddings, + nullptr); + + // remove CLS token + cur = ggml_view_2d(ctx0, cur, + n_embd, n_patches, + ggml_row_size(cur->type, n_embd), 0); + + // pixel shuffle + { + const int scale_factor = model.hparams.n_merge; + const int bsz = 1; // batch size, always 1 for now since we don't support batching + const int height = n_patches_y; + const int width = n_patches_x; + GGML_ASSERT(scale_factor > 0); + cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, height / scale_factor, width, bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_cont_4d(ctx0, cur, + n_embd * scale_factor * scale_factor, + height / scale_factor, + width / scale_factor, + bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + // flatten to 2D + cur = ggml_cont_2d(ctx0, cur, + n_embd * scale_factor * scale_factor, + cur->ne[1] * cur->ne[2]); + } + + // projector (always using GELU activation) + { + // projector LayerNorm uses pytorch's default eps = 1e-5 + // ref: https://huggingface.co/OpenGVLab/InternVL3-8B-Instruct/blob/a34d3e4e129a5856abfd6aa6de79776484caa14e/modeling_internvl_chat.py#L79 + cur = build_norm(cur, model.mm_0_w, model.mm_0_b, NORM_TYPE_NORMAL, 1e-5, -1); + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_3_w, model.mm_3_b, + FFN_GELU, + -1); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/models/kimivl.cpp b/tools/mtmd/models/kimivl.cpp new file mode 100644 index 000000000..0a06f5090 --- /dev/null +++ b/tools/mtmd/models/kimivl.cpp @@ -0,0 +1,63 @@ +#include "models.h" + +ggml_cgraph * clip_graph_kimivl::build() { + // 2D input positions + ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + ggml_tensor * learned_pos_embd = resize_position_embeddings(); + + // build ViT with 2D position embeddings + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + // first half is X axis and second half is Y axis + return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); + }; + + ggml_tensor * inp = build_inp(); + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_NORMAL, + hparams.ffn_op, + learned_pos_embd, + add_pos); + + cb(cur, "vit_out", -1); + + { + // patch_merger + const int scale_factor = model.hparams.n_merge; + cur = build_patch_merge_permute(cur, scale_factor); + + // projection norm + int proj_inp_dim = cur->ne[0]; + cur = ggml_view_2d(ctx0, cur, + n_embd, cur->ne[1] * scale_factor * scale_factor, + ggml_row_size(cur->type, n_embd), 0); + cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm + cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); + cur = ggml_add(ctx0, cur, model.mm_input_norm_b); + cur = ggml_view_2d(ctx0, cur, + proj_inp_dim, cur->ne[1] / scale_factor / scale_factor, + ggml_row_size(cur->type, proj_inp_dim), 0); + cb(cur, "proj_inp_normed", -1); + + // projection mlp + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU, + -1); + cb(cur, "proj_out", -1); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/models/llama4.cpp b/tools/mtmd/models/llama4.cpp new file mode 100644 index 000000000..30d1df5bc --- /dev/null +++ b/tools/mtmd/models/llama4.cpp @@ -0,0 +1,96 @@ +#include "models.h" + +ggml_cgraph * clip_graph_llama4::build() { + GGML_ASSERT(model.class_embedding != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + const int n_pos = n_patches + 1; // +1 for [CLS] + + // 2D input positions + ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + ggml_tensor * inp = build_inp_raw(); + + // Llama4UnfoldConvolution + { + ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0, + patch_size, patch_size, 3, n_embd); + inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type); + inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp); + inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches); + cb(inp, "patch_conv", -1); + } + + // add CLS token + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + + // build ViT with 2D position embeddings + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + // first half is X axis and second half is Y axis + // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312 + // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441 + return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); + }; + ggml_tensor * cur = build_vit( + inp, n_pos, + NORM_TYPE_NORMAL, + hparams.ffn_op, + model.position_embeddings, + add_pos); + + // remove CLS token + cur = ggml_view_2d(ctx0, cur, + n_embd, n_patches, + ggml_row_size(cur->type, n_embd), 0); + + // pixel shuffle + // based on Llama4VisionPixelShuffleMLP + // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151 + { + const int scale_factor = model.hparams.n_merge; + const int bsz = 1; // batch size, always 1 for now since we don't support batching + GGML_ASSERT(scale_factor > 0); + GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images + cur = ggml_reshape_4d(ctx0, cur, + n_embd * scale_factor, + n_patches_x / scale_factor, + n_patches_y, + bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_cont_4d(ctx0, cur, + n_embd * scale_factor * scale_factor, + n_patches_x / scale_factor, + n_patches_y / scale_factor, + bsz); + //cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + // flatten to 2D + cur = ggml_cont_2d(ctx0, cur, + n_embd * scale_factor * scale_factor, + n_patches / scale_factor / scale_factor); + cb(cur, "pixel_shuffle", -1); + } + + // based on Llama4VisionMLP2 (always uses GELU activation, no bias) + { + cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur); + cur = ggml_gelu(ctx0, cur); + cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur); + cur = ggml_gelu(ctx0, cur); + cb(cur, "adapter_mlp", -1); + } + + // Llama4MultiModalProjector + cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); + cb(cur, "projected", -1); + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/models/llava.cpp b/tools/mtmd/models/llava.cpp new file mode 100644 index 000000000..0bfb5f05f --- /dev/null +++ b/tools/mtmd/models/llava.cpp @@ -0,0 +1,374 @@ +#include "models.h" + +// this graph is used by llava, granite and glm +// due to having embedding_stack (used by granite), we cannot reuse build_vit +ggml_cgraph * clip_graph_llava::build() { + const int batch_size = 1; + const int n_pos = n_patches + (model.class_embedding ? 1 : 0); + + GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported"); + + // Calculate the deepest feature layer based on hparams and projector type + int max_feature_layer = n_layer; + { + // Get the index of the second to last layer; this is the default for models that have a llava projector + int il_last = hparams.n_layer - 1; + int deepest_feature_layer = -1; + + if (proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE) { + il_last += 1; + } + + // If we set explicit vision feature layers, only go up to the deepest one + // NOTE: only used by granite-vision models for now + for (const auto & feature_layer : hparams.vision_feature_layer) { + if (feature_layer > deepest_feature_layer) { + deepest_feature_layer = feature_layer; + } + } + max_feature_layer = deepest_feature_layer < 0 ? il_last : deepest_feature_layer; + } + + ggml_tensor * inp = build_inp(); + + // concat class_embeddings and patch_embeddings + if (model.class_embedding) { + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + } + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions)); + + ggml_tensor * inpL = inp; + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1); + cb(inpL, "pre_ln", -1); + } + + std::vector embedding_stack; + const auto & vision_feature_layer = hparams.vision_feature_layer; + + // loop over layers + for (int il = 0; il < max_feature_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // If this is an embedding feature layer, save the output. + // NOTE: 0 index here refers to the input to the encoder. + if (vision_feature_layer.find(il) != vision_feature_layer.end()) { + embedding_stack.push_back(cur); + } + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "layer_inp_normed", il); + + // self-attention + { + ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur); + if (layer.q_b) { + Qcur = ggml_add(ctx0, Qcur, layer.q_b); + } + + ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur); + if (layer.k_b) { + Kcur = ggml_add(ctx0, Kcur, layer.k_b); + } + + ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur); + if (layer.v_b) { + Vcur = ggml_add(ctx0, Vcur, layer.v_b); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1); + } + + ggml_tensor * embeddings = inpL; + + // process vision feature layers (used by granite) + { + // final layer is a vision feature layer + if (vision_feature_layer.find(max_feature_layer) != vision_feature_layer.end()) { + embedding_stack.push_back(inpL); + } + + // If feature layers are explicitly set, stack them (if we have multiple) + if (!embedding_stack.empty()) { + embeddings = embedding_stack[0]; + for (size_t i = 1; i < embedding_stack.size(); i++) { + embeddings = ggml_concat(ctx0, embeddings, embedding_stack[i], 0); + } + } + } + + // llava projector (also used by granite) + if (hparams.has_llava_projector) { + embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); + + ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(patches, "patches"); + ggml_set_input(patches); + + // shape [1, 576, 1024] + // ne is whcn, ne = [1024, 576, 1, 1] + embeddings = ggml_get_rows(ctx0, embeddings, patches); + + // print_tensor_info(embeddings, "embeddings"); + + // llava projector + if (proj_type == PROJECTOR_TYPE_MLP) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + + embeddings = ggml_gelu(ctx0, embeddings); + if (model.mm_2_w) { + embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + } + } + else if (proj_type == PROJECTOR_TYPE_MLP_NORM) { + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false); + // First LayerNorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w), + model.mm_1_b); + + // GELU activation + embeddings = ggml_gelu(ctx0, embeddings); + + // Second linear layer + embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_3_b); + + // Second LayerNorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w), + model.mm_4_b); + } + else if (proj_type == PROJECTOR_TYPE_LDP) { + // MobileVLM projector + int n_patch = 24; + ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings); + mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b); + mlp_1 = ggml_gelu(ctx0, mlp_1); + ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1); + mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b); + // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1] + + // block 1 + ggml_tensor * block_1 = nullptr; + { + // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24] + mlp_3 = ggml_permute(ctx0, mlp_3, 1, 0, 2, 3); + mlp_3 = ggml_cont_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); + // stride = 1, padding = 1, bias is nullptr + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); + + // layer norm + // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + + // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // hardswish + ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] + // residual + block_1 = ggml_add(ctx0, mlp_3, block_1); + } + + // block_2 + { + // stride = 2 + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); + + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // layer norm + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3)); + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3)); + // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] + // hardswish + ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1); + + // not sure the parameters is right for globalAvgPooling + block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0); + // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + // pointwise conv + block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b); + block_1 = ggml_relu(ctx0, block_1); + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1); + block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b); + block_1 = ggml_hardsigmoid(ctx0, block_1); + + // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1] + block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]); + block_1 = ggml_mul(ctx0, block_1_hw, block_1); + + int w = block_1->ne[0], h = block_1->ne[1]; + block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]); + block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3)); + // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1] + block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1); + block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]); + + + // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1] + block_1 = ggml_norm(ctx0, block_1, eps); + block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b); + block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]); + // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1] + } + embeddings = block_1; + } + else if (proj_type == PROJECTOR_TYPE_LDPV2) + { + int n_patch = 24; + ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b); + mlp_0 = ggml_gelu(ctx0, mlp_0); + ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0); + mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b); + // mlp_2 ne = [2048, 576, 1, 1] + // // AVG Pool Layer 2*2, strides = 2 + mlp_2 = ggml_permute(ctx0, mlp_2, 1, 0, 2, 3); + // mlp_2 ne = [576, 2048, 1, 1] + mlp_2 = ggml_cont_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]); + // mlp_2 ne [24, 24, 2048, 1] + mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); + // weight ne = [3, 3, 2048, 1] + ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); + peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); + mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, mlp_2); + peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); + embeddings = peg_0; + } + else { + GGML_ABORT("fatal error"); + } + } + + // glm projector + else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) { + size_t gridsz = (size_t)sqrt(embeddings->ne[1]); + embeddings = ggml_permute(ctx0,embeddings,1,0,2,3); + embeddings = ggml_cont_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); + embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); + embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); + embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); + embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); + // GLU + { + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); + embeddings = ggml_gelu_inplace(ctx0, embeddings); + ggml_tensor * x = embeddings; + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); + x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); + embeddings = ggml_swiglu_split(ctx0, embeddings, x); + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); + } + // arrangement of BOI/EOI token embeddings + // note: these embeddings are not present in text model, hence we cannot process them as text tokens + // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53 + { + embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI + embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI + } + } + + else { + GGML_ABORT("llava: unknown projector type"); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/tools/mtmd/models/minicpmv.cpp b/tools/mtmd/models/minicpmv.cpp new file mode 100644 index 000000000..3594ea29f --- /dev/null +++ b/tools/mtmd/models/minicpmv.cpp @@ -0,0 +1,114 @@ +#include "models.h" + +ggml_cgraph * clip_graph_minicpmv::build() { + GGML_ASSERT(model.class_embedding == nullptr); + const int n_pos = n_patches; + const int n_embd_proj = n_mmproj_embd; + + // position embeddings for the projector (not for ViT) + // see: https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/resampler.py#L70 + // base frequency omega + ggml_tensor * omega = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_embd_proj / 4); + ggml_set_name(omega, "omega"); + ggml_set_input(omega); + + // 2D input positions (using float for sinusoidal embeddings) + ggml_tensor * pos_h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + ggml_tensor * pos_w = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_pos); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + // for selecting learned pos embd, used by ViT + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + ggml_tensor * learned_pos_embd = ggml_get_rows(ctx0, model.position_embeddings, positions); + + ggml_tensor * inp = build_inp(); + ggml_tensor * embeddings = build_vit( + inp, n_pos, + NORM_TYPE_NORMAL, + hparams.ffn_op, + learned_pos_embd, + nullptr); + + // resampler projector (it is just another transformer) + + ggml_tensor * q = model.mm_model_query; + ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); + + // norm + q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1); + v = build_norm(v, model.mm_model_ln_kv_w, model.mm_model_ln_kv_b, NORM_TYPE_NORMAL, eps, -1); + + // calculate sinusoidal pos embd + ggml_tensor * pos_embed = nullptr; + { + // outer product + ggml_tensor * omega_b = ggml_repeat_4d(ctx0, omega, omega->ne[0], n_pos, 1, 1); // n_pos rows + ggml_tensor * theta_x = ggml_mul(ctx0, omega_b, pos_w); + ggml_tensor * theta_y = ggml_mul(ctx0, omega_b, pos_h); + // sin and cos + ggml_tensor * pos_embd_x = ggml_concat( + ctx0, + ggml_sin(ctx0, theta_x), + ggml_cos(ctx0, theta_x), + 0 // concat on first dim + ); + ggml_tensor * pos_embd_y = ggml_concat( + ctx0, + ggml_sin(ctx0, theta_y), + ggml_cos(ctx0, theta_y), + 0 // concat on first dim + ); + pos_embed = ggml_concat(ctx0, pos_embd_x, pos_embd_y, 0); + } + + // k = v + pos_embed + ggml_tensor * k = ggml_add(ctx0, v, pos_embed); + + // attention + { + const int d_head = 128; + int n_head = n_embd_proj/d_head; + // Use actual config value if available, otherwise fall back to hardcoded values + int num_query = hparams.minicpmv_query_num; + ggml_tensor * Q = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), + model.mm_model_attn_q_b); + ggml_tensor * K = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), + model.mm_model_attn_k_b); + ggml_tensor * V = ggml_add(ctx0, + ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), + model.mm_model_attn_v_b); + + Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query); + K = ggml_reshape_3d(ctx0, K, d_head, n_head, n_pos); + V = ggml_reshape_3d(ctx0, V, d_head, n_head, n_pos); + + cb(Q, "resampler_Q", -1); + cb(K, "resampler_K", -1); + cb(V, "resampler_V", -1); + + float resampler_kq_scale = 1.0f/ sqrtf(float(d_head)); + embeddings = build_attn( + model.mm_model_attn_o_w, + model.mm_model_attn_o_b, + Q, K, V, nullptr, resampler_kq_scale, -1); + cb(embeddings, "resampler_attn_out", -1); + } + // layernorm + embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1); + + // projection + embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h new file mode 100644 index 000000000..4b35da259 --- /dev/null +++ b/tools/mtmd/models/models.h @@ -0,0 +1,58 @@ +#pragma once + +#include "../clip-graph.h" + +struct clip_graph_siglip : clip_graph { + clip_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_pixtral : clip_graph { + clip_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_qwen2vl : clip_graph { + clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_qwen3vl : clip_graph { + clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_minicpmv : clip_graph { + clip_graph_minicpmv(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_internvl : clip_graph { + clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_llama4 : clip_graph { + clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_kimivl : clip_graph { + clip_graph_kimivl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_cogvlm : clip_graph { + clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_llava : clip_graph { + clip_graph_llava(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; + +struct clip_graph_whisper_enc : clip_graph { + clip_graph_whisper_enc(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; +}; diff --git a/tools/mtmd/models/pixtral.cpp b/tools/mtmd/models/pixtral.cpp new file mode 100644 index 000000000..a849210b5 --- /dev/null +++ b/tools/mtmd/models/pixtral.cpp @@ -0,0 +1,86 @@ +#include "models.h" + +ggml_cgraph * clip_graph_pixtral::build() { + const int n_merge = hparams.n_merge; + + // 2D input positions + ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true); + }; + + ggml_tensor * inp = build_inp(); + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_RMS, + hparams.ffn_op, + nullptr, // no learned pos embd + add_pos); + + // mistral small 3.1 patch merger + // ref: https://github.com/huggingface/transformers/blob/7a3e208892c06a5e278144eaf38c8599a42f53e7/src/transformers/models/mistral3/modeling_mistral3.py#L67 + if (model.mm_patch_merger_w) { + GGML_ASSERT(hparams.n_merge > 0); + + cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.mm_input_norm_w); + + // reshape image tokens to 2D grid + cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y); + cur = ggml_permute(ctx0, cur, 2, 0, 1, 3); // [x, y, n_embd] + cur = ggml_cont(ctx0, cur); + + // torch.nn.functional.unfold is just an im2col under the hood + // we just need a dummy kernel to make it work + ggml_tensor * kernel = ggml_view_3d(ctx0, cur, n_merge, n_merge, cur->ne[2], 0, 0, 0); + cur = ggml_im2col(ctx0, kernel, cur, n_merge, n_merge, 0, 0, 1, 1, true, inp->type); + + // project to n_embd + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]); + cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur); + } + + // LlavaMultiModalProjector (always using GELU activation) + { + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU, + -1); + } + + // arrangement of the [IMG_BREAK] token + if (model.token_embd_img_break) { + // not efficient, but works + // the trick is to view the embeddings as a 3D tensor with shape [n_embd, n_patches_per_row, n_rows] + // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension + // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows] + + const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y; + const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x; + const int p_total = p_x * p_y; + const int n_embd_text = cur->ne[0]; + const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row + + ggml_tensor * tmp = ggml_reshape_3d(ctx0, cur, n_embd_text, p_x, p_y); + ggml_tensor * tok = ggml_new_tensor_3d(ctx0, tmp->type, n_embd_text, 1, p_y); + tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor + tok = ggml_add(ctx0, tok, model.token_embd_img_break); + tmp = ggml_concat(ctx0, tmp, tok, 1); + cur = ggml_view_2d(ctx0, tmp, + n_embd_text, n_tokens_output, + ggml_row_size(tmp->type, n_embd_text), 0); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/models/qwen2vl.cpp b/tools/mtmd/models/qwen2vl.cpp new file mode 100644 index 000000000..85f158bb1 --- /dev/null +++ b/tools/mtmd/models/qwen2vl.cpp @@ -0,0 +1,183 @@ +#include "models.h" + +ggml_cgraph * clip_graph_qwen2vl::build() { + GGML_ASSERT(model.patch_bias == nullptr); + GGML_ASSERT(model.class_embedding == nullptr); + + const int batch_size = 1; + const bool use_window_attn = hparams.n_wa_pattern > 0; + const int n_wa_pattern = hparams.n_wa_pattern; + const int n_pos = n_patches; + const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position + + norm_type norm_t = proj_type == PROJECTOR_TYPE_QWEN25VL + ? NORM_TYPE_RMS // qwen 2.5 vl + : NORM_TYPE_NORMAL; // qwen 2 vl + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + GGML_ASSERT(img.nx % (patch_size * 2) == 0); + GGML_ASSERT(img.ny % (patch_size * 2) == 0); + + // second conv dimension + { + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + + inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_cont_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_3d( + ctx0, inp, + n_embd, n_patches_x * n_patches_y, batch_size); + } + + ggml_tensor * inpL = inp; + ggml_tensor * window_mask = nullptr; + ggml_tensor * window_idx = nullptr; + ggml_tensor * inv_window_idx = nullptr; + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + } + + if (use_window_attn) { + // handle window attention inputs + inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); + ggml_set_name(inv_window_idx, "inv_window_idx"); + ggml_set_input(inv_window_idx); + // mask for window attention + window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos); + ggml_set_name(window_mask, "window_mask"); + ggml_set_input(window_mask); + + // if flash attn is used, we need to pad the mask and cast to f16 + if (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) { + window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16); + } + + // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size] + GGML_ASSERT(batch_size == 1); + inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4); + inpL = ggml_get_rows(ctx0, inpL, inv_window_idx); + inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_patches_x * n_patches_y, batch_size); + } + + // loop over layers + for (int il = 0; il < n_layer; il++) { + const auto & layer = model.layers[il]; + const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true; + + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "ln1", il); + + // self-attention + { + ggml_tensor * Qcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b); + ggml_tensor * Kcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b); + ggml_tensor * Vcur = ggml_add(ctx0, + ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b); + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_patches); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + // apply M-RoPE + Qcur = ggml_rope_multi( + ctx0, Qcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + Kcur = ggml_rope_multi( + ctx0, Kcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + + cb(Qcur, "Qcur_rope", il); + cb(Kcur, "Kcur_rope", il); + + ggml_tensor * attn_mask = full_attn ? nullptr : window_mask; + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, attn_mask, kq_scale, il); + cb(cur, "attn_out", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); + } + + // multimodal projection + ggml_tensor * embeddings = inpL; + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + embeddings = build_ffn(embeddings, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + FFN_GELU, + -1); + + if (use_window_attn) { + window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos / 4); + ggml_set_name(window_idx, "window_idx"); + ggml_set_input(window_idx); + + // embeddings shape: [n_embd, n_patches_x * n_patches_y, batch_size] + GGML_ASSERT(batch_size == 1); + embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4); + embeddings = ggml_get_rows(ctx0, embeddings, window_idx); + embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, n_patches_x * n_patches_y / 4, batch_size); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/tools/mtmd/models/qwen3vl.cpp b/tools/mtmd/models/qwen3vl.cpp new file mode 100644 index 000000000..35a42cb84 --- /dev/null +++ b/tools/mtmd/models/qwen3vl.cpp @@ -0,0 +1,191 @@ +#include "models.h" + +ggml_cgraph * clip_graph_qwen3vl::build() { + GGML_ASSERT(model.patch_bias != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + GGML_ASSERT(model.class_embedding == nullptr); + + const int batch_size = 1; + const int n_pos = n_patches; + const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position + + norm_type norm_t = NORM_TYPE_NORMAL; + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + GGML_ASSERT(img.nx % (patch_size * 2) == 0); + GGML_ASSERT(img.ny % (patch_size * 2) == 0); + + // second conv dimension + { + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + + inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_cont_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_3d( + ctx0, inp, + n_embd, n_patches_x * n_patches_y, batch_size); + } + + // add patch bias + if (model.patch_bias != nullptr) { + inp = ggml_add(ctx0, inp, model.patch_bias); + cb(inp, "patch_bias", -1); + } + + // calculate absolute position embedding and apply + ggml_tensor * learned_pos_embd = resize_position_embeddings(); + learned_pos_embd = ggml_cont_4d( + ctx0, learned_pos_embd, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + learned_pos_embd = ggml_reshape_4d( + ctx0, learned_pos_embd, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3); + learned_pos_embd = ggml_cont_3d( + ctx0, learned_pos_embd, + n_embd, n_patches_x * n_patches_y, batch_size); + inp = ggml_add(ctx0, inp, learned_pos_embd); + cb(inp, "inp_pos_emb", -1); + + ggml_tensor * inpL = inp; + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + // pre-layernorm + if (model.pre_ln_w) { + inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, norm_t, eps, -1); + } + + // deepstack features (stack along the feature dimension), [n_embd * len(deepstack_layers), n_patches_x * n_patches_y, batch_size] + ggml_tensor * deepstack_features = nullptr; + const int merge_factor = hparams.n_merge > 0 ? hparams.n_merge * hparams.n_merge : 4; // default 2x2=4 for qwen3vl + + // loop over layers + for (int il = 0; il < n_layer; il++) { + auto & layer = model.layers[il]; + + ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states + + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "ln1", il); + + // self-attention + { + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + cur = ggml_add(ctx0, cur, layer.qkv_b); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ 0); + + ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, n_embd)); + + ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, + /* nb1 */ ggml_row_size(cur->type, d_head), + /* nb2 */ cur->nb[1], + /* offset */ ggml_row_size(cur->type, 2 * n_embd)); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + // apply M-RoPE + Qcur = ggml_rope_multi( + ctx0, Qcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + Kcur = ggml_rope_multi( + ctx0, Kcur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + + cb(Qcur, "Qcur_rope", il); + cb(Kcur, "Kcur_rope", il); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, inpL); + + inpL = cur; // inpL = residual, cur = hidden_states + + cb(cur, "ffn_inp", il); + + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); + + // ffn + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + // residual 2 + cur = ggml_add(ctx0, inpL, cur); + cb(cur, "layer_out", il); + + if (layer.has_deepstack()) { + ggml_tensor * feat = ggml_reshape_3d(ctx0, cur, n_embd * merge_factor, n_pos / merge_factor, batch_size); + feat = build_norm(feat, layer.deepstack_norm_w, layer.deepstack_norm_b, norm_t, eps, il); + feat = build_ffn(feat, + layer.deepstack_fc1_w, layer.deepstack_fc1_b, + nullptr, nullptr, + layer.deepstack_fc2_w, layer.deepstack_fc2_b, + ffn_op_type::FFN_GELU, il); + + if(!deepstack_features) { + deepstack_features = feat; + } else { + // concat along the feature dimension + deepstack_features = ggml_concat(ctx0, deepstack_features, feat, 0); + } + } + + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); + } + + // multimodal projection + ggml_tensor * embeddings = inpL; + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + + embeddings = build_ffn(embeddings, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + ffn_op_type::FFN_GELU, -1); + + embeddings = ggml_concat(ctx0, embeddings, deepstack_features, 0); // concat along the feature dimension + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} diff --git a/tools/mtmd/models/siglip.cpp b/tools/mtmd/models/siglip.cpp new file mode 100644 index 000000000..ef094cfd0 --- /dev/null +++ b/tools/mtmd/models/siglip.cpp @@ -0,0 +1,81 @@ +#include "models.h" + +ggml_cgraph * clip_graph_siglip::build() { + ggml_tensor * inp = build_inp(); + + ggml_tensor * learned_pos_embd = model.position_embeddings; + if (proj_type == PROJECTOR_TYPE_LFM2) { + learned_pos_embd = resize_position_embeddings(); + } + + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_NORMAL, + hparams.ffn_op, + learned_pos_embd, + nullptr); + + if (proj_type == PROJECTOR_TYPE_GEMMA3) { + const int batch_size = 1; + GGML_ASSERT(n_patches_x == n_patches_y); + const int patches_per_image = n_patches_x; + const int kernel_size = hparams.n_merge; + + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont_4d(ctx0, cur, patches_per_image, patches_per_image, n_embd, batch_size); + + // doing a pool2d to reduce the number of output tokens + cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0] * cur->ne[0], n_embd, batch_size); + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + // apply norm before projection + cur = ggml_rms_norm(ctx0, cur, eps); + cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w); + + // apply projection + cur = ggml_mul_mat(ctx0, + ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), + cur); + + } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) { + // pixel_shuffle + // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 + const int scale_factor = model.hparams.n_merge; + cur = build_patch_merge_permute(cur, scale_factor); + cur = ggml_mul_mat(ctx0, model.projection, cur); + + } else if (proj_type == PROJECTOR_TYPE_LFM2) { + // pixel unshuffle block + const int scale_factor = model.hparams.n_merge; + cur = build_patch_merge_permute(cur, scale_factor); + + // projection + cur = ggml_norm(ctx0, cur, 1e-5); // default nn.LayerNorm + cur = ggml_mul(ctx0, cur, model.mm_input_norm_w); + cur = ggml_add(ctx0, cur, model.mm_input_norm_b); + + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU, + -1); + + } else if (proj_type == PROJECTOR_TYPE_JANUS_PRO) { + cur = build_ffn(cur, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, + model.mm_1_w, model.mm_1_b, + hparams.ffn_op, + -1); + + } else { + GGML_ABORT("SigLIP: Unsupported projector type"); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/models/whisper-enc.cpp b/tools/mtmd/models/whisper-enc.cpp new file mode 100644 index 000000000..2870d854a --- /dev/null +++ b/tools/mtmd/models/whisper-enc.cpp @@ -0,0 +1,106 @@ +#include "models.h" + +ggml_cgraph * clip_graph_whisper_enc::build() { + const int n_frames = img.nx; + const int n_pos = n_frames / 2; + GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos); + + ggml_tensor * inp = build_inp_raw(1); + + // conv1d block + { + // convolution + gelu + ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1); + cur = ggml_add(ctx0, cur, model.conv1d_1_b); + + cur = ggml_gelu_erf(ctx0, cur); + + cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1); + cur = ggml_add(ctx0, cur, model.conv1d_2_b); + + cur = ggml_gelu_erf(ctx0, cur); + // transpose + inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + cb(inp, "after_conv1d", -1); + } + + // sanity check (only check one layer, but it should be the same for all) + GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b); + GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b); + GGML_ASSERT(model.layers[0].q_b); + GGML_ASSERT(model.layers[0].v_b); + GGML_ASSERT(!model.layers[0].k_b); // no bias for k + + ggml_tensor * pos_embd_selected = ggml_view_2d( + ctx0, model.position_embeddings, + model.position_embeddings->ne[0], n_pos, + model.position_embeddings->nb[1], 0 + ); + ggml_tensor * cur = build_vit( + inp, n_pos, + NORM_TYPE_NORMAL, + hparams.ffn_op, + pos_embd_selected, + nullptr); + + cb(cur, "after_transformer", -1); + + if (model.audio_has_stack_frames()) { + // StackAudioFrames + // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py + cur = build_stack(cur, hparams.proj_stack_factor, n_embd); + cb(cur, "after_stacked", -1); + } + + if (proj_type == PROJECTOR_TYPE_ULTRAVOX) { + // UltravoxProjector + // pre-norm + cur = ggml_rms_norm(ctx0, cur, 1e-6); + cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w); + + // ffn in + cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); + + // swiglu + // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half + cur = ggml_swiglu_swapped(ctx0, cur); + + // mid-norm + cur = ggml_rms_norm(ctx0, cur, 1e-6); + cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w); + + // ffn out + cur = ggml_mul_mat(ctx0, model.mm_2_w, cur); + + } else if (proj_type == PROJECTOR_TYPE_QWEN2A) { + // projector + cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur); + cur = ggml_add(ctx0, cur, model.mm_fc_b); + + } else if (proj_type == PROJECTOR_TYPE_VOXTRAL) { + // projector + cur = build_ffn(cur, + model.mm_1_w, model.mm_1_b, + nullptr, nullptr, + model.mm_2_w, model.mm_2_b, + FFN_GELU_ERF, + -1); + + } else if (proj_type == PROJECTOR_TYPE_GLMA) { + cur = ggml_norm(ctx0, cur, hparams.eps); + cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w); + cur = ggml_add(ctx0, cur, model.mm_norm_pre_b); + cur = build_stack(cur, hparams.proj_stack_factor, n_embd); + cur = build_ffn(cur, model.mm_1_w, model.mm_1_b, nullptr, nullptr, model.mm_2_w, model.mm_2_b, hparams.ffn_op, 0); + cur = ggml_concat(ctx0, model.mm_boi, cur, 1); + cur = ggml_concat(ctx0, cur, model.mm_eoi, 1); + } else { + GGML_ABORT("%s: unknown projector type", __func__); + } + + cb(cur, "projected", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h index b7b940aff..0e552347a 100644 --- a/tools/mtmd/mtmd-audio.h +++ b/tools/mtmd/mtmd-audio.h @@ -6,6 +6,8 @@ #include #include +#define MTMD_INTERNAL_HEADER + #define WHISPER_ASSERT GGML_ASSERT #define WHISPER_SAMPLE_RATE 16000 diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index 25d24603d..332d2049e 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -65,7 +65,7 @@ static void sigint_handler(int signo) { struct mtmd_cli_context { mtmd::context_ptr ctx_vision; - common_init_result llama_init; + common_init_result_ptr llama_init; llama_model * model; llama_context * lctx; @@ -89,8 +89,8 @@ struct mtmd_cli_context { llama_pos n_past = 0; mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) { - model = llama_init.model.get(); - lctx = llama_init.context.get(); + model = llama_init->model(); + lctx = llama_init->context(); vocab = llama_model_get_vocab(model); smpl = common_sampler_init(model, params.sampling); n_threads = params.cpuparams.n_threads; diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp index e65c37310..d4c6b4c7f 100644 --- a/tools/mtmd/mtmd-helper.cpp +++ b/tools/mtmd/mtmd-helper.cpp @@ -32,6 +32,10 @@ // #define STB_IMAGE_IMPLEMENTATION #include "stb/stb_image.h" +#ifdef MTMD_INTERNAL_HEADER +#error "mtmd-helper is a public library outside of mtmd. it must not include internal headers" +#endif + // // internal logging functions // diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index b3df24c29..9f7e861e9 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -22,6 +22,11 @@ * Issues related to API usage may receive lower priority support. * * For the usage, see an example in mtmd-cli.cpp + * + * For contributors: + * - Make sure the C API is aligned with the libllama C API (as in llama.h) + * - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead + * - Keep the API minimal, do not expose internal details unless necessary */ #ifdef LLAMA_SHARED diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 5a67f508d..90898b5ec 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -153,7 +153,7 @@ struct server_slot { // sampling json json_schema; - struct common_sampler * smpl = nullptr; + common_sampler_ptr smpl; llama_token sampled; // in speculative mode, this is the last accepted token llama_tokens drafted; @@ -510,8 +510,8 @@ struct server_context_impl { common_params params_base; // note: keep these alive - they determine the lifetime of the model, context, etc. - common_init_result llama_init; - common_init_result llama_init_dft; + common_init_result_ptr llama_init; + common_init_result_ptr llama_init_dft; llama_model * model = nullptr; llama_context * ctx = nullptr; @@ -557,9 +557,6 @@ struct server_context_impl { // Clear any sampling context for (server_slot & slot : slots) { - common_sampler_free(slot.smpl); - slot.smpl = nullptr; - llama_free(slot.ctx_dft); slot.ctx_dft = nullptr; @@ -580,8 +577,8 @@ struct server_context_impl { llama_init = common_init_from_params(params_base); - model = llama_init.model.get(); - ctx = llama_init.context.get(); + model = llama_init->model(); + ctx = llama_init->context(); if (model == nullptr) { SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str()); @@ -613,25 +610,25 @@ struct server_context_impl { llama_init_dft = common_init_from_params(params_dft); - model_dft = llama_init_dft.model.get(); + model_dft = llama_init_dft->model(); if (model_dft == nullptr) { SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str()); return false; } - vocab_dft_compatible = common_speculative_are_compatible(ctx, llama_init_dft.context.get()); + vocab_dft_compatible = common_speculative_are_compatible(ctx, llama_init_dft->context()); if (!vocab_dft_compatible) { SRV_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str()); } - const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get()); + const int n_ctx_dft = llama_n_ctx(llama_init_dft->context()); cparams_dft = common_context_params_to_llama(params_dft); cparams_dft.n_batch = n_ctx_dft; // the context is not needed - we will create one for each slot - llama_init_dft.context.reset(); + llama_init_dft->free_context(); } chat_templates = common_chat_templates_init(model, params_base.chat_template); @@ -1051,18 +1048,15 @@ struct server_context_impl { // initialize samplers { - if (slot.smpl != nullptr) { - common_sampler_free(slot.smpl); - } + slot.smpl.reset(common_sampler_init(model, task.params.sampling)); - slot.smpl = common_sampler_init(model, task.params.sampling); if (slot.smpl == nullptr) { // for now, the only error that may happen here is invalid grammar send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); return false; } - SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl).c_str()); + SLT_INF(slot, "sampler chain: %s\n", common_sampler_print(slot.smpl.get()).c_str()); } // initialize draft batch @@ -1216,11 +1210,10 @@ struct server_context_impl { } void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) const { - size_t n_probs = slot.task->params.sampling.n_probs; - size_t n_vocab = llama_vocab_n_tokens(vocab); + const size_t n_probs = slot.task->params.sampling.n_probs; if (post_sampling) { - const auto * cur_p = common_sampler_get_candidates(slot.smpl, true); + const auto * cur_p = common_sampler_get_candidates(slot.smpl.get(), true); const size_t max_probs = cur_p->size; // set probability for sampled token @@ -1245,7 +1238,7 @@ struct server_context_impl { std::vector cur = get_token_probabilities(ctx, idx); // set probability for sampled token - for (size_t i = 0; i < n_vocab; i++) { + for (size_t i = 0; i < cur.size(); i++) { // set probability for sampled token if (cur[i].id == result.tok) { result.prob = cur[i].p; @@ -1255,7 +1248,7 @@ struct server_context_impl { // set probability for top n_probs tokens result.probs.reserve(n_probs); - for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) { + for (size_t i = 0; i < std::min(cur.size(), n_probs); i++) { result.probs.push_back({ cur[i].id, common_token_to_piece(ctx, cur[i].id, special), @@ -2301,13 +2294,13 @@ struct server_context_impl { GGML_ASSERT(batch.n_tokens > 0); - common_sampler_reset(slot.smpl); + common_sampler_reset(slot.smpl.get()); // Process all prompt tokens through sampler system for (int i = 0; i < slot.task->n_tokens(); ++i) { llama_token id = input_tokens[i]; if (id != LLAMA_TOKEN_NULL) { - common_sampler_accept(slot.smpl, id, false); + common_sampler_accept(slot.smpl.get(), id, false); } } @@ -2525,11 +2518,11 @@ struct server_context_impl { const int tok_idx = slot.i_batch - i; - llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx); + llama_token id = common_sampler_sample(slot.smpl.get(), ctx, tok_idx); slot.i_batch = -1; - common_sampler_accept(slot.smpl, id, true); + common_sampler_accept(slot.smpl.get(), id, true); slot.n_decoded += 1; @@ -2570,7 +2563,7 @@ struct server_context_impl { size_t n_draft = slot.drafted.size(); // the accepted tokens from the speculation - const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, slot.i_batch_dft, slot.drafted); + const auto ids = common_sampler_sample_and_accept_n(slot.smpl.get(), ctx, slot.i_batch_dft, slot.drafted); slot.i_batch_dft.clear(); slot.drafted.clear(); diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 6c618a673..3690c0bb8 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef _WIN32 #include @@ -171,7 +172,7 @@ server_presets::server_presets(int argc, char ** argv, common_params & base_para } // read base args from router's argv - common_params_parse(argc, argv, LLAMA_EXAMPLE_SERVER, base_args); + common_params_to_map(argc, argv, LLAMA_EXAMPLE_SERVER, base_args); // remove any router-controlled args from base_args for (const auto & cargs : control_args) { diff --git a/tools/server/tests/unit/test_compat_anthropic.py b/tools/server/tests/unit/test_compat_anthropic.py index d55dd1d94..e0a003557 100644 --- a/tools/server/tests/unit/test_compat_anthropic.py +++ b/tools/server/tests/unit/test_compat_anthropic.py @@ -684,7 +684,7 @@ def test_anthropic_streaming_content_block_indices(): # Request that might produce both text and tool use res = server.make_stream_request("POST", "/v1/messages", data={ "model": "test", - "max_tokens": 200, + "max_tokens": 400, "stream": True, "tools": [{ "name": "test_tool", diff --git a/tools/tts/tts.cpp b/tools/tts/tts.cpp index eaf56591d..8c39fce8b 100644 --- a/tools/tts/tts.cpp +++ b/tools/tts/tts.cpp @@ -568,10 +568,10 @@ int main(int argc, char ** argv) { llama_context * ctx_ttc = NULL; llama_context * ctx_cts = NULL; - common_init_result llama_init_ttc = common_init_from_params(params); + auto llama_init_ttc = common_init_from_params(params); - model_ttc = llama_init_ttc.model.get(); - ctx_ttc = llama_init_ttc.context.get(); + model_ttc = llama_init_ttc->model(); + ctx_ttc = llama_init_ttc->context(); if (model_ttc == nullptr || ctx_ttc == nullptr) { return ENOENT; @@ -583,10 +583,10 @@ int main(int argc, char ** argv) { params.embedding = true; params.n_ubatch = params.n_batch; - common_init_result llama_init_cts = common_init_from_params(params); + auto llama_init_cts = common_init_from_params(params); - model_cts = llama_init_cts.model.get(); - ctx_cts = llama_init_cts.context.get(); + model_cts = llama_init_cts->model(); + ctx_cts = llama_init_cts->context(); if (model_cts == nullptr || ctx_cts == nullptr) { return ENOENT; diff --git a/vendor/cpp-httplib/CMakeLists.txt b/vendor/cpp-httplib/CMakeLists.txt index e90e8e2d1..8f0d15d1f 100644 --- a/vendor/cpp-httplib/CMakeLists.txt +++ b/vendor/cpp-httplib/CMakeLists.txt @@ -11,8 +11,9 @@ endif() target_link_libraries (${TARGET} PRIVATE Threads::Threads) if (WIN32 AND NOT MSVC) - target_link_libraries(${TARGET} PUBLIC ws2_32) + target_link_libraries(${TARGET} PRIVATE ws2_32) endif() + target_compile_features(${TARGET} PRIVATE cxx_std_17) target_compile_definitions(${TARGET} PRIVATE