Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-11 01:24:36 +00:00)

Commit bdfe8526b8: Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.gitignore
#	CONTRIBUTING.md
#	Makefile
#	examples/llava/CMakeLists.txt
#	scripts/sync-ggml-am.sh
#	scripts/sync-ggml.last
#	scripts/sync-ggml.sh
#	src/llama-vocab.cpp

44 changed files with 2241 additions and 439 deletions
@@ -685,14 +685,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "--lora") {
         CHECK_ARG
-        params.lora_adapter.emplace_back(argv[i], 1.0f);
+        params.lora_adapters.push_back({
+            std::string(argv[i]),
+            1.0,
+        });
         return true;
     }
     if (arg == "--lora-scaled") {
         CHECK_ARG
-        const char* lora_adapter = argv[i];
+        std::string lora_adapter = argv[i];
         CHECK_ARG
-        params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
+        params.lora_adapters.push_back({
+            lora_adapter,
+            std::stof(argv[i]),
+        });
+        return true;
+    }
+    if (arg == "--lora-init-without-apply") {
+        params.lora_init_without_apply = true;
         return true;
     }
     if (arg == "--control-vector") {
@@ -1655,6 +1665,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
     options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
                         "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
+    options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"});

 #ifndef LOG_DISABLE_LOGS
     options.push_back({ "logging" });
@@ -1767,6 +1778,17 @@ std::string string_get_sortable_timestamp() {
     return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
 }

+void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
+    }
+}
+
 void string_process_escapes(std::string & input) {
     std::size_t input_len = input.length();
     std::size_t output_idx = 0;
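string_replace_all() replaces every occurrence of search in place and advances past the inserted text, so a replacement that itself contains the search string cannot loop forever; the empty-search guard covers the remaining degenerate case. A small illustrative sketch of the resulting behaviour (the literals are made up, not from the patch):

```cpp
#include <cassert>
#include <string>

// declared in common.h (see the header hunks below)
void string_replace_all(std::string & s, const std::string & search, const std::string & replace);

int main() {
    std::string s = "a-b-c";
    string_replace_all(s, "-", "--"); // advancing by replace.length() skips the '-' just inserted
    assert(s == "a--b--c");
    string_replace_all(s, "", "x");   // empty search returns early instead of looping forever
    assert(s == "a--b--c");
}
```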
@@ -2092,17 +2114,22 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
         }
     }

-    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
-        const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
-        float lora_scale = std::get<1>(params.lora_adapter[i]);
-        auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
-        if (adapter == nullptr) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+    // load and optionally apply lora adapters
+    for (auto & la : params.lora_adapters) {
+        llama_lora_adapter_container loaded_la;
+        loaded_la.path = la.path;
+        loaded_la.scale = la.scale;
+        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+        if (loaded_la.adapter == nullptr) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
             return iparams;
         }
-        llama_lora_adapter_set(lctx, adapter, lora_scale);
+        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+    }
+    if (!params.lora_init_without_apply) {
+        llama_lora_adapters_apply(lctx, iparams.lora_adapters);
     }

     if (params.ignore_eos) {
@@ -2141,6 +2168,15 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
     return iparams;
 }

+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+    llama_lora_adapter_clear(ctx);
+    for (auto & la : lora_adapters) {
+        if (la.scale != 0.0f) {
+            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+        }
+    }
+}
+
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
     auto mparams = llama_model_default_params();

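Since llama_lora_adapters_apply() clears the context and then re-sets every adapter whose scale is non-zero, a caller can toggle or re-weight adapters that are already loaded by editing the container list and calling it again. A minimal sketch of that pattern, assuming iparams was returned by llama_init_from_gpt_params() as in the hunk above (the scale values are illustrative):

```cpp
// iparams.lora_adapters holds the adapters loaded during llama_init_from_gpt_params().
std::vector<llama_lora_adapter_container> & adapters = iparams.lora_adapters;
if (!adapters.empty()) {
    adapters[0].scale = 0.0f; // a zero scale is skipped by llama_lora_adapters_apply(), effectively disabling it
}
for (size_t i = 1; i < adapters.size(); ++i) {
    adapters[i].scale = 0.5f; // illustrative re-weighting
}
llama_lora_adapters_apply(iparams.context, adapters); // clear, then re-apply the non-zero entries
```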
@@ -3163,19 +3199,18 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     }

     fprintf(stream, "lora:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) != 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale == 1.0f) {
+            fprintf(stream, " - %s\n", la.path.c_str());
         }
-        fprintf(stream, " - %s\n", std::get<0>(la).c_str());
     }
     fprintf(stream, "lora_scaled:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) == 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale != 1.0f) {
+            fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
         }
-        fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
+    fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
@@ -33,6 +33,15 @@

 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

+struct llama_lora_adapter_info {
+    std::string path;
+    float scale;
+};
+
+struct llama_lora_adapter_container : llama_lora_adapter_info {
+    struct llama_lora_adapter * adapter;
+};
+
 // build info

 struct llama_control_vector_load_info;
@@ -148,8 +157,8 @@ struct gpt_params {
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;

-    // TODO: avoid tuple, use struct
-    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
+    std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale

     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

@@ -299,6 +308,8 @@ std::vector<std::string> string_split(std::string input, char separator);
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();

+void string_replace_all(std::string & s, const std::string & search, const std::string & replace);
+
 template<class T>
 static std::vector<T> string_split(const std::string & str, char delim) {
     std::vector<T> values;
@@ -333,6 +344,7 @@ std::string fs_get_cache_file(const std::string & filename);
 struct llama_init_result {
     struct llama_model * model = nullptr;
     struct llama_context * context = nullptr;
+    std::vector<llama_lora_adapter_container> lora_adapters;
 };

 struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
@@ -343,6 +355,9 @@ struct llama_context_params llama_context_params_from_gpt_param
 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

+// clear LoRA adapters from context, then apply new list of adapters
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+
 // Batch utils

 void llama_batch_clear(struct llama_batch & batch);
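Taken together, the header changes let a program fill gpt_params::lora_adapters directly instead of the old path/scale tuples, postpone application with lora_init_without_apply, and apply later through the new helper. A minimal sketch under those assumptions (the file names are placeholders):

```cpp
gpt_params params;
params.model = "model.gguf";                              // placeholder model path
params.lora_adapters.push_back({ "adapter.gguf", 1.0f }); // llama_lora_adapter_info{ path, scale }
params.lora_init_without_apply = true;                    // load into memory only, do not apply yet

llama_init_result iparams = llama_init_from_gpt_params(params);
// ... later, when the adapter should actually take effect:
llama_lora_adapters_apply(iparams.context, iparams.lora_adapters);
```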
@@ -251,12 +251,7 @@ class Model:

        return [(self.map_tensor_name(name), data_torch)]

-    def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
-        del name, new_name, bid, n_dims  # unused
-
-        return False
-
-    def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
         del name, new_name, bid, n_dims  # unused

         return False
@@ -285,55 +280,47 @@ class Model:
             for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
                 data: np.ndarray  # type hint
                 n_dims = len(data.shape)
-                data_dtype = data.dtype
-                data_qtype: gguf.GGMLQuantizationType | None = None
-
-                # when both are True, f32 should win
-                extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims)
-                extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims)
+                data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims)

                 # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
-                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
-                extra_f32 = any(cond for cond in (
-                    extra_f32,
-                    n_dims == 1,
-                    new_name.endswith("_norm.weight"),
-                ))
+                if n_dims <= 1 or new_name.endswith("_norm.weight"):
+                    data_qtype = gguf.GGMLQuantizationType.F32

+                # Conditions should closely match those in llama_model_quantize_internal in llama.cpp
                 # Some tensor types are always in float32
-                extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
-                    gguf.MODEL_TENSOR.FFN_GATE_INP,
-                    gguf.MODEL_TENSOR.POS_EMBD,
-                    gguf.MODEL_TENSOR.TOKEN_TYPES,
-                ))
-
-                # if f16 desired, convert any float32 2-dim weight tensors to float16
-                extra_f16 = any(cond for cond in (
-                    extra_f16,
-                    (name.endswith(".weight") and n_dims >= 2),
-                ))
-
-                if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
-                    if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
-                        data = gguf.quantize_bf16(data)
-                        assert data.dtype == np.uint16
-                        data_qtype = gguf.GGMLQuantizationType.BF16
-
-                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
-                        data = gguf.quantize_q8_0(data)
-                        assert data.dtype == np.uint8
-                        data_qtype = gguf.GGMLQuantizationType.Q8_0
-
-                    else:  # default to float16 for quantized tensors
-                        if data_dtype != np.float16:
-                            data = data.astype(np.float16)
-                        data_qtype = gguf.GGMLQuantizationType.F16
-
-                if data_qtype is None:  # by default, convert to float32
-                    if data_dtype != np.float32:
-                        data = data.astype(np.float32)
-                    data_qtype = gguf.GGMLQuantizationType.F32
+                if data_qtype is False and (
+                    any(
+                        self.match_model_tensor_name(new_name, key, bid)
+                        for key in (
+                            gguf.MODEL_TENSOR.FFN_GATE_INP,
+                            gguf.MODEL_TENSOR.POS_EMBD,
+                            gguf.MODEL_TENSOR.TOKEN_TYPES,
+                        )
+                    )
+                    or not name.endswith(".weight")
+                ):
+                    data_qtype = gguf.GGMLQuantizationType.F32
+
+                # No override (data_qtype is False), or wants to be quantized (data_qtype is True)
+                if isinstance(data_qtype, bool):
+                    if self.ftype == gguf.LlamaFileType.ALL_F32:
+                        data_qtype = gguf.GGMLQuantizationType.F32
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_F16:
+                        data_qtype = gguf.GGMLQuantizationType.F16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                        data_qtype = gguf.GGMLQuantizationType.BF16
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0:
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+                    else:
+                        raise ValueError(f"Unknown file type: {self.ftype.name}")
+
+                try:
+                    data = gguf.quants.quantize(data, data_qtype)
+                except gguf.QuantError as e:
+                    logger.warning("%s, %s", e, "falling back to F16")
+                    data_qtype = gguf.GGMLQuantizationType.F16
+                    data = gguf.quants.quantize(data, data_qtype)

                 shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape

                 # reverse shape to make it similar to the internal ggml dimension order
@@ -1765,7 +1752,7 @@ class DbrxModel(Model):

         return [(new_name, data_torch)]

-    def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
         del name, new_name, bid  # unused

         return n_dims > 1
@@ -2786,18 +2773,22 @@ class MambaModel(Model):

         return [(new_name, data_torch)]

-    def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
-        del n_dims  # unused
-
-        return bid is not None and new_name in (
-            self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
+        if bid is not None and new_name in (
+            self.format_tensor_name(
+                n, bid, ".weight" if name.endswith(".weight") else ""
+            )
+            for n in [
                 gguf.MODEL_TENSOR.SSM_CONV1D,
                 gguf.MODEL_TENSOR.SSM_X,
                 gguf.MODEL_TENSOR.SSM_DT,
                 gguf.MODEL_TENSOR.SSM_A,
                 gguf.MODEL_TENSOR.SSM_D,
             ]
-        )
+        ):
+            return gguf.GGMLQuantizationType.F32
+
+        return super().tensor_force_quant(name, new_name, bid, n_dims)


 @Model.register("CohereForCausalLM")
@@ -9,13 +9,13 @@ To get started right away, run the following command, making sure to use the cor
 ### Unix-based systems (Linux, macOS, etc.):

 ```bash
-./llama-embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
+./llama-embedding -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>/dev/null
 ```

 ### Windows:

 ```powershell
-llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
+llama-embedding.exe -m ./path/to/model --pooling mean --log-disable -p "Hello World!" 2>$null
 ```

 The above command will output space-separated float values.
@@ -50,11 +50,11 @@ The above command will output space-separated float values.
 ### Unix-based systems (Linux, macOS, etc.):

 ```bash
-./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
+./llama-embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
 ```

 ### Windows:

 ```powershell
-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
+llama-embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --pooling mean --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
 ```
@@ -50,20 +50,6 @@ static struct gguf_context * load_gguf(std::string & fname, struct ggml_context
     return ctx_gguf;
 }

-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
-}
-
 struct file_input {
     struct ggml_context * ctx_meta = nullptr;
     struct gguf_context * ctx_gguf = nullptr;
@@ -135,7 +121,7 @@ struct lora_merge_ctx {

     lora_merge_ctx(
             std::string & base_fname,
-            std::vector<std::tuple<std::string, float>> & lora_files,
+            std::vector<llama_lora_adapter_info> & lora_files,
             std::string & outfile,
             int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
         fout.exceptions(std::ofstream::failbit); // fail fast on write errors
@@ -144,9 +130,9 @@ struct lora_merge_ctx {
             throw std::runtime_error("split model is not yet supported");
         }

-        for (auto lora_inp : lora_files) {
-            auto fname = std::get<0>(lora_inp);
-            auto scale = std::get<1>(lora_inp);
+        for (auto & lora_inp : lora_files) {
+            auto fname = lora_inp.path;
+            auto scale = lora_inp.scale;
             std::unique_ptr<file_input> adapter(new file_input(fname, scale));
             check_metadata_lora(adapter.get());
             adapters.push_back(std::move(adapter));
@@ -407,7 +393,7 @@ int main(int argc, char ** argv) {

     g_verbose = (params.verbosity == 1);
     try {
-        lora_merge_ctx ctx(params.model, params.lora_adapter, params.lora_outfile, params.n_threads);
+        lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads);
         ctx.run_merge();
     } catch (const std::exception & err) {
         fprintf(stderr, "%s\n", err.what());
@@ -28,6 +28,14 @@
 #include "ggml-cann.h"
 #endif

+#ifdef _WIN32
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
 // utils
 static uint64_t get_time_ns() {
     using clock = std::chrono::high_resolution_clock;
@@ -97,6 +105,27 @@ static std::string get_cpu_info() {
            }
        }
        fclose(f);
    }
+#elif defined(_WIN32)
+    HKEY hKey;
+    if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
+                    TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
+                    0,
+                    KEY_READ,
+                    &hKey) != ERROR_SUCCESS) {
+        // fail to open registry key
+        return "";
+    }
+    char cpu_brand[256];
+    DWORD cpu_brand_size = sizeof(cpu_brand);
+    if (RegQueryValueExA(hKey,
+                        TEXT("ProcessorNameString"),
+                        NULL,
+                        NULL,
+                        (LPBYTE)cpu_brand,
+                        &cpu_brand_size) == ERROR_SUCCESS) {
+        id.assign(cpu_brand, cpu_brand_size);
+    }
+    RegCloseKey(hKey);
 #endif
     // TODO: other platforms
     return id;
examples/llava/README-minicpmv2.5.md (new file, 99 lines)
@@ -0,0 +1,99 @@
+## MiniCPM-Llama3-V 2.5
+
+### Prepare models and code
+
+Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from huggingface to "MiniCPM-Llama3-V-2_5" folder.
+
+Clone llama.cpp:
+```bash
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+```
+
+### Usage
+
+Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
+
+```bash
+python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
+python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
+python ./convert-hf-to-gguf.py ../MiniCPM-Llama3-V-2_5/model
+
+# quantize int4 version
+./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+Build for Linux or Mac
+
+```bash
+make
+make llama-minicpmv-cli
+```
+
+Inference on Linux or Mac
+```
+# run f16 version
+./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run quantized int4 version
+./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# or run in interactive mode
+./llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
+```
+
+### Android
+
+#### Build on Android device using Termux
+We found that build on Android device would bring better runtime performance, so we recommend to build on device.
+
+[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required).
+
+Install tools in Termux:
+```
+apt update && apt upgrade -y
+apt install git make cmake
+```
+
+It's recommended to move your model inside the `~/` directory for best performance:
+```
+cd storage/downloads
+mv model.gguf ~/
+```
+
+#### Building the Project using Android NDK
+Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
+
+Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
+
+```bash
+mkdir build-android
+cd build-android
+export NDK=/your_ndk_path
+cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
+make
+```
+
+Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
+
+Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
+
+(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
+```
+$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
+$cd /data/data/com.termux/files/home/bin
+$chmod +x ./*
+```
+
+Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
+
+```
+$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
+$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
+```
+
+Now, you can start chatting:
+```
+$cd /data/data/com.termux/files/home/bin
+$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+```
@@ -80,6 +80,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
 #define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
 #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
+#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
 #define KEY_USE_GELU "clip.use_gelu"
 #define KEY_N_EMBD "clip.%s.embedding_length"
 #define KEY_N_FF "clip.%s.feed_forward_length"
@@ -127,12 +128,20 @@ static std::string format(const char * fmt, ...) {
 #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE "model.image_newline"

+#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
+#define TN_MINICPMV_QUERY "resampler.query"
+#define TN_MINICPMV_PROJ "resampler.proj.weight"
+#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
+#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
+#define TN_MINICPMV_LN "resampler.ln_%s.%s"
+

 enum projector_type {
     PROJECTOR_TYPE_MLP,
     PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
     PROJECTOR_TYPE_LDPV2,
+    PROJECTOR_TYPE_RESAMPLER,
     PROJECTOR_TYPE_UNKNOWN,
 };

@@ -140,6 +149,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_MLP, "mlp" },
     { PROJECTOR_TYPE_LDP, "ldp" },
     { PROJECTOR_TYPE_LDPV2, "ldpv2"},
+    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
 };

@@ -200,17 +210,14 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
 }

 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
     }
-    s = std::move(result);
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
+    }
 }

 static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
@@ -492,12 +499,33 @@ struct clip_vision_model {
     struct ggml_tensor * mm_model_mlp_2_b;
     struct ggml_tensor * mm_model_peg_0_w;
     struct ggml_tensor * mm_model_peg_0_b;
+
+    // MINICPMV projection
+    struct ggml_tensor * mm_model_pos_embed_k;
+    struct ggml_tensor * mm_model_query;
+    struct ggml_tensor * mm_model_proj;
+    struct ggml_tensor * mm_model_kv_proj;
+    struct ggml_tensor * mm_model_attn_q_w;
+    struct ggml_tensor * mm_model_attn_q_b;
+    struct ggml_tensor * mm_model_attn_k_w;
+    struct ggml_tensor * mm_model_attn_k_b;
+    struct ggml_tensor * mm_model_attn_v_w;
+    struct ggml_tensor * mm_model_attn_v_b;
+    struct ggml_tensor * mm_model_attn_o_w;
+    struct ggml_tensor * mm_model_attn_o_b;
+    struct ggml_tensor * mm_model_ln_q_w;
+    struct ggml_tensor * mm_model_ln_q_b;
+    struct ggml_tensor * mm_model_ln_kv_w;
+    struct ggml_tensor * mm_model_ln_kv_b;
+    struct ggml_tensor * mm_model_ln_post_w;
+    struct ggml_tensor * mm_model_ln_post_b;
 };

 struct clip_ctx {
     bool has_text_encoder = false;
     bool has_vision_encoder = false;
     bool has_llava_projector = false;
+    bool has_minicpmv_projector = false;

     struct clip_vision_model vision_model;
     projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -522,9 +550,11 @@ struct clip_ctx {

     ggml_backend_t backend = NULL;
     ggml_gallocr_t compute_alloc = NULL;
+
+    struct clip_image_size * load_image_size;
 };

-static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
+static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
         return nullptr;
@@ -534,19 +564,32 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     const auto & hparams = model.hparams;

     const int image_size = hparams.image_size;
+    int image_size_width  = image_size;
+    int image_size_height = image_size;
+    if (ctx->has_minicpmv_projector) {
+        if (load_image_size == nullptr) {
+            load_image_size = clip_image_size_init();
+        }
+        LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
+        image_size_width  = load_image_size->width;
+        image_size_height = load_image_size->height;
+        if (is_inf) {
+            image_size_width  = imgs->data->nx;
+            image_size_height = imgs->data->ny;
+        }
+    }
     const int patch_size = hparams.patch_size;
-    const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
-    const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
+    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
     const int hidden_size = hparams.hidden_size;
     const int n_head = hparams.n_head;
     const int d_head = hidden_size / n_head;
-    const int n_layer = hparams.n_layer;
+    int n_layer = hparams.n_layer;
     const float eps = hparams.eps;

     const int batch_size = imgs->size;

-    if (ctx->has_llava_projector) {
+    if (ctx->has_llava_projector || ctx->has_minicpmv_projector) {
         GGML_ASSERT(batch_size == 1);
     }

@@ -559,7 +602,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     struct ggml_context * ctx0 = ggml_init(params);
     struct ggml_cgraph * gf = ggml_new_graph(ctx0);

-    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size);
+    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
     ggml_set_name(inp_raw, "inp_raw");
     ggml_set_input(inp_raw);

@@ -572,9 +615,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
         inp = ggml_add(ctx0, inp, model.patch_bias);
     }

-    // concat class_embeddings and patch_embeddings
     struct ggml_tensor * embeddings = inp;
+    struct ggml_tensor * pos_embed = nullptr;
+
+    if (ctx->has_llava_projector) {
+        // concat class_embeddings and patch_embeddings
     if (ctx->has_class_embedding) {
         embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
         ggml_set_name(embeddings, "embeddings");
@@ -584,7 +629,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = ggml_acc(ctx0, embeddings, inp,
             embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
     }
+    }

     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
     ggml_set_name(positions, "positions");
@@ -593,6 +638,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     embeddings =
         ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));

+    if (ctx->has_minicpmv_projector) {
+        int pos_w = image_size_width/patch_size;
+        int pos_h = image_size_height/patch_size;
+        pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
+        ggml_set_name(pos_embed, "pos_embed");
+        ggml_set_input(pos_embed);
+    }
+
     // pre-layernorm
     if (ctx->has_pre_norm) {
         embeddings = ggml_norm(ctx0, embeddings, eps);
@@ -602,6 +655,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }

     // loop over layers
+    if (ctx->has_minicpmv_projector) {
+        n_layer += 1;
+    }
     for (int il = 0; il < n_layer - 1; il++) {
         struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states

@@ -691,7 +747,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }

     // llava projector
-    {
+    if (ctx->has_llava_projector) {
         embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);

         struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
@@ -872,6 +928,65 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             GGML_ABORT("fatal error");
         }
     }
+    // minicpmv projector
+    else if (ctx->has_minicpmv_projector)
+    {
+        if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+            struct ggml_tensor * q = model.mm_model_query;
+            { // layernorm
+                q = ggml_norm(ctx0, q, eps);
+                q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
+            }
+            struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
+            { // layernorm
+                v = ggml_norm(ctx0, v, eps);
+                v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
+            }
+            struct ggml_tensor * k;
+            { // position
+                // q = ggml_add(ctx0, q, model.mm_model_pos_embed);
+                k = ggml_add(ctx0, v, pos_embed);
+            }
+
+            { // attention
+                const int hidden_size = 4096;
+                const int d_head = 128;
+                const int n_head = hidden_size/d_head;
+                const int num_query = 96;
+
+                struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
+                Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
+                struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b);
+                struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b);
+                // permute
+                Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size);
+                Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
+                Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size);
+                K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
+                K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
+                K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);
+                V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
+                V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
+                V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);
+                struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+                KQ = ggml_soft_max_inplace(ctx0, KQ);
+                struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
+                KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size);
+                KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+                KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size);
+
+                embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b);
+            }
+            { // layernorm
+                embeddings = ggml_norm(ctx0, embeddings, eps);
+                embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b);
+            }
+            embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
+        }
+        else {
+            GGML_ASSERT(false);
+        }
+    }

     // build the graph
     ggml_build_forward_expand(gf, embeddings);
@@ -1029,7 +1144,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             new_clip->has_llava_projector = gguf_get_val_bool(ctx, idx);
         }

-        GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
+        idx = gguf_find_key(ctx, KEY_HAS_MINICPMV_PROJ);
+        if (idx != -1) {
+            new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
+        }
+
+        // GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search
+
         GGML_ASSERT(new_clip->has_vision_encoder);
         GGML_ASSERT(!new_clip->has_text_encoder);

|
||||||
LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
||||||
LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
||||||
LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
||||||
|
LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector);
|
||||||
LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
||||||
LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
|
@@ -1281,6 +1403,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
         vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
     }
+    else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+        // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
+        vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K);
+        vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY);
+        vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ);
+        vision_model.mm_model_kv_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_KV_PROJ);
+        vision_model.mm_model_attn_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "weight"));
+        vision_model.mm_model_attn_k_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "weight"));
+        vision_model.mm_model_attn_v_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "weight"));
+        vision_model.mm_model_attn_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "q", "bias"));
+        vision_model.mm_model_attn_k_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "k", "bias"));
+        vision_model.mm_model_attn_v_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "v", "bias"));
+        vision_model.mm_model_attn_o_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "weight"));
+        vision_model.mm_model_attn_o_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_ATTN, "out", "bias"));
+        vision_model.mm_model_ln_q_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "weight"));
+        vision_model.mm_model_ln_q_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "q", "bias"));
+        vision_model.mm_model_ln_kv_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "weight"));
+        vision_model.mm_model_ln_kv_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "kv", "bias"));
+        vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
+        vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
+    }
     else {
         std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
         throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -1319,7 +1462,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         new_clip->compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(new_clip->backend));
         clip_image_f32_batch batch;
         batch.size = 1;
-        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
+        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
         ggml_gallocr_reserve(new_clip->compute_alloc, gf);
         size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
         LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
@@ -1328,6 +1471,17 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     return new_clip;
 }

+void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
+    ctx_clip->load_image_size = load_image_size;
+}
+
+struct clip_image_size * clip_image_size_init() {
+    struct clip_image_size * load_image_size = new struct clip_image_size();
+    load_image_size->width = 448;
+    load_image_size->height = 448;
+    return load_image_size;
+}
+
 struct clip_image_u8 * clip_image_u8_init() {
     return new clip_image_u8();
 }
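clip_image_size_init() defaults to 448x448 and clip_add_load_image_size() stores the size that the minicpmv branch of clip_image_build_graph() later reads back. A short sketch of how a caller might register the real input size before encoding (the ctx_clip variable and the 672x448 size are illustrative):

```cpp
struct clip_image_size * load_image_size = clip_image_size_init(); // 448x448 by default
load_image_size->width  = 672;  // illustrative slice width
load_image_size->height = 448;
clip_add_load_image_size(ctx_clip, load_image_size); // read by clip_image_build_graph() when has_minicpmv_projector is set
```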
@@ -1598,9 +1752,184 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
     return patches;
 }
 
+static int ensure_divide(int length, int patch_size) {
+    return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
+}
+
+static std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
+    int width = original_size.first;
+    int height = original_size.second;
+    if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
+        float r = static_cast<float>(width) / height;
+        height = static_cast<int>(scale_resolution / std::sqrt(r));
+        width = static_cast<int>(height * r);
+    }
+    int best_width = ensure_divide(width, patch_size);
+    int best_height = ensure_divide(height, patch_size);
+    return std::make_pair(best_width, best_height);
+}
+
+static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
+    int width, height;
+    std::tie(width, height) = original_size;
+    int grid_x, grid_y;
+    std::tie(grid_x, grid_y) = grid;
+
+    int refine_width = ensure_divide(width, grid_x);
+    int refine_height = ensure_divide(height, grid_y);
+
+    int grid_width = refine_width / grid_x;
+    int grid_height = refine_height / grid_y;
+
+    // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
+    auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair
+    int best_grid_width, best_grid_height;
+    std::tie(best_grid_width, best_grid_height) = best_grid_size;
+
+    // std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
+    std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
+    return refine_size;
+}
+
+inline int clip(int x, int lower, int upper) {
+    return std::max(lower, std::min(x, upper));
+}
+
+static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
+    std::vector<int> candidate_split_grids_nums;
+    for (int i : {multiple - 1, multiple, multiple + 1}) {
+        if (i == 1 || i > max_slice_nums) {
+            continue;
+        }
+        candidate_split_grids_nums.push_back(i);
+    }
+
+    std::vector<std::pair<int, int>> candidate_grids;
+    for (int split_grids_nums : candidate_split_grids_nums) {
+        int m = 1;
+        while (m <= split_grids_nums) {
+            if (split_grids_nums % m == 0) {
+                candidate_grids.emplace_back(m, split_grids_nums / m);
+            }
+            ++m;
+        }
+    }
+
+    std::pair<int, int> best_grid{1, 1};
+    float min_error = std::numeric_limits<float>::infinity();
+    for (const auto& grid : candidate_grids) {
+        float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second));
+        if (error < min_error) {
+            best_grid = grid;
+            min_error = error;
+        }
+    }
+    return best_grid;
+}
+
+// inspired from LLaVA-UHD:
+//    -> https://arxiv.org/pdf/2403.11703
+//    -> https://github.com/thunlp/LLaVA-UHD
+//    -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
+static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
+    const std::pair<int, int> original_size={img->nx,img->ny};
+    const int original_width = img->nx;
+    const int original_height = img->ny;
+    const float log_ratio = log(1.0*original_width/original_height);
+    const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
+    const int multiple = fmin(ceil(ratio), max_slice_nums);
+
+    std::vector<std::vector<clip_image_u8 *>> images;
+    LOG_TEE("%s: multiple %d\n", __func__, multiple);
+    images.push_back(std::vector<clip_image_u8 *>());
+
+    if (multiple <= 1) {
+        auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
+        clip_image_u8 * source_image = clip_image_u8_init();
+        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
+        // source_image = image.resize(best_size, Image.Resampling.BICUBIC)
+        images[images.size()-1].push_back(source_image);
+    }
+    else if (multiple > 1) {
+        auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
+        clip_image_u8 * source_image = clip_image_u8_init();
+        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
+        // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
+        LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
+        images[images.size()-1].push_back(source_image);
+
+        std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
+        LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
+
+        auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
+        clip_image_u8 * refine_image = clip_image_u8_init();
+        bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
+
+        LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
+
+        // split_to_patches
+        int width = refine_image->nx;
+        int height = refine_image->ny;
+        int grid_x = int(width / best_grid.first);
+        int grid_y = int(height / best_grid.second);
+        for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
+            images.push_back(std::vector<clip_image_u8 *>());
+            for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
+                clip_image_u8 * patch = clip_image_u8_init();
+                patch->nx = grid_x;
+                patch->ny = grid_y;
+                patch->buf.resize(3 * patch->nx * patch->ny);
+                for (int y = patches_i; y < patches_i + grid_y; ++y) {
+                    for (int x = patches_j; x < patches_j + grid_x; ++x) {
+                        const int i = 3 * (y * refine_image->nx + x);
+                        const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
+                        patch->buf[j]   = refine_image->buf[i];
+                        patch->buf[j+1] = refine_image->buf[i+1];
+                        patch->buf[j+2] = refine_image->buf[i+2];
+                    }
+                }
+                images[images.size()-1].push_back(patch);
+            }
+        }
+    }
+    return images;
+}
+
+int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
+    const int max_slice_nums=9;
+    const int scale_resolution=448;
+    const int original_width = ctx_clip->load_image_size->width;
+    const int original_height = ctx_clip->load_image_size->height;
+    const float log_ratio = log(1.0*original_width/original_height);
+    const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
+    const int multiple = fmin(ceil(ratio), max_slice_nums);
+    std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
+    return best_grid.first;
+}
+
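To make the slice-grid selection above concrete: uhd_best_grid() scores every factorization m x (n/m) of the candidate slice counts {multiple-1, multiple, multiple+1} by how far log(m/(n/m)) is from the image's log aspect ratio. The following standalone sketch re-derives that arithmetic for a hypothetical 1344x896 input (the resolution is an arbitrary illustration, not a value from the patch):

    // Minimal sketch of the grid selection above, assuming a 1344x896 input
    // and the defaults used in uhd_slice_image() (scale_resolution = 448,
    // max_slice_nums = 9). Not part of the patch.
    #include <cmath>
    #include <cstdio>
    #include <utility>

    int main() {
        const int w = 1344, h = 896, scale_resolution = 448, max_slice_nums = 9;
        const float ratio     = 1.0f * w * h / (scale_resolution * scale_resolution);      // 6.0
        const int   multiple  = (int) std::fmin(std::ceil(ratio), (float) max_slice_nums); // 6
        const float log_ratio = std::log(1.0 * w / h);                                     // log(1.5) ~ 0.405

        const int candidates[] = { multiple - 1, multiple, multiple + 1 };                 // 5, 6, 7
        std::pair<int, int> best(1, 1);
        float best_err = INFINITY;
        for (int n : candidates) {
            if (n == 1 || n > max_slice_nums) continue;
            for (int m = 1; m <= n; ++m) {
                if (n % m != 0) continue;
                const float err = std::fabs(log_ratio - std::log(1.0 * m / (n / m)));
                if (err < best_err) { best_err = err; best = std::make_pair(m, n / m); }
            }
        }
        // prints "best grid: 3 x 2"
        printf("best grid: %d x %d\n", best.first, best.second);
        return 0;
    }

With these inputs the refined image is split into a 3x2 grid, so uhd_slice_image() returns the downscaled overview plus six slices, and clip_uhd_num_image_embeds_col() reports three embeds per row.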
 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
+    if (clip_is_minicpmv(ctx)) {
+        std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img);
+        res_imgs->size = 0;
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            res_imgs->size += imgs[i].size();
+        }
+        res_imgs->data = new clip_image_f32[res_imgs->size];
+        int idx = 0;
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            for (size_t j = 0; j < imgs[i].size(); ++j) {
+                LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
+                clip_image_f32 * res = clip_image_f32_init();
+                normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
+                res_imgs->data[idx++] = *res;
+                clip_image_f32_free(res);
+            }
+        }
+        return true;
+    }
+
     bool pad_to_square = true;
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
@@ -1816,11 +2145,99 @@ int clip_n_patches(const struct clip_ctx * ctx) {
 
     if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
         n_patches /= 4;
+    } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+        n_patches = 96;
     }
 
     return n_patches;
 }
 
+static std::vector<std::vector<std::vector<float>>> get_1d_sincos_pos_embed_from_grid_new(int embed_dim, const std::vector<std::vector<float>> & pos) {
+    assert(embed_dim % 2 == 0);
+    int H = pos.size();
+    int W = pos[0].size();
+
+    std::vector<float> omega(embed_dim / 2);
+    for (int i = 0; i < embed_dim / 2; ++i) {
+        omega[i] = 1.0 / pow(10000.0, static_cast<float>(i) / (embed_dim / 2));
+    }
+
+    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            for (int d = 0; d < embed_dim / 2; ++d) {
+                float out_value = pos[h][w] * omega[d];
+                emb[h][w][d] = sin(out_value);
+                emb[h][w][d + embed_dim / 2] = cos(out_value);
+            }
+        }
+    }
+
+    return emb;
+}
+
+static std::vector<std::vector<std::vector<float>>> get_2d_sincos_pos_embed_from_grid(int embed_dim, const std::vector<std::vector<std::vector<float>>> & grid) {
+    assert(embed_dim % 2 == 0);
+    std::vector<std::vector<std::vector<float>>> emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[0]); // (H, W, D/2)
+    std::vector<std::vector<std::vector<float>>> emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, grid[1]); // (H, W, D/2)
+
+    int H = emb_h.size();
+    int W = emb_h[0].size();
+    std::vector<std::vector<std::vector<float>>> emb(H, std::vector<std::vector<float>>(W, std::vector<float>(embed_dim)));
+
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            for (int d = 0; d < embed_dim / 2; ++d) {
+                emb[h][w][d] = emb_h[h][w][d];
+                emb[h][w][d + embed_dim / 2] = emb_w[h][w][d];
+            }
+        }
+    }
+    return emb;
+}
+
+static std::vector<std::vector<float>> get_2d_sincos_pos_embed(int embed_dim, const std::pair<int, int> image_size) {
+    int grid_h_size = image_size.first;
+    int grid_w_size = image_size.second;
+
+    std::vector<float> grid_h(grid_h_size);
+    std::vector<float> grid_w(grid_w_size);
+
+    for (int i = 0; i < grid_h_size; ++i) {
+        grid_h[i] = static_cast<float>(i);
+    }
+    for (int i = 0; i < grid_w_size; ++i) {
+        grid_w[i] = static_cast<float>(i);
+    }
+
+    std::vector<std::vector<float>> grid(grid_h_size, std::vector<float>(grid_w_size));
+    for (int h = 0; h < grid_h_size; ++h) {
+        for (int w = 0; w < grid_w_size; ++w) {
+            grid[h][w] = grid_w[w];
+        }
+    }
+    std::vector<std::vector<std::vector<float>>> grid_2d = {grid, grid};
+    for (int h = 0; h < grid_h_size; ++h) {
+        for (int w = 0; w < grid_w_size; ++w) {
+            grid_2d[0][h][w] = grid_h[h];
+            grid_2d[1][h][w] = grid_w[w];
+        }
+    }
+
+    std::vector<std::vector<std::vector<float>>> pos_embed_3d = get_2d_sincos_pos_embed_from_grid(embed_dim, grid_2d);
+
+    int H = image_size.first;
+    int W = image_size.second;
+    std::vector<std::vector<float>> pos_embed_2d(H * W, std::vector<float>(embed_dim));
+    for (int h = 0; h < H; ++h) {
+        for (int w = 0; w < W; ++w) {
+            pos_embed_2d[w * H + h] = pos_embed_3d[h][w];
+        }
+    }
+
+    return pos_embed_2d;
+}
+
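The three helpers above build a fixed 2D sinusoidal position table: each spatial axis receives half of the embedding width, and within an axis the first half of its channels carries sines and the second half cosines. Restating the loops with D = embed_dim and (h, w) the patch-grid coordinates:

    \omega_i = 10000^{-i/(D/4)}, \qquad i = 0, \dots, \tfrac{D}{4} - 1
    \mathrm{PE}(h, w) = \big[\, \sin(h\,\omega_i) \;\|\; \cos(h\,\omega_i) \;\|\; \sin(w\,\omega_i) \;\|\; \cos(w\,\omega_i) \,\big]_{i=0}^{D/4-1}

Note that get_2d_sincos_pos_embed() flattens the H x W table column-major (index w * H + h), which is the layout copied into the pos_embed tensor in clip_image_batch_encode() further down.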
 bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
@@ -1843,9 +2260,12 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     if (ctx->has_llava_projector) {
         GGML_ASSERT(batch_size == 1); // TODO: support multiple images
     }
+    if (ctx->has_minicpmv_projector) {
+        GGML_ASSERT(batch_size == 1);
+    }
 
     // build the inference graph
-    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
+    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
     ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
 
     // set inputs
@@ -1853,8 +2273,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const auto & hparams = model.hparams;
 
     const int image_size = hparams.image_size;
+    int image_size_width = image_size;
+    int image_size_height = image_size;
+    if (ctx->has_minicpmv_projector) {
+        image_size_width = imgs->data[0].nx;
+        image_size_height = imgs->data[0].ny;
+    }
     const int patch_size = hparams.patch_size;
-    const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
+    const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
 
     {
@@ -1864,7 +2290,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         for (size_t i = 0; i < imgs->size; i++) {
             const int nx = imgs->data[i].nx;
             const int ny = imgs->data[i].ny;
+            if (!ctx->has_minicpmv_projector) {
                 GGML_ASSERT(nx == image_size && ny == image_size);
+            }
 
             const int n = nx * ny;
 
@@ -1881,7 +2309,44 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
         free(data);
     }
+    if (ctx->has_minicpmv_projector) {
+        {
+            // inspired from siglip:
+            //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
+            //    -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
+            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+            int* positions_data = (int*)malloc(ggml_nbytes(positions));
+            for (int i = 0; i < num_positions; i++) {
+                positions_data[i] = std::floor(70.0*i/num_positions);
+            }
+            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
+            free(positions_data);
+        }
+
+        {
+            // inspired from resampler of Qwen-VL:
+            //    -> https://huggingface.co/Qwen/Qwen-VL/tree/main
+            //    -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
+            struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
+            if(ctx->load_image_size==nullptr){
+                ctx->load_image_size= clip_image_size_init();
+            }
+            int pos_w = ctx->load_image_size->width/patch_size;
+            int pos_h = ctx->load_image_size->height/patch_size;
+            int embed_dim = 4096;
+            auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
+
+            float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
+            for(int i=0;i<pos_w * pos_h;++i){
+                for(int j=0;j<embed_dim;++j){
+                    pos_embed_data[i*embed_dim+j]=pos_embed_t[i][j];
+                }
+            }
+
+            ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
+            free(pos_embed_data);
+        }
+    } else {
     {
         if (ctx->has_class_embedding) {
             struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
@@ -1913,6 +2378,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
         free(patches_data);
     }
+    }
 
     if (ggml_backend_is_cpu(ctx->backend)) {
         ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
@@ -2081,7 +2547,14 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
         return ctx->vision_model.mm_3_b->ne[0];
     }
+    if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
+        return 4096;
+    }
 
     std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
     throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
 }
+
+bool clip_is_minicpmv(const struct clip_ctx * ctx) {
+    return ctx->has_minicpmv_projector;
+}
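As a sanity check on the sizes involved in the MiniCPM-V branch above (a sketch assuming a single 448x448 slice; the 96-token and 4096-dimension figures come from clip_n_patches() and clip_n_mmproj_embd() for PROJECTOR_TYPE_RESAMPLER):

    // Minimal sketch, assuming one 448x448 slice with patch_size 14. Not part of the patch.
    #include <cstdio>

    int main() {
        const int slice_w = 448, slice_h = 448, patch = 14, embed_dim = 4096;
        const int pos_w = slice_w / patch;        // 32
        const int pos_h = slice_h / patch;        // 32
        const int num_positions = pos_w * pos_h;  // 1024 patch positions per slice

        // "positions" input: one int per position, bucketed into [0, 70) by
        // positions_data[i] = floor(70.0 * i / num_positions)
        printf("positions buffer: %zu bytes\n", sizeof(int) * num_positions);
        // "pos_embed" input: num_positions x embed_dim floats, about 16 MiB per slice
        printf("pos_embed buffer: %zu bytes\n", sizeof(float) * num_positions * embed_dim);
        // after the resampler, each slice contributes 96 tokens of 4096 floats to the LLM
        printf("slice embedding:  %zu bytes\n", sizeof(float) * 96 * embed_dim);
        return 0;
    }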
@@ -18,14 +18,17 @@
 #    define CLIP_API
 #endif
 
-struct clip_ctx;
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 struct clip_ctx;
 
+struct clip_image_size {
+    int width;
+    int height;
+};
+
 struct clip_image_u8_batch {
     struct clip_image_u8 * data;
     size_t size;
@@ -55,6 +58,10 @@ CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 CLIP_API int clip_n_patches    (const struct clip_ctx * ctx);
 CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
 
+CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
+CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
+
+CLIP_API struct clip_image_size * clip_image_size_init();
 CLIP_API struct clip_image_u8  * clip_image_u8_init ();
 CLIP_API struct clip_image_f32 * clip_image_f32_init();
 
@@ -78,6 +85,8 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
 
 CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
 
+CLIP_API bool clip_is_minicpmv(const struct clip_ctx * ctx);
+
 #ifdef __cplusplus
 }
 #endif
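Taken together, the new declarations give callers a small surface for the MiniCPM-V path. A hypothetical caller sketch (it mirrors what llava.cpp and minicpmv-cli.cpp below do, rather than defining any new behaviour; the function name is illustrative only):

    // Hypothetical usage sketch of the new clip.h entry points; error handling omitted.
    #include "clip.h"

    void configure_minicpmv_image(struct clip_ctx * ctx_clip, int img_w, int img_h) {
        if (!clip_is_minicpmv(ctx_clip)) {
            return; // the calls below only matter for the MiniCPM-V resampler path
        }
        // tell the encoder how large the original image was ...
        struct clip_image_size * load_image_size = clip_image_size_init(); // defaults to 448x448
        load_image_size->width  = img_w;
        load_image_size->height = img_h;
        clip_add_load_image_size(ctx_clip, load_image_size);

        // ... so that the slice grid and per-row embed count can be derived from it
        const int embeds_per_row = clip_uhd_num_image_embeds_col(ctx_clip);
        const int n_patches      = clip_n_patches(ctx_clip);      // 96 for the resampler
        const int n_embd         = clip_n_mmproj_embd(ctx_clip);  // 4096 for the resampler
        (void) embeds_per_row; (void) n_patches; (void) n_embd;
    }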
@@ -202,6 +202,33 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     return true;
 }
 
+static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
+    int width = image->nx;
+    int height = image->ny;
+    int num_patches = (height / patch_size) * (width / patch_size);
+    clip_image_f32 * patch = clip_image_f32_init();
+    patch->nx = patch_size * num_patches;
+    patch->ny = patch_size;
+    patch->buf.resize(3 * patch->nx * patch->ny);
+
+    int patch_index = 0;
+
+    for (int i = 0; i < height; i += patch_size) {
+        for (int j = 0; j < width; j += patch_size) {
+            for (int pi = 0; pi < patch_size; ++pi) {
+                for (int pj = 0; pj < patch_size; ++pj) {
+                    int input_index = ((i + pi) * width + (j + pj)) * 3;
+                    int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3;
+                    patch->buf[output_index] = image->buf[input_index];
+                    patch->buf[output_index+1] = image->buf[input_index+1];
+                    patch->buf[output_index+2] = image->buf[input_index+2];
+                }
+            }
+            patch_index++;
+        }
+    }
+    return patch;
+}
+
 static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
     // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
@@ -218,7 +245,44 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
 
     const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
 
-    if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
+    if (clip_is_minicpmv(ctx_clip)) {
+        std::vector<float *> image_embd_v;
+        image_embd_v.resize(img_res_v.size);
+        struct clip_image_size * load_image_size = clip_image_size_init();
+        for (size_t i = 0; i < img_res_v.size; i++) {
+            const int64_t t_img_enc_step_start_us = ggml_time_us();
+            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
+            int patch_size=14;
+            load_image_size->width = img_res_v.data[i].nx;
+            load_image_size->height = img_res_v.data[i].ny;
+            clip_add_load_image_size(ctx_clip, load_image_size);
+            const bool encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
+            if (!encoded) {
+                LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                return false;
+            }
+            const int64_t t_img_enc_steop_batch_us = ggml_time_us();
+            LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
+        }
+        const int64_t t_img_enc_batch_us = ggml_time_us();
+        LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+
+        int n_img_pos_out = 0;
+        for (size_t i = 0; i < image_embd_v.size(); i++) {
+            std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
+            n_img_pos_out += clip_n_patches(ctx_clip);
+        }
+        *n_img_pos = n_img_pos_out;
+        for (size_t i = 0; i < image_embd_v.size(); i++) {
+            free(image_embd_v[i]);
+        }
+        image_embd_v.clear();
+        load_image_size->width = img->nx;
+        load_image_size->height = img->ny;
+        clip_add_load_image_size(ctx_clip, load_image_size);
+        LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
+    }
+    else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
         *n_img_pos = clip_n_patches(ctx_clip);
         bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
@@ -228,7 +292,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
 
             return false;
         }
-    } else {
+    }
+    else {
         // spatial_unpad llava-1.6 type embedding
         // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
         std::vector<float *> image_embd_v;
@@ -297,7 +362,11 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
 }
 
 bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
-    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
+    int num_max_patches = 6;
+    if (clip_is_minicpmv(ctx_clip)) {
+        num_max_patches = 10;
+    }
+    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
     if (!image_embd) {
         LOG_TEE("Unable to allocate memory for image embeddings\n");
         return false;
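For the v2.5 path above, only_v2_5_reshape_by_patch() re-tiles a slice so that all of its patches sit side by side in a single patch-high row; clip_image_batch_encode() then reads the width and height from the image itself rather than from hparams. A small check of that arithmetic, assuming a 448x448 slice with the patch_size of 14 used above:

    // Minimal sketch of the reshape arithmetic, assuming a 448x448 slice. Not part of the patch.
    #include <cassert>

    int main() {
        const int nx = 448, ny = 448, patch_size = 14;
        const int num_patches = (ny / patch_size) * (nx / patch_size);  // 32 * 32 = 1024
        const int out_nx = patch_size * num_patches;                    // 14336
        const int out_ny = patch_size;                                  // 14
        assert(out_nx * out_ny == nx * ny);  // same pixel count, just re-tiled into one row of patches
        return 0;
    }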
@@ -17,12 +17,11 @@
 #    define LLAVA_API
 #endif
 
-struct clip_ctx;
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+struct clip_ctx;
 struct llava_image_embed {
     float * embed;
     int n_image_pos;
@@ -37,8 +36,8 @@ LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip,
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
 /** build an image embed from a path to an image filename */
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
-LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
 /** free an embedding made with llava_image_embed_make_* */
+LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
 
 /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
 LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
examples/llava/minicpmv-cli.cpp (new file, 309 lines)
@@ -0,0 +1,309 @@
#include "ggml.h"
#include "log.h"
#include "common.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"

#include <cstdio>
#include <cstdlib>
#include <vector>

struct llava_context {
    struct clip_ctx * ctx_clip = NULL;
    struct llama_context * ctx_llama = NULL;
    struct llama_model * model = NULL;
};

static void show_additional_info(int /*argc*/, char ** argv) {
    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
    LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
}

static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    LOG_TEE("%s", text);
}

static struct llama_model * llava_init(gpt_params * params) {
    llama_backend_init();
    llama_numa_init(params->numa);

    llama_model_params model_params = llama_model_params_from_gpt_params(*params);

    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
    if (model == NULL) {
        LOG_TEE("%s: error: unable to load model\n" , __func__);
        return NULL;
    }
    return model;
}

static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
    auto prompt = params->prompt;
    if (prompt.empty()) {
        prompt = "describe the image in detail.";
    }

    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
    if (params->n_ctx < 2048) {
        // warn user here, "Image processing requires at least 2048 context, setting context to 2048"
        LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
        ctx_params.n_ctx = 2048;
    } else {
        ctx_params.n_ctx = params->n_ctx;
    }

    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

    if (ctx_llama == NULL) {
        LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
        return NULL;
    }

    auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));

    ctx_llava->ctx_llama = ctx_llama;
    ctx_llava->model = model;
    return ctx_llava;
}

static void llava_free(struct llava_context * ctx_llava) {
    if (ctx_llava->ctx_clip) {
        clip_free(ctx_llava->ctx_clip);
        ctx_llava->ctx_clip = NULL;
    }

    llama_free(ctx_llava->ctx_llama);
    llama_free_model(ctx_llava->model);
    llama_backend_free();
}

static struct clip_ctx * clip_init_context(gpt_params * params) {
    const char * clip_path = params->mmproj.c_str();

    auto prompt = params->prompt;
    if (prompt.empty()) {
        prompt = "describe the image in detail.";
    }
    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
    return ctx_clip;
}

static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
    int N = (int) tokens.size();
    for (int i = 0; i < N; i += n_batch) {
        int n_eval = (int) tokens.size() - i;
        if (n_eval > n_batch) {
            n_eval = n_batch;
        }
        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
            LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
            return false;
        }
        *n_past += n_eval;
    }
    return true;
}

static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
    std::vector<llama_token> tokens;
    tokens.push_back(id);
    return eval_tokens(ctx_llama, tokens, 1, n_past);
}

static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
    std::string str2 = str;
    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
    return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
}

static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
    float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
    std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));

    auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
    slice_embed->embed = image_embed;
    slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
    llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
    llava_image_embed_free(slice_embed);
}

static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) {
    std::string system_prompt;
    int idx = 0;
    int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
    system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
    LOG_TEE("%s: image token past: %d\n", __func__, n_past);
    eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
    eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
    if (num_image_embeds > 1) {
        size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
        eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
        for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
            for (size_t j = 0; j < num_image_embeds_col; ++j) {
                eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
                process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
                eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
                if (j == num_image_embeds_col - 1) {
                    eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
                }
            }
        }
        eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
    }
    LOG_TEE("%s: image token past: %d\n", __func__, n_past);
}

static const char * sample(struct llama_sampling_context * ctx_sampling,
                           struct llama_context * ctx_llama,
                           int * n_past) {
    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
    llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
    static std::string ret;
    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
        ret = "</s>";
    } else {
        ret = llama_token_to_piece(ctx_llama, id);
    }
    eval_id(ctx_llama, id, n_past);
    return ret.c_str();
}

static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
    auto ctx_clip = clip_init_context(params);
    auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
    if (!embeds) {
        std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
        return NULL;
    }

    // process the prompt
    if (params->prompt.empty() && params->interactive == false) {
        LOG_TEE("prompt should be given or interactive mode should be on");
        return NULL;
    }

    auto model = llava_init(params);
    if (model == NULL) {
        fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
        return NULL;
    }
    const int64_t t_llava_init_start_us = ggml_time_us();
    auto ctx_llava = llava_init_context(params, model);
    ctx_llava->ctx_clip = ctx_clip;
    const int64_t t_llava_init_end_us = ggml_time_us();
    float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
    LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);

    const int64_t t_process_image_start_us = ggml_time_us();
    process_image(ctx_llava, embeds, params, n_past);
    const int64_t t_process_image_end_us = ggml_time_us();
    float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
    LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);

    llava_image_embed_free(embeds);
    return ctx_llava;
}

static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
    std::string user_prompt = prompt;
    if (!is_first) user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;

    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
    eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
    // generate the response

    LOG_TEE("\n");

    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
    return ctx_sampling;
}

static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){

    const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
    return tmp;
}

int main(int argc, char ** argv) {
    ggml_time_init();

    gpt_params params;

    if (!gpt_params_parse(argc, argv, params)) {
        show_additional_info(argc, argv);
        return 1;
    }

#ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("llava", "log"));
    LOG_TEE("Log start\n");
    log_dump_cmdline(argc, argv);
    llama_log_set(llama_log_callback_logTee, nullptr);
#endif // LOG_DISABLE_LOGS

    if (params.mmproj.empty() || (params.image.empty())) {
        gpt_params_print_usage(argc, argv, params);
        show_additional_info(argc, argv);
        return 1;
    }

    for (auto & image : params.image) {
        int n_past = 0;
        auto ctx_llava = minicpmv_init(&params, image, n_past);

        if (!params.prompt.empty()) {
            LOG_TEE("<user>%s\n", params.prompt.c_str());
            LOG_TEE("<assistant>");
            auto ctx_sampling = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
            const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
            std::string response = "";
            bool have_tmp = false;
            for (int i = 0; i < max_tgt_len; i++) {
                auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
                response += tmp;
                if (strcmp(tmp, "</s>") == 0){
                    if(!have_tmp)continue;
                    else break;
                }
                if (strstr(tmp, "###")) break; // Yi-VL behavior
                have_tmp = true;
                printf("%s", tmp);
                if (strstr(response.c_str(), "<user>")) break; // minicpm-v

                fflush(stdout);
            }
            llama_sampling_free(ctx_sampling);
        } else {
            while (true) {
                LOG_TEE("<user>");
                std::string prompt;
                std::getline(std::cin, prompt);
                LOG_TEE("<assistant>");
                auto ctx_sampling = llama_init(ctx_llava, &params, prompt, n_past, true);
                const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
                std::string response = "";
                for (int i = 0; i < max_tgt_len; i++) {
                    auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
                    response += tmp;
                    if (strcmp(tmp, "</s>") == 0) break;
                    if (strstr(tmp, "###")) break; // Yi-VL behavior
                    printf("%s", tmp); // mistral llava-1.6
                    if (strstr(response.c_str(), "<user>")) break; // minicpm-v
                    fflush(stdout);
                }
                llama_sampling_free(ctx_sampling);
            }
        }
        printf("\n");
        llama_print_timings(ctx_llava->ctx_llama);

        ctx_llava->model = NULL;
        llava_free(ctx_llava);
    }

    return 0;
}
382
examples/llava/minicpmv-convert-image-encoder-to-gguf.py
Normal file
382
examples/llava/minicpmv-convert-image-encoder-to-gguf.py
Normal file
|
@ -0,0 +1,382 @@
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
from gguf import *
|
||||||
|
from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig
|
||||||
|
|
||||||
|
TEXT = "clip.text"
|
||||||
|
VISION = "clip.vision"
|
||||||
|
|
||||||
|
|
||||||
|
def add_key_str(raw_key: str, arch: str) -> str:
|
||||||
|
return raw_key.format(arch=arch)
|
||||||
|
|
||||||
|
|
||||||
|
def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_minicpmv: bool) -> bool:
|
||||||
|
if name in (
|
||||||
|
"logit_scale",
|
||||||
|
"text_model.embeddings.position_ids",
|
||||||
|
"vision_model.embeddings.position_ids",
|
||||||
|
):
|
||||||
|
return True
|
||||||
|
|
||||||
|
if has_minicpmv and name in ["visual_projection.weight"]:
|
||||||
|
return True
|
||||||
|
|
||||||
|
if name.startswith("v") and not has_vision:
|
||||||
|
return True
|
||||||
|
|
||||||
|
if name.startswith("t") and not has_text:
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def get_tensor_name(name: str) -> str:
|
||||||
|
if "projection" in name:
|
||||||
|
return name
|
||||||
|
if "mm_projector" in name:
|
||||||
|
name = name.replace("model.mm_projector", "mm")
|
||||||
|
name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
|
||||||
|
name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
|
||||||
|
return name
|
||||||
|
|
||||||
|
return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
|
||||||
|
|
||||||
|
|
||||||
|
def bytes_to_unicode():
|
||||||
|
"""
|
||||||
|
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||||
|
The reversible bpe codes work on unicode strings.
|
||||||
|
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||||
|
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||||
|
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||||
|
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||||
|
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||||
|
"""
|
||||||
|
bs = (
|
||||||
|
list(range(ord("!"), ord("~") + 1))
|
||||||
|
+ list(range(ord("¡"), ord("¬") + 1))
|
||||||
|
+ list(range(ord("®"), ord("ÿ") + 1))
|
||||||
|
)
|
||||||
|
cs = bs[:]
|
||||||
|
n = 0
|
||||||
|
for b in range(2**8):
|
||||||
|
if b not in bs:
|
||||||
|
bs.append(b)
|
||||||
|
cs.append(2**8 + n)
|
||||||
|
n += 1
|
||||||
|
cs = [chr(n) for n in cs]
|
||||||
|
return dict(zip(bs, cs))
|
||||||
|
|
||||||
|
|
||||||
|
ap = argparse.ArgumentParser()
|
||||||
|
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
|
||||||
|
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
|
||||||
|
ap.add_argument("--text-only", action="store_true", required=False,
|
||||||
|
help="Save a text-only model. It can't be used to encode images")
|
||||||
|
ap.add_argument("--vision-only", action="store_true", required=False,
|
||||||
|
help="Save a vision-only model. It can't be used to encode texts")
|
||||||
|
ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
|
||||||
|
help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
|
||||||
|
ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
|
||||||
|
help="The clip model is from openclip (for ViT-SO400M type))")
|
||||||
|
ap.add_argument("--minicpmv-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for MiniCPM-V models.")
|
||||||
|
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
|
||||||
|
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
|
||||||
|
# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
|
||||||
|
# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
|
||||||
|
default_image_mean = [0.48145466, 0.4578275, 0.40821073]
|
||||||
|
default_image_std = [0.26862954, 0.26130258, 0.27577711]
|
||||||
|
ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
|
||||||
|
ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
|
||||||
|
|
||||||
|
# with proper
|
||||||
|
args = ap.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
if args.text_only and args.vision_only:
|
||||||
|
print("--text-only and --image-only arguments cannot be specified at the same time.")
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
if args.use_f32:
|
||||||
|
print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
|
||||||
|
|
||||||
|
# output in the same directory as the model if output_dir is None
|
||||||
|
dir_model = args.model_dir
|
||||||
|
|
||||||
|
if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
|
||||||
|
vocab = None
|
||||||
|
tokens = None
|
||||||
|
else:
|
||||||
|
with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
|
||||||
|
vocab = json.load(f)
|
||||||
|
tokens = [key for key in vocab]
|
||||||
|
|
||||||
|
# possible data types
|
||||||
|
# ftype == 0 -> float32
|
||||||
|
# ftype == 1 -> float16
|
||||||
|
#
|
||||||
|
# map from ftype to string
|
||||||
|
ftype_str = ["f32", "f16"]
|
||||||
|
|
||||||
|
ftype = 1
|
||||||
|
if args.use_f32:
|
||||||
|
ftype = 0
|
||||||
|
|
||||||
|
# if args.clip_model_is_vision or args.clip_model_is_openclip:
|
||||||
|
# model = CLIPVisionModel.from_pretrained(dir_model)
|
||||||
|
# processor = None
|
||||||
|
# else:
|
||||||
|
# model = CLIPModel.from_pretrained(dir_model)
|
||||||
|
# processor = CLIPProcessor.from_pretrained(dir_model)
|
||||||
|
|
||||||
|
default_vision_config = {
|
||||||
|
"hidden_size": 1152,
|
||||||
|
"image_size": 980,
|
||||||
|
"intermediate_size": 4304,
|
||||||
|
"model_type": "idefics2",
|
||||||
|
"num_attention_heads": 16,
|
||||||
|
"num_hidden_layers": 27,
|
||||||
|
"patch_size": 14,
|
||||||
|
}
|
||||||
|
vision_config = Idefics2VisionConfig(**default_vision_config)
|
||||||
|
model = Idefics2VisionTransformer(vision_config)
|
||||||
|
|
||||||
|
processor = None
|
||||||
|
# if model.attn_pool is not None:
|
||||||
|
# model.attn_pool = torch.nn.Identity()
|
||||||
|
|
||||||
|
# model.blocks = model.blocks[:-1]
|
||||||
|
model.load_state_dict(torch.load(os.path.join(dir_model, "minicpmv.clip")))
|
||||||
|
|
||||||
|
fname_middle = None
|
||||||
|
has_text_encoder = True
|
||||||
|
has_vision_encoder = True
|
||||||
|
has_minicpmv_projector = False
|
||||||
|
if args.text_only:
|
||||||
|
fname_middle = "text-"
|
||||||
|
has_vision_encoder = False
|
||||||
|
elif args.minicpmv_projector is not None:
|
||||||
|
fname_middle = "mmproj-"
|
||||||
|
has_text_encoder = False
|
||||||
|
has_minicpmv_projector = True
|
||||||
|
elif args.vision_only:
|
||||||
|
fname_middle = "vision-"
|
||||||
|
has_text_encoder = False
|
||||||
|
else:
|
||||||
|
fname_middle = ""
|
||||||
|
|
||||||
|
output_dir = args.output_dir if args.output_dir is not None else dir_model
|
||||||
|
os.makedirs(output_dir, exist_ok=True)
|
||||||
|
output_prefix = os.path.basename(output_dir).replace("ggml_", "")
|
||||||
|
fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
|
||||||
|
fout = GGUFWriter(path=fname_out, arch="clip")
|
||||||
|
|
||||||
|
fout.add_bool("clip.has_text_encoder", has_text_encoder)
|
||||||
|
fout.add_bool("clip.has_vision_encoder", has_vision_encoder)
|
||||||
|
fout.add_bool("clip.has_minicpmv_projector", has_minicpmv_projector)
|
||||||
|
fout.add_file_type(ftype)
|
||||||
|
if args.text_only:
|
||||||
|
fout.add_description("text-only CLIP model")
|
||||||
|
elif args.vision_only and not has_minicpmv_projector:
|
||||||
|
fout.add_description("vision-only CLIP model")
|
||||||
|
elif has_minicpmv_projector:
|
||||||
|
fout.add_description("image encoder for MiniCPM-V")
|
||||||
|
# add projector type
|
||||||
|
fout.add_string("clip.projector_type", "resampler")
|
||||||
|
else:
|
||||||
|
fout.add_description("two-tower CLIP model")
|
||||||
|
|
||||||
|
if has_vision_encoder:
|
||||||
|
# vision_model hparams
|
||||||
|
fout.add_uint32("clip.vision.image_size", 448)
|
||||||
|
fout.add_uint32("clip.vision.patch_size", 14)
|
||||||
|
fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152)
|
||||||
|
fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
|
||||||
|
fout.add_uint32("clip.vision.projection_dim", 0)
|
||||||
|
fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
|
||||||
|
fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
|
||||||
|
block_count = 26
|
||||||
|
fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
|
||||||
|
|
||||||
|
if processor is not None:
|
||||||
|
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
|
||||||
|
image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
|
||||||
|
else:
|
||||||
|
image_mean = args.image_mean if args.image_mean is not None else default_image_mean
|
||||||
|
image_std = args.image_std if args.image_std is not None else default_image_std
|
||||||
|
fout.add_array("clip.vision.image_mean", image_mean)
|
||||||
|
fout.add_array("clip.vision.image_std", image_std)
|
||||||
|
|
||||||
|
use_gelu = True
|
||||||
|
fout.add_bool("clip.use_gelu", use_gelu)
|
||||||
|
|
||||||
|
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float32)
    omega /= embed_dim / 2.
    omega = 1. / 10000 ** omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb

def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of dimensions to encode grid_h
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb


# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
    """
    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    if isinstance(grid_size, int):
        grid_h_size, grid_w_size = grid_size, grid_size
    else:
        grid_h_size, grid_w_size = grid_size[0], grid_size[1]

    grid_h = np.arange(grid_h_size, dtype=np.float32)
    grid_w = np.arange(grid_w_size, dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token:
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed

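As a quick sanity check of the helpers above (an editor's illustration, not part of the diff), the 2D table for the 70x70 grid and 4096-dim embedding used for `pos_embed_k` further down should come out with one sin/cos row per grid position:

```python
import numpy as np

# expected shape: (70*70, 4096) -- one row per grid position
pos_embed = get_2d_sincos_pos_embed(4096, (70, 70))
assert pos_embed.shape == (70 * 70, 4096)
```
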
def _replace_name_resampler(s, v):
    if re.match("resampler.pos_embed", s):
        return {
            s: v,
            re.sub("pos_embed", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))),
        }
    if re.match("resampler.proj", s):
        return {
            re.sub("proj", "pos_embed_k", s): torch.from_numpy(get_2d_sincos_pos_embed(4096, (70, 70))),
            re.sub("proj", "proj.weight", s): v.transpose(-1, -2).contiguous(),
        }
    if re.match("resampler.attn.in_proj_.*", s):
        return {
            re.sub("attn.in_proj_", "attn.q.", s): v.chunk(3, dim=0)[0],
            re.sub("attn.in_proj_", "attn.k.", s): v.chunk(3, dim=0)[1],
            re.sub("attn.in_proj_", "attn.v.", s): v.chunk(3, dim=0)[2],
        }
    return {s: v}

if has_minicpmv_projector:
    projector = torch.load(args.minicpmv_projector)
    new_state_dict = {}
    for k, v in projector.items():
        kvs = _replace_name_resampler(k, v)
        for nk, nv in kvs.items():
            new_state_dict[nk] = nv
    projector = new_state_dict
    ftype_cur = 0
    for name, data in projector.items():
        name = get_tensor_name(name)
        data = data.squeeze().numpy()

        n_dims = len(data.shape)
        if ftype == 1:
            if name[-7:] == ".weight" and n_dims == 2:
                print("  Converting to float16")
                data = data.astype(np.float16)
                ftype_cur = 1
            else:
                print("  Converting to float32")
                data = data.astype(np.float32)
                ftype_cur = 0
        else:
            if data.dtype != np.float32:
                print("  Converting to float32")
                data = data.astype(np.float32)
                ftype_cur = 0

        fout.add_tensor(name, data)
        print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")

    print("Projector tensors added\n")

def _replace_name(s, v):
    s = "vision_model." + s
    if re.match("vision_model.embeddings.position_embedding", s):
        v = v.unsqueeze(0)
        return {s: v}

    return {s: v}

state_dict = model.state_dict()
new_state_dict = {}
for k, v in state_dict.items():
    kvs = _replace_name(k, v)
    for nk, nv in kvs.items():
        new_state_dict[nk] = nv
state_dict = new_state_dict
for name, data in state_dict.items():
    if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_minicpmv_projector):
        # we don't need this
        print(f"skipping parameter: {name}")
        continue

    name = get_tensor_name(name)
    data = data.squeeze().numpy()

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if n_dims == 4:
        print(f"tensor {name} is always saved in f16")
        data = data.astype(np.float16)
        ftype_cur = 1
    elif ftype == 1:
        if name[-7:] == ".weight" and n_dims == 2:
            print("  Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

    print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
    fout.add_tensor(name, data)


fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()

print("Done. Output file: " + fname_out)
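A minimal way to inspect the file the script writes (an editor's sketch, not part of the diff; it assumes the `gguf` Python package is installed and uses a placeholder output name):

```python
from gguf import GGUFReader  # from the gguf-py package

reader = GGUFReader("minicpmv-encoder.gguf")   # placeholder: use your fname_out
print(list(reader.fields.keys())[:10])         # a few of the clip.* keys written above
print(len(reader.tensors), "tensors")
```
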
47 examples/llava/minicpmv-surgery.py Normal file

@@ -0,0 +1,47 @@
import argparse
import os
import torch
from transformers import AutoModel, AutoTokenizer

ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", help="Path to MiniCPM-V-2.5 model")
args = ap.parse_args()

# find the model part that includes the the multimodal projector weights
model = AutoModel.from_pretrained(args.model, trust_remote_code=True, local_files_only=True)
checkpoint = model.state_dict()

# get a list of mm tensor names
mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")]

# store these tensors in a new dictionary and torch.save them
projector = {name: checkpoint[name].float() for name in mm_tensors}
torch.save(projector, f"{args.model}/minicpmv.projector")

clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")]
if len(clip_tensors) > 0:
    clip = {name.replace("vpm.", ""): checkpoint[name].float() for name in clip_tensors}
    torch.save(clip, f"{args.model}/minicpmv.clip")

    # added tokens should be removed to be able to convert Mistral models
    if os.path.exists(f"{args.model}/added_tokens.json"):
        with open(f"{args.model}/added_tokens.json", "w") as f:
            f.write("{}\n")

config = model.llm.config
config._name_or_path = "openbmb/MiniCPM-Llama3-V-2.5"
config.auto_map = {
    "AutoConfig": "configuration_minicpm.MiniCPMConfig",
    "AutoModel": "modeling_minicpm.MiniCPMModel",
    "AutoModelForCausalLM": "modeling_minicpm.MiniCPMForCausalLM",
    "AutoModelForSeq2SeqLM": "modeling_minicpm.MiniCPMForCausalLM",
    "AutoModelForSequenceClassification": "modeling_minicpm.MiniCPMForSequenceClassification"
}
model.llm.save_pretrained(f"{args.model}/model")
tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
tok.save_pretrained(f"{args.model}/model")
# os.system(f"cp {args.model}/modeling_minicpm.py {args.model}/MiniCPM_l3/modeling_minicpm.py")

print("Done!")
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
print(f"Also, use {args.model}/minicpmv.projector to prepare a minicpmv-encoder.gguf file.")
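Once the surgery script has run, the saved projector can be checked before conversion; a minimal sketch (an editor's illustration, not part of the diff; the model directory is a placeholder):

```python
import torch

# the surgery script saves the resampler tensors next to the model
projector = torch.load("MiniCPM-Llama3-V-2_5/minicpmv.projector")
print(len(projector), "resampler tensors")
for name, tensor in list(projector.items())[:5]:
    print(name, tuple(tensor.shape))
```
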
@@ -2,3 +2,4 @@
--extra-index-url https://download.pytorch.org/whl/cpu
pillow~=10.2.0
torch~=2.2.1
+torchvision==0.17.1
@@ -92,7 +92,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
    }
}

// usage:
-//  ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
+//  ./llama-quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
//
[[noreturn]]
static void usage(const char * executable) {
@@ -1,5 +1,9 @@
## Overview

+> [!IMPORTANT]
+> This example and the RPC backend are currently in a proof-of-concept development stage. As such, the functionality is fragile and
+> insecure. **Never run the RPC server on an open network or in a sensitive environment!**
+
The `rpc-server` allows running `ggml` backend on a remote host.
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them.
This can be used for distributed LLM inference with `llama.cpp` in the following way:
@@ -16,7 +16,7 @@
#include <stdio.h>

struct rpc_server_params {
-    std::string host        = "0.0.0.0";
+    std::string host        = "127.0.0.1";
    int         port        = 50052;
    size_t      backend_mem = 0;
};
@@ -114,6 +114,17 @@ int main(int argc, char * argv[]) {
        fprintf(stderr, "Invalid parameters\n");
        return 1;
    }
+
+    if (params.host != "127.0.0.1") {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+        fprintf(stderr, "WARNING: Host ('%s') is != '127.0.0.1'\n", params.host.c_str());
+        fprintf(stderr, "         Never expose the RPC server to an open network!\n");
+        fprintf(stderr, "         This is an experimental feature and is not secure!\n");
+        fprintf(stderr, "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
+        fprintf(stderr, "\n");
+    }
+
    ggml_backend_t backend = create_backend();
    if (!backend) {
        fprintf(stderr, "Failed to create backend\n");
@@ -207,41 +207,6 @@ model:
  -hff, --hf-file FILE              Hugging Face model file (default: unused)
  -hft, --hf-token TOKEN            Hugging Face access token (default: value from HF_TOKEN environment variable)

-retrieval:
-
-  --context-file FNAME              file to load context from (repeat to specify multiple files)
-  --chunk-size N                    minimum length of embedded text chunks (default: 64)
-  --chunk-separator STRING          separator between chunks (default: '
-                                    ')
-
-passkey:
-
-  --junk N                          number of times to repeat the junk text (default: 250)
-  --pos N                           position of the passkey in the junk text (default: -1)
-
-imatrix:
-
-  -o, --output FNAME                output file (default: 'imatrix.dat')
-  --output-frequency N              output the imatrix every N iterations (default: 10)
-  --save-frequency N                save an imatrix copy every N iterations (default: 0)
-  --process-output                  collect data for the output tensor (default: false)
-  --no-ppl                          do not compute perplexity (default: true)
-  --chunk N                         start processing the input from chunk N (default: 0)
-
-bench:
-
-  -pps                              is the prompt shared across parallel sequences (default: false)
-  -npp n0,n1,...                    number of prompt tokens
-  -ntg n0,n1,...                    number of text generation tokens
-  -npl n0,n1,...                    number of parallel prompts
-
-embedding:
-
-  --embd-normalize                  normalisation for embendings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
-  --embd-output-format              empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
-  --embd-separator                  separator of embendings (default \n) for example "<#sep#>"
-
server:

  --host HOST                       ip address to listen (default: 127.0.0.1)
@@ -267,7 +232,8 @@ server:
                                    https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
  -sps, --slot-prompt-similarity SIMILARITY
                                    how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)
+  --lora-init-without-apply        load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled)

logging:
@@ -279,15 +245,6 @@ logging:
  --log-file FNAME                  Specify a log filename (without extension)
  --log-new                         Create a separate new log file on start. Each log file will have unique name: "<name>.<ID>.log"
  --log-append                      Don't truncate the old log file.
-
-cvector:
-
-  -o, --output FNAME                output file (default: 'control_vector.gguf')
-  --positive-file FNAME             positive prompts file, one prompt per line (default: 'examples/cvector-generator/positive.txt')
-  --negative-file FNAME             negative prompts file, one prompt per line (default: 'examples/cvector-generator/negative.txt')
-  --pca-batch N                     batch size used for PCA. Larger batch runs faster, but uses more memory (default: 100)
-  --pca-iter N                      number of iterations used for PCA (default: 1000)
-  --method {pca,mean}               dimensionality reduction method to be used (default: pca)
```
@@ -411,7 +368,8 @@ node index.js

## API Endpoints

-- **GET** `/health`: Returns the current state of the server:
+### GET `/health`: Returns the current state of the server
+
  - 503 -> `{"status": "loading model"}` if the model is still being loaded.
  - 500 -> `{"status": "error"}` if the model failed to load.
  - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.

@@ -420,7 +378,7 @@ node index.js

If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set.

-- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
+### POST `/completion`: Given a `prompt`, it returns the predicted completion.

*Options:*

@@ -498,7 +456,7 @@ node index.js

`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.

-### Result JSON
+**Response format**

- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion.

@@ -537,7 +495,7 @@ Notice that each `probs` is an array of length `n_probs`.
- `tokens_evaluated`: Number of tokens evaluated in total from the prompt
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)

-- **POST** `/tokenize`: Tokenize a given text.
+### POST `/tokenize`: Tokenize a given text

*Options:*

@@ -545,13 +503,15 @@ Notice that each `probs` is an array of length `n_probs`.

`add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`

-- **POST** `/detokenize`: Convert tokens to text.
+### POST `/detokenize`: Convert tokens to text

*Options:*

`tokens`: Set the tokens to detokenize.

-- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
+### POST `/embedding`: Generate embedding of a given text
+
+The same as [the embedding example](../embedding) does.

*Options:*

@@ -559,7 +519,9 @@ Notice that each `probs` is an array of length `n_probs`.

`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.

-- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
+### POST `/infill`: For code infilling.
+
+Takes a prefix and a suffix and returns the predicted completion as stream.

*Options:*

@@ -571,7 +533,7 @@ Notice that each `probs` is an array of length `n_probs`.

- **GET** `/props`: Return current server settings.

-### Result JSON
+**Response format**

```json
{
@@ -589,7 +551,9 @@ Notice that each `probs` is an array of length `n_probs`.
- `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
- `chat_template` - the model's original Jinja2 prompt template

-- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
+### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
+
+Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.

*Options:*

@@ -641,7 +605,7 @@ Notice that each `probs` is an array of length `n_probs`.
}'
```

-- **POST** `/v1/embeddings`: OpenAI-compatible embeddings API.
+### POST `/v1/embeddings`: OpenAI-compatible embeddings API

*Options:*

@@ -675,9 +639,9 @@ Notice that each `probs` is an array of length `n_probs`.
}'
```

-- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.
+### GET `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.

-### Result JSON
+**Response format**

```json
[

@@ -738,7 +702,7 @@ Notice that each `probs` is an array of length `n_probs`.
]
```

-- **GET** `/metrics`: [Prometheus](https://prometheus.io/) compatible metrics exporter endpoint if `--metrics` is enabled:
+### GET `/metrics`: Prometheus compatible metrics exporter endpoint if `--metrics` is enabled:

Available metrics:
- `llamacpp:prompt_tokens_total`: Number of prompt tokens processed.

@@ -750,13 +714,13 @@ Available metrics:
- `llamacpp:requests_processing`: Number of requests processing.
- `llamacpp:requests_deferred`: Number of requests deferred.

-- **POST** `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.
+### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.

*Options:*

`filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter.

-### Result JSON
+**Response format**

```json
{

@@ -770,13 +734,13 @@ Available metrics:
}
```

-- **POST** `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file.
+### POST `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file.

*Options:*

`filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter.

-### Result JSON
+**Response format**

```json
{
@@ -790,9 +754,9 @@ Available metrics:
}
```

-- **POST** `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot.
+### POST `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot.

-### Result JSON
+**Response format**

```json
{

@@ -801,6 +765,42 @@ Available metrics:
}
```

+### GET `/lora-adapters`: Get list of all LoRA adapters
+
+If an adapter is disabled, the scale will be set to 0.
+
+**Response format**
+
+```json
+[
+  {
+    "id": 0,
+    "path": "my_adapter_1.gguf",
+    "scale": 0.0
+  },
+  {
+    "id": 1,
+    "path": "my_adapter_2.gguf",
+    "scale": 0.0
+  }
+]
+```
+
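The GET endpoint above and the POST endpoint documented next can be exercised with a few lines of Python (an editor's sketch, not part of the diff; it assumes a server started with at least one `--lora` adapter and listening on the default `127.0.0.1:8080`):

```python
import requests

base = "http://127.0.0.1:8080"

# GET /lora-adapters: list adapters and their current scales
for adapter in requests.get(f"{base}/lora-adapters").json():
    print(adapter["id"], adapter["path"], adapter["scale"])

# POST /lora-adapters: enable adapter 0 at full strength
# (adapters not listed in the request are reset to scale 0)
print(requests.post(f"{base}/lora-adapters", json=[{"id": 0, "scale": 1.0}]).json())
```
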
+### POST `/lora-adapters`: Set list of LoRA adapters
+
+To disable an adapter, either remove it from the list below, or set scale to 0.
+
+**Request format**
+
+To know the `id` of the adapter, use GET `/lora-adapters`
+
+```json
+[
+  {"id": 0, "scale": 0.2},
+  {"id": 1, "scale": 0.8}
+]
+```
+
## More examples

### Change system prompt on runtime
|
@@ -79,6 +79,7 @@ enum server_task_type {
    SERVER_TASK_TYPE_SLOT_SAVE,
    SERVER_TASK_TYPE_SLOT_RESTORE,
    SERVER_TASK_TYPE_SLOT_ERASE,
+    SERVER_TASK_TYPE_SET_LORA,
};

struct server_task {
@@ -623,6 +624,7 @@ struct server_response {
struct server_context {
    llama_model * model = nullptr;
    llama_context * ctx = nullptr;
+    std::vector<llama_lora_adapter_container> lora_adapters;

    gpt_params params;
@@ -682,6 +684,7 @@ struct server_context {

        model = llama_init.model;
        ctx   = llama_init.context;
+        lora_adapters = llama_init.lora_adapters;
        params.n_parallel -= 1; // but be sneaky about it
        if (model == nullptr) {
            LOG_ERROR("unable to load model", {{"model", params.model}});
@@ -973,6 +976,8 @@ struct server_context {
            (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) ||
            (prompt->is_array() && !prompt->empty() && prompt->at(0).is_number_integer())) {
            slot.prompt = *prompt;
+        } else if (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_array()) {
+            slot.prompt = prompt->at(0);
        } else {
            send_error(task, "\"prompt\" must be a string or an array of integers", ERROR_TYPE_INVALID_REQUEST);
            return false;
@@ -1851,6 +1856,14 @@ struct server_context {
                    };
                    queue_results.send(result);
                } break;
+            case SERVER_TASK_TYPE_SET_LORA:
+                {
+                    llama_lora_adapters_apply(ctx, lora_adapters);
+                    server_task_result result;
+                    result.id = task.id;
+                    result.data = json{{ "success", true }};
+                    queue_results.send(result);
+                } break;
        }
    }

@@ -3329,6 +3342,55 @@ int main(int argc, char ** argv) {
        return res.set_content(root.dump(), "application/json; charset=utf-8");
    };

+    const auto handle_lora_adapters_list = [&](const httplib::Request & req, httplib::Response & res) {
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+        json result = json::array();
+        for (size_t i = 0; i < ctx_server.lora_adapters.size(); ++i) {
+            auto & la = ctx_server.lora_adapters[i];
+            result.push_back({
+                {"id", i},
+                {"path", la.path},
+                {"scale", la.scale},
+            });
+        }
+        res.set_content(result.dump(), "application/json");
+        res.status = 200; // HTTP OK
+    };
+
+    const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) {
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+
+        const std::vector<json> body = json::parse(req.body);
+        int max_idx = ctx_server.lora_adapters.size();
+
+        // clear existing value
+        for (auto & la : ctx_server.lora_adapters) {
+            la.scale = 0.0f;
+        }
+
+        // set value
+        for (auto entry : body) {
+            int id = entry.at("id");
+            float scale = entry.at("scale");
+            if (0 <= id && id < max_idx) {
+                ctx_server.lora_adapters[id].scale = scale;
+            } else {
+                throw std::runtime_error("invalid adapter id");
+            }
+        }
+
+        server_task task;
+        task.type = SERVER_TASK_TYPE_SET_LORA;
+        const int id_task = ctx_server.queue_tasks.post(task);
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        res.set_content(result.data.dump(), "application/json");
+        res.status = 200; // HTTP OK
+    };
+
    auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) {
        return [content, len, mime_type](const httplib::Request &, httplib::Response & res) {
            res.set_content(reinterpret_cast<const char*>(content), len, mime_type);
@@ -3367,7 +3429,6 @@ int main(int argc, char ** argv) {

    // register API routes
    svr->Get ("/health", handle_health);
-    svr->Get ("/slots", handle_slots);
    svr->Get ("/metrics", handle_metrics);
    svr->Get ("/props", handle_props);
    svr->Get ("/v1/models", handle_models);
@@ -3382,6 +3443,11 @@ int main(int argc, char ** argv) {
    svr->Post("/v1/embeddings", handle_embeddings);
    svr->Post("/tokenize", handle_tokenize);
    svr->Post("/detokenize", handle_detokenize);
+    // LoRA adapters hotswap
+    svr->Get ("/lora-adapters", handle_lora_adapters_list);
+    svr->Post("/lora-adapters", handle_lora_adapters_apply);
+    // Save & load slots
+    svr->Get ("/slots", handle_slots);
    if (!params.slot_save_path.empty()) {
        // only enable slot endpoints if slot_save_path is set
        svr->Post("/slots/:id_slot", handle_slots_action);
36 examples/server/tests/features/lora.feature Normal file

@@ -0,0 +1,36 @@
@llama.cpp
@lora
Feature: llama.cpp server

  Background: Server startup
    Given a server listening on localhost:8080
    And   a model url https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/stories15M_MOE-F16.gguf
    And   a model file stories15M_MOE-F16.gguf
    And   a model alias stories15M_MOE
    And   a lora adapter file from https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf
    And   42 as server seed
    And   1024 as batch size
    And   1024 as ubatch size
    And   2048 KV cache size
    And   64 max tokens to predict
    And   0.0 temperature
    Then  the server is starting
    Then  the server is healthy

  Scenario: Completion LoRA disabled
    Given switch off lora adapter 0
    Given a prompt:
    """
    Look in thy glass
    """
    And   a completion request with no api error
    Then  64 tokens are predicted matching little|girl|three|years|old

  Scenario: Completion LoRA enabled
    Given switch on lora adapter 0
    Given a prompt:
    """
    Look in thy glass
    """
    And   a completion request with no api error
    Then  64 tokens are predicted matching eye|love|glass|sun
@@ -7,6 +7,7 @@ import subprocess
import sys
import threading
import time
+import requests
from collections.abc import Sequence
from contextlib import closing
from re import RegexFlag
@@ -70,6 +71,7 @@ def step_server_config(context, server_fqdn: str, server_port: str):
    context.user_api_key = None
    context.response_format = None
    context.temperature = None
+    context.lora_file = None

    context.tasks_result = []
    context.concurrent_tasks = []
@@ -82,6 +84,12 @@ def step_download_hf_model(context, hf_file: str, hf_repo: str):
    context.model_hf_file = hf_file
    context.model_file = os.path.basename(hf_file)

+@step('a lora adapter file from {lora_file_url}')
+def step_download_lora_file(context, lora_file_url: str):
+    file_name = lora_file_url.split('/').pop()
+    context.lora_file = f'../../../{file_name}'
+    with open(context.lora_file, 'wb') as f:
+        f.write(requests.get(lora_file_url).content)

@step('a model file {model_file}')
def step_model_file(context, model_file: str):
@@ -849,6 +857,17 @@ async def step_erase_slot(context, slot_id):
    context.response = response


+@step('switch {on_or_off} lora adapter {lora_id:d}')
+@async_run_until_complete
+async def toggle_lora_adapter(context, on_or_off: str, lora_id: int):
+    async with aiohttp.ClientSession() as session:
+        async with session.post(f'{context.base_url}/lora-adapters',
+                                json=[{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}],
+                                headers={"Content-Type": "application/json"}) as response:
+            context.response = response
+            print([{'id': lora_id, 'scale': 1 if on_or_off == 'on' else 0}])
+
+
@step('the server responds with status code {status_code:d}')
def step_server_responds_with_status_code(context, status_code):
    assert context.response.status == status_code
@@ -1326,6 +1345,8 @@ def start_server_background(context):
        server_args.extend(['--grp-attn-w', context.n_ga_w])
    if context.debug:
        server_args.append('--verbose')
+    if context.lora_file:
+        server_args.extend(['--lora', context.lora_file])
    if 'SERVER_LOG_FORMAT_JSON' not in os.environ:
        server_args.extend(['--log-format', "text"])
@@ -4,3 +4,4 @@ huggingface_hub~=0.20.3
numpy~=1.26.4
openai~=1.30.3
prometheus-client~=0.20.0
+requests~=2.32.3
@@ -3,7 +3,7 @@
The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt.

```bash
-./simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is"
+./llama-simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is"

...
@@ -12,9 +12,9 @@ This example program provides the tools for llama.cpp for SYCL on Intel GPU.

List all SYCL devices with ID, compute capability, max work group size, ect.

-1. Build the llama.cpp for SYCL for all targets.
+1. Build the llama.cpp for SYCL for the specified target *(using GGML_SYCL_TARGET)*.

-2. Enable oneAPI running environment
+2. Enable oneAPI running environment *(if GGML_SYCL_TARGET is set to INTEL -default-)*

```
source /opt/intel/oneapi/setvars.sh

@@ -29,19 +29,13 @@ source /opt/intel/oneapi/setvars.sh
Check the ID in startup log, like:

```
-found 4 SYCL devices:
-  Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
-    max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
-  Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
-    max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
-  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
-    max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
-  Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
-    max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
+found 2 SYCL devices:
+| | | | |Max | |Max |Global | |
+| | | | |compute|Max work|sub |mem | |
+|ID| Device Type| Name|Version|units |group |group|size | Driver version|
+|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|
+| 0| [level_zero:gpu:0]| Intel Arc A770 Graphics| 1.3| 512| 1024| 32| 16225M| 1.3.29138|
+| 1| [level_zero:gpu:1]| Intel UHD Graphics 750| 1.3| 32| 512| 32| 62631M| 1.3.29138|
```

-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero running time, recommended |
-|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
@@ -50,6 +50,8 @@ GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void

GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);

+GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
+
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);

// helper to check if the device supports a specific family
@@ -16,6 +16,8 @@

#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Woverlength-strings"
+#elif defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#define UNUSED GGML_UNUSED
@@ -351,15 +351,10 @@ void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t b
    }

    // an async copy would normally happen after all the queued operations on both backends are completed
-    // sync src, set_async dst
-    if (ggml_backend_buffer_is_host(src->buffer)) {
-        ggml_backend_synchronize(backend_src);
-        ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
-    } else {
-        ggml_backend_synchronize(backend_src);
-        ggml_backend_tensor_copy(src, dst);
-        ggml_backend_synchronize(backend_dst);
-    }
+    // to simulate the same behavior, we need to synchronize both backends first, and do a blocking copy
+    ggml_backend_synchronize(backend_src);
+    ggml_backend_synchronize(backend_dst);
+    ggml_backend_tensor_copy(src, dst);
}

// events
@@ -1782,7 +1777,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                } else {
                    ggml_backend_synchronize(split_backend);
                }
-                ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
+                // try async copy, but if not possible, we can still use a sync copy without synchronizing the dst backend, since we handle the synchronization here with multiple copies and events
+                // TODO: add public function to facilitate this, since applications do not have direct access to the backend interface
+                if (!split_backend->iface.cpy_tensor_async || !split_backend->iface.cpy_tensor_async(input_backend, split_backend, input, input_cpy)) {
+                    ggml_backend_synchronize(input_backend);
+                    if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
+                        ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
+                    } else {
+                        ggml_backend_synchronize(split_backend);
+                    }
+                    ggml_backend_tensor_copy(input, input_cpy);
+                }
            }
        }
@@ -1501,7 +1501,7 @@ static void ggml_cuda_op_mul_mat(
    }

    // If src0 is on a temporary compute buffers (partial offloading) there may be some padding that needs to be cleared:
-    if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
+    if (ne00 % MATRIX_ROW_PADDING != 0 && ggml_is_quantized(src0->type) && ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE && src0->view_src == nullptr) {
        const int64_t nbytes_data    = ggml_row_size(src0->type, (dev[id].row_high - dev[id].row_low)*ne00);
        const int64_t nbytes_padding = ggml_row_size(src0->type, MATRIX_ROW_PADDING - ne00 % MATRIX_ROW_PADDING);
        CUDA_CHECK(cudaMemsetAsync(dev[id].src0_dd + nbytes_data , 0, nbytes_padding, stream));
@@ -2362,33 +2362,35 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
}

GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_backend_is_cuda(backend_src) || ggml_backend_is_cuda(backend_dst));
-
    ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
    ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;

-    if (!ggml_backend_buffer_is_cuda(src->buffer)) {
+    if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) {
        return false;
    }

-    if (!ggml_backend_buffer_is_cuda(dst->buffer)) {
+    if (!ggml_backend_buffer_is_cuda(src->buffer) || !ggml_backend_buffer_is_cuda(dst->buffer)) {
        return false;
    }

-    // device -> device
+    // device -> device copy
    ggml_backend_cuda_context * cuda_ctx_src = (ggml_backend_cuda_context *)backend_src->context;
    ggml_backend_cuda_context * cuda_ctx_dst = (ggml_backend_cuda_context *)backend_dst->context;

-    if (backend_src != backend_dst) {
-        ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
-        ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;
+    ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *)buf_src->context;
+    ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *)buf_dst->context;

-        GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
-        GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);
+    if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
+#ifndef NDEBUG
+        GGML_CUDA_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
+#endif
+        return false;
+    }

+    if (backend_src != backend_dst) {
        // copy on src stream
        if (cuda_ctx_src->device == cuda_ctx_dst->device) {
-            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
+            CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
        } else {
#ifdef GGML_CUDA_NO_PEER_COPY
            return false;
@@ -2397,7 +2399,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_
#endif
        }

-        // record event on src stream
+        // record event on src stream after the copy
        if (!cuda_ctx_src->copy_event) {
            ggml_cuda_set_device(cuda_ctx_src->device);
            CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));

@@ -2409,7 +2411,7 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_
        CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0));
    } else {
        // src and dst are on the same backend
-        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
+        CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream()));
    }
    return true;
}
@@ -2746,11 +2748,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
        case GGML_OP_MUL_MAT_ID:
            {
                struct ggml_tensor * a = op->src[0];
-                if (op->op == GGML_OP_MUL_MAT) {
-                    struct ggml_tensor * b = op->src[1];
-                    if (a->ne[3] != b->ne[3]) {
-                        return false;
-                    }
+                struct ggml_tensor * b = op->src[1];
+                if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
+                    return false;
+                }
+                if (op->op == GGML_OP_MUL_MAT && a->ne[3] != b->ne[3]) {
+                    return false;
                }
                switch (a->type) {
                    case GGML_TYPE_F32:
@@ -2881,7 +2884,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
            return true;
        case GGML_OP_FLASH_ATTN_EXT:
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-            return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
+            return (op->src[0]->ne[0] == 64 && op->src[1]->type == GGML_TYPE_F16) || op->src[0]->ne[0] == 128;
#else
            if (op->src[0]->ne[0] == 128) {
                return true;
@@ -210,7 +210,7 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_COUNT
};

-struct ggml_metal_context {
+struct ggml_backend_metal_context {
    int n_cb;

    id<MTLDevice> device;

@@ -224,6 +224,10 @@ struct ggml_metal_context {
    bool support_simdgroup_mm;

    bool should_capture_next_compute;
+
+    // abort ggml_metal_graph_compute if callback returns true
+    ggml_abort_callback abort_callback;
+    void * abort_callback_data;
};

// MSL code

@@ -289,7 +293,7 @@ static void * ggml_metal_host_malloc(size_t n) {
    return data;
}

-static struct ggml_metal_context * ggml_metal_init(int n_cb) {
+static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
    GGML_METAL_LOG_INFO("%s: allocating\n", __func__);

#if TARGET_OS_OSX && !GGML_METAL_NDEBUG

@@ -306,7 +310,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
    GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);

    // Configure context
-    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+    struct ggml_backend_metal_context * ctx = malloc(sizeof(struct ggml_backend_metal_context));
    ctx->device = device;
    ctx->n_cb   = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
    ctx->queue  = [ctx->device newCommandQueue];

@@ -668,7 +672,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
    return ctx;
}

-static void ggml_metal_free(struct ggml_metal_context * ctx) {
+static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
    GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);

    for (int i = 0; i < GGML_METAL_KERNEL_TYPE_COUNT; ++i) {

@@ -734,7 +738,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs
    return nil;
}

-static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const struct ggml_tensor * op) {
+static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx, const struct ggml_tensor * op) {
    for (size_t i = 0, n = 3; i < n; ++i) {
        if (op->src[i] != NULL && op->src[i]->type == GGML_TYPE_BF16) {
            return false;

@@ -845,7 +849,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
}

static enum ggml_status ggml_metal_graph_compute(
-        struct ggml_metal_context * ctx,
+        struct ggml_backend_metal_context * ctx,
        struct ggml_cgraph * gf) {

    @autoreleasepool {

@@ -878,9 +882,12 @@ static enum ggml_status ggml_metal_graph_compute(
        id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
        command_buffer_builder[cb_idx] = command_buffer;

-        // enqueue the command buffers in order to specify their execution order
-        [command_buffer enqueue];
+        // always enqueue the first two command buffers
+        // enqueue all of the command buffers if we don't need to abort
+        if (cb_idx < 2 || ctx->abort_callback == NULL) {
+            [command_buffer enqueue];
+        }
    }

    const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;

@@ -2827,7 +2834,9 @@ static enum ggml_status ggml_metal_graph_compute(

        [encoder endEncoding];

+        if (cb_idx < 2 || ctx->abort_callback == NULL) {
            [command_buffer commit];
+        }
    });

    // Wait for completion and check status of each command buffer

@@ -2847,6 +2856,23 @@ static enum ggml_status ggml_metal_graph_compute(

            return GGML_STATUS_FAILED;
        }
+
+        id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? command_buffers[i + 1] : nil);
+        if (!next_buffer) {
+            continue;
+        }
+
+        bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
+        if (next_queued) {
+            continue;
+        }
+
+        if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
+            GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i);
+            return GGML_STATUS_ABORTED;
+        }
+
+        [next_buffer commit];
    }

    if (should_capture) {

@@ -3150,7 +3176,7 @@ GGML_CALL static const char * ggml_backend_metal_name(ggml_backend_t backend) {
}

GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) {
-    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+    struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
    ggml_metal_free(ctx);
    free(backend);
}

@@ -3162,13 +3188,13 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffe
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
GGML_CALL static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||||
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context;
|
||||||
|
|
||||||
return ggml_metal_graph_compute(metal_ctx, cgraph);
|
return ggml_metal_graph_compute(metal_ctx, cgraph);
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||||
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
struct ggml_backend_metal_context * metal_ctx = (struct ggml_backend_metal_context *)backend->context;
|
||||||
|
|
||||||
return ggml_metal_supports_op(metal_ctx, op);
|
return ggml_metal_supports_op(metal_ctx, op);
|
||||||
}
|
}
|
||||||
|
@ -3213,9 +3239,9 @@ static ggml_guid_t ggml_backend_metal_guid(void) {
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_t ggml_backend_metal_init(void) {
|
ggml_backend_t ggml_backend_metal_init(void) {
|
||||||
struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
|
struct ggml_backend_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
|
||||||
|
|
||||||
if (ctx == NULL) {
|
if (ctx == NULL) {
|
||||||
|
GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3237,15 +3263,24 @@ bool ggml_backend_is_metal(ggml_backend_t backend) {
|
||||||
void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
|
void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
|
||||||
GGML_ASSERT(ggml_backend_is_metal(backend));
|
GGML_ASSERT(ggml_backend_is_metal(backend));
|
||||||
|
|
||||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
|
||||||
|
|
||||||
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
|
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data) {
|
||||||
|
GGML_ASSERT(ggml_backend_is_metal(backend));
|
||||||
|
|
||||||
|
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
|
||||||
|
|
||||||
|
ctx->abort_callback = abort_callback;
|
||||||
|
ctx->abort_callback_data = user_data;
|
||||||
|
}
|
||||||
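The Metal hunks above add an abort hook to graph compute: when a callback is registered, only the first two command buffers are enqueued and committed eagerly, and the wait loop consults the callback before committing each remaining buffer, returning GGML_STATUS_ABORTED if it fires. A minimal caller-side sketch in C++ (the deadline struct and its field names are illustrative; only ggml_backend_metal_init and ggml_backend_metal_set_abort_callback come from this patch):

#include <chrono>
#include "ggml-metal.h"

// Hypothetical caller state: abort graph compute once a wall-clock deadline passes.
struct compute_deadline {
    std::chrono::steady_clock::time_point t_end;
};

static bool deadline_abort_cb(void * user_data) {
    const auto * dl = static_cast<const compute_deadline *>(user_data);
    // returning true tells ggml_metal_graph_compute to stop committing further command buffers
    return std::chrono::steady_clock::now() > dl->t_end;
}

// during setup:
//   ggml_backend_t backend = ggml_backend_metal_init();
//   static compute_deadline dl = { std::chrono::steady_clock::now() + std::chrono::seconds(30) };
//   ggml_backend_metal_set_abort_callback(backend, deadline_abort_cb, &dl);
//   // a later graph compute on this backend may now return GGML_STATUS_ABORTED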
|
|
||||||
bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
|
bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
|
||||||
GGML_ASSERT(ggml_backend_is_metal(backend));
|
GGML_ASSERT(ggml_backend_is_metal(backend));
|
||||||
|
|
||||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
|
||||||
|
|
||||||
return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
|
return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
|
||||||
}
|
}
|
||||||
|
@ -3253,7 +3288,7 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
|
||||||
void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
|
void ggml_backend_metal_capture_next_compute(ggml_backend_t backend) {
|
||||||
GGML_ASSERT(ggml_backend_is_metal(backend));
|
GGML_ASSERT(ggml_backend_is_metal(backend));
|
||||||
|
|
||||||
struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
|
struct ggml_backend_metal_context * ctx = (struct ggml_backend_metal_context *)backend->context;
|
||||||
ctx->should_capture_next_compute = true;
|
ctx->should_capture_next_compute = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -197,6 +197,10 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
|
||||||
fprintf(stderr, "Failed to set SO_REUSEADDR\n");
|
fprintf(stderr, "Failed to set SO_REUSEADDR\n");
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
if (inet_addr(host) == INADDR_NONE) {
|
||||||
|
fprintf(stderr, "Invalid host address: %s\n", host);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
struct sockaddr_in serv_addr;
|
struct sockaddr_in serv_addr;
|
||||||
serv_addr.sin_family = AF_INET;
|
serv_addr.sin_family = AF_INET;
|
||||||
serv_addr.sin_addr.s_addr = inet_addr(host);
|
serv_addr.sin_addr.s_addr = inet_addr(host);
|
||||||
|
@ -879,6 +883,14 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp
|
||||||
if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
|
if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// require that the tensor data does not go beyond the buffer end
|
||||||
|
uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
|
||||||
|
uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
|
||||||
|
uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
|
||||||
|
GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
|
||||||
|
GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
|
||||||
|
|
||||||
result->op = (ggml_op) tensor->op;
|
result->op = (ggml_op) tensor->op;
|
||||||
for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
|
for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
|
||||||
result->op_params[i] = tensor->op_params[i];
|
result->op_params[i] = tensor->op_params[i];
|
||||||
|
@ -898,7 +910,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
||||||
const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
|
const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
|
||||||
uint64_t offset;
|
uint64_t offset;
|
||||||
memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
|
memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
|
||||||
size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset);
|
const size_t size = input.size() - sizeof(rpc_tensor) - sizeof(offset);
|
||||||
|
|
||||||
struct ggml_init_params params {
|
struct ggml_init_params params {
|
||||||
/*.mem_size =*/ ggml_tensor_overhead(),
|
/*.mem_size =*/ ggml_tensor_overhead(),
|
||||||
|
@ -913,6 +925,17 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
|
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
|
||||||
|
|
||||||
|
// sanitize tensor->data
|
||||||
|
{
|
||||||
|
const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
|
||||||
|
const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
|
||||||
|
|
||||||
|
if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
|
||||||
|
GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset);
|
const void * data = input.data() + sizeof(rpc_tensor) + sizeof(offset);
|
||||||
ggml_backend_tensor_set(tensor, data, offset, size);
|
ggml_backend_tensor_set(tensor, data, offset, size);
|
||||||
ggml_free(ctx);
|
ggml_free(ctx);
|
||||||
|
@ -943,6 +966,17 @@ bool rpc_server::get_tensor(const std::vector<uint8_t> & input, std::vector<uint
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
|
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
|
||||||
|
|
||||||
|
// sanitize tensor->data
|
||||||
|
{
|
||||||
|
const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
|
||||||
|
const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);
|
||||||
|
|
||||||
|
if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
|
||||||
|
GGML_ABORT("[%s] tensor->data out of bounds\n", __func__);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// output serialization format: | data (size bytes) |
|
// output serialization format: | data (size bytes) |
|
||||||
output.resize(size, 0);
|
output.resize(size, 0);
|
||||||
ggml_backend_tensor_get(tensor, output.data(), offset, size);
|
ggml_backend_tensor_get(tensor, output.data(), offset, size);
|
||||||
|
|
|
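The RPC server hunks above share one theme: validate what arrives over the wire before touching memory. The host string is checked with inet_addr, deserialize_tensor asserts that the tensor's data range stays inside its backing buffer, and set_tensor/get_tensor abort on an out-of-range data + offset/size. A stand-alone C++ sketch of that range check, with illustrative names and the same unsigned arithmetic as the patch:

#include <cstdint>

// true if [data + offset, data + offset + size) lies inside [base, base + buf_size)
// mirrors the sanitization added to rpc_server::set_tensor / get_tensor; the names here are illustrative
static bool range_in_buffer(uint64_t data, uint64_t offset, uint64_t size,
                            uint64_t base, uint64_t buf_size) {
    if (data + offset < data) {
        return false;                             // overflow in data + offset
    }
    const uint64_t start = data + offset;
    if (start < base || start >= base + buf_size) {
        return false;                             // start falls outside the buffer
    }
    return size <= (base + buf_size) - start;     // end stays inside the buffer
}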
@ -874,7 +874,7 @@ namespace dpct
|
||||||
inline std::string get_preferred_gpu_platform_name() {
|
inline std::string get_preferred_gpu_platform_name() {
|
||||||
std::string result;
|
std::string result;
|
||||||
|
|
||||||
std::string filter = "level-zero";
|
std::string filter = "";
|
||||||
char* env = getenv("ONEAPI_DEVICE_SELECTOR");
|
char* env = getenv("ONEAPI_DEVICE_SELECTOR");
|
||||||
if (env) {
|
if (env) {
|
||||||
if (std::strstr(env, "level_zero")) {
|
if (std::strstr(env, "level_zero")) {
|
||||||
|
@ -892,11 +892,24 @@ namespace dpct
|
||||||
else {
|
else {
|
||||||
throw std::runtime_error("invalid device filter: " + std::string(env));
|
throw std::runtime_error("invalid device filter: " + std::string(env));
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
auto default_device = sycl::device(sycl::default_selector_v);
|
||||||
|
auto default_platform_name = default_device.get_platform().get_info<sycl::info::platform::name>();
|
||||||
|
|
||||||
|
if (std::strstr(default_platform_name.c_str(), "Level-Zero") || default_device.is_cpu()) {
|
||||||
|
filter = "level-zero";
|
||||||
|
}
|
||||||
|
else if (std::strstr(default_platform_name.c_str(), "CUDA")) {
|
||||||
|
filter = "cuda";
|
||||||
|
}
|
||||||
|
else if (std::strstr(default_platform_name.c_str(), "HIP")) {
|
||||||
|
filter = "hip";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
auto plaform_list = sycl::platform::get_platforms();
|
auto platform_list = sycl::platform::get_platforms();
|
||||||
|
|
||||||
for (const auto& platform : plaform_list) {
|
for (const auto& platform : platform_list) {
|
||||||
auto devices = platform.get_devices();
|
auto devices = platform.get_devices();
|
||||||
auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
|
auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
|
||||||
return d.is_gpu();
|
return d.is_gpu();
|
||||||
|
|
|
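Where the SYCL helper used to hard-code a "level-zero" platform filter, the hunk above starts with an empty filter and, if ONEAPI_DEVICE_SELECTOR is not set, infers one from the platform of the SYCL default device. A compressed sketch of that fallback, using only the SYCL calls that appear in the patch:

#include <sycl/sycl.hpp>
#include <cstring>
#include <string>

// derive a platform-name filter from the default SYCL device when no explicit selector is given
static std::string default_platform_filter() {
    sycl::device dev(sycl::default_selector_v);
    const std::string name = dev.get_platform().get_info<sycl::info::platform::name>();
    if (std::strstr(name.c_str(), "Level-Zero") || dev.is_cpu()) return "level-zero";
    if (std::strstr(name.c_str(), "CUDA"))                       return "cuda";
    if (std::strstr(name.c_str(), "HIP"))                        return "hip";
    return "";   // no preference: the platform loop shown above then falls back to the first platform with a GPU
}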
@ -2108,9 +2108,9 @@ void ggml_vk_instance_init() {
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
||||||
GGML_ASSERT(idx < vk_instance.device_indices.size());
|
|
||||||
VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << idx << ")");
|
VK_LOG_DEBUG("ggml_vk_init(" << ctx->name << ", " << idx << ")");
|
||||||
ggml_vk_instance_init();
|
ggml_vk_instance_init();
|
||||||
|
GGML_ASSERT(idx < vk_instance.device_indices.size());
|
||||||
|
|
||||||
ctx->name = GGML_VK_NAME + std::to_string(idx);
|
ctx->name = GGML_VK_NAME + std::to_string(idx);
|
||||||
|
|
||||||
|
|
|
@ -56,6 +56,9 @@ int ggml_sve_cnt_b = 0;
|
||||||
// disable POSIX deprecation warnings
|
// disable POSIX deprecation warnings
|
||||||
// these functions are never going away, anyway
|
// these functions are never going away, anyway
|
||||||
#pragma warning(disable: 4996)
|
#pragma warning(disable: 4996)
|
||||||
|
|
||||||
|
// unreachable code because of multiple instances of code after GGML_ABORT
|
||||||
|
#pragma warning(disable: 4702)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(_WIN32)
|
#if defined(_WIN32)
|
||||||
|
@ -3745,7 +3748,8 @@ static struct ggml_tensor * ggml_new_tensor_impl(
|
||||||
struct ggml_tensor * view_src,
|
struct ggml_tensor * view_src,
|
||||||
size_t view_offs) {
|
size_t view_offs) {
|
||||||
|
|
||||||
assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
|
GGML_ASSERT(type >= 0 && type < GGML_TYPE_COUNT);
|
||||||
|
GGML_ASSERT(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
|
||||||
|
|
||||||
// find the base tensor and absolute offset
|
// find the base tensor and absolute offset
|
||||||
if (view_src != NULL && view_src->view_src != NULL) {
|
if (view_src != NULL && view_src->view_src != NULL) {
|
||||||
|
|
|
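The ggml_new_tensor_impl hunk above upgrades a plain assert to GGML_ASSERT and adds a range check on the requested type, so an out-of-range ggml_type (for example from corrupted model metadata) now fails loudly even in release builds, where assert would compile away. A minimal stand-in for that kind of always-on check, assuming nothing beyond the C++ standard library (the real GGML_ASSERT also routes through ggml's own abort handling):

#include <cstdio>
#include <cstdlib>

// Always-on assertion: unlike assert(), this does not disappear when NDEBUG is defined.
#define ALWAYS_ASSERT(x)                                             \
    do {                                                             \
        if (!(x)) {                                                  \
            std::fprintf(stderr, "%s:%d: assertion failed: %s\n",    \
                         __FILE__, __LINE__, #x);                    \
            std::abort();                                            \
        }                                                            \
    } while (0)

// usage, mirroring the new checks:
//   ALWAYS_ASSERT(type >= 0 && type < GGML_TYPE_COUNT);
//   ALWAYS_ASSERT(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);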
@ -1,5 +1,7 @@
|
||||||
|
find_package (Threads REQUIRED)
|
||||||
|
|
||||||
set(TARGET vulkan-shaders-gen)
|
set(TARGET vulkan-shaders-gen)
|
||||||
add_executable(${TARGET} vulkan-shaders-gen.cpp)
|
add_executable(${TARGET} vulkan-shaders-gen.cpp)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
|
target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)
|
||||||
|
|
|
@ -23,6 +23,7 @@
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
#include <windows.h>
|
#include <windows.h>
|
||||||
#include <direct.h> // For _mkdir on Windows
|
#include <direct.h> // For _mkdir on Windows
|
||||||
|
#include <algorithm> // For std::replace on w64devkit
|
||||||
#else
|
#else
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <sys/wait.h>
|
#include <sys/wait.h>
|
||||||
|
|
|
@ -1146,6 +1146,9 @@ class GGMLQuantizationType(IntEnum):
|
||||||
F64 = 28
|
F64 = 28
|
||||||
IQ1_M = 29
|
IQ1_M = 29
|
||||||
BF16 = 30
|
BF16 = 30
|
||||||
|
Q4_0_4_4 = 31
|
||||||
|
Q4_0_4_8 = 32
|
||||||
|
Q4_0_8_8 = 33
|
||||||
|
|
||||||
|
|
||||||
# TODO: add GGMLFileType from ggml_ftype in ggml.h
|
# TODO: add GGMLFileType from ggml_ftype in ggml.h
|
||||||
|
@ -1158,7 +1161,7 @@ class LlamaFileType(IntEnum):
|
||||||
MOSTLY_F16 = 1 # except 1d tensors
|
MOSTLY_F16 = 1 # except 1d tensors
|
||||||
MOSTLY_Q4_0 = 2 # except 1d tensors
|
MOSTLY_Q4_0 = 2 # except 1d tensors
|
||||||
MOSTLY_Q4_1 = 3 # except 1d tensors
|
MOSTLY_Q4_1 = 3 # except 1d tensors
|
||||||
MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16
|
# MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16
|
||||||
# MOSTLY_Q4_2 = 5 # support has been removed
|
# MOSTLY_Q4_2 = 5 # support has been removed
|
||||||
# MOSTLY_Q4_3 = 6 # support has been removed
|
# MOSTLY_Q4_3 = 6 # support has been removed
|
||||||
MOSTLY_Q8_0 = 7 # except 1d tensors
|
MOSTLY_Q8_0 = 7 # except 1d tensors
|
||||||
|
@ -1187,6 +1190,9 @@ class LlamaFileType(IntEnum):
|
||||||
MOSTLY_IQ4_XS = 30 # except 1d tensors
|
MOSTLY_IQ4_XS = 30 # except 1d tensors
|
||||||
MOSTLY_IQ1_M = 31 # except 1d tensors
|
MOSTLY_IQ1_M = 31 # except 1d tensors
|
||||||
MOSTLY_BF16 = 32 # except 1d tensors
|
MOSTLY_BF16 = 32 # except 1d tensors
|
||||||
|
MOSTLY_Q4_0_4_4 = 33 # except 1d tensors
|
||||||
|
MOSTLY_Q4_0_4_8 = 34 # except 1d tensors
|
||||||
|
MOSTLY_Q4_0_8_8 = 35 # except 1d tensors
|
||||||
|
|
||||||
GUESSED = 1024 # not specified in the model file
|
GUESSED = 1024 # not specified in the model file
|
||||||
|
|
||||||
|
@ -1260,6 +1266,9 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
|
||||||
GGMLQuantizationType.F64: (1, 8),
|
GGMLQuantizationType.F64: (1, 8),
|
||||||
GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
|
GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
|
||||||
GGMLQuantizationType.BF16: (1, 2),
|
GGMLQuantizationType.BF16: (1, 2),
|
||||||
|
GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16),
|
||||||
|
GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16),
|
||||||
|
GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -191,6 +191,8 @@ class LazyBase(ABC, metaclass=LazyMeta):
|
||||||
class LazyNumpyTensor(LazyBase):
|
class LazyNumpyTensor(LazyBase):
|
||||||
_tensor_type = np.ndarray
|
_tensor_type = np.ndarray
|
||||||
|
|
||||||
|
shape: tuple[int, ...] # Makes the type checker happy in quants.py
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
|
def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
|
||||||
# The initial idea was to use np.nan as the fill value,
|
# The initial idea was to use np.nan as the fill value,
|
||||||
|
|
|
@ -174,7 +174,7 @@ class Metadata:
|
||||||
org_component, model_full_name_component = None, model_id
|
org_component, model_full_name_component = None, model_id
|
||||||
|
|
||||||
# Check if we erroneously matched against './' or '../' etc...
|
# Check if we erroneously matched against './' or '../' etc...
|
||||||
if org_component is not None and org_component[0] == '.':
|
if org_component is not None and len(org_component) > 0 and org_component[0] == '.':
|
||||||
org_component = None
|
org_component = None
|
||||||
|
|
||||||
name_parts: list[str] = model_full_name_component.split('-')
|
name_parts: list[str] = model_full_name_component.split('-')
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from typing import Callable, Sequence
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Any, Callable, Sequence
|
||||||
|
|
||||||
from numpy.typing import DTypeLike
|
from numpy.typing import DTypeLike
|
||||||
|
|
||||||
|
@ -9,32 +10,22 @@ from .lazy import LazyNumpyTensor
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
|
def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]:
|
||||||
block_size, type_size = GGML_QUANT_SIZES[quant_type]
|
block_size, type_size = GGML_QUANT_SIZES[quant_type]
|
||||||
if shape[-1] % block_size != 0:
|
if shape[-1] % block_size != 0:
|
||||||
raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
|
raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
|
||||||
return (*shape[:-1], shape[-1] // block_size * type_size)
|
return (*shape[:-1], shape[-1] // block_size * type_size)
|
||||||
|
|
||||||
|
|
||||||
def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
|
def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType) -> tuple[int, ...]:
|
||||||
block_size, type_size = GGML_QUANT_SIZES[quant_type]
|
block_size, type_size = GGML_QUANT_SIZES[quant_type]
|
||||||
if shape[-1] % type_size != 0:
|
if shape[-1] % type_size != 0:
|
||||||
raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
|
raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
|
||||||
return (*shape[:-1], shape[-1] // type_size * block_size)
|
return (*shape[:-1], shape[-1] // type_size * block_size)
|
||||||
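Both helpers above now declare a tuple[int, ...] return type; they convert between element-count and byte-count shapes using the per-type (block_size, type_size) pair from GGML_QUANT_SIZES. The same arithmetic as a small C++ sketch, with Q8_0 as a worked example (32 elements pack into 34 bytes: a 16-bit scale plus 32 int8 values):

#include <cassert>
#include <cstdint>

// bytes needed for one quantized row of n_elems elements
static int64_t row_bytes(int64_t n_elems, int64_t block_size, int64_t type_size) {
    assert(n_elems % block_size == 0);   // the Python helpers raise ValueError in this case
    return n_elems / block_size * type_size;
}

// e.g. Q8_0: block_size = 32, type_size = 34, so a 4096-wide row takes 4096 / 32 * 34 = 4352 bytes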
|
|
||||||
|
|
||||||
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
|
|
||||||
def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
|
|
||||||
n = n.astype(np.float32, copy=False).view(np.uint32)
|
|
||||||
# force nan to quiet
|
|
||||||
n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n)
|
|
||||||
# round to nearest even
|
|
||||||
n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16
|
|
||||||
return n.astype(np.uint16)
|
|
||||||
|
|
||||||
|
|
||||||
# This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
|
# This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
|
||||||
def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
|
def _apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
|
||||||
rows = arr.reshape((-1, arr.shape[-1]))
|
rows = arr.reshape((-1, arr.shape[-1]))
|
||||||
osize = 1
|
osize = 1
|
||||||
for dim in oshape:
|
for dim in oshape:
|
||||||
|
@ -46,27 +37,6 @@ def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.
|
||||||
return out.reshape(oshape)
|
return out.reshape(oshape)
|
||||||
|
|
||||||
|
|
||||||
def __quantize_bf16_array(n: np.ndarray) -> np.ndarray:
|
|
||||||
return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.uint16, oshape=n.shape)
|
|
||||||
|
|
||||||
|
|
||||||
__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.uint16)
|
|
||||||
|
|
||||||
|
|
||||||
def quantize_bf16(n: np.ndarray):
|
|
||||||
if type(n) is LazyNumpyTensor:
|
|
||||||
return __quantize_bf16_lazy(n)
|
|
||||||
else:
|
|
||||||
return __quantize_bf16_array(n)
|
|
||||||
|
|
||||||
|
|
||||||
__q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0]
|
|
||||||
|
|
||||||
|
|
||||||
def can_quantize_to_q8_0(n: np.ndarray) -> bool:
|
|
||||||
return n.shape[-1] % __q8_block_size == 0
|
|
||||||
|
|
||||||
|
|
||||||
# round away from zero
|
# round away from zero
|
||||||
# ref: https://stackoverflow.com/a/59143326/22827863
|
# ref: https://stackoverflow.com/a/59143326/22827863
|
||||||
def np_roundf(n: np.ndarray) -> np.ndarray:
|
def np_roundf(n: np.ndarray) -> np.ndarray:
|
||||||
|
@ -76,18 +46,151 @@ def np_roundf(n: np.ndarray) -> np.ndarray:
|
||||||
return np.sign(n) * b
|
return np.sign(n) * b
|
||||||
|
|
||||||
|
|
||||||
def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]:
|
class QuantError(Exception): ...
|
||||||
return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size)
|
|
||||||
|
|
||||||
|
|
||||||
# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
|
_type_traits: dict[GGMLQuantizationType, type[__Quant]] = {}
|
||||||
def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray:
|
|
||||||
shape = n.shape
|
|
||||||
assert shape[-1] % __q8_block_size == 0
|
|
||||||
|
|
||||||
n_blocks = n.size // __q8_block_size
|
|
||||||
|
|
||||||
blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False)
|
def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
|
||||||
|
if qtype == GGMLQuantizationType.F32:
|
||||||
|
return data.astype(np.float32, copy=False)
|
||||||
|
elif qtype == GGMLQuantizationType.F16:
|
||||||
|
return data.astype(np.float16, copy=False)
|
||||||
|
elif (q := _type_traits.get(qtype)) is not None:
|
||||||
|
return q.quantize(data)
|
||||||
|
else:
|
||||||
|
raise NotImplementedError(f"Quantization for {qtype.name} is not yet implemented")
|
||||||
|
|
||||||
|
|
||||||
|
def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray:
|
||||||
|
if qtype == GGMLQuantizationType.F32 or qtype == GGMLQuantizationType.F16:
|
||||||
|
return data.astype(np.float32, copy=False)
|
||||||
|
elif (q := _type_traits.get(qtype)) is not None:
|
||||||
|
return q.dequantize(data)
|
||||||
|
else:
|
||||||
|
raise NotImplementedError(f"Dequantization for {qtype.name} is not yet implemented")
|
||||||
|
|
||||||
|
|
||||||
|
class __Quant(ABC):
|
||||||
|
qtype: GGMLQuantizationType
|
||||||
|
block_size: int
|
||||||
|
type_size: int
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
return TypeError("Quant conversion classes can't have instances")
|
||||||
|
|
||||||
|
def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None:
|
||||||
|
cls.qtype = qtype
|
||||||
|
cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype]
|
||||||
|
cls.__quantize_lazy = LazyNumpyTensor._wrap_fn(
|
||||||
|
cls.__quantize_array,
|
||||||
|
meta_noop=(np.uint8, cls.__shape_to_bytes)
|
||||||
|
)
|
||||||
|
cls.__dequantize_lazy = LazyNumpyTensor._wrap_fn(
|
||||||
|
cls.__dequantize_array,
|
||||||
|
meta_noop=(np.float32, cls.__shape_from_bytes)
|
||||||
|
)
|
||||||
|
assert qtype not in _type_traits
|
||||||
|
_type_traits[qtype] = cls
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@abstractmethod
|
||||||
|
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@abstractmethod
|
||||||
|
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def quantize_rows(cls, rows: np.ndarray) -> np.ndarray:
|
||||||
|
rows = rows.astype(np.float32, copy=False)
|
||||||
|
shape = rows.shape
|
||||||
|
n_blocks = rows.size // cls.block_size
|
||||||
|
blocks = rows.reshape((n_blocks, cls.block_size))
|
||||||
|
blocks = cls.quantize_blocks(blocks)
|
||||||
|
assert blocks.dtype == np.uint8
|
||||||
|
assert blocks.shape[-1] == cls.type_size
|
||||||
|
return blocks.reshape(cls.__shape_to_bytes(shape))
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def dequantize_rows(cls, rows: np.ndarray) -> np.ndarray:
|
||||||
|
rows = rows.view(np.uint8)
|
||||||
|
shape = rows.shape
|
||||||
|
n_blocks = rows.size // cls.type_size
|
||||||
|
blocks = rows.reshape((n_blocks, cls.type_size))
|
||||||
|
blocks = cls.dequantize_blocks(blocks)
|
||||||
|
assert blocks.dtype == np.float32
|
||||||
|
assert blocks.shape[-1] == cls.block_size
|
||||||
|
return blocks.reshape(cls.__shape_from_bytes(shape))
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def __shape_to_bytes(cls, shape: Sequence[int]):
|
||||||
|
return quant_shape_to_byte_shape(shape, cls.qtype)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def __shape_from_bytes(cls, shape: Sequence[int]):
|
||||||
|
return quant_shape_from_byte_shape(shape, cls.qtype)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def __quantize_array(cls, array: np.ndarray) -> np.ndarray:
|
||||||
|
return _apply_over_grouped_rows(cls.quantize_rows, arr=array, otype=np.uint8, oshape=cls.__shape_to_bytes(array.shape))
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def __dequantize_array(cls, array: np.ndarray) -> np.ndarray:
|
||||||
|
return _apply_over_grouped_rows(cls.dequantize_rows, arr=array, otype=np.float32, oshape=cls.__shape_from_bytes(array.shape))
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def __quantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def __dequantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def can_quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> bool:
|
||||||
|
return tensor.shape[-1] % cls.block_size == 0
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
|
||||||
|
if not cls.can_quantize(tensor):
|
||||||
|
raise QuantError(f"Can't quantize tensor with shape {tensor.shape} to {cls.qtype.name}")
|
||||||
|
if isinstance(tensor, LazyNumpyTensor):
|
||||||
|
return cls.__quantize_lazy(tensor)
|
||||||
|
else:
|
||||||
|
return cls.__quantize_array(tensor)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray:
|
||||||
|
if isinstance(tensor, LazyNumpyTensor):
|
||||||
|
return cls.__dequantize_lazy(tensor)
|
||||||
|
else:
|
||||||
|
return cls.__dequantize_array(tensor)
|
||||||
|
|
||||||
|
|
||||||
|
class BF16(__Quant, qtype=GGMLQuantizationType.BF16):
|
||||||
|
@classmethod
|
||||||
|
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
|
||||||
|
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
||||||
|
n = blocks.view(np.uint32)
|
||||||
|
# force nan to quiet
|
||||||
|
n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n)
|
||||||
|
# round to nearest even
|
||||||
|
n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16
|
||||||
|
return n.astype(np.uint16).view(np.uint8)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
||||||
|
return (blocks.view(np.int16).astype(np.int32) << 16).view(np.float32)
|
||||||
|
|
||||||
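The BF16 class above mirrors ggml_compute_fp32_to_bf16 in NumPy: force NaNs to a quiet NaN, then round to nearest even by adding 0x7fff plus the carry bit before shifting the high half down. The same bit manipulation written out per value in C++, as a sketch rather than a copy of the ggml source:

#include <cstdint>
#include <cstring>

// float32 -> bfloat16 with round-to-nearest-even; NaNs are forced to a quiet NaN
static uint16_t fp32_to_bf16(float x) {
    uint32_t n;
    std::memcpy(&n, &x, sizeof(n));
    if ((n & 0x7fffffffu) > 0x7f800000u) {
        return (uint16_t) ((n >> 16) | 64u);                       // quiet the NaN, keep the sign
    }
    return (uint16_t) ((n + (0x7fffu + ((n >> 16) & 1u))) >> 16);  // round to nearest even
}

// bfloat16 -> float32 is just the 16 stored bits moved back into the high half
static float bf16_to_fp32(uint16_t h) {
    const uint32_t n = (uint32_t) h << 16;
    float x;
    std::memcpy(&x, &n, sizeof(x));
    return x;
}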
|
|
||||||
|
class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0):
|
||||||
|
@classmethod
|
||||||
|
# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
|
||||||
|
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
||||||
|
|
||||||
d = abs(blocks).max(axis=1, keepdims=True) / 127
|
d = abs(blocks).max(axis=1, keepdims=True) / 127
|
||||||
with np.errstate(divide="ignore"):
|
with np.errstate(divide="ignore"):
|
||||||
|
@ -99,23 +202,12 @@ def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray:
|
||||||
# (n_blocks, block_size)
|
# (n_blocks, block_size)
|
||||||
qs = qs.astype(np.int8).view(np.uint8)
|
qs = qs.astype(np.int8).view(np.uint8)
|
||||||
|
|
||||||
assert d.shape[1] + qs.shape[1] == __q8_type_size
|
return np.concatenate([d, qs], axis=1)
|
||||||
|
|
||||||
return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape))
|
@classmethod
|
||||||
|
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
||||||
|
d, x = np.split(blocks, [2], axis=1)
|
||||||
|
d = d.view(np.float16).astype(np.float32)
|
||||||
|
x = x.view(np.int8).astype(np.float32)
|
||||||
|
|
||||||
|
return (x * d)
|
||||||
def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray:
|
|
||||||
return __apply_over_grouped_rows(__quantize_q8_0_rows, arr=n, otype=np.uint8, oshape=__quantize_q8_0_shape_change(n.shape))
|
|
||||||
|
|
||||||
|
|
||||||
__quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn(
|
|
||||||
__quantize_q8_0_array,
|
|
||||||
meta_noop=(np.uint8, __quantize_q8_0_shape_change),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def quantize_q8_0(data: np.ndarray):
|
|
||||||
if type(data) is LazyNumpyTensor:
|
|
||||||
return __quantize_q8_0_lazy(data)
|
|
||||||
else:
|
|
||||||
return __quantize_q8_0_array(data)
|
|
||||||
|
|
|
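For reference, the Q8_0 scheme that the new Q8_0 class implements block-wise with NumPy (d = max|x| / 127, values stored as int8 beside an fp16 scale, rounding half away from zero via np_roundf) looks like this per block in C++. This is a sketch of the format, not a copy of ggml-quants.c, and it keeps the scale as a float instead of the fp16 stored on disk:

#include <algorithm>
#include <cmath>
#include <cstdint>

constexpr int QK8_0 = 32;              // elements per Q8_0 block

struct block_q8_0_sketch {             // illustrative layout; the real block stores d as fp16
    float  d;                          // per-block scale
    int8_t qs[QK8_0];                  // quantized values
};

static block_q8_0_sketch quantize_block_q8_0(const float * x) {
    float amax = 0.0f;
    for (int i = 0; i < QK8_0; ++i) {
        amax = std::max(amax, std::fabs(x[i]));
    }

    block_q8_0_sketch out{};
    out.d = amax / 127.0f;
    const float id = out.d != 0.0f ? 1.0f / out.d : 0.0f;   // avoid the division by zero that NumPy suppresses
    for (int i = 0; i < QK8_0; ++i) {
        out.qs[i] = (int8_t) std::round(x[i] * id);          // round halves away from zero, like np_roundf
    }
    return out;
}

// dequantization is x[i] ≈ d * qs[i], which is what Q8_0.dequantize_blocks does with the stored bytes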
@ -345,7 +345,7 @@ extern "C" {
|
||||||
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
|
||||||
enum llama_ftype ftype; // quantize to this llama_ftype
|
enum llama_ftype ftype; // quantize to this llama_ftype
|
||||||
enum ggml_type output_tensor_type; // output tensor type
|
enum ggml_type output_tensor_type; // output tensor type
|
||||||
enum ggml_type token_embedding_type; // itoken embeddings tensor type
|
enum ggml_type token_embedding_type; // token embeddings tensor type
|
||||||
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
||||||
bool quantize_output_tensor; // quantize output.weight
|
bool quantize_output_tensor; // quantize output.weight
|
||||||
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||||
|
|
|
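The llama.h hunk above only corrects the token_embedding_type comment, but the surrounding struct is the knob set for quantization. A usage sketch, assuming the long-standing llama_model_quantize_default_params()/llama_model_quantize() entry points (they are not part of this diff) and the fields shown above:

#include "llama.h"

// quantize a GGUF to Q8_0 while keeping the output tensor and token embeddings in f16
static int quantize_example(const char * fname_inp, const char * fname_out) {
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype                = LLAMA_FTYPE_MOSTLY_Q8_0;
    params.output_tensor_type   = GGML_TYPE_F16;
    params.token_embedding_type = GGML_TYPE_F16;
    params.nthread              = 0;   // <= 0: let the library pick std::thread::hardware_concurrency()
    return llama_model_quantize(fname_inp, fname_out, &params) == 0 ? 0 : 1;
}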
@ -24,3 +24,18 @@ void llama_log_callback_default(ggml_log_level level, const char * text, void *
|
||||||
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
||||||
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
||||||
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||||
|
|
||||||
|
//
|
||||||
|
// helpers
|
||||||
|
//
|
||||||
|
|
||||||
|
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||||
|
if (search.empty()) {
|
||||||
|
return; // Avoid infinite loop if 'search' is an empty string
|
||||||
|
}
|
||||||
|
size_t pos = 0;
|
||||||
|
while ((pos = s.find(search, pos)) != std::string::npos) {
|
||||||
|
s.replace(pos, search.length(), replace);
|
||||||
|
pos += replace.length();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -17,17 +17,6 @@
|
||||||
// helpers
|
// helpers
|
||||||
//
|
//
|
||||||
|
|
||||||
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
|
||||||
if (search.empty()) {
|
|
||||||
return; // Avoid infinite loop if 'search' is an empty string
|
|
||||||
}
|
|
||||||
size_t pos = 0;
|
|
||||||
while ((pos = s.find(search, pos)) != std::string::npos) {
|
|
||||||
s.replace(pos, search.length(), replace);
|
|
||||||
pos += replace.length();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
LLAMA_ATTRIBUTE_FORMAT(1, 2)
|
LLAMA_ATTRIBUTE_FORMAT(1, 2)
|
||||||
static std::string format(const char * fmt, ...) {
|
static std::string format(const char * fmt, ...) {
|
||||||
va_list ap;
|
va_list ap;
|
||||||
|
|
|
@ -13245,13 +13245,13 @@ struct llm_build_context {
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq_enc, cur);
|
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_enc, cur);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk_enc, cur);
|
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_enc, cur);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv_enc, cur);
|
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_enc, cur);
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||||
|
@ -13285,7 +13285,7 @@ struct llm_build_context {
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, cur);
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
|
||||||
cur = ggml_mul_mat(ctx0, model.layers[il].wo_enc, cur);
|
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_enc, cur);
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -13359,13 +13359,13 @@ struct llm_build_context {
|
||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
|
llm_build_kv_store(ctx0, hparams, cparams, kv_self, gf, Kcur, Vcur, n_tokens, kv_head, cb, il);
|
||||||
|
@ -13412,7 +13412,7 @@ struct llm_build_context {
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, cur);
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
|
||||||
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
|
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur);
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -13429,13 +13429,13 @@ struct llm_build_context {
|
||||||
|
|
||||||
// cross-attention
|
// cross-attention
|
||||||
{
|
{
|
||||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq_cross, cur);
|
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq_cross, cur);
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk_cross, embd_enc);
|
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk_cross, embd_enc);
|
||||||
cb(Kcur, "Kcur", il);
|
cb(Kcur, "Kcur", il);
|
||||||
|
|
||||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv_cross, embd_enc);
|
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv_cross, embd_enc);
|
||||||
cb(Vcur, "Vcur", il);
|
cb(Vcur, "Vcur", il);
|
||||||
|
|
||||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||||
|
@ -13464,7 +13464,7 @@ struct llm_build_context {
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, cur);
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
|
||||||
cur = ggml_mul_mat(ctx0, model.layers[il].wo_cross, cur);
|
cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo_cross, cur);
|
||||||
cb(cur, "kqv_out", il);
|
cb(cur, "kqv_out", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -13521,7 +13521,7 @@ struct llm_build_context {
|
||||||
cb(cur, "result_norm", -1);
|
cb(cur, "result_norm", -1);
|
||||||
|
|
||||||
// lm_head
|
// lm_head
|
||||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||||
cb(cur, "result_output", -1);
|
cb(cur, "result_output", -1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
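The T5 graph hunks above replace every plain ggml_mul_mat projection with llm_build_lora_mm, so runtime LoRA adapters also affect the encoder, the decoder cross-attention, and the lm_head. Conceptually the helper computes W·x plus a scaled low-rank correction per active adapter; a rough sketch in terms of stock ggml ops (the adapter container and field names below are assumptions, not the patch's exact code):

#include <vector>
#include "ggml.h"

// hypothetical per-adapter pair for one weight matrix: y = W x + scale * B (A x)
struct lora_weight_sketch {
    struct ggml_tensor * a;   // down projection (n_embd -> r)
    struct ggml_tensor * b;   // up projection   (r -> n_out)
    float                scale;
};

static struct ggml_tensor * mul_mat_with_lora(
        struct ggml_context * ctx,
        struct ggml_tensor  * w,
        struct ggml_tensor  * cur,
        const std::vector<lora_weight_sketch> & adapters) {
    struct ggml_tensor * res = ggml_mul_mat(ctx, w, cur);
    for (const auto & lw : adapters) {
        struct ggml_tensor * ab = ggml_mul_mat(ctx, lw.b, ggml_mul_mat(ctx, lw.a, cur));
        res = ggml_add(ctx, res, ggml_scale(ctx, ab, lw.scale));
    }
    return res;
}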
@ -15371,7 +15371,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||||
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
|
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
|
||||||
auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
|
auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
|
||||||
if (n_expert > 1) {
|
if (n_expert > 1) {
|
||||||
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
|
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
|
||||||
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
|
// sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
|
||||||
// for getting the current layer as I initially thought, and we need to resort to parsing the
|
// for getting the current layer as I initially thought, and we need to resort to parsing the
|
||||||
// tensor name.
|
// tensor name.
|
||||||
|
@ -17396,6 +17396,7 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
|
||||||
// TODO: replace all non-fatal assertions with returned errors or exceptions
|
// TODO: replace all non-fatal assertions with returned errors or exceptions
|
||||||
struct llama_data_write {
|
struct llama_data_write {
|
||||||
virtual void write(const void * src, size_t size) = 0;
|
virtual void write(const void * src, size_t size) = 0;
|
||||||
|
virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0;
|
||||||
virtual size_t get_size_written() = 0;
|
virtual size_t get_size_written() = 0;
|
||||||
virtual ~llama_data_write() = default;
|
virtual ~llama_data_write() = default;
|
||||||
|
|
||||||
|
@ -17518,9 +17519,8 @@ struct llama_data_write {
|
||||||
// Read each range of cells of k_size length each into tmp_buf and write out
|
// Read each range of cells of k_size length each into tmp_buf and write out
|
||||||
for (const auto & range : cell_ranges) {
|
for (const auto & range : cell_ranges) {
|
||||||
const size_t range_size = range.second - range.first;
|
const size_t range_size = range.second - range.first;
|
||||||
tmp_buf.resize(range_size * k_size_row);
|
const size_t buf_size = range_size * k_size_row;
|
||||||
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
|
write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
|
||||||
write(tmp_buf.data(), tmp_buf.size());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -17539,9 +17539,8 @@ struct llama_data_write {
|
||||||
// Read each range of cells of v_size length each into tmp_buf and write out
|
// Read each range of cells of v_size length each into tmp_buf and write out
|
||||||
for (const auto & range : cell_ranges) {
|
for (const auto & range : cell_ranges) {
|
||||||
const size_t range_size = range.second - range.first;
|
const size_t range_size = range.second - range.first;
|
||||||
tmp_buf.resize(range_size * v_size_row);
|
const size_t buf_size = range_size * v_size_row;
|
||||||
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
|
write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
|
||||||
write(tmp_buf.data(), tmp_buf.size());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -17567,9 +17566,8 @@ struct llama_data_write {
|
||||||
for (const auto & range : cell_ranges) {
|
for (const auto & range : cell_ranges) {
|
||||||
const size_t range_size = range.second - range.first;
|
const size_t range_size = range.second - range.first;
|
||||||
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
|
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
|
||||||
tmp_buf.resize(range_size * v_size_el);
|
const size_t buf_size = range_size * v_size_el;
|
||||||
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
|
write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
|
||||||
write(tmp_buf.data(), tmp_buf.size());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -17928,12 +17926,14 @@ struct llama_data_write_dummy : llama_data_write {
|
||||||
|
|
||||||
llama_data_write_dummy() {}
|
llama_data_write_dummy() {}
|
||||||
|
|
||||||
// TODO: avoid unnecessary calls to ggml_backend_tensor_get in a dummy context
|
|
||||||
|
|
||||||
void write(const void * /* src */, size_t size) override {
|
void write(const void * /* src */, size_t size) override {
|
||||||
size_written += size;
|
size_written += size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
|
||||||
|
size_written += size;
|
||||||
|
}
|
||||||
|
|
||||||
size_t get_size_written() override {
|
size_t get_size_written() override {
|
||||||
return size_written;
|
return size_written;
|
||||||
}
|
}
|
||||||
|
@ -17956,6 +17956,16 @@ struct llama_data_write_buffer : llama_data_write {
|
||||||
buf_size -= size;
|
buf_size -= size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
|
||||||
|
if (size > buf_size) {
|
||||||
|
throw std::runtime_error("unexpectedly reached end of buffer");
|
||||||
|
}
|
||||||
|
ggml_backend_tensor_get(tensor, ptr, offset, size);
|
||||||
|
ptr += size;
|
||||||
|
size_written += size;
|
||||||
|
buf_size -= size;
|
||||||
|
}
|
||||||
|
|
||||||
size_t get_size_written() override {
|
size_t get_size_written() override {
|
||||||
return size_written;
|
return size_written;
|
||||||
}
|
}
|
||||||
|
@ -17991,6 +18001,7 @@ struct llama_data_read_buffer : llama_data_read {
|
||||||
struct llama_data_write_file : llama_data_write {
|
struct llama_data_write_file : llama_data_write {
|
||||||
llama_file * file;
|
llama_file * file;
|
||||||
size_t size_written = 0;
|
size_t size_written = 0;
|
||||||
|
std::vector<uint8_t> temp_buffer;
|
||||||
|
|
||||||
llama_data_write_file(llama_file * f) : file(f) {}
|
llama_data_write_file(llama_file * f) : file(f) {}
|
||||||
|
|
||||||
|
@ -17999,6 +18010,12 @@ struct llama_data_write_file : llama_data_write {
|
||||||
size_written += size;
|
size_written += size;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
|
||||||
|
temp_buffer.resize(size);
|
||||||
|
ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
|
||||||
|
write(temp_buffer.data(), temp_buffer.size());
|
||||||
|
}
|
||||||
|
|
||||||
size_t get_size_written() override {
|
size_t get_size_written() override {
|
||||||
return size_written;
|
return size_written;
|
||||||
}
|
}
|
||||||
|
|
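The state-save refactor above introduces a write_tensor_data(tensor, offset, size) virtual, so each writer decides how KV-cache ranges leave the backend: the dummy just counts bytes, the buffer writer reads straight into its destination pointer, and the file writer reuses a temp_buffer. A hedged sketch of one more writer built on the same interface, as it might sit next to the ones above inside llama.cpp (the checksumming subclass is hypothetical; the three overridden methods are the ones the patch shows):

// hypothetical writer that sums the session payload instead of storing it
struct llama_data_write_checksum : llama_data_write {
    size_t   size_written = 0;
    uint64_t byte_sum     = 0;
    std::vector<uint8_t> temp_buffer;

    void write(const void * src, size_t size) override {
        const uint8_t * p = static_cast<const uint8_t *>(src);
        for (size_t i = 0; i < size; ++i) {
            byte_sum += p[i];
        }
        size_written += size;
    }

    void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
        temp_buffer.resize(size);
        ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);  // pull the range from the backend
        write(temp_buffer.data(), temp_buffer.size());
    }

    size_t get_size_written() override {
        return size_written;
    }
};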