allow for single token prompt processing (actual batch size 1)

Concedo 2025-04-25 16:54:46 +08:00
commit 6b6597ebf1
24 changed files with 413 additions and 846 deletions

View file

@@ -39,6 +39,11 @@
using json = nlohmann::ordered_json;
std::initializer_list<enum llama_example> mmproj_examples = {
LLAMA_EXAMPLE_LLAVA,
// TODO: add LLAMA_EXAMPLE_SERVER when it's ready
};
common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
this->examples = std::move(examples);
return *this;
@@ -642,11 +647,16 @@ static struct common_hf_file_res common_get_hf_file(const std::string &, const s
// utils
//
static void common_params_handle_model(
struct handle_model_result {
bool found_mmproj = false;
common_params_model mmproj;
};
static handle_model_result common_params_handle_model(
struct common_params_model & model,
const std::string & bearer_token,
const std::string & model_path_default,
bool is_mmproj = false) { // TODO: move is_mmproj to an enum when we have more files?
const std::string & model_path_default) {
handle_model_result result;
// handle pre-fill default model path and url based on hf_repo and hf_file
{
if (!model.hf_repo.empty()) {
@@ -658,7 +668,12 @@ static void common_params_handle_model(
exit(1); // built without CURL, error message already printed
}
model.hf_repo = auto_detected.repo;
model.hf_file = is_mmproj ? auto_detected.mmprojFile : auto_detected.ggufFile;
model.hf_file = auto_detected.ggufFile;
if (!auto_detected.mmprojFile.empty()) {
result.found_mmproj = true;
result.mmproj.hf_repo = model.hf_repo;
result.mmproj.hf_file = auto_detected.mmprojFile;
}
} else {
model.hf_file = model.path;
}
@@ -695,6 +710,8 @@ static void common_params_handle_model(
exit(1);
}
}
return result;
}
const std::vector<ggml_type> kv_cache_types = {
@@ -828,16 +845,25 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}
common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
// handle model and download
{
auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
if (params.no_mmproj) {
params.mmproj = {};
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
// optionally, handle mmproj model when -hf is specified
params.mmproj = res.mmproj;
}
// only download mmproj if the current example is using it
for (auto & ex : mmproj_examples) {
if (ctx_arg.ex == ex) {
common_params_handle_model(params.mmproj, params.hf_token, "");
break;
}
}
common_params_handle_model(params.speculative.model, params.hf_token, "");
common_params_handle_model(params.vocoder.model, params.hf_token, "");
// allow --mmproj to be set from -hf
// assuming that mmproj is always in the same repo as text model
if (!params.model.hf_repo.empty() && ctx_arg.ex == LLAMA_EXAMPLE_LLAVA) {
params.mmproj.hf_repo = params.model.hf_repo;
}
}
common_params_handle_model(params.mmproj, params.hf_token, "", true);
if (params.escape) {
string_process_escapes(params.prompt);
@@ -969,7 +995,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
"llama-embedding",
"llama-eval-callback",
"llama-export-lora",
"llama-gbnf-validator",
"llama-gen-docs",
"llama-gguf",
"llama-gguf-hash",
@@ -989,7 +1014,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
"llama-perplexity",
"llama-q8dot",
"llama-quantize",
"llama-quantize-stats",
"llama-qwen2vl-cli",
"llama-retrieval",
"llama-run",
@@ -2096,18 +2120,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
add_opt(common_arg(
{"--mmproj"}, "FILE",
"path to a multimodal projector file for LLaVA. see examples/llava/README.md",
"path to a multimodal projector file. see examples/llava/README.md",
[](common_params & params, const std::string & value) {
params.mmproj.path = value;
}
).set_examples({LLAMA_EXAMPLE_LLAVA}));
).set_examples(mmproj_examples));
add_opt(common_arg(
{"--mmproj-url"}, "URL",
"URL to a multimodal projector file for LLaVA. see examples/llava/README.md",
"URL to a multimodal projector file. see examples/llava/README.md",
[](common_params & params, const std::string & value) {
params.mmproj.url = value;
}
).set_examples({LLAMA_EXAMPLE_LLAVA}));
).set_examples(mmproj_examples));
add_opt(common_arg(
{"--no-mmproj"},
"explicitly disable multimodal projector, useful when using -hf",
[](common_params & params) {
params.no_mmproj = true;
}
).set_examples(mmproj_examples));
add_opt(common_arg(
{"--no-mmproj-offload"},
"do not offload multimodal projector to GPU",
[](common_params & params) {
params.mmproj_use_gpu = false;
}
).set_examples(mmproj_examples));
add_opt(common_arg(
{"--image"}, "FILE",
"path to an image file. use with multimodal models. Specify multiple times for batching",
@@ -2382,6 +2420,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
add_opt(common_arg(
{"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
"Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
"mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
"example: unsloth/phi-4-GGUF:q4_k_m\n"
"(default: unused)",
[](common_params & params, const std::string & value) {
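As a usage sketch (not part of the diff itself): with these options, a single `-hf` flag pulls the text model and, when the repo ships one, the matching mmproj. The repo name below is the one already used in this commit's docs.

```bash
# downloads the text model and, if available, the matching mmproj
llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF

# skip the projector entirely
llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj

# keep the projector off the GPU
llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
```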

View file

@@ -338,6 +338,8 @@ struct common_params {
// multimodal models (see examples/llava)
struct common_params_model mmproj;
bool mmproj_use_gpu = true; // use GPU for multimodal model
bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s)
// embedding

View file

@@ -1,51 +0,0 @@
# Gemma 3 vision
> [!IMPORTANT]
>
> This is very experimental and only intended for demo purposes.
## Quick start
You can use a pre-quantized model from [ggml-org](https://huggingface.co/ggml-org)'s Hugging Face account:
```bash
# build
cmake -B build
cmake --build build --target llama-mtmd-cli
# alternatively, install from brew (MacOS)
brew install llama.cpp
# run it
llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF
llama-mtmd-cli -hf ggml-org/gemma-3-12b-it-GGUF
llama-mtmd-cli -hf ggml-org/gemma-3-27b-it-GGUF
# note: 1B model does not support vision
```
## How to get mmproj.gguf?
Simply add the `--mmproj` flag when converting the model via `convert_hf_to_gguf.py`:
```bash
cd gemma-3-4b-it
python ../llama.cpp/convert_hf_to_gguf.py --outfile model.gguf --outtype f16 --mmproj .
# output file: mmproj-model.gguf
```
## How to run it?
What you need:
- The text model GGUF, which can be converted using `convert_hf_to_gguf.py`
- The mmproj file from the step above
- An image file
```bash
# build
cmake -B build
cmake --build build --target llama-mtmd-cli
# run it
./build/bin/llama-mtmd-cli -m {text_model}.gguf --mmproj mmproj.gguf --image your_image.jpg
```

View file

@@ -1,43 +0,0 @@
# GLMV-EDGE
Currently this implementation supports [glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b) and [glm-edge-v-5b](https://huggingface.co/THUDM/glm-edge-v-5b).
## Usage
Build the `llama-mtmd-cli` binary.
After building, run: `./llama-mtmd-cli` to see the usage. For example:
```sh
./llama-mtmd-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf
```
**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
**note**: For GPU offloading, make sure to use the `-ngl` flag as usual.
## GGUF conversion
1. Clone a GLMV-EDGE model ([2B](https://huggingface.co/THUDM/glm-edge-v-2b) or [5B](https://huggingface.co/THUDM/glm-edge-v-5b)). For example:
```sh
git clone https://huggingface.co/THUDM/glm-edge-v-5b or https://huggingface.co/THUDM/glm-edge-v-2b
```
2. Use `glmedge-surgery.py` to split the GLMV-EDGE model into its LLM and multimodal projector constituents:
```sh
python ./examples/llava/glmedge-surgery.py -m ../model_path
```
3. Use `glmedge-convert-image-encoder-to-gguf.py` to convert the GLMV-EDGE image encoder to GGUF:
```sh
python ./examples/llava/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
```
4. Use `examples/convert_hf_to_gguf.py` to convert the LLM part of GLMV-EDGE to GGUF:
```sh
python convert_hf_to_gguf.py ../model_path
```
Now both the LLM part and the image encoder are in the `model_path` directory.

View file

@@ -1,186 +0,0 @@
# Granite Vision
Download the model and point your `GRANITE_MODEL` environment variable to the path.
```bash
$ git clone https://huggingface.co/ibm-granite/granite-vision-3.2-2b
$ export GRANITE_MODEL=./granite-vision-3.2-2b
```
### 1. Running llava surgery v2.
First, we need to run the llava surgery script as shown below:
`python llava_surgery_v2.py -C -m $GRANITE_MODEL`
You should see two new files (`llava.clip` and `llava.projector`) written into your model's directory, as shown below.
```bash
$ ls $GRANITE_MODEL | grep -i llava
llava.clip
llava.projector
```
We should see that the projector and visual encoder get split out into the llava files. Quick check to make sure they aren't empty:
```python
import os
import torch
MODEL_PATH = os.getenv("GRANITE_MODEL")
if not MODEL_PATH:
raise ValueError("env var GRANITE_MODEL is unset!")
encoder_tensors = torch.load(os.path.join(MODEL_PATH, "llava.clip"))
projector_tensors = torch.load(os.path.join(MODEL_PATH, "llava.projector"))
assert len(encoder_tensors) > 0
assert len(projector_tensors) > 0
```
If you actually inspect the `.keys()` of the loaded tensors, you should see a lot of `vision_model` tensors in the `encoder_tensors`, and 5 tensors (`'multi_modal_projector.linear_1.bias'`, `'multi_modal_projector.linear_1.weight'`, `'multi_modal_projector.linear_2.bias'`, `'multi_modal_projector.linear_2.weight'`, `'image_newline'`) in the multimodal `projector_tensors`.
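A quick, purely illustrative way to eyeball those keys, continuing from the snippet above:

```python
# continuing from the snippet above (hypothetical check, not part of the original doc)
print(len(encoder_tensors), "encoder tensors")
print(sorted(projector_tensors.keys()))
# expect the four multi_modal_projector.linear_* tensors plus 'image_newline'
```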
### 2. Creating the Visual Component GGUF
Next, create a new directory to hold the visual components, and copy the llava.clip/projector files, as shown below.
```bash
$ ENCODER_PATH=$PWD/visual_encoder
$ mkdir $ENCODER_PATH
$ cp $GRANITE_MODEL/llava.clip $ENCODER_PATH/pytorch_model.bin
$ cp $GRANITE_MODEL/llava.projector $ENCODER_PATH/
```
Now, we need to write a config for the visual encoder. In order to convert the model, be sure to use the correct `image_grid_pinpoints`, as these may vary based on the model. You can find the `image_grid_pinpoints` in `$GRANITE_MODEL/config.json`.
```json
{
"_name_or_path": "siglip-model",
"architectures": [
"SiglipVisionModel"
],
"image_grid_pinpoints": [
[384,384],
[384,768],
[384,1152],
[384,1536],
[384,1920],
[384,2304],
[384,2688],
[384,3072],
[384,3456],
[384,3840],
[768,384],
[768,768],
[768,1152],
[768,1536],
[768,1920],
[1152,384],
[1152,768],
[1152,1152],
[1536,384],
[1536,768],
[1920,384],
[1920,768],
[2304,384],
[2688,384],
[3072,384],
[3456,384],
[3840,384]
],
"mm_patch_merge_type": "spatial_unpad",
"hidden_size": 1152,
"image_size": 384,
"intermediate_size": 4304,
"model_type": "siglip_vision_model",
"num_attention_heads": 16,
"num_hidden_layers": 27,
"patch_size": 14,
"layer_norm_eps": 1e-6,
"hidden_act": "gelu_pytorch_tanh",
"projection_dim": 0,
"vision_feature_layer": [-24, -20, -12, -1]
}
```
At this point you should have something like this:
```bash
$ ls $ENCODER_PATH
config.json llava.projector pytorch_model.bin
```
Now convert the components to GGUF. Note that we also override the image mean/std dev to `[.5,.5,.5]` since we use the SigLIP visual encoder; in the transformers model, you can find these numbers in `preprocessor_config.json`.
```bash
$ python convert_image_encoder_to_gguf.py \
-m $ENCODER_PATH \
--llava-projector $ENCODER_PATH/llava.projector \
--output-dir $ENCODER_PATH \
--clip-model-is-vision \
--clip-model-is-siglip \
--image-mean 0.5 0.5 0.5 \
--image-std 0.5 0.5 0.5
```
This will create the first GGUF file at `$ENCODER_PATH/mmproj-model-f16.gguf`; we will refer to the absolute path of this file as `$VISUAL_GGUF_PATH`.
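For example, you could record that path in the variable used in step 5 below:

```bash
$ export VISUAL_GGUF_PATH=$ENCODER_PATH/mmproj-model-f16.gguf
```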
### 3. Creating the LLM GGUF.
The granite vision model contains a granite LLM as its language model. For now, the easiest way to get the GGUF for the LLM is by loading the composite model in `transformers` and exporting the LLM so that it can be directly converted with the normal conversion path.
First, set `LLM_EXPORT_PATH` to the path where the `transformers` LLM will be exported.
```bash
$ export LLM_EXPORT_PATH=$PWD/granite_vision_llm
```
```python
import os
import transformers
MODEL_PATH = os.getenv("GRANITE_MODEL")
if not MODEL_PATH:
raise ValueError("env var GRANITE_MODEL is unset!")
LLM_EXPORT_PATH = os.getenv("LLM_EXPORT_PATH")
if not LLM_EXPORT_PATH:
raise ValueError("env var LLM_EXPORT_PATH is unset!")
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
# NOTE: granite vision support was added to transformers very recently (4.49);
# if you get size mismatches, your version is too old.
# If you are running with an older version, set `ignore_mismatched_sizes=True`
# as shown below; it won't be loaded correctly, but the LLM part of the model that
# we are exporting will be loaded correctly.
model = transformers.AutoModelForImageTextToText.from_pretrained(MODEL_PATH, ignore_mismatched_sizes=True)
tokenizer.save_pretrained(LLM_EXPORT_PATH)
model.language_model.save_pretrained(LLM_EXPORT_PATH)
```
Now you can convert the exported LLM to GGUF with the normal converter in the root of the llama.cpp project.
```bash
$ LLM_GGUF_PATH=$LLM_EXPORT_PATH/granite_llm.gguf
...
$ python convert_hf_to_gguf.py --outfile $LLM_GGUF_PATH $LLM_EXPORT_PATH
```
### 4. Quantization
If you want to quantize the LLM, you can do so with `llama-quantize` as you would any other LLM. For example:
```bash
$ ./build/bin/llama-quantize $LLM_EXPORT_PATH/granite_llm.gguf $LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf Q4_K_M
$ LLM_GGUF_PATH=$LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf
```
Note that currently you cannot quantize the visual encoder because granite vision models use SigLIP as the visual encoder, which has tensor dimensions that are not divisible by 32.
### 5. Running the Model in llama.cpp
Build llama.cpp normally; you should have a target binary named `llama-mtmd-cli`, to which you pass the two GGUF files. As an example, we pass the llama.cpp banner.
```bash
$ ./build/bin/llama-mtmd-cli -m $LLM_GGUF_PATH \
--mmproj $VISUAL_GGUF_PATH \
-c 16384 \
--temp 0
```

View file

@@ -1,143 +0,0 @@
# LLaVA
Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants,
as well as [llava-v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) variants.
The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
models are available.
For llava-1.6, a variety of prepared gguf models are available as well ([7b-34b](https://huggingface.co/cmp-nct/llava-1.6-gguf)).
After the API is confirmed, more models will be supported / uploaded.
## Usage
Build the `llama-mtmd-cli` binary.
After building, run: `./llama-mtmd-cli` to see the usage. For example:
```sh
./llama-mtmd-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf \
--mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf \
--chat-template vicuna
```
**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
**note**: For GPU offloading, make sure to use the `-ngl` flag as usual.
## LLaVA 1.5
1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
```sh
git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
git clone https://huggingface.co/openai/clip-vit-large-patch14-336
```
2. Install the required Python packages:
```sh
pip install -r examples/llava/requirements.txt
```
3. Use `llava_surgery.py` to split the LLaVA model into its LLaMA and multimodal projector constituents:
```sh
python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
```
4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:
```sh
python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
```
5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown
```
Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
## LLaVA 1.6 gguf conversion
1) First clone a LLaVA 1.6 model:
```console
git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
```
2) Install the required Python packages:
```sh
pip install -r examples/llava/requirements.txt
```
3) Use `llava_surgery_v2.py`, which also supports llava-1.5 variants, for both PyTorch and safetensors models:
```console
python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
```
- you will find a llava.projector and a llava.clip file in your model directory
4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
```console
mkdir vit
cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
cp ../llava-v1.6-vicuna-7b/llava.projector vit/
curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
```
5) Create the visual gguf model:
```console
python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
```
- This is similar to llava-1.5; the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
6) Then convert the model to gguf format:
```console
python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
```
7) And finally we can run the CLI using the 1.6 model version:
```console
./llama-mtmd-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf
```
**note** llava-1.6 needs more context than llava-1.5; at least 3000 is needed (just run it at -c 4096)
**note** llava-1.6 greatly benefits from batched prompt processing (defaults work)
**note** if the language model in step `6)` is incompatible with the legacy conversion script, the easiest way to handle the LLM conversion is to load the model in transformers and export only the LLM from the llava-next model.
```python
import os
import transformers
model_path = ...
llm_export_path = ...
tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
model = transformers.AutoModelForImageTextToText.from_pretrained(model_path)
tokenizer.save_pretrained(llm_export_path)
model.language_model.save_pretrained(llm_export_path)
```
Then, you can convert the LLM using the `convert_hf_to_gguf.py` script, which handles more LLM architectures.
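A sketch of that conversion (the export directory and output file name below are only placeholders, assuming `llm_export_path` from the snippet above pointed at `./llava-1.6-llm/`):

```console
python ./convert_hf_to_gguf.py ./llava-1.6-llm/ --outfile llava-1.6-llm-f16.gguf --outtype f16
```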
## Chat template
For llava-1.5 and llava-1.6, you need to use the `vicuna` chat template. Simply add `--chat-template vicuna` to activate it.
## How to know if you are running in llava-1.5 or llava-1.6 mode
When running the CLI you will see the visual information right before the prompt is processed:
**Llava-1.5:**
`encode_image_with_clip: image embedding created: 576 tokens`
**Llava-1.6 (anything above 576):**
`encode_image_with_clip: image embedding created: 2880 tokens`
Alternatively, just note how many "tokens" were used for your prompt; llava-1.6 will show 1000+ tokens.

View file

@@ -1,48 +0,0 @@
## MiniCPM-o 2.6
Currently, this readme only covers minicpm-omni's image capabilities; full-mode support will be added as soon as possible.
### Prepare models and code
Download [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch model from huggingface to "MiniCPM-o-2_6" folder.
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```
Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
### Usage of MiniCPM-o 2.6
Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) files we provide)
```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-o-2_6
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
# quantize int4 version
./build/bin/llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Inference on Linux or Mac
```bash
# run in single-turn mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run in conversation mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf
```

View file

@@ -1,47 +0,0 @@
## MiniCPM-Llama3-V 2.5
### Prepare models and code
Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from huggingface to "MiniCPM-Llama3-V-2_5" folder.
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```
Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
### Usage of MiniCPM-Llama3-V 2.5
Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) files we provide)
```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
# quantize int4 version
./build/bin/llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Inference on Linux or Mac
```bash
# run in single-turn mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run in conversation mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf
```

View file

@@ -1,47 +0,0 @@
## MiniCPM-V 2.6
### Prepare models and code
Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from huggingface to "MiniCPM-V-2_6" folder.
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```
Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
### Usage of MiniCPM-V 2.6
Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) files we provide)
```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
# quantize int4 version
./build/bin/llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Inference on Linux or Mac
```bash
# run in single-turn mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run in conversation mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf
```

View file

@@ -95,8 +95,6 @@
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
#define TN_GLM_BOI_W "adapter.boi"
#define TN_GLM_EOI_W "adapter.eoi"
enum projector_type {
PROJECTOR_TYPE_MLP,

View file

@@ -263,8 +263,6 @@ struct clip_vision_model {
//GLMV-Edge projection
struct ggml_tensor * mm_model_adapter_conv_w = nullptr;
struct ggml_tensor * mm_model_adapter_conv_b = nullptr;
struct ggml_tensor * boi_w = nullptr;
struct ggml_tensor * eoi_w = nullptr;
// MobileVLM projection
struct ggml_tensor * mm_model_mlp_1_w = nullptr;
@@ -1822,8 +1820,6 @@ struct clip_model_loader {
vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight"));
vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
vision_model.boi_w = get_tensor(TN_GLM_BOI_W);
vision_model.eoi_w = get_tensor(TN_GLM_EOI_W);
} break;
case PROJECTOR_TYPE_MERGER:
{
@@ -2805,8 +2801,7 @@ void clip_free(clip_ctx * ctx) {
}
size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
int extra_tokens = ctx->has_glm_projector ? 2 : 0;
return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
}
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
@@ -3006,9 +3001,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
if (ctx->has_glm_projector) {
GGML_ASSERT(batch_size == 1);
ggml_tensor * boi = ctx->vision_model.boi_w;
ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi));
vec = (float*)(vec+ggml_nelements(boi)); //offset for boi
}
// build the inference graph
@@ -3341,13 +3333,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// copy the embeddings to the location passed by the user
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
if (ctx->has_glm_projector) {
//eoi
ggml_tensor * eoi = ctx->vision_model.eoi_w;
int offset = ggml_nelements(embeddings);
ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi));
}
return true;
}

View file

@@ -40,7 +40,8 @@ static void show_additional_info(int /*argc*/, char ** argv) {
"Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
" -m and --mmproj are required\n"
" -hf user/repo can replace both -m and --mmproj in most cases\n"
" --image and -p are optional, if NOT provided, the CLI will run in chat mode\n",
" --image and -p are optional, if NOT provided, the CLI will run in chat mode\n"
" to disable using GPU for mmproj model, add --no-mmproj-offload\n",
argv[0]
);
}
@@ -112,10 +113,10 @@ struct mtmd_cli_context {
void init_vision_context(common_params & params) {
const char * clip_path = params.mmproj.path.c_str();
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
/* use_gpu */ true,
/* use_gpu */ params.mmproj_use_gpu,
/* timings */ true,
/* n_threads */ params.cpuparams.n_threads,
/* verbosity */ GGML_LOG_LEVEL_INFO,
/* verbosity */ params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO,
}));
if (!ctx_vision.get()) {
LOG_ERR("Failed to load vision model from %s\n", clip_path);
@@ -261,6 +262,7 @@ int main(int argc, char ** argv) {
if (params.mmproj.path.empty()) {
show_additional_info(argc, argv);
LOG_ERR("ERR: Missing --mmproj argument\n");
return 1;
}

View file

@@ -186,6 +186,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
// <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
marker_modified = "<|begin_of_image|>" + ctx->image_marker + "<|end_of_image|>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";

View file

@@ -487,6 +487,7 @@ extern "C" {
GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_CONV_2D_DW,
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
GGML_OP_POOL_2D,
@@ -690,6 +691,9 @@ extern "C" {
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
@@ -1673,7 +1677,7 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
// depthwise
// depthwise (via im2col and mul_mat)
GGML_API struct ggml_tensor * ggml_conv_2d_dw(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
@@ -1685,6 +1689,22 @@ extern "C" {
int d0, // dilation dimension 0
int d1); // dilation dimension 1
// Depthwise 2D convolution
// may be faster than ggml_conv_2d_dw, but not available in all backends
// a: KW KH 1 C convolution kernel
// b: W H C N input data
// res: W_out H_out C N
GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int stride0,
int stride1,
int pad0,
int pad1,
int dilation0,
int dilation1);
GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
struct ggml_context * ctx,
struct ggml_tensor * a,
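To show how the new op is meant to be called (a sketch, not code from this commit; the tensor sizes and the `ctx0` context are assumed):

```c
// kernel a: KW x KH x 1 x C, input b: W x H x C x N, both F32
struct ggml_tensor * knl = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32,  3,  3,  1, 64);
struct ggml_tensor * inp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 28, 28, 64,  1);
// stride 1, padding 1, dilation 1 -> result is 28 x 28 x 64 x 1
struct ggml_tensor * res = ggml_conv_2d_dw_direct(ctx0, knl, inp, 1, 1, 1, 1, 1, 1);
```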

View file

@@ -1946,6 +1946,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_im2col_back_f32(params, tensor);
} break;
case GGML_OP_CONV_2D_DW:
{
ggml_compute_forward_conv_2d_dw(params, tensor);
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
{
ggml_compute_forward_conv_transpose_2d(params, tensor);
@@ -2282,6 +2286,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_IM2COL:
case GGML_OP_IM2COL_BACK:
case GGML_OP_CONV_2D_DW:
case GGML_OP_CONV_TRANSPOSE_1D:
case GGML_OP_CONV_TRANSPOSE_2D:
{

View file

@@ -6064,6 +6064,178 @@
}
}
// ggml_compute_forward_conv_2d_dw
struct ggml_conv_2d_dw_params {
int64_t channels;
int64_t batch;
int64_t src_w;
int64_t src_h;
int64_t dst_w;
int64_t dst_h;
int64_t knl_w;
int64_t knl_h;
int stride_x;
int stride_y;
int pad_x;
int pad_y;
int dilation_x;
int dilation_y;
};
static void ggml_compute_forward_conv_2d_dw_cwhn(
const ggml_compute_params * params,
const ggml_tensor * src,
const ggml_tensor * kernel,
ggml_tensor * dst,
const ggml_conv_2d_dw_params & p) {
const int64_t c = p.channels;
const float * knl_data = (const float *)kernel->data;
const int64_t rows_total = p.dst_h * p.batch;
const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
const int64_t row_start = params->ith * rows_per_thread;
const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
#ifdef GGML_SIMD
const int64_t pkg_size = GGML_F32_EPR;
const int64_t pkg_count = c / pkg_size;
const int64_t c_pkg_end = pkg_count * pkg_size;
#else
const int64_t c_pkg_end = 0;
#endif
for (int64_t row = row_start; row < row_end; ++row) {
const int64_t dst_y = row % p.dst_h;
const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c;
for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c;
const int64_t src_y_base = dst_y * p.stride_y - p.pad_y;
const int64_t src_x_base = dst_x * p.stride_x - p.pad_x;
#ifdef GGML_SIMD
// Vectorized loop
for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) {
GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
const int64_t src_y = src_y_base + knl_y * p.dilation_y;
if (src_y < 0 || src_y >= p.src_h) {
continue;
}
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
const int64_t src_x = src_x_base + knl_x * p.dilation_x;
if (src_x < 0 || src_x >= p.src_w) {
continue;
}
GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i);
GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i);
sum = GGML_F32_VEC_FMA(sum, k, s);
}
}
GGML_F32_VEC_STORE(dst_data + c_i, sum);
}
#endif
// Scalar loop
for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) {
float sum = 0.0f;
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
const int64_t src_y = src_y_base + knl_y * p.dilation_y;
if (src_y < 0 || src_y >= p.src_h) {
continue;
}
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
const int64_t src_x = src_x_base + knl_x * p.dilation_x;
if (src_x < 0 || src_x >= p.src_w) {
continue;
}
sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i]
* src_data[(src_y * p.src_w + src_x) * c + c_i];
}
}
dst_data[c_i] = sum;
}
}
}
}
static void ggml_compute_forward_conv_2d_dw_whcn(
const ggml_compute_params * params,
const ggml_tensor * src,
const ggml_tensor * kernel,
ggml_tensor * dst,
const ggml_conv_2d_dw_params & p) {
const int64_t n = p.channels * p.batch;
const int64_t per_thread = (n + params->nth - 1) / params->nth;
const int64_t start = params->ith * per_thread;
const int64_t end = MIN(start + per_thread, n);
for (int64_t i = start; i < end; ++i) {
const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h;
const float * src_data = (const float *)src->data + i * p.src_w * p.src_h;
float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h;
for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) {
for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
float sum = 0.0f;
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
if (src_y < 0 || src_y >= p.src_h) {
continue;
}
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
if (src_x < 0 || src_x >= p.src_w) {
continue;
}
sum += knl_data[knl_y * p.knl_w + knl_x]
* src_data[src_y * p.src_w + src_x];
}
}
dst_data[dst_y * p.dst_w + dst_x] = sum;
}
}
}
}
void ggml_compute_forward_conv_2d_dw(
const ggml_compute_params * params,
ggml_tensor * dst) {
const ggml_tensor * kernel = dst->src[0];
const ggml_tensor * src = dst->src[1];
ggml_conv_2d_dw_params p;
p.channels = src->ne[2];
p.batch = src->ne[3];
p.src_w = src->ne[0];
p.src_h = src->ne[1];
p.dst_w = dst->ne[0];
p.dst_h = dst->ne[1];
p.knl_w = kernel->ne[0];
p.knl_h = kernel->ne[1];
p.stride_x = dst->op_params[0];
p.stride_y = dst->op_params[1];
p.pad_x = dst->op_params[2];
p.pad_y = dst->op_params[3];
p.dilation_x = dst->op_params[4];
p.dilation_y = dst->op_params[5];
GGML_ASSERT(kernel->ne[3] == p.channels);
GGML_ASSERT(dst->ne[3] == p.batch);
if (ggml_is_contiguous(src)) {
ggml_compute_forward_conv_2d_dw_whcn(params, src, kernel, dst, p);
} else if (ggml_is_contiguous_channels(src)) {
// kernel should also have channels most contiguous in memory
GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]);
ggml_compute_forward_conv_2d_dw_cwhn(params, src, kernel, dst, p);
} else {
GGML_ABORT("non-contiguous memory layout not supported");
}
}
// ggml_compute_forward_pool_1d_sk_p0
static void ggml_compute_forward_pool_1d_sk_p0(

View file

@@ -65,6 +65,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);

View file

@@ -156,25 +156,27 @@ static constexpr __device__ int get_mmq_y_device() {
#define MMQ_DP4A_TXS_Q6_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI6_K + mmq_y/QI6_K, mmq_y*WARP_SIZE/8 + mmq_y/8}
static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml_type type, int mmq_y) {
return type == GGML_TYPE_Q4_0 ? MMQ_DP4A_TXS_Q4_0 :
type == GGML_TYPE_Q4_1 ? MMQ_DP4A_TXS_Q4_1 :
type == GGML_TYPE_Q5_0 ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_Q5_1 ? MMQ_DP4A_TXS_Q8_1 :
type == GGML_TYPE_Q8_0 ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_Q2_K ? MMQ_DP4A_TXS_Q2_K :
type == GGML_TYPE_Q3_K ? MMQ_DP4A_TXS_Q3_K :
type == GGML_TYPE_Q4_K ? MMQ_DP4A_TXS_Q4_K :
type == GGML_TYPE_Q5_K ? MMQ_DP4A_TXS_Q5_K :
type == GGML_TYPE_Q6_K ? MMQ_DP4A_TXS_Q6_K :
type == GGML_TYPE_IQ2_XXS ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_IQ2_XS ? MMQ_DP4A_TXS_Q8_0_16 :
type == GGML_TYPE_IQ2_S ? MMQ_DP4A_TXS_Q8_0_16 :
type == GGML_TYPE_IQ3_XXS ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_IQ3_S ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_IQ1_S ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_IQ4_XS ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_IQ4_NL ? MMQ_DP4A_TXS_Q8_0 :
tile_x_sizes{0, 0, 0};
switch (type) {
case GGML_TYPE_Q4_0: return MMQ_DP4A_TXS_Q4_0;
case GGML_TYPE_Q4_1: return MMQ_DP4A_TXS_Q4_1;
case GGML_TYPE_Q5_0: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_Q5_1: return MMQ_DP4A_TXS_Q8_1;
case GGML_TYPE_Q8_0: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_Q2_K: return MMQ_DP4A_TXS_Q2_K;
case GGML_TYPE_Q3_K: return MMQ_DP4A_TXS_Q3_K;
case GGML_TYPE_Q4_K: return MMQ_DP4A_TXS_Q4_K;
case GGML_TYPE_Q5_K: return MMQ_DP4A_TXS_Q5_K;
case GGML_TYPE_Q6_K: return MMQ_DP4A_TXS_Q6_K;
case GGML_TYPE_IQ2_XXS: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_IQ2_XS: return MMQ_DP4A_TXS_Q8_0_16;
case GGML_TYPE_IQ2_S: return MMQ_DP4A_TXS_Q8_0_16;
case GGML_TYPE_IQ3_XXS: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_IQ3_S: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_IQ1_S: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_IQ4_XS: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_IQ4_NL: return MMQ_DP4A_TXS_Q8_0;
default: return tile_x_sizes{0, 0, 0};
}
}
#define MMQ_MMA_TILE_X_K_Q8_0 (2*WARP_SIZE + 2*WARP_SIZE/QI8_0 + 4)
@@ -190,25 +192,27 @@ static_assert(MMQ_MMA_TILE_X_K_Q3_K % 8 == 4, "Wrong padding.");
static_assert(MMQ_MMA_TILE_X_K_Q6_K % 8 == 4, "Wrong padding.");
static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
return type == GGML_TYPE_Q4_0 ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_Q4_1 ? MMQ_MMA_TILE_X_K_Q8_1 :
type == GGML_TYPE_Q5_0 ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_Q5_1 ? MMQ_MMA_TILE_X_K_Q8_1 :
type == GGML_TYPE_Q8_0 ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_Q2_K ? MMQ_MMA_TILE_X_K_Q2_K :
type == GGML_TYPE_Q3_K ? MMQ_MMA_TILE_X_K_Q3_K :
type == GGML_TYPE_Q4_K ? MMQ_MMA_TILE_X_K_Q8_1 :
type == GGML_TYPE_Q5_K ? MMQ_MMA_TILE_X_K_Q8_1 :
type == GGML_TYPE_Q6_K ? MMQ_MMA_TILE_X_K_Q6_K :
type == GGML_TYPE_IQ2_XXS ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_IQ2_XS ? MMQ_MMA_TILE_X_K_Q3_K :
type == GGML_TYPE_IQ2_S ? MMQ_MMA_TILE_X_K_Q3_K :
type == GGML_TYPE_IQ3_XXS ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_IQ3_S ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_IQ1_S ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_IQ4_XS ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_IQ4_NL ? MMQ_MMA_TILE_X_K_Q8_0 :
0;
switch (type) {
case GGML_TYPE_Q4_0: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_Q4_1: return MMQ_MMA_TILE_X_K_Q8_1;
case GGML_TYPE_Q5_0: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_Q5_1: return MMQ_MMA_TILE_X_K_Q8_1;
case GGML_TYPE_Q8_0: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_Q2_K: return MMQ_MMA_TILE_X_K_Q2_K;
case GGML_TYPE_Q3_K: return MMQ_MMA_TILE_X_K_Q3_K;
case GGML_TYPE_Q4_K: return MMQ_MMA_TILE_X_K_Q8_1;
case GGML_TYPE_Q5_K: return MMQ_MMA_TILE_X_K_Q8_1;
case GGML_TYPE_Q6_K: return MMQ_MMA_TILE_X_K_Q6_K;
case GGML_TYPE_IQ2_XXS: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_IQ2_XS: return MMQ_MMA_TILE_X_K_Q3_K;
case GGML_TYPE_IQ2_S: return MMQ_MMA_TILE_X_K_Q3_K;
case GGML_TYPE_IQ3_XXS: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_IQ3_S: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_IQ1_S: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_IQ4_XS: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_IQ4_NL: return MMQ_MMA_TILE_X_K_Q8_0;
default: return 0;
}
}
#define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1)

View file

@@ -7,47 +7,51 @@
typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs);
static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 :
type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 :
type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 :
type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 :
type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 :
type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 :
type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 :
type == GGML_TYPE_Q4_K ? vec_dot_q4_K_q8_1 :
type == GGML_TYPE_Q5_K ? vec_dot_q5_K_q8_1 :
type == GGML_TYPE_Q6_K ? vec_dot_q6_K_q8_1 :
type == GGML_TYPE_IQ2_XXS ? vec_dot_iq2_xxs_q8_1 :
type == GGML_TYPE_IQ2_XS ? vec_dot_iq2_xs_q8_1 :
type == GGML_TYPE_IQ2_S ? vec_dot_iq2_s_q8_1 :
type == GGML_TYPE_IQ3_XXS ? vec_dot_iq3_xxs_q8_1 :
type == GGML_TYPE_IQ1_S ? vec_dot_iq1_s_q8_1 :
type == GGML_TYPE_IQ1_M ? vec_dot_iq1_m_q8_1 :
type == GGML_TYPE_IQ4_NL ? vec_dot_iq4_nl_q8_1 :
type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 :
type == GGML_TYPE_IQ3_S ? vec_dot_iq3_s_q8_1 :
nullptr;
switch (type) {
case GGML_TYPE_Q4_0: return vec_dot_q4_0_q8_1;
case GGML_TYPE_Q4_1: return vec_dot_q4_1_q8_1;
case GGML_TYPE_Q5_0: return vec_dot_q5_0_q8_1;
case GGML_TYPE_Q5_1: return vec_dot_q5_1_q8_1;
case GGML_TYPE_Q8_0: return vec_dot_q8_0_q8_1;
case GGML_TYPE_Q2_K: return vec_dot_q2_K_q8_1;
case GGML_TYPE_Q3_K: return vec_dot_q3_K_q8_1;
case GGML_TYPE_Q4_K: return vec_dot_q4_K_q8_1;
case GGML_TYPE_Q5_K: return vec_dot_q5_K_q8_1;
case GGML_TYPE_Q6_K: return vec_dot_q6_K_q8_1;
case GGML_TYPE_IQ2_XXS: return vec_dot_iq2_xxs_q8_1;
case GGML_TYPE_IQ2_XS: return vec_dot_iq2_xs_q8_1;
case GGML_TYPE_IQ2_S: return vec_dot_iq2_s_q8_1;
case GGML_TYPE_IQ3_XXS: return vec_dot_iq3_xxs_q8_1;
case GGML_TYPE_IQ1_S: return vec_dot_iq1_s_q8_1;
case GGML_TYPE_IQ1_M: return vec_dot_iq1_m_q8_1;
case GGML_TYPE_IQ4_NL: return vec_dot_iq4_nl_q8_1;
case GGML_TYPE_IQ4_XS: return vec_dot_iq4_xs_q8_1;
case GGML_TYPE_IQ3_S: return vec_dot_iq3_s_q8_1;
default: return nullptr;
}
}
static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
type == GGML_TYPE_IQ2_XXS ? VDR_IQ2_XXS_Q8_1_MMVQ :
type == GGML_TYPE_IQ2_XS ? VDR_IQ2_XS_Q8_1_MMVQ :
type == GGML_TYPE_IQ2_S ? VDR_IQ2_S_Q8_1_MMVQ :
type == GGML_TYPE_IQ3_XXS ? VDR_IQ3_XXS_Q8_1_MMVQ :
type == GGML_TYPE_IQ3_S ? VDR_IQ3_S_Q8_1_MMVQ :
type == GGML_TYPE_IQ4_NL ? VDR_IQ4_NL_Q8_1_MMVQ :
type == GGML_TYPE_IQ4_XS ? VDR_IQ4_XS_Q8_1_MMVQ :
1;
switch (type) {
case GGML_TYPE_Q4_0: return VDR_Q4_0_Q8_1_MMVQ;
case GGML_TYPE_Q4_1: return VDR_Q4_1_Q8_1_MMVQ;
case GGML_TYPE_Q5_0: return VDR_Q5_0_Q8_1_MMVQ;
case GGML_TYPE_Q5_1: return VDR_Q5_1_Q8_1_MMVQ;
case GGML_TYPE_Q8_0: return VDR_Q8_0_Q8_1_MMVQ;
case GGML_TYPE_Q2_K: return VDR_Q2_K_Q8_1_MMVQ;
case GGML_TYPE_Q3_K: return VDR_Q3_K_Q8_1_MMVQ;
case GGML_TYPE_Q4_K: return VDR_Q4_K_Q8_1_MMVQ;
case GGML_TYPE_Q5_K: return VDR_Q5_K_Q8_1_MMVQ;
case GGML_TYPE_Q6_K: return VDR_Q6_K_Q8_1_MMVQ;
case GGML_TYPE_IQ2_XXS: return VDR_IQ2_XXS_Q8_1_MMVQ;
case GGML_TYPE_IQ2_XS: return VDR_IQ2_XS_Q8_1_MMVQ;
case GGML_TYPE_IQ2_S: return VDR_IQ2_S_Q8_1_MMVQ;
case GGML_TYPE_IQ3_XXS: return VDR_IQ3_XXS_Q8_1_MMVQ;
case GGML_TYPE_IQ3_S: return VDR_IQ3_S_Q8_1_MMVQ;
case GGML_TYPE_IQ4_NL: return VDR_IQ4_NL_Q8_1_MMVQ;
case GGML_TYPE_IQ4_XS: return VDR_IQ4_XS_Q8_1_MMVQ;
default: return 1;
}
}
enum mmvq_parameter_table_id {

View file

@@ -969,6 +969,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CONV_TRANSPOSE_1D",
"IM2COL",
"IM2COL_BACK",
"CONV_2D_DW",
"CONV_TRANSPOSE_2D",
"POOL_1D",
"POOL_2D",
@@ -1006,7 +1007,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};
static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1063,6 +1064,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"conv_transpose_1d(x)",
"im2col(x)",
"im2col_back(x)",
"conv_2d_dw(x)",
"conv_transpose_2d(x)",
"pool_1d(x)",
"pool_2d(x)",
@@ -1100,7 +1102,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};
static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -1357,6 +1359,13 @@ bool ggml_is_permuted(const struct ggml_tensor * tensor) {
return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
}
bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) {
return
tensor->nb[0] > tensor->nb[2] &&
tensor->nb[1] > tensor->nb[0] &&
tensor->nb[2] == ggml_type_size(tensor->type);
}
static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
@@ -4063,6 +4072,46 @@ struct ggml_tensor * ggml_conv_2d_dw(
return result;
}
// ggml_conv_2d_dw_direct
struct ggml_tensor * ggml_conv_2d_dw_direct(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int stride0,
int stride1,
int pad0,
int pad1,
int dilation0,
int dilation1) {
GGML_ASSERT(a->ne[2] == 1);
GGML_ASSERT(a->ne[3] == b->ne[2]);
int64_t ne[4];
ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], stride0, pad0, dilation0);
ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], stride1, pad1, dilation1);
ne[2] = b->ne[2];
ne[3] = b->ne[3];
struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);
if (ggml_is_contiguous_channels(b)) {
// Result will be permuted the same way as input (CWHN order)
const int64_t type_size = ggml_type_size(result->type);
GGML_ASSERT(ggml_blck_size(result->type) == 1);
result->nb[0] = result->ne[2] * type_size;
result->nb[1] = result->ne[0] * result->nb[0];
result->nb[2] = type_size;
}
int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_CONV_2D_DW;
result->src[0] = a;
result->src[1] = b;
return result;
}
// ggml_conv_transpose_2d_p0
static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {

View file

@ -1832,10 +1832,14 @@ static int GetBatchSize(int desiredBlasBatchSize,FileFormat in_file_format)
file_format == FileFormat::GPTJ_2 || file_format == FileFormat::GPTJ_2 ||
file_format == FileFormat::RWKV_1 || file_format == FileFormat::RWKV_1 ||
file_format==FileFormat::RWKV_2); file_format==FileFormat::RWKV_2);
if(!approved_format || desiredBlasBatchSize<=0) if(!approved_format && desiredBlasBatchSize>0)
{ {
desiredBlasBatchSize = 16; desiredBlasBatchSize = 16;
} }
if(desiredBlasBatchSize<=0)
{
desiredBlasBatchSize = 1;
}
if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_GENERIC) if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_GENERIC)
{ {
desiredBlasBatchSize = (desiredBlasBatchSize > 256 ? 256 : desiredBlasBatchSize); desiredBlasBatchSize = (desiredBlasBatchSize > 256 ? 256 : desiredBlasBatchSize);
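The rewritten fallback separates two cases the old condition lumped together: unsupported legacy formats are still bumped to 16, but a non-positive request is now clamped to 1 rather than 16. A reduced sketch of the resulting order of checks, with the file-format test stubbed out here as an assumption:

// illustrative restatement of the new clamping, not the actual koboldcpp code
static int get_blas_batch_size(int desired, bool approved_format) {
    if (!approved_format && desired > 0) {
        desired = 16;   // legacy formats keep the old fallback of 16
    }
    if (desired <= 0) {
        desired = 1;    // -1/0 now resolves to an actual batch size of 1
    }
    return desired;     // later clamps (e.g. the 256 cap above) still apply
}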

View file

@ -3861,8 +3861,8 @@ def show_gui():
tabcontent = {} tabcontent = {}
# slider data # slider data
blasbatchsize_values = ["-1", "32", "64", "128", "256", "512", "1024", "2048"] blasbatchsize_values = ["-1", "16", "32", "64", "128", "256", "512", "1024", "2048"]
blasbatchsize_text = ["Don't Batch BLAS","32","64","128","256","512","1024","2048"] blasbatchsize_text = ["Don't Batch BLAS", "16","32","64","128","256","512","1024","2048"]
contextsize_text = ["256", "512", "1024", "2048", "3072", "4096", "6144", "8192", "10240", "12288", "14336", "16384", "20480", "24576", "28672", "32768", "40960", "49152", "57344", "65536", "81920", "98304", "114688", "131072"] contextsize_text = ["256", "512", "1024", "2048", "3072", "4096", "6144", "8192", "10240", "12288", "14336", "16384", "20480", "24576", "28672", "32768", "40960", "49152", "57344", "65536", "81920", "98304", "114688", "131072"]
antirunopts = [opt.replace("Use ", "") for lib, opt in lib_option_pairs if opt not in runopts] antirunopts = [opt.replace("Use ", "") for lib, opt in lib_option_pairs if opt not in runopts]
quantkv_text = ["F16 (Off)","8-Bit","4-Bit"] quantkv_text = ["F16 (Off)","8-Bit","4-Bit"]
@ -6590,7 +6590,7 @@ if __name__ == '__main__':
advparser.add_argument("--version", help="Prints version and exits.", action='store_true') advparser.add_argument("--version", help="Prints version and exits.", action='store_true')
advparser.add_argument("--analyze", metavar=('[filename]'), help="Reads the metadata, weight types and tensor names in any GGUF file.", default="") advparser.add_argument("--analyze", metavar=('[filename]'), help="Reads the metadata, weight types and tensor names in any GGUF file.", default="")
advparser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+') advparser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+')
advparser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024,2048], default=512) advparser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,16,32,64,128,256,512,1024,2048], default=512)
advparser.add_argument("--blasthreads", help="Use a different number of threads during BLAS if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0) advparser.add_argument("--blasthreads", help="Use a different number of threads during BLAS if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0)
advparser.add_argument("--lora", help="LLAMA models only, applies a lora file on top of model. Experimental.", metavar=('[lora_filename]', '[lora_base]'), nargs='+') advparser.add_argument("--lora", help="LLAMA models only, applies a lora file on top of model. Experimental.", metavar=('[lora_filename]', '[lora_base]'), nargs='+')
advparser.add_argument("--noshift", help="If set, do not attempt to Trim and Shift the GGUF context.", action='store_true') advparser.add_argument("--noshift", help="If set, do not attempt to Trim and Shift the GGUF context.", action='store_true')

View file

@ -1,112 +0,0 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
Hello
__ggml_vocab_test__
(
__ggml_vocab_test__
=
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天
__ggml_vocab_test__
!!!!!!
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__
Cửa Việt
__ggml_vocab_test__
discards
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__

View file

@ -1,46 +0,0 @@
2014 1032 1052 1032 28504 6972
1070 7088 1258
1032
1256
1293
1009
1010
1267
4688
1009 1010
22177 4304
45383 4304
22177 5325
45383 5325
45383 5325 1033
22177 1044 4304 1033
45383 1044 4304 1033
1593 1395 119685 1166 1153 1046 51228
1119 1048 1052 1056 1032 1055 17391 23216 30203 7785 17279
3337 30757 1902 4200 63073 3671
1225 1158 1128 1225 1158 1182 1225 1158 1147 1225 1159 1139 1225 1158 1143 1225 1159 1130 1225 1158 1150 1225 1158 1183 1225 1158 1159 1225 21359 1225 1158 1159 1225 1158 1162 1225 1158 1182 1225 1158 1133 1225 1158 1129 1225 1158 1155 1225 1158 1133 1225 21359 1225 1158 1137
1240 1159 1154 1128 1319 13052 1041 119685 1152 1182 29568 1240 1159 1140 1171 1239 1184 1143 1319 88181 1873 3659 1275 56421 1621 1041 126241 1133 1319 11234 1873 26303 1455 1934 2246 3754 10835 1041
22177
45383
1032 45383
1256 45383
1293 45383
1293 45383 1010 1293 45383
1319
1010 1376
1039 4033
22177 1044 1404 48054 1033 3075 1584 1636 119685 1152 1129 3082 26060 2998 63614 82278 1049 1051 1049 1052 1049 1053 1049 6434 6749
7290 7290 7290
1051
1051 1051
1051 1051 1051
1051 1051 1051 1051
1051 1051 1051 1051 1051
1051 1051 1051 1051 1051 1051
1051 1051 1051 1051 1051 1051 1051
1051 1051 1051 1051 1051 1051 1051 1051
1051 1051 1051 1051 1051 1051 1051 1051 1051
1067 59503 28783
3724 4058
1010 1032 1267 1032 4688 1032 17152 1458 29356 1010 1256 1010 1293 1010 1260 1010 1652 1010 1240 1159 1154 1128 1319 13052 1041 119685 1152 1182 29568 1240 1159 1140 1171 1239 1184 1143 1319 88181 1873 3659 1275 56421 1621 1041 126241 1133 119685 1166 1153 1240 1159 1166 1153 1032 1051 1032 1051 1051 1032 1051 1051 1051 1032 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1051 1051 1051 1032 1051 1046 1051 1032 1051 1791 1051 1032 1051 2880 1051 71881 1158 1128 1225 1158 1182 1225 1158 1147 1225 1159 1139 1225 1158 1143 1225 1159 1130 1225 1158 1150 1225 1158 1183 1225 1158 1159 1225 21359 1225 1158 1159 1225 1158 1162 1225 1158 1182 1225 1158 1133 1240 1159 1152 1129 3082 26060 2998 63614 82278 1049 1051 1049 1052 1049 1053 1049 6434 6749 45577 1045 6626 43555 2843 30757 1902 4200 63073 3671 14931 20040 20040 1657 1657 1975 14135 14135 83923 7290 7290 7290 45509 45509 45509 1362 6483 2151 1576 1116 2189 1514 1681 2156 1044 1576 3609 1636 5257 1063 1576 1077 1605 5257 1362 7534 3180 1494 1044 1576 1068 1636 2479 2269 26883 1063 2837 1039 45654 1261 54297 1076