allow for single token prompt processing (actual batch size 1)

Concedo 2025-04-25 16:54:46 +08:00
commit 6b6597ebf1
24 changed files with 413 additions and 846 deletions

View file

@@ -39,6 +39,11 @@
using json = nlohmann::ordered_json;
std::initializer_list<enum llama_example> mmproj_examples = {
LLAMA_EXAMPLE_LLAVA,
// TODO: add LLAMA_EXAMPLE_SERVER when it's ready
};
common_arg & common_arg::set_examples(std::initializer_list<enum llama_example> examples) {
this->examples = std::move(examples);
return *this;
@@ -642,11 +647,16 @@ static struct common_hf_file_res common_get_hf_file(const std::string &, const s
// utils
//
static void common_params_handle_model(
struct handle_model_result {
bool found_mmproj = false;
common_params_model mmproj;
};
static handle_model_result common_params_handle_model(
struct common_params_model & model,
const std::string & bearer_token,
const std::string & model_path_default,
bool is_mmproj = false) { // TODO: move is_mmproj to an enum when we have more files?
const std::string & model_path_default) {
handle_model_result result;
// handle pre-fill default model path and url based on hf_repo and hf_file
{
if (!model.hf_repo.empty()) {
@@ -658,7 +668,12 @@ static void common_params_handle_model(
exit(1); // built without CURL, error message already printed
}
model.hf_repo = auto_detected.repo;
model.hf_file = is_mmproj ? auto_detected.mmprojFile : auto_detected.ggufFile;
model.hf_file = auto_detected.ggufFile;
if (!auto_detected.mmprojFile.empty()) {
result.found_mmproj = true;
result.mmproj.hf_repo = model.hf_repo;
result.mmproj.hf_file = auto_detected.mmprojFile;
}
} else {
model.hf_file = model.path;
}
@@ -695,6 +710,8 @@ static void common_params_handle_model(
exit(1);
}
}
return result;
}
const std::vector<ggml_type> kv_cache_types = {
@@ -828,16 +845,25 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}
common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
// handle model and download
{
auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
if (params.no_mmproj) {
params.mmproj = {};
} else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
// optionally, handle mmproj model when -hf is specified
params.mmproj = res.mmproj;
}
// only download mmproj if the current example is using it
for (auto & ex : mmproj_examples) {
if (ctx_arg.ex == ex) {
common_params_handle_model(params.mmproj, params.hf_token, "");
break;
}
}
common_params_handle_model(params.speculative.model, params.hf_token, "");
common_params_handle_model(params.vocoder.model, params.hf_token, "");
// allow --mmproj to be set from -hf
// assuming that mmproj is always in the same repo as text model
if (!params.model.hf_repo.empty() && ctx_arg.ex == LLAMA_EXAMPLE_LLAVA) {
params.mmproj.hf_repo = params.model.hf_repo;
}
}
common_params_handle_model(params.mmproj, params.hf_token, "", true);
if (params.escape) {
string_process_escapes(params.prompt);
@@ -969,7 +995,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
"llama-embedding",
"llama-eval-callback",
"llama-export-lora",
"llama-gbnf-validator",
"llama-gen-docs",
"llama-gguf",
"llama-gguf-hash",
@@ -989,7 +1014,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) {
"llama-perplexity",
"llama-q8dot",
"llama-quantize",
"llama-quantize-stats",
"llama-qwen2vl-cli",
"llama-retrieval",
"llama-run",
@@ -2096,18 +2120,32 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
add_opt(common_arg(
{"--mmproj"}, "FILE",
"path to a multimodal projector file for LLaVA. see examples/llava/README.md",
"path to a multimodal projector file. see examples/llava/README.md",
[](common_params & params, const std::string & value) {
params.mmproj.path = value;
}
).set_examples({LLAMA_EXAMPLE_LLAVA}));
).set_examples(mmproj_examples));
add_opt(common_arg(
{"--mmproj-url"}, "URL",
"URL to a multimodal projector file for LLaVA. see examples/llava/README.md",
"URL to a multimodal projector file. see examples/llava/README.md",
[](common_params & params, const std::string & value) {
params.mmproj.url = value;
}
).set_examples({LLAMA_EXAMPLE_LLAVA}));
).set_examples(mmproj_examples));
add_opt(common_arg(
{"--no-mmproj"},
"explicitly disable multimodal projector, useful when using -hf",
[](common_params & params) {
params.no_mmproj = true;
}
).set_examples(mmproj_examples));
add_opt(common_arg(
{"--no-mmproj-offload"},
"do not offload multimodal projector to GPU",
[](common_params & params) {
params.mmproj_use_gpu = false;
}
).set_examples(mmproj_examples));
add_opt(common_arg(
{"--image"}, "FILE",
"path to an image file. use with multimodal models. Specify multiple times for batching",
@@ -2382,6 +2420,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
add_opt(common_arg(
{"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
"Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
"mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n"
"example: unsloth/phi-4-GGUF:q4_k_m\n"
"(default: unused)",
[](common_params & params, const std::string & value) {
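As a usage sketch (not part of the diff itself): with these options, a single `-hf` flag pulls the text model and, when the repo ships one, the matching mmproj. The repo name below is the one already used in this commit's docs.

```bash
# downloads the text model and, if available, the matching mmproj
llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF

# skip the projector entirely
llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj

# keep the projector off the GPU
llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
```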

View file

@@ -338,6 +338,8 @@ struct common_params {
// multimodal models (see examples/llava)
struct common_params_model mmproj;
bool mmproj_use_gpu = true; // use GPU for multimodal model
bool no_mmproj = false; // explicitly disable multimodal model
std::vector<std::string> image; // path to image file(s)
// embedding

View file

@@ -1,51 +0,0 @@
# Gemma 3 vision
> [!IMPORTANT]
>
> This is very experimental and only intended for demo purposes.
## Quick start
You can use a pre-quantized model from [ggml-org](https://huggingface.co/ggml-org)'s Hugging Face account:
```bash
# build
cmake -B build
cmake --build build --target llama-mtmd-cli
# alternatively, install from brew (MacOS)
brew install llama.cpp
# run it
llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF
llama-mtmd-cli -hf ggml-org/gemma-3-12b-it-GGUF
llama-mtmd-cli -hf ggml-org/gemma-3-27b-it-GGUF
# note: 1B model does not support vision
```
## How to get mmproj.gguf?
Simply add the `--mmproj` flag when converting the model via `convert_hf_to_gguf.py`:
```bash
cd gemma-3-4b-it
python ../llama.cpp/convert_hf_to_gguf.py --outfile model.gguf --outtype f16 --mmproj .
# output file: mmproj-model.gguf
```
## How to run it?
What you need:
- The text model GGUF, which can be converted using `convert_hf_to_gguf.py`
- The mmproj file from the step above
- An image file
```bash
# build
cmake -B build
cmake --build build --target llama-mtmd-cli
# run it
./build/bin/llama-mtmd-cli -m {text_model}.gguf --mmproj mmproj.gguf --image your_image.jpg
```

View file

@@ -1,43 +0,0 @@
# GLMV-EDGE
Currently this implementation supports [glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b) and [glm-edge-v-5b](https://huggingface.co/THUDM/glm-edge-v-5b).
## Usage
Build the `llama-mtmd-cli` binary.
After building, run: `./llama-mtmd-cli` to see the usage. For example:
```sh
./llama-mtmd-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf
```
**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
**note**: For GPU offloading, make sure to use the `-ngl` flag as usual.
## GGUF conversion
1. Clone a GLMV-EDGE model ([2B](https://huggingface.co/THUDM/glm-edge-v-2b) or [5B](https://huggingface.co/THUDM/glm-edge-v-5b)). For example:
```sh
git clone https://huggingface.co/THUDM/glm-edge-v-5b or https://huggingface.co/THUDM/glm-edge-v-2b
```
2. Use `glmedge-surgery.py` to split the GLMV-EDGE model into its LLM and multimodal projector constituents:
```sh
python ./examples/llava/glmedge-surgery.py -m ../model_path
```
3. Use `glmedge-convert-image-encoder-to-gguf.py` to convert the GLMV-EDGE image encoder to GGUF:
```sh
python ./examples/llava/glmedge-convert-image-encoder-to-gguf.py -m ../model_path --llava-projector ../model_path/glm.projector --output-dir ../model_path
```
4. Use `examples/convert_hf_to_gguf.py` to convert the LLM part of GLMV-EDGE to GGUF:
```sh
python convert_hf_to_gguf.py ../model_path
```
Now both the LLM part and the image encoder are in the `model_path` directory.

View file

@@ -1,186 +0,0 @@
# Granite Vision
Download the model and point your `GRANITE_MODEL` environment variable to the path.
```bash
$ git clone https://huggingface.co/ibm-granite/granite-vision-3.2-2b
$ export GRANITE_MODEL=./granite-vision-3.2-2b
```
### 1. Running llava surgery v2.
First, we need to run the llava surgery script as shown below:
`python llava_surgery_v2.py -C -m $GRANITE_MODEL`
You should see two new files (`llava.clip` and `llava.projector`) written into your model's directory, as shown below.
```bash
$ ls $GRANITE_MODEL | grep -i llava
llava.clip
llava.projector
```
We should see that the projector and visual encoder get split out into the llava files. Quick check to make sure they aren't empty:
```python
import os
import torch
MODEL_PATH = os.getenv("GRANITE_MODEL")
if not MODEL_PATH:
raise ValueError("env var GRANITE_MODEL is unset!")
encoder_tensors = torch.load(os.path.join(MODEL_PATH, "llava.clip"))
projector_tensors = torch.load(os.path.join(MODEL_PATH, "llava.projector"))
assert len(encoder_tensors) > 0
assert len(projector_tensors) > 0
```
If you actually inspect the `.keys()` of the loaded tensors, you should see a lot of `vision_model` tensors in the `encoder_tensors`, and 5 tensors (`'multi_modal_projector.linear_1.bias'`, `'multi_modal_projector.linear_1.weight'`, `'multi_modal_projector.linear_2.bias'`, `'multi_modal_projector.linear_2.weight'`, `'image_newline'`) in the multimodal `projector_tensors`.
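A quick, purely illustrative way to eyeball those keys, continuing from the snippet above:

```python
# continuing from the snippet above (hypothetical check, not part of the original doc)
print(len(encoder_tensors), "encoder tensors")
print(sorted(projector_tensors.keys()))
# expect the four multi_modal_projector.linear_* tensors plus 'image_newline'
```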
### 2. Creating the Visual Component GGUF
Next, create a new directory to hold the visual components, and copy the llava.clip/projector files, as shown below.
```bash
$ ENCODER_PATH=$PWD/visual_encoder
$ mkdir $ENCODER_PATH
$ cp $GRANITE_MODEL/llava.clip $ENCODER_PATH/pytorch_model.bin
$ cp $GRANITE_MODEL/llava.projector $ENCODER_PATH/
```
Now, we need to write a config for the visual encoder. In order to convert the model, be sure to use the correct `image_grid_pinpoints`, as these may vary based on the model. You can find the `image_grid_pinpoints` in `$GRANITE_MODEL/config.json`.
```json
{
"_name_or_path": "siglip-model",
"architectures": [
"SiglipVisionModel"
],
"image_grid_pinpoints": [
[384,384],
[384,768],
[384,1152],
[384,1536],
[384,1920],
[384,2304],
[384,2688],
[384,3072],
[384,3456],
[384,3840],
[768,384],
[768,768],
[768,1152],
[768,1536],
[768,1920],
[1152,384],
[1152,768],
[1152,1152],
[1536,384],
[1536,768],
[1920,384],
[1920,768],
[2304,384],
[2688,384],
[3072,384],
[3456,384],
[3840,384]
],
"mm_patch_merge_type": "spatial_unpad",
"hidden_size": 1152,
"image_size": 384,
"intermediate_size": 4304,
"model_type": "siglip_vision_model",
"num_attention_heads": 16,
"num_hidden_layers": 27,
"patch_size": 14,
"layer_norm_eps": 1e-6,
"hidden_act": "gelu_pytorch_tanh",
"projection_dim": 0,
"vision_feature_layer": [-24, -20, -12, -1]
}
```
At this point you should have something like this:
```bash
$ ls $ENCODER_PATH
config.json llava.projector pytorch_model.bin
```
Now convert the components to GGUF. Note that we also override the image mean/std dev to `[.5,.5,.5]` since we use the SigLIP visual encoder; in the transformers model, you can find these numbers in `preprocessor_config.json`.
```bash
$ python convert_image_encoder_to_gguf.py \
-m $ENCODER_PATH \
--llava-projector $ENCODER_PATH/llava.projector \
--output-dir $ENCODER_PATH \
--clip-model-is-vision \
--clip-model-is-siglip \
--image-mean 0.5 0.5 0.5 \
--image-std 0.5 0.5 0.5
```
This will create the first GGUF file at `$ENCODER_PATH/mmproj-model-f16.gguf`; we will refer to the absolute path of this file as `$VISUAL_GGUF_PATH`.
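For example, you could record that path in the variable used in step 5 below:

```bash
$ export VISUAL_GGUF_PATH=$ENCODER_PATH/mmproj-model-f16.gguf
```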
### 3. Creating the LLM GGUF.
The granite vision model contains a granite LLM as its language model. For now, the easiest way to get the GGUF for the LLM is by loading the composite model in `transformers` and exporting the LLM so that it can be directly converted with the normal conversion path.
First, set `LLM_EXPORT_PATH` to the path where the `transformers` LLM will be exported.
```bash
$ export LLM_EXPORT_PATH=$PWD/granite_vision_llm
```
```python
import os
import transformers
MODEL_PATH = os.getenv("GRANITE_MODEL")
if not MODEL_PATH:
raise ValueError("env var GRANITE_MODEL is unset!")
LLM_EXPORT_PATH = os.getenv("LLM_EXPORT_PATH")
if not LLM_EXPORT_PATH:
raise ValueError("env var LLM_EXPORT_PATH is unset!")
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
# NOTE: granite vision support was added to transformers very recently (4.49);
# if you get size mismatches, your version is too old.
# If you are running with an older version, set `ignore_mismatched_sizes=True`
# as shown below; it won't be loaded correctly, but the LLM part of the model that
# we are exporting will be loaded correctly.
model = transformers.AutoModelForImageTextToText.from_pretrained(MODEL_PATH, ignore_mismatched_sizes=True)
tokenizer.save_pretrained(LLM_EXPORT_PATH)
model.language_model.save_pretrained(LLM_EXPORT_PATH)
```
Now you can convert the exported LLM to GGUF with the normal converter in the root of the llama.cpp project.
```bash
$ LLM_GGUF_PATH=$LLM_EXPORT_PATH/granite_llm.gguf
...
$ python convert_hf_to_gguf.py --outfile $LLM_GGUF_PATH $LLM_EXPORT_PATH
```
### 4. Quantization
If you want to quantize the LLM, you can do so with `llama-quantize` as you would any other LLM. For example:
```bash
$ ./build/bin/llama-quantize $LLM_EXPORT_PATH/granite_llm.gguf $LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf Q4_K_M
$ LLM_GGUF_PATH=$LLM_EXPORT_PATH/granite_llm_q4_k_m.gguf
```
Note that currently you cannot quantize the visual encoder because granite vision models use SigLIP as the visual encoder, which has tensor dimensions that are not divisible by 32.
### 5. Running the Model in llama.cpp
Build llama.cpp normally; you should have a target binary named `llama-mtmd-cli`, to which you pass the two GGUF files. As an example, we pass the llama.cpp banner.
```bash
$ ./build/bin/llama-mtmd-cli -m $LLM_GGUF_PATH \
--mmproj $VISUAL_GGUF_PATH \
-c 16384 \
--temp 0
```

View file

@@ -1,143 +0,0 @@
# LLaVA
Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants,
as well as [llava-v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) variants.
The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
models are available.
For llava-1.6, a variety of prepared gguf models are available as well ([7b-34b](https://huggingface.co/cmp-nct/llava-1.6-gguf)).
After the API is confirmed, more models will be supported / uploaded.
## Usage
Build the `llama-mtmd-cli` binary.
After building, run: `./llama-mtmd-cli` to see the usage. For example:
```sh
./llama-mtmd-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf \
--mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf \
--chat-template vicuna
```
**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
**note**: For GPU offloading, make sure to use the `-ngl` flag as usual.
## LLaVA 1.5
1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:
```sh
git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
git clone https://huggingface.co/openai/clip-vit-large-patch14-336
```
2. Install the required Python packages:
```sh
pip install -r examples/llava/requirements.txt
```
3. Use `llava_surgery.py` to split the LLaVA model into its LLaMA and multimodal projector constituents:
```sh
python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b
```
4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF:
```sh
python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
```
5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown
```
Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
## LLaVA 1.6 gguf conversion
1) First clone a LLaVA 1.6 model:
```console
git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b
```
2) Install the required Python packages:
```sh
pip install -r examples/llava/requirements.txt
```
3) Use `llava_surgery_v2.py`, which also supports llava-1.5 variants, for both PyTorch and safetensors models:
```console
python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/
```
- you will find a llava.projector and a llava.clip file in your model directory
4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory:
```console
mkdir vit
cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin
cp ../llava-v1.6-vicuna-7b/llava.projector vit/
curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json
```
5) Create the visual gguf model:
```console
python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision
```
- This is similar to llava-1.5; the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
6) Then convert the model to gguf format:
```console
python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
```
7) And finally we can run the CLI using the 1.6 model version:
```console
./llama-mtmd-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf
```
**note** llava-1.6 needs more context than llava-1.5; at least 3000 is needed (just run it at -c 4096)
**note** llava-1.6 greatly benefits from batched prompt processing (defaults work)
**note** if the language model in step `6)` is incompatible with the legacy conversion script, the easiest way to handle the LLM conversion is to load the model in transformers and export only the LLM from the llava-next model.
```python
import os
import transformers
model_path = ...
llm_export_path = ...
tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
model = transformers.AutoModelForImageTextToText.from_pretrained(model_path)
tokenizer.save_pretrained(llm_export_path)
model.language_model.save_pretrained(llm_export_path)
```
Then, you can convert the LLM using the `convert_hf_to_gguf.py` script, which handles more LLM architectures.
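A sketch of that conversion (the export directory and output file name below are only placeholders, assuming `llm_export_path` from the snippet above pointed at `./llava-1.6-llm/`):

```console
python ./convert_hf_to_gguf.py ./llava-1.6-llm/ --outfile llava-1.6-llm-f16.gguf --outtype f16
```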
## Chat template
For llava-1.5 and llava-1.6, you need to use the `vicuna` chat template. Simply add `--chat-template vicuna` to activate it.
## How to know if you are running in llava-1.5 or llava-1.6 mode
When running the CLI you will see the visual information right before the prompt is processed:
**Llava-1.5:**
`encode_image_with_clip: image embedding created: 576 tokens`
**Llava-1.6 (anything above 576):**
`encode_image_with_clip: image embedding created: 2880 tokens`
Alternatively, just note how many "tokens" were used for your prompt; llava-1.6 will show 1000+ tokens.

View file

@@ -1,48 +0,0 @@
## MiniCPM-o 2.6
Currently, this readme only covers minicpm-omni's image capabilities; full-mode support will be added as soon as possible.
### Prepare models and code
Download [MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6) PyTorch model from huggingface to "MiniCPM-o-2_6" folder.
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```
Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
### Usage of MiniCPM-o 2.6
Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) files we provide)
```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-o-2_6
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
# quantize int4 version
./build/bin/llama-quantize ../MiniCPM-o-2_6/model/ggml-model-f16.gguf ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Inference on Linux or Mac
```bash
# run in single-turn mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run in conversation mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf
```

View file

@@ -1,47 +0,0 @@
## MiniCPM-Llama3-V 2.5
### Prepare models and code
Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5) PyTorch model from huggingface to "MiniCPM-Llama3-V-2_5" folder.
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```
Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
### Usage of MiniCPM-Llama3-V 2.5
Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) files we provide)
```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
# quantize int4 version
./build/bin/llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Inference on Linux or Mac
```bash
# run in single-turn mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run in conversation mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf
```

View file

@@ -1,47 +0,0 @@
## MiniCPM-V 2.6
### Prepare models and code
Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from huggingface to "MiniCPM-V-2_6" folder.
### Build llama.cpp
Readme modification time: 20250206
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
```
Build llama.cpp using `CMake`:
```bash
cmake -B build
cmake --build build --config Release
```
### Usage of MiniCPM-V 2.6
Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) files we provide)
```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
# quantize int4 version
./build/bin/llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```
Inference on Linux or Mac
```bash
# run in single-turn mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run in conversation mode
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf
```

View file

@@ -95,8 +95,6 @@
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
#define TN_GLM_BOI_W "adapter.boi"
#define TN_GLM_EOI_W "adapter.eoi"
enum projector_type {
PROJECTOR_TYPE_MLP,

View file

@@ -263,8 +263,6 @@ struct clip_vision_model {
//GLMV-Edge projection
struct ggml_tensor * mm_model_adapter_conv_w = nullptr;
struct ggml_tensor * mm_model_adapter_conv_b = nullptr;
struct ggml_tensor * boi_w = nullptr;
struct ggml_tensor * eoi_w = nullptr;
// MobileVLM projection
struct ggml_tensor * mm_model_mlp_1_w = nullptr;
@@ -1822,8 +1820,6 @@ struct clip_model_loader {
vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight"));
vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
vision_model.boi_w = get_tensor(TN_GLM_BOI_W);
vision_model.eoi_w = get_tensor(TN_GLM_EOI_W);
} break;
case PROJECTOR_TYPE_MERGER:
{
@@ -2805,8 +2801,7 @@ void clip_free(clip_ctx * ctx) {
}
size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
int extra_tokens = ctx->has_glm_projector ? 2 : 0;
return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
}
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
@@ -3006,9 +3001,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
if (ctx->has_glm_projector) {
GGML_ASSERT(batch_size == 1);
ggml_tensor * boi = ctx->vision_model.boi_w;
ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi));
vec = (float*)(vec+ggml_nelements(boi)); //offset for boi
}
// build the inference graph
@@ -3341,13 +3333,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// copy the embeddings to the location passed by the user
ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
if (ctx->has_glm_projector) {
//eoi
ggml_tensor * eoi = ctx->vision_model.eoi_w;
int offset = ggml_nelements(embeddings);
ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi));
}
return true;
}

View file

@@ -40,7 +40,8 @@ static void show_additional_info(int /*argc*/, char ** argv) {
"Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n"
" -m and --mmproj are required\n"
" -hf user/repo can replace both -m and --mmproj in most cases\n"
" --image and -p are optional, if NOT provided, the CLI will run in chat mode\n",
" --image and -p are optional, if NOT provided, the CLI will run in chat mode\n"
" to disable using GPU for mmproj model, add --no-mmproj-offload\n",
argv[0]
);
}
@@ -112,10 +113,10 @@ struct mtmd_cli_context {
void init_vision_context(common_params & params) {
const char * clip_path = params.mmproj.path.c_str();
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
/* use_gpu */ true,
/* use_gpu */ params.mmproj_use_gpu,
/* timings */ true,
/* n_threads */ params.cpuparams.n_threads,
/* verbosity */ GGML_LOG_LEVEL_INFO,
/* verbosity */ params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO,
}));
if (!ctx_vision.get()) {
LOG_ERR("Failed to load vision model from %s\n", clip_path);
@@ -261,6 +262,7 @@ int main(int argc, char ** argv) {
if (params.mmproj.path.empty()) {
show_additional_info(argc, argv);
LOG_ERR("ERR: Missing --mmproj argument\n");
return 1;
}

View file

@@ -186,6 +186,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
// <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
marker_modified = "<|begin_of_image|>" + ctx->image_marker + "<|end_of_image|>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";

View file

@@ -487,6 +487,7 @@ extern "C" {
GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_CONV_2D_DW,
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
GGML_OP_POOL_2D,
@@ -690,6 +691,9 @@ extern "C" {
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
// true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor);
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
@@ -1673,7 +1677,7 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
// depthwise
// depthwise (via im2col and mul_mat)
GGML_API struct ggml_tensor * ggml_conv_2d_dw(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
@@ -1685,6 +1689,22 @@ extern "C" {
int d0, // dilation dimension 0
int d1); // dilation dimension 1
// Depthwise 2D convolution
// may be faster than ggml_conv_2d_dw, but not available in all backends
// a: KW KH 1 C convolution kernel
// b: W H C N input data
// res: W_out H_out C N
GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int stride0,
int stride1,
int pad0,
int pad1,
int dilation0,
int dilation1);
GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
struct ggml_context * ctx,
struct ggml_tensor * a,
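To show how the new op is meant to be called (a sketch, not code from this commit; the tensor sizes and the `ctx0` context are assumed):

```c
// kernel a: KW x KH x 1 x C, input b: W x H x C x N, both F32
struct ggml_tensor * knl = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32,  3,  3,  1, 64);
struct ggml_tensor * inp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 28, 28, 64,  1);
// stride 1, padding 1, dilation 1 -> result is 28 x 28 x 64 x 1
struct ggml_tensor * res = ggml_conv_2d_dw_direct(ctx0, knl, inp, 1, 1, 1, 1, 1, 1);
```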

View file

@@ -1946,6 +1946,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_im2col_back_f32(params, tensor);
} break;
case GGML_OP_CONV_2D_DW:
{
ggml_compute_forward_conv_2d_dw(params, tensor);
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
{
ggml_compute_forward_conv_transpose_2d(params, tensor);
@@ -2282,6 +2286,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_IM2COL:
case GGML_OP_IM2COL_BACK:
case GGML_OP_CONV_2D_DW:
case GGML_OP_CONV_TRANSPOSE_1D:
case GGML_OP_CONV_TRANSPOSE_2D:
{

View file

@@ -6064,6 +6064,178 @@
}
}
// ggml_compute_forward_conv_2d_dw
struct ggml_conv_2d_dw_params {
int64_t channels;
int64_t batch;
int64_t src_w;
int64_t src_h;
int64_t dst_w;
int64_t dst_h;
int64_t knl_w;
int64_t knl_h;
int stride_x;
int stride_y;
int pad_x;
int pad_y;
int dilation_x;
int dilation_y;
};
static void ggml_compute_forward_conv_2d_dw_cwhn(
const ggml_compute_params * params,
const ggml_tensor * src,
const ggml_tensor * kernel,
ggml_tensor * dst,
const ggml_conv_2d_dw_params & p) {
const int64_t c = p.channels;
const float * knl_data = (const float *)kernel->data;
const int64_t rows_total = p.dst_h * p.batch;
const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth;
const int64_t row_start = params->ith * rows_per_thread;
const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);
#ifdef GGML_SIMD
const int64_t pkg_size = GGML_F32_EPR;
const int64_t pkg_count = c / pkg_size;
const int64_t c_pkg_end = pkg_count * pkg_size;
#else
const int64_t c_pkg_end = 0;
#endif
for (int64_t row = row_start; row < row_end; ++row) {
const int64_t dst_y = row % p.dst_h;
const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c;
for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c;
const int64_t src_y_base = dst_y * p.stride_y - p.pad_y;
const int64_t src_x_base = dst_x * p.stride_x - p.pad_x;
#ifdef GGML_SIMD
// Vectorized loop
for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) {
GGML_F32_VEC sum = GGML_F32_VEC_ZERO;
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
const int64_t src_y = src_y_base + knl_y * p.dilation_y;
if (src_y < 0 || src_y >= p.src_h) {
continue;
}
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
const int64_t src_x = src_x_base + knl_x * p.dilation_x;
if (src_x < 0 || src_x >= p.src_w) {
continue;
}
GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i);
GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i);
sum = GGML_F32_VEC_FMA(sum, k, s);
}
}
GGML_F32_VEC_STORE(dst_data + c_i, sum);
}
#endif
// Scalar loop
for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) {
float sum = 0.0f;
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
const int64_t src_y = src_y_base + knl_y * p.dilation_y;
if (src_y < 0 || src_y >= p.src_h) {
continue;
}
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
const int64_t src_x = src_x_base + knl_x * p.dilation_x;
if (src_x < 0 || src_x >= p.src_w) {
continue;
}
sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i]
* src_data[(src_y * p.src_w + src_x) * c + c_i];
}
}
dst_data[c_i] = sum;
}
}
}
}
static void ggml_compute_forward_conv_2d_dw_whcn(
const ggml_compute_params * params,
const ggml_tensor * src,
const ggml_tensor * kernel,
ggml_tensor * dst,
const ggml_conv_2d_dw_params & p) {
const int64_t n = p.channels * p.batch;
const int64_t per_thread = (n + params->nth - 1) / params->nth;
const int64_t start = params->ith * per_thread;
const int64_t end = MIN(start + per_thread, n);
for (int64_t i = start; i < end; ++i) {
const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h;
const float * src_data = (const float *)src->data + i * p.src_w * p.src_h;
float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h;
for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) {
for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) {
float sum = 0.0f;
for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) {
const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y;
if (src_y < 0 || src_y >= p.src_h) {
continue;
}
for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) {
const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x;
if (src_x < 0 || src_x >= p.src_w) {
continue;
}
sum += knl_data[knl_y * p.knl_w + knl_x]
* src_data[src_y * p.src_w + src_x];
}
}
dst_data[dst_y * p.dst_w + dst_x] = sum;
}
}
}
}
void ggml_compute_forward_conv_2d_dw(
const ggml_compute_params * params,
ggml_tensor * dst) {
const ggml_tensor * kernel = dst->src[0];
const ggml_tensor * src = dst->src[1];
ggml_conv_2d_dw_params p;
p.channels = src->ne[2];
p.batch = src->ne[3];
p.src_w = src->ne[0];
p.src_h = src->ne[1];
p.dst_w = dst->ne[0];
p.dst_h = dst->ne[1];
p.knl_w = kernel->ne[0];
p.knl_h = kernel->ne[1];
p.stride_x = dst->op_params[0];
p.stride_y = dst->op_params[1];
p.pad_x = dst->op_params[2];
p.pad_y = dst->op_params[3];
p.dilation_x = dst->op_params[4];
p.dilation_y = dst->op_params[5];
GGML_ASSERT(kernel->ne[3] == p.channels);
GGML_ASSERT(dst->ne[3] == p.batch);
if (ggml_is_contiguous(src)) {
ggml_compute_forward_conv_2d_dw_whcn(params, src, kernel, dst, p);
} else if (ggml_is_contiguous_channels(src)) {
// kernel should also have channels most contiguous in memory
GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]);
ggml_compute_forward_conv_2d_dw_cwhn(params, src, kernel, dst, p);
} else {
GGML_ABORT("non-contiguous memory layout not supported");
}
}
// ggml_compute_forward_pool_1d_sk_p0
static void ggml_compute_forward_pool_1d_sk_p0(

View file

@@ -65,6 +65,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p
void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);

View file

@@ -156,25 +156,27 @@ static constexpr __device__ int get_mmq_y_device() {
#define MMQ_DP4A_TXS_Q6_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI6_K + mmq_y/QI6_K, mmq_y*WARP_SIZE/8 + mmq_y/8}
static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml_type type, int mmq_y) {
return type == GGML_TYPE_Q4_0 ? MMQ_DP4A_TXS_Q4_0 :
type == GGML_TYPE_Q4_1 ? MMQ_DP4A_TXS_Q4_1 :
type == GGML_TYPE_Q5_0 ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_Q5_1 ? MMQ_DP4A_TXS_Q8_1 :
type == GGML_TYPE_Q8_0 ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_Q2_K ? MMQ_DP4A_TXS_Q2_K :
type == GGML_TYPE_Q3_K ? MMQ_DP4A_TXS_Q3_K :
type == GGML_TYPE_Q4_K ? MMQ_DP4A_TXS_Q4_K :
type == GGML_TYPE_Q5_K ? MMQ_DP4A_TXS_Q5_K :
type == GGML_TYPE_Q6_K ? MMQ_DP4A_TXS_Q6_K :
type == GGML_TYPE_IQ2_XXS ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_IQ2_XS ? MMQ_DP4A_TXS_Q8_0_16 :
type == GGML_TYPE_IQ2_S ? MMQ_DP4A_TXS_Q8_0_16 :
type == GGML_TYPE_IQ3_XXS ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_IQ3_S ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_IQ1_S ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_IQ4_XS ? MMQ_DP4A_TXS_Q8_0 :
type == GGML_TYPE_IQ4_NL ? MMQ_DP4A_TXS_Q8_0 :
tile_x_sizes{0, 0, 0};
switch (type) {
case GGML_TYPE_Q4_0: return MMQ_DP4A_TXS_Q4_0;
case GGML_TYPE_Q4_1: return MMQ_DP4A_TXS_Q4_1;
case GGML_TYPE_Q5_0: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_Q5_1: return MMQ_DP4A_TXS_Q8_1;
case GGML_TYPE_Q8_0: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_Q2_K: return MMQ_DP4A_TXS_Q2_K;
case GGML_TYPE_Q3_K: return MMQ_DP4A_TXS_Q3_K;
case GGML_TYPE_Q4_K: return MMQ_DP4A_TXS_Q4_K;
case GGML_TYPE_Q5_K: return MMQ_DP4A_TXS_Q5_K;
case GGML_TYPE_Q6_K: return MMQ_DP4A_TXS_Q6_K;
case GGML_TYPE_IQ2_XXS: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_IQ2_XS: return MMQ_DP4A_TXS_Q8_0_16;
case GGML_TYPE_IQ2_S: return MMQ_DP4A_TXS_Q8_0_16;
case GGML_TYPE_IQ3_XXS: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_IQ3_S: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_IQ1_S: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_IQ4_XS: return MMQ_DP4A_TXS_Q8_0;
case GGML_TYPE_IQ4_NL: return MMQ_DP4A_TXS_Q8_0;
default: return tile_x_sizes{0, 0, 0};
}
}
#define MMQ_MMA_TILE_X_K_Q8_0 (2*WARP_SIZE + 2*WARP_SIZE/QI8_0 + 4)
@@ -190,25 +192,27 @@ static_assert(MMQ_MMA_TILE_X_K_Q3_K % 8 == 4, "Wrong padding.");
static_assert(MMQ_MMA_TILE_X_K_Q6_K % 8 == 4, "Wrong padding.");
static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
return type == GGML_TYPE_Q4_0 ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_Q4_1 ? MMQ_MMA_TILE_X_K_Q8_1 :
type == GGML_TYPE_Q5_0 ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_Q5_1 ? MMQ_MMA_TILE_X_K_Q8_1 :
type == GGML_TYPE_Q8_0 ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_Q2_K ? MMQ_MMA_TILE_X_K_Q2_K :
type == GGML_TYPE_Q3_K ? MMQ_MMA_TILE_X_K_Q3_K :
type == GGML_TYPE_Q4_K ? MMQ_MMA_TILE_X_K_Q8_1 :
type == GGML_TYPE_Q5_K ? MMQ_MMA_TILE_X_K_Q8_1 :
type == GGML_TYPE_Q6_K ? MMQ_MMA_TILE_X_K_Q6_K :
type == GGML_TYPE_IQ2_XXS ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_IQ2_XS ? MMQ_MMA_TILE_X_K_Q3_K :
type == GGML_TYPE_IQ2_S ? MMQ_MMA_TILE_X_K_Q3_K :
type == GGML_TYPE_IQ3_XXS ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_IQ3_S ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_IQ1_S ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_IQ4_XS ? MMQ_MMA_TILE_X_K_Q8_0 :
type == GGML_TYPE_IQ4_NL ? MMQ_MMA_TILE_X_K_Q8_0 :
0;
switch (type) {
case GGML_TYPE_Q4_0: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_Q4_1: return MMQ_MMA_TILE_X_K_Q8_1;
case GGML_TYPE_Q5_0: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_Q5_1: return MMQ_MMA_TILE_X_K_Q8_1;
case GGML_TYPE_Q8_0: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_Q2_K: return MMQ_MMA_TILE_X_K_Q2_K;
case GGML_TYPE_Q3_K: return MMQ_MMA_TILE_X_K_Q3_K;
case GGML_TYPE_Q4_K: return MMQ_MMA_TILE_X_K_Q8_1;
case GGML_TYPE_Q5_K: return MMQ_MMA_TILE_X_K_Q8_1;
case GGML_TYPE_Q6_K: return MMQ_MMA_TILE_X_K_Q6_K;
case GGML_TYPE_IQ2_XXS: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_IQ2_XS: return MMQ_MMA_TILE_X_K_Q3_K;
case GGML_TYPE_IQ2_S: return MMQ_MMA_TILE_X_K_Q3_K;
case GGML_TYPE_IQ3_XXS: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_IQ3_S: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_IQ1_S: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_IQ4_XS: return MMQ_MMA_TILE_X_K_Q8_0;
case GGML_TYPE_IQ4_NL: return MMQ_MMA_TILE_X_K_Q8_0;
default: return 0;
}
}
#define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1)

View file

@@ -7,47 +7,51 @@
typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs);
static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 :
type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 :
type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 :
type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 :
type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 :
type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 :
type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 :
type == GGML_TYPE_Q4_K ? vec_dot_q4_K_q8_1 :
type == GGML_TYPE_Q5_K ? vec_dot_q5_K_q8_1 :
type == GGML_TYPE_Q6_K ? vec_dot_q6_K_q8_1 :
type == GGML_TYPE_IQ2_XXS ? vec_dot_iq2_xxs_q8_1 :
type == GGML_TYPE_IQ2_XS ? vec_dot_iq2_xs_q8_1 :
type == GGML_TYPE_IQ2_S ? vec_dot_iq2_s_q8_1 :
type == GGML_TYPE_IQ3_XXS ? vec_dot_iq3_xxs_q8_1 :
type == GGML_TYPE_IQ1_S ? vec_dot_iq1_s_q8_1 :
type == GGML_TYPE_IQ1_M ? vec_dot_iq1_m_q8_1 :
type == GGML_TYPE_IQ4_NL ? vec_dot_iq4_nl_q8_1 :
type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 :
type == GGML_TYPE_IQ3_S ? vec_dot_iq3_s_q8_1 :
nullptr;
switch (type) {
case GGML_TYPE_Q4_0: return vec_dot_q4_0_q8_1;
case GGML_TYPE_Q4_1: return vec_dot_q4_1_q8_1;
case GGML_TYPE_Q5_0: return vec_dot_q5_0_q8_1;
case GGML_TYPE_Q5_1: return vec_dot_q5_1_q8_1;
case GGML_TYPE_Q8_0: return vec_dot_q8_0_q8_1;
case GGML_TYPE_Q2_K: return vec_dot_q2_K_q8_1;
case GGML_TYPE_Q3_K: return vec_dot_q3_K_q8_1;
case GGML_TYPE_Q4_K: return vec_dot_q4_K_q8_1;
case GGML_TYPE_Q5_K: return vec_dot_q5_K_q8_1;
case GGML_TYPE_Q6_K: return vec_dot_q6_K_q8_1;
case GGML_TYPE_IQ2_XXS: return vec_dot_iq2_xxs_q8_1;
case GGML_TYPE_IQ2_XS: return vec_dot_iq2_xs_q8_1;
case GGML_TYPE_IQ2_S: return vec_dot_iq2_s_q8_1;
case GGML_TYPE_IQ3_XXS: return vec_dot_iq3_xxs_q8_1;
case GGML_TYPE_IQ1_S: return vec_dot_iq1_s_q8_1;
case GGML_TYPE_IQ1_M: return vec_dot_iq1_m_q8_1;
case GGML_TYPE_IQ4_NL: return vec_dot_iq4_nl_q8_1;
case GGML_TYPE_IQ4_XS: return vec_dot_iq4_xs_q8_1;
case GGML_TYPE_IQ3_S: return vec_dot_iq3_s_q8_1;
default: return nullptr;
}
}
static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ :
type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ :
type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ :
type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ :
type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ :
type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ :
type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ :
type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ :
type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ :
type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ :
type == GGML_TYPE_IQ2_XXS ? VDR_IQ2_XXS_Q8_1_MMVQ :
type == GGML_TYPE_IQ2_XS ? VDR_IQ2_XS_Q8_1_MMVQ :
type == GGML_TYPE_IQ2_S ? VDR_IQ2_S_Q8_1_MMVQ :
type == GGML_TYPE_IQ3_XXS ? VDR_IQ3_XXS_Q8_1_MMVQ :
type == GGML_TYPE_IQ3_S ? VDR_IQ3_S_Q8_1_MMVQ :
type == GGML_TYPE_IQ4_NL ? VDR_IQ4_NL_Q8_1_MMVQ :
type == GGML_TYPE_IQ4_XS ? VDR_IQ4_XS_Q8_1_MMVQ :
1;
switch (type) {
case GGML_TYPE_Q4_0: return VDR_Q4_0_Q8_1_MMVQ;
case GGML_TYPE_Q4_1: return VDR_Q4_1_Q8_1_MMVQ;
case GGML_TYPE_Q5_0: return VDR_Q5_0_Q8_1_MMVQ;
case GGML_TYPE_Q5_1: return VDR_Q5_1_Q8_1_MMVQ;
case GGML_TYPE_Q8_0: return VDR_Q8_0_Q8_1_MMVQ;
case GGML_TYPE_Q2_K: return VDR_Q2_K_Q8_1_MMVQ;
case GGML_TYPE_Q3_K: return VDR_Q3_K_Q8_1_MMVQ;
case GGML_TYPE_Q4_K: return VDR_Q4_K_Q8_1_MMVQ;
case GGML_TYPE_Q5_K: return VDR_Q5_K_Q8_1_MMVQ;
case GGML_TYPE_Q6_K: return VDR_Q6_K_Q8_1_MMVQ;
case GGML_TYPE_IQ2_XXS: return VDR_IQ2_XXS_Q8_1_MMVQ;
case GGML_TYPE_IQ2_XS: return VDR_IQ2_XS_Q8_1_MMVQ;
case GGML_TYPE_IQ2_S: return VDR_IQ2_S_Q8_1_MMVQ;
case GGML_TYPE_IQ3_XXS: return VDR_IQ3_XXS_Q8_1_MMVQ;
case GGML_TYPE_IQ3_S: return VDR_IQ3_S_Q8_1_MMVQ;
case GGML_TYPE_IQ4_NL: return VDR_IQ4_NL_Q8_1_MMVQ;
case GGML_TYPE_IQ4_XS: return VDR_IQ4_XS_Q8_1_MMVQ;
default: return 1;
}
}
enum mmvq_parameter_table_id {

View file

@@ -969,6 +969,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CONV_TRANSPOSE_1D",
"IM2COL",
"IM2COL_BACK",
"CONV_2D_DW",
"CONV_TRANSPOSE_2D",
"POOL_1D",
"POOL_2D",
@@ -1006,7 +1007,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"OPT_STEP_ADAMW",
};
static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1063,6 +1064,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"conv_transpose_1d(x)",
"im2col(x)",
"im2col_back(x)",
"conv_2d_dw(x)",
"conv_transpose_2d(x)",
"pool_1d(x)",
"pool_2d(x)",
@@ -1100,7 +1102,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"adamw(x)",
};
static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -1357,6 +1359,13 @@ bool ggml_is_permuted(const struct ggml_tensor * tensor) {
return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
}
bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) {
return
tensor->nb[0] > tensor->nb[2] &&
tensor->nb[1] > tensor->nb[0] &&
tensor->nb[2] == ggml_type_size(tensor->type);
}
static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
@@ -4063,6 +4072,46 @@ struct ggml_tensor * ggml_conv_2d_dw(
return result;
}
// ggml_conv_2d_dw_direct
struct ggml_tensor * ggml_conv_2d_dw_direct(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int stride0,
int stride1,
int pad0,
int pad1,
int dilation0,
int dilation1) {
GGML_ASSERT(a->ne[2] == 1);
GGML_ASSERT(a->ne[3] == b->ne[2]);
int64_t ne[4];
ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], stride0, pad0, dilation0);
ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], stride1, pad1, dilation1);
ne[2] = b->ne[2];
ne[3] = b->ne[3];
struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);
if (ggml_is_contiguous_channels(b)) {
// Result will be permuted the same way as input (CWHN order)
const int64_t type_size = ggml_type_size(result->type);
GGML_ASSERT(ggml_blck_size(result->type) == 1);
result->nb[0] = result->ne[2] * type_size;
result->nb[1] = result->ne[0] * result->nb[0];
result->nb[2] = type_size;
}
int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_CONV_2D_DW;
result->src[0] = a;
result->src[1] = b;
return result;
}
// ggml_conv_transpose_2d_p0
static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {

View file

@ -1832,10 +1832,14 @@ static int GetBatchSize(int desiredBlasBatchSize,FileFormat in_file_format)
file_format == FileFormat::GPTJ_2 || file_format == FileFormat::GPTJ_2 ||
file_format == FileFormat::RWKV_1 || file_format == FileFormat::RWKV_1 ||
file_format==FileFormat::RWKV_2); file_format==FileFormat::RWKV_2);
if(!approved_format || desiredBlasBatchSize<=0) if(!approved_format && desiredBlasBatchSize>0)
{ {
desiredBlasBatchSize = 16; desiredBlasBatchSize = 16;
} }
if(desiredBlasBatchSize<=0)
{
desiredBlasBatchSize = 1;
}
if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_GENERIC) if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3 && file_format != FileFormat::GGUF_GENERIC)
{ {
desiredBlasBatchSize = (desiredBlasBatchSize > 256 ? 256 : desiredBlasBatchSize); desiredBlasBatchSize = (desiredBlasBatchSize > 256 ? 256 : desiredBlasBatchSize);
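The rewritten fallback separates two cases the old condition lumped together: unsupported legacy formats are still bumped to 16, but a non-positive request is now clamped to 1 rather than 16. A reduced sketch of the resulting order of checks, with the file-format test stubbed out here as an assumption:

// illustrative restatement of the new clamping, not the actual koboldcpp code
static int get_blas_batch_size(int desired, bool approved_format) {
    if (!approved_format && desired > 0) {
        desired = 16;   // legacy formats keep the old fallback of 16
    }
    if (desired <= 0) {
        desired = 1;    // -1/0 now resolves to an actual batch size of 1
    }
    return desired;     // later clamps (e.g. the 256 cap above) still apply
}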

View file

@ -3861,8 +3861,8 @@ def show_gui():
tabcontent = {} tabcontent = {}
# slider data # slider data
blasbatchsize_values = ["-1", "32", "64", "128", "256", "512", "1024", "2048"] blasbatchsize_values = ["-1", "16", "32", "64", "128", "256", "512", "1024", "2048"]
blasbatchsize_text = ["Don't Batch BLAS","32","64","128","256","512","1024","2048"] blasbatchsize_text = ["Don't Batch BLAS", "16","32","64","128","256","512","1024","2048"]
contextsize_text = ["256", "512", "1024", "2048", "3072", "4096", "6144", "8192", "10240", "12288", "14336", "16384", "20480", "24576", "28672", "32768", "40960", "49152", "57344", "65536", "81920", "98304", "114688", "131072"] contextsize_text = ["256", "512", "1024", "2048", "3072", "4096", "6144", "8192", "10240", "12288", "14336", "16384", "20480", "24576", "28672", "32768", "40960", "49152", "57344", "65536", "81920", "98304", "114688", "131072"]
antirunopts = [opt.replace("Use ", "") for lib, opt in lib_option_pairs if opt not in runopts] antirunopts = [opt.replace("Use ", "") for lib, opt in lib_option_pairs if opt not in runopts]
quantkv_text = ["F16 (Off)","8-Bit","4-Bit"] quantkv_text = ["F16 (Off)","8-Bit","4-Bit"]
@ -6590,7 +6590,7 @@ if __name__ == '__main__':
advparser.add_argument("--version", help="Prints version and exits.", action='store_true') advparser.add_argument("--version", help="Prints version and exits.", action='store_true')
advparser.add_argument("--analyze", metavar=('[filename]'), help="Reads the metadata, weight types and tensor names in any GGUF file.", default="") advparser.add_argument("--analyze", metavar=('[filename]'), help="Reads the metadata, weight types and tensor names in any GGUF file.", default="")
advparser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+') advparser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+')
advparser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024,2048], default=512) advparser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,16,32,64,128,256,512,1024,2048], default=512)
advparser.add_argument("--blasthreads", help="Use a different number of threads during BLAS if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0) advparser.add_argument("--blasthreads", help="Use a different number of threads during BLAS if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0)
advparser.add_argument("--lora", help="LLAMA models only, applies a lora file on top of model. Experimental.", metavar=('[lora_filename]', '[lora_base]'), nargs='+') advparser.add_argument("--lora", help="LLAMA models only, applies a lora file on top of model. Experimental.", metavar=('[lora_filename]', '[lora_base]'), nargs='+')
advparser.add_argument("--noshift", help="If set, do not attempt to Trim and Shift the GGUF context.", action='store_true') advparser.add_argument("--noshift", help="If set, do not attempt to Trim and Shift the GGUF context.", action='store_true')

View file

@ -1,112 +0,0 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
Hello
__ggml_vocab_test__
(
__ggml_vocab_test__
=
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天
__ggml_vocab_test__
!!!!!!
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__
Cửa Việt
__ggml_vocab_test__
discards
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__

View file

@ -1,46 +0,0 @@
2014 1032 1052 1032 28504 6972
1070 7088 1258
1032
1256
1293
1009
1010
1267
4688
1009 1010
22177 4304
45383 4304
22177 5325
45383 5325
45383 5325 1033
22177 1044 4304 1033
45383 1044 4304 1033
1593 1395 119685 1166 1153 1046 51228
1119 1048 1052 1056 1032 1055 17391 23216 30203 7785 17279
3337 30757 1902 4200 63073 3671
1225 1158 1128 1225 1158 1182 1225 1158 1147 1225 1159 1139 1225 1158 1143 1225 1159 1130 1225 1158 1150 1225 1158 1183 1225 1158 1159 1225 21359 1225 1158 1159 1225 1158 1162 1225 1158 1182 1225 1158 1133 1225 1158 1129 1225 1158 1155 1225 1158 1133 1225 21359 1225 1158 1137
1240 1159 1154 1128 1319 13052 1041 119685 1152 1182 29568 1240 1159 1140 1171 1239 1184 1143 1319 88181 1873 3659 1275 56421 1621 1041 126241 1133 1319 11234 1873 26303 1455 1934 2246 3754 10835 1041
22177
45383
1032 45383
1256 45383
1293 45383
1293 45383 1010 1293 45383
1319
1010 1376
1039 4033
22177 1044 1404 48054 1033 3075 1584 1636 119685 1152 1129 3082 26060 2998 63614 82278 1049 1051 1049 1052 1049 1053 1049 6434 6749
7290 7290 7290
1051
1051 1051
1051 1051 1051
1051 1051 1051 1051
1051 1051 1051 1051 1051
1051 1051 1051 1051 1051 1051
1051 1051 1051 1051 1051 1051 1051
1051 1051 1051 1051 1051 1051 1051 1051
1051 1051 1051 1051 1051 1051 1051 1051 1051
1067 59503 28783
3724 4058
1010 1032 1267 1032 4688 1032 17152 1458 29356 1010 1256 1010 1293 1010 1260 1010 1652 1010 1240 1159 1154 1128 1319 13052 1041 119685 1152 1182 29568 1240 1159 1140 1171 1239 1184 1143 1319 88181 1873 3659 1275 56421 1621 1041 126241 1133 119685 1166 1153 1240 1159 1166 1153 1032 1051 1032 1051 1051 1032 1051 1051 1051 1032 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1051 1051 1051 1032 1051 1046 1051 1032 1051 1791 1051 1032 1051 2880 1051 71881 1158 1128 1225 1158 1182 1225 1158 1147 1225 1159 1139 1225 1158 1143 1225 1159 1130 1225 1158 1150 1225 1158 1183 1225 1158 1159 1225 21359 1225 1158 1159 1225 1158 1162 1225 1158 1182 1225 1158 1133 1240 1159 1152 1129 3082 26060 2998 63614 82278 1049 1051 1049 1052 1049 1053 1049 6434 6749 45577 1045 6626 43555 2843 30757 1902 4200 63073 3671 14931 20040 20040 1657 1657 1975 14135 14135 83923 7290 7290 7290 45509 45509 45509 1362 6483 2151 1576 1116 2189 1514 1681 2156 1044 1576 3609 1636 5257 1063 1576 1077 1605 5257 1362 7534 3180 1494 1044 1576 1068 1636 2479 2269 26883 1063 2837 1039 45654 1261 54297 1076