Merge commit '34b7c0439e' into concedo_experimental

# Conflicts:
#	ggml/CMakeLists.txt
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-sycl/element_wise.cpp
#	ggml/src/ggml-sycl/element_wise.hpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	scripts/sync-ggml.last
#	src/CMakeLists.txt
#	tools/mtmd/clip.cpp
Concedo 2025-05-31 12:27:45 +08:00
commit 0c108f6054
19 changed files with 1641 additions and 791 deletions


@ -432,6 +432,9 @@ class ModelBase:
if "llm_config" in config: if "llm_config" in config:
# rename for InternVL # rename for InternVL
config["text_config"] = config["llm_config"] config["text_config"] = config["llm_config"]
if "thinker_config" in config:
# rename for Qwen2.5-Omni
config["text_config"] = config["thinker_config"]["text_config"]
return config
@classmethod
@ -1121,18 +1124,21 @@ class MmprojModel(ModelBase):
preprocessor_config: dict[str, Any]
global_config: dict[str, Any]
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
has_vision_encoder: bool = True # by default
has_audio_encoder: bool = False
# for models having multiple encoders, we need to separate their hparams
hparams_vision: dict[str, Any] | None = None
hparams_audio: dict[str, Any] | None = None
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.model_arch != gguf.MODEL_ARCH.MMPROJ:
raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ")
if self.has_vision_encoder and self.has_audio_encoder:
raise NotImplementedError("both vision + audio not supported yet")
# get n_embd of the text model
if "text_config" not in self.hparams:
self.hparams["text_config"] = {}
@ -1143,22 +1149,32 @@ class MmprojModel(ModelBase):
assert self.n_embd_text > 0, "n_embd not found in hparams"
# move vision config to the top level, while preserving the original hparams in global_config
import copy
self.global_config = copy.deepcopy(self.hparams)
self.hparams_vision = self.get_vision_config()
self.hparams_audio = self.get_audio_config()
if self.hparams_vision is None and self.hparams_audio is None:
raise ValueError("vision_config / audio_config not found in hparams")
# for compat with vision-only models
self.hparams = self.hparams_vision or self.hparams_audio or self.hparams
# TODO @ngxson : this is a hack to support both vision and audio encoders
have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder
self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True)
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
# load preprocessor config
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
self.preprocessor_config = json.load(f)
def get_vision_config(self) -> dict[str, Any] | None:
return self.global_config.get("vision_config")
def get_audio_config(self) -> dict[str, Any] | None:
return self.global_config.get("audio_config")
def set_type(self):
self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
@ -1170,26 +1186,26 @@ class MmprojModel(ModelBase):
self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
# vision config
self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"]))
self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
# preprocessor config
self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
if self.has_audio_encoder:
self.gguf_writer.add_clip_has_audio_encoder(True)
self.gguf_writer.add_audio_projection_dim(self.n_embd_text)
# audio config
self.gguf_writer.add_audio_embedding_length(self.find_aparam(["hidden_size"]))
self.gguf_writer.add_audio_feed_forward_length(self.find_aparam(["intermediate_size"]))
self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))
else:
raise ValueError("MmprojModel must have either vision or audio encoder")
@ -1197,6 +1213,22 @@ class MmprojModel(ModelBase):
def write_vocab(self):
raise ValueError("MmprojModel does not support vocab writing")
def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any:
assert self.hparams_vision is not None
return self._find_param(self.hparams_vision, keys, optional)
def find_aparam(self, keys: Iterable[str], optional: bool = False) -> Any:
assert self.hparams_audio is not None
return self._find_param(self.hparams_audio, keys, optional)
def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any:
key = next((k for k in keys if k in obj), None)
if key is not None:
return obj[key]
if optional:
return None
raise KeyError(f"could not find any of: {keys}")
@ModelBase.register("GPTNeoXForCausalLM") @ModelBase.register("GPTNeoXForCausalLM")
class GPTNeoXModel(TextModel): class GPTNeoXModel(TextModel):
@ -2674,7 +2706,12 @@ class Qwen2Model(TextModel):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register(
"Qwen2VLModel",
"Qwen2VLForConditionalGeneration",
"Qwen2_5_VLForConditionalGeneration",
"Qwen2_5OmniModel",
)
class Qwen2VLModel(TextModel):
model_arch = gguf.MODEL_ARCH.QWEN2VL
@ -2692,8 +2729,11 @@ class Qwen2VLModel(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
if name.startswith("thinker."):
name = name.replace("thinker.", "")
if name.startswith("visual") or name.startswith("audio") or \
name.startswith("talker") or name.startswith("token2wav"):
# skip multimodal tensors
return []
return [(self.map_tensor_name(name), data_torch)]
@ -2702,20 +2742,26 @@ class Qwen2VLModel(TextModel):
class Qwen2VLVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
# rename config.json values
self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
if "embed_dim" in self.hparams_vision: # qwen2vl
self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size")
self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim")
def set_gguf_parameters(self):
super().set_gguf_parameters()
assert self.hparams_vision is not None
hparams = self.hparams_vision
model_type = self.global_config['model_type']
if model_type == 'qwen2_vl':
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni':
if model_type == 'qwen2_5_omni':
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O)
else:
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
self.gguf_writer.add_vision_use_silu(True)
# find n_wa_pattern (window attention pattern)
@ -2774,6 +2820,66 @@ class Qwen2VLVisionModel(MmprojModel):
return [] # skip other tensors
@ModelBase.register("Qwen2_5OmniModel")
class Qwen25OmniModel(Qwen2VLVisionModel):
has_vision_encoder = True
has_audio_encoder = True
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_audio is not None
self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
self.hparams_audio["intermediate_size"] = self.hparams_audio["encoder_ffn_dim"]
self.hparams_audio["num_attention_heads"] = self.hparams_audio["encoder_attention_heads"]
def set_gguf_parameters(self):
super().set_gguf_parameters()
assert self.hparams_audio is not None
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5))
def get_vision_config(self) -> dict[str, Any] | None:
return self.global_config["thinker_config"].get("vision_config")
def get_audio_config(self) -> dict[str, Any] | None:
return self.global_config["thinker_config"].get("audio_config")
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
# SinusoidsPositionEmbedding
assert self.hparams_audio is not None
max_timescale = 10000
length = 1500
channels = self.hparams_audio["hidden_size"]
log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).to(dtype=torch.float32)
yield ("audio_tower.embed_positions.weight", pos_embd)
def tensor_force_quant(self, name, new_name, bid, n_dims):
del bid, new_name, n_dims # unused
if ".conv" in name and ".weight" in name:
return gguf.GGMLQuantizationType.F16
return False
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name.startswith("thinker."):
name = name.replace("thinker.", "")
if name.startswith("audio_tower"):
# process audio tensors
if "conv1.bias" in name or "conv2.bias" in name:
# transpose conv1 and conv2 bias
data_torch = data_torch.unsqueeze(-1)
if "audio_bos_eos_token" in name:
# this tensor is left unused in transformers code
# https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809
return []
return [(self.map_tensor_name(name), data_torch)]
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("InternVisionModel") @ModelBase.register("InternVisionModel")
class InternVisionModel(MmprojModel): class InternVisionModel(MmprojModel):
def set_gguf_parameters(self): def set_gguf_parameters(self):


@ -98,3 +98,12 @@ NOTE: some models may require large context window, for example: `-c 8192`
# note: no pre-quantized GGUF this model, as they have very poor result
# ref: https://github.com/ggml-org/llama.cpp/pull/13760
```
**Mixed modalities**:
```sh
# Qwen2.5 Omni
# Capabilities: audio input, vision input
(tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF
(tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF
```


@ -948,6 +948,15 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
// repeat a to the specified shape
GGML_API struct ggml_tensor * ggml_repeat_4d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int64_t ne3);
// sums repetitions in a into shape of b
GGML_API struct ggml_tensor * ggml_repeat_back(
struct ggml_context * ctx,
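For orientation, a minimal sketch of how the new `ggml_repeat_4d` declared above might be called; the helper name, tensor shape, and context setup are illustrative assumptions, not part of the commit.

```cpp
#include "ggml.h"

// Hypothetical helper: tile a [n, 1, 1, 1] row vector into an
// [n, n_rows, n_batch, 1] tensor. Unlike ggml_repeat, the target shape is
// passed directly instead of being taken from a second tensor, and each
// requested dimension must be a whole multiple of the source dimension.
static struct ggml_tensor * tile_rows(struct ggml_context * ctx,
                                      struct ggml_tensor  * row,
                                      int64_t               n_rows,
                                      int64_t               n_batch) {
    return ggml_repeat_4d(ctx, row, row->ne[0], n_rows, n_batch, 1);
}
```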


@ -1206,7 +1206,7 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
}
}
return;
#elif defined __riscv_v
if (__riscv_vlenb() >= QK4_0) {
const size_t vl = QK4_0;
@ -3798,7 +3798,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
}
return;
}
#elif defined __riscv_v
if (__riscv_vlenb() >= QK4_0) {
const size_t vl = QK4_0;


@ -320,21 +320,17 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#endif
#ifdef __POWER9_VECTOR__
#include <altivec.h>
#endif
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#elif defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
#include <immintrin.h>
#endif
#ifdef __riscv_v_intrinsic
#include <riscv_vector.h>


@ -884,7 +884,7 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
#endif
}
#elif defined(__riscv_v)
size_t vl = QK8_0;
@ -1222,7 +1222,7 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
#endif
}
#elif defined(__riscv_v)
size_t vl = QK8_1;
@ -2385,7 +2385,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
#elif defined(__riscv_v)
size_t vl = qk / 2;
for (; ib < nb; ++ib) {
@ -2775,7 +2775,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
sumf = hsum_float_8(acc) + summs;
#elif defined(__riscv_v)
size_t vl = qk / 2;
for (; ib < nb; ++ib) {
@ -3122,7 +3122,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
sumf = hsum_float_8(acc);
#elif defined(__riscv_v)
size_t vl;
size_t vlenb = __riscv_vlenb();
@ -3461,7 +3461,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
sumf = hsum_float_8(acc) + summs;
#elif defined(__riscv_v)
size_t vl;
size_t vlenb = __riscv_vlenb();
@ -3898,7 +3898,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
sumf = hsum_float_8(accum);
#elif defined(__riscv_v)
size_t vl = qk;
for (; ib < nb; ++ib) {
@ -5101,14 +5101,111 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
#elif defined __riscv_xtheadvector
float sumf = 0;
uint8_t atmp[16];
for (int i = 0; i < nb; ++i) {
const uint8_t * q2 = x[i].qs;
const int8_t * q8 = y[i].qs;
const uint8_t * sc = x[i].scales;
const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
uint8_t *patmp = atmp;
int vsums;
int tmp;
__asm__ __volatile__(
"th.vsetvli zero, %[vl16], e8, m1\n\t"
"th.vmv.v.x v8, zero\n\t"
"th.vlb.v v1, (%[sc])\n\t"
"th.vand.vi v0, v1, 0xF\n\t"
"th.vsrl.vi v1, v1, 4\n\t"
"th.vsb.v v0, (%[scale])\n\t"
"th.vwaddu.vx v16, v1, zero\n\t"
"th.vsetvli zero, %[vl16], e16, m2\n\t"
"th.vlh.v v2, (%[bsums])\n\t"
"th.vwmul.vv v4, v16, v2\n\t"
"th.vsetvli zero, %[vl16], e32, m4\n\t"
"th.vredsum.vs v8, v4, v8\n\t"
"th.vmv.x.s %[vsums], v8"
: [tmp] "=&r" (tmp), [vsums] "=&r" (vsums)
: [sc] "r" (sc), [scale] "r" (atmp), [bsums] "r" (y[i].bsums)
, [vl16] "r" (16)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
sumf += dmin * vsums;
int isum = 0;
for (int j = 0; j < QK_K/128; ++j) {
__asm__ __volatile__(
"th.vsetvli zero, %[vl32], e8, m2\n\t"
"th.vlb.v v0, (%[q2])\n\t"
"th.vsrl.vi v2, v0, 2\n\t"
"th.vsrl.vi v4, v0, 4\n\t"
"th.vsrl.vi v6, v0, 6\n\t"
"th.vand.vi v0, v0, 0x3\n\t"
"th.vand.vi v2, v2, 0x3\n\t"
"th.vand.vi v4, v4, 0x3\n\t"
"th.vsetvli zero, %[vl128], e8, m8\n\t"
"th.vlb.v v8, (%[q8])\n\t"
"th.vsetvli zero, %[vl64], e8, m4\n\t"
"th.vwmul.vv v16, v0, v8\n\t"
"th.vwmul.vv v24, v4, v12\n\t"
"th.vsetvli zero, %[vl16], e16, m2\n\t"
"th.vmv.v.x v0, zero\n\t"
"th.vwredsum.vs v10, v16, v0\n\t"
"th.vwredsum.vs v9, v18, v0\n\t"
"th.vwredsum.vs v8, v20, v0\n\t"
"th.vwredsum.vs v7, v22, v0\n\t"
"th.vwredsum.vs v11, v24, v0\n\t"
"th.vwredsum.vs v12, v26, v0\n\t"
"th.vwredsum.vs v13, v28, v0\n\t"
"th.vwredsum.vs v14, v30, v0\n\t"
"li %[tmp], 4\n\t"
"th.vsetvli zero, %[tmp], e32, m1\n\t"
"th.vslideup.vi v10, v9, 1\n\t"
"th.vslideup.vi v8, v7, 1\n\t"
"th.vslideup.vi v11, v12, 1\n\t"
"th.vslideup.vi v13, v14, 1\n\t"
"th.vslideup.vi v10, v8, 2\n\t"
"th.vslideup.vi v11, v13, 2\n\t"
"li %[tmp], 8\n\t"
"th.vsetvli zero, %[tmp], e32, m2\n\t"
"th.vlbu.v v12, (%[scale])\n\t"
"th.vmul.vv v10, v10, v12\n\t"
"th.vredsum.vs v0, v10, v0\n\t"
"th.vmv.x.s %[tmp], v0\n\t"
"add %[isum], %[isum], %[tmp]"
: [tmp] "=&r" (tmp), [isum] "+&r" (isum)
: [q2] "r" (q2), [scale] "r" (patmp), [q8] "r" (q8)
, [vl16] "r" (16), [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
q2 += 32; q8 += 128; patmp += 8;
}
sumf += dall * isum;
}
*s = sumf;
#elif defined __riscv_v
float sumf = 0;
uint8_t atmp[16];
const int vector_length = __riscv_vlenb() * 8;
uint8_t temp_01[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
switch (vector_length) {
case 256:
@ -6138,14 +6235,141 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
#elif defined __riscv_xtheadvector
uint32_t utmp[4];
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const uint8_t * restrict q3 = x[i].qs;
const uint8_t * restrict qh = x[i].hmask;
const int8_t * restrict q8 = y[i].qs;
int8_t * scale = (int8_t *)utmp;
int tmp;
__asm__ __volatile__(
"li %[tmp], 12\n\t"
"th.vsetvli zero, %[tmp], e8, m1\n\t"
"th.vlb.v v0, (%[s6b])\n\t"
"th.vmv.v.v v2, v0\n\t"
"li %[tmp], 2\n\t"
"th.vsetvli zero, %[tmp], e64, m1\n\t"
"th.vmv.v.x v9, %[sh]\n\t"\
"th.vslidedown.vi v1, v0, 1\n\t"
"th.vslide1up.vx v8, v9, zero\n\t" // {0, 0, 4, 4}
"th.vslideup.vi v0, v2, 1\n\t" // {aux[0], aux[1], aux[0], aux[1]}
"li %[tmp], 4\n\t"
"th.vsetvli zero, %[tmp], e32, m1\n\t"
"th.vid.v v9\n\t"
"th.vmv.x.s %[tmp], v1\n\t"
"th.vsll.vi v9, v9, 1\n\t" // {0, 2, 4, 6}
"th.vmv.v.x v1, %[tmp]\n\t" // {aux[2], aux[2], aux[2], aux[2]}
"th.vsrl.vv v4, v1, v9\n\t"
"th.vsrl.vv v2, v0, v8\n\t"
"th.vand.vx v5, v4, %[kmask1]\n\t"
"th.vand.vx v3, v2, %[kmask2]\n\t"
"th.vsll.vi v6, v5, 4\n\t"
"th.vor.vv v7, v6, v3\n\t"
"li %[tmp], 16\n\t"
"th.vsetvli zero, %[tmp], e8, m1\n\t"
"th.vsub.vx v0, v7, %[c]\n\t"
"th.vsb.v v0, (%[scale])"
: [tmp] "=&r" (tmp)
: [sh] "r" (0x0000000400000004), [s6b] "r" (x[i].scales), [c] "r" (32)
, [scale] "r" (scale), [kmask1] "r" (kmask1), [kmask2] "r" (kmask2)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
uint8_t m = 1;
int isum = 0;
for (int j = 0; j < QK_K; j += 128) {
__asm__ __volatile__(
// fixme: use v0p7 mask layout directly
"th.vsetvli zero, %[vl32], e8, m2\n\t"
"th.vlb.v v8, (%[q3])\n\t"
"th.vsrl.vi v10, v8, 2\n\t"
"th.vsrl.vi v12, v8, 4\n\t"
"th.vsrl.vi v14, v8, 6\n\t"
"th.vand.vi v8, v8, 3\n\t"
"th.vand.vi v10, v10, 3\n\t"
"th.vand.vi v12, v12, 3\n\t"
"th.vlb.v v2, (%[qh])\n\t"
"th.vand.vx v4, v2, %[m]\n\t"
"slli %[m], %[m], 1\n\t"
"th.vmseq.vx v0, v4, zero\n\t"
"th.vadd.vi v8, v8, -4, v0.t\n\t"
"th.vand.vx v4, v2, %[m]\n\t"
"slli %[m], %[m], 1\n\t"
"th.vmseq.vx v0, v4, zero\n\t"
"th.vadd.vi v10, v10, -4, v0.t\n\t"
"th.vand.vx v4, v2, %[m]\n\t"
"slli %[m], %[m], 1\n\t"
"th.vmseq.vx v0, v4, zero\n\t"
"th.vadd.vi v12, v12, -4, v0.t\n\t"
"th.vand.vx v4, v2, %[m]\n\t"
"slli %[m], %[m], 1\n\t"
"th.vmseq.vx v0, v4, zero\n\t"
"th.vadd.vi v14, v14, -4, v0.t\n\t"
"th.vsetvli zero, %[vl128], e8, m8\n\t"
"th.vlb.v v0, (%[q8])\n\t"
"th.vsetvli zero, %[vl64], e8, m4\n\t"
"th.vwmul.vv v16, v0, v8\n\t"
"th.vwmul.vv v24, v4, v12\n\t"
"li %[tmp], 16\n\t"
"th.vsetvli zero, %[tmp], e16, m2\n\t"
"th.vmv.v.x v0, zero\n\t"
"th.vwredsum.vs v10, v16, v0\n\t"
"th.vwredsum.vs v9, v18, v0\n\t"
"th.vwredsum.vs v8, v20, v0\n\t"
"th.vwredsum.vs v7, v22, v0\n\t"
"th.vwredsum.vs v11, v24, v0\n\t"
"th.vwredsum.vs v12, v26, v0\n\t"
"th.vwredsum.vs v13, v28, v0\n\t"
"th.vwredsum.vs v14, v30, v0\n\t"
"li %[tmp], 4\n\t"
"th.vsetvli zero, %[tmp], e32, m1\n\t"
"th.vslideup.vi v10, v9, 1\n\t"
"th.vslideup.vi v8, v7, 1\n\t"
"th.vslideup.vi v11, v12, 1\n\t"
"th.vslideup.vi v13, v14, 1\n\t"
"th.vslideup.vi v10, v8, 2\n\t"
"th.vslideup.vi v11, v13, 2\n\t"
"li %[tmp], 8\n\t"
"th.vsetvli zero, %[tmp], e32, m2\n\t"
"th.vlb.v v12, (%[scale])\n\t"
"th.vmul.vv v10, v10, v12\n\t"
"th.vredsum.vs v0, v10, v0\n\t"
"th.vmv.x.s %[tmp], v0\n\t"
"add %[isum], %[isum], %[tmp]"
: [tmp] "=&r" (tmp), [m] "+&r" (m), [isum] "+&r" (isum)
: [vl128] "r" (128), [vl64] "r" (64), [vl32] "r" (32)
, [q3] "r" (q3), [qh] "r" (qh), [scale] "r" (scale), [q8] "r" (q8)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
q3 += 32; q8 += 128; scale += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
sumf += d * isum;
}
*s = sumf;
#elif defined __riscv_v
uint32_t utmp[4];
float sumf = 0;
uint32_t aux[3];
const int vector_length = __riscv_vlenb() * 8;
switch (vector_length) {
case 256:
for (int i = 0; i < nb; ++i) {
@ -6332,7 +6556,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
"vslideup.vi v13, v14, 1\n\t" "vslideup.vi v13, v14, 1\n\t"
"vslideup.vi v10, v8, 2\n\t" "vslideup.vi v10, v8, 2\n\t"
"vslideup.vi v11, v13, 2\n\t" "vslideup.vi v11, v13, 2\n\t"
"vsetivli zero, 8, e32, m2\n\t"\ "vsetivli zero, 8, e32, m2\n\t"
"vle8.v v15, (%[scale])\n\t" "vle8.v v15, (%[scale])\n\t"
"vsext.vf4 v12, v15\n\t" "vsext.vf4 v12, v15\n\t"
"vmul.vv v10, v10, v12\n\t" "vmul.vv v10, v10, v12\n\t"
@ -7181,14 +7405,130 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
#elif defined __riscv_xtheadvector
const uint8_t * scales = (const uint8_t*)&utmp[0];
const uint8_t * mins = (const uint8_t*)&utmp[2];
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
int tmp, tmp2, sumi;
__asm__ __volatile__(
"li %[t1], 12\n\t"
"th.vsetvli zero, %[t1], e8, m1\n\t"
"th.vlb.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]}
"li %[t1], 4\n\t"
"th.vsetvli zero, %[t1], e32, m1\n\t"
"th.vslidedown.vi v2, v1, 2\n\t"
"th.vmv.v.v v3, v2\n\t"
"th.vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]}
"li %[t1], 2\n\t"
"th.vsetvli zero, %[t1], e32, m1\n\t"
"th.vmv.v.i v4, 4\n\t"
"th.vand.vx v8, v1, %[kmask1]\n\t"
"th.vslide1up.vx v5, v4, zero\n\t" // {0, 4}
"th.vsrl.vi v6, v1, 6\n\t"
"th.vsrl.vv v7, v2, v5\n\t"
"th.vand.vx v0, v6, %[kmask3]\n\t"
"th.vand.vx v2, v7, %[kmask2]\n\t"
"th.vsll.vi v6, v0, 4\n\t"
"li %[t2], 8\n\t"
"addi %[t1], %[utmp], 4\n\t"
"th.vor.vv v1, v6, v2\n\t"
"th.vssw.v v8, (%[utmp]), %[t2]\n\t"
"th.vssw.v v1, (%[t1]), %[t2]\n\t"
"th.vsetvli zero, zero, e32, m2\n\t" // vl == 8
"th.vlw.v v2, (%[bsums])\n\t"
"th.vsetvli zero, %[t2], e16, m1\n\t"
"th.vnsrl.vi v0, v2, 0\n\t"
"th.vnsrl.vi v1, v2, 16\n\t"
"th.vadd.vv v2, v0, v1\n\t"
"th.vlbu.v v4, (%[mins])\n\t"
"th.vwmul.vv v6, v4, v2\n\t"
"th.vmv.v.x v0, zero\n\t"
"th.vsetvli zero, %[t2], e32, m2\n\t"
"th.vredsum.vs v0, v6, v0\n\t"
"th.vmv.x.s %[sumi], v0"
: [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi)
: [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
, [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1)
, [kmask2] "r" (kmask2), [kmask3] "r" (kmask3)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
sumf -= dmin * sumi;
const uint8_t * restrict q4 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
sumi = 0;
const uint8_t * scale = scales;
for (int j = 0; j < QK_K/128; ++j) {
int vl128 = 128, vl64 = 64, vl32 = 32;
__asm__ __volatile__(
"th.vsetvli zero, %[vl128], e8, m8\n\t"
"th.vlb.v v8, (%[q8])\n\t"
"th.vsetvli zero, %[vl64], e8, m4\n\t"
"th.vlb.v v0, (%[q4])\n\t"
"th.vsrl.vi v4, v0, 4\n\t"
"th.vand.vi v0, v0, 0xF\n\t"
"th.vsetvli zero, %[vl32], e8, m2\n\t"
"th.vwmul.vv v28, v6, v14\n\t"
"th.vwmul.vv v20, v4, v10\n\t"
"th.vwmul.vv v24, v2, v12\n\t"
"th.vwmul.vv v16, v0, v8\n\t"
"li %[tmp], 4\n\t"
"th.vsetvli zero, %[tmp], e32, m1\n\t"
"th.vlbu.v v1, (%[scale])\n\t"
"th.vmv.v.x v0, zero\n\t"
"th.vsetvli zero, %[vl32], e16, m4\n\t"
"th.vwredsum.vs v6, v24, v0\n\t"
"th.vwredsum.vs v7, v28, v0\n\t"
"th.vwredsum.vs v4, v16, v0\n\t"
"th.vwredsum.vs v5, v20, v0\n\t"
"th.vsetvli zero, %[tmp], e32, m1\n\t"
"th.vslideup.vi v6, v7, 1\n\t"
"th.vslideup.vi v4, v5, 1\n\t"
"th.vslideup.vi v4, v6, 2\n\t"
"th.vmul.vv v8, v4, v1\n\t"
"th.vredsum.vs v0, v8, v0\n\t"
"th.vmv.x.s %[tmp], v0\n\t"
"add %[sumi], %[sumi], %[tmp]"
: [tmp] "=&r" (tmp), [sumi] "+&r" (sumi)
: [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32)
, [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
q4 += 64; q8 += 128; scale += 4;
}
sumf += d * sumi;
}
*s = sumf;
#elif defined __riscv_v
const uint8_t * scales = (const uint8_t*)&utmp[0];
const uint8_t * mins = (const uint8_t*)&utmp[2];
float sumf = 0;
const int vector_length = __riscv_vlenb() * 8;
switch (vector_length) {
case 256:
for (int i = 0; i < nb; ++i) {
@ -8075,7 +8415,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
*s = sumf;
#elif defined __riscv_v
const uint8_t * scales = (const uint8_t*)&utmp[0];
const uint8_t * mins = (const uint8_t*)&utmp[2];
@ -9233,11 +9573,92 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
}
*s = sumf;
#elif defined __riscv_xtheadvector
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * restrict q6 = x[i].ql;
const uint8_t * restrict qh = x[i].qh;
const int8_t * restrict q8 = y[i].qs;
const int8_t * restrict scale = x[i].scales;
int sum_t = 0;
int t0;
for (int j = 0; j < QK_K/128; ++j) {
__asm__ __volatile__(
"th.vsetvli zero, %[vl32], e8, m2\n\t" // vl == 32
"th.vlb.v v4, (%[qh])\n\t"
"th.vsll.vi v0, v4, 4\n\t"
"th.vsll.vi v2, v4, 2\n\t"
"th.vsrl.vi v6, v4, 2\n\t"
"th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64
"th.vlb.v v8, (%[q6])\n\t"
"th.vsrl.vi v12, v8, 4\n\t"
"th.vand.vi v8, v8, 0xF\n\t"
"th.vsetvli zero, %[vl128], e8, m8\n\t" // vl == 128
"th.vand.vx v0, v0, %[mask]\n\t"
"th.vor.vv v8, v8, v0\n\t"
"th.vlb.v v0, (%[q8])\n\t"
"th.vsub.vx v8, v8, %[vl32]\n\t"
"th.vsetvli zero, %[vl64], e8, m4\n\t" // vl == 64
"th.vwmul.vv v16, v0, v8\n\t"
"th.vwmul.vv v24, v4, v12\n\t"
"li %[t0], 16\n\t"
"th.vsetvli zero, %[t0], e16, m2\n\t" // vl == 16
"th.vmv.v.x v0, zero\n\t"
"th.vwredsum.vs v10, v16, v0\n\t"
"th.vwredsum.vs v9, v18, v0\n\t"
"th.vwredsum.vs v8, v20, v0\n\t"
"th.vwredsum.vs v7, v22, v0\n\t"
"th.vwredsum.vs v11, v24, v0\n\t"
"th.vwredsum.vs v12, v26, v0\n\t"
"th.vwredsum.vs v13, v28, v0\n\t"
"th.vwredsum.vs v14, v30, v0\n\t"
"li %[t0], 4\n\t"
"th.vsetvli zero, %[t0], e32, m1\n\t" // vl == 4
"th.vslideup.vi v10, v9, 1\n\t"
"th.vslideup.vi v8, v7, 1\n\t"
"th.vslideup.vi v11, v12, 1\n\t"
"th.vslideup.vi v13, v14, 1\n\t"
"th.vslideup.vi v10, v8, 2\n\t"
"th.vslideup.vi v11, v13, 2\n\t"
"li %[t0], 8\n\t"
"th.vsetvli zero, %[t0], e32, m2\n\t" // vl == 8
"th.vlb.v v4, (%[scale])\n\t"
"th.vmul.vv v2, v4, v10\n\t"
"th.vredsum.vs v0, v2, v0\n\t"
"th.vmv.x.s %[t0], v0\n\t"
"add %[sumi], %[sumi], %[t0]"
: [sumi] "+&r" (sum_t), [t0] "=&r" (t0)
: [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale)
, [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
, [mask] "r" (0x30)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
q6 += 64; qh += 32; q8 += 128; scale += 8;
}
sumf += d * sum_t;
}
*s = sumf;
#elif defined __riscv_v
float sumf = 0;
const int vector_length = __riscv_vlenb() * 8;
switch (vector_length) {
case 256:
for (int i = 0; i < nb; ++i) {


@ -386,7 +386,7 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
return r;
}
#elif defined(__riscv) && defined(__riscv_zfhmin)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
float f;


@ -2325,6 +2325,26 @@ struct ggml_tensor * ggml_repeat(
return result;
}
struct ggml_tensor * ggml_repeat_4d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
const bool can_repeat = ggml_is_empty(a) || (
(ne0 % a->ne[0] == 0) &&
(ne1 % a->ne[1] == 0) &&
(ne2 % a->ne[2] == 0) &&
(ne3 % a->ne[3] == 0)
);
GGML_ASSERT(can_repeat);
struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);
result->op = GGML_OP_REPEAT;
result->src[0] = a;
return result;
}
// ggml_repeat_back
struct ggml_tensor * ggml_repeat_back(


@ -2260,6 +2260,7 @@ class VisionProjectorType:
ULTRAVOX = "ultravox" ULTRAVOX = "ultravox"
INTERNVL = "internvl" INTERNVL = "internvl"
QWEN2A = "qwen2a" # audio QWEN2A = "qwen2a" # audio
QWEN25O = "qwen2.5o" # omni
# Items here are (block size, type size)


@ -1125,6 +1125,7 @@ class TensorNameMap:
MODEL_TENSOR.A_POST_NORM: (
"audio_tower.layer_norm", # ultravox
"audio_tower.ln_post", # qwen2omni
),
MODEL_TENSOR.A_ENC_ATTN_Q: (
@ -1161,12 +1162,16 @@ class TensorNameMap:
"audio_tower.layers.{bid}.fc2", # ultravox "audio_tower.layers.{bid}.fc2", # ultravox
), ),
# note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors
# this prefix is added in the conversion code in modify_tensors()
MODEL_TENSOR.A_MMPROJ: (
"audio.multi_modal_projector.linear_{bid}", # ultravox
),
MODEL_TENSOR.A_MMPROJ_FC: (
"audio.multi_modal_projector.linear", # qwen2audio
"audio_tower.proj", # qwen2omni
),
MODEL_TENSOR.A_MM_NORM_PRE: (


@ -2412,10 +2412,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
set_clip_uses_gpu(false);
printf("Clip forced to use CPU!\n");
}
clip_init_result cres = clip_init(mmproj_filename.c_str(), clip_context_params{
/* use_gpu */ true,
/* verbosity */ static_cast<ggml_log_level>(1),
});
clp_ctx = cres.ctx_v;
if(clp_ctx == nullptr) {
fprintf(stderr, "%s: error: failed to load mmproj model!\n", __func__);
return ModelLoadResult::FAIL;


@ -130,6 +130,7 @@ enum projector_type {
PROJECTOR_TYPE_INTERNVL,
PROJECTOR_TYPE_LLAMA4,
PROJECTOR_TYPE_QWEN2A,
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
PROJECTOR_TYPE_UNKNOWN,
};
@ -148,6 +149,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_INTERNVL, "internvl"}, { PROJECTOR_TYPE_INTERNVL, "internvl"},
{ PROJECTOR_TYPE_LLAMA4, "llama4"}, { PROJECTOR_TYPE_LLAMA4, "llama4"},
{ PROJECTOR_TYPE_QWEN2A, "qwen2a"}, { PROJECTOR_TYPE_QWEN2A, "qwen2a"},
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {

File diff suppressed because it is too large.


@ -17,12 +17,22 @@ struct clip_image_f32;
struct clip_image_u8_batch;
struct clip_image_f32_batch;
enum clip_modality {
CLIP_MODALITY_VISION,
CLIP_MODALITY_AUDIO,
};
struct clip_context_params {
bool use_gpu;
enum ggml_log_level verbosity;
};
struct clip_init_result {
struct clip_ctx * ctx_v; // vision context
struct clip_ctx * ctx_a; // audio context
};
struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params);
void clip_free(struct clip_ctx * ctx);
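Since `clip_init` now returns two contexts instead of one, here is a short hedged sketch of a caller; the function name, log level, and the assumption that `clip_free` tolerates a null pointer are illustrative, not taken from the header.

```cpp
#include "clip.h"

// Hypothetical caller: an mmproj file may provide a vision encoder, an
// audio encoder, or both, so check each context separately.
static bool load_mmproj(const char * fname) {
    clip_context_params params;
    params.use_gpu   = true;
    params.verbosity = GGML_LOG_LEVEL_INFO;

    clip_init_result res = clip_init(fname, params);
    if (!res.ctx_v && !res.ctx_a) {
        return false; // neither modality could be loaded
    }

    // ... run image encoding through res.ctx_v and audio through res.ctx_a ...

    clip_free(res.ctx_v); // assumed safe on nullptr, as in the mtmd destructor below
    clip_free(res.ctx_a);
    return true;
}
```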


@ -284,8 +284,10 @@ int main(int argc, char ** argv) {
if (is_single_turn) {
g_is_generating = true;
if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
for (size_t i = 0; i < params.image.size(); i++) {
params.prompt += mtmd_default_marker();
}
}
common_chat_msg msg;
msg.role = "user";
msg.content = params.prompt;


@ -66,7 +66,8 @@ struct decode_embd_batch {
}
}
// M-RoPE for image
void set_position_mrope_2d(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
GGML_ASSERT(n_pos_per_embd == 4);
seq_id_0[0] = seq_id;
for (int y = 0; y < ny; y++) {
@ -85,6 +86,23 @@ struct decode_embd_batch {
}
}
// M-RoPE for audio
void set_position_mrope_1d(llama_pos pos_0, llama_seq_id seq_id) {
GGML_ASSERT(n_pos_per_embd == 4);
seq_id_0[0] = seq_id;
for (int i = 0; i < batch.n_tokens; i++) {
pos[i ] = pos_0 + i;
pos[i + batch.n_tokens ] = pos_0 + i;
pos[i + batch.n_tokens * 2] = pos_0 + i;
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
}
for (int i = 0; i < batch.n_tokens; i++) {
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
llama_batch get_view(int offset, int n_tokens) {
llama_pos * pos_ptr;
pos_view.clear();
@ -146,18 +164,20 @@ int32_t mtmd_helper_decode_image_chunk(
decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
if (mtmd_decode_use_mrope(ctx)) {
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
if (!image_tokens) {
LOG_ERR("failed to decode chunk: image tokens are null\n");
return -1;
}
const int nx = mtmd_image_tokens_get_nx(image_tokens);
const int ny = mtmd_image_tokens_get_ny(image_tokens);
batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
batch_embd.set_position_mrope_1d(n_past, seq_id);
} else {
GGML_ABORT("invalid chunk type for M-RoPE");
}
} else {
batch_embd.set_position_normal(n_past, seq_id);
}
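To make the new 1-D audio position layout concrete, a standalone sketch that mirrors what `set_position_mrope_1d` above writes into the flat `pos` array; the chunk length and start position are invented for illustration.

```cpp
#include <cstdio>
#include <vector>

int main() {
    // Illustrative values: a 3-embedding audio chunk starting at position 7,
    // with n_pos_per_embd == 4 as in the M-RoPE batch above.
    const int n_tokens = 3;
    const int pos_0    = 7;
    std::vector<int> pos(4 * n_tokens);
    for (int i = 0; i < n_tokens; i++) {
        pos[i + n_tokens * 0] = pos_0 + i; // the same linear position is
        pos[i + n_tokens * 1] = pos_0 + i; // repeated across the first
        pos[i + n_tokens * 2] = pos_0 + i; // three position dimensions
        pos[i + n_tokens * 3] = 0;         // last pos dim is unused
    }
    for (int d = 0; d < 4; d++) {
        for (int i = 0; i < n_tokens; i++) {
            printf("%d ", pos[d * n_tokens + i]);
        }
        printf("\n"); // prints: 7 8 9 / 7 8 9 / 7 8 9 / 0 0 0
    }
    return 0;
}
```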


@ -95,15 +95,21 @@ mtmd_context_params mtmd_context_params_default() {
}
struct mtmd_context {
struct clip_ctx * ctx_v; // vision
struct clip_ctx * ctx_a; // audio
const struct llama_model * text_model;
std::vector<float> image_embd_v; // image embedding vector
bool print_timings;
int n_threads;
std::string media_marker;
const int n_embd_text;
// these are not token, but strings used to mark the beginning and end of image/audio embeddings
std::string img_beg;
std::string img_end;
std::string aud_beg;
std::string aud_end;
// for llava-uhd style models, we need special tokens in-between slices
// minicpmv calls them "slices", llama 4 calls them "tiles"
@ -132,33 +138,61 @@ struct mtmd_context {
text_model (text_model),
print_timings(ctx_params.print_timings),
n_threads (ctx_params.n_threads),
media_marker (ctx_params.media_marker),
n_embd_text (llama_model_n_embd(text_model))
{
if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
}
if (media_marker.empty()) {
throw std::runtime_error("media_marker must not be empty");
}
clip_context_params ctx_clip_params;
ctx_clip_params.use_gpu = ctx_params.use_gpu;
ctx_clip_params.verbosity = ctx_params.verbosity;
auto res = clip_init(mmproj_fname, ctx_clip_params);
ctx_v = res.ctx_v;
ctx_a = res.ctx_a;
if (!ctx_v && !ctx_a) {
throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
}
// if both vision and audio mmproj are present, we need to validate their n_embd
if (ctx_v && ctx_a) {
int n_embd_v = clip_n_mmproj_embd(ctx_v);
int n_embd_a = clip_n_mmproj_embd(ctx_a);
if (n_embd_v != n_embd_a) {
throw std::runtime_error(string_format(
"mismatch between vision and audio mmproj (n_embd_v = %d, n_embd_a = %d)\n",
n_embd_v, n_embd_a));
}
}
// since we already validate n_embd of vision and audio mmproj,
// we can safely assume that they are the same
int n_embd_clip = clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
if (n_embd_text != n_embd_clip) {
throw std::runtime_error(string_format(
"mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
"hint: you may be using wrong mmproj\n",
n_embd_text, n_embd_clip));
}
if (ctx_v) {
init_vision();
}
if (ctx_a) {
init_audio();
}
}
void init_vision() {
GGML_ASSERT(ctx_v != nullptr);
use_mrope = clip_is_qwen2vl(ctx_v);
projector_type proj = clip_get_projector_type(ctx_v);
int minicpmv_version = clip_is_minicpmv(ctx_v);
if (minicpmv_version == 2) {
// minicpmv 2.5 format:
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
@ -203,24 +237,82 @@ struct mtmd_context {
ov_img_first = false; // overview image is last
}
// set boi/eoi
if (proj == PROJECTOR_TYPE_GEMMA3) {
// <start_of_image> ... (image embeddings) ... <end_of_image>
img_beg = "<start_of_image>";
img_end = "<end_of_image>";
} else if (proj == PROJECTOR_TYPE_IDEFICS3) {
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
img_beg = "<fake_token_around_image><global-img>";
img_end = "<fake_token_around_image>";
} else if (proj == PROJECTOR_TYPE_PIXTRAL) {
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
img_end = "[IMG_END]";
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL) {
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
img_beg = "<|vision_start|>";
img_end = "<|vision_end|>";
} else if (proj == PROJECTOR_TYPE_LLAMA4) {
// (more details in mtmd_context constructor)
img_beg = "<|image_start|>";
img_end = "<|image_end|>";
LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
} else if (proj == PROJECTOR_TYPE_INTERNVL) {
// <img> ... (image embeddings) ... </img>
img_beg = "<img>";
img_end = "</img>";
}
}
void init_audio() {
GGML_ASSERT(ctx_a != nullptr);
projector_type proj = clip_get_projector_type(ctx_a);
if (clip_has_whisper_encoder(ctx_a)) {
// TODO @ngxson : check if model n_mel is 128 or 80
w_filters = whisper_precalc_filters::get_128_bins();
}
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
" https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
if (proj == PROJECTOR_TYPE_QWEN2A) {
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
aud_beg = "<|audio_bos|>";
aud_end = "<|audio_eos|>";
} }
} }
// get clip ctx based on chunk type
clip_ctx * get_clip_ctx(const mtmd_input_chunk * chunk) const {
if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
return ctx_v;
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
return ctx_a;
}
GGML_ABORT("unknown chunk type");
}
projector_type proj_type_v() const {
return ctx_v ? clip_get_projector_type(ctx_v) : PROJECTOR_TYPE_UNKNOWN;
}
projector_type proj_type_a() const {
return ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN;
}
~mtmd_context() {
clip_free(ctx_a);
clip_free(ctx_v);
}
private:
@ -267,8 +359,354 @@ void mtmd_free(mtmd_context * ctx) {
}
}
struct mtmd_tokenizer {
mtmd_context * ctx;
std::vector<const mtmd_bitmap *> bitmaps;
std::string input_text;
bool add_special;
bool parse_special;
const llama_vocab * vocab;
mtmd_input_chunks cur;
mtmd_tokenizer(mtmd_context * ctx,
const mtmd_input_text * text,
const mtmd_bitmap ** bitmaps,
size_t n_bitmaps) : ctx(ctx), bitmaps(bitmaps, bitmaps + n_bitmaps) {
add_special = text->add_special;
parse_special = text->parse_special;
input_text = text->text;
vocab = llama_model_get_vocab(ctx->text_model);
// for compatibility, we convert image marker to media marker
string_replace_all(input_text, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
}
int32_t tokenize(mtmd_input_chunks * output) {
cur.entries.clear();
std::vector<std::string> parts = split_text(input_text, ctx->media_marker);
size_t i_bm = 0; // index of the current bitmap
for (auto & part : parts) {
if (part == ctx->media_marker) {
// this is a marker, we should add the next bitmap
if (i_bm >= bitmaps.size()) {
LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
__func__, bitmaps.size(), parts.size() - 1);
return 1;
}
const mtmd_bitmap * bitmap = bitmaps[i_bm++];
int32_t res = add_media(bitmap);
if (res != 0) {
return res;
}
} else {
// this is a text part, we should add it as text
add_text(part, parse_special);
}
}
if (add_special && llama_vocab_get_add_bos(vocab)) {
// if first chunk is text, we add BOS token to first text chunk
// otherwise, create a new text chunk with BOS token
if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
// add BOS token to the beginning of first text chunk
cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab));
} else {
// create a new text chunk with BOS token at the beginning
mtmd_input_chunk bos_chunk{
MTMD_INPUT_CHUNK_TYPE_TEXT,
{llama_vocab_bos(vocab)},
nullptr, // image tokens
nullptr, // audio tokens
};
cur.entries.insert(cur.entries.begin(), std::move(bos_chunk));
}
}
if (add_special && llama_vocab_get_add_eos(vocab)) {
// if last chunk is text, we add EOS token to it
add_text({llama_vocab_eos(vocab)});
}
if (i_bm != bitmaps.size()) {
LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
__func__, bitmaps.size(), parts.size() - 1);
return 1;
}
*output = std::move(cur);
return 0;
}
void add_text(const std::string & txt, bool parse_special) {
LOG_DBG("%s: %s\n", __func__, txt.c_str());
auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special);
add_text(tokens);
}
void add_text(const std::vector<llama_token> & tokens) {
if (tokens.empty()) {
return;
}
// if last entry is also a text chunk, add tokens to it instead of creating new chunk
if (!cur.entries.empty() && cur.entries.back().type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
cur.entries.back().tokens_text.insert(
cur.entries.back().tokens_text.end(),
tokens.begin(),
tokens.end());
} else {
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_TEXT,
tokens,
nullptr, // image tokens
nullptr, // audio tokens
};
cur.entries.emplace_back(std::move(chunk));
}
}
int32_t add_media(const mtmd_bitmap * bitmap) {
if (!bitmap->is_audio) {
// handle image
if (!ctx->ctx_v) {
LOG_ERR("%s: error: model does not support vision input\n", __func__);
return 2;
}
if (!ctx->img_beg.empty()) {
add_text(ctx->img_beg, true); // add image begin token
}
// convert mtmd_bitmap to clip_image_u8
clip_image_u8_ptr img_u8(clip_image_u8_init());
img_u8->nx = bitmap->nx;
img_u8->ny = bitmap->ny;
img_u8->buf.resize(bitmap->data.size());
std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
// preprocess image
clip_image_f32_batch batch_f32;
bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
if (!ok) {
LOG_ERR("Unable to preprocess image\n");
return 2;
}
// handle llava-uhd style preprocessing
if (
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
) {
// split batch into chunks of single images
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
GGML_ASSERT(chunks.size() > 0);
auto ov_chunk = std::move(chunks.front());
chunks.erase(chunks.begin());
// add overview image (first)
if (ctx->ov_img_first) {
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_ov_img_start});
}
cur.entries.emplace_back(std::move(ov_chunk));
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_ov_img_end});
}
}
// add slices (or tiles)
if (!chunks.empty()) {
const int n_col = batch_f32.grid_x;
const int n_row = batch_f32.grid_y;
if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_slices_start});
}
for (int y = 0; y < n_row; y++) {
for (int x = 0; x < n_col; x++) {
const bool is_last_in_row = (x == n_col - 1);
if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_sli_img_start});
}
cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_sli_img_end});
}
if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_sli_img_mid});
}
}
if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_row_end});
}
}
if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_slices_end});
}
}
// add overview image (last)
if (!ctx->ov_img_first) {
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_ov_img_start});
}
cur.entries.emplace_back(std::move(ov_chunk));
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
add_text({ctx->tok_ov_img_end});
}
}
} else {
size_t n_tokens = 0;
for (const auto & entry : batch_f32.entries) {
n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
}
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
if (ctx->use_mrope) {
// for Qwen2VL, we need this information for M-RoPE decoding positions
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
image_tokens->use_mrope_pos = true;
} else {
// other models, we only need the total number of tokens
image_tokens->nx = n_tokens;
image_tokens->ny = 1;
}
image_tokens->batch_f32 = std::move(batch_f32);
image_tokens->id = bitmap->id; // optional
LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_IMAGE,
{}, // text tokens
std::move(image_tokens),
nullptr, // audio tokens
};
cur.entries.emplace_back(std::move(chunk));
}
if (!ctx->img_end.empty()) {
add_text(ctx->img_end, true); // add image end token
}
} else {
// handle audio
if (!ctx->ctx_a) {
LOG_ERR("%s: error: model does not support audio input\n", __func__);
return 2;
}
if (bitmap->data.size() == 0) {
LOG_ERR("%s: error: empty audio data\n", __func__);
return 2;
}
if (!ctx->aud_beg.empty()) {
add_text(ctx->aud_beg, true); // add audio begin token
}
// preprocess audio
GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
const float * samples = (const float *)bitmap->data.data();
size_t n_samples = bitmap->data.size() / sizeof(float);
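// note (assumption): for audio, mtmd_bitmap data carries raw PCM samples as 32-bit floats
// rather than pixels, hence the float cast and the division by sizeof(float) above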
bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks);
if (!ok) {
LOG_ERR("Unable to preprocess audio\n");
return 2;
}
// consider each mel_spec as a separate audio chunk
// TODO: maybe support batching, but this may come with memory cost
for (auto & mel_spec : mel_spec_chunks) {
clip_image_f32_ptr mel_f32(clip_image_f32_init());
mel_f32->nx = mel_spec.n_len;
mel_f32->ny = mel_spec.n_mel;
mel_f32->buf = std::move(mel_spec.data);
size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get());
clip_image_f32_batch batch_f32;
batch_f32.is_audio = true;
batch_f32.entries.push_back(std::move(mel_f32));
mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
audio_tokens->n_tokens = n_tokens;
audio_tokens->batch_f32 = std::move(batch_f32);
audio_tokens->id = bitmap->id; // optional
LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_AUDIO,
{}, // text tokens
nullptr, // image tokens
std::move(audio_tokens),
};
cur.entries.emplace_back(std::move(chunk));
}
if (!ctx->aud_end.empty()) {
add_text(ctx->aud_end, true); // add audio end token
}
}
return 0;
}
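// Illustrative sketch (not from the upstream sources): with a MiniCPM-V style slice
// template and ov_img_first == true, a single bitmap typically expands into a chunk
// sequence roughly like the following; the exact markers depend on which tok_* ids
// are set for the loaded model:
//   [tok_ov_img_start] [IMAGE overview] [tok_ov_img_end]
//   [tok_slices_start]
//     [tok_sli_img_start] [IMAGE slice (0,0)] [tok_sli_img_end] [tok_sli_img_mid] ... [tok_row_end]
//     ... one such row per grid_y ...
//   [tok_slices_end]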
std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
std::vector<mtmd_input_chunk> chunks;
for (auto & entry : batch_f32.entries) {
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, entry.get());
image_tokens->ny = 1;
image_tokens->batch_f32.entries.push_back(std::move(entry));
image_tokens->id = id;
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_IMAGE,
{}, // text tokens
std::move(image_tokens),
nullptr, // audio tokens
};
chunks.emplace_back(std::move(chunk));
}
return chunks;
}
// for example: "a <__media__> b <__media__> c" --> "a", "<__media__>", "b", "<__media__>", "c"
static std::vector<std::string> split_text(const std::string & input, const std::string & delimiter) {
std::vector<std::string> result;
if (input.empty()) {
return result;
}
size_t start = 0;
size_t pos = 0;
while ((pos = input.find(delimiter, start)) != std::string::npos) {
if (pos > start) {
result.push_back(input.substr(start, pos - start));
}
result.push_back(delimiter);
start = pos + delimiter.length();
}
if (start < input.length()) {
result.push_back(input.substr(start));
}
return result;
}
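// usage sketch (illustrative, not from the upstream sources):
//   auto parts = split_text("a <__media__> b", "<__media__>");
//   // -> {"a ", "<__media__>", " b"}
// a leading or trailing delimiter does not produce an empty part, and consecutive
// delimiters are each returned as their own element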
// copied from common_tokenize
static std::vector<llama_token> mtmd_tokenize_text_internal(
const struct llama_vocab * vocab,
const std::string & text,
bool add_special,
@@ -285,319 +723,16 @@ static std::vector<llama_token> mtmd_tokenize_text_internal(
result.resize(n_tokens);
}
return result;
}
};
int32_t mtmd_tokenize(mtmd_context * ctx,
mtmd_input_chunks * output,
const mtmd_input_text * text,
const mtmd_bitmap ** bitmaps,
size_t n_bitmaps) {
mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
return tokenizer.tokenize(output);
}
// ----- previous implementation (replaced by the mtmd_tokenizer call above): -----
auto vocab = llama_model_get_vocab(ctx->text_model);
std::string prompt_modified(text->text);
std::string marker_modified(ctx->media_marker);
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
// for compatibility, we convert image marker to media marker
string_replace_all(prompt_modified, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
// a bit hacky here, but works for now
// for some models, we need to add prefix and suffix to the image embeddings
if (clip_is_gemma3(ctx->ctx_clip)) {
// gemma 3
// <start_of_image> ... (image embeddings) ... <end_of_image>
marker_modified = "<start_of_image>" + ctx->media_marker + "<end_of_image>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
marker_modified = "<fake_token_around_image><global-img>" + ctx->media_marker + "<fake_token_around_image>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_PIXTRAL) {
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
marker_modified = ctx->media_marker + "[IMG_END]";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
marker_modified = "<|vision_start|>" + ctx->media_marker + "<|vision_end|>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_LLAMA4) {
// (more details in mtmd_context constructor)
marker_modified = "<|image_start|>" + ctx->media_marker + "<|image_end|>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
// <img> ... (image embeddings) ... </img>
marker_modified = "<img>" + ctx->media_marker + "</img>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
} else if (proj_type == PROJECTOR_TYPE_QWEN2A) {
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
marker_modified = "<|audio_bos|>" + ctx->media_marker + "<|audio_eos|>";
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
}
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
// for glm-edge, BOI and EOI token's embeddings are not present in the text model
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->media_marker);
output->entries.clear();
output->entries.reserve(parts.size());
size_t i_bm = 0;
// utility for adding raw tokens
auto add_text_chunk = [&output](std::vector<llama_token> && tokens) {
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_TEXT,
std::move(tokens),
nullptr, // image tokens
nullptr, // audio tokens
};
output->entries.emplace_back(std::move(chunk));
};
// utility for splitting batch of multiple images into chunks of batch having single images
auto split_batch_to_chunk = [&ctx](clip_image_f32_batch && batch_f32, const std::string & id) {
std::vector<mtmd_input_chunk> chunks;
for (auto & entry : batch_f32.entries) {
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
image_tokens->nx = clip_n_output_tokens(ctx->ctx_clip, entry.get());
image_tokens->ny = 1;
image_tokens->batch_f32.entries.push_back(std::move(entry));
image_tokens->id = id;
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_IMAGE,
{}, // text tokens
std::move(image_tokens),
nullptr, // audio tokens
};
chunks.emplace_back(std::move(chunk));
}
return chunks;
};
for (const auto & part : parts) {
// printf("tokenizing part: %s\n", part.c_str());
bool add_bos = &parts.front() == &part;
auto tokens = mtmd_tokenize_text_internal(vocab, part, text->add_special && add_bos, text->parse_special);
if (tokens.empty()) {
continue;
}
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_TEXT,
std::move(tokens),
nullptr, // image tokens
nullptr, // audio tokens
};
output->entries.emplace_back(std::move(chunk));
// only add image/audio tokens to middle of 2 parts
// therefore, we skip handling image/audio if this is the last part
if (&parts.back() == &part) {
continue;
}
if (!bitmaps[i_bm]->is_audio) {
// handle image
if (i_bm >= n_bitmaps) {
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
return 1;
}
if (!ctx->has_vision) {
LOG_ERR("%s: error: model does not support vision input\n", __func__);
return 2;
}
// convert mtmd_bitmap to clip_image_u8
clip_image_u8_ptr img_u8(clip_image_u8_init());
img_u8->nx = bitmaps[i_bm]->nx;
img_u8->ny = bitmaps[i_bm]->ny;
img_u8->buf.resize(bitmaps[i_bm]->data.size());
std::memcpy(img_u8->buf.data(), bitmaps[i_bm]->data.data(), img_u8->nx * img_u8->ny * 3);
// preprocess image
clip_image_f32_batch batch_f32;
bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32);
if (!ok) {
LOG_ERR("Unable to preprocess image\n");
return 2;
}
// handle llava-uhd style preprocessing
if (
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
) {
// split batch into chunks of single images
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_bm]->id);
GGML_ASSERT(chunks.size() > 0);
auto ov_chunk = std::move(chunks.front());
chunks.erase(chunks.begin());
// add overview image (first)
if (ctx->ov_img_first) {
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_ov_img_start});
}
output->entries.emplace_back(std::move(ov_chunk));
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_ov_img_end});
}
}
// add slices (or tiles)
if (!chunks.empty()) {
const int n_col = batch_f32.grid_x;
const int n_row = batch_f32.grid_y;
if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_slices_start});
}
for (int y = 0; y < n_row; y++) {
for (int x = 0; x < n_col; x++) {
const bool is_last_in_row = (x == n_col - 1);
if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_sli_img_start});
}
output->entries.emplace_back(std::move(chunks[y * n_col + x]));
if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_sli_img_end});
}
if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_sli_img_mid});
}
}
if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_row_end});
}
}
if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_slices_end});
}
}
// add overview image (last)
if (!ctx->ov_img_first) {
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_ov_img_start});
}
output->entries.emplace_back(std::move(ov_chunk));
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
add_text_chunk({ctx->tok_ov_img_end});
}
}
} else {
size_t n_tokens = 0;
for (const auto & entry : batch_f32.entries) {
n_tokens += clip_n_output_tokens(ctx->ctx_clip, entry.get());
}
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
if (ctx->use_mrope) {
// for Qwen2VL, we need this information for M-RoPE decoding positions
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get());
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get());
image_tokens->use_mrope_pos = true;
} else {
// other models, we only need the total number of tokens
image_tokens->nx = n_tokens;
image_tokens->ny = 1;
}
image_tokens->batch_f32 = std::move(batch_f32);
image_tokens->id = bitmaps[i_bm]->id; // optional
LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_IMAGE,
{}, // text tokens
std::move(image_tokens),
nullptr, // audio tokens
};
output->entries.emplace_back(std::move(chunk));
}
i_bm++; // move to next image
continue;
} else {
// handle audio
if (i_bm >= n_bitmaps) {
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
return 1;
}
if (!ctx->has_audio) {
LOG_ERR("%s: error: model does not support audio input\n", __func__);
return 2;
}
if (bitmaps[i_bm]->data.size() == 0) {
LOG_ERR("%s: error: empty audio data\n", __func__);
return 2;
}
// preprocess audio
GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
const float * samples = (const float *)bitmaps[i_bm]->data.data();
size_t n_samples = bitmaps[i_bm]->data.size() / sizeof(float);
bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks);
if (!ok) {
LOG_ERR("Unable to preprocess audio\n");
return 2;
}
// consider each mel_spec as a separate audio chunk
// TODO: maybe support batching, but this may come with memory cost
for (auto & mel_spec : mel_spec_chunks) {
clip_image_f32_ptr mel_f32(clip_image_f32_init());
mel_f32->nx = mel_spec.n_len;
mel_f32->ny = mel_spec.n_mel;
mel_f32->buf = std::move(mel_spec.data);
size_t n_tokens = clip_n_output_tokens(ctx->ctx_clip, mel_f32.get());
clip_image_f32_batch batch_f32;
batch_f32.is_audio = true;
batch_f32.entries.push_back(std::move(mel_f32));
mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
audio_tokens->n_tokens = n_tokens;
audio_tokens->batch_f32 = std::move(batch_f32);
audio_tokens->id = bitmaps[i_bm]->id; // optional
LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_AUDIO,
{}, // text tokens
nullptr, // image tokens
std::move(audio_tokens),
};
output->entries.emplace_back(std::move(chunk));
}
i_bm++;
continue;
}
}
return 0;
}
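// Rough usage sketch for the C API above (illustrative, not from the upstream sources;
// error handling omitted): after mtmd_tokenize() fills an mtmd_input_chunks object,
// the caller walks the chunks, calls mtmd_encode_chunk() on image/audio chunks, and
// reads the resulting embeddings via mtmd_get_output_embd() before decoding them
// alongside the text tokens with the language model.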
int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
@@ -605,41 +740,54 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
return 0;
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
if (!ctx->ctx_v) {
LOG_ERR("%s: model does not support vision input\n", __func__);
return 1;
}
return mtmd_encode(ctx, chunk->tokens_image.get());
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
if (!ctx->ctx_a) {
LOG_ERR("%s: model does not support audio input\n", __func__);
return 1;
}
int n_mmproj_embd = ctx->n_embd_text; // was: int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
bool ok = clip_image_batch_encode(
ctx->ctx_a, // was: ctx->ctx_clip,
ctx->n_threads,
&chunk->tokens_audio->batch_f32,
ctx->image_embd_v.data());
return ok ? 0 : 1;
}
LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type); // was: LOG_ERR("mtmd_encode_chunk: unknown chunk type %d\n", (int)chunk->type);
return 1;
}
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
clip_ctx * ctx_clip = ctx->ctx_v; // was: int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
if (!ctx_clip) {
LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
return 1;
}
int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
bool ok = false;
if (clip_is_llava(ctx_clip) || clip_is_minicpmv(ctx_clip) || clip_is_glm(ctx_clip)) { // was: ctx->ctx_clip in each check
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
const auto & entries = image_tokens->batch_f32.entries;
for (size_t i = 0; i < entries.size(); i++) {
int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get()); // was: ctx->ctx_clip
ok = clip_image_encode(
ctx_clip, // was: ctx->ctx_clip,
ctx->n_threads,
entries[i].get(),
ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
}
} else {
ok = clip_image_batch_encode(
ctx_clip, // was: ctx->ctx_clip,
ctx->n_threads,
&image_tokens->batch_f32,
ctx->image_embd_v.data());
@@ -653,8 +801,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
}
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
// was:
// projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
// if (proj_type == PROJECTOR_TYPE_GEMMA3) {
return true;
}
return false;
@@ -665,11 +812,11 @@ bool mtmd_decode_use_mrope(mtmd_context * ctx) {
}
bool mtmd_support_vision(mtmd_context * ctx) {
return ctx->ctx_v != nullptr; // was: return ctx->has_vision;
}
bool mtmd_support_audio(mtmd_context * ctx) {
return ctx->ctx_a != nullptr; // was: return ctx->has_audio;
}
// these 2 helpers below use internal clip_image_u8_ptr,

BIN tools/mtmd/test-2.mp3 (new file; binary content not shown)

View file

@@ -25,80 +25,99 @@ RUN_HUGE_TESTS=false
if [ "${1:-}" = "huge" ]; then
RUN_HUGE_TESTS=true
RUN_BIG_TESTS=true
echo "Include BIG and HUGE models..." # was: echo "Include BIG models..."
fi

###############

arr_prefix=() # was: arr_bin=()
arr_hf=()
arr_tmpl=() # chat template
arr_file=()

add_test_vision() {
local hf=$1
local tmpl=${2:-""} # default to empty string if not provided
arr_prefix+=("[vision]")
arr_hf+=("$hf")
arr_tmpl+=("$tmpl")
arr_file+=("test-1.jpeg")
}
# was: add_test() { local bin=$1; local hf=$2; local tmpl=${3:-""}; arr_bin+=("$bin"); arr_hf+=("$hf"); arr_tmpl+=("$tmpl"); }

add_test_audio() {
local hf=$1
arr_prefix+=("[audio] ")
arr_hf+=("$hf")
arr_tmpl+=("") # no need for chat tmpl
arr_file+=("test-2.mp3")
}

# the models below were previously registered via: add_test "llama-mtmd-cli" "<hf_repo>" [tmpl]
add_test_vision "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
add_test_vision "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
add_test_vision "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" "vicuna"
add_test_vision "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
add_test_vision "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
add_test_vision "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
add_test_vision "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
add_test_vision "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/InternVL2_5-1B-GGUF:Q8_0"
add_test_vision "ggml-org/InternVL3-1B-Instruct-GGUF:Q8_0"
add_test_vision "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"

add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
add_test_audio "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"

# to test the big models, run: ./tests.sh big
if [ "$RUN_BIG_TESTS" = true ]; then
add_test_vision "ggml-org/pixtral-12b-GGUF:Q4_K_M"
add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
add_test_vision "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/InternVL3-8B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
# add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
add_test_audio "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M"
add_test_audio "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
fi

# to test the huge models, run: ./tests.sh huge
# this will run both the big and huge models
# huge models are > 32B parameters
if [ "$RUN_HUGE_TESTS" = true ]; then
add_test_vision "ggml-org/Qwen2.5-VL-72B-Instruct-GGUF:Q4_K_M"
add_test_vision "ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF:IQ1_S"
fi

# these models always give the wrong answer, not sure why
# add_test_vision "ggml-org/SmolVLM-Instruct-GGUF:Q4_K_M"
# add_test_vision "ggml-org/SmolVLM-256M-Instruct-GGUF:Q8_0"
# add_test_vision "ggml-org/SmolVLM2-256M-Video-Instruct-GGUF:Q8_0"

# this model has broken chat template, not usable
# add_test_vision "cmp-nct/Yi-VL-6B-GGUF:Q5_K"
# add_test_vision "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek"

###############

cmake --build build -j --target llama-mtmd-cli # was: cmake --build build -j --target "${arr_bin[@]}"

arr_res=()

for i in "${!arr_hf[@]}"; do # was: for i in "${!arr_bin[@]}"; do
bin="llama-mtmd-cli" # was: bin="${arr_bin[$i]}"
prefix="${arr_prefix[$i]}"
hf="${arr_hf[$i]}"
tmpl="${arr_tmpl[$i]}"
inp_file="${arr_file[$i]}"

echo "Running test with binary: $bin and HF model: $hf"
echo ""

@@ -107,7 +126,7 @@ for i in "${!arr_bin[@]}"; do
output=$(\
"$PROJ_ROOT/build/bin/$bin" \
-hf "$hf" \
--image $SCRIPT_DIR/$inp_file \ # was: --image $SCRIPT_DIR/test-1.jpeg \
-p "what is the publisher name of the newspaper?" \
--temp 0 -n 128 \
${tmpl:+--chat-template "$tmpl"} \
@@ -116,9 +135,9 @@ for i in "${!arr_bin[@]}"; do
echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log

if echo "$output" | grep -iq "new york"; then
result="$prefix \033[32mOK\033[0m: $bin $hf" # was: result="\033[32mOK\033[0m: $bin $hf"
else
result="$prefix \033[31mFAIL\033[0m: $bin $hf" # was: result="\033[31mFAIL\033[0m: $bin $hf"
fi
echo -e "$result"
arr_res+=("$result")