Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2026-05-17 04:09:19 +00:00)
mtmd: add MiMo v2.5 vision (#22883)
* mimo-v2.5: vision support
* mimo-v2.5: use fused qkv for vision
* mimo-v2.5: fix f16 vision overflow
* mimo-v2.5: comment cleanups
* mimo-v2.5: Flash doesn't have mmproj; more cleanup; remember to use filter_tensors
* mimo-v2.5: fix trailing whitespace
This commit is contained in: parent 78fbbc2c07, commit 4178259130
12 changed files with 460 additions and 23 deletions
@@ -9760,6 +9760,73 @@ class MimoV2Model(TextModel):
                raise ValueError(f"Unprocessed experts: {experts}")


@ModelBase.register("MiMoV2ForCausalLM")
class MiMoV2VisionModel(MmprojModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None
        hp = self.hparams_vision

        hp["image_size"] = hp.get("image_size", 560)
        hp["num_attention_heads"] = hp.get("num_heads", 32)
        hp["num_hidden_layers"] = hp.get("depth", 28)

        self.n_q_heads = int(hp["num_heads"])
        self.num_kv_heads = int(hp.get("num_key_value_heads", 8))
        self.head_dim = int(hp.get("qk_channels", 64))
        self.spatial_merge_size = int(hp["spatial_merge_size"])
        # MiMoV2 vision RMSNorm: HF uses getattr(config, "rms_norm_eps", 1e-6) and the
        # field is absent from MiMo-V2.5's vision_config
        self.rms_norm_eps = float(hp.get("rms_norm_eps", 1e-6))

        # fullatt_block_indexes are also reflected in vit_window_attn_types as -1
        self.fullatt_block_indexes = list(hp.get("fullatt_block_indexes") or [])
        self.vit_window_attn_types = list(hp.get("vit_window_attn_types") or [])
        self.visual_token_window_size = int(hp.get("visual_token_window_size", -1))
        self.use_sink = bool(hp.get("use_sink", False))

    def set_gguf_parameters(self):
        super().set_gguf_parameters()

        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MIMOVL)
        self.gguf_writer.add_vision_use_silu(True)
        self.gguf_writer.add_vision_head_count_kv(self.num_kv_heads)
        self.gguf_writer.add_vision_spatial_merge_size(self.spatial_merge_size)
        self.gguf_writer.add_uint32(gguf.Keys.ClipVision.WINDOW_SIZE, self.visual_token_window_size)
        self.gguf_writer.add_vision_wa_pattern_mode(self.vit_window_attn_types)
        self.gguf_writer.add_vision_attention_layernorm_eps(self.rms_norm_eps)
        self.gguf_writer.add_vision_min_pixels(int(self.preprocessor_config["min_pixels"]))
        self.gguf_writer.add_vision_max_pixels(int(self.preprocessor_config["max_pixels"]))

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        # Sinks must be F32: any sink-style softmax/mask add in ggml requires
        # F32, and we fold sinks into a host-built F32 mask at encode time.
        if new_name.endswith(".attn_sinks"):
            return gguf.GGMLQuantizationType.F32
        return super().tensor_force_quant(name, new_name, bid, n_dims)

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        name, _ = item
        if not name.startswith("visual."):
            return None
        return super().filter_tensors(item)

    def modify_tensors(self, data_torch, name, bid):
        # Conv3D patch embed: split along the temporal axis (kt=2) into two Conv2D
        # weights that the existing qwen2vl-style two-Conv2D path consumes.
        if name == "visual.patch_embed.proj.weight":
            _, _, kt, _, _ = data_torch.shape
            if kt != 2:
                raise ValueError(f"unexpected temporal_patch_size: {kt}")
            embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH]
            yield (embd_name + ".weight", data_torch[:, :, 0, ...])
            yield (embd_name + ".weight.1", data_torch[:, :, 1, ...])
            return

        yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("Step3p5ForCausalLM")
class Step35Model(TextModel):
    model_arch = gguf.MODEL_ARCH.STEP35
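As an aside, the Conv3D-to-two-Conv2D split in modify_tensors above is exact for single images, where both temporal slices see the same frame. A minimal sketch of that equivalence (illustration only, not part of the commit; the shapes are made up):

import torch
import torch.nn.functional as F

out_c, in_c, kt, kh, kw = 8, 3, 2, 16, 16
w3d = torch.randn(out_c, in_c, kt, kh, kw)     # Conv3D patch-embed weight
img = torch.randn(1, in_c, kh, kw)             # one 16x16 patch, single frame

# Conv3D over a duplicated frame (temporal_patch_size = 2)
vid = img.unsqueeze(2).repeat(1, 1, kt, 1, 1)  # [1, in_c, 2, 16, 16]
y3d = F.conv3d(vid, w3d).squeeze(2)            # [1, out_c, 1, 1]

# The two split Conv2D weights, applied and summed (the layout mtmd consumes)
y2d = F.conv2d(img, w3d[:, :, 0]) + F.conv2d(img, w3d[:, :, 1])

print(torch.allclose(y3d, y2d, atol=1e-5))     # True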
@@ -299,30 +299,32 @@ class Keys:
        HAS_LLAVA_PROJECTOR = "clip.has_llava_projector"

    class ClipVision:
        PROJECTOR_TYPE      = "clip.vision.projector_type"  # for mixed modality models
        IMAGE_SIZE          = "clip.vision.image_size"
        IMAGE_MIN_PIXELS    = "clip.vision.image_min_pixels"
        IMAGE_MAX_PIXELS    = "clip.vision.image_max_pixels"
        PREPROC_MIN_TILES   = "clip.vision.preproc_min_tiles"
        PREPROC_MAX_TILES   = "clip.vision.preproc_max_tiles"
        PREPROC_IMAGE_SIZE  = "clip.vision.preproc_image_size"
        PATCH_SIZE          = "clip.vision.patch_size"
        EMBEDDING_LENGTH    = "clip.vision.embedding_length"
        FEED_FORWARD_LENGTH = "clip.vision.feed_forward_length"
        PROJECTION_DIM      = "clip.vision.projection_dim"
        BLOCK_COUNT         = "clip.vision.block_count"
        IMAGE_MEAN          = "clip.vision.image_mean"
        IMAGE_STD           = "clip.vision.image_std"
        SPATIAL_MERGE_SIZE  = "clip.vision.spatial_merge_size"
        USE_GELU            = "clip.use_gelu"
        USE_SILU            = "clip.use_silu"
        N_WA_PATTERN        = "clip.vision.n_wa_pattern"      # used by qwen2.5vl
        WA_LAYER_INDEXES    = "clip.vision.wa_layer_indexes"  # used by youtuvl
        WA_PATTERN_MODE     = "clip.vision.wa_pattern_mode"   # used by mimovl, per-layer -1/0/1
        IS_DEEPSTACK_LAYERS = "clip.vision.is_deepstack_layers"
        WINDOW_SIZE         = "clip.vision.window_size"

        class Attention:
            HEAD_COUNT    = "clip.vision.attention.head_count"
            HEAD_COUNT_KV = "clip.vision.attention.head_count_kv"  # used by mimovl (GQA)
            LAYERNORM_EPS = "clip.vision.attention.layer_norm_epsilon"

        class Projector:
@@ -733,6 +735,7 @@ class MODEL_TENSOR(IntEnum):
    V_ENC_ATTN_V = auto()
    V_ENC_ATTN_O = auto()
    V_ENC_ATTN_O_NORM = auto()
    V_ENC_ATTN_SINKS = auto()  # mimovl
    V_ENC_POST_ATTN_NORM = auto()
    V_ENC_FFN_UP = auto()
    V_ENC_FFN_GATE = auto()
@@ -1246,6 +1249,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.V_ENC_INPUT_NORM:     "v.blk.{bid}.ln1",
    MODEL_TENSOR.V_ENC_ATTN_O:         "v.blk.{bid}.attn_out",
    MODEL_TENSOR.V_ENC_ATTN_O_NORM:    "v.blk.{bid}.attn_out_norm",
    MODEL_TENSOR.V_ENC_ATTN_SINKS:     "v.blk.{bid}.attn_sinks",
    MODEL_TENSOR.V_ENC_POST_ATTN_NORM: "v.blk.{bid}.ln2",
    MODEL_TENSOR.V_ENC_FFN_UP:         "v.blk.{bid}.ffn_up",
    MODEL_TENSOR.V_ENC_FFN_GATE:       "v.blk.{bid}.ffn_gate",
@@ -1426,6 +1430,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.V_ENC_ATTN_V,
        MODEL_TENSOR.V_ENC_ATTN_O,
        MODEL_TENSOR.V_ENC_ATTN_O_NORM,
        MODEL_TENSOR.V_ENC_ATTN_SINKS,
        MODEL_TENSOR.V_ENC_POST_ATTN_NORM,
        MODEL_TENSOR.V_ENC_FFN_UP,
        MODEL_TENSOR.V_ENC_FFN_GATE,
@@ -4258,6 +4263,7 @@ class VisionProjectorType:
    HUNYUANVL      = "hunyuanvl"
    MINICPMV4_6    = "minicpmv4_6"
    GRANITE_SPEECH = "granite_speech"  # audio
    MIMOVL         = "mimovl"


# Items here are (block size, type size)
@@ -1151,6 +1151,9 @@ class GGUFWriter:
    def add_vision_head_count(self, value: int) -> None:
        self.add_uint32(Keys.ClipVision.Attention.HEAD_COUNT, value)

    def add_vision_head_count_kv(self, value: int) -> None:
        self.add_uint32(Keys.ClipVision.Attention.HEAD_COUNT_KV, value)

    def add_vision_attention_layernorm_eps(self, value: float) -> None:
        self.add_float32(Keys.ClipVision.Attention.LAYERNORM_EPS, value)
@@ -1222,6 +1225,9 @@ class GGUFWriter:
    def add_vision_is_deepstack_layers(self, layers: Sequence[bool]) -> None:
        self.add_array(Keys.ClipVision.IS_DEEPSTACK_LAYERS, layers)

    def add_vision_wa_pattern_mode(self, modes: Sequence[int]) -> None:
        self.add_array(Keys.ClipVision.WA_PATTERN_MODE, modes)

    def add_vision_window_size(self, value: int) -> None:
        self.add_uint32(Keys.ClipVision.WINDOW_SIZE, value)
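For reference, a hypothetical usage sketch of the new writer helpers (file name and values are invented; assumes the gguf-py package from this repo, whose GGUFWriter takes a path and an architecture string):

import gguf

w = gguf.GGUFWriter("mmproj-example.gguf", "clip")
w.add_vision_head_count_kv(8)                # clip.vision.attention.head_count_kv
w.add_vision_wa_pattern_mode([-1, 0, 1, 0])  # clip.vision.wa_pattern_mode, per-layer -1/0/1
w.add_vision_window_size(144)                # clip.vision.window_size (value invented)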
@@ -1569,6 +1569,10 @@ class TensorNameMap:
            "vision_model.transformer.resblocks.{bid}.attn.out_proj",  # Step3-VL
        ),

        MODEL_TENSOR.V_ENC_ATTN_SINKS: (
            "visual.blocks.{bid}.attn.sinks",  # mimovl
        ),

        MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
            "model.vision_tower.encoder.layers.{bid}.layer_norm2",  # minicpmv4_6
@@ -34,6 +34,7 @@ add_library(mtmd
    models/pixtral.cpp
    models/qwen2vl.cpp
    models/qwen3vl.cpp
    models/mimovl.cpp
    models/qwen3a.cpp
    models/step3vl.cpp
    models/siglip.cpp
@@ -98,7 +98,8 @@ struct clip_graph {
            ggml_tensor * v_cur,
            ggml_tensor * kq_mask,
            float kq_scale,
            int il,
            ggml_tensor * sinks = nullptr) const;

    // implementation of the 2D RoPE without adding a new op in ggml
    // this is not efficient (use double the memory), but works on all backends
@@ -31,6 +31,7 @@
#define KEY_N_BLOCK          "clip.%s.block_count"
#define KEY_PROJ_DIM         "clip.%s.projection_dim"
#define KEY_N_HEAD           "clip.%s.attention.head_count"
#define KEY_N_HEAD_KV        "clip.%s.attention.head_count_kv"
#define KEY_LAYER_NORM_EPS   "clip.%s.attention.layer_norm_epsilon"

// vision-specific

@@ -53,6 +54,7 @@
#define KEY_IMAGE_GRID_PINPOINTS   "clip.vision.image_grid_pinpoints"
#define KEY_WIN_ATTN_PATTERN       "clip.vision.n_wa_pattern"
#define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
#define KEY_WA_PATTERN_MODE        "clip.vision.wa_pattern_mode"
#define KEY_ATTN_WINDOW_SIZE       "clip.vision.window_size"
#define KEY_MINICPMV_VERSION       "clip.minicpmv_version"
#define KEY_MINICPMV_QUERY_NUM     "clip.minicpmv_query_num"

@@ -86,6 +88,7 @@
#define TN_ATTN_Q        "%s.blk.%d.attn_q.%s"
#define TN_ATTN_V        "%s.blk.%d.attn_v.%s"
#define TN_ATTN_OUTPUT   "%s.blk.%d.attn_out.%s"
#define TN_ATTN_SINKS    "%s.blk.%d.attn_sinks"
#define TN_ATTN_K_NORM   "%s.blk.%d.attn_k_norm.%s"
#define TN_ATTN_Q_NORM   "%s.blk.%d.attn_q_norm.%s"
#define TN_FFN_DOWN      "%s.blk.%d.ffn_down.%s"

@@ -344,6 +347,7 @@ enum projector_type {
    PROJECTOR_TYPE_HUNYUANVL,
    PROJECTOR_TYPE_MINICPMV4_6,
    PROJECTOR_TYPE_GRANITE_SPEECH,
    PROJECTOR_TYPE_MIMOVL,
    PROJECTOR_TYPE_UNKNOWN,
};

@@ -393,6 +397,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_HUNYUANVL,      "hunyuanvl"},
    { PROJECTOR_TYPE_MINICPMV4_6,    "minicpmv4_6"},
    { PROJECTOR_TYPE_GRANITE_SPEECH, "granite_speech"},
    { PROJECTOR_TYPE_MIMOVL,         "mimovl"},
};

static projector_type clip_projector_type_from_string(const std::string & str) {
@@ -42,6 +42,7 @@ struct clip_hparams {
    int32_t n_ff = 0;
    int32_t projection_dim = 0;
    int32_t n_head = 0;
    int32_t n_head_kv = 0;
    int32_t n_layer = 0;
    // idefics3
    int32_t n_merge = 0; // number of patch merges **per-side**

@@ -83,6 +84,7 @@ struct clip_hparams {
    int32_t attn_window_size = 0;
    int32_t n_wa_pattern = 0;
    std::unordered_set<int32_t> wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL)
    std::vector<int32_t> wa_pattern_mode; // mimovl: per-layer window-attention mode

    // deepseek-ocr (sam)
    int32_t sam_n_layer = 0;

@@ -166,6 +168,8 @@ struct clip_layer {
    ggml_tensor * o_w = nullptr;
    ggml_tensor * o_b = nullptr;

    ggml_tensor * attn_sinks = nullptr;

    ggml_tensor * k_norm = nullptr;
    ggml_tensor * q_norm = nullptr;
@@ -642,7 +642,8 @@ ggml_tensor * clip_graph::build_attn(
        ggml_tensor * v_cur,
        ggml_tensor * kq_mask,
        float kq_scale,
        int il,
        ggml_tensor * sinks) const {
    // these nodes are added to the graph together so that they are not reordered
    // by doing so, the number of splits in the graph is reduced
    ggml_build_forward_expand(gf, q_cur);

@@ -665,6 +666,9 @@ ggml_tensor * clip_graph::build_attn(
        cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, 0.0f, 0.0f);
        ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
        if (sinks != nullptr) {
            ggml_flash_attn_ext_add_sinks(cur, sinks);
        }

        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]);

@@ -677,6 +681,9 @@ ggml_tensor * clip_graph::build_attn(
        // ggml_mul_mat_set_prec(kq, GGML_PREC_F32);

        kq = ggml_soft_max_ext(ctx0, kq, kq_mask, kq_scale, 0.0f);
        if (sinks != nullptr) {
            ggml_soft_max_add_sinks(kq, sinks);
        }

        ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
        cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
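A note on what the sink term does mathematically (not spelled out in the diff, but this is the standard formulation the *_add_sinks helpers implement, as far as I know): for a head with sink logit s_h, mask m and scaled scores, the attention weight on key j becomes

\[
a_j \;=\; \frac{\exp\!\big(\tfrac{q \cdot k_j}{\sqrt{d}} + m_j\big)}{\exp(s_h) \;+\; \sum_{j'} \exp\!\big(\tfrac{q \cdot k_{j'}}{\sqrt{d}} + m_{j'}\big)},
\]

i.e. the sink only enlarges the softmax denominator, which is the same as appending one extra key column with logit s_h whose value row is all zeros.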
@@ -866,6 +873,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            {
                builder = std::make_unique<clip_graph_qwen3vl>(ctx, img);
            } break;
        case PROJECTOR_TYPE_MIMOVL:
            {
                builder = std::make_unique<clip_graph_mimovl>(ctx, img);
            } break;
        case PROJECTOR_TYPE_STEP3VL:
            {
                builder = std::make_unique<clip_graph_step3vl>(ctx, img);
@@ -1389,6 +1400,22 @@ struct clip_model_loader {
                        LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
                    }
                } break;
            case PROJECTOR_TYPE_MIMOVL:
                {
                    hparams.n_merge = 2; // spatial_merge_size
                    hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
                    get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                    get_u32(string_format(KEY_N_HEAD_KV, "vision"), hparams.n_head_kv);
                    // 1D banded sliding-window radius (visual_token_window_size); required
                    get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size);
                    std::vector<int> pat;
                    get_arr_int(KEY_WA_PATTERN_MODE, pat, true);
                    GGML_ASSERT((int) pat.size() == hparams.n_layer && "mimovl wa_pattern_mode length must equal n_layer");
                    hparams.wa_pattern_mode.assign(pat.begin(), pat.end());
                    get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
                    get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
                    hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
                } break;
            case PROJECTOR_TYPE_STEP3VL:
                {
                    hparams.n_merge = 4; // two stride-2 downsamplers after patching
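To make the per-layer modes concrete, a hypothetical example of what the loader expects (values invented; -1 = full attention, 0 = row-window + sinks, 1 = col-window + sinks, and the -1 entries correspond to fullatt_block_indexes in the HF config, as noted in the convert script above):

vit_window_attn_types = [0, 1, 0, -1, 0, 1, 0, -1]   # one entry per ViT layer
fullatt_block_indexes = [i for i, m in enumerate(vit_window_attn_types) if m == -1]
print(fullatt_block_indexes)   # [3, 7]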
@@ -1729,6 +1756,8 @@ struct clip_model_loader {
            layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight"));
            layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"), false);

            // mimovl per-head attention sink bias
            layer.attn_sinks = get_tensor(string_format(TN_ATTN_SINKS, prefix, il), false);

            // qwen3vl deepstack layer
            layer.deepstack_norm_w = get_tensor(string_format(TN_DEEPSTACK_NORM, il, "weight"), false);
@@ -1913,6 +1942,13 @@ struct clip_model_loader {
                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
                } break;
            case PROJECTOR_TYPE_MIMOVL:
                {
                    model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
                    model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"), false);
                    model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
                    model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"), false);
                } break;
            case PROJECTOR_TYPE_STEP3VL:
                {
                    model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
@@ -3011,6 +3047,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_MIMOVL:
        case PROJECTOR_TYPE_GLM4V:
        case PROJECTOR_TYPE_PADDLEOCR:
        case PROJECTOR_TYPE_HUNYUANOCR:

@@ -3032,6 +3069,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 *
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_MIMOVL:
        case PROJECTOR_TYPE_GLM4V:
        case PROJECTOR_TYPE_PADDLEOCR:
        case PROJECTOR_TYPE_HUNYUANVL:

@@ -3110,6 +3148,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_MIMOVL:
        case PROJECTOR_TYPE_GLM4V:
        case PROJECTOR_TYPE_YOUTUVL:
            {
@@ -3681,6 +3720,89 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima

                set_input_i32("positions", positions);
            } break;
        case PROJECTOR_TYPE_MIMOVL:
            {
                const int merge      = hparams.n_merge;    // 2
                const int merge_unit = merge * merge;      // 4
                const int patch      = hparams.patch_size; // 16
                const int H = image_size_height / patch;
                const int W = image_size_width  / patch;
                const int n_pos_full = H * W;
                const int llm_h = H / merge;
                const int llm_w = W / merge;
                const int n_units = llm_h * llm_w; // n_pos / merge_unit

                // Row-major merge-tile-ordered (h, w) positions
                std::vector<int32_t> pos_h_row(n_pos_full);
                std::vector<int32_t> pos_w_row(n_pos_full);
                {
                    int idx = 0;
                    for (int ty = 0; ty < llm_h; ty++) {
                        for (int tx = 0; tx < llm_w; tx++) {
                            for (int dy = 0; dy < merge; dy++) {
                                for (int dx = 0; dx < merge; dx++) {
                                    pos_h_row[idx] = ty * merge + dy;
                                    pos_w_row[idx] = tx * merge + dx;
                                    idx++;
                                }
                            }
                        }
                    }
                }

                // Col-major merge-unit permutation
                std::vector<float> idx_col(n_units);
                for (int r = 0; r < llm_h; r++) {
                    for (int c = 0; c < llm_w; c++) {
                        int u_row = r * llm_w + c;
                        int u_col = c * llm_h + r;
                        idx_col[u_col] = (float) u_row;
                    }
                }

                // Col-mode positions: permute pos_*_row by idx_col
                std::vector<int32_t> pos_h_col(n_pos_full);
                std::vector<int32_t> pos_w_col(n_pos_full);
                for (int u = 0; u < n_units; u++) {
                    int src = (int) idx_col[u];
                    for (int k = 0; k < merge_unit; k++) {
                        pos_h_col[u * merge_unit + k] = pos_h_row[src * merge_unit + k];
                        pos_w_col[u * merge_unit + k] = pos_w_row[src * merge_unit + k];
                    }
                }

                // Pack into ggml_rope_multi VISION-mode layout. The non-CPU kernels
                // only read slots 0 and 1, so pack h in slot 0, w in slot 1:
                //   positions[0..n_pos)         = h
                //   positions[n_pos..2*n_pos)   = w
                //   positions[2*n_pos..3*n_pos) = 0
                //   positions[3*n_pos..4*n_pos) = 0
                std::vector<int32_t> positions_row(static_cast<size_t>(n_pos_full) * 4, 0);
                std::vector<int32_t> positions_col(static_cast<size_t>(n_pos_full) * 4, 0);
                for (int i = 0; i < n_pos_full; i++) {
                    positions_row[0 * n_pos_full + i] = pos_h_row[i];
                    positions_row[1 * n_pos_full + i] = pos_w_row[i];
                    positions_col[0 * n_pos_full + i] = pos_h_col[i];
                    positions_col[1 * n_pos_full + i] = pos_w_col[i];
                }

                // Banded 1D sliding-window mask
                const int window = hparams.attn_window_size;
                GGML_ASSERT(window > 0);
                std::vector<float> mask(static_cast<size_t>(n_pos_full) * n_pos_full, std::numeric_limits<float>::lowest());
                for (int q = 0; q < n_pos_full; q++) {
                    int lo = std::max(0, q - window);
                    int hi = std::min(n_pos_full - 1, q + window);
                    for (int k = lo; k <= hi; k++) {
                        mask[static_cast<size_t>(q) * n_pos_full + k] = 0.0f;
                    }
                }

                set_input_i32("mimovl_positions_row", positions_row);
                set_input_i32("mimovl_positions_col", positions_col);
                set_input_f32("mimovl_idx_col", idx_col);
                set_input_f32("mimovl_window_mask", mask);
            } break;
        case PROJECTOR_TYPE_PIXTRAL:
        case PROJECTOR_TYPE_KIMIVL:
        case PROJECTOR_TYPE_KIMIK25:
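To see what these host-built inputs look like, here is a small Python rendering of the same loops for a hypothetical 4x4 patch grid (merge = 2, so llm_h = llm_w = 2, merge_unit = 4, n_pos = 16); the numbers are only an illustration:

merge, llm_h, llm_w = 2, 2, 2
merge_unit = merge * merge
n_units = llm_h * llm_w

pos_h_row, pos_w_row = [], []
for ty in range(llm_h):
    for tx in range(llm_w):
        for dy in range(merge):
            for dx in range(merge):
                pos_h_row.append(ty * merge + dy)
                pos_w_row.append(tx * merge + dx)

# col-major merge-unit permutation: unit (r, c) in row order lands in slot c*llm_h + r
idx_col = [0.0] * n_units
for r in range(llm_h):
    for c in range(llm_w):
        idx_col[c * llm_h + r] = float(r * llm_w + c)

print(pos_h_row)  # [0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3]
print(pos_w_row)  # [0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3]
print(idx_col)    # [0.0, 2.0, 1.0, 3.0]

The banded mask is then just an n_pos x n_pos matrix that is 0 where |q - k| <= attn_window_size and the lowest float (effectively -inf) elsewhere, broadcast across heads.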
@@ -4081,6 +4203,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
        case PROJECTOR_TYPE_QWEN3VL:
            // main path + deepstack paths
            return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
        case PROJECTOR_TYPE_MIMOVL:
            return ctx->model.mm_1_w->ne[1];
        case PROJECTOR_TYPE_STEP3VL:
            return ctx->model.mm_model_proj->ne[1];
        case PROJECTOR_TYPE_GEMMA3:
tools/mtmd/models/mimovl.cpp (new file, 209 lines)
@@ -0,0 +1,209 @@
#include "models.h"

ggml_tensor * clip_graph_mimovl::build_mm(ggml_tensor * w, ggml_tensor * x) const {
    ggml_tensor * cur = ggml_mul_mat(ctx0, w, x);
    ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
    return cur;
}

// MiMoVL vision tower for MiMo-V2.5 (non-Pro). Qwen2.5-VL-shaped ViT, except:
//   1. GQA in attention (32 Q / 8 KV heads, head_dim 64).
//   2. Per-head attention sinks on every windowed layer. The sinks adjust
//      the softmax denominator (equivalently, a virtual extra K column with V=0),
//      so they decay attention weight without contributing to the output.
//   3. Per-layer window-attention mode in hparams.wa_pattern_mode:
//      -1 -> full, 0 -> row-window+sinks, 1 -> col-window+sinks.
//      Col mode transposes the merge-unit grid on entry and restores
//      it on exit. Both patch and rotary orderings are pre-computed
//      host-side.
//   4. 1D banded sliding window (|q-k| > window_size -> -inf) as a
//      single 2D mask broadcast across heads.
//   5. Per-block MLP biases.
ggml_cgraph * clip_graph_mimovl::build() {
    GGML_ASSERT(model.patch_embeddings_0 != nullptr);
    GGML_ASSERT(model.patch_embeddings_1 != nullptr);
    GGML_ASSERT(model.class_embedding == nullptr);
    GGML_ASSERT(hparams.n_head_kv > 0);
    GGML_ASSERT(n_head % hparams.n_head_kv == 0);
    GGML_ASSERT((int) hparams.wa_pattern_mode.size() == n_layer);

    const int batch_size = 1;
    const int n_pos      = n_patches;
    const int n_head_kv  = hparams.n_head_kv;
    const int merge      = hparams.n_merge > 0 ? hparams.n_merge : 2;
    const int merge_unit = merge * merge;
    const int n_units    = n_pos / merge_unit;
    GGML_ASSERT(n_units * merge_unit == n_pos);

    // MiMoVL has head_dim=64 with n_embd=1280, so n_embd is NOT n_head*head_dim
    // (the base class's d_head = n_embd/n_head = 40 is wrong here). Derive
    // head_dim from the fused QKV projection: rows = (n_head + 2*n_head_kv)*head_dim.
    GGML_ASSERT(model.layers[0].qkv_w != nullptr);
    const int qkv_rows = model.layers[0].qkv_w->ne[1];
    const int head_dim = qkv_rows / (n_head + 2 * n_head_kv);
    GGML_ASSERT(head_dim * (n_head + 2 * n_head_kv) == qkv_rows);
    const float attn_scale = 1.0f / std::sqrt((float) head_dim);
    const int rope_n_dims = head_dim / 2;
    int mrope_sections[4] = {rope_n_dims/2, rope_n_dims/2, 0, 0};

    // Patch embed: Conv3D(kt=2) split into two Conv2D, then interleave-merge
    // along the height axis to match the merge-tile token order.
    ggml_tensor * inp_raw = build_inp_raw();
    ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw,
                                     patch_size, patch_size, 0, 0, 1, 1);
    {
        ggml_tensor * inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw,
                                           patch_size, patch_size, 0, 0, 1, 1);
        inp = ggml_add(ctx0, inp, inp_1);

        GGML_ASSERT(img.nx % (patch_size * 2) == 0);
        GGML_ASSERT(img.ny % (patch_size * 2) == 0);

        inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w,h,c,b] -> [c,w,h,b]
        inp = ggml_cont_4d(ctx0, inp, n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
        inp = ggml_reshape_4d(ctx0, inp, n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
        inp = ggml_cont_3d(ctx0, inp, n_embd, n_patches_x * n_patches_y, batch_size);
    }
    cb(inp, "patch_embed", -1);

    ggml_tensor * positions_row = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos * 4);
    ggml_set_name(positions_row, "mimovl_positions_row");
    ggml_set_input(positions_row);

    ggml_tensor * positions_col = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos * 4);
    ggml_set_name(positions_col, "mimovl_positions_col");
    ggml_set_input(positions_col);

    // idx_col is the col-major merge-unit permutation. Take it as F32 so we can
    // derive the inverse permutation in-graph via ggml_argsort;
    // ggml_get_rows requires its index tensor to be I32, so cast back as well.
    ggml_tensor * idx_col_f = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_units);
    ggml_set_name(idx_col_f, "mimovl_idx_col");
    ggml_set_input(idx_col_f);
    ggml_tensor * idx_col     = ggml_cast(ctx0, idx_col_f, GGML_TYPE_I32);
    ggml_tensor * idx_col_inv = ggml_argsort(ctx0, idx_col_f, GGML_SORT_ORDER_ASC);

    ggml_tensor * window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_pos, n_pos);
    ggml_set_name(window_mask, "mimovl_window_mask");
    ggml_set_input(window_mask);

    ggml_tensor * window_mask_attn = (flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED)
        ? ggml_cast(ctx0, window_mask, GGML_TYPE_F16)
        : window_mask;

    // Reorder helper: permute patches at merge-unit granularity. The patch
    // sequence is laid out as n_units groups of merge_unit (=4) consecutive
    // patches; the row<->col transpose only permutes whole groups. We keep
    // the per-group (h,w) ordering intact by reshaping to
    // [n_embd*merge_unit, n_units] before ggml_get_rows.
    auto reorder = [&](ggml_tensor * x, ggml_tensor * idx) {
        ggml_tensor * y = ggml_reshape_2d(ctx0, x, n_embd * merge_unit, n_units);
        y = ggml_get_rows(ctx0, y, idx);
        return ggml_reshape_3d(ctx0, y, n_embd, n_pos, batch_size);
    };

    ggml_tensor * inpL = inp;
    int prev_mode = -1;

    for (int il = 0; il < n_layer; il++) {
        const auto & layer = model.layers[il];
        const int  mode    = hparams.wa_pattern_mode[il];
        const bool is_full = (mode == -1);
        const bool is_col  = (mode == 1);

        // Reorder transitions on entry/exit of a col-mode run.
        if (is_col && prev_mode != 1) {
            inpL = reorder(inpL, idx_col);
            cb(inpL, "reorder_to_col", il);
        } else if (!is_col && prev_mode == 1) {
            inpL = reorder(inpL, idx_col_inv);
            cb(inpL, "reorder_to_row", il);
        }

        ggml_tensor * cur = inpL;

        // Pre-attention RMSNorm.
        cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_RMS, eps, il);
        cb(cur, "ln1", il);

        // Fused QKV with GQA.
        ggml_tensor * qkv = build_mm(layer.qkv_w, cur);
        qkv = ggml_add(ctx0, qkv, layer.qkv_b);

        const size_t row   = ggml_row_size(qkv->type, head_dim);
        const size_t off_k = ggml_row_size(qkv->type, n_head * head_dim);
        const size_t off_v = ggml_row_size(qkv->type, (n_head + n_head_kv) * head_dim);

        ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, head_dim, n_head,    n_pos, row, qkv->nb[1], 0);
        ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, head_dim, n_head_kv, n_pos, row, qkv->nb[1], off_k);
        ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, head_dim, n_head_kv, n_pos, row, qkv->nb[1], off_v);

        cb(Qcur, "Qcur", il);
        cb(Kcur, "Kcur", il);
        cb(Vcur, "Vcur", il);

        // 2D RoPE
        ggml_tensor * pos = is_col ? positions_col : positions_row;
        Qcur = ggml_rope_multi(ctx0, Qcur, pos, nullptr, rope_n_dims, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000.0f, 1.0f, 0.0f, 1.0f, 32.0f, 1.0f);
        Kcur = ggml_rope_multi(ctx0, Kcur, pos, nullptr, rope_n_dims, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000.0f, 1.0f, 0.0f, 1.0f, 32.0f, 1.0f);
        cb(Qcur, "Qcur_rope", il);
        cb(Kcur, "Kcur_rope", il);

        // Full layers: plain attention. Windowed layers: banded mask and per-head sinks.
        ggml_tensor * mask  = is_full ? nullptr : window_mask_attn;
        ggml_tensor * sinks = is_full ? nullptr : layer.attn_sinks;
        if (!is_full) {
            GGML_ASSERT(layer.attn_sinks != nullptr);
        }
        ggml_tensor * attn_out = build_attn(layer.o_w, layer.o_b, Qcur, Kcur, Vcur, mask, attn_scale, il, sinks);
        cb(attn_out, "attn_out", il);

        // Residual 1.
        cur = ggml_add(ctx0, attn_out, inpL);
        inpL = cur;
        cb(cur, "ffn_inp", il);

        // Pre-FFN RMSNorm.
        cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_RMS, eps, il);
        cb(cur, "ffn_inp_normed", il);

        // SwiGLU MLP with biases
        cur = build_ffn(cur,
                        layer.ff_up_w,   layer.ff_up_b,
                        layer.ff_gate_w, layer.ff_gate_b,
                        layer.ff_down_w, layer.ff_down_b,
                        hparams.ffn_op, il);
        cb(cur, "ffn_out", il);

        // Residual 2.
        cur = ggml_add(ctx0, inpL, cur);
        cb(cur, "layer_out", il);

        inpL = cur;
        prev_mode = mode;
    }

    // If the last block was col-mode, undo the transpose so the merger sees patches in row order.
    if (prev_mode == 1) {
        inpL = reorder(inpL, idx_col_inv);
        cb(inpL, "reorder_to_row_final", -1);
    }

    // Merger: post-LayerNorm
    inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, 1e-6f, n_layer);
    cb(inpL, "post_ln", -1);

    // Spatial merge: pack each merge_unit (=4) of patches into a single
    // (n_embd*merge_unit)-wide row, then run the 2-layer MLP.
    ggml_tensor * embeddings = ggml_reshape_3d(ctx0, inpL, n_embd * merge_unit, n_units, batch_size);
    embeddings = build_ffn(embeddings,
                           model.mm_0_w, nullptr,
                           nullptr,      nullptr,
                           model.mm_1_w, nullptr,
                           FFN_GELU, -1);
    cb(embeddings, "vit_out", -1);

    ggml_build_forward_expand(gf, embeddings);
    return gf;
}
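A quick sanity check of the head_dim derivation above, plugging in the sizes the comments mention (32 Q heads, 8 KV heads, head_dim 64, n_embd 1280); purely illustrative arithmetic:

n_embd, n_head, n_head_kv, head_dim = 1280, 32, 8, 64
qkv_rows = (n_head + 2 * n_head_kv) * head_dim
print(qkv_rows)                              # 3072 rows in the fused QKV projection
print(qkv_rows // (n_head + 2 * n_head_kv))  # 64 -> recovered head_dim
print(n_embd // n_head)                      # 40 -> what the base class would wrongly infer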
@@ -33,6 +33,15 @@ struct clip_graph_qwen3vl : clip_graph {
    ggml_cgraph * build() override;
};

struct clip_graph_mimovl : clip_graph {
    clip_graph_mimovl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
    // Force F32 mat-mul accumulation to avoid F16 overflow in the FFN down-proj
    // when the mmproj is stored in F16 (the source weights are BF16; downcasting
    // to F16 reduces dynamic range below the SwiGLU output magnitude on the last few layers).
    ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override;
};

struct clip_graph_step3vl : clip_graph {
    clip_graph_step3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
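The F16-overflow concern is easy to reproduce in isolation (a toy illustration, unrelated to the actual weights): float16 saturates at 65504, so any intermediate larger than that becomes inf unless it is accumulated in F32.

import numpy as np
print(np.float16(70000.0))   # inf
print(np.float32(70000.0))   # 70000.0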
@@ -325,6 +325,7 @@ struct mtmd_context {
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
        case PROJECTOR_TYPE_QWEN3VL:
        case PROJECTOR_TYPE_MIMOVL:
            {
                // <|vision_start|> ... (image embeddings) ... <|vision_end|>
                img_beg = "<|vision_start|>";