from __future__ import annotations

from typing import Callable, Iterable, TYPE_CHECKING

if TYPE_CHECKING:
    from torch import Tensor

from .base import MmprojModel, ModelBase, gguf


# Tensor-name prefixes kept by InternVisionModel.filter_tensors; anything else is dropped.
_VISION_PREFIXES: tuple[str, ...] = (
    'vision_model',
    'mlp',
    'model.vision_tower',
    'model.multi_modal_projector',
)

# intern-s1 special case: its projector tensors are renamed to the "mlp1.N.*" names
# before the remaining name corrections are applied.
_INTERN_S1_NAMES_MAP: dict[str, str] = {
    "model.multi_modal_projector.layer_norm.bias": "mlp1.0.bias",
    "model.multi_modal_projector.layer_norm.weight": "mlp1.0.weight",
    "model.multi_modal_projector.linear_1.bias": "mlp1.1.bias",
    "model.multi_modal_projector.linear_1.weight": "mlp1.1.weight",
    "model.multi_modal_projector.linear_2.bias": "mlp1.3.bias",
    "model.multi_modal_projector.linear_2.weight": "mlp1.3.weight",
}


@ModelBase.register("InternVisionModel")
class InternVisionModel(MmprojModel):
    """GGUF conversion of ``InternVisionModel`` checkpoints (INTERNVL projector type).

    Writes the vision hyper-parameters to the GGUF writer, filters and renames
    the checkpoint tensors, and splits fused QKV tensors into q/k/v parts.
    """

    # Dynamic tiling limits read from the global config; 0 means "not set".
    min_dynamic_tiles: int = 0
    max_dynamic_tiles: int = 0

    def __init__(self, *args, **kwargs):
        """Initialise the base model and read the dynamic-tile limits.

        All arguments are forwarded unchanged to ``MmprojModel.__init__``.
        """
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None
        # older models may not have min/max_dynamic_patch in config; `or 0` also
        # covers a key that is present but null, which would otherwise break the
        # `> 0` comparisons in set_gguf_parameters.
        self.min_dynamic_tiles = self.global_config.get("min_dynamic_patch", 0) or 0
        self.max_dynamic_tiles = self.global_config.get("max_dynamic_patch", 0) or 0

    def set_gguf_parameters(self):
        """Write the vision hyper-parameters to ``self.gguf_writer``.

        Raises:
            ValueError: if ``hidden_act`` is neither ``"silu"`` nor ``"gelu"``.
            AssertionError: if the vision hparams or ``downsample_ratio`` are missing.
        """
        assert self.hparams_vision is not None
        # image_size / patch_size may be given as a list; keep only the first
        # entry before the base class reads them.
        for key in ('image_size', 'patch_size'):
            value = self.hparams_vision[key]
            if isinstance(value, list):
                self.hparams_vision[key] = value[0]
        super().set_gguf_parameters()

        hparams = self.hparams
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL)
        self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])

        # hidden_act
        hidden_act = hparams["hidden_act"]
        if hidden_act == "silu":
            self.gguf_writer.add_vision_use_silu(True)
        elif hidden_act == "gelu":
            self.gguf_writer.add_vision_use_gelu(True)
        else:
            raise ValueError(f"Unsupported hidden_act: {hidden_act}")

        # downsample_ratio
        downsample_ratio = self.global_config.get("downsample_ratio")
        assert downsample_ratio is not None
        # NOTE(review): int() truncates, so this presumably expects the ratio to be
        # the reciprocal of an integer (e.g. 0.5 -> 2) -- confirm against the configs.
        self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))

        # older models may not have min/max_dynamic_patch in config
        if self.min_dynamic_tiles > 0:
            self.gguf_writer.add_vision_preproc_min_tiles(self.min_dynamic_tiles)
        if self.max_dynamic_tiles > 0:
            self.gguf_writer.add_vision_preproc_max_tiles(self.max_dynamic_tiles)

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        """Force F32 for tensors whose new name contains ``.position_embd.``.

        Every other tensor is delegated to the base implementation.
        """
        if ".position_embd." in new_name:
            return gguf.GGMLQuantizationType.F32
        return super().tensor_force_quant(name, new_name, bid, n_dims)

    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        """Select and rename the tensors handled by this converter.

        Args:
            item: ``(name, gen)`` pair, where ``gen`` is a zero-argument callable
                producing the tensor.

        Returns:
            ``None`` when ``name`` starts with none of the accepted prefixes;
            otherwise the result of the base ``filter_tensors`` for the corrected name.
        """
        name, gen = item

        # str.startswith accepts a tuple, so no intermediate list is needed.
        if not name.startswith(_VISION_PREFIXES):
            return None

        # deal with intern-s1 special case
        name = _INTERN_S1_NAMES_MAP.get(name, name)

        # correct name
        if name.startswith("vision_model"):
            name = "vision_tower." + name
        needs_weight_suffix = ".ls" in name or ".lambda_" in name or "position_embedding" in name
        if needs_weight_suffix and not name.endswith(".weight"):
            name += ".weight"

        return super().filter_tensors((name, gen))

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Yield converted tensors, splitting fused QKV tensors into q/k/v.

        A tensor whose name contains ``.qkv.`` is cut into three equal parts along
        its first dimension (q, k, v in that order); each part is passed to the
        base implementation with ``attn.qkv`` replaced by ``self_attn.{q,k,v}_proj``.
        All other tensors are forwarded unchanged.
        """
        if ".qkv." not in name:
            yield from super().modify_tensors(data_torch, name, bid)
            return

        # split QKV tensors: the fused size is the first dimension for both the
        # 2-D weight and the bias.
        c3 = data_torch.shape[0]
        assert c3 % 3 == 0
        c = c3 // 3
        parts = (
            ("self_attn.q_proj", data_torch[:c]),
            ("self_attn.k_proj", data_torch[c: c * 2]),
            ("self_attn.v_proj", data_torch[c * 2:]),
        )
        for proj_name, chunk in parts:
            yield from super().modify_tensors(chunk, name.replace("attn.qkv", proj_name), bid)