diff --git a/common/arg.cpp b/common/arg.cpp index cf4fe4e46..815bf99cb 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -977,14 +977,13 @@ static void common_params_print_completion(common_params_context & ctx_arg) { "llama-gritlm", "llama-imatrix", "llama-infill", - "llama-llava-cli", + "llama-mtmd-cli", "llama-llava-clip-quantize-cli", "llama-lookahead", "llama-lookup", "llama-lookup-create", "llama-lookup-merge", "llama-lookup-stats", - "llama-minicpmv-cli", "llama-parallel", "llama-passkey", "llama-perplexity", @@ -2727,7 +2726,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.chat_template = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); add_opt(common_arg( {"--chat-template-file"}, "JINJA_TEMPLATE_FILE", string_format( diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 89522dee8..645bdad9b 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -42,11 +42,19 @@ class SentencePieceTokenTypes(IntEnum): BYTE = 6 -AnyModel = TypeVar("AnyModel", bound="type[Model]") +class ModelType(IntEnum): + TEXT = 1 + VISION = 2 -class Model: - _model_classes: dict[str, type[Model]] = {} +AnyModel = TypeVar("AnyModel", bound="type[ModelBase]") + + +class ModelBase: + _model_classes: dict[ModelType, dict[str, type[ModelBase]]] = { + ModelType.TEXT: {}, + ModelType.VISION: {}, + } dir_model: Path ftype: gguf.LlamaFileType @@ -75,7 +83,9 @@ class Model: metadata_override: Path | None = None, model_name: str | None = None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None): - if type(self) is Model: + if type(self) is ModelBase or \ + type(self) is TextModel or \ + type(self) is VisionModel: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") self.dir_model = dir_model @@ -98,11 +108,11 @@ class Model: self.get_tensors = get_remote_tensors else: - self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors") + self.part_names = ModelBase.get_model_part_names(self.dir_model, "model", ".safetensors") self.is_safetensors = len(self.part_names) > 0 if not self.is_safetensors: - self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin") - self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams + self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin") + self.hparams = ModelBase.load_hparams(self.dir_model) if hparams is None else hparams self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_names = None @@ -126,11 +136,10 @@ class Model: split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) @classmethod - def __init_subclass__(cls): - # can't use an abstract property, because overriding it without type errors - # would require using decorated functions instead of simply defining the property - if "model_arch" not in cls.__dict__: - raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") + def add_prefix_to_filename(cls, path: 
Path, prefix: str) -> Path: + stem, suffix = path.stem, path.suffix + new_name = f"{prefix}{stem}{suffix}" + return path.with_name(new_name) def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: key = next((k for k in keys if k in self.hparams), None) @@ -140,9 +149,6 @@ class Model: return None raise KeyError(f"could not find any of: {keys}") - def set_vocab(self): - self._set_vocab_gpt2() - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_names_from_parts: set[str] = set() @@ -230,50 +236,7 @@ class Model: return new_name def set_gguf_parameters(self): - self.gguf_writer.add_block_count(self.block_count) - - if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: - self.gguf_writer.add_context_length(n_ctx) - logger.info(f"gguf: context length = {n_ctx}") - - if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None: - self.gguf_writer.add_embedding_length(n_embd) - logger.info(f"gguf: embedding length = {n_embd}") - - if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: - self.gguf_writer.add_feed_forward_length(n_ff) - logger.info(f"gguf: feed forward length = {n_ff}") - - if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None: - self.gguf_writer.add_head_count(n_head) - logger.info(f"gguf: head count = {n_head}") - - if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: - self.gguf_writer.add_head_count_kv(n_head_kv) - logger.info(f"gguf: key-value head count = {n_head_kv}") - - if (rope_theta := self.hparams.get("rope_theta")) is not None: - self.gguf_writer.add_rope_freq_base(rope_theta) - logger.info(f"gguf: rope theta = {rope_theta}") - if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: - self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) - logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") - if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: - self.gguf_writer.add_layer_norm_eps(f_norm_eps) - logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") - if (n_experts := self.hparams.get("num_local_experts")) is not None: - self.gguf_writer.add_expert_count(n_experts) - logger.info(f"gguf: expert count = {n_experts}") - if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: - self.gguf_writer.add_expert_used_count(n_experts_used) - logger.info(f"gguf: experts used count = {n_experts_used}") - - if (head_dim := self.hparams.get("head_dim")) is not None: - self.gguf_writer.add_key_length(head_dim) - self.gguf_writer.add_value_length(head_dim) - - self.gguf_writer.add_file_type(self.ftype) - logger.info(f"gguf: file type = {self.ftype}") + raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -419,6 +382,92 @@ class Model: if self.metadata.size_label is None and total_params > 0: self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) + self.set_type() + + logger.info("Set meta model") + self.metadata.set_gguf_meta_model(self.gguf_writer) + + logger.info("Set model parameters") + self.set_gguf_parameters() + + logger.info("Set model quantization version") + self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + + def write_vocab(self): + raise 
NotImplementedError("write_vocab() must be implemented in subclasses") + + def write(self): + self.prepare_tensors() + self.prepare_metadata(vocab_only=False) + self.gguf_writer.write_header_to_file(path=self.fname_out) + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_tensors_to_file(progress=True) + self.gguf_writer.close() + + @staticmethod + def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]: + part_names: list[str] = [] + for filename in os.listdir(dir_model): + if filename.startswith(prefix) and filename.endswith(suffix): + part_names.append(filename) + + part_names.sort() + + return part_names + + @staticmethod + def load_hparams(dir_model: Path): + with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + architectures = hparams.get("architectures") + if "text_config" in hparams: + hparams = {**hparams, **hparams["text_config"]} + if architectures is not None: + # preserve "architectures" from root level config + hparams["architectures"] = architectures + return hparams + + @classmethod + def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: + assert names + + def func(modelcls: AnyModel) -> AnyModel: + model_type = ModelType.VISION if modelcls.model_arch == gguf.MODEL_ARCH.CLIP_VISION else ModelType.TEXT + for name in names: + cls._model_classes[model_type][name] = modelcls + return modelcls + return func + + @classmethod + def print_registered_models(cls): + for model_type, model_classes in cls._model_classes.items(): + logger.error(f"{model_type.name} models:") + for name in sorted(model_classes.keys()): + logger.error(f" - {name}") + + @classmethod + def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type[ModelBase]: + try: + return cls._model_classes[model_type][arch] + except KeyError: + raise NotImplementedError(f'Architecture {arch!r} not supported!') from None + + +class TextModel(ModelBase): + @classmethod + def __init_subclass__(cls): + # can't use an abstract property, because overriding it without type errors + # would require using decorated functions instead of simply defining the property + if "model_arch" not in cls.__dict__: + raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") + + def set_vocab(self): + self._set_vocab_gpt2() + + def prepare_metadata(self, vocab_only: bool): + super().prepare_metadata(vocab_only=vocab_only) + + total_params = self.gguf_writer.get_total_parameter_count()[0] # Extract the encoding scheme from the file type name. e.g. 
'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' output_type: str = self.ftype.name.partition("_")[2] @@ -440,27 +489,54 @@ class Model: # Process templated file name with the output ftype, useful with the "auto" ftype self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type) - self.set_type() - - logger.info("Set meta model") - self.metadata.set_gguf_meta_model(self.gguf_writer) - - logger.info("Set model parameters") - self.set_gguf_parameters() - logger.info("Set model tokenizer") self.set_vocab() - logger.info("Set model quantization version") - self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + def set_gguf_parameters(self): + self.gguf_writer.add_block_count(self.block_count) - def write(self): - self.prepare_tensors() - self.prepare_metadata(vocab_only=False) - self.gguf_writer.write_header_to_file(path=self.fname_out) - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file(progress=True) - self.gguf_writer.close() + if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: + self.gguf_writer.add_context_length(n_ctx) + logger.info(f"gguf: context length = {n_ctx}") + + if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None: + self.gguf_writer.add_embedding_length(n_embd) + logger.info(f"gguf: embedding length = {n_embd}") + + if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: + self.gguf_writer.add_feed_forward_length(n_ff) + logger.info(f"gguf: feed forward length = {n_ff}") + + if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None: + self.gguf_writer.add_head_count(n_head) + logger.info(f"gguf: head count = {n_head}") + + if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: + self.gguf_writer.add_head_count_kv(n_head_kv) + logger.info(f"gguf: key-value head count = {n_head_kv}") + + if (rope_theta := self.hparams.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) + logger.info(f"gguf: rope theta = {rope_theta}") + if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: + self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) + logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") + if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: + self.gguf_writer.add_layer_norm_eps(f_norm_eps) + logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") + if (n_experts := self.hparams.get("num_local_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + logger.info(f"gguf: expert count = {n_experts}") + if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + logger.info(f"gguf: experts used count = {n_experts_used}") + + if (head_dim := self.hparams.get("head_dim")) is not None: + self.gguf_writer.add_key_length(head_dim) + self.gguf_writer.add_value_length(head_dim) + + self.gguf_writer.add_file_type(self.ftype) + logger.info(f"gguf: file type = {self.ftype}") def write_vocab(self): if len(self.gguf_writer.tensors) != 1: @@ -471,44 +547,6 @@ class Model: self.gguf_writer.write_kv_data_to_file() self.gguf_writer.close() - @staticmethod - def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]: - part_names: list[str] = [] - for filename in os.listdir(dir_model): - if filename.startswith(prefix) and 
filename.endswith(suffix): - part_names.append(filename) - - part_names.sort() - - return part_names - - @staticmethod - def load_hparams(dir_model: Path): - with open(dir_model / "config.json", "r", encoding="utf-8") as f: - return json.load(f) - - @classmethod - def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: - assert names - - def func(modelcls: AnyModel) -> AnyModel: - for name in names: - cls._model_classes[name] = modelcls - return modelcls - return func - - @classmethod - def print_registered_models(cls): - for name in sorted(cls._model_classes.keys()): - logger.error(f"- {name}") - - @classmethod - def from_model_architecture(cls, arch: str) -> type[Model]: - try: - return cls._model_classes[arch] - except KeyError: - raise NotImplementedError(f'Architecture {arch!r} not supported!') from None - def does_token_look_special(self, token: str | bytes) -> bool: if isinstance(token, (bytes, bytearray)): token_text = token.decode(encoding="utf-8") @@ -1024,8 +1062,59 @@ class Model: self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0]) -@Model.register("GPTNeoXForCausalLM") -class GPTNeoXModel(Model): +class VisionModel(ModelBase): + model_arch = gguf.MODEL_ARCH.CLIP_VISION + n_text_embd = 0 + preprocessor_config: dict[str, Any] + global_config: dict[str, Any] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if self.model_arch != gguf.MODEL_ARCH.CLIP_VISION: + raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION") + + # small hack to correct the number of layers + self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, 128) + self.n_embd_text = self.find_hparam(["hidden_size", "n_embd"]) + assert self.n_embd_text > 0, "n_embd not found in hparams" + + if "vision_config" not in self.hparams: + raise ValueError("vision_config not found in hparams") + # move vision config to the top level, while preserving the original hparams in global_config + self.global_config = self.hparams + self.hparams = self.hparams["vision_config"] + + # load preprocessor config + with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f: + self.preprocessor_config = json.load(f) + + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION) + + def set_gguf_parameters(self): + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_vision_projection_dim(self.n_embd_text) + self.gguf_writer.add_vision_has_vision_encoder(True) + + # vision config + self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"])) + self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"])) + self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"])) + self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"])) + self.gguf_writer.add_vision_block_count(self.find_hparam(["num_hidden_layers"])) + self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"])) + + # preprocessor config + self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"]) + self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"]) + + def write_vocab(self): + raise ValueError("VisionModel does not support vocab writing") + + +@ModelBase.register("GPTNeoXForCausalLM") +class GPTNeoXModel(TextModel): model_arch = gguf.MODEL_ARCH.GPTNEOX def set_gguf_parameters(self): @@ -1081,8 +1170,8 @@ class GPTNeoXModel(Model): return tensors -@Model.register("BloomForCausalLM", 
"BloomModel") -class BloomModel(Model): +@ModelBase.register("BloomForCausalLM", "BloomModel") +class BloomModel(TextModel): model_arch = gguf.MODEL_ARCH.BLOOM def set_gguf_parameters(self): @@ -1138,8 +1227,8 @@ class BloomModel(Model): return tensors -@Model.register("MPTForCausalLM") -class MPTModel(Model): +@ModelBase.register("MPTForCausalLM") +class MPTModel(TextModel): model_arch = gguf.MODEL_ARCH.MPT def set_vocab(self): @@ -1182,8 +1271,8 @@ class MPTModel(Model): return [(new_name, data_torch)] -@Model.register("OrionForCausalLM") -class OrionModel(Model): +@ModelBase.register("OrionForCausalLM") +class OrionModel(TextModel): model_arch = gguf.MODEL_ARCH.ORION def set_vocab(self): @@ -1217,8 +1306,8 @@ class OrionModel(Model): self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) -@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM") -class BaichuanModel(Model): +@ModelBase.register("BaichuanForCausalLM", "BaiChuanForCausalLM") +class BaichuanModel(TextModel): model_arch = gguf.MODEL_ARCH.BAICHUAN def set_vocab(self): @@ -1297,8 +1386,8 @@ class BaichuanModel(Model): return weights[r * n_part:r * n_part + r, ...] -@Model.register("XverseForCausalLM") -class XverseModel(Model): +@ModelBase.register("XverseForCausalLM") +class XverseModel(TextModel): model_arch = gguf.MODEL_ARCH.XVERSE def set_vocab(self): @@ -1404,8 +1493,8 @@ class XverseModel(Model): ) -@Model.register("FalconForCausalLM", "RWForCausalLM") -class FalconModel(Model): +@ModelBase.register("FalconForCausalLM", "RWForCausalLM") +class FalconModel(TextModel): model_arch = gguf.MODEL_ARCH.FALCON def set_gguf_parameters(self): @@ -1458,8 +1547,8 @@ class FalconModel(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("GPTBigCodeForCausalLM") -class StarCoderModel(Model): +@ModelBase.register("GPTBigCodeForCausalLM") +class StarCoderModel(TextModel): model_arch = gguf.MODEL_ARCH.STARCODER def set_gguf_parameters(self): @@ -1475,8 +1564,8 @@ class StarCoderModel(Model): self.gguf_writer.add_file_type(self.ftype) -@Model.register("GPTRefactForCausalLM") -class RefactModel(Model): +@ModelBase.register("GPTRefactForCausalLM") +class RefactModel(TextModel): model_arch = gguf.MODEL_ARCH.REFACT def set_vocab(self): @@ -1539,8 +1628,8 @@ class RefactModel(Model): return tensors -@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") -class StableLMModel(Model): +@ModelBase.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") +class StableLMModel(TextModel): model_arch = gguf.MODEL_ARCH.STABLELM def set_vocab(self): @@ -1629,11 +1718,23 @@ class StableLMModel(Model): raise ValueError(f"Unprocessed norms: {norms}") -@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") -class LlamaModel(Model): +@ModelBase.register( + "LLaMAForCausalLM", + "LlamaForCausalLM", + "MistralForCausalLM", + "MixtralForCausalLM", + "Idefics3ForConditionalGeneration", + "SmolVLMForConditionalGeneration") +class LlamaModel(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA undo_permute = True + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # fix for SmolVLM2, missing `num_attention_heads` in config.json + if self.hparams["architectures"][0] == "SmolVLMForConditionalGeneration": + self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) + def set_vocab(self): try: self._set_vocab_sentencepiece() @@ -1696,6 +1797,12 @@ class 
LlamaModel(Model): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") + is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name + + if is_vision_tensor: + return [] # skip vision tensors + elif name.startswith("model.text_model"): + name = name.replace("text_model.", "") # for SmolVLM if self.undo_permute: if name.endswith(("q_proj.weight", "q_proj.bias")): @@ -1778,23 +1885,48 @@ class LlamaModel(Model): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("Llama4ForConditionalGeneration") +@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration") +class SmolVLMModel(VisionModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # fix for SmolVLM2, missing some keys in config.json + # default values are taken from transformers code + if self.hparams["model_type"] == "smolvlm_vision": + self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152) + self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16) + self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072) + self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 12) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.IDEFICS3) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) + self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2)) + self.gguf_writer.add_vision_use_gelu(True) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + del bid, new_name, n_dims # unused + if ".embeddings." 
in name: + return gguf.GGMLQuantizationType.F32 + return False + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name + + if is_vision_tensor: + return [(self.map_tensor_name(name), data_torch)] + + return [] # skip other tensors + + +@ModelBase.register("Llama4ForConditionalGeneration") class Llama4Model(LlamaModel): model_arch = gguf.MODEL_ARCH.LLAMA4 - has_vision: bool = False undo_permute = False - # TODO @ngxson : avoid duplicate this code everywhere by at least support "text_config" - # same with llama, but we need to merge the text_config into the root level of hparams def __init__(self, *args, **kwargs): - hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0]) - if "text_config" in hparams: - hparams = {**hparams, **hparams["text_config"]} - kwargs["hparams"] = hparams super().__init__(*args, **kwargs) - if "vision_config" in hparams: - logger.info("Has vision encoder, but it will be ignored") - self.has_vision = True # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"] self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"] @@ -1829,18 +1961,10 @@ class Llama4Model(LlamaModel): return super().modify_tensors(data_torch, name, bid) -@Model.register("Mistral3ForConditionalGeneration") +@ModelBase.register("Mistral3ForConditionalGeneration") class Mistral3Model(LlamaModel): model_arch = gguf.MODEL_ARCH.LLAMA - # we need to merge the text_config into the root level of hparams - def __init__(self, *args, **kwargs): - hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0]) - if "text_config" in hparams: - hparams = {**hparams, **hparams["text_config"]} - kwargs["hparams"] = hparams - super().__init__(*args, **kwargs) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): name = name.replace("language_model.", "") if "multi_modal_projector" in name or "vision_tower" in name: @@ -1848,8 +1972,8 @@ class Mistral3Model(LlamaModel): return super().modify_tensors(data_torch, name, bid) -@Model.register("DeciLMForCausalLM") -class DeciModel(Model): +@ModelBase.register("DeciLMForCausalLM") +class DeciModel(TextModel): model_arch = gguf.MODEL_ARCH.DECI @staticmethod @@ -2020,8 +2144,8 @@ class DeciModel(Model): super().prepare_tensors() -@Model.register("BitnetForCausalLM") -class BitnetModel(Model): +@ModelBase.register("BitnetForCausalLM") +class BitnetModel(TextModel): model_arch = gguf.MODEL_ARCH.BITNET def set_vocab(self): @@ -2061,8 +2185,8 @@ class BitnetModel(Model): yield (new_name, data_torch) -@Model.register("GrokForCausalLM") -class GrokModel(Model): +@ModelBase.register("GrokForCausalLM") +class GrokModel(TextModel): model_arch = gguf.MODEL_ARCH.GROK def set_vocab(self): @@ -2114,8 +2238,8 @@ class GrokModel(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("DbrxForCausalLM") -class DbrxModel(Model): +@ModelBase.register("DbrxForCausalLM") +class DbrxModel(TextModel): model_arch = gguf.MODEL_ARCH.DBRX def set_gguf_parameters(self): @@ -2183,8 +2307,8 @@ class DbrxModel(Model): return n_dims > 1 -@Model.register("MiniCPMForCausalLM") -class MiniCPMModel(Model): +@ModelBase.register("MiniCPMForCausalLM") +class MiniCPMModel(TextModel): model_arch = gguf.MODEL_ARCH.MINICPM 
def set_gguf_parameters(self): @@ -2238,8 +2362,8 @@ class MiniCPMModel(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("MiniCPM3ForCausalLM") -class MiniCPM3Model(Model): +@ModelBase.register("MiniCPM3ForCausalLM") +class MiniCPM3Model(TextModel): model_arch = gguf.MODEL_ARCH.MINICPM3 def set_gguf_parameters(self): @@ -2291,8 +2415,8 @@ class MiniCPM3Model(Model): ) -@Model.register("QWenLMHeadModel") -class QwenModel(Model): +@ModelBase.register("QWenLMHeadModel") +class QwenModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN @staticmethod @@ -2333,8 +2457,8 @@ class QwenModel(Model): self.gguf_writer.add_file_type(self.ftype) -@Model.register("Qwen2ForCausalLM") -class Qwen2Model(Model): +@ModelBase.register("Qwen2ForCausalLM") +class Qwen2Model(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2 def set_vocab(self): @@ -2352,8 +2476,8 @@ class Qwen2Model(Model): self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) -@Model.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") -class Qwen2VLModel(Model): +@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") +class Qwen2VLModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2VL def set_gguf_parameters(self): @@ -2375,8 +2499,8 @@ class Qwen2VLModel(Model): yield name, data -@Model.register("WavTokenizerDec") -class WavTokenizerDecModel(Model): +@ModelBase.register("WavTokenizerDec") +class WavTokenizerDecModel(TextModel): model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: @@ -2413,8 +2537,8 @@ class WavTokenizerDecModel(Model): self.gguf_writer.add_causal_attention(False) -@Model.register("Qwen2MoeForCausalLM") -class Qwen2MoeModel(Model): +@ModelBase.register("Qwen2MoeForCausalLM") +class Qwen2MoeModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2MOE def set_gguf_parameters(self): @@ -2476,18 +2600,18 @@ class Qwen2MoeModel(Model): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("Qwen3ForCausalLM") +@ModelBase.register("Qwen3ForCausalLM") class Qwen3Model(Qwen2Model): model_arch = gguf.MODEL_ARCH.QWEN3 -@Model.register("Qwen3MoeForCausalLM") +@ModelBase.register("Qwen3MoeForCausalLM") class Qwen3MoeModel(Qwen2MoeModel): model_arch = gguf.MODEL_ARCH.QWEN3MOE -@Model.register("GPT2LMHeadModel") -class GPT2Model(Model): +@ModelBase.register("GPT2LMHeadModel") +class GPT2Model(TextModel): model_arch = gguf.MODEL_ARCH.GPT2 def set_gguf_parameters(self): @@ -2518,8 +2642,8 @@ class GPT2Model(Model): return tensors -@Model.register("PhiForCausalLM") -class Phi2Model(Model): +@ModelBase.register("PhiForCausalLM") +class Phi2Model(TextModel): model_arch = gguf.MODEL_ARCH.PHI2 def set_gguf_parameters(self): @@ -2542,8 +2666,8 @@ class Phi2Model(Model): self.gguf_writer.add_add_bos_token(False) -@Model.register("Phi3ForCausalLM") -class Phi3MiniModel(Model): +@ModelBase.register("Phi3ForCausalLM") +class Phi3MiniModel(TextModel): model_arch = gguf.MODEL_ARCH.PHI3 def set_vocab(self): @@ -2720,7 +2844,7 @@ class Phi3MiniModel(Model): yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) -@Model.register("PhiMoEForCausalLM") +@ModelBase.register("PhiMoEForCausalLM") class PhiMoeModel(Phi3MiniModel): model_arch = gguf.MODEL_ARCH.PHIMOE @@ -2777,8 +2901,8 @@ class PhiMoeModel(Phi3MiniModel): raise ValueError(f"Unprocessed 
experts: {experts}") -@Model.register("PlamoForCausalLM") -class PlamoModel(Model): +@ModelBase.register("PlamoForCausalLM") +class PlamoModel(TextModel): model_arch = gguf.MODEL_ARCH.PLAMO def set_vocab(self): @@ -2825,8 +2949,8 @@ class PlamoModel(Model): return [(new_name, data_torch)] -@Model.register("CodeShellForCausalLM") -class CodeShellModel(Model): +@ModelBase.register("CodeShellForCausalLM") +class CodeShellModel(TextModel): model_arch = gguf.MODEL_ARCH.CODESHELL def set_gguf_parameters(self): @@ -2866,8 +2990,8 @@ class CodeShellModel(Model): return [(new_name, data_torch)] -@Model.register("InternLM2ForCausalLM") -class InternLM2Model(Model): +@ModelBase.register("InternLM2ForCausalLM") +class InternLM2Model(TextModel): model_arch = gguf.MODEL_ARCH.INTERNLM2 def set_vocab(self): @@ -3039,8 +3163,8 @@ class InternLM2Model(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("InternLM3ForCausalLM") -class InternLM3Model(Model): +@ModelBase.register("InternLM3ForCausalLM") +class InternLM3Model(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA def set_vocab(self): @@ -3099,8 +3223,8 @@ class InternLM3Model(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("BertModel", "BertForMaskedLM", "CamembertModel") -class BertModel(Model): +@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel") +class BertModel(TextModel): model_arch = gguf.MODEL_ARCH.BERT def __init__(self, *args, **kwargs): @@ -3187,7 +3311,7 @@ class BertModel(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("RobertaModel") +@ModelBase.register("RobertaModel") class RobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT @@ -3232,7 +3356,7 @@ class RobertaModel(BertModel): return super().modify_tensors(data_torch, name, bid) -@Model.register("NomicBertModel") +@ModelBase.register("NomicBertModel") class NomicBertModel(BertModel): model_arch = gguf.MODEL_ARCH.NOMIC_BERT @@ -3262,7 +3386,7 @@ class NomicBertModel(BertModel): self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) -@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") +@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") class XLMRobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT @@ -3373,8 +3497,8 @@ class XLMRobertaModel(BertModel): return super().modify_tensors(data_torch, name, bid) -@Model.register("GemmaForCausalLM") -class GemmaModel(Model): +@ModelBase.register("GemmaForCausalLM") +class GemmaModel(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA def set_vocab(self): @@ -3424,8 +3548,8 @@ class GemmaModel(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("Gemma2ForCausalLM") -class Gemma2Model(Model): +@ModelBase.register("Gemma2ForCausalLM") +class Gemma2Model(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA2 def set_vocab(self): @@ -3471,27 +3595,9 @@ class Gemma2Model(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") -class Gemma3Model(Model): +@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") +class Gemma3Model(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA3 - has_vision: bool = False - - # we need to merge the text_config into the root level of hparams - def __init__(self, *args, **kwargs): - hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0]) - if "text_config" in hparams: - hparams = {**hparams, **hparams["text_config"]} - 
kwargs["hparams"] = hparams - super().__init__(*args, **kwargs) - if "vision_config" in hparams: - logger.info("Has vision encoder, but it will be ignored") - self.has_vision = True - - def write(self): - super().write() - if self.has_vision: - logger.info("NOTE: this script only convert the language model to GGUF") - logger.info(" for the vision model, please use gemma3_convert_encoder_to_gguf.py") def set_vocab(self): self._set_vocab_sentencepiece() @@ -3529,10 +3635,10 @@ class Gemma3Model(Model): if name.startswith("language_model."): name = name.replace("language_model.", "") + elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ - or name.startswith("multimodal_projector.") or name.startswith("vision_model."): # this is for old HF model, should be removed later - # ignore vision tensors - return [] + or name.startswith("multimodal_projector.") or name.startswith("vision_model."): + return [] # skip vision tensors # remove OOV (out-of-vocabulary) rows in token_embd if "embed_tokens.weight" in name: @@ -3548,13 +3654,52 @@ class Gemma3Model(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("Starcoder2ForCausalLM") -class StarCoder2Model(Model): +@ModelBase.register("Gemma3ForConditionalGeneration") +class Gemma3VisionModel(VisionModel): + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.GEMMA3) + # default values below are taken from HF tranformers code + self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_vision_use_gelu(True) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + del bid, new_name, n_dims # unused + # related to https://github.com/ggml-org/llama.cpp/issues/13025 + if "input_projection" in name: + return gguf.GGMLQuantizationType.F16 + if ".embeddings." 
in name: + return gguf.GGMLQuantizationType.F32 + return False + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ + or name.startswith("multimodal_projector.") or name.startswith("vision_model."): + # process vision tensors + name = name.replace("_weight", ".weight") + + # correct norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector + # the other norm values are part of SigLIP model, and they are already correct + # ref code: Gemma3RMSNorm + if "soft_emb_norm.weight" in name: + logger.info(f"Correcting norm value for '{name}'") + data_torch = data_torch + 1 + + return [(self.map_tensor_name(name), data_torch)] + + return [] # skip other tensors + + +@ModelBase.register("Starcoder2ForCausalLM") +class StarCoder2Model(TextModel): model_arch = gguf.MODEL_ARCH.STARCODER2 -@Model.register("Rwkv6ForCausalLM") -class Rwkv6Model(Model): +@ModelBase.register("Rwkv6ForCausalLM") +class Rwkv6Model(TextModel): model_arch = gguf.MODEL_ARCH.RWKV6 def set_vocab(self): @@ -3626,7 +3771,7 @@ class Rwkv6Model(Model): yield (new_name, data_torch) -@Model.register("RWKV6Qwen2ForCausalLM") +@ModelBase.register("RWKV6Qwen2ForCausalLM") class RWKV6Qwen2Model(Rwkv6Model): model_arch = gguf.MODEL_ARCH.RWKV6QWEN2 @@ -3680,8 +3825,8 @@ class RWKV6Qwen2Model(Rwkv6Model): yield (new_name, data) -@Model.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM") -class Rwkv7Model(Model): +@ModelBase.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM") +class Rwkv7Model(TextModel): model_arch = gguf.MODEL_ARCH.RWKV7 def set_vocab(self): @@ -3799,7 +3944,7 @@ class Rwkv7Model(Model): yield (new_name, data_torch) -@Model.register("RwkvHybridForCausalLM") +@ModelBase.register("RwkvHybridForCausalLM") class ARwkv7Model(Rwkv7Model): model_arch = gguf.MODEL_ARCH.ARWKV7 @@ -3842,8 +3987,8 @@ class ARwkv7Model(Rwkv7Model): self.gguf_writer.add_head_count(0) -@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") -class MambaModel(Model): +@ModelBase.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") +class MambaModel(TextModel): model_arch = gguf.MODEL_ARCH.MAMBA def set_vocab(self): @@ -3920,8 +4065,8 @@ class MambaModel(Model): return [(new_name, data_torch)] -@Model.register("CohereForCausalLM") -class CommandR2Model(Model): +@ModelBase.register("CohereForCausalLM") +class CommandR2Model(TextModel): model_arch = gguf.MODEL_ARCH.COMMAND_R def __init__(self, *args, **kwargs): @@ -3938,8 +4083,8 @@ class CommandR2Model(Model): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) -@Model.register("Cohere2ForCausalLM") -class Cohere2Model(Model): +@ModelBase.register("Cohere2ForCausalLM") +class Cohere2Model(TextModel): model_arch = gguf.MODEL_ARCH.COHERE2 def set_gguf_parameters(self): @@ -3956,9 +4101,9 @@ class Cohere2Model(Model): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) -@Model.register("OlmoForCausalLM") -@Model.register("OLMoForCausalLM") -class OlmoModel(Model): +@ModelBase.register("OlmoForCausalLM") +@ModelBase.register("OLMoForCausalLM") +class OlmoModel(TextModel): model_arch = gguf.MODEL_ARCH.OLMO def set_gguf_parameters(self): @@ -3984,13 +4129,13 @@ class OlmoModel(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("Olmo2ForCausalLM") -class Olmo2Model(Model): +@ModelBase.register("Olmo2ForCausalLM") +class 
Olmo2Model(TextModel): model_arch = gguf.MODEL_ARCH.OLMO2 -@Model.register("OlmoeForCausalLM") -class OlmoeModel(Model): +@ModelBase.register("OlmoeForCausalLM") +class OlmoeModel(TextModel): model_arch = gguf.MODEL_ARCH.OLMOE def set_gguf_parameters(self): @@ -4049,7 +4194,7 @@ class OlmoeModel(Model): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("JinaBertModel", "JinaBertForMaskedLM") +@ModelBase.register("JinaBertModel", "JinaBertForMaskedLM") class JinaBertV2Model(BertModel): model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 @@ -4096,8 +4241,8 @@ class JinaBertV2Model(BertModel): return super().modify_tensors(data_torch, name, bid) -@Model.register("OpenELMForCausalLM") -class OpenELMModel(Model): +@ModelBase.register("OpenELMForCausalLM") +class OpenELMModel(TextModel): model_arch = gguf.MODEL_ARCH.OPENELM @staticmethod @@ -4171,8 +4316,8 @@ class OpenELMModel(Model): yield (self.map_tensor_name(name), data_torch) -@Model.register("ArcticForCausalLM") -class ArcticModel(Model): +@ModelBase.register("ArcticForCausalLM") +class ArcticModel(TextModel): model_arch = gguf.MODEL_ARCH.ARCTIC def set_vocab(self): @@ -4322,8 +4467,8 @@ class ArcticModel(Model): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("DeepseekForCausalLM") -class DeepseekModel(Model): +@ModelBase.register("DeepseekForCausalLM") +class DeepseekModel(TextModel): model_arch = gguf.MODEL_ARCH.DEEPSEEK def set_vocab(self): @@ -4413,9 +4558,9 @@ class DeepseekModel(Model): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("DeepseekV2ForCausalLM") -@Model.register("DeepseekV3ForCausalLM") -class DeepseekV2Model(Model): +@ModelBase.register("DeepseekV2ForCausalLM") +@ModelBase.register("DeepseekV3ForCausalLM") +class DeepseekV2Model(TextModel): model_arch = gguf.MODEL_ARCH.DEEPSEEK2 def set_vocab(self): @@ -4541,8 +4686,8 @@ class DeepseekV2Model(Model): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("PLMForCausalLM") -class PLMModel(Model): +@ModelBase.register("PLMForCausalLM") +class PLMModel(TextModel): model_arch = gguf.MODEL_ARCH.PLM def set_vocab(self): @@ -4564,11 +4709,11 @@ class PLMModel(Model): super().prepare_tensors() -@Model.register("T5WithLMHeadModel") -@Model.register("T5ForConditionalGeneration") -@Model.register("MT5ForConditionalGeneration") -@Model.register("UMT5ForConditionalGeneration") -class T5Model(Model): +@ModelBase.register("T5WithLMHeadModel") +@ModelBase.register("T5ForConditionalGeneration") +@ModelBase.register("MT5ForConditionalGeneration") +@ModelBase.register("UMT5ForConditionalGeneration") +class T5Model(TextModel): model_arch = gguf.MODEL_ARCH.T5 def __init__(self, *args, **kwargs): @@ -4707,8 +4852,8 @@ class T5Model(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("T5EncoderModel") -class T5EncoderModel(Model): +@ModelBase.register("T5EncoderModel") +class T5EncoderModel(TextModel): model_arch = gguf.MODEL_ARCH.T5ENCODER def __init__(self, *args, **kwargs): @@ -4846,8 +4991,8 @@ class T5EncoderModel(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("JAISLMHeadModel") -class JaisModel(Model): +@ModelBase.register("JAISLMHeadModel") +class JaisModel(TextModel): model_arch = gguf.MODEL_ARCH.JAIS def __init__(self, *args, **kwargs): @@ -4929,8 +5074,8 @@ class JaisModel(Model): self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) -@Model.register("Glm4ForCausalLM") -class Glm4Model(Model): +@ModelBase.register("Glm4ForCausalLM") +class Glm4Model(TextModel): 
model_arch = gguf.MODEL_ARCH.GLM4 def set_vocab(self): @@ -4945,8 +5090,8 @@ class Glm4Model(Model): self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) -@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") -class ChatGLMModel(Model): +@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") +class ChatGLMModel(TextModel): model_arch = gguf.MODEL_ARCH.CHATGLM def set_vocab_chatglm3(self): @@ -5100,8 +5245,8 @@ class ChatGLMModel(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("NemotronForCausalLM") -class NemotronModel(Model): +@ModelBase.register("NemotronForCausalLM") +class NemotronModel(TextModel): model_arch = gguf.MODEL_ARCH.NEMOTRON def set_vocab(self): @@ -5141,8 +5286,8 @@ class NemotronModel(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("ExaoneForCausalLM") -class ExaoneModel(Model): +@ModelBase.register("ExaoneForCausalLM") +class ExaoneModel(TextModel): model_arch = gguf.MODEL_ARCH.EXAONE def set_gguf_parameters(self): @@ -5210,7 +5355,7 @@ class ExaoneModel(Model): yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) -@Model.register("GraniteForCausalLM") +@ModelBase.register("GraniteForCausalLM") class GraniteModel(LlamaModel): """Conversion for IBM's GraniteForCausalLM""" model_arch = gguf.MODEL_ARCH.GRANITE @@ -5244,7 +5389,7 @@ class GraniteModel(LlamaModel): logger.info("gguf: (granite) logits_scale = %s", logits_scale) -@Model.register("GraniteMoeForCausalLM") +@ModelBase.register("GraniteMoeForCausalLM") class GraniteMoeModel(GraniteModel): """Conversion for IBM's GraniteMoeForCausalLM""" model_arch = gguf.MODEL_ARCH.GRANITE_MOE @@ -5268,8 +5413,8 @@ class GraniteMoeModel(GraniteModel): return super().modify_tensors(data_torch, name, bid) -@Model.register("BailingMoeForCausalLM") -class BailingMoeModel(Model): +@ModelBase.register("BailingMoeForCausalLM") +class BailingMoeModel(TextModel): model_arch = gguf.MODEL_ARCH.BAILINGMOE def set_vocab(self): @@ -5367,9 +5512,9 @@ class BailingMoeModel(Model): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("ChameleonForConditionalGeneration") -@Model.register("ChameleonForCausalLM") # obsolete -class ChameleonModel(Model): +@ModelBase.register("ChameleonForConditionalGeneration") +@ModelBase.register("ChameleonForCausalLM") # obsolete +class ChameleonModel(TextModel): model_arch = gguf.MODEL_ARCH.CHAMELEON def set_gguf_parameters(self): @@ -5554,6 +5699,10 @@ def parse_args() -> argparse.Namespace: "--remote", action="store_true", help="(Experimental) Read safetensors file remotely without downloading to disk. Config and tokenizer files will still be downloaded. To use this feature, you need to specify Hugging Face model repo name instead of a local directory. For example: 'HuggingFaceTB/SmolLM2-1.7B-Instruct'. Note: To access gated repo, set HF_TOKEN environment variable to your Hugging Face token.", ) + parser.add_argument( + "--mmproj", action="store_true", + help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. 
A prefix 'mmproj-' will be added to the output file name.", + ) args = parser.parse_args() if not args.print_supported_models and args.model is None: @@ -5584,7 +5733,7 @@ def main() -> None: if args.print_supported_models: logger.error("Supported models:") - Model.print_registered_models() + ModelBase.print_registered_models() sys.exit(0) if args.verbose: @@ -5631,13 +5780,18 @@ def main() -> None: logger.info(f"Loading model: {dir_model.name}") - hparams = Model.load_hparams(dir_model) + hparams = ModelBase.load_hparams(dir_model) + + if args.mmproj: + if "mmproj" not in fname_out.name: + fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-") with torch.inference_mode(): output_type = ftype_map[args.outtype] model_architecture = hparams["architectures"][0] + model_type = ModelType.VISION if args.mmproj else ModelType.TEXT try: - model_class = Model.from_model_architecture(model_architecture) + model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type) except NotImplementedError: logger.error(f"Model {model_architecture} is not supported") sys.exit(1) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index bdc991533..00a6733cb 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -24,7 +24,7 @@ if 'NO_LOCAL_GGUF' not in os.environ: import gguf # reuse model definitions from convert_hf_to_gguf.py -from convert_hf_to_gguf import LazyTorchTensor, Model +from convert_hf_to_gguf import LazyTorchTensor, ModelBase logger = logging.getLogger("lora-to-gguf") @@ -340,11 +340,11 @@ if __name__ == '__main__': sys.exit(1) else: logger.info(f"Loading base model: {dir_base_model.name}") - hparams = Model.load_hparams(dir_base_model) + hparams = ModelBase.load_hparams(dir_base_model) with torch.inference_mode(): try: - model_class = Model.from_model_architecture(hparams["architectures"][0]) + model_class = ModelBase.from_model_architecture(hparams["architectures"][0]) except NotImplementedError: logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) diff --git a/examples/llava/README-gemma3.md b/docs/multimodal/gemma3.md similarity index 82% rename from examples/llava/README-gemma3.md rename to docs/multimodal/gemma3.md index 3c25ee258..8fa077de7 100644 --- a/examples/llava/README-gemma3.md +++ b/docs/multimodal/gemma3.md @@ -26,11 +26,12 @@ llama-gemma3-cli -hf ggml-org/gemma-3-27b-it-GGUF ## How to get mmproj.gguf? +Simply add `--mmproj` when converting the model via `convert_hf_to_gguf.py`: + ```bash cd gemma-3-4b-it -python ../llama.cpp/examples/llava/gemma3_convert_encoder_to_gguf.py . - -# output file is mmproj.gguf +python ../llama.cpp/convert_hf_to_gguf.py --outfile model.gguf --outtype f16 --mmproj . +# output file: mmproj-model.gguf ``` ## How to run it? diff --git a/examples/llava/README-glmedge.md b/docs/multimodal/glmedge.md similarity index 80% rename from examples/llava/README-glmedge.md rename to docs/multimodal/glmedge.md index 603d01474..af6b696a8 100644 --- a/examples/llava/README-glmedge.md +++ b/docs/multimodal/glmedge.md @@ -3,12 +3,12 @@ Currently this implementation supports [glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b) and [glm-edge-v-5b](https://huggingface.co/THUDM/glm-edge-v-5b). ## Usage -Build with cmake or run `make llama-llava-cli` to build it. +Build the `llama-mtmd-cli` binary. -After building, run: `./llama-llava-cli` to see the usage. For example: +After building, run: `./llama-mtmd-cli` to see the usage. 
For example: ```sh -./llama-llava-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf --image img_path/image.jpg -p "<|system|>\n system prompt <|user|>\n prompt <|assistant|>\n" +./llama-mtmd-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf ``` **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. diff --git a/examples/llava/README-granitevision.md b/docs/multimodal/granitevision.md similarity index 92% rename from examples/llava/README-granitevision.md rename to docs/multimodal/granitevision.md index f08a21cc1..3118fe0cd 100644 --- a/examples/llava/README-granitevision.md +++ b/docs/multimodal/granitevision.md @@ -176,15 +176,11 @@ Note that currently you cannot quantize the visual encoder because granite visio ### 5. Running the Model in Llama cpp -Build llama cpp normally; you should have a target binary named `llama-llava-cli`, which you can pass two binaries to. As an example, we pass the the llama.cpp banner. +Build llama cpp normally; you should have a target binary named `llama-mtmd-cli`, which you can pass two binaries to. As an example, we pass the the llama.cpp banner. ```bash -$ ./build/bin/llama-llava-cli -m $LLM_GGUF_PATH \ +$ ./build/bin/llama-mtmd-cli -m $LLM_GGUF_PATH \ --mmproj $VISUAL_GGUF_PATH \ - --image ./media/llama0-banner.png \ -c 16384 \ - -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n<|user|>\n\\nWhat does the text in this image say?\n<|assistant|>\n" \ --temp 0 ``` - -Sample output: `The text in the image reads "LLAMA C++ Can it run DOOM Llama?"` diff --git a/docs/multimodal/llava.md b/docs/multimodal/llava.md new file mode 100644 index 000000000..c5bdc8215 --- /dev/null +++ b/docs/multimodal/llava.md @@ -0,0 +1,143 @@ +# LLaVA + +Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants, +as well as llava-1.6 [llava-v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) variants. + +The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) +and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) +models are available. +For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](https://huggingface.co/cmp-nct/llava-1.6-gguf) + +After API is confirmed, more models will be supported / uploaded. + +## Usage +Build the `llama-mtmd-cli` binary. + +After building, run: `./llama-mtmd-cli` to see the usage. For example: + +```sh +./llama-mtmd-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf \ + --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf \ + --chat-template vicuna +``` + +**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. +**note**: For GPU offloading ensure to use the `-ngl` flag just like usual + +## LLaVA 1.5 + +1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example: + +```sh +git clone https://huggingface.co/liuhaotian/llava-v1.5-7b + +git clone https://huggingface.co/openai/clip-vit-large-patch14-336 +``` + +2. Install the required Python packages: + +```sh +pip install -r examples/llava/requirements.txt +``` + +3. 
Use `llava_surgery.py` to split the LLaVA model into its LLaMA and multimodal projector constituents: + +```sh +python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b +``` + +4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF: + +```sh +python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b +``` + +5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF: + +```sh +python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown +``` + +Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory. + +## LLaVA 1.6 gguf conversion +1) First clone a LLaVA 1.6 model: +```console +git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b +``` + +2) Install the required Python packages: + +```sh +pip install -r examples/llava/requirements.txt +``` + +3) Use `llava_surgery_v2.py`, which also supports llava-1.5 variants, pytorch as well as safetensor models: +```console +python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/ +``` + +- you will find a llava.projector and a llava.clip file in your model directory + +4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory: +```console +mkdir vit +cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin +cp ../llava-v1.6-vicuna-7b/llava.projector vit/ +curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json +``` + +5) Create the visual gguf model: +```console +python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision +``` +- This is similar to llava-1.5; the difference is that we tell the encoder that we are working with the pure vision model part of CLIP + +6) Then convert the model to gguf format: +```console +python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown +``` + +7) And finally we can run the llava cli using the 1.6 model version: +```console +./llama-mtmd-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf +``` + +**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096) + +**note** llava-1.6 greatly benefits from batched prompt processing (defaults work) + +**note** if the language model in step `6)` is incompatible with the legacy conversion script, the easiest way to handle the LLM conversion is to load the model in transformers, and export only the LLM from the llava next model. + +```python +import os +import transformers + +model_path = ... +llm_export_path = ... + +tokenizer = transformers.AutoTokenizer.from_pretrained(model_path) +model = transformers.AutoModelForImageTextToText.from_pretrained(model_path) + +tokenizer.save_pretrained(llm_export_path) +model.language_model.save_pretrained(llm_export_path) +``` + +Then, you can convert the LLM using the `convert_hf_to_gguf.py` script, which handles more LLM architectures. + +## Chat template + +For llava-1.5 and llava-1.6, you need to use the `vicuna` chat template. Simply add `--chat-template vicuna` to activate this template. 
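+For example, putting the recommendations above together into one command (the file names are the llava-1.6 outputs from the conversion steps above; adjust the paths to your own model): + +```sh +# vicuna chat template, low temperature and a larger context, as recommended for llava-1.6 above +./llama-mtmd-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --chat-template vicuna --temp 0.1 -c 4096 +```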
+ + +## How to know if you are running in llava-1.5 or llava-1.6 mode + +When running llava-cli you will see visual information right before the prompt is processed: + +**Llava-1.5:** +`encode_image_with_clip: image embedding created: 576 tokens` + +**Llava-1.6 (anything above 576):** +`encode_image_with_clip: image embedding created: 2880 tokens` + + +Alternatively, just pay attention to how many "tokens" were used for your prompt; llava-1.6 will show 1000+ tokens. diff --git a/examples/llava/README-minicpmo2.6.md b/docs/multimodal/minicpmo2.6.md similarity index 73% rename from examples/llava/README-minicpmo2.6.md rename to docs/multimodal/minicpmo2.6.md index 48c423238..de470d8a8 100644 --- a/examples/llava/README-minicpmo2.6.md +++ b/docs/multimodal/minicpmo2.6.md @@ -40,9 +40,9 @@ python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model Inference on Linux or Mac ```bash -# run f16 version -./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +# run in single-turn mode +./build/bin/llama-mtmd-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" -# run quantized int4 version -./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" 
+# run in conversation mode +./build/bin/llama-mtmd-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf ``` diff --git a/examples/llava/README-minicpmv2.6.md b/docs/multimodal/minicpmv2.6.md similarity index 71% rename from examples/llava/README-minicpmv2.6.md rename to docs/multimodal/minicpmv2.6.md index 2df39cdba..410a5dd17 100644 --- a/examples/llava/README-minicpmv2.6.md +++ b/docs/multimodal/minicpmv2.6.md @@ -39,9 +39,9 @@ python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model Inference on Linux or Mac ```bash -# run f16 version -./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +# run in single-turn mode +./build/bin/llama-mtmd-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" -# run quantized int4 version -./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +# run in conversation mode +./build/bin/llama-mtmd-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf ``` diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h index 55b18dc89..3fad9a932 100644 --- a/examples/llava/clip-impl.h +++ b/examples/llava/clip-impl.h @@ -35,13 +35,13 @@ #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" #define KEY_PROJ_DIM "clip.%s.projection_dim" #define KEY_TOKENS "tokenizer.ggml.tokens" -#define KEY_N_POSITIONS "clip.text.context_length" #define KEY_IMAGE_SIZE "clip.vision.image_size" #define KEY_PATCH_SIZE "clip.vision.patch_size" #define KEY_IMAGE_MEAN "clip.vision.image_mean" #define KEY_IMAGE_STD "clip.vision.image_std" -#define KEY_PROJ_TYPE "clip.projector_type" #define KEY_FEATURE_LAYER "clip.vision.feature_layer" +#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" +#define KEY_PROJ_TYPE "clip.projector_type" #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" @@ -54,7 +54,6 @@ // tensor name constants // -#define TN_TOKEN_EMBD "%s.token_embd.weight" #define TN_POS_EMBD "%s.position_embd.weight" #define TN_CLASS_EMBD "v.class_embd" #define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat @@ -71,8 +70,6 @@ #define TN_LN_2 "%s.blk.%d.ln2.%s" #define TN_LN_PRE "%s.pre_ln.%s" #define TN_LN_POST "%s.post_ln.%s" -#define TN_TEXT_PROJ "text_projection.weight" -#define TN_VIS_PROJ "visual_projection.weight" #define TN_LLAVA_PROJ "mm.%d.%s" #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" @@ -80,6 +77,7 @@ #define TN_IMAGE_NEWLINE "model.image_newline" #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 +#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3 // mimicpmv #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" @@ -107,6 +105,7 @@ enum projector_type { PROJECTOR_TYPE_GLM_EDGE, PROJECTOR_TYPE_MERGER, PROJECTOR_TYPE_GEMMA3, + PROJECTOR_TYPE_IDEFICS3, 
PROJECTOR_TYPE_UNKNOWN, }; @@ -118,6 +117,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"}, { PROJECTOR_TYPE_GEMMA3, "gemma3"}, + { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 557a73fe8..a4d44902e 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -173,6 +173,7 @@ struct clip_hparams { int32_t projection_dim; int32_t n_head; int32_t n_layer; + int32_t proj_scale_factor = 0; // idefics3 patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT; @@ -539,6 +540,35 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im embeddings = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), embeddings); + + } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { + // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 + + ggml_tensor * cur = embeddings; + const int scale_factor = model.hparams.proj_scale_factor; + const int n_embd = cur->ne[0]; + const int seq = cur->ne[1]; + const int bsz = 1; // batch size, always 1 for now since we don't support batching + const int height = std::sqrt(seq); + const int width = std::sqrt(seq); + GGML_ASSERT(scale_factor != 0); + cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), + n_embd * scale_factor * scale_factor, + height / scale_factor, + width / scale_factor, + bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur), + n_embd * scale_factor * scale_factor, + seq / (scale_factor * scale_factor), + bsz); + + cur = ggml_mul_mat(ctx0, model.projection, cur); + embeddings = cur; + } else { + GGML_ABORT("SigLIP: Unsupported projector type"); } // build the graph @@ -1197,12 +1227,20 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im } static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) { - if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - return clip_image_build_graph_siglip(ctx, imgs); - } else { - // TODO: we should have one build_* function per model - return clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf); + ggml_cgraph * res; + switch (ctx->proj_type) { + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_IDEFICS3: + { + res = clip_image_build_graph_siglip(ctx, imgs); + } break; + default: + { + // TODO: we should have one build_* function per model + res = clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf); + } break; } + return res; } struct clip_model_loader { @@ -1266,6 +1304,8 @@ struct clip_model_loader { } void load_hparams() { + auto & hparams = ctx_clip.vision_model.hparams; + // projector type { std::string proj_type; @@ -1298,7 +1338,6 @@ struct clip_model_loader { get_bool(KEY_USE_GLU_MLP, ctx_clip.use_glu_mlp, false); get_bool(KEY_USE_RMS_NORM, ctx_clip.use_rms_norm, false); - auto & hparams = ctx_clip.vision_model.hparams; get_u32(string_format(KEY_N_EMBD, "vision"), hparams.hidden_size); get_u32(string_format(KEY_N_HEAD, "vision"), hparams.n_head); get_u32(string_format(KEY_N_FF, "vision"), 
hparams.n_intermediate); @@ -1356,6 +1395,16 @@ struct clip_model_loader { LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0); LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0); } + + // model-specific params + switch (ctx_clip.proj_type) { + case PROJECTOR_TYPE_IDEFICS3: + { + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + } break; + default: + break; + } } void load_tensors() { @@ -1547,6 +1596,10 @@ struct clip_model_loader { vision_model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); vision_model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); } break; + case PROJECTOR_TYPE_IDEFICS3: + { + vision_model.projection = get_tensor(TN_MM_PROJECTOR); + } break; default: GGML_ASSERT(false && "unknown projector type"); } @@ -2407,10 +2460,12 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str return true; } - if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { + if (ctx->has_glm_projector + || ctx->proj_type == PROJECTOR_TYPE_GEMMA3 + || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { clip_image_u8 resized_image; int sz = params.image_size; - image_manipulation::bicubic_resize(*img, resized_image, sz, sz); + image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz}); clip_image_f32_ptr img_f32(clip_image_f32_init()); //clip_image_save_to_bmp(resized_image, "resized.bmp"); normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std); @@ -2546,6 +2601,8 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i n_patches = x_patch * y_patch; } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { n_patches = 256; + } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { + n_patches /= ctx->vision_model.hparams.proj_scale_factor; } return n_patches; @@ -2877,6 +2934,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { // do nothing } + else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { + // do nothing + } else { struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); @@ -3138,37 +3198,34 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i } int clip_n_mmproj_embd(const struct clip_ctx * ctx) { - if (ctx->proj_type == PROJECTOR_TYPE_LDP) { - return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) { - return ctx->vision_model.mm_model_peg_0_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MLP) { - return ctx->vision_model.mm_2_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { - return ctx->vision_model.mm_3_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { - if (ctx->minicpmv_version == 2) { - return 4096; - } - else if (ctx->minicpmv_version == 3) { - return 3584; - } - else if (ctx->minicpmv_version == 4) { - return 3584; - } - } - if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE){ - return ctx->vision_model.mm_model_mlp_3_w->ne[1]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { - return ctx->vision_model.mm_1_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - return ctx->vision_model.mm_input_proj_w->ne[0]; + switch (ctx->proj_type) { + case PROJECTOR_TYPE_LDP: + return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; + case PROJECTOR_TYPE_LDPV2: + return ctx->vision_model.mm_model_peg_0_b->ne[0]; + case PROJECTOR_TYPE_MLP: + return 
ctx->vision_model.mm_2_b->ne[0]; + case PROJECTOR_TYPE_MLP_NORM: + return ctx->vision_model.mm_3_b->ne[0]; + case PROJECTOR_TYPE_RESAMPLER: + if (ctx->minicpmv_version == 2) { + return 4096; + } else if (ctx->minicpmv_version == 3) { + return 3584; + } else if (ctx->minicpmv_version == 4) { + return 3584; + } + break; // Should not happen if version is valid + case PROJECTOR_TYPE_GLM_EDGE: + return ctx->vision_model.mm_model_mlp_3_w->ne[1]; + case PROJECTOR_TYPE_MERGER: + return ctx->vision_model.mm_1_b->ne[0]; + case PROJECTOR_TYPE_GEMMA3: + return ctx->vision_model.mm_input_proj_w->ne[0]; + case PROJECTOR_TYPE_IDEFICS3: + return ctx->vision_model.projection->ne[1]; + default: + break; // Fall through to throw } std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; diff --git a/examples/llava/deprecation-warning.cpp b/examples/llava/deprecation-warning.cpp new file mode 100644 index 000000000..dded0a56a --- /dev/null +++ b/examples/llava/deprecation-warning.cpp @@ -0,0 +1,22 @@ +#include +#include + +int main(int argc, char** argv) { + std::string filename = "main"; + if (argc >= 1) { + filename = argv[0]; + } + + // Get only the program name from the full path + size_t pos = filename.find_last_of("/\\"); + if (pos != std::string::npos) { + filename = filename.substr(pos+1); + } + + fprintf(stdout, "\n"); + fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str()); + fprintf(stdout, "Please use 'llama-mtmd-cli' instead.\n"); + fprintf(stdout, "\n"); + + return EXIT_FAILURE; +} diff --git a/examples/llava/gemma3_convert_encoder_to_gguf.py b/examples/llava/gemma3_convert_encoder_to_gguf.py deleted file mode 100644 index 241b526b9..000000000 --- a/examples/llava/gemma3_convert_encoder_to_gguf.py +++ /dev/null @@ -1,307 +0,0 @@ -import gguf -import argparse -import logging -import sys -import torch -import json -import os -import numpy as np -from typing import cast, ContextManager, Any, Iterator -from pathlib import Path -from torch import Tensor - -logger = logging.getLogger("gemma3-mmproj") - - -# (copied from convert_hf_to_gguf.py) -# tree of lazy tensors -class LazyTorchTensor(gguf.LazyBase): - _tensor_type = torch.Tensor - # to keep the type-checker happy - dtype: torch.dtype - shape: torch.Size - - # only used when converting a torch.Tensor to a np.ndarray - _dtype_map: dict[torch.dtype, type] = { - torch.float16: np.float16, - torch.float32: np.float32, - } - - # used for safetensors slices - # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 - # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 - _dtype_str_map: dict[str, torch.dtype] = { - "F64": torch.float64, - "F32": torch.float32, - "BF16": torch.bfloat16, - "F16": torch.float16, - # "U64": torch.uint64, - "I64": torch.int64, - # "U32": torch.uint32, - "I32": torch.int32, - # "U16": torch.uint16, - "I16": torch.int16, - "U8": torch.uint8, - "I8": torch.int8, - "BOOL": torch.bool, - "F8_E4M3": torch.float8_e4m3fn, - "F8_E5M2": torch.float8_e5m2, - } - - def numpy(self) -> gguf.LazyNumpyTensor: - dtype = self._dtype_map[self.dtype] - return gguf.LazyNumpyTensor( - meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), - args=(self,), - func=(lambda s: s.numpy()) - ) - - @classmethod - def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor: - return torch.empty(size=shape, dtype=dtype, device="meta") - - @classmethod - def 
from_safetensors_slice(cls, st_slice: Any) -> Tensor: - dtype = cls._dtype_str_map[st_slice.get_dtype()] - shape: tuple[int, ...] = tuple(st_slice.get_shape()) - lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:]) - return cast(torch.Tensor, lazy) - - @classmethod - def __torch_function__(cls, func, types, args=(), kwargs=None): - del types # unused - - if kwargs is None: - kwargs = {} - - if func is torch.Tensor.numpy: - return args[0].numpy() - - return cls._wrap_fn(func)(*args, **kwargs) - - -class Gemma3VisionTower: - hparams: dict - gguf_writer: gguf.GGUFWriter - fname_out: Path - ftype: gguf.LlamaFileType - - @staticmethod - def load_hparams(dir_model: Path): - with open(dir_model / "config.json", "r", encoding="utf-8") as f: - return json.load(f) - - @staticmethod - def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]: - part_names: list[str] = [] - for filename in os.listdir(dir_model): - if filename.startswith(prefix) and filename.endswith(suffix): - part_names.append(filename) - part_names.sort() - return part_names - - def __init__(self, - dir_model: Path, - fname_out: Path, - ftype: gguf.LlamaFileType, - is_big_endian: bool,): - hparams = Gemma3VisionTower.load_hparams(dir_model) - self.hparams = hparams - self.fname_out = fname_out - self.ftype = ftype - endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE - self.gguf_writer = gguf.GGUFWriter(path=None, arch="clip", endianess=endianess) - - text_config = hparams["text_config"] - vision_config = hparams["vision_config"] - - assert hparams["architectures"][0] == "Gemma3ForConditionalGeneration" - assert text_config is not None - assert vision_config is not None - - self.gguf_writer.add_string ("clip.projector_type", "gemma3") - self.gguf_writer.add_bool ("clip.has_text_encoder", False) - self.gguf_writer.add_bool ("clip.has_vision_encoder", True) - self.gguf_writer.add_bool ("clip.has_llava_projector", False) # legacy - self.gguf_writer.add_uint32 ("clip.vision.image_size", vision_config["image_size"]) - self.gguf_writer.add_uint32 ("clip.vision.patch_size", vision_config["patch_size"]) - self.gguf_writer.add_uint32 ("clip.vision.embedding_length", vision_config["hidden_size"]) - self.gguf_writer.add_uint32 ("clip.vision.feed_forward_length", vision_config["intermediate_size"]) - self.gguf_writer.add_uint32 ("clip.vision.projection_dim", text_config["hidden_size"]) - self.gguf_writer.add_uint32 ("clip.vision.block_count", vision_config["num_hidden_layers"]) - self.gguf_writer.add_uint32 ("clip.vision.attention.head_count", vision_config["num_attention_heads"]) - self.gguf_writer.add_float32("clip.vision.attention.layer_norm_epsilon", vision_config.get("layer_norm_eps", 1e-6)) - # default values taken from HF tranformers code - self.gguf_writer.add_array ("clip.vision.image_mean", [0.5, 0.5, 0.5]) - self.gguf_writer.add_array ("clip.vision.image_std", [0.5, 0.5, 0.5]) - self.gguf_writer.add_bool ("clip.use_gelu", True) - - # load tensors - for name, data_torch in self.get_tensors(dir_model): - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - self.add_tensor(name, data_torch) - - def get_tensors(self, dir_model: Path) -> Iterator[tuple[str, Tensor]]: - part_names = Gemma3VisionTower.get_model_part_names(dir_model, "model", ".safetensors") - tensor_names_from_parts: set[str] = set() - for part_name in part_names: - 
logger.info(f"gguf: loading model part '{part_name}'") - from safetensors import safe_open - ctx = cast(ContextManager[Any], safe_open(dir_model / part_name, framework="pt", device="cpu")) - with ctx as model_part: - tensor_names_from_parts.update(model_part.keys()) - - for name in model_part.keys(): - data = model_part.get_slice(name) - data = LazyTorchTensor.from_safetensors_slice(data) - yield name, data - - def add_tensor(self, name: str, data_torch: Tensor): - is_1d = len(data_torch.shape) == 1 - is_embd = ".embeddings." in name - old_dtype = data_torch.dtype - can_quantize = not is_1d and not is_embd - data_qtype = gguf.GGMLQuantizationType.F32 - - # this is to support old checkpoint - # TODO: remove this when we have the final model - name = name.replace("vision_model.vision_model.", "vision_tower.vision_model.") - name = name.replace("multimodal_projector.", "multi_modal_projector.") - - # filter only vision tensors - if not name.startswith("vision_tower.vision_model.") and not name.startswith("multi_modal_projector."): - return - # prefix - name = name.replace("vision_tower.vision_model.encoder.layers.", "v.blk.") - name = name.replace("vision_tower.vision_model.", "v.") - # projector and input embd - name = name.replace(".embeddings.patch_embedding.", ".patch_embd.") - name = name.replace(".embeddings.position_embedding.", ".position_embd.") - name = name.replace( - "multi_modal_projector.mm_input_projection_weight", - "mm.input_projection.weight" - ) - name = name.replace( - "multi_modal_projector.mm_soft_emb_norm.weight", - "mm.soft_emb_norm.weight" - ) - name = name.replace("post_layernorm.", "post_ln.") - # each block - name = name.replace(".self_attn.k_proj.", ".attn_k.") - name = name.replace(".self_attn.v_proj.", ".attn_v.") - name = name.replace(".self_attn.q_proj.", ".attn_q.") - name = name.replace(".self_attn.out_proj.", ".attn_out.") - name = name.replace(".layer_norm1.", ".ln1.") - name = name.replace(".layer_norm2.", ".ln2.") - name = name.replace(".mlp.fc1.", ".ffn_down.") - name = name.replace(".mlp.fc2.", ".ffn_up.") - - if can_quantize: - if self.ftype == gguf.LlamaFileType.ALL_F32: - data_qtype = gguf.GGMLQuantizationType.F32 - elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: - data_qtype = gguf.GGMLQuantizationType.F16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: - data_qtype = gguf.GGMLQuantizationType.BF16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: - data_qtype = gguf.GGMLQuantizationType.Q8_0 - else: - raise ValueError(f"Unsupported file type: {self.ftype}") - - # corrent norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector - # the other norm values are part of SigLIP model, and they are already correct - # ref code: Gemma3RMSNorm - if "soft_emb_norm.weight" in name: - logger.info(f"Correcting norm value for '{name}'") - data_torch = data_torch + 1 - - data = data_torch.numpy() - - try: - data = gguf.quants.quantize(data, data_qtype) - except Exception as e: - logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16") - data_qtype = gguf.GGMLQuantizationType.F16 - data = gguf.quants.quantize(data, data_qtype) - - # reverse shape to make it similar to the internal ggml dimension order - shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}" - logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") - - self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype) - - def write(self): - 
self.gguf_writer.write_header_to_file(path=self.fname_out) - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file(progress=True) - self.gguf_writer.close() - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="Convert Gemma 3 vision tower safetensors to GGUF format",) - parser.add_argument( - "--outfile", type=Path, default="mmproj.gguf", - help="path to write to", - ) - parser.add_argument( - "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16", - help="output format", - ) - parser.add_argument( - "--bigendian", action="store_true", - help="model is executed on big endian machine", - ) - parser.add_argument( - "model", type=Path, - help="directory containing model file", - nargs="?", - ) - parser.add_argument( - "--verbose", action="store_true", - help="increase output verbosity", - ) - - args = parser.parse_args() - if args.model is None: - parser.error("the following arguments are required: model") - return args - - -def main() -> None: - args = parse_args() - - if args.verbose: - logging.basicConfig(level=logging.DEBUG) - else: - logging.basicConfig(level=logging.INFO) - - dir_model = args.model - - if not dir_model.is_dir(): - logger.error(f'Error: {args.model} is not a directory') - sys.exit(1) - - ftype_map: dict[str, gguf.LlamaFileType] = { - "f32": gguf.LlamaFileType.ALL_F32, - "f16": gguf.LlamaFileType.MOSTLY_F16, - "bf16": gguf.LlamaFileType.MOSTLY_BF16, - "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, - } - - logger.info(f"Loading model: {dir_model.name}") - - with torch.inference_mode(): - gemma3_vision_tower = Gemma3VisionTower( - dir_model=dir_model, - fname_out=args.outfile, - ftype=ftype_map[args.outtype], - is_big_endian=args.bigendian, - ) - gemma3_vision_tower.write() - - -if __name__ == '__main__': - main() - diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp deleted file mode 100644 index 0fe0e333a..000000000 --- a/examples/llava/llava-cli.cpp +++ /dev/null @@ -1,332 +0,0 @@ -#include "arg.h" -#include "base64.hpp" -#include "log.h" -#include "common.h" -#include "sampling.h" -#include "clip.h" -#include "llava.h" -#include "llama.h" -#include "ggml.h" - -#include -#include -#include -#include - -static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { - int N = (int) tokens.size(); - for (int i = 0; i < N; i += n_batch) { - int n_eval = (int) tokens.size() - i; - if (n_eval > n_batch) { - n_eval = n_batch; - } - if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) { - LOG_ERR("%s : failed to eval. 
token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); - return false; - } - *n_past += n_eval; - } - return true; -} - -static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { - std::vector tokens; - tokens.push_back(id); - return eval_tokens(ctx_llama, tokens, 1, n_past); -} - -static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ - std::string str2 = str; - std::vector embd_inp = common_tokenize(ctx_llama, str2, add_bos, true); - eval_tokens(ctx_llama, embd_inp, n_batch, n_past); - return true; -} - -static const char * sample(struct common_sampler * smpl, - struct llama_context * ctx_llama, - int * n_past) { - const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); - common_sampler_accept(smpl, id, true); - - const llama_model * model = llama_get_model(ctx_llama); - const llama_vocab * vocab = llama_model_get_vocab(model); - - static std::string ret; - if (llama_vocab_is_eog(vocab, id)) { - ret = ""; - } else { - ret = common_token_to_piece(ctx_llama, id); - } - eval_id(ctx_llama, id, n_past); - return ret.c_str(); -} - -static const char* IMG_BASE64_TAG_BEGIN = ""; - -static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) { - begin_out = prompt.find(IMG_BASE64_TAG_BEGIN); - end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out); -} - -static bool prompt_contains_image(const std::string& prompt) { - size_t begin, end; - find_image_tag_in_prompt(prompt, begin, end); - return (begin != std::string::npos); -} - -// replaces the base64 image tag in the prompt with `replacement` -static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) { - size_t img_base64_str_start, img_base64_str_end; - find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end); - if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) { - LOG_ERR("%s: invalid base64 image tag. 
must be %s%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); - return NULL; - } - - auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN); - auto base64_bytes_count = img_base64_str_end - base64_bytes_start; - auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count ); - - auto required_bytes = base64::required_encode_size(base64_str.size()); - auto img_bytes = std::vector(required_bytes); - base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin()); - - auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size()); - if (!embed) { - LOG_ERR("%s: could not load image from base64 string.\n", __func__); - return NULL; - } - - return embed; -} - -static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") { - size_t begin, end; - find_image_tag_in_prompt(prompt, begin, end); - if (begin == std::string::npos || end == std::string::npos) { - return prompt; - } - auto pre = prompt.substr(0, begin); - auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END)); - return pre + replacement + post; -} - -struct llava_context { - struct clip_ctx * ctx_clip = NULL; - struct llama_context * ctx_llama = NULL; - struct llama_model * model = NULL; -}; - -static void print_usage(int, char ** argv) { - LOG("\n example usage:\n"); - LOG("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); -} - -static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) { - - // load and preprocess the image - llava_image_embed * embed = NULL; - auto prompt = params->prompt; - if (prompt_contains_image(prompt)) { - if (!params->image.empty()) { - LOG_INF("using base64 encoded image instead of command line image path\n"); - } - embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt); - if (!embed) { - LOG_ERR("%s: can't load image from prompt\n", __func__); - return NULL; - } - params->prompt = remove_image_from_prompt(prompt); - } else { - embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str()); - if (!embed) { - fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str()); - return NULL; - } - } - - return embed; -} - -static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) { - int n_past = 0; - - const int max_tgt_len = params->n_predict < 0 ? 
256 : params->n_predict; - - std::string system_prompt, user_prompt; - size_t image_pos = prompt.find(""); - if (image_pos != std::string::npos) { - // new templating mode: Provide the full prompt including system message and use as a placeholder for the image - system_prompt = prompt.substr(0, image_pos); - user_prompt = prompt.substr(image_pos + std::string("").length()); - LOG_INF("system_prompt: %s\n", system_prompt.c_str()); - if (params->verbose_prompt) { - auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); - for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); - } - } - LOG_INF("user_prompt: %s\n", user_prompt.c_str()); - if (params->verbose_prompt) { - auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); - for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); - } - } - } else { - // llava-1.5 native mode - system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:"; - user_prompt = prompt + "\nASSISTANT:"; - if (params->verbose_prompt) { - auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); - for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); - } - } - } - - eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true); - llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past); - eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false); - - // generate the response - - LOG("\n"); - - struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling); - if (!smpl) { - LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); - exit(1); - } - - std::string response = ""; - for (int i = 0; i < max_tgt_len; i++) { - const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past); - response += tmp; - if (strcmp(tmp, "") == 0) break; - if (strstr(tmp, "###")) break; // Yi-VL behavior - LOG("%s", tmp); - if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) - if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 - if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 - - fflush(stdout); - } - - common_sampler_free(smpl); - LOG("\n"); -} - -static struct llama_model * llava_init(common_params * params) { - llama_backend_init(); - llama_numa_init(params->numa); - - llama_model_params model_params = common_model_params_to_llama(*params); - - llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params); - if (model == NULL) { - LOG_ERR("%s: unable to load model\n" , __func__); - return NULL; - } - return model; -} - -static struct llava_context * llava_init_context(common_params * params, llama_model * model) { - const char * clip_path = params->mmproj.path.c_str(); - - auto prompt = params->prompt; - if (prompt.empty()) { - prompt = "describe the image in detail."; - } - - auto ctx_clip = clip_model_load(clip_path, GGML_LOG_LEVEL_INFO); - - llama_context_params ctx_params = common_context_params_to_llama(*params); - ctx_params.n_ctx = params->n_ctx < 2048 ? 
2048 : params->n_ctx; // we need a longer context size to process image embeddings - - llama_context * ctx_llama = llama_init_from_model(model, ctx_params); - - if (ctx_llama == NULL) { - LOG_ERR("%s: failed to create the llama_context\n" , __func__); - return NULL; - } - - auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); - - ctx_llava->ctx_llama = ctx_llama; - ctx_llava->ctx_clip = ctx_clip; - ctx_llava->model = model; - return ctx_llava; -} - -static void llava_free(struct llava_context * ctx_llava) { - if (ctx_llava->ctx_clip) { - clip_free(ctx_llava->ctx_clip); - ctx_llava->ctx_clip = NULL; - } - - llama_free(ctx_llava->ctx_llama); - llama_model_free(ctx_llava->model); - llama_backend_free(); -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - common_params params; - - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) { - return 1; - } - - common_init(); - - if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { - print_usage(argc, argv); - return 1; - } - - auto * model = llava_init(¶ms); - if (model == NULL) { - fprintf(stderr, "%s: error: failed to init llava model\n", __func__); - return 1; - } - - if (prompt_contains_image(params.prompt)) { - auto * ctx_llava = llava_init_context(¶ms, model); - - auto * image_embed = load_image(ctx_llava, ¶ms, ""); - - // process the prompt - process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - - llama_perf_context_print(ctx_llava->ctx_llama); - llava_image_embed_free(image_embed); - ctx_llava->model = NULL; - llava_free(ctx_llava); - } else { - for (auto & image : params.image) { - auto * ctx_llava = llava_init_context(¶ms, model); - - auto * image_embed = load_image(ctx_llava, ¶ms, image); - if (!image_embed) { - LOG_ERR("%s: failed to load image %s. 
Terminating\n\n", __func__, image.c_str()); - return 1; - } - - // process the prompt - process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - - llama_perf_context_print(ctx_llava->ctx_llama); - llava_image_embed_free(image_embed); - ctx_llava->model = NULL; - llava_free(ctx_llava); - } - } - - llama_model_free(model); - - return 0; -} diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp deleted file mode 100644 index 5ad970c22..000000000 --- a/examples/llava/minicpmv-cli.cpp +++ /dev/null @@ -1,354 +0,0 @@ -#include "arg.h" -#include "log.h" -#include "common.h" -#include "sampling.h" -#include "clip.h" -#include "llava.h" -#include "llama.h" -#include "ggml.h" - -#include -#include -#include -#include -#include -#include // TODO: remove me - -struct llava_context { - struct clip_ctx * ctx_clip = NULL; - struct llama_context * ctx_llama = NULL; - struct llama_model * model = NULL; -}; - -static void show_additional_info(int /*argc*/, char ** argv) { - LOG("\nexample usage:\n\n%s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n"); -} - -static struct llama_model * llava_init(common_params * params) { - llama_backend_init(); - llama_numa_init(params->numa); - - llama_model_params model_params = common_model_params_to_llama(*params); - - llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params); - if (model == NULL) { - LOG_ERR("%s: unable to load model\n" , __func__); - return NULL; - } - return model; -} - -static struct llava_context * llava_init_context(common_params * params, llama_model * model) { - auto prompt = params->prompt; - if (prompt.empty()) { - prompt = "describe the image in detail."; - } - - llama_context_params ctx_params = common_context_params_to_llama(*params); - if (params->n_ctx < 2048) { - // warn user here, "Image processing requires at least 2048 context, setting context to 2048" - LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); - ctx_params.n_ctx = 2048; - } else { - ctx_params.n_ctx = params->n_ctx; - } - - llama_context * ctx_llama = llama_init_from_model(model, ctx_params); - - if (ctx_llama == NULL) { - LOG_ERR("%s: failed to create the llama_context\n" , __func__); - return NULL; - } - - auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); - - ctx_llava->ctx_llama = ctx_llama; - ctx_llava->model = model; - return ctx_llava; -} - -static void llava_free(struct llava_context * ctx_llava) { - if (ctx_llava->ctx_clip) { - clip_free(ctx_llava->ctx_clip); - ctx_llava->ctx_clip = NULL; - } - - llama_free(ctx_llava->ctx_llama); - llama_model_free(ctx_llava->model); - llama_backend_free(); -} - -static struct clip_ctx * clip_init_context(common_params * params) { - const char * clip_path = params->mmproj.path.c_str(); - - auto prompt = params->prompt; - if (prompt.empty()) { - prompt = "describe the image in detail."; - } - struct clip_context_params clip_params = { - /* use_gpu */ params->n_gpu_layers != 0, - /* verbosity */ GGML_LOG_LEVEL_INFO, // TODO: make this configurable - }; - auto * ctx_clip = clip_init(clip_path, clip_params); - return ctx_clip; -} - -static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { - int N = (int) tokens.size(); - for (int i = 0; i < N; i += n_batch) { - int n_eval = (int) tokens.size() - i; - if (n_eval > n_batch) 
{ - n_eval = n_batch; - } - if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) { - LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); - return false; - } - *n_past += n_eval; - } - return true; -} - -static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { - std::vector tokens; - tokens.push_back(id); - return eval_tokens(ctx_llama, tokens, 1, n_past); -} - -static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ - std::string str2 = str; - std::vector embd_inp = common_tokenize(ctx_llama, str2, add_bos, true); - return eval_tokens(ctx_llama, embd_inp, n_batch, n_past); -} - -static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) { - float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip)); - std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip)); - - auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed)); - slice_embed->embed = image_embed; - slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip); - llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past); - llava_image_embed_free(slice_embed); -} - -static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) { - std::string system_prompt; - int idx = 0; - int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip); - int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip); - if (has_minicpmv_projector == 2) { - system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"; - } - else if (has_minicpmv_projector == 3) { - system_prompt = "<|im_start|>user\n"; - } - else if (has_minicpmv_projector == 4) { - system_prompt = "<|im_start|>user\n"; - } - LOG_INF("%s: image token past: %d\n", __func__, n_past); - eval_string(ctx_llava->ctx_llama, (system_prompt+"").c_str(), params->n_batch, &n_past, false); - process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - if (num_image_embeds > 1) { - if (has_minicpmv_projector == 2) { - size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) { - for (size_t j = 0; j < num_image_embeds_col; ++j) { - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - if (j == num_image_embeds_col - 1) { - eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); - } - } - } - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - } - else if (has_minicpmv_projector == 3 || has_minicpmv_projector == 4) { - size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip); - for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) { - for (size_t j = 0; j < 
num_image_embeds_col; ++j) { - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - if (j == num_image_embeds_col - 1) { - eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); - } - } - } - } - } - LOG_INF("%s: image token past: %d\n", __func__, n_past); -} - -static const char * sample(struct common_sampler * smpl, - struct llama_context * ctx_llama, - int * n_past) { - const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); - common_sampler_accept(smpl, id, true); - - const llama_model * model = llama_get_model(ctx_llama); - const llama_vocab * vocab = llama_model_get_vocab(model); - - static std::string ret; - if (llama_vocab_is_eog(vocab, id)) { - ret = ""; - } else { - ret = common_token_to_piece(ctx_llama, id); - } - eval_id(ctx_llama, id, n_past); - return ret.c_str(); -} - -static struct llava_context * minicpmv_init(common_params * params, const std::string & fname, int &n_past){ - auto * ctx_clip = clip_init_context(params); - auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str()); - if (!embeds) { - LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str()); - return NULL; - } - - // process the prompt - if (params->prompt.empty() && params->interactive == false) { - LOG_ERR("prompt should be given or interactive mode should be on"); - return NULL; - } - - auto * model = llava_init(params); - if (model == NULL) { - fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__); - return NULL; - } - const int64_t t_llava_init_start_us = ggml_time_us(); - auto * ctx_llava = llava_init_context(params, model); - ctx_llava->ctx_clip = ctx_clip; - const int64_t t_llava_init_end_us = ggml_time_us(); - float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0; - LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms); - - const int64_t t_process_image_start_us = ggml_time_us(); - process_image(ctx_llava, embeds, params, n_past); - const int64_t t_process_image_end_us = ggml_time_us(); - float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0; - LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms); - - llava_image_embed_free(embeds); - return ctx_llava; -} - -static struct common_sampler * llama_init(struct llava_context * ctx_llava, common_params * params, const std::string & prompt, int & n_past, bool is_first = false){ - std::string user_prompt = prompt; - int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip); - if (!is_first) { - if (has_minicpmv_projector == 2) { - user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt; - } - else if (has_minicpmv_projector == 3) { - user_prompt = "<|im_start|>user\n" + prompt; - } - else if (has_minicpmv_projector == 4) { - user_prompt = "<|im_start|>user\n" + prompt; - } - } - - eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false); - if (has_minicpmv_projector == 2) { - eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false); - } - else if (has_minicpmv_projector == 3) { - eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, 
false); - } - else if (has_minicpmv_projector == 4) { - eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false); - } - - // generate the response - - LOG_INF("\n"); - - struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling); - return smpl; -} - -static const char * llama_loop(struct llava_context * ctx_llava,struct common_sampler * smpl, int &n_past){ - - const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past); - return tmp; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - common_params params; - - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) { - return 1; - } - - common_init(); - - if (params.mmproj.path.empty() || (params.image.empty())) { - show_additional_info(argc, argv); - return 1; - } - - for (auto & image : params.image) { - int n_past = 0; - auto * ctx_llava = minicpmv_init(¶ms, image, n_past); - - if (!params.prompt.empty()) { - LOG("%s\n", params.prompt.c_str()); - LOG(""); - auto * smpl = llama_init(ctx_llava, ¶ms, params.prompt, n_past, true); - const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; - std::string response; - bool have_tmp = false; - for (int i = 0; i < max_tgt_len; i++) { - const auto * tmp = llama_loop(ctx_llava, smpl, n_past); - response += tmp; - if (strcmp(tmp, "") == 0){ - if (!have_tmp) { - continue; - } - break; - } - if (strstr(tmp, "###")) break; // Yi-VL behavior - have_tmp = true; - printf("%s", tmp); - if (strstr(response.c_str(), "")) break; // minicpm-v - - fflush(stdout); - } - common_sampler_free(smpl); - }else { - while (true) { - LOG(""); - std::string prompt; - std::getline(std::cin, prompt); - LOG(""); - auto * smpl = llama_init(ctx_llava, ¶ms, prompt, n_past, true); - const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; - std::string response; - for (int i = 0; i < max_tgt_len; i++) { - const auto * tmp = llama_loop(ctx_llava, smpl, n_past); - response += tmp; - if (strcmp(tmp, "") == 0) break; - printf("%s", tmp);// mistral llava-1.6 - if (strstr(response.c_str(), "")) break; // minicpm-v - fflush(stdout); - } - common_sampler_free(smpl); - } - } - printf("\n"); - llama_perf_context_print(ctx_llava->ctx_llama); - - ctx_llava->model = NULL; - llava_free(ctx_llava); - } - - return 0; -} diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/mtmd-cli.cpp similarity index 82% rename from examples/llava/gemma3-cli.cpp rename to examples/llava/mtmd-cli.cpp index 3d5664750..e80845a2c 100644 --- a/examples/llava/gemma3-cli.cpp +++ b/examples/llava/mtmd-cli.cpp @@ -28,15 +28,16 @@ static bool g_is_generating = false; /** * Please note that this is NOT a production-ready stuff. - * It is a playground for trying Gemma 3 vision capabilities. + * It is a playground for trying multimodal support in llama.cpp. * For contributors: please keep this code simple and easy to understand. 
*/ static void show_additional_info(int /*argc*/, char ** argv) { LOG( - "Experimental CLI for using Gemma 3 vision model\n\n" + "Experimental CLI for multimodal\n\n" "Usage: %s [options] -m --mmproj --image -p \n\n" " -m and --mmproj are required\n" + " -hf user/repo can replace both -m and --mmproj in most cases\n" " --image and -p are optional, if NOT provided, the CLI will run in chat mode\n", argv[0] ); @@ -56,7 +57,7 @@ static void sigint_handler(int signo) { } #endif -struct gemma3_context { +struct mtmd_cli_context { mtmd_context_ptr ctx_vision; common_init_result llama_init; @@ -70,18 +71,38 @@ struct gemma3_context { // so here we don't need to keep track of chat history common_chat_templates_ptr tmpls; + // support for legacy templates (models not having EOT token) + llama_tokens antiprompt_tokens; + int n_threads = 1; llama_pos n_past = 0; - gemma3_context(common_params & params) : llama_init(common_init_from_params(params)) { + mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) { model = llama_init.model.get(); lctx = llama_init.context.get(); vocab = llama_model_get_vocab(model); n_threads = params.cpuparams.n_threads; batch = llama_batch_init(params.n_batch, 0, 1); n_batch = params.n_batch; + + if (!llama_model_chat_template(model, nullptr) && params.chat_template.empty()) { + LOG_ERR("Model does not have chat template.\n"); + LOG_ERR(" For old llava models, you may need to use '--chat-template vicuna'\n"); + LOG_ERR(" For MobileVLM models, use '--chat-template deepseek'\n"); + exit(1); + } + tmpls = common_chat_templates_init(model, params.chat_template); + LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja).c_str()); + init_vision_context(params); + + // load antiprompt tokens for legacy templates + if (params.chat_template == "vicuna") { + antiprompt_tokens = common_tokenize(lctx, "ASSISTANT:", false, true); + } else if (params.chat_template == "deepseek") { + antiprompt_tokens = common_tokenize(lctx, "###", false, true); + } } void init_vision_context(common_params & params) { @@ -97,6 +118,17 @@ struct gemma3_context { exit(1); } } + + bool check_antiprompt(const llama_tokens & generated_tokens) { + if (antiprompt_tokens.empty() || generated_tokens.size() < antiprompt_tokens.size()) { + return false; + } + return std::equal( + generated_tokens.end() - antiprompt_tokens.size(), + generated_tokens.end(), + antiprompt_tokens.begin() + ); + } }; struct decode_embd_batch { @@ -132,7 +164,8 @@ struct decode_embd_batch { } }; -static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) { +static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) { + llama_tokens generated_tokens; for (int i = 0; i < n_predict; i++) { if (i > n_predict || !g_is_generating) { printf("\n"); @@ -140,9 +173,10 @@ static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_ } llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1); + generated_tokens.push_back(token_id); common_sampler_accept(smpl, token_id, true); - if (llama_vocab_is_eog(ctx.vocab, token_id)) { + if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) { printf("\n"); break; // end of generation } @@ -161,7 +195,7 @@ static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_ return 0; } -static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector & images_fname, bool add_bos 
= false) { +static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vector & images_fname, bool add_bos = false) { std::vector bitmaps; common_chat_templates_inputs tmpl_inputs; @@ -218,7 +252,7 @@ int main(int argc, char ** argv) { return 1; } - gemma3_context ctx(params); + mtmd_cli_context ctx(params); printf("%s: %s\n", __func__, params.model.path.c_str()); bool is_single_turn = !params.prompt.empty() && !params.image.empty(); diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index 3fd5bebc6..c3fb2f18a 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -12,6 +12,15 @@ #include #include +// slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings +// models not having it (llava-1.6) will process embeddings without any special tokens in-between +enum mtmd_slice_tmpl { + MTMD_SLICE_TMPL_NONE, + MTMD_SLICE_TMPL_MINICPMV_2_5, + MTMD_SLICE_TMPL_MINICPMV_2_6, + // TODO @ngxson : add support for idefics (SmolVLM) +}; + struct mtmd_context { struct clip_ctx * ctx_clip; const struct llama_model * text_model; @@ -21,6 +30,16 @@ struct mtmd_context { int n_threads; std::string image_marker; + // for minicpmv, we need special tokens in-between slices + mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE; + llama_token tok_ov_img_start = LLAMA_TOKEN_NULL; // overview image + llama_token tok_ov_img_end = LLAMA_TOKEN_NULL; // overview image + llama_token tok_slices_start = LLAMA_TOKEN_NULL; // start of all slices + llama_token tok_slices_end = LLAMA_TOKEN_NULL; // end of all slices + llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice + llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice + llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row + // TODO @ngxson : add timings mtmd_context(const char * mmproj_fname, @@ -38,11 +57,66 @@ struct mtmd_context { throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname)); } this->text_model = text_model; + + GGML_ASSERT(!clip_is_qwen2vl(ctx_clip) && "Qwen2VL model is not supported yet, use llama-qwen2vl-cli instead"); + + int minicpmv_version = clip_is_minicpmv(ctx_clip); + if (minicpmv_version == 2) { + // minicpmv 2.5 format: + // (overview) (slice) (slice) \n ... + slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5; + tok_ov_img_start = lookup_token(""); + tok_ov_img_end = lookup_token(""); + tok_slices_start = lookup_token(""); + tok_slices_end = lookup_token(""); + tok_sli_img_start = tok_ov_img_start; + tok_sli_img_end = tok_ov_img_end; + tok_row_end = lookup_token("\n"); + + } else if (minicpmv_version == 3 || minicpmv_version == 4) { + // minicpmv 2.6 format: + // (overview) (slice) (slice) \n ... 
+ slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6; + tok_ov_img_start = lookup_token(""); + tok_ov_img_end = lookup_token(""); + tok_sli_img_start = lookup_token(""); + tok_sli_img_end = lookup_token(""); + tok_row_end = lookup_token("\n"); + + } else if (minicpmv_version != 0) { + GGML_ASSERT(false && "unsupported minicpmv version"); + } } ~mtmd_context() { clip_free(ctx_clip); } + +private: + llama_token lookup_token(const std::string & token_text) { + const llama_vocab * vocab = llama_model_get_vocab(text_model); + const int n_vocab = llama_vocab_n_tokens(vocab); + for (int i = 0; i < n_vocab; i++) { + if (token_to_piece(vocab, i, true) == token_text) { + return i; + } + } + return LLAMA_TOKEN_NULL; + } + + std::string token_to_piece(const llama_vocab * vocab, llama_token token, bool special) { + std::string piece; + piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n' + const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special); + if (n_chars < 0) { + piece.resize(-n_chars); + int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special); + GGML_ASSERT(check == -n_chars); + } else { + piece.resize(n_chars); + } + return piece; + } }; struct mtmd_image_tokens_data { @@ -103,20 +177,64 @@ int32_t mtmd_tokenize(mtmd_context * ctx, std::string prompt_modified(text.text); std::string marker_modified(ctx->image_marker); projector_type proj_type = clip_get_projector_type(ctx->ctx_clip); + // a bit hacky here, but works for now // for some models, we need to add prefix and suffix to the image embeddings - if (proj_type == PROJECTOR_TYPE_GEMMA3) { + if (clip_is_gemma3(ctx->ctx_clip)) { + // gemma 3 // ... (image embeddings) ... marker_modified = "" + ctx->image_marker + ""; string_replace_all(prompt_modified, ctx->image_marker, marker_modified); + + } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) { + // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215 + marker_modified = "" + ctx->image_marker + ""; + string_replace_all(prompt_modified, ctx->image_marker, marker_modified); } + // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix + // for glm-edge, we don't need to add because the tokens are already in the returned embeddings + + // TODO @ngxson : glm-edge : remove BOI / EOI tokens embeddings, decode them as normal tokens + std::vector parts = string_split_str(prompt_modified, ctx->image_marker); output.clear(); output.reserve(parts.size()); size_t i_img = 0; + // utility for adding raw tokens + auto add_text_chunk = [&output](std::vector && tokens) { + mtmd_input_chunk chunk{ + MTMD_INPUT_CHUNK_TYPE_TEXT, + std::move(tokens), + {}, + }; + output.emplace_back(std::move(chunk)); + }; + + // utility for splitting batch of multiple images into chunks of batch having single images + auto split_batch_to_chunk = [&ctx](clip_image_f32_batch && batch_f32, const std::string & id) { + std::vector chunks; + + for (auto & entry : batch_f32.entries) { + mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); + image_tokens->nx = clip_n_patches(ctx->ctx_clip); + image_tokens->ny = 1; + image_tokens->batch_f32.entries.push_back(std::move(entry)); + image_tokens->id = id; + + mtmd_input_chunk chunk{ + MTMD_INPUT_CHUNK_TYPE_IMAGE, + {}, + std::move(image_tokens), + }; + chunks.emplace_back(std::move(chunk)); + } + + return chunks; + }; + for (const auto & part : parts) { //printf("tokenizing part: 
%s\n", part.c_str()); bool add_bos = &parts.front() == ∂ @@ -139,12 +257,13 @@ int32_t mtmd_tokenize(mtmd_context * ctx, return 1; } - // shim layer + // convert mtmd_bitmap to clip_image_u8 clip_image_u8_ptr img_u8(clip_image_u8_init()); img_u8->nx = bitmaps[i_img].nx; img_u8->ny = bitmaps[i_img].ny; img_u8->buf.resize(bitmaps[i_img].data.size()); std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3); + clip_image_size img_u8_size{img_u8->nx, img_u8->ny}; // preprocess image clip_image_f32_batch batch_f32; @@ -154,19 +273,70 @@ int32_t mtmd_tokenize(mtmd_context * ctx, return 2; } - mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); - image_tokens->nx = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image - image_tokens->ny = 1; // TODO - image_tokens->batch_f32 = std::move(batch_f32); - image_tokens->id = bitmaps[i_img].id; // optional + if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) { + // split batch into chunks of single images + auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img].id); + GGML_ASSERT(chunks.size() > 0); - mtmd_input_chunk chunk{ - MTMD_INPUT_CHUNK_TYPE_IMAGE, - {}, - std::move(image_tokens), - }; - output.emplace_back(std::move(chunk)); - i_img++; + // add overview image + add_text_chunk({ctx->tok_ov_img_start}); + output.emplace_back(std::move(chunks.front())); + chunks.erase(chunks.begin()); + add_text_chunk({ctx->tok_ov_img_end}); + + // add slices + if (!chunks.empty()) { + clip_add_load_image_size(ctx->ctx_clip, &img_u8_size); + int n_col = clip_uhd_num_image_embeds_col(ctx->ctx_clip); + int n_row = (int)chunks.size() / n_col; + GGML_ASSERT(n_row * n_col == (int)chunks.size()); + if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) { + add_text_chunk({ctx->tok_slices_start}); + } + for (int y = 0; y < n_row; y++) { + for (int x = 0; x < n_col; x++) { + if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) { + add_text_chunk({ctx->tok_sli_img_start}); + } + output.emplace_back(std::move(chunks[y * n_col + x])); + if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) { + add_text_chunk({ctx->tok_sli_img_end}); + } + } + if (ctx->tok_row_end != LLAMA_TOKEN_NULL && y != n_row - 1) { + add_text_chunk({ctx->tok_row_end}); + } + } + if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) { + add_text_chunk({ctx->tok_slices_end}); + } + } + + } else { + mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); + image_tokens->nx = clip_n_patches(ctx->ctx_clip) * batch_f32.entries.size(); // TODO @ngxson : use clip_n_patches_by_image + image_tokens->ny = 1; // TODO + image_tokens->batch_f32 = std::move(batch_f32); + image_tokens->id = bitmaps[i_img].id; // optional + + LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx); + LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny); + LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size()); + + if (clip_is_glm(ctx->ctx_clip)) { + // glm-edge + image_tokens->nx += 2; // add 2 for the begin_of_image and end_of_image token embeddings + } + + mtmd_input_chunk chunk{ + MTMD_INPUT_CHUNK_TYPE_IMAGE, + {}, + std::move(image_tokens), + }; + output.emplace_back(std::move(chunk)); + } + + i_img++; // move to next image } } @@ -198,11 +368,35 @@ std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) { int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) { int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip); ctx->image_embd_v.resize(image_tokens->n_tokens() 
* n_mmproj_embd); - bool ok = clip_image_batch_encode( - ctx->ctx_clip, - ctx->n_threads, - &image_tokens->batch_f32, - ctx->image_embd_v.data()); + bool ok = false; + + // only effective for minicpmv and qwen2vl, other models will ignore load_image_size + { + clip_image_size slice_size{ + image_tokens->batch_f32.entries[0]->nx, + image_tokens->batch_f32.entries[0]->ny}; + clip_add_load_image_size(ctx->ctx_clip, &slice_size); + } + + if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) { + // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() + const auto & entries = image_tokens->batch_f32.entries; + for (size_t i = 0; i < entries.size(); i++) { + int n_tokens_per_image = clip_n_patches(ctx->ctx_clip); + ok = clip_image_encode( + ctx->ctx_clip, + ctx->n_threads, + entries[i].get(), + ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image); + } + } else { + ok = clip_image_batch_encode( + ctx->ctx_clip, + ctx->n_threads, + &image_tokens->batch_f32, + ctx->image_embd_v.data()); + } + return ok ? 0 : 1; } @@ -268,28 +462,31 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, int32_t ret; llama_pos n_past = pos0; llama_batch text_batch = llama_batch_init(n_batch, 0, 1); + int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip); for (auto & chunk : chunks) { bool is_last = &chunk == &chunks.back(); if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) { - // TODO @ngxson : may need to split into smaller batches text_batch.n_tokens = chunk.tokens_text.size(); - for (size_t i = 0; i < chunk.tokens_text.size(); i++) { - text_batch.token [i] = chunk.tokens_text[i]; - text_batch.pos [i] = n_past++; - text_batch.n_seq_id[i] = 1; - text_batch.seq_id [i][0] = seq_id; - text_batch.logits [i] = false; - } - if (is_last) { - // always get logits for last input chunk - text_batch.logits[text_batch.n_tokens - 1] = true; - } - ret = llama_decode(lctx, text_batch); - if (ret != 0) { - LOG_ERR("failed to decode text\n"); - llama_batch_free(text_batch); - return ret; + size_t i = 0; + while (i < chunk.tokens_text.size()) { // split into batches + for (; i < chunk.tokens_text.size() && text_batch.n_tokens < n_batch; i++) { + text_batch.token [i] = chunk.tokens_text[i]; + text_batch.pos [i] = n_past++; + text_batch.n_seq_id[i] = 1; + text_batch.seq_id [i][0] = seq_id; + text_batch.logits [i] = false; + } + if (is_last) { + // always get logits for last input chunk + text_batch.logits[text_batch.n_tokens - 1] = true; + } + ret = llama_decode(lctx, text_batch); + if (ret != 0) { + LOG_ERR("failed to decode text\n"); + llama_batch_free(text_batch); + return ret; + } } } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { @@ -297,7 +494,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, GGML_ASSERT(chunk.tokens_image != nullptr); int64_t t0 = ggml_time_ms(); if (ctx->print_timings) { - LOG_INF("encoding image...\n"); + LOG_INF("encoding image or slice...\n"); } ret = mtmd_encode(ctx, chunk.tokens_image.get()); if (ret != 0) { @@ -306,24 +503,47 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, return ret; } if (ctx->print_timings) { - LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0); + LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0); } int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get()); + int32_t i_batch = 0; + int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; float * embd = mtmd_get_output_embd(ctx); - decode_embd_batch batch_img(embd, 
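A note on the non-batched fallback above: llava-style encoders are run one image (or slice) at a time, so call i writes its embeddings at an offset of i * n_tokens_per_image * n_mmproj_embd floats into the shared output buffer. A minimal sketch of that layout, with made-up sizes:

// sizes below are assumptions for illustration only
const size_t n_images           = 3;
const size_t n_tokens_per_image = 256;  // clip_n_patches()
const size_t n_mmproj_embd      = 4096; // clip_n_mmproj_embd()

std::vector<float> embd(n_images * n_tokens_per_image * n_mmproj_embd);
for (size_t i = 0; i < n_images; i++) {
    float * dst = embd.data() + i * n_tokens_per_image * n_mmproj_embd;
    // encode image i into dst (n_tokens_per_image * n_mmproj_embd floats)
}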
n_tokens, n_past, 0); - int64_t t1 = ggml_time_ms(); - ret = llama_decode(lctx, batch_img.batch); - if (ret != 0) { - LOG_ERR("failed to decode image\n"); - llama_batch_free(text_batch); - return ret; - } - if (ctx->print_timings) { - LOG_INF("image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1); + + if (mtmd_decode_use_non_causal(ctx)) { + llama_set_causal_attn(lctx, false); + // TODO @ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image } - n_past += n_tokens; + while (i_batch < n_img_batches) { // split into batches + int32_t pos_offset = i_batch*n_batch; + int32_t n_tokens_batch = std::min(n_batch, n_tokens - pos_offset); + float * embd_batch = embd + pos_offset*n_mmproj_embd; + decode_embd_batch batch_img(embd_batch, n_tokens_batch, n_past, 0); + + printf("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch); + + int64_t t1 = ggml_time_ms(); + ret = llama_decode(lctx, batch_img.batch); + if (ret != 0) { + LOG_ERR("failed to decode image\n"); + llama_set_causal_attn(lctx, true); // restore causal attn + llama_batch_free(text_batch); + return ret; + } + + if (ctx->print_timings) { + LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1); + } + + i_batch++; + n_past += n_tokens_batch; + } + + if (mtmd_decode_use_non_causal(ctx)) { + llama_set_causal_attn(lctx, true); + } } else { GGML_ASSERT(false && "chunk type not supported"); diff --git a/examples/llava/tests.sh b/examples/llava/tests.sh index cc9bda876..8752fc267 100755 --- a/examples/llava/tests.sh +++ b/examples/llava/tests.sh @@ -17,26 +17,39 @@ cd $PROJ_ROOT arr_bin=() arr_hf=() +arr_tmpl=() # chat template add_test() { local bin=$1 local hf=$2 + local tmpl=${3:-""} # default to empty string if not provided arr_bin+=("$bin") arr_hf+=("$hf") + arr_tmpl+=("$tmpl") } -add_test "llama-gemma3-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M" -add_test "llama-llava-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K" -add_test "llama-llava-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" -add_test "llama-llava-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M" -add_test "llama-llava-cli" "second-state/Llava-v1.5-7B-GGUF:Q2_K" -add_test "llama-llava-cli" "cjpais/llava-1.6-mistral-7b-gguf:Q3_K" -add_test "llama-llava-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M" -add_test "llama-minicpmv-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted -add_test "llama-minicpmv-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K" -add_test "llama-minicpmv-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0" +add_test "llama-mtmd-cli" "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0" +add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M" +add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0" +add_test "llama-mtmd-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M" +add_test "llama-mtmd-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek" +add_test "llama-mtmd-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M" +add_test "llama-mtmd-cli" "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna" +add_test "llama-mtmd-cli" "cjpais/llava-1.6-mistral-7b-gguf:Q3_K" "vicuna" +add_test "llama-mtmd-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M" +add_test "llama-mtmd-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted +add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K" +add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0" add_test "llama-qwen2vl-cli" 
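In the mtmd_helper_eval() hunk above, image embeddings are now decoded in slices of at most n_batch positions, with GGML_PAD(n_tokens, n_batch) / n_batch as the ceil-division for the batch count. A small sketch of the same slicing arithmetic (sizes are illustrative):

const int32_t n_tokens      = 1024;                               // image tokens
const int32_t n_batch       = 512;                                // logical batch size
const int32_t n_img_batches = (n_tokens + n_batch - 1) / n_batch; // == GGML_PAD(n_tokens, n_batch)/n_batch
for (int32_t i = 0; i < n_img_batches; i++) {
    const int32_t pos_offset     = i * n_batch;
    const int32_t n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
    // decode n_tokens_batch embeddings starting at embd + pos_offset * n_mmproj_embd,
    // occupying positions [n_past, n_past + n_tokens_batch)
}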
"bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M" +# these models always give the wrong answer, not sure why +# add_test "llama-mtmd-cli" "ggml-org/SmolVLM-Instruct-GGUF:Q4_K_M" +# add_test "llama-mtmd-cli" "ggml-org/SmolVLM-256M-Instruct-GGUF:Q8_0" +# add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-256M-Video-Instruct-GGUF:Q8_0" + +# this model has broken chat template, not usable +# add_test "llama-mtmd-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K" + ############### cmake --build build -j --target "${arr_bin[@]}" @@ -46,12 +59,20 @@ arr_res=() for i in "${!arr_bin[@]}"; do bin="${arr_bin[$i]}" hf="${arr_hf[$i]}" + tmpl="${arr_tmpl[$i]}" echo "Running test with binary: $bin and HF model: $hf" echo "" echo "" - output=$("$PROJ_ROOT/build/bin/$bin" -hf "$hf" --image $SCRIPT_DIR/test-1.jpeg -p "what is the publisher name of the newspaper?" --temp 0 2>&1 | tee /dev/tty) + output=$(\ + "$PROJ_ROOT/build/bin/$bin" \ + -hf "$hf" \ + --image $SCRIPT_DIR/test-1.jpeg \ + -p "what is the publisher name of the newspaper?" \ + --temp 0 -n 128 \ + ${tmpl:+--chat-template "$tmpl"} \ + 2>&1 | tee /dev/tty) echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log diff --git a/ggml/src/ggml-cpu/cpu-feats-x86.cpp b/ggml/src/ggml-cpu/cpu-feats-x86.cpp index 902ee4346..d775a0363 100644 --- a/ggml/src/ggml-cpu/cpu-feats-x86.cpp +++ b/ggml/src/ggml-cpu/cpu-feats-x86.cpp @@ -263,7 +263,7 @@ void test_x86_is() { static int ggml_backend_cpu_x86_score() { // FIXME: this does not check for OS support - int score = 0; + int score = 1; cpuid_x86 is; #ifdef GGML_FMA diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index a968f5945..981deb5b0 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -44,8 +44,8 @@ static struct ggml_backend_device g_ggml_backend_metal_device; // note: assumes single GPU device - the default one // TODO: support multiple GPU devices static struct ggml_backend_metal_device_context { - id mtl_device; - int mtl_device_ref_count; + id mtl_device; + int mtl_device_ref_count; id mtl_library; bool has_simdgroup_reduction; @@ -490,7 +490,259 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_COUNT }; +// +// ggml_metal_heap +// + +struct ggml_metal_heap { + // number of times the heap was unused + int n_unused; + + // total number of buffer allocations in this heap across all computes + int64_t n_alloc; + + // current offset in the heap - we reset this after each node in order to reuse the memory + size_t offs; + + // the currently allocated MTLBuffer objects in this heap + id obj; + + NSMutableArray * bufs; +}; + +static struct ggml_metal_heap * ggml_metal_heap_init(id device, size_t size) { + struct ggml_metal_heap * heap = calloc(1, sizeof(struct ggml_metal_heap)); + + MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; + desc.storageMode = MTLStorageModePrivate; + desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; + desc.type = MTLHeapTypePlacement; + desc.size = size; + + heap->n_unused = 0; + heap->n_alloc = 0; + + heap->obj = [device newHeapWithDescriptor:desc]; + if (!heap->obj) { + GGML_LOG_ERROR("%s: error: failed to create MTLHeap with size %zu\n", __func__, size); + + free(heap); + + return false; + } + + [desc release]; + + heap->bufs = [[NSMutableArray alloc] init]; + + return heap; +} + +static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { + heap->offs = 0; + + // count how many graph computes the heap ended up being unused + if ([heap->bufs count] > 0) { + heap->n_unused = 0; + } else { + 
heap->n_unused++; + } + + for (id buf in heap->bufs) { + [buf release]; + } + [heap->bufs removeAllObjects]; + + // tell the OS that it can reuse this memory if needed + // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc + [heap->obj setPurgeableState:MTLPurgeableStateVolatile]; +} + +static void ggml_metal_heap_free(struct ggml_metal_heap * heap) { + if (heap == nil) { + return; + } + + ggml_metal_heap_reset(heap); + + [heap->obj release]; + [heap->bufs release]; + + free(heap); +} + +@interface ggml_metal_heap_ptr : NSObject + +@property (nonatomic, assign) struct ggml_metal_heap * data; + +@end + +@implementation ggml_metal_heap_ptr +@end + +// +// ggml_metal_mem_pool +// + +struct ggml_metal_mem_pool { + id device; + + int n_heaps; // total number of heaps ever created (including those that were removed) + + NSMutableArray * heaps; + NSMutableArray * heaps_to_remove; +}; + +static struct ggml_metal_mem_pool * ggml_metal_mem_pool_init(void) { + struct ggml_metal_mem_pool * mem_pool = calloc(1, sizeof(struct ggml_metal_mem_pool)); + + mem_pool->n_heaps = 0; + + mem_pool->heaps = [[NSMutableArray alloc] init]; + mem_pool->heaps_to_remove = [[NSMutableArray alloc] init]; + + return mem_pool; +} + +static void ggml_metal_mem_pool_free(struct ggml_metal_mem_pool * mem_pool) { + GGML_LOG_DEBUG("%s: freeing memory pool, num heaps = %zu (total = %d)\n", __func__, [mem_pool->heaps count], mem_pool->n_heaps); + + size_t size_all = 0; + size_t size_cur = 0; + + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + GGML_LOG_DEBUG("%s: heap: %p\n", __func__, (void *) ptr.data); + GGML_LOG_DEBUG("%s: n_alloc: %" PRId64 "\n", __func__, ptr.data->n_alloc); + GGML_LOG_DEBUG("%s: n_unused: %d\n", __func__, ptr.data->n_unused); + GGML_LOG_DEBUG("%s: size: %.2f MiB\n", __func__, [ptr.data->obj size] / 1024.0 / 1024.0); + GGML_LOG_DEBUG("%s: bufs: %zu\n", __func__, [ptr.data->bufs count]); + + if ([ptr.data->bufs count] > 0) { + size_cur += [ptr.data->obj size]; + } + size_all += [ptr.data->obj size]; + + ggml_metal_heap_free(ptr.data); + [ptr release]; + } + [mem_pool->heaps release]; + [mem_pool->heaps_to_remove release]; + + if (size_all > 0) { + GGML_LOG_DEBUG("%s: size_all: %.2f MiB\n", __func__, size_all / 1024.0 / 1024.0); + GGML_LOG_DEBUG("%s: size_cur: %.2f MiB\n", __func__, size_cur / 1024.0 / 1024.0); + } + + free(mem_pool); +} + +static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) { + for (NSUInteger i = 0; i < [mem_pool->heaps count]; i++) { + ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:i]; + + struct ggml_metal_heap * heap = ptr.data; + ggml_metal_heap_reset(heap); + + // if the heap hasn't been used for a while, remove it + if (heap->n_unused >= 128) { + [mem_pool->heaps_to_remove addObject:@(i)]; + } + } + + if (mem_pool->heaps_to_remove.count > 0) { + for (NSUInteger i = 0; i < [mem_pool->heaps_to_remove count]; i++) { + NSUInteger index = [[mem_pool->heaps_to_remove objectAtIndex:i] intValue]; + ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:index]; + + struct ggml_metal_heap * heap = ptr.data; + ggml_metal_heap_free(heap); + + [mem_pool->heaps removeObjectAtIndex:index]; + [ptr release]; + } + + [mem_pool->heaps_to_remove removeAllObjects]; + } +} + +static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) { + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + ptr.data->offs = 0; + } +} + +static id ggml_metal_mem_pool_alloc(struct ggml_metal_mem_pool * mem_pool, size_t size) 
{ + const size_t alignment = 32; + + const size_t size_aligned = GGML_PAD(size, alignment); + + // try one of the existing heaps + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + struct ggml_metal_heap * heap = ptr.data; + if (heap->offs + size_aligned <= [heap->obj size]) { + // if this is the first buffer in the heap for the current command buffer, tell the OS that + // it cannot free the memory used by the heap + // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc + if ([heap->bufs count] == 0) { + [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; + } + + id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; + if (buf == nil) { + GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned); + return nil; + } + + heap->n_alloc++; + heap->offs += size_aligned; + + [heap->bufs addObject:buf]; + + return buf; + } + } + + // create a new heap that can fit this buffer + ggml_metal_heap_ptr * heap_ptr = [ggml_metal_heap_ptr new]; + + struct ggml_metal_heap * heap = ggml_metal_heap_init(mem_pool->device, size_aligned); + if (heap == NULL) { + GGML_LOG_ERROR("%s: error: failed to create heap of size %zu\n", __func__, size_aligned); + return NULL; + } + + //GGML_LOG_DEBUG("%s: creating new heap of size %zu, got %zu\n", __func__, size_aligned, [heap->obj size]); + + heap_ptr.data = heap; + ggml_metal_heap_reset(heap); + + [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; + id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; + if (buf == nil) { + GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned); + return NULL; + } + + heap->n_alloc++; + heap->offs += size_aligned; + + [heap->bufs addObject:buf]; + + [mem_pool->heaps addObject:heap_ptr]; + mem_pool->n_heaps++; + + return buf; +} + +struct ggml_metal_command_buffer { + id obj; + + // each command buffer has a memory pool from which it can allocate temporary buffers during the compute + struct ggml_metal_mem_pool * mem_pool; +}; + struct ggml_backend_metal_context { + id device; id queue; dispatch_queue_t d_queue; @@ -515,7 +767,7 @@ struct ggml_backend_metal_context { void (^encode_async)(size_t ith); // n_cb command buffers + 1 used by the main thread - id command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; + struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; // abort ggml_metal_graph_compute if callback returns true ggml_abort_callback abort_callback; @@ -705,9 +957,11 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de struct ggml_backend_metal_device_context * ctx_dev = dev->context; id device = ggml_backend_metal_device_acq(ctx_dev); + GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); - ctx->queue = [device newCommandQueue]; + ctx->device = device; + ctx->queue = [device newCommandQueue]; if (ctx->queue == nil) { GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); return NULL; @@ -768,7 +1022,10 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de ctx->gf = nil; ctx->encode_async = nil; for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { - ctx->command_buffers[i] = nil; + ctx->cmd_bufs[i].obj = nil; + + ctx->cmd_bufs[i].mem_pool = ggml_metal_mem_pool_init(); + ctx->cmd_bufs[i].mem_pool->device = device; } #if TARGET_OS_OSX || 
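The allocator above is a simple bump/arena allocator over placement heaps: the request is padded to 32-byte alignment, placed at the current offset if it fits, and the offset advances; otherwise a new heap of at least that size is created. The core rule, abstracted away from Metal (a sketch, not part of the patch):

struct arena { size_t size; size_t offs; };

// returns true and the placement offset if the padded request fits, false if a new heap is needed
static bool arena_alloc(arena & a, size_t size, size_t & out_offs) {
    const size_t aligned = (size + 31) & ~(size_t) 31; // GGML_PAD(size, 32)
    if (a.offs + aligned > a.size) {
        return false;
    }
    out_offs = a.offs;
    a.offs  += aligned;
    return true;
}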
(TARGET_OS_IOS && __clang_major__ >= 15) @@ -1181,6 +1438,12 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { [ctx->queue release]; + for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { + // ctx->cmd_bufs[i].obj is auto released + + ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool); + } + dispatch_release(ctx->d_queue); free(ctx); @@ -1486,10 +1749,11 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex } } -static void ggml_metal_encode_node( +static bool ggml_metal_encode_node( ggml_backend_t backend, int idx, - id encoder) { + id encoder, + struct ggml_metal_mem_pool * mem_pool) { struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -1505,7 +1769,7 @@ static void ggml_metal_encode_node( struct ggml_tensor * dst = node; if (ggml_is_empty(dst)) { - return; + return true; } switch (dst->op) { @@ -1516,7 +1780,7 @@ static void ggml_metal_encode_node( case GGML_OP_PERMUTE: { // noop -> next node - } return; + } return true; default: { } break; @@ -1527,6 +1791,8 @@ static void ggml_metal_encode_node( GGML_ABORT("unsupported op"); } + ggml_metal_mem_pool_clear(mem_pool); + const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; const int64_t ne02 = src0 ? src0->ne[2] : 0; @@ -2173,26 +2439,76 @@ static void ggml_metal_encode_node( const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - ggml_metal_kargs_soft_max args = { +// use this branch to test the ggml_metal_mem_pool functionality +#if 0 + // cpy to tmp buffer in MTLHeap + + id h_src0 = h_src0 = ggml_metal_mem_pool_alloc(mem_pool, ggml_nbytes(src0)); + if (!h_src0) { + GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, ggml_nbytes(src0)); + return false; + } + + offs_src0 = 0; + + ggml_metal_kargs_cpy args_cpy = { /*.ne00 =*/ ne00, /*.ne01 =*/ ne01, /*.ne02 =*/ ne02, - /*.scale =*/ scale, - /*.max_bias =*/ max_bias, - /*.m0 =*/ m0, - /*.m1 =*/ m1, + /*.ne03 =*/ ne03, + /*.nb00 =*/ nb00, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.ne0 =*/ ne00, + /*.ne1 =*/ ne01, + /*.ne2 =*/ ne02, + /*.ne3 =*/ ne03, + /*.nb0 =*/ nb00, + /*.nb1 =*/ nb01, + /*.nb2 =*/ nb02, + /*.nb3 =*/ nb03, + }; + + if (src0->type == GGML_TYPE_F16) { + [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline]; + } else { + [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline]; + } + [encoder setBytes:&args_cpy length:sizeof(args_cpy) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:h_src0 offset:0 atIndex:2]; + + GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); + int nth_cpy = MIN(1024, ne00 / ggml_blck_size(src0->type)); + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth_cpy, 1, 1)]; + +#else + id h_src0 = id_src0; +#endif + // softmax + + ggml_metal_kargs_soft_max args = { + /*.ne00 =*/ ne00, + /*.ne01 =*/ ne01, + /*.ne02 =*/ ne02, + /*.scale =*/ scale, + /*.max_bias =*/ max_bias, + /*.m0 =*/ m0, + /*.m1 =*/ m1, /*.n_head_log2 =*/ n_head_log2, }; [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:h_src0 offset:offs_src0 atIndex:0]; if (id_src1) { - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder 
setBuffer:id_src1 offset:offs_src1 atIndex:1]; } else { - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:h_src0 offset:offs_src0 atIndex:1]; } - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&args length:sizeof(args) atIndex:3]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&args length:sizeof(args) atIndex:3]; [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; @@ -4601,6 +4917,8 @@ static void ggml_metal_encode_node( GGML_ABORT("fatal error"); } } + + return true; } static enum ggml_status ggml_metal_graph_compute( @@ -4654,25 +4972,25 @@ static enum ggml_status ggml_metal_graph_compute( } // the main thread commits the first few commands immediately - // command_buffer[n_cb] + // cmd_buf[n_cb] { - id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; - ctx->command_buffers[n_cb] = command_buffer; + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->cmd_bufs[n_cb].obj = cmd_buf; - [command_buffer enqueue]; + [cmd_buf enqueue]; ctx->encode_async(n_cb); } // prepare the rest of the command buffers asynchronously - // command_buffer[0.. n_cb) + // cmd_buf[0.. n_cb) for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; - ctx->command_buffers[cb_idx] = command_buffer; + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->cmd_bufs[cb_idx].obj = cmd_buf; // always enqueue the first two command buffers // enqueue all of the command buffers if we don't need to abort if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer enqueue]; + [cmd_buf enqueue]; } } @@ -4681,14 +4999,14 @@ static enum ggml_status ggml_metal_graph_compute( // wait for completion and check status of each command buffer // needed to detect if the device ran out-of-memory for example (#1881) { - id command_buffer = ctx->command_buffers[n_cb]; - [command_buffer waitUntilCompleted]; + id cmd_buf = ctx->cmd_bufs[n_cb].obj; + [cmd_buf waitUntilCompleted]; - MTLCommandBufferStatus status = [command_buffer status]; + MTLCommandBufferStatus status = [cmd_buf status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; @@ -4696,20 +5014,20 @@ static enum ggml_status ggml_metal_graph_compute( } for (int i = 0; i < n_cb; ++i) { - id command_buffer = ctx->command_buffers[i]; - [command_buffer waitUntilCompleted]; + id cmd_buf = ctx->cmd_bufs[i].obj; + [cmd_buf waitUntilCompleted]; - MTLCommandBufferStatus status = [command_buffer status]; + MTLCommandBufferStatus status = [cmd_buf status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; } - id next_buffer = (i + 1 < n_cb ? ctx->command_buffers[i + 1] : nil); + id next_buffer = (i + 1 < n_cb ? 
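To summarize the contract introduced above: each command buffer owns a ggml_metal_mem_pool, the pool is reset once per command buffer, cleared at the start of every node by ggml_metal_encode_node(), and an op that cannot obtain a temporary buffer returns false so the encoding loop stops early. A sketch of the allocation pattern an op is expected to follow (Objective-C to match the surrounding code; the op itself is hypothetical):

static bool ggml_metal_encode_some_op(struct ggml_metal_mem_pool * mem_pool, size_t tmp_size) {
    // mem_pool has already been cleared for this node by ggml_metal_encode_node()
    id<MTLBuffer> h_tmp = ggml_metal_mem_pool_alloc(mem_pool, tmp_size);
    if (h_tmp == nil) {
        GGML_LOG_ERROR("%s: failed to allocate %zu bytes from the memory pool\n", __func__, tmp_size);
        return false; // the caller stops encoding the remaining nodes of this command buffer
    }
    // ... bind h_tmp as a scratch buffer and encode the kernel dispatch ...
    return true;
}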
ctx->cmd_bufs[i + 1].obj : nil); if (!next_buffer) { continue; } @@ -5092,8 +5410,9 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const int n_nodes_per_cb = ctx->n_nodes_per_cb; - id command_buffer = ctx->command_buffers[cb_idx]; - id encoder = [command_buffer computeCommandEncoder]; + id cmd_buf = ctx->cmd_bufs[cb_idx].obj; + + id encoder = [cmd_buf computeCommandEncoder]; int node_start = 0; int node_end = n_nodes_0; @@ -5105,22 +5424,29 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const bool should_capture = ctx->capture_next_compute; + struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool; + ggml_metal_mem_pool_reset(mem_pool); + for (int idx = node_start; idx < node_end; ++idx) { if (should_capture) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - ggml_metal_encode_node(backend, idx, encoder); + const bool res = ggml_metal_encode_node(backend, idx, encoder, mem_pool); if (should_capture) { [encoder popDebugGroup]; } + + if (!res) { + break; + } } [encoder endEncoding]; if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer commit]; + [cmd_buf commit]; } }); } diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 8fcde2626..59510bd0c 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -218,17 +218,41 @@ class Keys: TYPE = "adapter.type" LORA_ALPHA = "adapter.lora.alpha" + class ClipVision: + PROJECTOR_TYPE = "clip.projector_type" + HAS_VISION_ENCODER = "clip.has_vision_encoder" + HAS_LLAVA_PROJECTOR = "clip.has_llava_projector" + IMAGE_SIZE = "clip.vision.image_size" + PATCH_SIZE = "clip.vision.patch_size" + EMBEDDING_LENGTH = "clip.vision.embedding_length" + FEED_FORWARD_LENGTH = "clip.vision.feed_forward_length" + PROJECTION_DIM = "clip.vision.projection_dim" + BLOCK_COUNT = "clip.vision.block_count" + IMAGE_MEAN = "clip.vision.image_mean" + IMAGE_STD = "clip.vision.image_std" + USE_GELU = "clip.use_gelu" + USE_SILU = "clip.use_silu" + + class Attention: + HEAD_COUNT = "clip.vision.attention.head_count" + LAYERNORM_EPS = "clip.vision.attention.layer_norm_epsilon" + + class Projector: + SCALE_FACTOR = "clip.vision.projector.scale_factor" + # # recommended mapping of model tensor names for storage in gguf # class GGUFType: - MODEL = "model" - ADAPTER = "adapter" + MODEL = "model" + ADAPTER = "adapter" + CLIP_VISION = "clip-vision" class MODEL_ARCH(IntEnum): + CLIP_VISION = auto() # dummy arch for clip.cpp LLAMA = auto() LLAMA4 = auto() DECI = auto() @@ -297,6 +321,16 @@ class MODEL_ARCH(IntEnum): BAILINGMOE = auto() +class VISION_PROJECTOR_TYPE(IntEnum): + MLP = auto() + LDP = auto() + LDPV2 = auto() + RESAMPLER = auto() + GLM_EDGE = auto() + MERGER = auto() + GEMMA3 = auto() + + class MODEL_TENSOR(IntEnum): TOKEN_EMBD = auto() TOKEN_EMBD_NORM = auto() @@ -436,9 +470,41 @@ class MODEL_TENSOR(IntEnum): POSNET_ATTN_K = auto() POSNET_ATTN_V = auto() POSNET_ATTN_OUT = auto() + # vision + V_MMPROJ = auto() + V_MMPROJ_FC = auto() + V_MMPROJ_MLP = auto() + V_MMPROJ_PEG = auto() + V_ENC_EMBD_CLS = auto() + V_ENC_EMBD_PATCH = auto() + V_ENC_EMBD_POS = auto() + V_ENC_ATTN_Q = auto() + V_ENC_ATTN_K = auto() + V_ENC_ATTN_V = auto() + V_ENC_INPUT_NORM = auto() + V_ENC_OUTPUT = auto() + V_ENC_OUTPUT_NORM = auto() + V_ENC_FFN_UP = auto() + V_ENC_FFN_DOWN = auto() + V_PRE_NORM = auto() + V_POST_NORM = auto() + V_MM_INP_PROJ = auto() # gemma3 + V_MM_SOFT_EMB_NORM = auto() # gemma3 + 
V_RESMPL_POS_EMBD_K = auto() # minicpmv + V_RESMPL_ATTN_Q = auto() # minicpmv + V_RESMPL_ATTN_K = auto() # minicpmv + V_RESMPL_ATTN_V = auto() # minicpmv + V_RESMPL_ATTN_OUT = auto() # minicpmv + V_RESMPL_KV = auto() # minicpmv + V_RESMPL_KV_NORM = auto() # minicpmv + V_RESMPL_POST_NORM = auto() # minicpmv + V_RESMPL_Q_NORM = auto() # minicpmv + V_RESMPL_PROJ = auto() # minicpmv + V_RESMPL_QUERY = auto() # minicpmv MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { + MODEL_ARCH.CLIP_VISION: "clip", # dummy arch for clip.cpp MODEL_ARCH.LLAMA: "llama", MODEL_ARCH.LLAMA4: "llama4", MODEL_ARCH.DECI: "deci", @@ -507,6 +573,16 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.BAILINGMOE: "bailingmoe", } +VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { + VISION_PROJECTOR_TYPE.MLP: "mlp", + VISION_PROJECTOR_TYPE.LDP: "ldp", + VISION_PROJECTOR_TYPE.LDPV2: "ldpv2", + VISION_PROJECTOR_TYPE.RESAMPLER: "resampler", + VISION_PROJECTOR_TYPE.GLM_EDGE: "adapter", + VISION_PROJECTOR_TYPE.MERGER: "qwen2vl_merger", + VISION_PROJECTOR_TYPE.GEMMA3: "gemma3", +} + TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.TOKEN_EMBD: "token_embd", MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", @@ -646,9 +722,72 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k", MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v", MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output", + # vision + MODEL_TENSOR.V_MMPROJ: "mm.{bid}", + MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc", + MODEL_TENSOR.V_MMPROJ_MLP: "mm.model.mlp.{bid}", + MODEL_TENSOR.V_MMPROJ_PEG: "mm.model.peg.{bid}", + MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd", + MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd", + MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd", + MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q", + MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k", + MODEL_TENSOR.V_ENC_ATTN_V: "v.blk.{bid}.attn_v", + MODEL_TENSOR.V_ENC_INPUT_NORM: "v.blk.{bid}.ln1", + MODEL_TENSOR.V_ENC_OUTPUT: "v.blk.{bid}.attn_out", + MODEL_TENSOR.V_ENC_OUTPUT_NORM: "v.blk.{bid}.ln2", + MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up", + MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down", + MODEL_TENSOR.V_PRE_NORM: "v.pre_ln", + MODEL_TENSOR.V_POST_NORM: "v.post_ln", + MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection", + MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", + MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k", + MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q", + MODEL_TENSOR.V_RESMPL_ATTN_K: "resampler.attn.k", + MODEL_TENSOR.V_RESMPL_ATTN_V: "resampler.attn.v", + MODEL_TENSOR.V_RESMPL_ATTN_OUT: "resampler.attn.out", + MODEL_TENSOR.V_RESMPL_KV: "resampler.kv", + MODEL_TENSOR.V_RESMPL_KV_NORM: "resampler.ln_kv", + MODEL_TENSOR.V_RESMPL_POST_NORM: "resampler.ln_post", + MODEL_TENSOR.V_RESMPL_Q_NORM: "resampler.ln_q", + MODEL_TENSOR.V_RESMPL_PROJ: "resampler.proj", + MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { + MODEL_ARCH.CLIP_VISION: [ + MODEL_TENSOR.V_MMPROJ, + MODEL_TENSOR.V_MMPROJ_FC, + MODEL_TENSOR.V_MMPROJ_MLP, + MODEL_TENSOR.V_MMPROJ_PEG, + MODEL_TENSOR.V_ENC_EMBD_CLS, + MODEL_TENSOR.V_ENC_EMBD_PATCH, + MODEL_TENSOR.V_ENC_EMBD_POS, + MODEL_TENSOR.V_ENC_ATTN_Q, + MODEL_TENSOR.V_ENC_ATTN_K, + MODEL_TENSOR.V_ENC_ATTN_V, + MODEL_TENSOR.V_ENC_INPUT_NORM, + MODEL_TENSOR.V_ENC_OUTPUT, + MODEL_TENSOR.V_ENC_OUTPUT_NORM, + MODEL_TENSOR.V_ENC_FFN_UP, + MODEL_TENSOR.V_ENC_FFN_DOWN, + MODEL_TENSOR.V_PRE_NORM, + MODEL_TENSOR.V_POST_NORM, + 
MODEL_TENSOR.V_MM_INP_PROJ, + MODEL_TENSOR.V_MM_SOFT_EMB_NORM, + MODEL_TENSOR.V_RESMPL_POS_EMBD_K, + MODEL_TENSOR.V_RESMPL_ATTN_Q, + MODEL_TENSOR.V_RESMPL_ATTN_K, + MODEL_TENSOR.V_RESMPL_ATTN_V, + MODEL_TENSOR.V_RESMPL_ATTN_OUT, + MODEL_TENSOR.V_RESMPL_KV, + MODEL_TENSOR.V_RESMPL_KV_NORM, + MODEL_TENSOR.V_RESMPL_POST_NORM, + MODEL_TENSOR.V_RESMPL_Q_NORM, + MODEL_TENSOR.V_RESMPL_PROJ, + MODEL_TENSOR.V_RESMPL_QUERY, + ], MODEL_ARCH.LLAMA: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -1987,6 +2126,11 @@ class GGUFValueType(IntEnum): raise ValueError(f"Unknown type: {type(val)}") +class VisionProjectorType: + GEMMA3 = "gemma3" + IDEFICS3 = "idefics3" + + # Items here are (block size, type size) QK_K = 256 GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = { diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index aef03db15..48e9a470b 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -931,6 +931,53 @@ class GGUFWriter: def add_eom_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.EOM_ID, id) + # for vision models + + def add_vision_projection_dim(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.PROJECTION_DIM, value) + + def add_vision_has_vision_encoder(self, value: bool) -> None: + self.add_bool(Keys.ClipVision.HAS_VISION_ENCODER, value) + + def add_vision_patch_size(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.PATCH_SIZE, value) + + def add_vision_embedding_length(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.EMBEDDING_LENGTH, value) + + def add_vision_feed_forward_length(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.FEED_FORWARD_LENGTH, value) + + def add_vision_block_count(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.BLOCK_COUNT, value) + + def add_vision_head_count(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.Attention.HEAD_COUNT, value) + + def add_vision_projector_type(self, value: str) -> None: + self.add_string(Keys.ClipVision.PROJECTOR_TYPE, value) + + def add_vision_attention_layernorm_eps(self, value: float) -> None: + self.add_float32(Keys.ClipVision.Attention.LAYERNORM_EPS, value) + + def add_vision_image_size(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value) + + def add_vision_image_mean(self, values: Sequence[float]) -> None: + self.add_array(Keys.ClipVision.IMAGE_MEAN, values) + + def add_vision_image_std(self, values: Sequence[float]) -> None: + self.add_array(Keys.ClipVision.IMAGE_STD, values) + + def add_vision_use_gelu(self, value: bool) -> None: + self.add_bool(Keys.ClipVision.USE_GELU, value) + + def add_vision_use_silu(self, value: bool) -> None: + self.add_bool(Keys.ClipVision.USE_SILU, value) + + def add_vision_projector_scale_factor(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value) + def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: pack_prefix = '' if not skip_pack_prefix: diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 0bc75cf51..3ff378c13 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -886,6 +886,150 @@ class TensorNameMap: MODEL_TENSOR.POSNET_ATTN_OUT: ( "backbone.posnet.{bid}.proj_out", # wavtokenizer ), + + ############################################################################# + ## Vision encoder + + MODEL_TENSOR.V_MMPROJ: ( + "multi_modal_projector.linear_{bid}", + ), + + MODEL_TENSOR.V_MMPROJ_FC: ( + 
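The new add_vision_* helpers above map one-to-one onto the Keys.ClipVision constants. A short sketch of how a converter might populate them when writing an mmproj/CLIP GGUF (the concrete values are illustrative, not taken from any specific model):

# assumption: writer = gguf.GGUFWriter(path, arch="clip")
writer.add_vision_has_vision_encoder(True)
writer.add_vision_projector_type("gemma3")
writer.add_vision_image_size(896)
writer.add_vision_patch_size(14)
writer.add_vision_embedding_length(1152)
writer.add_vision_feed_forward_length(4304)
writer.add_vision_block_count(27)
writer.add_vision_head_count(16)
writer.add_vision_attention_layernorm_eps(1e-6)
writer.add_vision_image_mean([0.5, 0.5, 0.5])
writer.add_vision_image_std([0.5, 0.5, 0.5])
writer.add_vision_use_gelu(True)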
"model.connector.modality_projection.proj", # SmolVLM + ), + + MODEL_TENSOR.V_MMPROJ_MLP: ( + "model.mm_projector.mlp.mlp.{bid}", + ), + + MODEL_TENSOR.V_MMPROJ_PEG: ( + "model.mm_projector.peg.peg.{bid}", + ), + + MODEL_TENSOR.V_ENC_EMBD_CLS: ( + "vision_tower.vision_model.embeddings.class_embedding", + ), + + MODEL_TENSOR.V_ENC_EMBD_PATCH: ( + "vision_tower.vision_model.embeddings.patch_embedding", + "vpm.embeddings.patch_embedding", + "model.vision_model.embeddings.patch_embedding", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_EMBD_POS: ( + "vision_tower.vision_model.embeddings.position_embedding", + "vpm.embeddings.position_embedding", + "model.vision_model.embeddings.position_embedding", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_ATTN_Q: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", + "vpm.encoder.layers.{bid}.self_attn.q_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_ATTN_K: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj", + "vpm.encoder.layers.{bid}.self_attn.k_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_ATTN_V: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj", + "vpm.encoder.layers.{bid}.self_attn.v_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_INPUT_NORM: ( + "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1", + "vpm.encoder.layers.{bid}.layer_norm1", + "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_OUTPUT: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj", + "vpm.encoder.layers.{bid}.self_attn.out_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_OUTPUT_NORM: ( + "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2", + "vpm.encoder.layers.{bid}.layer_norm2", + "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_FFN_UP: ( + "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", + "vpm.encoder.layers.{bid}.mlp.fc1", + "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 (note: name is swapped) + ), + + MODEL_TENSOR.V_ENC_FFN_DOWN: ( + "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", + "vpm.encoder.layers.{bid}.mlp.fc2", + "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 (note: name is swapped) + ), + + MODEL_TENSOR.V_PRE_NORM: ( + "vision_tower.vision_model.pre_layrnorm", + ), + + MODEL_TENSOR.V_POST_NORM: ( + "vision_tower.vision_model.post_layernorm", + "model.vision_model.post_layernorm", # SmolVLM + ), + + MODEL_TENSOR.V_MM_INP_PROJ: ( + "multi_modal_projector.mm_input_projection", + ), + + MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ( + "multi_modal_projector.mm_soft_emb_norm", + ), + + MODEL_TENSOR.V_RESMPL_POS_EMBD_K: ( + "resampler.pos_embed_k", + ), + + MODEL_TENSOR.V_RESMPL_ATTN_Q: ( + "resampler.attn.in_proj_q", # tensor generated from resampler.attn.in_proj + ), + + MODEL_TENSOR.V_RESMPL_ATTN_K: ( + "resampler.attn.in_proj_k", # tensor generated from resampler.attn.in_proj + ), + + MODEL_TENSOR.V_RESMPL_ATTN_V: ( + "resampler.attn.in_proj_v", # tensor generated from resampler.attn.in_proj + ), + + MODEL_TENSOR.V_RESMPL_ATTN_OUT: ( + "resampler.attn.out_proj", + ), + + MODEL_TENSOR.V_RESMPL_KV: ( + "resampler.kv_proj", + ), + + MODEL_TENSOR.V_RESMPL_POST_NORM: ( + "resampler.ln_post", + ), + + 
MODEL_TENSOR.V_RESMPL_KV_NORM: ( + "resampler.ln_kv", + ), + + MODEL_TENSOR.V_RESMPL_Q_NORM: ( + "resampler.ln_q", + ), + + MODEL_TENSOR.V_RESMPL_PROJ: ( + "resampler.proj", + ), + + MODEL_TENSOR.V_RESMPL_QUERY: ( + "resampler.query", + ), } # architecture-specific block mappings diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index 721faa4e8..41f89e3a9 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -62,6 +62,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "yandex", LLM_CHAT_TEMPLATE_YANDEX }, { "bailing", LLM_CHAT_TEMPLATE_BAILING }, { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 }, + { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM }, }; llm_chat_template llm_chat_template_from_str(const std::string & name) { @@ -81,7 +82,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { if (tmpl_contains("<|im_start|>")) { return tmpl_contains("<|im_sep|>") ? LLM_CHAT_TEMPLATE_PHI_4 - : LLM_CHAT_TEMPLATE_CHATML; + : tmpl_contains("") + ? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml + : LLM_CHAT_TEMPLATE_CHATML; } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) { if (tmpl_contains("[SYSTEM_PROMPT]")) { return LLM_CHAT_TEMPLATE_MISTRAL_V7; @@ -121,6 +124,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { return LLM_CHAT_TEMPLATE_PHI_3; } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) { return tmpl_contains("") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE; + } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) { + return LLM_CHAT_TEMPLATE_GLMEDGE; } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) { return LLM_CHAT_TEMPLATE_ZEPHYR; } else if (tmpl_contains("bos_token + message['role']")) { @@ -620,7 +625,23 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|header_start|>assistant<|header_end|>\n\n"; } - } else { + } else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) { + // SmolVLM + ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << message->content << "\n\n"; + } else if (role == "user") { + ss << "User: " << message->content << "\n"; + } else { + ss << "Assistant: " << message->content << "\n"; + } + } + if (add_ass) { + ss << "Assistant:"; + } + } else { // template not supported return -1; } diff --git a/src/llama-chat.h b/src/llama-chat.h index 34537ca21..dc30df711 100644 --- a/src/llama-chat.h +++ b/src/llama-chat.h @@ -41,6 +41,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_YANDEX, LLM_CHAT_TEMPLATE_BAILING, LLM_CHAT_TEMPLATE_LLAMA4, + LLM_CHAT_TEMPLATE_SMOLVLM, LLM_CHAT_TEMPLATE_UNKNOWN, };
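As a worked example of the new SmolVLM template: since "smolvlm" is now registered in LLM_CHAT_TEMPLATES, it can be exercised through the public llama_chat_apply_template() API, and the branch above renders plain "User:"/"Assistant:" turns after a single <|im_start|> prefix. A sketch (expected output shown in the trailing comment):

llama_chat_message msgs[] = {
    { "system", "You are a helpful assistant." },
    { "user",   "What is the publisher name of the newspaper?" },
};
std::vector<char> buf(1024);
int32_t n = llama_chat_apply_template("smolvlm", msgs, 2, /*add_ass=*/true, buf.data(), (int32_t) buf.size());
// buf now holds:
// <|im_start|>You are a helpful assistant.
//
// User: What is the publisher name of the newspaper?
// Assistant: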