convert : add image break token fallback (#22914)

* convert : add image break token fallback

This commit adds an image_break_token_id fallback for mistral where the
config contains an image_break_token_id of -1:
```console
  "vision_encoder": {
    "image_token_id": 10,
    "image_break_token_id": -1,
    ...
```
But the tokenizer.json has this token:
```console
115       "id": 12,
116       "content": "[IMG_BREAK]",
117       "single_word": false,
118       "lstrip": false,
119       "rstrip": false,
120       "normalized": false,
121       "special": true
122     },
```
If we look in convert_hf_to_gguf.py we have:
```python
        elif self.is_mistral_format:
            # hparams is already vision config here so norm_eps is only defined in global_config.
            self.hparams["norm_eps"] = self.global_config.get("norm_eps", None)
            assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json"
            if self.use_break_tok:
                self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
```

The motivation for this is that converting this model currently
results in the following error:
```console
load_hparams: model size:         5131.60 MiB
load_hparams: metadata size:      0.15 MiB
clip_init: failed to load model 'models/mmproj-Mistral-Medium-3.5-128B.gguf': operator(): unable to find tensor v.token_embd.img_break

mtmd_init_from_file: error: Failed to load CLIP model from models/mmproj-Mistral-Medium-3.5-128B.gguf

Failed to load vision model from models/mmproj-Mistral-Medium-3.5-128B.gguf
```

With this fallback the model loads successfully.

Resolves: https://github.com/ggml-org/llama.cpp/issues/22901

* Revert "convert : add image break token fallback"

This reverts commit 292e40cfdf9a7553863007c018236f5f554f71d8.

* convert : add image break token fallback

This commit adds an image_break_token_id fallback for mistral where the
config contains an image_break_token_id of -1:
```console
  "vision_encoder": {
    "image_token_id": 10,
    "image_break_token_id": -1,
    ...
```
But the tokenizer.json has this token:
```console
115       "id": 12,
116       "content": "[IMG_BREAK]",
117       "single_word": false,
118       "lstrip": false,
119       "rstrip": false,
120       "normalized": false,
121       "special": true
122     },
```
If we look in convert_hf_to_gguf.py we have:
```python
        elif self.is_mistral_format:
            # hparams is already vision config here so norm_eps is only defined in global_config.
            self.hparams["norm_eps"] = self.global_config.get("norm_eps", None)
            assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json"
            if self.use_break_tok:
                self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
```

The motivation for this is that converting this model currently
results in the following error:
```console
load_hparams: model size:         5131.60 MiB
load_hparams: metadata size:      0.15 MiB
clip_init: failed to load model 'models/mmproj-Mistral-Medium-3.5-128B.gguf': operator(): unable to find tensor v.token_embd.img_break

mtmd_init_from_file: error: Failed to load CLIP model from models/mmproj-Mistral-Medium-3.5-128B.gguf

Failed to load vision model from models/mmproj-Mistral-Medium-3.5-128B.gguf
```

With this fallback the model loads successfully.

Co-authored-by: Pascal <admin@serveurperso.com>

Resolves: https://github.com/ggml-org/llama.cpp/issues/22901

* convert : allow zero value for img_break_tok_id
This commit is contained in:
Daniel Bevenius 2026-05-11 12:07:17 +02:00 committed by GitHub
parent 838374375c
commit f5636f8fc7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -2176,7 +2176,8 @@ class MmprojModel(ModelBase):
text_config = {
k: v for k, v in self.hparams.items() if k not in ["vision_encoder", "audio_encoder"]
}
self.n_embd_text = text_config.get("hidden_dim", 0)
# mistral native params.json: "dim" is the text hidden size ("hidden_dim" is the FFN intermediate size)
self.n_embd_text = text_config.get("dim", 0)
assert self.n_embd_text > 0, "n_embd not found in hparams"
@ -3137,6 +3138,11 @@ class LlavaVisionModel(MmprojModel):
assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json"
if self.use_break_tok:
self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
# params.json may ship -1 placeholders (Mistral Medium 3.5)
# resolve the real id from the bundled tokenizer in that case
if self.img_break_tok_id < 0:
self.img_break_tok_id = self.get_mistral_token_id("[IMG_BREAK]")
else:
raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
logger.info(f"Image break token id: {self.img_break_tok_id}")
@ -3156,6 +3162,24 @@ class LlavaVisionModel(MmprojModel):
return int(token_data["id"])
raise ValueError(f"Token '{token}' not found in tokenizer config.")
def get_mistral_token_id(self, token: str) -> int:
    """Look up the id of a special token in the tokenizer files next to the model.

    Two files inside ``self.dir_model`` are consulted, in this order:

    1. ``tekken.json`` - entries of ``special_tokens`` are matched on
       ``token_str`` and the id is read from ``rank``.
    2. ``tokenizer.json`` - entries of ``added_tokens`` are matched on
       ``content`` and the id is read from ``id``.

    The first matching entry wins. A file that is missing, or that does not
    list the token, is skipped so the next file gets a chance.

    Args:
        token: Exact token text to search for, e.g. ``"[IMG_BREAK]"``.

    Returns:
        The integer id of the token.

    Raises:
        ValueError: If neither file contains the token.
    """
    # (file name, key of the token list, key of the token text, key of the id)
    lookups = (
        ("tekken.json", "special_tokens", "token_str", "rank"),
        ("tokenizer.json", "added_tokens", "content", "id"),
    )
    for file_name, list_key, text_key, id_key in lookups:
        candidate = self.dir_model / file_name
        if not candidate.is_file():
            continue
        with open(candidate, "r", encoding="utf-8") as f:
            data = json.load(f)
        # `or ()` also covers an explicit JSON null, which `.get(key, [])`
        # would pass through as None and crash the loop with a TypeError.
        for entry in data.get(list_key) or ():
            if entry.get(text_key) == token:
                return int(entry[id_key])
    raise ValueError(f"Token '{token}' not found in mistral tokenizer files.")
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams