From f5636f8fc77e403596a8524d789c4a16350837ac Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Mon, 11 May 2026 12:07:17 +0200
Subject: [PATCH] convert : add image break token fallback (#22914)

* convert : add image break token fallback

This commit adds a image_break_token_id fallback for mistral where the
config contains a image_break_token_id of -1:
```console
  "vision_encoder": {
    "image_token_id": 10,
    "image_break_token_id": -1,
    ...
```
But the tokenizer.json has this token:
```console
115       "id": 12,
116       "content": "[IMG_BREAK]",
117       "single_word": false,
118       "lstrip": false,
119       "rstrip": false,
120       "normalized": false,
121       "special": true
122     },
```
If we look in convert_hf_to_gguf.py we have:
```python
        elif self.is_mistral_format:
            # hparams is already vision config here so norm_eps is only defined in global_config.
            self.hparams["norm_eps"] = self.global_config.get("norm_eps", None)
            assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json"
            if self.use_break_tok:
                self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
```

The motivation for this is that currently converting this models
results in the following error:
```console
load_hparams: model size:         5131.60 MiB
load_hparams: metadata size:      0.15 MiB
clip_init: failed to load model 'models/mmproj-Mistral-Medium-3.5-128B.gguf': operator(): unable to find tensor v.token_embd.img_break

mtmd_init_from_file: error: Failed to load CLIP model from models/mmproj-Mistral-Medium-3.5-128B.gguf

Failed to load vision model from models/mmproj-Mistral-Medium-3.5-128B.gguf
```

With this fallback the model loads successfully.

Resolves: https://github.com/ggml-org/llama.cpp/issues/22901

* Revert "convert : add image break token fallback"

This reverts commit 292e40cfdf9a7553863007c018236f5f554f71d8.

* convert : add image break token fallback

This commit adds a image_break_token_id fallback for mistral where the
config contains a image_break_token_id of -1:
```console
  "vision_encoder": {
    "image_token_id": 10,
    "image_break_token_id": -1,
    ...
```
But the tokenizer.json has this token:
```console
115       "id": 12,
116       "content": "[IMG_BREAK]",
117       "single_word": false,
118       "lstrip": false,
119       "rstrip": false,
120       "normalized": false,
121       "special": true
122     },
```
If we look in convert_hf_to_gguf.py we have:
```python
        elif self.is_mistral_format:
            # hparams is already vision config here so norm_eps is only defined in global_config.
            self.hparams["norm_eps"] = self.global_config.get("norm_eps", None)
            assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json"
            if self.use_break_tok:
                self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
```

The motivation for this is that currently converting this models
results in the following error:
```console
load_hparams: model size:         5131.60 MiB
load_hparams: metadata size:      0.15 MiB
clip_init: failed to load model 'models/mmproj-Mistral-Medium-3.5-128B.gguf': operator(): unable to find tensor v.token_embd.img_break

mtmd_init_from_file: error: Failed to load CLIP model from models/mmproj-Mistral-Medium-3.5-128B.gguf

Failed to load vision model from models/mmproj-Mistral-Medium-3.5-128B.gguf
```

With this fallback the model loads successfully.

Co-authored-by: Pascal <admin@serveurperso.com>

Resolves: https://github.com/ggml-org/llama.cpp/issues/22901

* convert : allow zero value for img_break_tok_id
---
 convert_hf_to_gguf.py | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index e5dea18ae..bf76fa406 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2176,7 +2176,8 @@ class MmprojModel(ModelBase):
             text_config = {
                 k: v for k, v in self.hparams.items() if k not in ["vision_encoder", "audio_encoder"]
             }
-            self.n_embd_text = text_config.get("hidden_dim", 0)
+            # mistral native params.json: "dim" is the text hidden size ("hidden_dim" is the FFN intermediate size)
+            self.n_embd_text = text_config.get("dim", 0)
 
         assert self.n_embd_text > 0, "n_embd not found in hparams"
 
@@ -3137,6 +3138,11 @@ class LlavaVisionModel(MmprojModel):
             assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json"
             if self.use_break_tok:
                 self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
+
+                # params.json may ship -1 placeholders (Mistral Medium 3.5)
+                # resolve the real id from the bundled tokenizer in that case
+                if self.img_break_tok_id < 0:
+                    self.img_break_tok_id = self.get_mistral_token_id("[IMG_BREAK]")
         else:
             raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
         logger.info(f"Image break token id: {self.img_break_tok_id}")
@@ -3156,6 +3162,24 @@ class LlavaVisionModel(MmprojModel):
                     return int(token_data["id"])
         raise ValueError(f"Token '{token}' not found in tokenizer config.")
 
+    def get_mistral_token_id(self, token: str) -> int:
+        # mistral native format ships tekken.json or a versioned spm tokenizer
+        tekken_file = self.dir_model / "tekken.json"
+        if tekken_file.is_file():
+            with open(tekken_file, "r", encoding="utf-8") as f:
+                data = json.load(f)
+            for entry in data.get("special_tokens", []):
+                if entry.get("token_str") == token:
+                    return int(entry["rank"])
+        tokenizer_json_file = self.dir_model / "tokenizer.json"
+        if tokenizer_json_file.is_file():
+            with open(tokenizer_json_file, "r", encoding="utf-8") as f:
+                data = json.load(f)
+            for entry in data.get("added_tokens", []):
+                if entry.get("content") == token:
+                    return int(entry["id"])
+        raise ValueError(f"Token '{token}' not found in mistral tokenizer files.")
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams