mtmd: Add DeepSeekOCR 2 Support (#20975)

* mtmd: DeepSeek-OCR 2 support, with multi-tile dynamic resolution
* introduced clip_image_f32::add_viewsep
* address PR review
  - drop redundant ggml_cpy ops in both deepseekocr versions build
  - drop no-op ggml_cont in build_sam
  - assert num_image_tokens deepseekocr2
  - view_seperator as (1, n_embd) at conversion (for both versions)
  - drop redundant ggml_reshape_2d
* Update tools/mtmd/models/deepseekocr2.cpp

  Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

---------

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
2026-05-29 19:23:39 +00:00 · 2026-05-29 16:13:51 +02:00 · 2026-05-29 16:13:51 +02:00 · da3f990a47
commit da3f990a47
parent 6ed481eea4
16 changed files with 505 additions and 90 deletions
--- a/conversion/init.py
+++ b/conversion/init.py
@ -237,6 +237,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
 MMPROJ_MODEL_MAP: dict[str, str] = {
    "AudioFlamingo3ForConditionalGeneration": "ultravox",
    "CogVLMForCausalLM": "cogvlm",
+    "DeepseekOCR2ForCausalLM": "deepseek",
    "DeepseekOCRForCausalLM": "deepseek",
    "DotsOCRForCausalLM": "dotsocr",
    "Gemma3ForConditionalGeneration": "gemma",
--- a/conversion/base.py
+++ b/conversion/base.py
@ -1140,7 +1140,7 @@ class TextModel(ModelBase):
        # Skip multimodal tensors
        if name.startswith(("mlp", "vit.", "vpm.", "siglip2.", "conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.", "speech_embeddings.")) \
                or "visual." in name or "vision." in name or "audio." in name or "talker." in name \
-                or "vision_" in name or "audio_" in name or "sam_model" in name \
+                or "vision_" in name or "audio_" in name \
                or "token2wav." in name or "code2wav." in name \
                or "projector." in name or "pre_mm_projector_norm" in name \
                or "image_newline" in name or "view_seperator" in name \
--- a/conversion/deepseek.py
+++ b/conversion/deepseek.py
@ -16,10 +16,14 @@ from .qwen import QwenModel

@ModelBase.register("DeepseekOCRForCausalLM")
 class DeepseekOCRVisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.clip_projector_type = gguf.VisionProjectorType.DEEPSEEKOCR
+
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
-        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR)
+        self.gguf_writer.add_clip_projector_type(self.clip_projector_type)
        # default values below are taken from HF transformers code
        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
        self.gguf_writer.add_vision_use_gelu(True)
@ -49,22 +53,27 @@ class DeepseekOCRVisionModel(MmprojModel):
            raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found")

        vision_config['sam'] = vision_config['width']['sam_vit_b']
-        vision_config.update(vision_config['width']['clip-l-14-224'])
-        vision_config['hidden_size'] = vision_config['width']
-        vision_config['num_heads'] = vision_config['heads']
-        vision_config['intermediate_size'] = vision_config['heads'] * 4
+        if vision_config['width'].get('clip-l-14-224') is not None:
+            vision_config.update(vision_config['width']['clip-l-14-224'])
+        if isinstance(vision_config['width'], int):
+            vision_config['hidden_size'] = vision_config['width']
+        if vision_config.get('heads') is not None:
+            vision_config['num_heads'] = vision_config['heads']
+            vision_config['intermediate_size'] = vision_config['heads'] * 4

        return vision_config

    def tensor_force_quant(self, name, new_name, bid, n_dims):
-        if ".embeddings." in name or 'pos_embed' in name:
-            return gguf.GGMLQuantizationType.F32
-        if ".rel_pos_h" in name or '.rel_pos_w' in name:
-            return gguf.GGMLQuantizationType.F32
-        if ".neck." in name or ".net_" in name:
-            return gguf.GGMLQuantizationType.F32
+        for nq_name in ('.embeddings.', 'pos_embed', '.rel_pos_h', '.rel_pos_w', '.neck.', '.net_'):
+            if nq_name in name:
+                return gguf.GGMLQuantizationType.F32
        return super().tensor_force_quant(name, new_name, bid, n_dims)

+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # the view separator gets a leading axis added at conversion time;
+        # every other tensor is forwarded to the parent unchanged
+        is_view_sep = name.endswith("view_seperator")
+        tensor = data_torch.unsqueeze(0) if is_view_sep else data_torch
+        yield from super().modify_tensors(tensor, name, bid)
+
    @classmethod
    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
        name, gen = item
@ -81,6 +90,33 @@ class DeepseekOCRVisionModel(MmprojModel):
        return super().filter_tensors((name, gen))


+@ModelBase.register("DeepseekOCR2ForCausalLM")
+class DeepseekOCR2VisionModel(DeepseekOCRVisionModel):
+    """mmproj converter for DeepSeek-OCR-2, built on the DeepSeek-OCR converter."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # read back by the parent's set_gguf_parameters() when it writes the projector type
+        self.clip_projector_type = gguf.VisionProjectorType.DEEPSEEKOCR2
+
+    def set_gguf_parameters(self):
+        # the vision tower's qwen2 encoder is built from fixed defaults (see
+        # build_qwen2_decoder_as_encoder() in deepencoderv2.py), so fill in
+        # whatever the config leaves unset before the parent reads hparams
+        qwen2_defaults = {
+            "patch_size": 16,
+            "intermediate_size": 4864,
+            "num_attention_heads": 14,
+        }
+        for key, fallback in qwen2_defaults.items():
+            if self.hparams.get(key) is None:
+                self.hparams[key] = fallback
+        super().set_gguf_parameters()
+        # qwen2 encoder is GQA: 14 Q heads, 2 KV heads
+        self.gguf_writer.add_vision_head_count_kv(2)
+
+    def get_vision_config(self) -> dict[str, Any]:
+        config = super().get_vision_config()
+        # hidden size comes from the qwen2 tower entry of the per-encoder 'width' table
+        qwen2_dim = config['width']['qwen2-0-5b']['dim']
+        config['hidden_size'] = qwen2_dim
+        if config.get('layers') is None:
+            config['layers'] = 24
+        return config
+
+
@ModelBase.register("DeepseekForCausalLM")
 class DeepseekModel(TextModel):
    model_arch = gguf.MODEL_ARCH.DEEPSEEK
@ -188,13 +224,21 @@ class DeepseekV2Model(TextModel):
        self.origin_hf_arch = hparams.get('architectures', [None])[0]

        # special handling for Deepseek OCR
-        if self.origin_hf_arch == "DeepseekOCRForCausalLM":
+        if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"):
            self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
            self.gguf_writer.add_architecture()
            # default jinja template
            self.gguf_writer.add_chat_template("{% for m in messages %}{{m['content']}}{% endfor %}")

+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        tensor_name, _gen = item
+        # drop DeepSeek-OCR vision encoder weights (SAM + DeepSeek-OCR-2 qwen2 tower)
+        # from the text model
+        vision_markers = ("sam_model", "qwen2_model")
+        if any(marker in tensor_name for marker in vision_markers):
+            return None
+        return super().filter_tensors(item)
+
    def set_vocab(self):
        try:
            self._set_vocab_gpt2()
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@ -812,6 +812,8 @@ class MODEL_TENSOR(IntEnum):
    V_SAM_NET_3          = auto() # Deepseek-OCR
    V_ENC_EMBD_IMGNL     = auto() # Deepseek-OCR
    V_ENC_EMBD_VSEP      = auto() # Deepseek-OCR
+    V_RESMPL_QUERY_768   = auto() # Deepseek-OCR-2
+    V_RESMPL_QUERY_1024  = auto() # Deepseek-OCR-2

    # audio (mtmd)
    A_ENC_EMBD_POS        = auto()
@ -1329,6 +1331,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.V_SAM_NET_3:               "v.sam.net_3",
    MODEL_TENSOR.V_ENC_EMBD_IMGNL:          "v.image_newline", # Deepseek-OCR
    MODEL_TENSOR.V_ENC_EMBD_VSEP:           "v.view_seperator", # Deepseek-OCR
+    MODEL_TENSOR.V_RESMPL_QUERY_768:        "v.resample_query_768", # Deepseek-OCR-2 qwen2
+    MODEL_TENSOR.V_RESMPL_QUERY_1024:       "v.resample_query_1024", # Deepseek-OCR-2 qwen2
    # audio (mtmd)
    # note: all audio tensor names must use prefix "a." or "mm.a."
    MODEL_TENSOR.A_ENC_EMBD_POS:            "a.position_embd",
@ -1507,6 +1511,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.V_SAM_NECK,
        MODEL_TENSOR.V_SAM_NET_2,
        MODEL_TENSOR.V_SAM_NET_3,
+        MODEL_TENSOR.V_RESMPL_QUERY_768,
+        MODEL_TENSOR.V_RESMPL_QUERY_1024,
        # audio
        MODEL_TENSOR.A_ENC_EMBD_POS,
        MODEL_TENSOR.A_ENC_EMBD_NORM,
@ -4329,6 +4335,7 @@ class VisionProjectorType:
    JANUS_PRO = "janus_pro"
    DOTSOCR = "dots_ocr"
    DEEPSEEKOCR = "deepseekocr"
+    DEEPSEEKOCR2 = "deepseekocr2"
    LFM2A = "lfm2a" # audio
    MUSIC_FLAMINGO = "musicflamingo" # audio
    GLM4V = "glm4v"
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@ -1485,6 +1485,7 @@ class TensorNameMap:
            "siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
            "model.vision_model.transformer.layers.{bid}.self_attn.q_proj", # Deepseek-OCR CLIP, generated
            "vision_model.model.layers.{bid}.self_attn.q_proj.linear", # gemma4
+            "model.qwen2_model.model.model.layers.{bid}.self_attn.q_proj" # Deepseek-OCR-2 qwen2
        ),

        MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
@ -1509,6 +1510,7 @@ class TensorNameMap:
            "model.vision_model.transformer.layers.{bid}.self_attn.k_proj", # Deepseek-OCR CLIP, generated
            "siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
            "vision_model.model.layers.{bid}.self_attn.k_proj.linear", # gemma4
+            "model.qwen2_model.model.model.layers.{bid}.self_attn.k_proj" # Deepseek-OCR-2 qwen2
        ),

        MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
@ -1533,6 +1535,7 @@ class TensorNameMap:
            "siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
            "model.vision_model.transformer.layers.{bid}.self_attn.v_proj", # Deepseek-OCR CLIP, generated
            "vision_model.model.layers.{bid}.self_attn.v_proj.linear", # gemma4
+            "model.qwen2_model.model.model.layers.{bid}.self_attn.v_proj" # Deepseek-OCR-2 qwen2
        ),

        MODEL_TENSOR.V_ENC_INPUT_NORM: (
@ -1554,6 +1557,7 @@ class TensorNameMap:
            "vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL
            "vision_tower.blocks.{bid}.norm1", # dots.ocr
            "vision_model.transformer.resblocks.{bid}.ln_1", # Step3-VL
+            "model.qwen2_model.model.model.layers.{bid}.input_layernorm", # Deepseek-OCR-2 qwen2
        ),

        MODEL_TENSOR.V_ENC_ATTN_O: (
@ -1574,6 +1578,7 @@ class TensorNameMap:
            "model.vision_model.transformer.layers.{bid}.self_attn.out_proj", # Deepseek-OCR CLIP
            "siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
            "vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
+            "model.qwen2_model.model.model.layers.{bid}.self_attn.o_proj", # Deepseek-OCR-2 qwen2
            "vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4
            "vision_tower.blocks.{bid}.attn.proj", # dots.ocr
            "vision_model.transformer.resblocks.{bid}.attn.out_proj", # Step3-VL
@ -1603,6 +1608,7 @@ class TensorNameMap:
            "vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4
            "vision_tower.blocks.{bid}.norm2", # dots.ocr
            "vision_model.transformer.resblocks.{bid}.ln_2", # Step3-VL
+            "model.qwen2_model.model.model.layers.{bid}.post_attention_layernorm", # Deepseek-OCR-2 qwen2
        ),

        MODEL_TENSOR.V_ENC_FFN_UP: (
@ -1625,6 +1631,7 @@ class TensorNameMap:
            "vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL
            "vision_model.model.layers.{bid}.mlp.up_proj", # gemma4
            "vision_model.transformer.resblocks.{bid}.mlp.c_fc", # Step3-VL
+            "model.qwen2_model.model.model.layers.{bid}.mlp.up_proj", # Deepseek-OCR-2 qwen2
        ),

        MODEL_TENSOR.V_ENC_FFN_GATE: (
@ -1632,6 +1639,7 @@ class TensorNameMap:
            "vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral
            "visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
            "vision_model.model.layers.{bid}.mlp.gate_proj", # gemma4
+            "model.qwen2_model.model.model.layers.{bid}.mlp.gate_proj", # Deepseek-OCR-2 qwen2
        ),

        MODEL_TENSOR.V_ENC_FFN_DOWN: (
@ -1652,6 +1660,7 @@ class TensorNameMap:
            "model.vision_model.transformer.layers.{bid}.mlp.fc2", # Deepseek-OCR CLIP
            "siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
            "vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL
+            "model.qwen2_model.model.model.layers.{bid}.mlp.down_proj" , # Deepseek-OCR-2 qwen2
            "vision_model.model.layers.{bid}.mlp.down_proj", # gemma4
            "vision_model.transformer.resblocks.{bid}.mlp.c_proj", # Step3-VL
        ),
@ -1699,6 +1708,7 @@ class TensorNameMap:
            "vision_tower.encoder.final_layernorm", # kimi-vl
            "visual.post_layernorm", # glm4v
            "siglip2.vision_model.post_layernorm",
+            "model.qwen2_model.model.model.norm", # Deepseek-OCR-2 qwen2
        ),

        MODEL_TENSOR.V_MM_POST_NORM: (
@ -1879,6 +1889,14 @@ class TensorNameMap:
            "model.sam_model.net_3",
        ),

+        MODEL_TENSOR.V_RESMPL_QUERY_768: (
+            "model.qwen2_model.query_768", # Deepseek-OCR-2 qwen2
+        ),
+
+        MODEL_TENSOR.V_RESMPL_QUERY_1024: (
+            "model.qwen2_model.query_1024", # Deepseek-OCR-2 qwen2
+        ),
+
        MODEL_TENSOR.V_MM_POST_FC_NORM: (
            "model.vision.linear_proj.norm1", # cogvlm
        ),
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@ -40,6 +40,7 @@ add_library(mtmd
            models/siglip.cpp
            models/whisper-enc.cpp
            models/deepseekocr.cpp
+            models/deepseekocr2.cpp
            models/mobilenetv5.cpp
            models/youtuvl.cpp
            models/yasa2.cpp
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@ -188,6 +188,8 @@
 #define TN_SAM_FFN_DOWN   "v.sam.blk.%d.mlp.lin2.%s"
 #define TN_SAM_NECK       "v.sam.neck.%d.%s"
 #define TN_SAM_NET        "v.sam.net_%d.%s"
+// deepseek-ocr-2
+#define TN_RESMPL_QUERY  "v.resample_query_%d.%s"
 // (conformer) lfm2
 #define TN_PRE_ENCODE_OUT  "a.pre_encode.out.%s"
 #define TN_FFN_NORM        "%s.blk.%d.ffn_norm.%s"
@ -337,6 +339,7 @@ enum projector_type {
    PROJECTOR_TYPE_JANUS_PRO,
    PROJECTOR_TYPE_DOTS_OCR,
    PROJECTOR_TYPE_DEEPSEEKOCR,
+    PROJECTOR_TYPE_DEEPSEEKOCR2,
    PROJECTOR_TYPE_LFM2A,
    PROJECTOR_TYPE_GLM4V,
    PROJECTOR_TYPE_YOUTUVL,
@ -386,6 +389,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
    { PROJECTOR_TYPE_DOTS_OCR,  "dots_ocr"},
    { PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"},
+    { PROJECTOR_TYPE_DEEPSEEKOCR2,"deepseekocr2"},
    { PROJECTOR_TYPE_LFM2A,     "lfm2a"},
    { PROJECTOR_TYPE_GLM4V,     "glm4v"},
    { PROJECTOR_TYPE_YOUTUVL,   "youtuvl"},
@ -424,6 +428,9 @@ struct clip_image_f32 {
    int ny;

    std::vector<float> buf;
+
+    // set on the global view (e.g. DeepSeek-OCR models); a view separator is appended after it
+    bool add_viewsep = false;
 };

 //
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@ -542,6 +542,11 @@ struct clip_model {
    int32_t n_sam_layers = 12; // used by deepseek-ocr sam encoder

    std::vector<clip_layer> sam_layers;
+
+    // deepseek-ocr-2
+    ggml_tensor * resample_query_768 = nullptr;
+    ggml_tensor * resample_query_1024 = nullptr;
+
    // lfm2 audio
    std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {nullptr};
    std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {nullptr};
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@ -953,6 +953,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            {
                builder = std::make_unique<clip_graph_deepseekocr>(ctx, img);
            } break;
+        case PROJECTOR_TYPE_DEEPSEEKOCR2:
+             {
+                builder = std::make_unique<clip_graph_deepseekocr2>(ctx, img);
+            } break;
        case PROJECTOR_TYPE_LFM2A:
            {
                builder = std::make_unique<clip_graph_conformer>(ctx, img);
@ -1514,6 +1518,7 @@ struct clip_model_loader {
                        hparams.set_warmup_n_tokens(28*28); // avoid OOM on warmup
                    } break;
                case PROJECTOR_TYPE_DEEPSEEKOCR:
+                case PROJECTOR_TYPE_DEEPSEEKOCR2:
                    {
                        hparams.patch_size = 16;
                        hparams.image_size = 1024;
@ -1525,6 +1530,10 @@ struct clip_model_loader {
                        get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);
                        get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
                        get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
+                        if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) {
+                            // qwen2 encoder is GQA, requires KEY_N_HEAD_KV
+                            get_u32(string_format(KEY_N_HEAD_KV, "vision"), hparams.n_head_kv);
+                        }
                     } break;
                case PROJECTOR_TYPE_HUNYUANVL:
                    {
@ -2374,6 +2383,7 @@ struct clip_model_loader {
                    model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
                } break;
            case PROJECTOR_TYPE_DEEPSEEKOCR:
+            case PROJECTOR_TYPE_DEEPSEEKOCR2:
                {
                    model.pos_embed          = get_tensor(string_format(TN_SAM_POS_EMBD,   "weight"));
                    model.patch_embed_proj_w = get_tensor(string_format(TN_SAM_PATCH_EMBD, "weight"));
@ -2404,10 +2414,12 @@ struct clip_model_loader {
                    model.neck_3_w       = get_tensor(string_format(TN_SAM_NECK, 3, "weight"));
                    model.net_2          = get_tensor(string_format(TN_SAM_NET, 2, "weight"));
                    model.net_3          = get_tensor(string_format(TN_SAM_NET, 3, "weight"));
-                    model.image_newline  = get_tensor(TN_IMAGE_NEWLINE);
+                    model.image_newline  = get_tensor(TN_IMAGE_NEWLINE, false);
                    model.view_seperator = get_tensor(TN_IMAGE_SEPERATOR);
                    model.mm_fc_w        = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
                    model.mm_fc_b        = get_tensor(string_format(TN_MM_PROJECTOR, "bias"));
+                    model.resample_query_768  = get_tensor(string_format(TN_RESMPL_QUERY, 768, "weight"), false);
+                    model.resample_query_1024 = get_tensor(string_format(TN_RESMPL_QUERY, 1024, "weight"), false);
                 } break;
            case PROJECTOR_TYPE_GEMMA4A:
                {
@ -3277,7 +3289,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
        case PROJECTOR_TYPE_DEEPSEEKOCR:
        {
            // SAM encoder applies two stride-2 convolutions (net_2 and net_3)
-            // which reduces spatial dimensions by 4x in each direction (16x total)
+            // that reduce spatial dimensions by 4x in each direction (16x total)
            // E.g., 64x64 -> 16x16 patches
            n_patches /= 16;

@ -3293,6 +3305,15 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
                int oh = (img->ny / patch_size) / merge;
                n_patches = (ow + 1) * oh + 2;
            } break;
+        case PROJECTOR_TYPE_DEEPSEEKOCR2:
+        {
+            // 1024 global view -> 256 query tokens + 1 view separator = 257;
+            // 768 local tile   -> 144 query tokens, no separator.
+            n_patches /= 16;
+            if (img->add_viewsep) {
+                n_patches += 1; // view separator, appended only after the global view
+            }
+        } break;
        case PROJECTOR_TYPE_LFM2A:
            {
                n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
@ -3882,6 +3903,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                set_input_i32("pos_y", pos_y);
            } break;
        case PROJECTOR_TYPE_DEEPSEEKOCR:
+        case PROJECTOR_TYPE_DEEPSEEKOCR2:
            {
                GGML_ASSERT(pos_w == pos_h);

@ -3904,6 +3926,34 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima

                set_input_i32("rel_pos_indices_local", rel_pos_indices_local);
                set_input_i32("rel_pos_indices_global", rel_pos_indices_global);
+
+                if (ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR2) {
+
+                    // qwen2 encoder attention mask
+
+                    // num_image_tokens = num_patches / 16
+                    //   256 for 1024 global view
+                    //   144 for 768 tile views
+                    const int   num_image_tokens = num_patches / 16;
+                    const int   seq_len          = num_image_tokens * 2;
+                    std::vector qwen2_mask(static_cast<size_t>(seq_len) * seq_len, 0.0f);
+
+                    // attention mask layout (masked entries are written as -1e9, shown as -inf below)
+                    //  +--------------+---------------+
+                    //  |    all 0     |   all -inf    |
+                    //  +--------------+---------------+
+                    //  |    all 0     |  lower tri 0  |
+                    //  +--------------+---------------+
+                    for (int i = 0; i < seq_len; i++) {
+                        for (int j = 0; j < seq_len; j++) {
+                            const bool zero = i < num_image_tokens ?
+                                                     j < num_image_tokens :
+                                                     j < num_image_tokens || j <= i;
+                            qwen2_mask[static_cast<size_t>(i) * seq_len + j] = zero ? 0.0f : -1e9f;
+                        }
+                    }
+                    set_input_f32("qwen2_attn_mask", qwen2_mask);
+                }
            } break;
        case PROJECTOR_TYPE_GEMMA3:
        case PROJECTOR_TYPE_GEMMA3NV:
@ -4256,6 +4306,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
        case PROJECTOR_TYPE_COGVLM:
            return ctx->model.mm_4h_to_h_w->ne[1];
        case PROJECTOR_TYPE_DEEPSEEKOCR:
+        case PROJECTOR_TYPE_DEEPSEEKOCR2:
            return ctx->model.mm_fc_w->ne[1];
        case PROJECTOR_TYPE_LFM2A:
            return ctx->model.position_embeddings->ne[0];
--- a/tools/mtmd/models/deepseekocr.cpp
+++ b/tools/mtmd/models/deepseekocr.cpp
@ -157,7 +157,6 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {

            cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
            cur = ggml_add(ctx0, cur, layer.qkv_b);
-            cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape
            cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B);

            ggml_tensor * Q;
@ -251,17 +250,17 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
    ggml_tensor * inp_raw = build_inp_raw();
    ggml_tensor * sam_out = build_sam(inp_raw);

+    const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1];
+
    ggml_tensor * clip_out;
    // Building DS-OCR CLIP
    {
        ggml_tensor * inp;

-        inp = ggml_cpy(ctx0, sam_out, ggml_dup_tensor(ctx0, sam_out));
-        inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2]);
+        inp = ggml_reshape_2d(ctx0, sam_out, clip_n_patches, sam_out->ne[2]);
        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

-        ggml_tensor * new_pos_embd =
-            ggml_cpy(ctx0, model.position_embeddings, ggml_dup_tensor(ctx0, model.position_embeddings));
+        ggml_tensor * new_pos_embd = model.position_embeddings;

        int        n_pos    = new_pos_embd->ne[1];  // +1 for [CLS]
        const auto tgt_size = static_cast<int>(std::sqrt(inp->ne[1]));
@ -295,16 +294,12 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
        clip_out = cur;
    }

-    const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1];
-
    sam_out  = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3));
    sam_out  = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches);
    clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]);

    ggml_tensor * cur;
    cur = ggml_concat(ctx0, clip_out, sam_out, 0);
-    cur = ggml_reshape_2d(ctx0, cur, 2 * n_embd, clip_n_patches);
-    cur = ggml_cont(ctx0, cur);
    cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
    cur = ggml_add(ctx0, cur, model.mm_fc_b);

@ -313,13 +308,11 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
    const auto n_dim = cur->ne[0];

    ggml_tensor * imgnl;
-    ggml_tensor * vs;

    imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1);
-    vs    = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1);  // (n_dim, 1)
    cur   = ggml_reshape_3d(ctx0, cur, n_dim, w, h);
    cur   = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h);
-    cur   = ggml_concat(ctx0, cur, vs, 1);  // (n_dim, h*(w+1) + 1)
+    cur   = ggml_concat(ctx0, cur, model.view_seperator, 1);  // (n_dim, h*(w+1) + 1)

    cb(cur, "dsocr_output", -1);

--- a/tools/mtmd/models/deepseekocr2.cpp
+++ b/tools/mtmd/models/deepseekocr2.cpp
@ -0,0 +1,81 @@
+#include "models.h"
+
+// Build the DeepSeek-OCR-2 vision graph.
+//
+// The SAM output is flattened to one token per spatial position, a block of
+// learned query tokens is appended, and the combined sequence runs through the
+// qwen2 encoder (build_vit with RoPE and an input attention mask named
+// "qwen2_attn_mask"). Only the query-token outputs are kept and projected by
+// mm_fc; a view separator is appended when img.add_viewsep is set.
+ggml_cgraph * clip_graph_deepseekocr2::build() {
+    // the qwen2 encoder is GQA, so the KV head count must be set and divide n_head
+    GGML_ASSERT(hparams.n_head_kv > 0);
+    GGML_ASSERT(n_head % hparams.n_head_kv == 0);
+
+    // patch embedding
+    ggml_tensor * inp_raw = build_inp_raw();
+
+    ggml_tensor * sam_out = build_sam(inp_raw);
+
+    ggml_tensor * qwen2_out;
+    // Building Qwen2 encoder
+    {
+        ggml_tensor * inp;
+
+        inp = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0] * sam_out->ne[1], sam_out->ne[2]); // H*W, C
+        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
+
+        auto num_image_tokens = inp->ne[1]; // H*W
+        GGML_ASSERT(num_image_tokens == 144 || num_image_tokens == 256);
+
+        // the query set is chosen by the number of image tokens in the SAM output:
+        // 16x16 -> query_1024 (1024x1024 images)
+        // 12x12 -> query_768 (768x768 images)
+
+        ggml_tensor * query_embed = model.resample_query_1024;
+        int           num_queries = 256;
+
+        if (num_image_tokens == 144) {
+            query_embed = model.resample_query_768;
+            num_queries = 144;
+        }
+
+        // the loader fetches both query tensors with get_tensor(..., false), so
+        // either may be absent; fail here instead of dereferencing null in ggml_cast
+        GGML_ASSERT(query_embed != nullptr);
+
+        // (B, num_image_tokens + num_queries, C)
+        inp = ggml_concat(ctx0, inp, ggml_cast(ctx0, query_embed, inp->type), 1);
+
+        auto seq_len = inp->ne[1];
+
+        // qwen2 encoder attention mask; declared as a graph input here and
+        // looked up by name when the input values are set
+        ggml_tensor * attn_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, seq_len, seq_len);
+        ggml_set_name(attn_mask, "qwen2_attn_mask");
+        ggml_set_input(attn_mask);
+
+        // positions 0..seq_len-1 for RoPE
+        ggml_tensor * inp_pos = ggml_cast(ctx0, ggml_arange(ctx0, 0, seq_len, 1), GGML_TYPE_I32);
+
+        auto add_rope = [&](ggml_tensor * x, const clip_layer &) {
+            return ggml_rope_ext(ctx0, x, inp_pos, nullptr, d_head,
+                                 GGML_ROPE_TYPE_NEOX, 131072, 1000000, 1, 0, 1, 0, 0);
+        };
+
+        build_vit_opts vit_opts;
+        vit_opts.attn_mask = attn_mask;
+
+        // build_vit applies model.post_ln_w internally; do not re-apply
+        ggml_tensor * cur = build_vit(inp, seq_len, NORM_TYPE_RMS, FFN_SILU,
+                                      /* learned_pos_embd */ nullptr, add_rope, vit_opts);
+
+        // the queries were appended last, so skip the leading image tokens
+        cur = ggml_cont(ctx0,
+                        ggml_view_2d(ctx0, cur, cur->ne[0], num_queries, cur->nb[1],
+                                     cur->nb[1] * (cur->ne[1] - num_queries))); // only take query tokens for output
+
+        ggml_build_forward_expand(gf, cur);
+        qwen2_out = cur;
+    }
+
+    ggml_tensor * cur;
+
+    cur = ggml_mul_mat(ctx0, model.mm_fc_w, qwen2_out);
+    cur = ggml_add(ctx0, cur, model.mm_fc_b);
+
+    // view_seperator only after the global view
+    if (img.add_viewsep) {
+        cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, 257)
+    }
+
+    cb(cur, "dsocr2_output", -1);
+
+    ggml_build_forward_expand(gf, cur);
+    return gf;
+}
--- a/tools/mtmd/models/models.h
+++ b/tools/mtmd/models/models.h
@ -121,6 +121,11 @@ struct clip_graph_deepseekocr : clip_graph {
    ggml_tensor * build_sam(ggml_tensor * inp); // build the SAM model
 };

+struct clip_graph_deepseekocr2 : clip_graph_deepseekocr { // DeepSeek-OCR-2 graph builder, derived from the DeepSeek-OCR one
+    clip_graph_deepseekocr2(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph_deepseekocr(ctx, img) {}
+    ggml_cgraph * build() override; // reuses build_sam() from base
+};
+
 struct clip_graph_conformer : clip_graph {
    clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
    ggml_cgraph * build() override;
--- a/tools/mtmd/mtmd-image.cpp
+++ b/tools/mtmd/mtmd-image.cpp
@ -1137,6 +1137,105 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img,
    return true;
 }

+//
+// mtmd_image_preprocessor_deepseekocr2
+//
+
+// candidate tile grids (cols, rows) with min_tiles <= cols*rows <= max_tiles
+// sorted by tile count
+std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr2::get_target_ratios() {
+    std::vector<clip_image_size> grids;
+    for (int limit = min_tiles; limit <= max_tiles; limit++) {
+        for (int cols = 1; cols <= limit; cols++) {
+            for (int rows = 1; rows <= limit; rows++) {
+                const int n_tiles = cols * rows;
+                if (n_tiles < min_tiles || n_tiles > max_tiles) {
+                    continue;
+                }
+                // the same (cols, rows) pair is reached again for larger limits; keep the first
+                const bool seen = std::any_of(grids.begin(), grids.end(), [&](const clip_image_size & g) {
+                    return g.width == cols && g.height == rows;
+                });
+                if (!seen) {
+                    grids.push_back({ cols, rows });
+                }
+            }
+        }
+    }
+    // order candidates by how many tiles they contain
+    std::sort(grids.begin(), grids.end(), [](const clip_image_size & lhs, const clip_image_size & rhs) {
+        return lhs.width * lhs.height < rhs.width * rhs.height;
+    });
+    return grids;
+}
+
+// pick the grid whose aspect ratio is closest to the image;
+// on a tie, switch to the later candidate when the image area exceeds
+// half of that candidate's total tile area
+clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio(
+    float                                aspect_ratio,
+    const std::vector<clip_image_size> & target_ratios,
+    int                                  width,
+    int                                  height) {
+    // aspect_ratio:  width / height of the source image
+    // target_ratios: candidate (cols, rows) grids, visited in the given order
+    // width, height: source image size in pixels, used only for the tie-break
+    // returns the chosen grid, or { 1, 1 } when target_ratios is empty
+    float           best_ratio_diff = std::numeric_limits<float>::max();
+    clip_image_size best_ratio      = { 1, 1 };
+    // multiply in float: the int product width * height overflows for very large images
+    const float     area            = static_cast<float>(width) * static_cast<float>(height);
+
+    for (const auto & ratio : target_ratios) {
+        const float target_aspect_ratio = static_cast<float>(ratio.width) / ratio.height;
+        const float ratio_diff          = std::abs(aspect_ratio - target_aspect_ratio);
+        if (ratio_diff < best_ratio_diff) {
+            best_ratio_diff = ratio_diff;
+            best_ratio      = ratio;
+        } else if (ratio_diff == best_ratio_diff) {
+            // equally close: take this candidate only if the image covers more
+            // than half of the pixels the candidate grid would hold
+            const float target_area = static_cast<float>(tile_size * tile_size * ratio.width * ratio.height);
+            if (area > 0.5f * target_area) {
+                best_ratio = ratio;
+            }
+        }
+    }
+    return best_ratio;
+}
+
+bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+    // append tile_size x tile_size local tiles to output.entries when the image exceeds a tile
+    // in either dimension, then always one base_size x base_size global view. order: [tiles..., global].
+
+    if (img.nx > tile_size || img.ny > tile_size) {
+        const float           aspect_ratio  = static_cast<float>(img.nx) / img.ny; // NOTE(review): img.ny == 0 would divide by zero - confirm callers reject empty images
+        const auto            target_ratios = get_target_ratios();
+        const clip_image_size grid          = find_closest_aspect_ratio(aspect_ratio, target_ratios, img.nx, img.ny);
+
+        // stretch onto the grid (no aspect preserve), then crop tiles row-major.
+        clip_image_u8 refined;
+        img_tool::resize(img, refined, { tile_size * grid.width, tile_size * grid.height },
+                         RESIZE_ALGO_BICUBIC_PILLOW, PAD_NONE);
+
+        for (int row = 0; row < grid.height; row++) {
+            for (int col = 0; col < grid.width; col++) {
+                clip_image_u8 tile;
+                img_tool::crop(refined, tile, col * tile_size, row * tile_size, tile_size, tile_size);
+                clip_image_f32_ptr res(clip_image_f32_init());
+                img_u8_to_f32(tile, *res, hparams.image_mean, hparams.image_std); // add_viewsep keeps its default (false) for tiles
+                output.entries.push_back(std::move(res));
+            }
+        }
+    }
+
+    // global view: aspect-preserving fit-and-pad to base_size.
+    clip_image_u8 padded;
+    img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW,
+                     PAD_NEAREST, hparams.image_pad_color);
+    clip_image_f32_ptr global(clip_image_f32_init());
+    img_u8_to_f32(padded, *global, hparams.image_mean, hparams.image_std);
+    global->add_viewsep = true; // only the global view is flagged for a trailing view separator
+    output.entries.push_back(std::move(global));
+
+    output.grid_x = 1; // grid is reported as 1x1 whether or not tiles were emitted
+    output.grid_y = 1;
+    return true;
+}
+
 //
 // mtmd_image_preprocessor_step3vl
 //
--- a/tools/mtmd/mtmd-image.h
+++ b/tools/mtmd/mtmd-image.h
@ -144,6 +144,26 @@ struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
 };

+// DeepSeek-OCR-2: a 1024x1024 global view, plus InternVL-style 768x768 local
+// tiles when the image is larger than a tile in either dimension.
+struct mtmd_image_preprocessor_deepseekocr2 : mtmd_image_preprocessor {
+    static constexpr int base_size = 1024; // side length in pixels of the square global view
+    static constexpr int tile_size = 768;  // side length in pixels of each square local tile
+    static constexpr int min_tiles = 2; // NOTE(review): presumably the lower bound on the tile count used by get_target_ratios() -- its definition is not in this header, confirm
+    static constexpr int max_tiles = 6; // NOTE(review): presumably the upper bound on the tile count used by get_target_ratios() -- confirm against its definition
+
+    mtmd_image_preprocessor_deepseekocr2(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
+    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; // appends the local tiles (if any) followed by the global view to output.entries
+
+private:
+    static std::vector<clip_image_size> get_target_ratios(); // candidate grids; preprocess() hands the result to find_closest_aspect_ratio()
+    static clip_image_size              find_closest_aspect_ratio( // returns the chosen grid; preprocess() uses its width/height as tile columns/rows
+        float                                aspect_ratio,
+        const std::vector<clip_image_size> & target_ratios,
+        int                                  width,
+        int                                  height);
+};
+
 // custom image preprocessing for Step3VL
 // ref: https://huggingface.co/stepfun-ai/Step3-VL-10B/blob/main/processing_step3.py
 struct mtmd_image_preprocessor_step3vl : mtmd_image_preprocessor_llava_uhd {
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@ -493,6 +493,11 @@ struct mtmd_context {
                    img_end = "\n"; // prevent empty batch on llama-server
                    image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
                } break;
+            case PROJECTOR_TYPE_DEEPSEEKOCR2:
+                {
+                    img_end = "\n"; // prevent empty batch on llama-server
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr2>(ctx_v);
+                } break;
            case PROJECTOR_TYPE_HUNYUANVL:
                {
                    // note: these use fullwidth ｜ (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
@ -1091,16 +1096,21 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
    if (clip_is_llava(ctx_clip)
        || proj_type == PROJECTOR_TYPE_MINICPMV
        || proj_type == PROJECTOR_TYPE_GLM_EDGE
-        || proj_type == PROJECTOR_TYPE_INTERNVL) {
+        || proj_type == PROJECTOR_TYPE_INTERNVL
+        || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) {
        // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
        const auto & entries = image_tokens->batch_f32.entries;
+        // entries may have different token counts
+        // e.g., DeepSeek-OCR-2: 144 tokens per tile view, 257 for the global view
+        size_t offset = 0;
        for (size_t i = 0; i < entries.size(); i++) {
            int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
            ok = clip_image_encode(
                ctx_clip,
                ctx->n_threads,
                entries[i].get(),
-                ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
+                ctx->image_embd_v.data() + offset);
+            offset += static_cast<size_t>(n_mmproj_embd) * n_tokens_per_image;
        }
    } else {
        ok = clip_image_batch_encode(
--- a/tools/mtmd/tests/test-deepseek-ocr.py
+++ b/tools/mtmd/tests/test-deepseek-ocr.py
@ -3,7 +3,7 @@
 Evaluates llama.cpp's DeepSeek-OCR by comparing its output for a test
 image to the actual text in part of that image.

-Runs the test image through mtmd-cli, calculates CER and chrF for
+Runs each test image through mtmd-cli, calculates CER and chrF for
 its output, and holds them against the HF model's scores.
 """

@ -12,24 +12,81 @@ import logging
 import subprocess
 import sys
 import unicodedata
+from dataclasses import dataclass
 from pathlib import Path

 logger = logging.getLogger("deepseek-ocr-test")

-DEFAULT_IMAGE = "test-1.jpeg"
-DEFAULT_EXPECTED_TEXT = "test-1-ground-truth.txt"
 RUN_TIMEOUT = 300

-# DeepSeek-OCR reference scores on the test image.
-# This is the baseline the implementation should keep up with.
-HF_REFERENCE_CER = 0.3030
-HF_REFERENCE_CHRF = 67.52

-CER_TOLERANCE = 0.02
-CHRF_TOLERANCE = 2.0
+@dataclass
+class ModelSpec:  # one model variant under test, plus the CLI flags and default paths for its GGUF files
+    key: str  # same string as this spec's key in MODELS; TestCase.model_key refers to it
+    label: str  # human-readable model name, used in --help text and in per-case titles
+    model_arg: str  # CLI flag that takes the model GGUF path, e.g. "--llama-model"
+    mmproj_arg: str  # CLI flag that takes the mmproj GGUF path, e.g. "--mmproj"
+    model_default: str  # default value registered for model_arg
+    mmproj_default: str  # default value registered for mmproj_arg

-CER_MAX = HF_REFERENCE_CER + CER_TOLERANCE
-CHRF_MIN = HF_REFERENCE_CHRF - CHRF_TOLERANCE
+
+@dataclass
+class TestCase:  # one (model, image) evaluation together with its reference scores and tolerances
+    model_key: str  # key into MODELS selecting the model this case runs against
+    label: str  # short description of the case, shown in the per-case title
+    image: str  # path of the input image; main() resolves it against the repo root
+    ground_truth: str  # path of the expected transcript; main() resolves it against the repo root
+    hf_cer: float  # reference CER for this image (lower is better)
+    hf_chrf: float  # reference chrF for this image (higher is better)
+    cer_tol: float  # how far above hf_cer the measured CER may be and still pass
+    chrf_tol: float  # how far below hf_chrf the measured chrF may be and still pass
+
+    @property
+    def cer_max(self) -> float:  # pass threshold: evaluate() requires cer <= cer_max
+        return self.hf_cer + self.cer_tol
+
+    @property
+    def chrf_min(self) -> float:  # pass threshold: evaluate() requires chrf >= chrf_min
+        return self.hf_chrf - self.chrf_tol
+
+
+MODELS = {  # ModelSpec per model variant, keyed by ModelSpec.key; argument_parser() registers one flag pair per entry
+    "v1": ModelSpec(
+        key="v1", label="DeepSeek-OCR",
+        model_arg="--llama-model", mmproj_arg="--mmproj",
+        model_default="gguf_models/deepseek-ai/deepseek-ocr-bf16.gguf",
+        mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-bf16.gguf",
+    ),
+    "v2": ModelSpec(
+        key="v2", label="DeepSeek-OCR-2",
+        model_arg="--llama-model-2", mmproj_arg="--mmproj-2",  # distinct flag names, so v1 and v2 paths can both be given in one run
+        model_default="gguf_models/deepseek-ai/deepseek-ocr-2-bf16.gguf",
+        mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-2-bf16.gguf",
+    ),
+}
+
+CASES = [  # every case main() runs, in order; each picks its model through model_key
+    TestCase(
+        model_key="v1", label="single-view scan",
+        image="tools/mtmd/test-1.jpeg",
+        ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
+        hf_cer=0.3030, hf_chrf=67.52, cer_tol=0.02, chrf_tol=2.0,
+    ),
+    TestCase(
+        model_key="v2", label="single-view scan",
+        image="tools/mtmd/test-1.jpeg",
+        ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
+        # 640x488 is below the 768 tiling threshold -- single 1024 global view.
+        # hf_cer/hf_chrf are the deepseek-ai repo's own scores (ImageOps.pad);
+        # the transformers HF processor is *not* the reference -- its pad_to_square
+        # is one pixel off and lands at ~0.69 instead.
+        hf_cer=0.7761, hf_chrf=28.70, cer_tol=0.12, chrf_tol=8.0,  # much wider tolerances than the v1 case above
+    ),
+]
+
+
+def arg_dest(flag: str) -> str:  # "--llama-model-2" -> "llama_model_2": the attribute name main() reads from the parsed args for that flag
+    return flag.lstrip("-").replace("-", "_")  # drop the leading dashes, then turn the remaining dashes into underscores


 def verdict(ok: bool) -> str:
@ -84,6 +141,14 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
        "--temp", "0",
        "--flash-attn", "off",  # match the HF "eager" attention reference
        "--no-warmup",
+        "-n", "512",  # cap loops on hard images (KV would otherwise fill)
+        # HF decodes with no_repeat_ngram_size; llama.cpp's analog is DRY.
+        # Default DRY breakers include "\n", so they are cleared below.
+        "--dry-multiplier", "0.8",
+        "--dry-base", "1.75",
+        "--dry-allowed-length", "2",
+        "--dry-penalty-last-n", "-1",
+        "--dry-sequence-breaker", "none",
    ]
    logger.debug(f"  command: {' '.join(cmd)}")

@ -110,7 +175,7 @@ def read_expected_text(file_path: Path) -> str:
        return f.read().strip()


-def evaluate(expected: str, ocr_out: str) -> bool:
+def evaluate(case: "TestCase", expected: str, ocr_out: str) -> bool:
    expected = normalize_text(expected)
    ocr_out = normalize_text(ocr_out)
    aligned = locally_align(expected, ocr_out)
@ -122,16 +187,16 @@ def evaluate(expected: str, ocr_out: str) -> bool:
    cer = compute_cer(expected, aligned)
    chrf = compute_chrf(expected, aligned)

-    cer_pass = cer <= CER_MAX
-    chrf_pass = chrf >= CHRF_MIN
+    cer_pass = cer <= case.cer_max
+    chrf_pass = chrf >= case.chrf_min
    passed = cer_pass and chrf_pass

    logger.info("")
    logger.info("=" * 60)
    logger.info("Free OCR evaluation:")
    logger.info("=" * 60)
-    logger.info(f"  CER               {cer:>7.4f}    (<= {CER_MAX:>7.4f}  -> {verdict(cer_pass)})")
-    logger.info(f"  chrF (0-100)      {chrf:>7.2f}    (>= {CHRF_MIN:>7.2f}  -> {verdict(chrf_pass)})")
+    logger.info(f"  CER               {cer:>7.4f}    (HF {case.hf_cer:.4f}, <= {case.cer_max:>7.4f}  -> {verdict(cer_pass)})")
+    logger.info(f"  chrF (0-100)      {chrf:>7.2f}    (HF {case.hf_chrf:.2f}, >= {case.chrf_min:>7.2f}  -> {verdict(chrf_pass)})")
    logger.info(f"  Expected chars    {len(expected):>7}")
    logger.info(f"  Aligned chars     {len(aligned):>7} (of {len(ocr_out)} OCR chars)")
    logger.info("")
@ -142,12 +207,13 @@ def evaluate(expected: str, ocr_out: str) -> bool:

 def argument_parser() -> argparse.ArgumentParser:
    ap = argparse.ArgumentParser(description="Compare llama.cpp DeepSeek-OCR output with a ground-truth transcript")
-    ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-bf16.gguf",
-                    help="Path to llama.cpp GGUF model (relative to repo root or absolute)")
-    ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-bf16.gguf",
-                    help="Path to mmproj GGUF file (relative to repo root or absolute)")
    ap.add_argument("--llama-bin", default="build/bin/llama-mtmd-cli",
                    help="Path to llama-mtmd-cli binary (relative to repo root or absolute)")
+    for spec in MODELS.values():  # one model flag and one mmproj flag per ModelSpec; main() reads them back via arg_dest()
+        ap.add_argument(spec.model_arg, default=spec.model_default,
+                        help=f"Path to the {spec.label} GGUF model (relative to repo root or absolute)")
+        ap.add_argument(spec.mmproj_arg, default=spec.mmproj_default,
+                        help=f"Path to the {spec.label} mmproj GGUF file (relative to repo root or absolute)")
    ap.add_argument("--verbose", action="store_true",
                    help="Also log the expected, OCR, and aligned text")
    return ap
@ -167,53 +233,60 @@ def main() -> int:
    args = argument_parser().parse_args()
    configure_logging(args.verbose)

-    tests_dir = Path(__file__).parent  # tools/mtmd/tests
-    mtmd_dir = tests_dir.parent  # tools/mtmd
-    repo_root = mtmd_dir.parent.parent  # repo root
+    repo_root = Path(__file__).resolve().parents[3]  # tests -> mtmd -> tools -> repo root
+    binary = resolve_path(args.llama_bin, repo_root)

-    inputs = [
-        ("image", resolve_path(DEFAULT_IMAGE, mtmd_dir)),
-        ("expected-text", resolve_path(DEFAULT_EXPECTED_TEXT, tests_dir)),
-        ("model", resolve_path(args.llama_model, repo_root)),
-        ("mmproj", resolve_path(args.mmproj, repo_root)),
-        ("binary", resolve_path(args.llama_bin, repo_root)),
-    ]
-    for label, path in inputs:
-        if not path.exists():
-            logger.error(f"Error: {label} not found: {path}")
-            return 1
-    paths = dict(inputs)
-
-    logger.info("=" * 60)
-    logger.info("DeepSeek-OCR: llama.cpp vs ground-truth comparison")
-    logger.info("=" * 60)
-    logger.info(f"HF baselines: CER {HF_REFERENCE_CER:.4f}, chrF {HF_REFERENCE_CHRF:.2f}")
-    logger.info(f"Test thresholds: CER <= {CER_MAX:.4f}, chrF >= {CHRF_MIN:.2f}")
-
-    logger.debug("")
-    logger.debug("Resolved test inputs:")
-    for label, path in inputs:
-        logger.debug(f"  {label:<14} {path}")
-
-    logger.info("")
-    logger.info("[1/3] Running llama.cpp 'Free OCR'")
-    try:
-        ocr_out = run_mtmd_cli(paths["model"], paths["mmproj"],
-                               paths["image"], paths["binary"])
-    except RuntimeError as e:
-        logger.error(f"Error: {e}")
+    if not binary.exists():
+        logger.error(f"Error: binary not found: {binary}")
        return 1

-    logger.info("")
-    logger.info("[2/3] Reading expected output")
-    expected = read_expected_text(paths["expected-text"])
-    logger.info(f"  expected: {len(expected)} chars")
+    logger.info("=" * 60)
+    logger.info("DeepSeek-OCR: llama.cpp vs HF parity check")
+    logger.info("=" * 60)
+
+    results = {}
+    for case in CASES:
+        model_spec = MODELS[case.model_key]
+        title = f"{model_spec.label} -- {case.label}"
+
+        logger.info("")
+        logger.info(f"=== {title} ===")
+
+        model = resolve_path(getattr(args, arg_dest(model_spec.model_arg)), repo_root)
+        mmproj = resolve_path(getattr(args, arg_dest(model_spec.mmproj_arg)), repo_root)
+        image = resolve_path(case.image, repo_root)
+        ground_truth = resolve_path(case.ground_truth, repo_root)
+
+        missing = [(lbl, p) for lbl, p in [("model", model), ("mmproj", mmproj),
+                                           ("image", image), ("ground-truth", ground_truth)]
+                   if not p.exists()]
+        if missing:
+            for lbl, p in missing:
+                logger.error(f"  Error: {lbl} not found: {p}")
+            results[title] = False
+            continue
+
+        expected = read_expected_text(ground_truth)
+        logger.info(f"  Image: {case.image}")
+        logger.info(f"  Expected text: {len(expected)} chars")
+        logger.info("  Running llama.cpp 'Free OCR'")
+        try:
+            ocr_out = run_mtmd_cli(model, mmproj, image, binary)
+        except RuntimeError as e:
+            logger.error(f"  Error: {e}")
+            results[title] = False
+            continue
+
+        results[title] = evaluate(case, expected, ocr_out)

    logger.info("")
-    logger.info("[3/3] Computing OCR metrics")
-    ok = evaluate(expected, ocr_out)
+    logger.info("=== Summary ===")
+    for title, ok in results.items():
+        logger.info(f"  {title:<48} {verdict(ok)}")
+    all_passed = all(results.values())
+    logger.info(f"Overall: {verdict(all_passed)}")

-    return 0 if ok else 1
+    return 0 if all_passed else 1


 if __name__ == "__main__":