diff --git a/conversion/__init__.py b/conversion/__init__.py index 89e05d691..cfaa24ba1 100644 --- a/conversion/__init__.py +++ b/conversion/__init__.py @@ -237,6 +237,7 @@ TEXT_MODEL_MAP: dict[str, str] = { MMPROJ_MODEL_MAP: dict[str, str] = { "AudioFlamingo3ForConditionalGeneration": "ultravox", "CogVLMForCausalLM": "cogvlm", + "DeepseekOCR2ForCausalLM": "deepseek", "DeepseekOCRForCausalLM": "deepseek", "DotsOCRForCausalLM": "dotsocr", "Gemma3ForConditionalGeneration": "gemma", diff --git a/conversion/base.py b/conversion/base.py index f861f8b52..44b2c964f 100644 --- a/conversion/base.py +++ b/conversion/base.py @@ -1140,7 +1140,7 @@ class TextModel(ModelBase): # Skip multimodal tensors if name.startswith(("mlp", "vit.", "vpm.", "siglip2.", "conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.", "speech_embeddings.")) \ or "visual." in name or "vision." in name or "audio." in name or "talker." in name \ - or "vision_" in name or "audio_" in name or "sam_model" in name \ + or "vision_" in name or "audio_" in name \ or "token2wav." in name or "code2wav." in name \ or "projector." in name or "pre_mm_projector_norm" in name \ or "image_newline" in name or "view_seperator" in name \ diff --git a/conversion/deepseek.py b/conversion/deepseek.py index af18a25a8..72520cc9f 100644 --- a/conversion/deepseek.py +++ b/conversion/deepseek.py @@ -16,10 +16,14 @@ from .qwen import QwenModel @ModelBase.register("DeepseekOCRForCausalLM") class DeepseekOCRVisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.clip_projector_type = gguf.VisionProjectorType.DEEPSEEKOCR + def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR) + self.gguf_writer.add_clip_projector_type(self.clip_projector_type) # default values below are taken from HF tranformers code self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) self.gguf_writer.add_vision_use_gelu(True) @@ -49,22 +53,27 @@ class DeepseekOCRVisionModel(MmprojModel): raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found") vision_config['sam'] = vision_config['width']['sam_vit_b'] - vision_config.update(vision_config['width']['clip-l-14-224']) - vision_config['hidden_size'] = vision_config['width'] - vision_config['num_heads'] = vision_config['heads'] - vision_config['intermediate_size'] = vision_config['heads'] * 4 + if vision_config['width'].get('clip-l-14-224') is not None: + vision_config.update(vision_config['width']['clip-l-14-224']) + if isinstance(vision_config['width'], int): + vision_config['hidden_size'] = vision_config['width'] + if vision_config.get('heads') is not None: + vision_config['num_heads'] = vision_config['heads'] + vision_config['intermediate_size'] = vision_config['heads'] * 4 return vision_config def tensor_force_quant(self, name, new_name, bid, n_dims): - if ".embeddings." in name or 'pos_embed' in name: - return gguf.GGMLQuantizationType.F32 - if ".rel_pos_h" in name or '.rel_pos_w' in name: - return gguf.GGMLQuantizationType.F32 - if ".neck." in name or ".net_" in name: - return gguf.GGMLQuantizationType.F32 + for nq_name in ('.embeddings.', 'pos_embed', '.rel_pos_h', '.rel_pos_w', '.neck.', '.net_'): + if nq_name in name: + return gguf.GGMLQuantizationType.F32 return super().tensor_force_quant(name, new_name, bid, n_dims) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.endswith("view_seperator"): + data_torch = data_torch.unsqueeze(0) + yield from super().modify_tensors(data_torch, name, bid) + @classmethod def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: name, gen = item @@ -81,6 +90,33 @@ class DeepseekOCRVisionModel(MmprojModel): return super().filter_tensors((name, gen)) +@ModelBase.register("DeepseekOCR2ForCausalLM") +class DeepseekOCR2VisionModel(DeepseekOCRVisionModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.clip_projector_type = gguf.VisionProjectorType.DEEPSEEKOCR2 + + def set_gguf_parameters(self): + # the vision tower's qwen2 encoder is built from fixed defaults, + # see build_qwen2_decoder_as_encoder() in deepencoderv2.py + if self.hparams.get("patch_size") is None: + self.hparams["patch_size"] = 16 + if self.hparams.get("intermediate_size") is None: + self.hparams["intermediate_size"] = 4864 + if self.hparams.get("num_attention_heads") is None: + self.hparams["num_attention_heads"] = 14 + super().set_gguf_parameters() + # qwen2 encoder is GQA: 14 Q heads, 2 KV heads + self.gguf_writer.add_vision_head_count_kv(2) + + def get_vision_config(self) -> dict[str, Any]: + vision_config = super().get_vision_config() + vision_config['hidden_size'] = vision_config['width']['qwen2-0-5b']['dim'] + if vision_config.get('layers') is None: + vision_config['layers'] = 24 + return vision_config + + @ModelBase.register("DeepseekForCausalLM") class DeepseekModel(TextModel): model_arch = gguf.MODEL_ARCH.DEEPSEEK @@ -188,13 +224,21 @@ class DeepseekV2Model(TextModel): self.origin_hf_arch = hparams.get('architectures', [None])[0] # special handling for Deepseek OCR - if self.origin_hf_arch == "DeepseekOCRForCausalLM": + if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"): self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] self.gguf_writer.add_architecture() # default jinja template self.gguf_writer.add_chat_template("{% for m in messages %}{{m['content']}}{% endfor %}") + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, _ = item + # DeepSeek-OCR vision encoder (SAM + DeepSeek-OCR-2 qwen2 tower) + if "sam_model" in name or "qwen2_model" in name: + return None + return super().filter_tensors(item) + def set_vocab(self): try: self._set_vocab_gpt2() diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 92578490c..5a567e2d1 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -812,6 +812,8 @@ class MODEL_TENSOR(IntEnum): V_SAM_NET_3 = auto() # Deepseek-OCR V_ENC_EMBD_IMGNL = auto() # Deepseek-OCR V_ENC_EMBD_VSEP = auto() # Deepseek-OCR + V_RESMPL_QUERY_768 = auto() # Deepseek-OCR-2 + V_RESMPL_QUERY_1024 = auto() # Deepseek-OCR-2 # audio (mtmd) A_ENC_EMBD_POS = auto() @@ -1329,6 +1331,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.V_SAM_NET_3: "v.sam.net_3", MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_seperator", # Deepseek-OCR + MODEL_TENSOR.V_RESMPL_QUERY_768: "v.resample_query_768", # Deepseek-OCR-2 qwen2 + MODEL_TENSOR.V_RESMPL_QUERY_1024: "v.resample_query_1024", # Deepseek-OCR-2 qwen2 # audio (mtmd) # note: all audio tensor names must use prefix "a." or "mm.a." MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd", @@ -1507,6 +1511,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.V_SAM_NECK, MODEL_TENSOR.V_SAM_NET_2, MODEL_TENSOR.V_SAM_NET_3, + MODEL_TENSOR.V_RESMPL_QUERY_768, + MODEL_TENSOR.V_RESMPL_QUERY_1024, # audio MODEL_TENSOR.A_ENC_EMBD_POS, MODEL_TENSOR.A_ENC_EMBD_NORM, @@ -4329,6 +4335,7 @@ class VisionProjectorType: JANUS_PRO = "janus_pro" DOTSOCR = "dots_ocr" DEEPSEEKOCR = "deepseekocr" + DEEPSEEKOCR2 = "deepseekocr2" LFM2A = "lfm2a" # audio MUSIC_FLAMINGO = "musicflamingo" # audio GLM4V = "glm4v" diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index ecc3c05f9..444f0f285 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1485,6 +1485,7 @@ class TensorNameMap: "siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl "model.vision_model.transformer.layers.{bid}.self_attn.q_proj", # Deepseek-OCR CLIP, generated "vision_model.model.layers.{bid}.self_attn.q_proj.linear", # gemma4 + "model.qwen2_model.model.model.layers.{bid}.self_attn.q_proj" # Deepseek-OCR-2 qwen2 ), MODEL_TENSOR.V_ENC_ATTN_Q_NORM: ( @@ -1509,6 +1510,7 @@ class TensorNameMap: "model.vision_model.transformer.layers.{bid}.self_attn.k_proj", # Deepseek-OCR CLIP, generated "siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj", "vision_model.model.layers.{bid}.self_attn.k_proj.linear", # gemma4 + "model.qwen2_model.model.model.layers.{bid}.self_attn.k_proj" # Deepseek-OCR-2 qwen2 ), MODEL_TENSOR.V_ENC_ATTN_K_NORM: ( @@ -1533,6 +1535,7 @@ class TensorNameMap: "siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj", "model.vision_model.transformer.layers.{bid}.self_attn.v_proj", # Deepseek-OCR CLIP, generated "vision_model.model.layers.{bid}.self_attn.v_proj.linear", # gemma4 + "model.qwen2_model.model.model.layers.{bid}.self_attn.v_proj" # Deepseek-OCR-2 qwen2 ), MODEL_TENSOR.V_ENC_INPUT_NORM: ( @@ -1554,6 +1557,7 @@ class TensorNameMap: "vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL "vision_tower.blocks.{bid}.norm1", # dots.ocr "vision_model.transformer.resblocks.{bid}.ln_1", # Step3-VL + "model.qwen2_model.model.model.layers.{bid}.input_layernorm", # Deepseek-OCR-2 qwen2 ), MODEL_TENSOR.V_ENC_ATTN_O: ( @@ -1574,6 +1578,7 @@ class TensorNameMap: "model.vision_model.transformer.layers.{bid}.self_attn.out_proj", # Deepseek-OCR CLIP "siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl "vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL + "model.qwen2_model.model.model.layers.{bid}.self_attn.o_proj", # Deepseek-OCR-2 qwen2 "vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4 "vision_tower.blocks.{bid}.attn.proj", # dots.ocr "vision_model.transformer.resblocks.{bid}.attn.out_proj", # Step3-VL @@ -1603,6 +1608,7 @@ class TensorNameMap: "vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4 "vision_tower.blocks.{bid}.norm2", # dots.ocr "vision_model.transformer.resblocks.{bid}.ln_2", # Step3-VL + "model.qwen2_model.model.model.layers.{bid}.post_attention_layernorm", # Deepseek-OCR-2 qwen2 ), MODEL_TENSOR.V_ENC_FFN_UP: ( @@ -1625,6 +1631,7 @@ class TensorNameMap: "vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL "vision_model.model.layers.{bid}.mlp.up_proj", # gemma4 "vision_model.transformer.resblocks.{bid}.mlp.c_fc", # Step3-VL + "model.qwen2_model.model.model.layers.{bid}.mlp.up_proj", # Deepseek-OCR-2 qwen2 ), MODEL_TENSOR.V_ENC_FFN_GATE: ( @@ -1632,6 +1639,7 @@ class TensorNameMap: "vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral "visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl "vision_model.model.layers.{bid}.mlp.gate_proj", # gemma4 + "model.qwen2_model.model.model.layers.{bid}.mlp.gate_proj", # Deepseek-OCR-2 qwen2 ), MODEL_TENSOR.V_ENC_FFN_DOWN: ( @@ -1652,6 +1660,7 @@ class TensorNameMap: "model.vision_model.transformer.layers.{bid}.mlp.fc2", # Deepseek-OCR CLIP "siglip2.vision_model.encoder.layers.{bid}.mlp.fc2", "vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL + "model.qwen2_model.model.model.layers.{bid}.mlp.down_proj" , # Deepseek-OCR-2 qwen2 "vision_model.model.layers.{bid}.mlp.down_proj", # gemma4 "vision_model.transformer.resblocks.{bid}.mlp.c_proj", # Step3-VL ), @@ -1699,6 +1708,7 @@ class TensorNameMap: "vision_tower.encoder.final_layernorm", # kimi-vl "visual.post_layernorm", # glm4v "siglip2.vision_model.post_layernorm", + "model.qwen2_model.model.model.norm", # Deepseek-OCR-2 qwen2 ), MODEL_TENSOR.V_MM_POST_NORM: ( @@ -1879,6 +1889,14 @@ class TensorNameMap: "model.sam_model.net_3", ), + MODEL_TENSOR.V_RESMPL_QUERY_768: ( + "model.qwen2_model.query_768", # Deepseek-OCR-2 qwen2 + ), + + MODEL_TENSOR.V_RESMPL_QUERY_1024: ( + "model.qwen2_model.query_1024", # Deepseek-OCR-2 qwen2 + ), + MODEL_TENSOR.V_MM_POST_FC_NORM: ( "model.vision.linear_proj.norm1", # cogvlm ), diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index ffd30c7e6..14808d422 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -40,6 +40,7 @@ add_library(mtmd models/siglip.cpp models/whisper-enc.cpp models/deepseekocr.cpp + models/deepseekocr2.cpp models/mobilenetv5.cpp models/youtuvl.cpp models/yasa2.cpp diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index ef4c342ba..14398dc48 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -188,6 +188,8 @@ #define TN_SAM_FFN_DOWN "v.sam.blk.%d.mlp.lin2.%s" #define TN_SAM_NECK "v.sam.neck.%d.%s" #define TN_SAM_NET "v.sam.net_%d.%s" +// deepseek-ocr-2 +#define TN_RESMPL_QUERY "v.resample_query_%d.%s" // (conformer) lfm2 #define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s" #define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s" @@ -337,6 +339,7 @@ enum projector_type { PROJECTOR_TYPE_JANUS_PRO, PROJECTOR_TYPE_DOTS_OCR, PROJECTOR_TYPE_DEEPSEEKOCR, + PROJECTOR_TYPE_DEEPSEEKOCR2, PROJECTOR_TYPE_LFM2A, PROJECTOR_TYPE_GLM4V, PROJECTOR_TYPE_YOUTUVL, @@ -386,6 +389,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"}, { PROJECTOR_TYPE_DOTS_OCR, "dots_ocr"}, { PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"}, + { PROJECTOR_TYPE_DEEPSEEKOCR2,"deepseekocr2"}, { PROJECTOR_TYPE_LFM2A, "lfm2a"}, { PROJECTOR_TYPE_GLM4V, "glm4v"}, { PROJECTOR_TYPE_YOUTUVL, "youtuvl"}, @@ -424,6 +428,9 @@ struct clip_image_f32 { int ny; std::vector buf; + + // marks the global view in e.g., DeepSeek-OCR Models + bool add_viewsep = false; }; // diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index e0de41e0b..1f3657a85 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -542,6 +542,11 @@ struct clip_model { int32_t n_sam_layers = 12; // used by deepseek-ocr sam encoder std::vector sam_layers; + + // deepseek-ocr-2 + ggml_tensor * resample_query_768 = nullptr; + ggml_tensor * resample_query_1024 = nullptr; + // lfm2 audio std::array pre_encode_conv_X_w = {nullptr}; std::array pre_encode_conv_X_b = {nullptr}; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index a7aa297c5..7bb702b95 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -953,6 +953,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_DEEPSEEKOCR2: + { + builder = std::make_unique(ctx, img); + } break; case PROJECTOR_TYPE_LFM2A: { builder = std::make_unique(ctx, img); @@ -1514,6 +1518,7 @@ struct clip_model_loader { hparams.set_warmup_n_tokens(28*28); // avoid OOM on warmup } break; case PROJECTOR_TYPE_DEEPSEEKOCR: + case PROJECTOR_TYPE_DEEPSEEKOCR2: { hparams.patch_size = 16; hparams.image_size = 1024; @@ -1525,6 +1530,10 @@ struct clip_model_loader { get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true); get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true); get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true); + if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) { + // qwen2 encoder is GQA, requires KEY_N_HEAD_KV + get_u32(string_format(KEY_N_HEAD_KV, "vision"), hparams.n_head_kv); + } } break; case PROJECTOR_TYPE_HUNYUANVL: { @@ -2374,6 +2383,7 @@ struct clip_model_loader { model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); } break; case PROJECTOR_TYPE_DEEPSEEKOCR: + case PROJECTOR_TYPE_DEEPSEEKOCR2: { model.pos_embed = get_tensor(string_format(TN_SAM_POS_EMBD, "weight")); model.patch_embed_proj_w = get_tensor(string_format(TN_SAM_PATCH_EMBD, "weight")); @@ -2404,10 +2414,12 @@ struct clip_model_loader { model.neck_3_w = get_tensor(string_format(TN_SAM_NECK, 3, "weight")); model.net_2 = get_tensor(string_format(TN_SAM_NET, 2, "weight")); model.net_3 = get_tensor(string_format(TN_SAM_NET, 3, "weight")); - model.image_newline = get_tensor(TN_IMAGE_NEWLINE); + model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false); model.view_seperator = get_tensor(TN_IMAGE_SEPERATOR); model.mm_fc_w = get_tensor(string_format(TN_MM_PROJECTOR, "weight")); model.mm_fc_b = get_tensor(string_format(TN_MM_PROJECTOR, "bias")); + model.resample_query_768 = get_tensor(string_format(TN_RESMPL_QUERY, 768, "weight"), false); + model.resample_query_1024 = get_tensor(string_format(TN_RESMPL_QUERY, 1024, "weight"), false); } break; case PROJECTOR_TYPE_GEMMA4A: { @@ -3277,7 +3289,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_DEEPSEEKOCR: { // SAM encoder applies two stride-2 convolutions (net_2 and net_3) - // which reduces spatial dimensions by 4x in each direction (16x total) + // that reduce spatial dimensions by 4x in each direction (16x total) // E.g., 64x64 -> 16x16 patches n_patches /= 16; @@ -3293,6 +3305,15 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im int oh = (img->ny / patch_size) / merge; n_patches = (ow + 1) * oh + 2; } break; + case PROJECTOR_TYPE_DEEPSEEKOCR2: + { + // 1024 global view -> 256 query tokens + 1 view separator = 257; + // 768 local tile -> 144 query tokens, no separator. + n_patches /= 16; + if (img->add_viewsep) { + n_patches += 1; // view separator, appended only after the global view + } + } break; case PROJECTOR_TYPE_LFM2A: { n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2; @@ -3882,6 +3903,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima set_input_i32("pos_y", pos_y); } break; case PROJECTOR_TYPE_DEEPSEEKOCR: + case PROJECTOR_TYPE_DEEPSEEKOCR2: { GGML_ASSERT(pos_w == pos_h); @@ -3904,6 +3926,34 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima set_input_i32("rel_pos_indices_local", rel_pos_indices_local); set_input_i32("rel_pos_indices_global", rel_pos_indices_global); + + if (ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR2) { + + // qwen2 encoder attention mask + + // num_image_tokens = num_patches / 16 + // 256 for 1024 global view + // 144 for 768 tile views + const int num_image_tokens = num_patches / 16; + const int seq_len = num_image_tokens * 2; + std::vector qwen2_mask(static_cast(seq_len) * seq_len, 0.0f); + + // attention mask layout + // +--------------+---------------+ + // | all 0 | all -inf | + // +--------------+---------------+ + // | all 0 | lower tri 0 | + // +--------------+---------------+ + for (int i = 0; i < seq_len; i++) { + for (int j = 0; j < seq_len; j++) { + const bool zero = i < num_image_tokens ? + j < num_image_tokens : + j < num_image_tokens || j <= i; + qwen2_mask[static_cast(i) * seq_len + j] = zero ? 0.0f : -1e9f; + } + } + set_input_f32("qwen2_attn_mask", qwen2_mask); + } } break; case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_GEMMA3NV: @@ -4256,6 +4306,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_COGVLM: return ctx->model.mm_4h_to_h_w->ne[1]; case PROJECTOR_TYPE_DEEPSEEKOCR: + case PROJECTOR_TYPE_DEEPSEEKOCR2: return ctx->model.mm_fc_w->ne[1]; case PROJECTOR_TYPE_LFM2A: return ctx->model.position_embeddings->ne[0]; diff --git a/tools/mtmd/models/deepseekocr.cpp b/tools/mtmd/models/deepseekocr.cpp index 8419d496a..c3c22d0a4 100644 --- a/tools/mtmd/models/deepseekocr.cpp +++ b/tools/mtmd/models/deepseekocr.cpp @@ -157,7 +157,6 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) { cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); cur = ggml_add(ctx0, cur, layer.qkv_b); - cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B); ggml_tensor * Q; @@ -251,17 +250,17 @@ ggml_cgraph * clip_graph_deepseekocr::build() { ggml_tensor * inp_raw = build_inp_raw(); ggml_tensor * sam_out = build_sam(inp_raw); + const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1]; + ggml_tensor * clip_out; // Building DS-OCR CLIP { ggml_tensor * inp; - inp = ggml_cpy(ctx0, sam_out, ggml_dup_tensor(ctx0, sam_out)); - inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2]); + inp = ggml_reshape_2d(ctx0, sam_out, clip_n_patches, sam_out->ne[2]); inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); - ggml_tensor * new_pos_embd = - ggml_cpy(ctx0, model.position_embeddings, ggml_dup_tensor(ctx0, model.position_embeddings)); + ggml_tensor * new_pos_embd = model.position_embeddings; int n_pos = new_pos_embd->ne[1]; // +1 for [CLS] const auto tgt_size = static_cast(std::sqrt(inp->ne[1])); @@ -295,16 +294,12 @@ ggml_cgraph * clip_graph_deepseekocr::build() { clip_out = cur; } - const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1]; - sam_out = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3)); sam_out = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches); clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]); ggml_tensor * cur; cur = ggml_concat(ctx0, clip_out, sam_out, 0); - cur = ggml_reshape_2d(ctx0, cur, 2 * n_embd, clip_n_patches); - cur = ggml_cont(ctx0, cur); cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur); cur = ggml_add(ctx0, cur, model.mm_fc_b); @@ -313,13 +308,11 @@ ggml_cgraph * clip_graph_deepseekocr::build() { const auto n_dim = cur->ne[0]; ggml_tensor * imgnl; - ggml_tensor * vs; imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1); - vs = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1); // (n_dim, 1) cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h); cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h); - cur = ggml_concat(ctx0, cur, vs, 1); // (n_dim, h*(w+1) + 1) + cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, h*(w+1) + 1) cb(cur, "dsocr_output", -1); diff --git a/tools/mtmd/models/deepseekocr2.cpp b/tools/mtmd/models/deepseekocr2.cpp new file mode 100644 index 000000000..056bb8180 --- /dev/null +++ b/tools/mtmd/models/deepseekocr2.cpp @@ -0,0 +1,81 @@ +#include "models.h" + +ggml_cgraph * clip_graph_deepseekocr2::build() { + GGML_ASSERT(hparams.n_head_kv > 0); + GGML_ASSERT(n_head % hparams.n_head_kv == 0); + + // patch embedding + ggml_tensor * inp_raw = build_inp_raw(); + + ggml_tensor * sam_out = build_sam(inp_raw); + + ggml_tensor * qwen2_out; + // Building Qwen2 encoder + { + ggml_tensor * inp; + + inp = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0] * sam_out->ne[1], sam_out->ne[2]); // H*W, C + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + + auto num_image_tokens = inp->ne[1]; // H*W + GGML_ASSERT(num_image_tokens == 144 || num_image_tokens == 256); + + // query based on numbers of image tokens (in SAM output) + // 16x16 -> query_1024 (1024x1024 images) + // 12x12 -> query_768 (768x768 images) + + ggml_tensor * query_embed = model.resample_query_1024; + int num_queries = 256; + + if (num_image_tokens == 144) { + query_embed = model.resample_query_768; + num_queries = 144; + } + + // (B, num_image_tokens + num_queries, C) + inp = ggml_concat(ctx0, inp, ggml_cast(ctx0, query_embed, inp->type), 1); + + auto seq_len = inp->ne[1]; + + // qwen2 encoder attention mask + ggml_tensor * attn_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, seq_len, seq_len); + ggml_set_name(attn_mask, "qwen2_attn_mask"); + ggml_set_input(attn_mask); + + ggml_tensor * inp_pos = ggml_cast(ctx0, ggml_arange(ctx0, 0, seq_len, 1), GGML_TYPE_I32); + + auto add_rope = [&](ggml_tensor * x, const clip_layer &) { + return ggml_rope_ext(ctx0, x, inp_pos, nullptr, d_head, + GGML_ROPE_TYPE_NEOX, 131072, 1000000, 1, 0, 1, 0, 0); + }; + + build_vit_opts vit_opts; + vit_opts.attn_mask = attn_mask; + + // build_vit applies model.post_ln_w internally; do not re-apply + ggml_tensor * cur = build_vit(inp, seq_len, NORM_TYPE_RMS, FFN_SILU, + /* learned_pos_embd */ nullptr, add_rope, vit_opts); + + cur = ggml_cont(ctx0, + ggml_view_2d(ctx0, cur, cur->ne[0], num_queries, cur->nb[1], + cur->nb[1] * (cur->ne[1] - num_queries))); // only take query tokens for output + + ggml_build_forward_expand(gf, cur); + qwen2_out = cur; + } + + ggml_tensor * cur; + + cur = ggml_mul_mat(ctx0, model.mm_fc_w, qwen2_out); + cur = ggml_add(ctx0, cur, model.mm_fc_b); + + // view_seperator only after the global view + if (img.add_viewsep) { + cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, 257) + } + + cb(cur, "dsocr2_output", -1); + + ggml_build_forward_expand(gf, cur); + return gf; +} diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 119c2d541..a856882c2 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -121,6 +121,11 @@ struct clip_graph_deepseekocr : clip_graph { ggml_tensor * build_sam(ggml_tensor * inp); // build the SAM model }; +struct clip_graph_deepseekocr2 : clip_graph_deepseekocr { + clip_graph_deepseekocr2(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph_deepseekocr(ctx, img) {} + ggml_cgraph * build() override; // reuses build_sam() from base +}; + struct clip_graph_conformer : clip_graph { clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp index 37c271d18..caf72d536 100644 --- a/tools/mtmd/mtmd-image.cpp +++ b/tools/mtmd/mtmd-image.cpp @@ -1137,6 +1137,105 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, return true; } +// +// mtmd_image_preprocessor_deepseekocr2 +// + +// candidate tile grids (cols, rows) with min_tiles <= cols*rows <= max_tiles +// sorted by tile count +std::vector mtmd_image_preprocessor_deepseekocr2::get_target_ratios() { + std::vector ratios; + for (int n = min_tiles; n <= max_tiles; n++) { + for (int w = 1; w <= n; w++) { + for (int h = 1; h <= n; h++) { + if (w * h < min_tiles || w * h > max_tiles) { + continue; + } + bool found = false; + for (const auto & r : ratios) { + if (r.width == w && r.height == h) { + found = true; + break; + } + } + if (!found) { + ratios.push_back({ w, h }); + } + } + } + } + std::sort(ratios.begin(), ratios.end(), [](const clip_image_size & a, const clip_image_size & b) { + return a.width * a.height < b.width * b.height; + }); + return ratios; +} + +// pick the grid whose aspect ratio is closest to the image +// on a tie, prefer the larger grid when the image fits +clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio( + float aspect_ratio, + const std::vector & target_ratios, + int width, + int height) { + float best_ratio_diff = std::numeric_limits::max(); + clip_image_size best_ratio = { 1, 1 }; + const float area = static_cast(width * height); + + for (const auto & ratio : target_ratios) { + const float target_aspect_ratio = static_cast(ratio.width) / ratio.height; + const float ratio_diff = std::abs(aspect_ratio - target_aspect_ratio); + if (ratio_diff < best_ratio_diff) { + best_ratio_diff = ratio_diff; + best_ratio = ratio; + } else if (ratio_diff == best_ratio_diff) { + const float target_area = static_cast(tile_size * tile_size * ratio.width * ratio.height); + if (area > 0.5f * target_area) { + best_ratio = ratio; + } + } + } + return best_ratio; +} + +bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { + // emit 768x768 local tiles when the image is larger than a tile in either + // dimension, then always a 1024x1024 global view. order: [tiles..., global]. + + if (img.nx > tile_size || img.ny > tile_size) { + const float aspect_ratio = static_cast(img.nx) / img.ny; + const auto target_ratios = get_target_ratios(); + const clip_image_size grid = find_closest_aspect_ratio(aspect_ratio, target_ratios, img.nx, img.ny); + + // stretch onto the grid (no aspect preserve), then crop tiles row-major. + clip_image_u8 refined; + img_tool::resize(img, refined, { tile_size * grid.width, tile_size * grid.height }, + RESIZE_ALGO_BICUBIC_PILLOW, PAD_NONE); + + for (int row = 0; row < grid.height; row++) { + for (int col = 0; col < grid.width; col++) { + clip_image_u8 tile; + img_tool::crop(refined, tile, col * tile_size, row * tile_size, tile_size, tile_size); + clip_image_f32_ptr res(clip_image_f32_init()); + img_u8_to_f32(tile, *res, hparams.image_mean, hparams.image_std); + output.entries.push_back(std::move(res)); + } + } + } + + // global view: aspect-preserving fit-and-pad to base_size. + clip_image_u8 padded; + img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW, + PAD_NEAREST, hparams.image_pad_color); + clip_image_f32_ptr global(clip_image_f32_init()); + img_u8_to_f32(padded, *global, hparams.image_mean, hparams.image_std); + global->add_viewsep = true; + output.entries.push_back(std::move(global)); + + output.grid_x = 1; + output.grid_y = 1; + return true; +} + // // mtmd_image_preprocessor_step3vl // diff --git a/tools/mtmd/mtmd-image.h b/tools/mtmd/mtmd-image.h index 08129a08e..91a5bc253 100644 --- a/tools/mtmd/mtmd-image.h +++ b/tools/mtmd/mtmd-image.h @@ -144,6 +144,26 @@ struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor { bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; }; +// DeepSeek-OCR-2: a 1024x1024 global view, plus InternVL-style 768x768 local +// tiles when the image is larger than a tile in either dimension. +struct mtmd_image_preprocessor_deepseekocr2 : mtmd_image_preprocessor { + static constexpr int base_size = 1024; // global view + static constexpr int tile_size = 768; // local tile + static constexpr int min_tiles = 2; + static constexpr int max_tiles = 6; + + mtmd_image_preprocessor_deepseekocr2(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} + bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; + +private: + static std::vector get_target_ratios(); + static clip_image_size find_closest_aspect_ratio( + float aspect_ratio, + const std::vector & target_ratios, + int width, + int height); +}; + // custom image preprocessing for Step3VL // ref: https://huggingface.co/stepfun-ai/Step3-VL-10B/blob/main/processing_step3.py struct mtmd_image_preprocessor_step3vl : mtmd_image_preprocessor_llava_uhd { diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 63b7e4d05..b3401634f 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -493,6 +493,11 @@ struct mtmd_context { img_end = "\n"; // prevent empty batch on llama-server image_preproc = std::make_unique(ctx_v); } break; + case PROJECTOR_TYPE_DEEPSEEKOCR2: + { + img_end = "\n"; // prevent empty batch on llama-server + image_preproc = std::make_unique(ctx_v); + } break; case PROJECTOR_TYPE_HUNYUANVL: { // note: these use fullwidth | (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary @@ -1091,16 +1096,21 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) if (clip_is_llava(ctx_clip) || proj_type == PROJECTOR_TYPE_MINICPMV || proj_type == PROJECTOR_TYPE_GLM_EDGE - || proj_type == PROJECTOR_TYPE_INTERNVL) { + || proj_type == PROJECTOR_TYPE_INTERNVL + || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) { // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() const auto & entries = image_tokens->batch_f32.entries; + // entries may have different token counts + // e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view + size_t offset = 0; for (size_t i = 0; i < entries.size(); i++) { int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get()); ok = clip_image_encode( ctx_clip, ctx->n_threads, entries[i].get(), - ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image); + ctx->image_embd_v.data() + offset); + offset += static_cast(n_mmproj_embd) * n_tokens_per_image; } } else { ok = clip_image_batch_encode( diff --git a/tools/mtmd/tests/test-deepseek-ocr.py b/tools/mtmd/tests/test-deepseek-ocr.py index 5c1980271..5f5fef765 100644 --- a/tools/mtmd/tests/test-deepseek-ocr.py +++ b/tools/mtmd/tests/test-deepseek-ocr.py @@ -3,7 +3,7 @@ Evaluates llama.cpp's DeepSeek-OCR by comparing its output for a test image to the actual text in part of that image. -Runs the test image through mtmd-cli, calculates CER and chrF for +Runs each test image through mtmd-cli, calculates CER and chrF for its output, and holds them against the HF model's scores. """ @@ -12,24 +12,81 @@ import logging import subprocess import sys import unicodedata +from dataclasses import dataclass from pathlib import Path logger = logging.getLogger("deepseek-ocr-test") -DEFAULT_IMAGE = "test-1.jpeg" -DEFAULT_EXPECTED_TEXT = "test-1-ground-truth.txt" RUN_TIMEOUT = 300 -# DeepSeek-OCR reference scores on the test image. -# This is the baseline the implementation should keep up with. -HF_REFERENCE_CER = 0.3030 -HF_REFERENCE_CHRF = 67.52 -CER_TOLERANCE = 0.02 -CHRF_TOLERANCE = 2.0 +@dataclass +class ModelSpec: + key: str + label: str + model_arg: str + mmproj_arg: str + model_default: str + mmproj_default: str -CER_MAX = HF_REFERENCE_CER + CER_TOLERANCE -CHRF_MIN = HF_REFERENCE_CHRF - CHRF_TOLERANCE + +@dataclass +class TestCase: + model_key: str + label: str + image: str + ground_truth: str + hf_cer: float + hf_chrf: float + cer_tol: float + chrf_tol: float + + @property + def cer_max(self) -> float: + return self.hf_cer + self.cer_tol + + @property + def chrf_min(self) -> float: + return self.hf_chrf - self.chrf_tol + + +MODELS = { + "v1": ModelSpec( + key="v1", label="DeepSeek-OCR", + model_arg="--llama-model", mmproj_arg="--mmproj", + model_default="gguf_models/deepseek-ai/deepseek-ocr-bf16.gguf", + mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-bf16.gguf", + ), + "v2": ModelSpec( + key="v2", label="DeepSeek-OCR-2", + model_arg="--llama-model-2", mmproj_arg="--mmproj-2", + model_default="gguf_models/deepseek-ai/deepseek-ocr-2-bf16.gguf", + mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-2-bf16.gguf", + ), +} + +CASES = [ + TestCase( + model_key="v1", label="single-view scan", + image="tools/mtmd/test-1.jpeg", + ground_truth="tools/mtmd/tests/test-1-ground-truth.txt", + hf_cer=0.3030, hf_chrf=67.52, cer_tol=0.02, chrf_tol=2.0, + ), + TestCase( + model_key="v2", label="single-view scan", + image="tools/mtmd/test-1.jpeg", + ground_truth="tools/mtmd/tests/test-1-ground-truth.txt", + # 640x488 is below the 768 tiling threshold -- single 1024 global view. + # hf_cer/hf_chrf are the deepseek-ai repo's own scores (ImageOps.pad); + # the transformers HF processor is *not* the reference -- its pad_to_square + # is one pixel off and lands at ~0.69 instead. + hf_cer=0.7761, hf_chrf=28.70, cer_tol=0.12, chrf_tol=8.0, + ), +] + + +def arg_dest(flag: str) -> str: + return flag.lstrip("-").replace("-", "_") def verdict(ok: bool) -> str: @@ -84,6 +141,14 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str: "--temp", "0", "--flash-attn", "off", # match the HF "eager" attention reference "--no-warmup", + "-n", "512", # cap loops on hard images (KV would otherwise fill) + # HF decodes with no_repeat_ngram_size; llama.cpp's analog is DRY. + # Default DRY breakers include "\n", so they are cleared below. + "--dry-multiplier", "0.8", + "--dry-base", "1.75", + "--dry-allowed-length", "2", + "--dry-penalty-last-n", "-1", + "--dry-sequence-breaker", "none", ] logger.debug(f" command: {' '.join(cmd)}") @@ -110,7 +175,7 @@ def read_expected_text(file_path: Path) -> str: return f.read().strip() -def evaluate(expected: str, ocr_out: str) -> bool: +def evaluate(case: "TestCase", expected: str, ocr_out: str) -> bool: expected = normalize_text(expected) ocr_out = normalize_text(ocr_out) aligned = locally_align(expected, ocr_out) @@ -122,16 +187,16 @@ def evaluate(expected: str, ocr_out: str) -> bool: cer = compute_cer(expected, aligned) chrf = compute_chrf(expected, aligned) - cer_pass = cer <= CER_MAX - chrf_pass = chrf >= CHRF_MIN + cer_pass = cer <= case.cer_max + chrf_pass = chrf >= case.chrf_min passed = cer_pass and chrf_pass logger.info("") logger.info("=" * 60) logger.info("Free OCR evaluation:") logger.info("=" * 60) - logger.info(f" CER {cer:>7.4f} (<= {CER_MAX:>7.4f} -> {verdict(cer_pass)})") - logger.info(f" chrF (0-100) {chrf:>7.2f} (>= {CHRF_MIN:>7.2f} -> {verdict(chrf_pass)})") + logger.info(f" CER {cer:>7.4f} (HF {case.hf_cer:.4f}, <= {case.cer_max:>7.4f} -> {verdict(cer_pass)})") + logger.info(f" chrF (0-100) {chrf:>7.2f} (HF {case.hf_chrf:.2f}, >= {case.chrf_min:>7.2f} -> {verdict(chrf_pass)})") logger.info(f" Expected chars {len(expected):>7}") logger.info(f" Aligned chars {len(aligned):>7} (of {len(ocr_out)} OCR chars)") logger.info("") @@ -142,12 +207,13 @@ def evaluate(expected: str, ocr_out: str) -> bool: def argument_parser() -> argparse.ArgumentParser: ap = argparse.ArgumentParser(description="Compare llama.cpp DeepSeek-OCR output with a ground-truth transcript") - ap.add_argument("--llama-model", default="gguf_models/deepseek-ai/deepseek-ocr-bf16.gguf", - help="Path to llama.cpp GGUF model (relative to repo root or absolute)") - ap.add_argument("--mmproj", default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-bf16.gguf", - help="Path to mmproj GGUF file (relative to repo root or absolute)") ap.add_argument("--llama-bin", default="build/bin/llama-mtmd-cli", help="Path to llama-mtmd-cli binary (relative to repo root or absolute)") + for spec in MODELS.values(): + ap.add_argument(spec.model_arg, default=spec.model_default, + help=f"Path to the {spec.label} GGUF model (relative to repo root or absolute)") + ap.add_argument(spec.mmproj_arg, default=spec.mmproj_default, + help=f"Path to the {spec.label} mmproj GGUF file (relative to repo root or absolute)") ap.add_argument("--verbose", action="store_true", help="Also log the expected, OCR, and aligned text") return ap @@ -167,53 +233,60 @@ def main() -> int: args = argument_parser().parse_args() configure_logging(args.verbose) - tests_dir = Path(__file__).parent # tools/mtmd/tests - mtmd_dir = tests_dir.parent # tools/mtmd - repo_root = mtmd_dir.parent.parent # repo root + repo_root = Path(__file__).resolve().parents[3] # tests -> mtmd -> tools -> repo root + binary = resolve_path(args.llama_bin, repo_root) - inputs = [ - ("image", resolve_path(DEFAULT_IMAGE, mtmd_dir)), - ("expected-text", resolve_path(DEFAULT_EXPECTED_TEXT, tests_dir)), - ("model", resolve_path(args.llama_model, repo_root)), - ("mmproj", resolve_path(args.mmproj, repo_root)), - ("binary", resolve_path(args.llama_bin, repo_root)), - ] - for label, path in inputs: - if not path.exists(): - logger.error(f"Error: {label} not found: {path}") - return 1 - paths = dict(inputs) - - logger.info("=" * 60) - logger.info("DeepSeek-OCR: llama.cpp vs ground-truth comparison") - logger.info("=" * 60) - logger.info(f"HF baselines: CER {HF_REFERENCE_CER:.4f}, chrF {HF_REFERENCE_CHRF:.2f}") - logger.info(f"Test thresholds: CER <= {CER_MAX:.4f}, chrF >= {CHRF_MIN:.2f}") - - logger.debug("") - logger.debug("Resolved test inputs:") - for label, path in inputs: - logger.debug(f" {label:<14} {path}") - - logger.info("") - logger.info("[1/3] Running llama.cpp 'Free OCR'") - try: - ocr_out = run_mtmd_cli(paths["model"], paths["mmproj"], - paths["image"], paths["binary"]) - except RuntimeError as e: - logger.error(f"Error: {e}") + if not binary.exists(): + logger.error(f"Error: binary not found: {binary}") return 1 - logger.info("") - logger.info("[2/3] Reading expected output") - expected = read_expected_text(paths["expected-text"]) - logger.info(f" expected: {len(expected)} chars") + logger.info("=" * 60) + logger.info("DeepSeek-OCR: llama.cpp vs HF parity check") + logger.info("=" * 60) + + results = {} + for case in CASES: + model_spec = MODELS[case.model_key] + title = f"{model_spec.label} -- {case.label}" + + logger.info("") + logger.info(f"=== {title} ===") + + model = resolve_path(getattr(args, arg_dest(model_spec.model_arg)), repo_root) + mmproj = resolve_path(getattr(args, arg_dest(model_spec.mmproj_arg)), repo_root) + image = resolve_path(case.image, repo_root) + ground_truth = resolve_path(case.ground_truth, repo_root) + + missing = [(lbl, p) for lbl, p in [("model", model), ("mmproj", mmproj), + ("image", image), ("ground-truth", ground_truth)] + if not p.exists()] + if missing: + for lbl, p in missing: + logger.error(f" Error: {lbl} not found: {p}") + results[title] = False + continue + + expected = read_expected_text(ground_truth) + logger.info(f" Image: {case.image}") + logger.info(f" Expected text: {len(expected)} chars") + logger.info(" Running llama.cpp 'Free OCR'") + try: + ocr_out = run_mtmd_cli(model, mmproj, image, binary) + except RuntimeError as e: + logger.error(f" Error: {e}") + results[title] = False + continue + + results[title] = evaluate(case, expected, ocr_out) logger.info("") - logger.info("[3/3] Computing OCR metrics") - ok = evaluate(expected, ocr_out) + logger.info("=== Summary ===") + for title, ok in results.items(): + logger.info(f" {title:<48} {verdict(ok)}") + all_passed = all(results.values()) + logger.info(f"Overall: {verdict(all_passed)}") - return 0 if ok else 1 + return 0 if all_passed else 1 if __name__ == "__main__":