mtmd: Add DeepSeekOCR 2 Support (#20975)

* mtmd: DeepSeek-OCR 2 support, with multi-tile dynamic resolution

* introduced clip_image_f32::add_viewsep

* address PR review

- drop redundant ggml_cpy ops in both deepseekocr versions build
- drop no-op ggml_cont in build_sam
- assert num_image_tokens deepseekocr2
- view_seperator as (1, n_embd) at conversion (for both versions)
- drop redundant ggml_reshape_2d

* Update tools/mtmd/models/deepseekocr2.cpp

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

---------

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
This commit is contained in:
Saba Fallah 2026-05-29 16:13:51 +02:00 committed by GitHub
parent 6ed481eea4
commit da3f990a47
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
16 changed files with 505 additions and 90 deletions

View file

@ -237,6 +237,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
MMPROJ_MODEL_MAP: dict[str, str] = {
"AudioFlamingo3ForConditionalGeneration": "ultravox",
"CogVLMForCausalLM": "cogvlm",
"DeepseekOCR2ForCausalLM": "deepseek",
"DeepseekOCRForCausalLM": "deepseek",
"DotsOCRForCausalLM": "dotsocr",
"Gemma3ForConditionalGeneration": "gemma",

View file

@ -1140,7 +1140,7 @@ class TextModel(ModelBase):
# Skip multimodal tensors
if name.startswith(("mlp", "vit.", "vpm.", "siglip2.", "conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.", "speech_embeddings.")) \
or "visual." in name or "vision." in name or "audio." in name or "talker." in name \
or "vision_" in name or "audio_" in name or "sam_model" in name \
or "vision_" in name or "audio_" in name \
or "token2wav." in name or "code2wav." in name \
or "projector." in name or "pre_mm_projector_norm" in name \
or "image_newline" in name or "view_seperator" in name \

View file

@ -16,10 +16,14 @@ from .qwen import QwenModel
@ModelBase.register("DeepseekOCRForCausalLM")
class DeepseekOCRVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.clip_projector_type = gguf.VisionProjectorType.DEEPSEEKOCR
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR)
self.gguf_writer.add_clip_projector_type(self.clip_projector_type)
# default values below are taken from HF transformers code
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
self.gguf_writer.add_vision_use_gelu(True)
@ -49,22 +53,27 @@ class DeepseekOCRVisionModel(MmprojModel):
raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found")
vision_config['sam'] = vision_config['width']['sam_vit_b']
vision_config.update(vision_config['width']['clip-l-14-224'])
vision_config['hidden_size'] = vision_config['width']
vision_config['num_heads'] = vision_config['heads']
vision_config['intermediate_size'] = vision_config['heads'] * 4
if vision_config['width'].get('clip-l-14-224') is not None:
vision_config.update(vision_config['width']['clip-l-14-224'])
if isinstance(vision_config['width'], int):
vision_config['hidden_size'] = vision_config['width']
if vision_config.get('heads') is not None:
vision_config['num_heads'] = vision_config['heads']
vision_config['intermediate_size'] = vision_config['heads'] * 4
return vision_config
def tensor_force_quant(self, name, new_name, bid, n_dims):
if ".embeddings." in name or 'pos_embed' in name:
return gguf.GGMLQuantizationType.F32
if ".rel_pos_h" in name or '.rel_pos_w' in name:
return gguf.GGMLQuantizationType.F32
if ".neck." in name or ".net_" in name:
return gguf.GGMLQuantizationType.F32
for nq_name in ('.embeddings.', 'pos_embed', '.rel_pos_h', '.rel_pos_w', '.neck.', '.net_'):
if nq_name in name:
return gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
    """Yield the converted tensor(s) for ``name`` via the parent implementation.

    A tensor whose name ends with ``view_seperator`` first receives a new
    leading axis (``unsqueeze(0)``); every other tensor is forwarded as-is.
    """
    is_view_separator = name.endswith("view_seperator")
    tensor = data_torch.unsqueeze(0) if is_view_separator else data_torch
    yield from super().modify_tensors(tensor, name, bid)
@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
name, gen = item
@ -81,6 +90,33 @@ class DeepseekOCRVisionModel(MmprojModel):
return super().filter_tensors((name, gen))
@ModelBase.register("DeepseekOCR2ForCausalLM")
class DeepseekOCR2VisionModel(DeepseekOCRVisionModel):
    """mmproj converter for DeepSeek-OCR-2, built on the DeepSeek-OCR converter.

    Differences from the parent visible here: the projector type is
    ``DEEPSEEKOCR2``, missing qwen2-encoder hyperparameters are filled with
    fixed defaults, and the KV head count is written to the GGUF metadata.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # overrides the projector type chosen by the parent constructor
        self.clip_projector_type = gguf.VisionProjectorType.DEEPSEEKOCR2

    def set_gguf_parameters(self):
        """Fill in absent qwen2 hyperparameters, then write the GGUF metadata."""
        # the vision tower's qwen2 encoder is built from fixed defaults,
        # see build_qwen2_decoder_as_encoder() in deepencoderv2.py
        qwen2_defaults = {
            "patch_size": 16,
            "intermediate_size": 4864,
            "num_attention_heads": 14,
        }
        for key, fallback in qwen2_defaults.items():
            # only fill values that are missing or explicitly None
            if self.hparams.get(key) is None:
                self.hparams[key] = fallback

        super().set_gguf_parameters()

        # qwen2 encoder is GQA: 14 Q heads, 2 KV heads
        self.gguf_writer.add_vision_head_count_kv(2)

    def get_vision_config(self) -> dict[str, Any]:
        """Return the parent's vision config adjusted for the qwen2 tower."""
        config = super().get_vision_config()
        # hidden size comes from the qwen2 entry of the 'width' table
        config['hidden_size'] = config['width']['qwen2-0-5b']['dim']
        if config.get('layers') is None:
            config['layers'] = 24
        return config
@ModelBase.register("DeepseekForCausalLM")
class DeepseekModel(TextModel):
model_arch = gguf.MODEL_ARCH.DEEPSEEK
@ -188,13 +224,21 @@ class DeepseekV2Model(TextModel):
self.origin_hf_arch = hparams.get('architectures', [None])[0]
# special handling for Deepseek OCR
if self.origin_hf_arch == "DeepseekOCRForCausalLM":
if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"):
self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
self.gguf_writer.add_architecture()
# default jinja template
self.gguf_writer.add_chat_template("{% for m in messages %}{{m['content']}}{% endfor %}")
@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
    """Drop vision-encoder tensors; defer every other tensor to the parent filter.

    Returns ``None`` when the tensor name contains ``sam_model`` or
    ``qwen2_model``; otherwise returns whatever the parent filter returns.
    """
    tensor_name, _ = item
    # DeepSeek-OCR vision encoder (SAM + DeepSeek-OCR-2 qwen2 tower)
    vision_markers = ("sam_model", "qwen2_model")
    if any(marker in tensor_name for marker in vision_markers):
        return None
    return super().filter_tensors(item)
def set_vocab(self):
try:
self._set_vocab_gpt2()

View file

@ -812,6 +812,8 @@ class MODEL_TENSOR(IntEnum):
V_SAM_NET_3 = auto() # Deepseek-OCR
V_ENC_EMBD_IMGNL = auto() # Deepseek-OCR
V_ENC_EMBD_VSEP = auto() # Deepseek-OCR
V_RESMPL_QUERY_768 = auto() # Deepseek-OCR-2
V_RESMPL_QUERY_1024 = auto() # Deepseek-OCR-2
# audio (mtmd)
A_ENC_EMBD_POS = auto()
@ -1329,6 +1331,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.V_SAM_NET_3: "v.sam.net_3",
MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR
MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_seperator", # Deepseek-OCR
MODEL_TENSOR.V_RESMPL_QUERY_768: "v.resample_query_768", # Deepseek-OCR-2 qwen2
MODEL_TENSOR.V_RESMPL_QUERY_1024: "v.resample_query_1024", # Deepseek-OCR-2 qwen2
# audio (mtmd)
# note: all audio tensor names must use prefix "a." or "mm.a."
MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
@ -1507,6 +1511,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.V_SAM_NECK,
MODEL_TENSOR.V_SAM_NET_2,
MODEL_TENSOR.V_SAM_NET_3,
MODEL_TENSOR.V_RESMPL_QUERY_768,
MODEL_TENSOR.V_RESMPL_QUERY_1024,
# audio
MODEL_TENSOR.A_ENC_EMBD_POS,
MODEL_TENSOR.A_ENC_EMBD_NORM,
@ -4329,6 +4335,7 @@ class VisionProjectorType:
JANUS_PRO = "janus_pro"
DOTSOCR = "dots_ocr"
DEEPSEEKOCR = "deepseekocr"
DEEPSEEKOCR2 = "deepseekocr2"
LFM2A = "lfm2a" # audio
MUSIC_FLAMINGO = "musicflamingo" # audio
GLM4V = "glm4v"

View file

@ -1485,6 +1485,7 @@ class TensorNameMap:
"siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
"model.vision_model.transformer.layers.{bid}.self_attn.q_proj", # Deepseek-OCR CLIP, generated
"vision_model.model.layers.{bid}.self_attn.q_proj.linear", # gemma4
"model.qwen2_model.model.model.layers.{bid}.self_attn.q_proj" # Deepseek-OCR-2 qwen2
),
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
@ -1509,6 +1510,7 @@ class TensorNameMap:
"model.vision_model.transformer.layers.{bid}.self_attn.k_proj", # Deepseek-OCR CLIP, generated
"siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
"vision_model.model.layers.{bid}.self_attn.k_proj.linear", # gemma4
"model.qwen2_model.model.model.layers.{bid}.self_attn.k_proj" # Deepseek-OCR-2 qwen2
),
MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
@ -1533,6 +1535,7 @@ class TensorNameMap:
"siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
"model.vision_model.transformer.layers.{bid}.self_attn.v_proj", # Deepseek-OCR CLIP, generated
"vision_model.model.layers.{bid}.self_attn.v_proj.linear", # gemma4
"model.qwen2_model.model.model.layers.{bid}.self_attn.v_proj" # Deepseek-OCR-2 qwen2
),
MODEL_TENSOR.V_ENC_INPUT_NORM: (
@ -1554,6 +1557,7 @@ class TensorNameMap:
"vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL
"vision_tower.blocks.{bid}.norm1", # dots.ocr
"vision_model.transformer.resblocks.{bid}.ln_1", # Step3-VL
"model.qwen2_model.model.model.layers.{bid}.input_layernorm", # Deepseek-OCR-2 qwen2
),
MODEL_TENSOR.V_ENC_ATTN_O: (
@ -1574,6 +1578,7 @@ class TensorNameMap:
"model.vision_model.transformer.layers.{bid}.self_attn.out_proj", # Deepseek-OCR CLIP
"siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
"vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
"model.qwen2_model.model.model.layers.{bid}.self_attn.o_proj", # Deepseek-OCR-2 qwen2
"vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4
"vision_tower.blocks.{bid}.attn.proj", # dots.ocr
"vision_model.transformer.resblocks.{bid}.attn.out_proj", # Step3-VL
@ -1603,6 +1608,7 @@ class TensorNameMap:
"vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4
"vision_tower.blocks.{bid}.norm2", # dots.ocr
"vision_model.transformer.resblocks.{bid}.ln_2", # Step3-VL
"model.qwen2_model.model.model.layers.{bid}.post_attention_layernorm", # Deepseek-OCR-2 qwen2
),
MODEL_TENSOR.V_ENC_FFN_UP: (
@ -1625,6 +1631,7 @@ class TensorNameMap:
"vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL
"vision_model.model.layers.{bid}.mlp.up_proj", # gemma4
"vision_model.transformer.resblocks.{bid}.mlp.c_fc", # Step3-VL
"model.qwen2_model.model.model.layers.{bid}.mlp.up_proj", # Deepseek-OCR-2 qwen2
),
MODEL_TENSOR.V_ENC_FFN_GATE: (
@ -1632,6 +1639,7 @@ class TensorNameMap:
"vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral
"visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
"vision_model.model.layers.{bid}.mlp.gate_proj", # gemma4
"model.qwen2_model.model.model.layers.{bid}.mlp.gate_proj", # Deepseek-OCR-2 qwen2
),
MODEL_TENSOR.V_ENC_FFN_DOWN: (
@ -1652,6 +1660,7 @@ class TensorNameMap:
"model.vision_model.transformer.layers.{bid}.mlp.fc2", # Deepseek-OCR CLIP
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
"vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL
"model.qwen2_model.model.model.layers.{bid}.mlp.down_proj" , # Deepseek-OCR-2 qwen2
"vision_model.model.layers.{bid}.mlp.down_proj", # gemma4
"vision_model.transformer.resblocks.{bid}.mlp.c_proj", # Step3-VL
),
@ -1699,6 +1708,7 @@ class TensorNameMap:
"vision_tower.encoder.final_layernorm", # kimi-vl
"visual.post_layernorm", # glm4v
"siglip2.vision_model.post_layernorm",
"model.qwen2_model.model.model.norm", # Deepseek-OCR-2 qwen2
),
MODEL_TENSOR.V_MM_POST_NORM: (
@ -1879,6 +1889,14 @@ class TensorNameMap:
"model.sam_model.net_3",
),
MODEL_TENSOR.V_RESMPL_QUERY_768: (
"model.qwen2_model.query_768", # Deepseek-OCR-2 qwen2
),
MODEL_TENSOR.V_RESMPL_QUERY_1024: (
"model.qwen2_model.query_1024", # Deepseek-OCR-2 qwen2
),
MODEL_TENSOR.V_MM_POST_FC_NORM: (
"model.vision.linear_proj.norm1", # cogvlm
),

View file

@ -40,6 +40,7 @@ add_library(mtmd
models/siglip.cpp
models/whisper-enc.cpp
models/deepseekocr.cpp
models/deepseekocr2.cpp
models/mobilenetv5.cpp
models/youtuvl.cpp
models/yasa2.cpp

View file

@ -188,6 +188,8 @@
#define TN_SAM_FFN_DOWN "v.sam.blk.%d.mlp.lin2.%s"
#define TN_SAM_NECK "v.sam.neck.%d.%s"
#define TN_SAM_NET "v.sam.net_%d.%s"
// deepseek-ocr-2
#define TN_RESMPL_QUERY "v.resample_query_%d.%s"
// (conformer) lfm2
#define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s"
#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s"
@ -337,6 +339,7 @@ enum projector_type {
PROJECTOR_TYPE_JANUS_PRO,
PROJECTOR_TYPE_DOTS_OCR,
PROJECTOR_TYPE_DEEPSEEKOCR,
PROJECTOR_TYPE_DEEPSEEKOCR2,
PROJECTOR_TYPE_LFM2A,
PROJECTOR_TYPE_GLM4V,
PROJECTOR_TYPE_YOUTUVL,
@ -386,6 +389,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
{ PROJECTOR_TYPE_DOTS_OCR, "dots_ocr"},
{ PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"},
{ PROJECTOR_TYPE_DEEPSEEKOCR2,"deepseekocr2"},
{ PROJECTOR_TYPE_LFM2A, "lfm2a"},
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
@ -424,6 +428,9 @@ struct clip_image_f32 {
int ny;
std::vector<float> buf;
// marks the global view in e.g., DeepSeek-OCR Models
bool add_viewsep = false;
};
//

View file

@ -542,6 +542,11 @@ struct clip_model {
int32_t n_sam_layers = 12; // used by deepseek-ocr sam encoder
std::vector<clip_layer> sam_layers;
// deepseek-ocr-2
ggml_tensor * resample_query_768 = nullptr;
ggml_tensor * resample_query_1024 = nullptr;
// lfm2 audio
std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {nullptr};
std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {nullptr};

View file

@ -953,6 +953,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_deepseekocr>(ctx, img);
} break;
case PROJECTOR_TYPE_DEEPSEEKOCR2:
{
builder = std::make_unique<clip_graph_deepseekocr2>(ctx, img);
} break;
case PROJECTOR_TYPE_LFM2A:
{
builder = std::make_unique<clip_graph_conformer>(ctx, img);
@ -1514,6 +1518,7 @@ struct clip_model_loader {
hparams.set_warmup_n_tokens(28*28); // avoid OOM on warmup
} break;
case PROJECTOR_TYPE_DEEPSEEKOCR:
case PROJECTOR_TYPE_DEEPSEEKOCR2:
{
hparams.patch_size = 16;
hparams.image_size = 1024;
@ -1525,6 +1530,10 @@ struct clip_model_loader {
get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);
get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) {
// qwen2 encoder is GQA, requires KEY_N_HEAD_KV
get_u32(string_format(KEY_N_HEAD_KV, "vision"), hparams.n_head_kv);
}
} break;
case PROJECTOR_TYPE_HUNYUANVL:
{
@ -2374,6 +2383,7 @@ struct clip_model_loader {
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
} break;
case PROJECTOR_TYPE_DEEPSEEKOCR:
case PROJECTOR_TYPE_DEEPSEEKOCR2:
{
model.pos_embed = get_tensor(string_format(TN_SAM_POS_EMBD, "weight"));
model.patch_embed_proj_w = get_tensor(string_format(TN_SAM_PATCH_EMBD, "weight"));
@ -2404,10 +2414,12 @@ struct clip_model_loader {
model.neck_3_w = get_tensor(string_format(TN_SAM_NECK, 3, "weight"));
model.net_2 = get_tensor(string_format(TN_SAM_NET, 2, "weight"));
model.net_3 = get_tensor(string_format(TN_SAM_NET, 3, "weight"));
model.image_newline = get_tensor(TN_IMAGE_NEWLINE);
model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false);
model.view_seperator = get_tensor(TN_IMAGE_SEPERATOR);
model.mm_fc_w = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
model.mm_fc_b = get_tensor(string_format(TN_MM_PROJECTOR, "bias"));
model.resample_query_768 = get_tensor(string_format(TN_RESMPL_QUERY, 768, "weight"), false);
model.resample_query_1024 = get_tensor(string_format(TN_RESMPL_QUERY, 1024, "weight"), false);
} break;
case PROJECTOR_TYPE_GEMMA4A:
{
@ -3277,7 +3289,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_DEEPSEEKOCR:
{
// SAM encoder applies two stride-2 convolutions (net_2 and net_3)
// which reduces spatial dimensions by 4x in each direction (16x total)
// that reduce spatial dimensions by 4x in each direction (16x total)
// E.g., 64x64 -> 16x16 patches
n_patches /= 16;
@ -3293,6 +3305,15 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
int oh = (img->ny / patch_size) / merge;
n_patches = (ow + 1) * oh + 2;
} break;
case PROJECTOR_TYPE_DEEPSEEKOCR2:
{
// 1024 global view -> 256 query tokens + 1 view separator = 257;
// 768 local tile -> 144 query tokens, no separator.
n_patches /= 16;
if (img->add_viewsep) {
n_patches += 1; // view separator, appended only after the global view
}
} break;
case PROJECTOR_TYPE_LFM2A:
{
n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
@ -3882,6 +3903,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
set_input_i32("pos_y", pos_y);
} break;
case PROJECTOR_TYPE_DEEPSEEKOCR:
case PROJECTOR_TYPE_DEEPSEEKOCR2:
{
GGML_ASSERT(pos_w == pos_h);
@ -3904,6 +3926,34 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
set_input_i32("rel_pos_indices_local", rel_pos_indices_local);
set_input_i32("rel_pos_indices_global", rel_pos_indices_global);
if (ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR2) {
// qwen2 encoder attention mask
// num_image_tokens = num_patches / 16
// 256 for 1024 global view
// 144 for 768 tile views
const int num_image_tokens = num_patches / 16;
const int seq_len = num_image_tokens * 2;
std::vector qwen2_mask(static_cast<size_t>(seq_len) * seq_len, 0.0f);
// attention mask layout
// +--------------+---------------+
// | all 0 | all -inf |
// +--------------+---------------+
// | all 0 | lower tri 0 |
// +--------------+---------------+
for (int i = 0; i < seq_len; i++) {
for (int j = 0; j < seq_len; j++) {
const bool zero = i < num_image_tokens ?
j < num_image_tokens :
j < num_image_tokens || j <= i;
qwen2_mask[static_cast<size_t>(i) * seq_len + j] = zero ? 0.0f : -1e9f;
}
}
set_input_f32("qwen2_attn_mask", qwen2_mask);
}
} break;
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_GEMMA3NV:
@ -4256,6 +4306,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_COGVLM:
return ctx->model.mm_4h_to_h_w->ne[1];
case PROJECTOR_TYPE_DEEPSEEKOCR:
case PROJECTOR_TYPE_DEEPSEEKOCR2:
return ctx->model.mm_fc_w->ne[1];
case PROJECTOR_TYPE_LFM2A:
return ctx->model.position_embeddings->ne[0];

View file

@ -157,7 +157,6 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
cur = ggml_add(ctx0, cur, layer.qkv_b);
cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape
cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B);
ggml_tensor * Q;
@ -251,17 +250,17 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * sam_out = build_sam(inp_raw);
const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1];
ggml_tensor * clip_out;
// Building DS-OCR CLIP
{
ggml_tensor * inp;
inp = ggml_cpy(ctx0, sam_out, ggml_dup_tensor(ctx0, sam_out));
inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2]);
inp = ggml_reshape_2d(ctx0, sam_out, clip_n_patches, sam_out->ne[2]);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
ggml_tensor * new_pos_embd =
ggml_cpy(ctx0, model.position_embeddings, ggml_dup_tensor(ctx0, model.position_embeddings));
ggml_tensor * new_pos_embd = model.position_embeddings;
int n_pos = new_pos_embd->ne[1]; // +1 for [CLS]
const auto tgt_size = static_cast<int>(std::sqrt(inp->ne[1]));
@ -295,16 +294,12 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
clip_out = cur;
}
const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1];
sam_out = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3));
sam_out = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches);
clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]);
ggml_tensor * cur;
cur = ggml_concat(ctx0, clip_out, sam_out, 0);
cur = ggml_reshape_2d(ctx0, cur, 2 * n_embd, clip_n_patches);
cur = ggml_cont(ctx0, cur);
cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
cur = ggml_add(ctx0, cur, model.mm_fc_b);
@ -313,13 +308,11 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
const auto n_dim = cur->ne[0];
ggml_tensor * imgnl;
ggml_tensor * vs;
imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1);
vs = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1); // (n_dim, 1)
cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h);
cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h);
cur = ggml_concat(ctx0, cur, vs, 1); // (n_dim, h*(w+1) + 1)
cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, h*(w+1) + 1)
cb(cur, "dsocr_output", -1);

View file

@ -0,0 +1,81 @@
#include "models.h"
// Builds the DeepSeek-OCR-2 graph for one view:
//   SAM features -> qwen2 encoder over [image tokens, learned queries]
//   -> keep only the query outputs -> linear projector
//   -> optionally append the view separator (img.add_viewsep).
ggml_cgraph * clip_graph_deepseekocr2::build() {
    // qwen2 encoder is GQA: the KV head count must be set and must divide the Q head count
    GGML_ASSERT(hparams.n_head_kv > 0);
    GGML_ASSERT(n_head % hparams.n_head_kv == 0);

    // patch embedding
    ggml_tensor * inp_raw = build_inp_raw();
    ggml_tensor * sam_out = build_sam(inp_raw);

    ggml_tensor * qwen2_out;
    // Building Qwen2 encoder
    {
        ggml_tensor * inp;

        inp = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0] * sam_out->ne[1], sam_out->ne[2]);  // H*W, C
        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

        auto num_image_tokens = inp->ne[1];  // H*W
        GGML_ASSERT(num_image_tokens == 144 || num_image_tokens == 256);

        // query based on numbers of image tokens (in SAM output)
        // 16x16 -> query_1024 (1024x1024 images)
        // 12x12 -> query_768 (768x768 images)
        ggml_tensor * query_embed = model.resample_query_1024;
        int           num_queries = 256;
        if (num_image_tokens == 144) {
            query_embed = model.resample_query_768;
            num_queries = 144;
        }
        // the query tensors are fetched with the non-required flag at load time,
        // so fail with a clear message instead of dereferencing a null tensor below
        GGML_ASSERT(query_embed != nullptr && "deepseekocr2: missing resample query tensor for this view size");

        // (B, num_image_tokens + num_queries, C)
        inp = ggml_concat(ctx0, inp, ggml_cast(ctx0, query_embed, inp->type), 1);

        auto seq_len = inp->ne[1];

        // qwen2 encoder attention mask
        // declared as a named graph input; its values are supplied under the name "qwen2_attn_mask"
        ggml_tensor * attn_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, seq_len, seq_len);
        ggml_set_name(attn_mask, "qwen2_attn_mask");
        ggml_set_input(attn_mask);

        // positions 0..seq_len-1 for RoPE
        ggml_tensor * inp_pos = ggml_cast(ctx0, ggml_arange(ctx0, 0, seq_len, 1), GGML_TYPE_I32);

        auto add_rope = [&](ggml_tensor * x, const clip_layer &) {
            return ggml_rope_ext(ctx0, x, inp_pos, nullptr, d_head,
                                 GGML_ROPE_TYPE_NEOX, 131072, 1000000, 1, 0, 1, 0, 0);
        };

        build_vit_opts vit_opts;
        vit_opts.attn_mask = attn_mask;

        // build_vit applies model.post_ln_w internally; do not re-apply
        ggml_tensor * cur = build_vit(inp, seq_len, NORM_TYPE_RMS, FFN_SILU,
                                      /* learned_pos_embd */ nullptr, add_rope, vit_opts);

        // the queries were concatenated after the image tokens, so they are the
        // last num_queries columns: offset the view past the image tokens
        cur = ggml_cont(ctx0,
                        ggml_view_2d(ctx0, cur, cur->ne[0], num_queries, cur->nb[1],
                                     cur->nb[1] * (cur->ne[1] - num_queries)));  // only take query tokens for output

        ggml_build_forward_expand(gf, cur);
        qwen2_out = cur;
    }

    // linear projector
    ggml_tensor * cur;
    cur = ggml_mul_mat(ctx0, model.mm_fc_w, qwen2_out);
    cur = ggml_add(ctx0, cur, model.mm_fc_b);

    // view_seperator only after the global view
    if (img.add_viewsep) {
        cur = ggml_concat(ctx0, cur, model.view_seperator, 1);  // (n_dim, 257)
    }

    cb(cur, "dsocr2_output", -1);

    ggml_build_forward_expand(gf, cur);
    return gf;
}

View file

@ -121,6 +121,11 @@ struct clip_graph_deepseekocr : clip_graph {
ggml_tensor * build_sam(ggml_tensor * inp); // build the SAM model
};
// Graph builder for DeepSeek-OCR-2; derives from the DeepSeek-OCR builder
// and replaces only build().
struct clip_graph_deepseekocr2 : clip_graph_deepseekocr {
    clip_graph_deepseekocr2(clip_ctx * ctx, const clip_image_f32 & img)
        : clip_graph_deepseekocr(ctx, img) {}

    // reuses build_sam() from base
    ggml_cgraph * build() override;
};
struct clip_graph_conformer : clip_graph {
clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;

View file

@ -1137,6 +1137,105 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img,
return true;
}
//
// mtmd_image_preprocessor_deepseekocr2
//
// candidate tile grids (cols, rows) with min_tiles <= cols*rows <= max_tiles
// sorted by tile count
std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr2::get_target_ratios() {
std::vector<clip_image_size> ratios;
for (int n = min_tiles; n <= max_tiles; n++) {
for (int w = 1; w <= n; w++) {
for (int h = 1; h <= n; h++) {
if (w * h < min_tiles || w * h > max_tiles) {
continue;
}
bool found = false;
for (const auto & r : ratios) {
if (r.width == w && r.height == h) {
found = true;
break;
}
}
if (!found) {
ratios.push_back({ w, h });
}
}
}
}
std::sort(ratios.begin(), ratios.end(), [](const clip_image_size & a, const clip_image_size & b) {
return a.width * a.height < b.width * b.height;
});
return ratios;
}
// pick the grid whose aspect ratio is closest to the image
// on a tie, prefer the larger grid when the image fits
clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio(
        float                                aspect_ratio,
        const std::vector<clip_image_size> & target_ratios,
        int                                  width,
        int                                  height) {
    // Parameters:
    //   aspect_ratio   - aspect ratio of the source image
    //   target_ratios  - candidate grids, scanned in the order given
    //   width, height  - source image size in pixels, used only for tie-breaking
    // Returns the chosen grid, or {1, 1} when target_ratios is empty.
    float           best_ratio_diff = std::numeric_limits<float>::max();
    clip_image_size best_ratio      = { 1, 1 };

    // multiply in float: width * height as int can overflow for very large images
    const float area = static_cast<float>(width) * static_cast<float>(height);

    for (const auto & ratio : target_ratios) {
        const float target_aspect_ratio = static_cast<float>(ratio.width) / ratio.height;
        const float ratio_diff          = std::abs(aspect_ratio - target_aspect_ratio);
        if (ratio_diff < best_ratio_diff) {
            best_ratio_diff = ratio_diff;
            best_ratio      = ratio;
        } else if (ratio_diff == best_ratio_diff) {
            // exact tie: switch to this later candidate only when the image covers
            // more than half of the candidate's total tile area
            const float target_area = static_cast<float>(tile_size) * static_cast<float>(tile_size) *
                                      static_cast<float>(ratio.width) * static_cast<float>(ratio.height);
            if (area > 0.5f * target_area) {
                best_ratio = ratio;
            }
        }
    }
    return best_ratio;
}
bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
// emit 768x768 local tiles when the image is larger than a tile in either
// dimension, then always a 1024x1024 global view. order: [tiles..., global].
if (img.nx > tile_size || img.ny > tile_size) {
const float aspect_ratio = static_cast<float>(img.nx) / img.ny;
const auto target_ratios = get_target_ratios();
const clip_image_size grid = find_closest_aspect_ratio(aspect_ratio, target_ratios, img.nx, img.ny);
// stretch onto the grid (no aspect preserve), then crop tiles row-major.
clip_image_u8 refined;
img_tool::resize(img, refined, { tile_size * grid.width, tile_size * grid.height },
RESIZE_ALGO_BICUBIC_PILLOW, PAD_NONE);
for (int row = 0; row < grid.height; row++) {
for (int col = 0; col < grid.width; col++) {
clip_image_u8 tile;
img_tool::crop(refined, tile, col * tile_size, row * tile_size, tile_size, tile_size);
clip_image_f32_ptr res(clip_image_f32_init());
img_u8_to_f32(tile, *res, hparams.image_mean, hparams.image_std);
output.entries.push_back(std::move(res));
}
}
}
// global view: aspect-preserving fit-and-pad to base_size.
clip_image_u8 padded;
img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW,
PAD_NEAREST, hparams.image_pad_color);
clip_image_f32_ptr global(clip_image_f32_init());
img_u8_to_f32(padded, *global, hparams.image_mean, hparams.image_std);
global->add_viewsep = true;
output.entries.push_back(std::move(global));
output.grid_x = 1;
output.grid_y = 1;
return true;
}
//
// mtmd_image_preprocessor_step3vl
//

View file

@ -144,6 +144,26 @@ struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
};
// DeepSeek-OCR-2: a 1024x1024 global view, plus InternVL-style 768x768 local
// tiles when the image is larger than a tile in either dimension.
struct mtmd_image_preprocessor_deepseekocr2 : mtmd_image_preprocessor {
    // edge length of the global view
    static constexpr int base_size = 1024;
    // edge length of one local tile
    static constexpr int tile_size = 768;
    // bounds on the number of tiles (cols * rows) in a candidate grid
    static constexpr int min_tiles = 2;
    static constexpr int max_tiles = 6;

    mtmd_image_preprocessor_deepseekocr2(const clip_ctx * ctx)
        : mtmd_image_preprocessor(ctx) {}

    // appends the views for `img` to `output`
    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;

  private:
    // candidate tile grids, as (width, height) in tiles
    static std::vector<clip_image_size> get_target_ratios();

    // selects one grid from `target_ratios` for an image of the given aspect ratio and size
    static clip_image_size find_closest_aspect_ratio(float                                aspect_ratio,
                                                     const std::vector<clip_image_size> & target_ratios,
                                                     int                                  width,
                                                     int                                  height);
};
// custom image preprocessing for Step3VL
// ref: https://huggingface.co/stepfun-ai/Step3-VL-10B/blob/main/processing_step3.py
struct mtmd_image_preprocessor_step3vl : mtmd_image_preprocessor_llava_uhd {

View file

@ -493,6 +493,11 @@ struct mtmd_context {
img_end = "\n"; // prevent empty batch on llama-server
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
} break;
case PROJECTOR_TYPE_DEEPSEEKOCR2:
{
img_end = "\n"; // prevent empty batch on llama-server
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr2>(ctx_v);
} break;
case PROJECTOR_TYPE_HUNYUANVL:
{
// note: these use fullwidth (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
@ -1091,16 +1096,21 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
if (clip_is_llava(ctx_clip)
|| proj_type == PROJECTOR_TYPE_MINICPMV
|| proj_type == PROJECTOR_TYPE_GLM_EDGE
|| proj_type == PROJECTOR_TYPE_INTERNVL) {
|| proj_type == PROJECTOR_TYPE_INTERNVL
|| proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) {
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
const auto & entries = image_tokens->batch_f32.entries;
// entries may have different token counts
// e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view
size_t offset = 0;
for (size_t i = 0; i < entries.size(); i++) {
int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
ok = clip_image_encode(
ctx_clip,
ctx->n_threads,
entries[i].get(),
ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
ctx->image_embd_v.data() + offset);
offset += static_cast<size_t>(n_mmproj_embd) * n_tokens_per_image;
}
} else {
ok = clip_image_batch_encode(

View file

@ -3,7 +3,7 @@
Evaluates llama.cpp's DeepSeek-OCR by comparing its output for a test
image to the actual text in part of that image.
Runs the test image through mtmd-cli, calculates CER and chrF for
Runs each test image through mtmd-cli, calculates CER and chrF for
its output, and holds them against the HF model's scores.
"""
@ -12,24 +12,81 @@ import logging
import subprocess
import sys
import unicodedata
from dataclasses import dataclass
from pathlib import Path
logger = logging.getLogger("deepseek-ocr-test")
DEFAULT_IMAGE = "test-1.jpeg"
DEFAULT_EXPECTED_TEXT = "test-1-ground-truth.txt"
RUN_TIMEOUT = 300
# DeepSeek-OCR reference scores on the test image.
# This is the baseline the implementation should keep up with.
HF_REFERENCE_CER = 0.3030
HF_REFERENCE_CHRF = 67.52
CER_TOLERANCE = 0.02
CHRF_TOLERANCE = 2.0
@dataclass
class ModelSpec:
    """Command-line wiring and default GGUF paths for one model under test."""

    # short identifier, also used as the key in MODELS
    key: str
    # human-readable model name
    label: str
    # CLI flag that selects the language-model GGUF
    model_arg: str
    # CLI flag that selects the mmproj GGUF
    mmproj_arg: str
    # default path of the language-model GGUF
    model_default: str
    # default path of the mmproj GGUF
    mmproj_default: str
CER_MAX = HF_REFERENCE_CER + CER_TOLERANCE
CHRF_MIN = HF_REFERENCE_CHRF - CHRF_TOLERANCE
@dataclass
class TestCase:
    """One OCR evaluation: a model, an image, its ground truth and score thresholds."""

    # key of the model this case runs against
    model_key: str
    # short description of the case
    label: str
    # path of the input image
    image: str
    # path of the ground-truth text file
    ground_truth: str
    # reference CER and chrF scores for this image
    hf_cer: float
    hf_chrf: float
    # allowed slack relative to the reference scores
    cer_tol: float
    chrf_tol: float

    @property
    def cer_max(self) -> float:
        """Highest CER that still passes: reference plus tolerance."""
        ceiling = self.hf_cer + self.cer_tol
        return ceiling

    @property
    def chrf_min(self) -> float:
        """Lowest chrF that still passes: reference minus tolerance."""
        floor = self.hf_chrf - self.chrf_tol
        return floor
# Models under test, keyed by ModelSpec.key.
MODELS = {
    spec.key: spec
    for spec in (
        ModelSpec(
            key="v1",
            label="DeepSeek-OCR",
            model_arg="--llama-model",
            mmproj_arg="--mmproj",
            model_default="gguf_models/deepseek-ai/deepseek-ocr-bf16.gguf",
            mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-bf16.gguf",
        ),
        ModelSpec(
            key="v2",
            label="DeepSeek-OCR-2",
            model_arg="--llama-model-2",
            mmproj_arg="--mmproj-2",
            model_default="gguf_models/deepseek-ai/deepseek-ocr-2-bf16.gguf",
            mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-2-bf16.gguf",
        ),
    )
}

# Evaluation cases; each one names its model through model_key.
CASES = [
    TestCase(
        model_key="v1",
        label="single-view scan",
        image="tools/mtmd/test-1.jpeg",
        ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
        hf_cer=0.3030,
        hf_chrf=67.52,
        cer_tol=0.02,
        chrf_tol=2.0,
    ),
    TestCase(
        model_key="v2",
        label="single-view scan",
        image="tools/mtmd/test-1.jpeg",
        ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
        # 640x488 is below the 768 tiling threshold -- single 1024 global view.
        # hf_cer/hf_chrf are the deepseek-ai repo's own scores (ImageOps.pad);
        # the transformers HF processor is *not* the reference -- its pad_to_square
        # is one pixel off and lands at ~0.69 instead.
        hf_cer=0.7761,
        hf_chrf=28.70,
        cer_tol=0.12,
        chrf_tol=8.0,
    ),
]
def arg_dest(flag: str) -> str:
    """Map a command-line flag to the attribute name used on the parsed namespace.

    All leading dashes are dropped and every remaining dash becomes an
    underscore, e.g. ``--llama-model-2`` -> ``llama_model_2``.
    """
    bare_name = flag.lstrip("-")
    return "_".join(bare_name.split("-"))
def verdict(ok: bool) -> str:
@ -84,6 +141,14 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
"--temp", "0",
"--flash-attn", "off", # match the HF "eager" attention reference
"--no-warmup",
"-n", "512", # cap loops on hard images (KV would otherwise fill)
# HF decodes with no_repeat_ngram_size; llama.cpp's analog is DRY.
# Default DRY breakers include "\n", so they are cleared below.
"--dry-multiplier", "0.8",
"--dry-base", "1.75",
"--dry-allowed-length", "2",
"--dry-penalty-last-n", "-1",
"--dry-sequence-breaker", "none",
]
logger.debug(f" command: {' '.join(cmd)}")
@ -110,7 +175,7 @@ def read_expected_text(file_path: Path) -> str:
return f.read().strip()
def evaluate(expected: str, ocr_out: str) -> bool:
def evaluate(case: "TestCase", expected: str, ocr_out: str) -> bool:
expected = normalize_text(expected)
ocr_out = normalize_text(ocr_out)
aligned = locally_align(expected, ocr_out)
@ -122,16 +187,16 @@ def evaluate(expected: str, ocr_out: str) -> bool:
cer = compute_cer(expected, aligned)
chrf = compute_chrf(expected, aligned)
cer_pass = cer <= CER_MAX
chrf_pass = chrf >= CHRF_MIN
cer_pass = cer <= case.cer_max
chrf_pass = chrf >= case.chrf_min
passed = cer_pass and chrf_pass
logger.info("")
logger.info("=" * 60)
logger.info("Free OCR evaluation:")
logger.info("=" * 60)
logger.info(f" CER {cer:>7.4f} (<= {CER_MAX:>7.4f} -> {verdict(cer_pass)})")
logger.info(f" chrF (0-100) {chrf:>7.2f} (>= {CHRF_MIN:>7.2f} -> {verdict(chrf_pass)})")
logger.info(f" CER {cer:>7.4f} (HF {case.hf_cer:.4f}, <= {case.cer_max:>7.4f} -> {verdict(cer_pass)})")
logger.info(f" chrF (0-100) {chrf:>7.2f} (HF {case.hf_chrf:.2f}, >= {case.chrf_min:>7.2f} -> {verdict(chrf_pass)})")
logger.info(f" Expected chars {len(expected):>7}")
logger.info(f" Aligned chars {len(aligned):>7} (of {len(ocr_out)} OCR chars)")
logger.info("")
@ -142,12 +207,13 @@ def evaluate(expected: str, ocr_out: str) -> bool:
def argument_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the OCR comparison script.

    Registers ``--llama-bin``, one model flag and one mmproj flag per entry in
    ``MODELS``, and ``--verbose``.

    The model and mmproj flags are registered only through the ``MODELS`` loop.
    Registering ``--llama-model`` / ``--mmproj`` by hand as well would make
    argparse raise ``ArgumentError`` (conflicting option string), because the
    "v1" spec declares the same flags.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    ap = argparse.ArgumentParser(description="Compare llama.cpp DeepSeek-OCR output with a ground-truth transcript")
    ap.add_argument("--llama-bin", default="build/bin/llama-mtmd-cli",
                    help="Path to llama-mtmd-cli binary (relative to repo root or absolute)")
    # One (model, mmproj) flag pair per registered model variant.
    for spec in MODELS.values():
        ap.add_argument(spec.model_arg, default=spec.model_default,
                        help=f"Path to the {spec.label} GGUF model (relative to repo root or absolute)")
        ap.add_argument(spec.mmproj_arg, default=spec.mmproj_default,
                        help=f"Path to the {spec.label} mmproj GGUF file (relative to repo root or absolute)")
    ap.add_argument("--verbose", action="store_true",
                    help="Also log the expected, OCR, and aligned text")
    return ap
@ -167,53 +233,60 @@ def main() -> int:
args = argument_parser().parse_args()
configure_logging(args.verbose)
tests_dir = Path(__file__).parent # tools/mtmd/tests
mtmd_dir = tests_dir.parent # tools/mtmd
repo_root = mtmd_dir.parent.parent # repo root
repo_root = Path(__file__).resolve().parents[3] # tests -> mtmd -> tools -> repo root
binary = resolve_path(args.llama_bin, repo_root)
inputs = [
("image", resolve_path(DEFAULT_IMAGE, mtmd_dir)),
("expected-text", resolve_path(DEFAULT_EXPECTED_TEXT, tests_dir)),
("model", resolve_path(args.llama_model, repo_root)),
("mmproj", resolve_path(args.mmproj, repo_root)),
("binary", resolve_path(args.llama_bin, repo_root)),
]
for label, path in inputs:
if not path.exists():
logger.error(f"Error: {label} not found: {path}")
return 1
paths = dict(inputs)
logger.info("=" * 60)
logger.info("DeepSeek-OCR: llama.cpp vs ground-truth comparison")
logger.info("=" * 60)
logger.info(f"HF baselines: CER {HF_REFERENCE_CER:.4f}, chrF {HF_REFERENCE_CHRF:.2f}")
logger.info(f"Test thresholds: CER <= {CER_MAX:.4f}, chrF >= {CHRF_MIN:.2f}")
logger.debug("")
logger.debug("Resolved test inputs:")
for label, path in inputs:
logger.debug(f" {label:<14} {path}")
logger.info("")
logger.info("[1/3] Running llama.cpp 'Free OCR'")
try:
ocr_out = run_mtmd_cli(paths["model"], paths["mmproj"],
paths["image"], paths["binary"])
except RuntimeError as e:
logger.error(f"Error: {e}")
if not binary.exists():
logger.error(f"Error: binary not found: {binary}")
return 1
logger.info("")
logger.info("[2/3] Reading expected output")
expected = read_expected_text(paths["expected-text"])
logger.info(f" expected: {len(expected)} chars")
logger.info("=" * 60)
logger.info("DeepSeek-OCR: llama.cpp vs HF parity check")
logger.info("=" * 60)
results = {}
for case in CASES:
model_spec = MODELS[case.model_key]
title = f"{model_spec.label} -- {case.label}"
logger.info("")
logger.info(f"=== {title} ===")
model = resolve_path(getattr(args, arg_dest(model_spec.model_arg)), repo_root)
mmproj = resolve_path(getattr(args, arg_dest(model_spec.mmproj_arg)), repo_root)
image = resolve_path(case.image, repo_root)
ground_truth = resolve_path(case.ground_truth, repo_root)
missing = [(lbl, p) for lbl, p in [("model", model), ("mmproj", mmproj),
("image", image), ("ground-truth", ground_truth)]
if not p.exists()]
if missing:
for lbl, p in missing:
logger.error(f" Error: {lbl} not found: {p}")
results[title] = False
continue
expected = read_expected_text(ground_truth)
logger.info(f" Image: {case.image}")
logger.info(f" Expected text: {len(expected)} chars")
logger.info(" Running llama.cpp 'Free OCR'")
try:
ocr_out = run_mtmd_cli(model, mmproj, image, binary)
except RuntimeError as e:
logger.error(f" Error: {e}")
results[title] = False
continue
results[title] = evaluate(case, expected, ocr_out)
logger.info("")
logger.info("[3/3] Computing OCR metrics")
ok = evaluate(expected, ocr_out)
logger.info("=== Summary ===")
for title, ok in results.items():
logger.info(f" {title:<48} {verdict(ok)}")
all_passed = all(results.values())
logger.info(f"Overall: {verdict(all_passed)}")
return 0 if ok else 1
return 0 if all_passed else 1
if __name__ == "__main__":