mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-29 19:23:39 +00:00
mtmd: Add DeepSeekOCR 2 Support (#20975)
* mtmd: DeepSeek-OCR 2 support, with multi-tile dynamic resolution * introduced clip_image_f32::add_viewsep * address PR review - drop redundant ggml_cpy ops in both deepseekocr versions build - drop no-op ggml_cont in build_sam - assert num_image_tokens deepseekocr2 - view_seperator as (1, n_embd) at conversion (for both versions) - drop redundant ggml_reshape_2d * Update tools/mtmd/models/deepseekocr2.cpp Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> --------- Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
This commit is contained in:
parent
6ed481eea4
commit
da3f990a47
16 changed files with 505 additions and 90 deletions
|
|
@ -237,6 +237,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
|
|||
MMPROJ_MODEL_MAP: dict[str, str] = {
|
||||
"AudioFlamingo3ForConditionalGeneration": "ultravox",
|
||||
"CogVLMForCausalLM": "cogvlm",
|
||||
"DeepseekOCR2ForCausalLM": "deepseek",
|
||||
"DeepseekOCRForCausalLM": "deepseek",
|
||||
"DotsOCRForCausalLM": "dotsocr",
|
||||
"Gemma3ForConditionalGeneration": "gemma",
|
||||
|
|
|
|||
|
|
@ -1140,7 +1140,7 @@ class TextModel(ModelBase):
|
|||
# Skip multimodal tensors
|
||||
if name.startswith(("mlp", "vit.", "vpm.", "siglip2.", "conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.", "speech_embeddings.")) \
|
||||
or "visual." in name or "vision." in name or "audio." in name or "talker." in name \
|
||||
or "vision_" in name or "audio_" in name or "sam_model" in name \
|
||||
or "vision_" in name or "audio_" in name \
|
||||
or "token2wav." in name or "code2wav." in name \
|
||||
or "projector." in name or "pre_mm_projector_norm" in name \
|
||||
or "image_newline" in name or "view_seperator" in name \
|
||||
|
|
|
|||
|
|
@ -16,10 +16,14 @@ from .qwen import QwenModel
|
|||
|
||||
@ModelBase.register("DeepseekOCRForCausalLM")
|
||||
class DeepseekOCRVisionModel(MmprojModel):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.clip_projector_type = gguf.VisionProjectorType.DEEPSEEKOCR
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DEEPSEEKOCR)
|
||||
self.gguf_writer.add_clip_projector_type(self.clip_projector_type)
|
||||
# default values below are taken from HF transformers code
|
||||
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
|
||||
self.gguf_writer.add_vision_use_gelu(True)
|
||||
|
|
@ -49,22 +53,27 @@ class DeepseekOCRVisionModel(MmprojModel):
|
|||
raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found")
|
||||
|
||||
vision_config['sam'] = vision_config['width']['sam_vit_b']
|
||||
vision_config.update(vision_config['width']['clip-l-14-224'])
|
||||
vision_config['hidden_size'] = vision_config['width']
|
||||
vision_config['num_heads'] = vision_config['heads']
|
||||
vision_config['intermediate_size'] = vision_config['heads'] * 4
|
||||
if vision_config['width'].get('clip-l-14-224') is not None:
|
||||
vision_config.update(vision_config['width']['clip-l-14-224'])
|
||||
if isinstance(vision_config['width'], int):
|
||||
vision_config['hidden_size'] = vision_config['width']
|
||||
if vision_config.get('heads') is not None:
|
||||
vision_config['num_heads'] = vision_config['heads']
|
||||
vision_config['intermediate_size'] = vision_config['heads'] * 4
|
||||
|
||||
return vision_config
|
||||
|
||||
def tensor_force_quant(self, name, new_name, bid, n_dims):
    """Pick a forced quantization type for a vision tensor.

    Tensors whose source name contains any of the markers below are always
    stored as F32; every other tensor is delegated to the parent class.
    """
    # substrings of tensor names that must not be quantized
    keep_f32_markers = ('.embeddings.', 'pos_embed', '.rel_pos_h', '.rel_pos_w', '.neck.', '.net_')
    if any(marker in name for marker in keep_f32_markers):
        return gguf.GGMLQuantizationType.F32
    return super().tensor_force_quant(name, new_name, bid, n_dims)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
    """Yield the converted tensors for one checkpoint tensor.

    A tensor whose name ends with "view_seperator" gets a leading axis
    inserted via ``unsqueeze(0)`` before it is handed to the parent class;
    all other tensors are forwarded unchanged.
    """
    is_view_separator = name.endswith("view_seperator")
    tensor = data_torch.unsqueeze(0) if is_view_separator else data_torch
    yield from super().modify_tensors(tensor, name, bid)
|
||||
|
||||
@classmethod
|
||||
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
|
||||
name, gen = item
|
||||
|
|
@ -81,6 +90,33 @@ class DeepseekOCRVisionModel(MmprojModel):
|
|||
return super().filter_tensors((name, gen))
|
||||
|
||||
|
||||
@ModelBase.register("DeepseekOCR2ForCausalLM")
class DeepseekOCR2VisionModel(DeepseekOCRVisionModel):
    """mmproj converter for the "DeepseekOCR2ForCausalLM" architecture.

    Reuses the DeepSeek-OCR converter and only changes the projector type,
    the hparams defaults and the vision config fields set below.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # replace the projector type assigned by the parent constructor
        self.clip_projector_type = gguf.VisionProjectorType.DEEPSEEKOCR2

    def set_gguf_parameters(self):
        """Fill in missing hparams, write the parent metadata, then the KV head count."""
        # the vision tower's qwen2 encoder is built from fixed defaults,
        # see build_qwen2_decoder_as_encoder() in deepencoderv2.py
        fallback_hparams = (
            ("patch_size", 16),
            ("intermediate_size", 4864),
            ("num_attention_heads", 14),
        )
        for key, fallback in fallback_hparams:
            # only fill in values that are absent (or explicitly None)
            if self.hparams.get(key) is None:
                self.hparams[key] = fallback

        super().set_gguf_parameters()

        # qwen2 encoder is GQA: 14 Q heads, 2 KV heads
        self.gguf_writer.add_vision_head_count_kv(2)

    def get_vision_config(self) -> dict[str, Any]:
        """Return the parent's vision config with qwen2-specific fields applied."""
        config = super().get_vision_config()
        # hidden size comes from the 'qwen2-0-5b' entry of the 'width' table
        qwen2_width = config['width']['qwen2-0-5b']
        config['hidden_size'] = qwen2_width['dim']
        # default layer count when the config does not provide one
        if config.get('layers') is None:
            config['layers'] = 24
        return config
|
||||
|
||||
|
||||
@ModelBase.register("DeepseekForCausalLM")
|
||||
class DeepseekModel(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.DEEPSEEK
|
||||
|
|
@ -188,13 +224,21 @@ class DeepseekV2Model(TextModel):
|
|||
self.origin_hf_arch = hparams.get('architectures', [None])[0]
|
||||
|
||||
# special handling for Deepseek OCR
|
||||
if self.origin_hf_arch == "DeepseekOCRForCausalLM":
|
||||
if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"):
|
||||
self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
|
||||
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
|
||||
self.gguf_writer.add_architecture()
|
||||
# default jinja template
|
||||
self.gguf_writer.add_chat_template("{% for m in messages %}{{m['content']}}{% endfor %}")
|
||||
|
||||
@classmethod
def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
    """Drop vision-encoder tensors from the text model conversion.

    Returns None for any tensor whose name contains "sam_model" or
    "qwen2_model"; everything else is passed to the parent filter.
    """
    tensor_name, _loader = item
    # DeepSeek-OCR vision encoder (SAM + DeepSeek-OCR-2 qwen2 tower)
    vision_markers = ("sam_model", "qwen2_model")
    if any(marker in tensor_name for marker in vision_markers):
        return None
    return super().filter_tensors(item)
|
||||
|
||||
def set_vocab(self):
|
||||
try:
|
||||
self._set_vocab_gpt2()
|
||||
|
|
|
|||
|
|
@ -812,6 +812,8 @@ class MODEL_TENSOR(IntEnum):
|
|||
V_SAM_NET_3 = auto() # Deepseek-OCR
|
||||
V_ENC_EMBD_IMGNL = auto() # Deepseek-OCR
|
||||
V_ENC_EMBD_VSEP = auto() # Deepseek-OCR
|
||||
V_RESMPL_QUERY_768 = auto() # Deepseek-OCR-2
|
||||
V_RESMPL_QUERY_1024 = auto() # Deepseek-OCR-2
|
||||
|
||||
# audio (mtmd)
|
||||
A_ENC_EMBD_POS = auto()
|
||||
|
|
@ -1329,6 +1331,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.V_SAM_NET_3: "v.sam.net_3",
|
||||
MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline", # Deepseek-OCR
|
||||
MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_seperator", # Deepseek-OCR
|
||||
MODEL_TENSOR.V_RESMPL_QUERY_768: "v.resample_query_768", # Deepseek-OCR-2 qwen2
|
||||
MODEL_TENSOR.V_RESMPL_QUERY_1024: "v.resample_query_1024", # Deepseek-OCR-2 qwen2
|
||||
# audio (mtmd)
|
||||
# note: all audio tensor names must use prefix "a." or "mm.a."
|
||||
MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
|
||||
|
|
@ -1507,6 +1511,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.V_SAM_NECK,
|
||||
MODEL_TENSOR.V_SAM_NET_2,
|
||||
MODEL_TENSOR.V_SAM_NET_3,
|
||||
MODEL_TENSOR.V_RESMPL_QUERY_768,
|
||||
MODEL_TENSOR.V_RESMPL_QUERY_1024,
|
||||
# audio
|
||||
MODEL_TENSOR.A_ENC_EMBD_POS,
|
||||
MODEL_TENSOR.A_ENC_EMBD_NORM,
|
||||
|
|
@ -4329,6 +4335,7 @@ class VisionProjectorType:
|
|||
JANUS_PRO = "janus_pro"
|
||||
DOTSOCR = "dots_ocr"
|
||||
DEEPSEEKOCR = "deepseekocr"
|
||||
DEEPSEEKOCR2 = "deepseekocr2"
|
||||
LFM2A = "lfm2a" # audio
|
||||
MUSIC_FLAMINGO = "musicflamingo" # audio
|
||||
GLM4V = "glm4v"
|
||||
|
|
|
|||
|
|
@ -1485,6 +1485,7 @@ class TensorNameMap:
|
|||
"siglip2.vision_model.encoder.layers.{bid}.self_attn.q_proj", # youtuvl
|
||||
"model.vision_model.transformer.layers.{bid}.self_attn.q_proj", # Deepseek-OCR CLIP, generated
|
||||
"vision_model.model.layers.{bid}.self_attn.q_proj.linear", # gemma4
|
||||
"model.qwen2_model.model.model.layers.{bid}.self_attn.q_proj" # Deepseek-OCR-2 qwen2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
|
||||
|
|
@ -1509,6 +1510,7 @@ class TensorNameMap:
|
|||
"model.vision_model.transformer.layers.{bid}.self_attn.k_proj", # Deepseek-OCR CLIP, generated
|
||||
"siglip2.vision_model.encoder.layers.{bid}.self_attn.k_proj",
|
||||
"vision_model.model.layers.{bid}.self_attn.k_proj.linear", # gemma4
|
||||
"model.qwen2_model.model.model.layers.{bid}.self_attn.k_proj" # Deepseek-OCR-2 qwen2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
|
||||
|
|
@ -1533,6 +1535,7 @@ class TensorNameMap:
|
|||
"siglip2.vision_model.encoder.layers.{bid}.self_attn.v_proj",
|
||||
"model.vision_model.transformer.layers.{bid}.self_attn.v_proj", # Deepseek-OCR CLIP, generated
|
||||
"vision_model.model.layers.{bid}.self_attn.v_proj.linear", # gemma4
|
||||
"model.qwen2_model.model.model.layers.{bid}.self_attn.v_proj" # Deepseek-OCR-2 qwen2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_INPUT_NORM: (
|
||||
|
|
@ -1554,6 +1557,7 @@ class TensorNameMap:
|
|||
"vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL
|
||||
"vision_tower.blocks.{bid}.norm1", # dots.ocr
|
||||
"vision_model.transformer.resblocks.{bid}.ln_1", # Step3-VL
|
||||
"model.qwen2_model.model.model.layers.{bid}.input_layernorm", # Deepseek-OCR-2 qwen2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_ATTN_O: (
|
||||
|
|
@ -1574,6 +1578,7 @@ class TensorNameMap:
|
|||
"model.vision_model.transformer.layers.{bid}.self_attn.out_proj", # Deepseek-OCR CLIP
|
||||
"siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
|
||||
"vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
|
||||
"model.qwen2_model.model.model.layers.{bid}.self_attn.o_proj", # Deepseek-OCR-2 qwen2
|
||||
"vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4
|
||||
"vision_tower.blocks.{bid}.attn.proj", # dots.ocr
|
||||
"vision_model.transformer.resblocks.{bid}.attn.out_proj", # Step3-VL
|
||||
|
|
@ -1603,6 +1608,7 @@ class TensorNameMap:
|
|||
"vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4
|
||||
"vision_tower.blocks.{bid}.norm2", # dots.ocr
|
||||
"vision_model.transformer.resblocks.{bid}.ln_2", # Step3-VL
|
||||
"model.qwen2_model.model.model.layers.{bid}.post_attention_layernorm", # Deepseek-OCR-2 qwen2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_FFN_UP: (
|
||||
|
|
@ -1625,6 +1631,7 @@ class TensorNameMap:
|
|||
"vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL
|
||||
"vision_model.model.layers.{bid}.mlp.up_proj", # gemma4
|
||||
"vision_model.transformer.resblocks.{bid}.mlp.c_fc", # Step3-VL
|
||||
"model.qwen2_model.model.model.layers.{bid}.mlp.up_proj", # Deepseek-OCR-2 qwen2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_FFN_GATE: (
|
||||
|
|
@ -1632,6 +1639,7 @@ class TensorNameMap:
|
|||
"vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral
|
||||
"visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
|
||||
"vision_model.model.layers.{bid}.mlp.gate_proj", # gemma4
|
||||
"model.qwen2_model.model.model.layers.{bid}.mlp.gate_proj", # Deepseek-OCR-2 qwen2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_FFN_DOWN: (
|
||||
|
|
@ -1652,6 +1660,7 @@ class TensorNameMap:
|
|||
"model.vision_model.transformer.layers.{bid}.mlp.fc2", # Deepseek-OCR CLIP
|
||||
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
|
||||
"vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL
|
||||
"model.qwen2_model.model.model.layers.{bid}.mlp.down_proj" , # Deepseek-OCR-2 qwen2
|
||||
"vision_model.model.layers.{bid}.mlp.down_proj", # gemma4
|
||||
"vision_model.transformer.resblocks.{bid}.mlp.c_proj", # Step3-VL
|
||||
),
|
||||
|
|
@ -1699,6 +1708,7 @@ class TensorNameMap:
|
|||
"vision_tower.encoder.final_layernorm", # kimi-vl
|
||||
"visual.post_layernorm", # glm4v
|
||||
"siglip2.vision_model.post_layernorm",
|
||||
"model.qwen2_model.model.model.norm", # Deepseek-OCR-2 qwen2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MM_POST_NORM: (
|
||||
|
|
@ -1879,6 +1889,14 @@ class TensorNameMap:
|
|||
"model.sam_model.net_3",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_RESMPL_QUERY_768: (
|
||||
"model.qwen2_model.query_768", # Deepseek-OCR-2 qwen2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_RESMPL_QUERY_1024: (
|
||||
"model.qwen2_model.query_1024", # Deepseek-OCR-2 qwen2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MM_POST_FC_NORM: (
|
||||
"model.vision.linear_proj.norm1", # cogvlm
|
||||
),
|
||||
|
|
|
|||
|
|
@ -40,6 +40,7 @@ add_library(mtmd
|
|||
models/siglip.cpp
|
||||
models/whisper-enc.cpp
|
||||
models/deepseekocr.cpp
|
||||
models/deepseekocr2.cpp
|
||||
models/mobilenetv5.cpp
|
||||
models/youtuvl.cpp
|
||||
models/yasa2.cpp
|
||||
|
|
|
|||
|
|
@ -188,6 +188,8 @@
|
|||
#define TN_SAM_FFN_DOWN "v.sam.blk.%d.mlp.lin2.%s"
|
||||
#define TN_SAM_NECK "v.sam.neck.%d.%s"
|
||||
#define TN_SAM_NET "v.sam.net_%d.%s"
|
||||
// deepseek-ocr-2
|
||||
#define TN_RESMPL_QUERY "v.resample_query_%d.%s"
|
||||
// (conformer) lfm2
|
||||
#define TN_PRE_ENCODE_OUT "a.pre_encode.out.%s"
|
||||
#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s"
|
||||
|
|
@ -337,6 +339,7 @@ enum projector_type {
|
|||
PROJECTOR_TYPE_JANUS_PRO,
|
||||
PROJECTOR_TYPE_DOTS_OCR,
|
||||
PROJECTOR_TYPE_DEEPSEEKOCR,
|
||||
PROJECTOR_TYPE_DEEPSEEKOCR2,
|
||||
PROJECTOR_TYPE_LFM2A,
|
||||
PROJECTOR_TYPE_GLM4V,
|
||||
PROJECTOR_TYPE_YOUTUVL,
|
||||
|
|
@ -386,6 +389,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
|||
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
|
||||
{ PROJECTOR_TYPE_DOTS_OCR, "dots_ocr"},
|
||||
{ PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"},
|
||||
{ PROJECTOR_TYPE_DEEPSEEKOCR2,"deepseekocr2"},
|
||||
{ PROJECTOR_TYPE_LFM2A, "lfm2a"},
|
||||
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
|
||||
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
|
||||
|
|
@ -424,6 +428,9 @@ struct clip_image_f32 {
|
|||
int ny;
|
||||
|
||||
std::vector<float> buf;
|
||||
|
||||
// marks the global view in e.g., DeepSeek-OCR Models
|
||||
bool add_viewsep = false;
|
||||
};
|
||||
|
||||
//
|
||||
|
|
|
|||
|
|
@ -542,6 +542,11 @@ struct clip_model {
|
|||
int32_t n_sam_layers = 12; // used by deepseek-ocr sam encoder
|
||||
|
||||
std::vector<clip_layer> sam_layers;
|
||||
|
||||
// deepseek-ocr-2
|
||||
ggml_tensor * resample_query_768 = nullptr;
|
||||
ggml_tensor * resample_query_1024 = nullptr;
|
||||
|
||||
// lfm2 audio
|
||||
std::array<ggml_tensor *, 7> pre_encode_conv_X_w = {nullptr};
|
||||
std::array<ggml_tensor *, 7> pre_encode_conv_X_b = {nullptr};
|
||||
|
|
|
|||
|
|
@ -953,6 +953,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
{
|
||||
builder = std::make_unique<clip_graph_deepseekocr>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR2:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_deepseekocr2>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2A:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_conformer>(ctx, img);
|
||||
|
|
@ -1514,6 +1518,7 @@ struct clip_model_loader {
|
|||
hparams.set_warmup_n_tokens(28*28); // avoid OOM on warmup
|
||||
} break;
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR2:
|
||||
{
|
||||
hparams.patch_size = 16;
|
||||
hparams.image_size = 1024;
|
||||
|
|
@ -1525,6 +1530,10 @@ struct clip_model_loader {
|
|||
get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);
|
||||
get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
|
||||
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
|
||||
if (model.proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) {
|
||||
// qwen2 encoder is GQA, requires KEY_N_HEAD_KV
|
||||
get_u32(string_format(KEY_N_HEAD_KV, "vision"), hparams.n_head_kv);
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANVL:
|
||||
{
|
||||
|
|
@ -2374,6 +2383,7 @@ struct clip_model_loader {
|
|||
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR2:
|
||||
{
|
||||
model.pos_embed = get_tensor(string_format(TN_SAM_POS_EMBD, "weight"));
|
||||
model.patch_embed_proj_w = get_tensor(string_format(TN_SAM_PATCH_EMBD, "weight"));
|
||||
|
|
@ -2404,10 +2414,12 @@ struct clip_model_loader {
|
|||
model.neck_3_w = get_tensor(string_format(TN_SAM_NECK, 3, "weight"));
|
||||
model.net_2 = get_tensor(string_format(TN_SAM_NET, 2, "weight"));
|
||||
model.net_3 = get_tensor(string_format(TN_SAM_NET, 3, "weight"));
|
||||
model.image_newline = get_tensor(TN_IMAGE_NEWLINE);
|
||||
model.image_newline = get_tensor(TN_IMAGE_NEWLINE, false);
|
||||
model.view_seperator = get_tensor(TN_IMAGE_SEPERATOR);
|
||||
model.mm_fc_w = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
|
||||
model.mm_fc_b = get_tensor(string_format(TN_MM_PROJECTOR, "bias"));
|
||||
model.resample_query_768 = get_tensor(string_format(TN_RESMPL_QUERY, 768, "weight"), false);
|
||||
model.resample_query_1024 = get_tensor(string_format(TN_RESMPL_QUERY, 1024, "weight"), false);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA4A:
|
||||
{
|
||||
|
|
@ -3277,7 +3289,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
|||
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||
{
|
||||
// SAM encoder applies two stride-2 convolutions (net_2 and net_3)
|
||||
// which reduces spatial dimensions by 4x in each direction (16x total)
|
||||
// that reduce spatial dimensions by 4x in each direction (16x total)
|
||||
// E.g., 64x64 -> 16x16 patches
|
||||
n_patches /= 16;
|
||||
|
||||
|
|
@ -3293,6 +3305,15 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
|||
int oh = (img->ny / patch_size) / merge;
|
||||
n_patches = (ow + 1) * oh + 2;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR2:
|
||||
{
|
||||
// 1024 global view -> 256 query tokens + 1 view separator = 257;
|
||||
// 768 local tile -> 144 query tokens, no separator.
|
||||
n_patches /= 16;
|
||||
if (img->add_viewsep) {
|
||||
n_patches += 1; // view separator, appended only after the global view
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_LFM2A:
|
||||
{
|
||||
n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
|
||||
|
|
@ -3882,6 +3903,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||
set_input_i32("pos_y", pos_y);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR2:
|
||||
{
|
||||
GGML_ASSERT(pos_w == pos_h);
|
||||
|
||||
|
|
@ -3904,6 +3926,34 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||
|
||||
set_input_i32("rel_pos_indices_local", rel_pos_indices_local);
|
||||
set_input_i32("rel_pos_indices_global", rel_pos_indices_global);
|
||||
|
||||
if (ctx->proj_type() == PROJECTOR_TYPE_DEEPSEEKOCR2) {
|
||||
|
||||
// qwen2 encoder attention mask
|
||||
|
||||
// num_image_tokens = num_patches / 16
|
||||
// 256 for 1024 global view
|
||||
// 144 for 768 tile views
|
||||
const int num_image_tokens = num_patches / 16;
|
||||
const int seq_len = num_image_tokens * 2;
|
||||
std::vector qwen2_mask(static_cast<size_t>(seq_len) * seq_len, 0.0f);
|
||||
|
||||
// attention mask layout
|
||||
// +--------------+---------------+
|
||||
// | all 0 | all -inf |
|
||||
// +--------------+---------------+
|
||||
// | all 0 | lower tri 0 |
|
||||
// +--------------+---------------+
|
||||
for (int i = 0; i < seq_len; i++) {
|
||||
for (int j = 0; j < seq_len; j++) {
|
||||
const bool zero = i < num_image_tokens ?
|
||||
j < num_image_tokens :
|
||||
j < num_image_tokens || j <= i;
|
||||
qwen2_mask[static_cast<size_t>(i) * seq_len + j] = zero ? 0.0f : -1e9f;
|
||||
}
|
||||
}
|
||||
set_input_f32("qwen2_attn_mask", qwen2_mask);
|
||||
}
|
||||
} break;
|
||||
case PROJECTOR_TYPE_GEMMA3:
|
||||
case PROJECTOR_TYPE_GEMMA3NV:
|
||||
|
|
@ -4256,6 +4306,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
|||
case PROJECTOR_TYPE_COGVLM:
|
||||
return ctx->model.mm_4h_to_h_w->ne[1];
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR:
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR2:
|
||||
return ctx->model.mm_fc_w->ne[1];
|
||||
case PROJECTOR_TYPE_LFM2A:
|
||||
return ctx->model.position_embeddings->ne[0];
|
||||
|
|
|
|||
|
|
@ -157,7 +157,6 @@ ggml_tensor * clip_graph_deepseekocr::build_sam(ggml_tensor * inp_raw) {
|
|||
|
||||
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
|
||||
cur = ggml_add(ctx0, cur, layer.qkv_b);
|
||||
cur = ggml_cont(ctx0, cur); // Ensure tensor is contiguous before reshape
|
||||
cur = ggml_reshape_4d(ctx0, cur, n_embd, 3, W * H, B);
|
||||
|
||||
ggml_tensor * Q;
|
||||
|
|
@ -251,17 +250,17 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
|
|||
ggml_tensor * inp_raw = build_inp_raw();
|
||||
ggml_tensor * sam_out = build_sam(inp_raw);
|
||||
|
||||
const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1];
|
||||
|
||||
ggml_tensor * clip_out;
|
||||
// Building DS-OCR CLIP
|
||||
{
|
||||
ggml_tensor * inp;
|
||||
|
||||
inp = ggml_cpy(ctx0, sam_out, ggml_dup_tensor(ctx0, sam_out));
|
||||
inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2]);
|
||||
inp = ggml_reshape_2d(ctx0, sam_out, clip_n_patches, sam_out->ne[2]);
|
||||
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
|
||||
|
||||
ggml_tensor * new_pos_embd =
|
||||
ggml_cpy(ctx0, model.position_embeddings, ggml_dup_tensor(ctx0, model.position_embeddings));
|
||||
ggml_tensor * new_pos_embd = model.position_embeddings;
|
||||
|
||||
int n_pos = new_pos_embd->ne[1]; // +1 for [CLS]
|
||||
const auto tgt_size = static_cast<int>(std::sqrt(inp->ne[1]));
|
||||
|
|
@ -295,16 +294,12 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
|
|||
clip_out = cur;
|
||||
}
|
||||
|
||||
const int clip_n_patches = sam_out->ne[0] * sam_out->ne[1];
|
||||
|
||||
sam_out = ggml_cont(ctx0, ggml_permute(ctx0, sam_out, 1, 2, 0, 3));
|
||||
sam_out = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0], clip_n_patches);
|
||||
clip_out = ggml_view_2d(ctx0, clip_out, n_embd, clip_n_patches, clip_out->nb[1], clip_out->nb[1]);
|
||||
|
||||
ggml_tensor * cur;
|
||||
cur = ggml_concat(ctx0, clip_out, sam_out, 0);
|
||||
cur = ggml_reshape_2d(ctx0, cur, 2 * n_embd, clip_n_patches);
|
||||
cur = ggml_cont(ctx0, cur);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_fc_b);
|
||||
|
||||
|
|
@ -313,13 +308,11 @@ ggml_cgraph * clip_graph_deepseekocr::build() {
|
|||
const auto n_dim = cur->ne[0];
|
||||
|
||||
ggml_tensor * imgnl;
|
||||
ggml_tensor * vs;
|
||||
|
||||
imgnl = ggml_repeat_4d(ctx0, model.image_newline, n_dim, 1, h, 1);
|
||||
vs = ggml_reshape_2d(ctx0, model.view_seperator, n_dim, 1); // (n_dim, 1)
|
||||
cur = ggml_reshape_3d(ctx0, cur, n_dim, w, h);
|
||||
cur = ggml_reshape_2d(ctx0, ggml_concat(ctx0, cur, imgnl, 1), n_dim, (w + 1) * h);
|
||||
cur = ggml_concat(ctx0, cur, vs, 1); // (n_dim, h*(w+1) + 1)
|
||||
cur = ggml_concat(ctx0, cur, model.view_seperator, 1); // (n_dim, h*(w+1) + 1)
|
||||
|
||||
cb(cur, "dsocr_output", -1);
|
||||
|
||||
|
|
|
|||
81
tools/mtmd/models/deepseekocr2.cpp
Normal file
81
tools/mtmd/models/deepseekocr2.cpp
Normal file
|
|
@ -0,0 +1,81 @@
|
|||
#include "models.h"
|
||||
|
||||
// Builds the DeepSeek-OCR 2 graph:
//   raw image -> SAM encoder (build_sam) -> qwen2 encoder with appended
//   query embeddings -> linear projector (mm_fc) -> optional view separator.
// Returns the graph `gf` with the projected output as its last expanded node.
ggml_cgraph * clip_graph_deepseekocr2::build() {
    // the KV head count must be set and must evenly divide the head count
    GGML_ASSERT(hparams.n_head_kv > 0);
    GGML_ASSERT(n_head % hparams.n_head_kv == 0);

    // patch embedding
    ggml_tensor * inp_raw = build_inp_raw();

    ggml_tensor * sam_out = build_sam(inp_raw);

    ggml_tensor * qwen2_out;
    // Building Qwen2 encoder
    {
        ggml_tensor * inp;

        inp = ggml_reshape_2d(ctx0, sam_out, sam_out->ne[0] * sam_out->ne[1], sam_out->ne[2]); // H*W, C
        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

        auto num_image_tokens = inp->ne[1]; // H*W
        GGML_ASSERT(num_image_tokens == 144 || num_image_tokens == 256);

        // query based on number of image tokens (in SAM output)
        // 16x16 -> query_1024 (1024x1024 images)
        // 12x12 -> query_768  (768x768 images)

        ggml_tensor * query_embed = model.resample_query_1024;
        int num_queries = 256;

        if (num_image_tokens == 144) {
            query_embed = model.resample_query_768;
            num_queries = 144;
        }

        // the resample query tensors are loaded with get_tensor(..., false) in
        // the model loader, so the selected one may be null; fail with a clear
        // message instead of dereferencing a null pointer in ggml_cast below
        GGML_ASSERT(query_embed != nullptr && "missing resample query tensor for this input resolution");

        // (B, num_image_tokens + num_queries, C)
        inp = ggml_concat(ctx0, inp, ggml_cast(ctx0, query_embed, inp->type), 1);

        auto seq_len = inp->ne[1];

        // qwen2 encoder attention mask
        // (graph input; its values are supplied under the name "qwen2_attn_mask")
        ggml_tensor * attn_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, seq_len, seq_len);
        ggml_set_name(attn_mask, "qwen2_attn_mask");
        ggml_set_input(attn_mask);

        // positions 0..seq_len-1 for RoPE
        ggml_tensor * inp_pos = ggml_cast(ctx0, ggml_arange(ctx0, 0, seq_len, 1), GGML_TYPE_I32);

        auto add_rope = [&](ggml_tensor * x, const clip_layer &) {
            return ggml_rope_ext(ctx0, x, inp_pos, nullptr, d_head,
                                 GGML_ROPE_TYPE_NEOX, 131072, 1000000, 1, 0, 1, 0, 0);
        };

        build_vit_opts vit_opts;
        vit_opts.attn_mask = attn_mask;

        // build_vit applies model.post_ln_w internally; do not re-apply
        ggml_tensor * cur = build_vit(inp, seq_len, NORM_TYPE_RMS, FFN_SILU,
                                      /* learned_pos_embd */ nullptr, add_rope, vit_opts);

        cur = ggml_cont(ctx0,
                        ggml_view_2d(ctx0, cur, cur->ne[0], num_queries, cur->nb[1],
                                     cur->nb[1] * (cur->ne[1] - num_queries)));  // only take query tokens for output

        ggml_build_forward_expand(gf, cur);
        qwen2_out = cur;
    }

    ggml_tensor * cur;

    // linear projector
    cur = ggml_mul_mat(ctx0, model.mm_fc_w, qwen2_out);
    cur = ggml_add(ctx0, cur, model.mm_fc_b);

    // view_seperator only after the global view
    if (img.add_viewsep) {
        cur = ggml_concat(ctx0, cur, model.view_seperator, 1);  // (n_dim, 257)
    }

    cb(cur, "dsocr2_output", -1);

    ggml_build_forward_expand(gf, cur);
    return gf;
}
|
||||
|
|
@ -121,6 +121,11 @@ struct clip_graph_deepseekocr : clip_graph {
|
|||
ggml_tensor * build_sam(ggml_tensor * inp); // build the SAM model
|
||||
};
|
||||
|
||||
struct clip_graph_deepseekocr2 : clip_graph_deepseekocr {
|
||||
clip_graph_deepseekocr2(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph_deepseekocr(ctx, img) {}
|
||||
ggml_cgraph * build() override; // reuses build_sam() from base
|
||||
};
|
||||
|
||||
struct clip_graph_conformer : clip_graph {
|
||||
clip_graph_conformer(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
|
|
|
|||
|
|
@ -1137,6 +1137,105 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img,
|
|||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// mtmd_image_preprocessor_deepseekocr2
|
||||
//
|
||||
|
||||
// candidate tile grids (cols, rows) with min_tiles <= cols*rows <= max_tiles
|
||||
// sorted by tile count
|
||||
std::vector<clip_image_size> mtmd_image_preprocessor_deepseekocr2::get_target_ratios() {
|
||||
std::vector<clip_image_size> ratios;
|
||||
for (int n = min_tiles; n <= max_tiles; n++) {
|
||||
for (int w = 1; w <= n; w++) {
|
||||
for (int h = 1; h <= n; h++) {
|
||||
if (w * h < min_tiles || w * h > max_tiles) {
|
||||
continue;
|
||||
}
|
||||
bool found = false;
|
||||
for (const auto & r : ratios) {
|
||||
if (r.width == w && r.height == h) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
ratios.push_back({ w, h });
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
std::sort(ratios.begin(), ratios.end(), [](const clip_image_size & a, const clip_image_size & b) {
|
||||
return a.width * a.height < b.width * b.height;
|
||||
});
|
||||
return ratios;
|
||||
}
|
||||
|
||||
// pick the grid whose aspect ratio is closest to the image
|
||||
// on a tie, prefer the larger grid when the image fits
|
||||
clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio(
|
||||
float aspect_ratio,
|
||||
const std::vector<clip_image_size> & target_ratios,
|
||||
int width,
|
||||
int height) {
|
||||
float best_ratio_diff = std::numeric_limits<float>::max();
|
||||
clip_image_size best_ratio = { 1, 1 };
|
||||
const float area = static_cast<float>(width * height);
|
||||
|
||||
for (const auto & ratio : target_ratios) {
|
||||
const float target_aspect_ratio = static_cast<float>(ratio.width) / ratio.height;
|
||||
const float ratio_diff = std::abs(aspect_ratio - target_aspect_ratio);
|
||||
if (ratio_diff < best_ratio_diff) {
|
||||
best_ratio_diff = ratio_diff;
|
||||
best_ratio = ratio;
|
||||
} else if (ratio_diff == best_ratio_diff) {
|
||||
const float target_area = static_cast<float>(tile_size * tile_size * ratio.width * ratio.height);
|
||||
if (area > 0.5f * target_area) {
|
||||
best_ratio = ratio;
|
||||
}
|
||||
}
|
||||
}
|
||||
return best_ratio;
|
||||
}
|
||||
|
||||
bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
|
||||
// emit 768x768 local tiles when the image is larger than a tile in either
|
||||
// dimension, then always a 1024x1024 global view. order: [tiles..., global].
|
||||
|
||||
if (img.nx > tile_size || img.ny > tile_size) {
|
||||
const float aspect_ratio = static_cast<float>(img.nx) / img.ny;
|
||||
const auto target_ratios = get_target_ratios();
|
||||
const clip_image_size grid = find_closest_aspect_ratio(aspect_ratio, target_ratios, img.nx, img.ny);
|
||||
|
||||
// stretch onto the grid (no aspect preserve), then crop tiles row-major.
|
||||
clip_image_u8 refined;
|
||||
img_tool::resize(img, refined, { tile_size * grid.width, tile_size * grid.height },
|
||||
RESIZE_ALGO_BICUBIC_PILLOW, PAD_NONE);
|
||||
|
||||
for (int row = 0; row < grid.height; row++) {
|
||||
for (int col = 0; col < grid.width; col++) {
|
||||
clip_image_u8 tile;
|
||||
img_tool::crop(refined, tile, col * tile_size, row * tile_size, tile_size, tile_size);
|
||||
clip_image_f32_ptr res(clip_image_f32_init());
|
||||
img_u8_to_f32(tile, *res, hparams.image_mean, hparams.image_std);
|
||||
output.entries.push_back(std::move(res));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// global view: aspect-preserving fit-and-pad to base_size.
|
||||
clip_image_u8 padded;
|
||||
img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW,
|
||||
PAD_NEAREST, hparams.image_pad_color);
|
||||
clip_image_f32_ptr global(clip_image_f32_init());
|
||||
img_u8_to_f32(padded, *global, hparams.image_mean, hparams.image_std);
|
||||
global->add_viewsep = true;
|
||||
output.entries.push_back(std::move(global));
|
||||
|
||||
output.grid_x = 1;
|
||||
output.grid_y = 1;
|
||||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// mtmd_image_preprocessor_step3vl
|
||||
//
|
||||
|
|
|
|||
|
|
@ -144,6 +144,26 @@ struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
|
|||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
};
|
||||
|
||||
// DeepSeek-OCR-2: a 1024x1024 global view, plus InternVL-style 768x768 local
|
||||
// tiles when the image is larger than a tile in either dimension.
|
||||
struct mtmd_image_preprocessor_deepseekocr2 : mtmd_image_preprocessor {
|
||||
static constexpr int base_size = 1024; // global view
|
||||
static constexpr int tile_size = 768; // local tile
|
||||
static constexpr int min_tiles = 2;
|
||||
static constexpr int max_tiles = 6;
|
||||
|
||||
mtmd_image_preprocessor_deepseekocr2(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
|
||||
bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
|
||||
|
||||
private:
|
||||
static std::vector<clip_image_size> get_target_ratios();
|
||||
static clip_image_size find_closest_aspect_ratio(
|
||||
float aspect_ratio,
|
||||
const std::vector<clip_image_size> & target_ratios,
|
||||
int width,
|
||||
int height);
|
||||
};
|
||||
|
||||
// custom image preprocessing for Step3VL
|
||||
// ref: https://huggingface.co/stepfun-ai/Step3-VL-10B/blob/main/processing_step3.py
|
||||
struct mtmd_image_preprocessor_step3vl : mtmd_image_preprocessor_llava_uhd {
|
||||
|
|
|
|||
|
|
@ -493,6 +493,11 @@ struct mtmd_context {
|
|||
img_end = "\n"; // prevent empty batch on llama-server
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_DEEPSEEKOCR2:
|
||||
{
|
||||
img_end = "\n"; // prevent empty batch on llama-server
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr2>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_HUNYUANVL:
|
||||
{
|
||||
// note: these use fullwidth | (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
|
||||
|
|
@ -1091,16 +1096,21 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
|
|||
if (clip_is_llava(ctx_clip)
|
||||
|| proj_type == PROJECTOR_TYPE_MINICPMV
|
||||
|| proj_type == PROJECTOR_TYPE_GLM_EDGE
|
||||
|| proj_type == PROJECTOR_TYPE_INTERNVL) {
|
||||
|| proj_type == PROJECTOR_TYPE_INTERNVL
|
||||
|| proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2) {
|
||||
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
|
||||
const auto & entries = image_tokens->batch_f32.entries;
|
||||
// entries may have different token counts
|
||||
// e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view
|
||||
size_t offset = 0;
|
||||
for (size_t i = 0; i < entries.size(); i++) {
|
||||
int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
|
||||
ok = clip_image_encode(
|
||||
ctx_clip,
|
||||
ctx->n_threads,
|
||||
entries[i].get(),
|
||||
ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
|
||||
ctx->image_embd_v.data() + offset);
|
||||
offset += static_cast<size_t>(n_mmproj_embd) * n_tokens_per_image;
|
||||
}
|
||||
} else {
|
||||
ok = clip_image_batch_encode(
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
Evaluates llama.cpp's DeepSeek-OCR by comparing its output for a test
|
||||
image to the actual text in part of that image.
|
||||
|
||||
Runs the test image through mtmd-cli, calculates CER and chrF for
|
||||
Runs each test image through mtmd-cli, calculates CER and chrF for
|
||||
its output, and holds them against the HF model's scores.
|
||||
"""
|
||||
|
||||
|
|
@ -12,24 +12,81 @@ import logging
|
|||
import subprocess
|
||||
import sys
|
||||
import unicodedata
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger("deepseek-ocr-test")
|
||||
|
||||
DEFAULT_IMAGE = "test-1.jpeg"
|
||||
DEFAULT_EXPECTED_TEXT = "test-1-ground-truth.txt"
|
||||
RUN_TIMEOUT = 300
|
||||
|
||||
# DeepSeek-OCR reference scores on the test image.
|
||||
# This is the baseline the implementation should keep up with.
|
||||
HF_REFERENCE_CER = 0.3030
|
||||
HF_REFERENCE_CHRF = 67.52
|
||||
|
||||
CER_TOLERANCE = 0.02
|
||||
CHRF_TOLERANCE = 2.0
|
||||
@dataclass(frozen=True)
class ModelSpec:
    """Static description of one DeepSeek-OCR model variant under test.

    Instances are shared module-level constants, so the dataclass is frozen:
    an accidental assignment raises instead of silently changing the
    configuration seen by later test cases.
    """

    key: str             # short identifier used to look the spec up (e.g. "v1")
    label: str           # human-readable model name used in log output and help text
    model_arg: str       # CLI flag that overrides the GGUF model path
    mmproj_arg: str      # CLI flag that overrides the mmproj GGUF path
    model_default: str   # default GGUF model path
    mmproj_default: str  # default mmproj GGUF path
|
||||
|
||||
CER_MAX = HF_REFERENCE_CER + CER_TOLERANCE
|
||||
CHRF_MIN = HF_REFERENCE_CHRF - CHRF_TOLERANCE
|
||||
|
||||
@dataclass(frozen=True)
class TestCase:
    """One OCR parity check: a model variant, an input image, and its thresholds.

    The pass thresholds are derived from the reference scores and the
    per-case tolerances (see ``cer_max`` and ``chrf_min``). Frozen because
    cases are shared module-level constants.
    """

    # The name starts with "Test" but this is plain data, not a pytest test
    # class; opt out of collection so importing it never triggers a warning.
    __test__ = False

    model_key: str     # key of the model variant this case runs against
    label: str         # short description shown in the log and summary
    image: str         # path to the input image
    ground_truth: str  # path to the expected transcript
    hf_cer: float      # reference character error rate (lower is better)
    hf_chrf: float     # reference chrF score on a 0-100 scale (higher is better)
    cer_tol: float     # allowed CER regression above the reference
    chrf_tol: float    # allowed chrF regression below the reference

    @property
    def cer_max(self) -> float:
        """Return the highest CER that still passes."""
        return self.hf_cer + self.cer_tol

    @property
    def chrf_min(self) -> float:
        """Return the lowest chrF that still passes."""
        return self.hf_chrf - self.chrf_tol
|
||||
|
||||
|
||||
# Model variants under test, indexed by each spec's own ``key`` so the lookup
# key and the spec can never disagree.
MODELS = {
    spec.key: spec
    for spec in (
        ModelSpec(
            key="v1",
            label="DeepSeek-OCR",
            model_arg="--llama-model",
            mmproj_arg="--mmproj",
            model_default="gguf_models/deepseek-ai/deepseek-ocr-bf16.gguf",
            mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-bf16.gguf",
        ),
        ModelSpec(
            key="v2",
            label="DeepSeek-OCR-2",
            model_arg="--llama-model-2",
            mmproj_arg="--mmproj-2",
            model_default="gguf_models/deepseek-ai/deepseek-ocr-2-bf16.gguf",
            mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-2-bf16.gguf",
        ),
    )
}
|
||||
|
||||
# Parity checks to run, in order. Paths are relative to the repository root.
CASES = [
    TestCase(
        model_key="v1",
        label="single-view scan",
        image="tools/mtmd/test-1.jpeg",
        ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
        hf_cer=0.3030,
        hf_chrf=67.52,
        cer_tol=0.02,
        chrf_tol=2.0,
    ),
    TestCase(
        model_key="v2",
        label="single-view scan",
        image="tools/mtmd/test-1.jpeg",
        ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
        # The image is 640x488, below the 768 tiling threshold, so only the
        # single 1024 global view is produced.
        # The reference scores are the deepseek-ai repo's own (ImageOps.pad).
        # The transformers HF processor is *not* the reference: its
        # pad_to_square is one pixel off and lands at ~0.69 instead.
        hf_cer=0.7761,
        hf_chrf=28.70,
        cer_tol=0.12,
        chrf_tol=8.0,
    ),
]
|
||||
|
||||
|
||||
def arg_dest(flag: str) -> str:
    """Return the attribute name under which a CLI flag's value is stored.

    Leading dashes are dropped and every remaining dash becomes an
    underscore, e.g. ``"--llama-model-2"`` -> ``"llama_model_2"``.
    """
    bare_name = flag.lstrip("-")
    return "_".join(bare_name.split("-"))
|
||||
|
||||
|
||||
def verdict(ok: bool) -> str:
|
||||
|
|
@ -84,6 +141,14 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
|
|||
"--temp", "0",
|
||||
"--flash-attn", "off", # match the HF "eager" attention reference
|
||||
"--no-warmup",
|
||||
"-n", "512", # cap loops on hard images (KV would otherwise fill)
|
||||
# HF decodes with no_repeat_ngram_size; llama.cpp's analog is DRY.
|
||||
# Default DRY breakers include "\n", so they are cleared below.
|
||||
"--dry-multiplier", "0.8",
|
||||
"--dry-base", "1.75",
|
||||
"--dry-allowed-length", "2",
|
||||
"--dry-penalty-last-n", "-1",
|
||||
"--dry-sequence-breaker", "none",
|
||||
]
|
||||
logger.debug(f" command: {' '.join(cmd)}")
|
||||
|
||||
|
|
@ -110,7 +175,7 @@ def read_expected_text(file_path: Path) -> str:
|
|||
return f.read().strip()
|
||||
|
||||
|
||||
def evaluate(expected: str, ocr_out: str) -> bool:
|
||||
def evaluate(case: "TestCase", expected: str, ocr_out: str) -> bool:
|
||||
expected = normalize_text(expected)
|
||||
ocr_out = normalize_text(ocr_out)
|
||||
aligned = locally_align(expected, ocr_out)
|
||||
|
|
@ -122,16 +187,16 @@ def evaluate(expected: str, ocr_out: str) -> bool:
|
|||
cer = compute_cer(expected, aligned)
|
||||
chrf = compute_chrf(expected, aligned)
|
||||
|
||||
cer_pass = cer <= CER_MAX
|
||||
chrf_pass = chrf >= CHRF_MIN
|
||||
cer_pass = cer <= case.cer_max
|
||||
chrf_pass = chrf >= case.chrf_min
|
||||
passed = cer_pass and chrf_pass
|
||||
|
||||
logger.info("")
|
||||
logger.info("=" * 60)
|
||||
logger.info("Free OCR evaluation:")
|
||||
logger.info("=" * 60)
|
||||
logger.info(f" CER {cer:>7.4f} (<= {CER_MAX:>7.4f} -> {verdict(cer_pass)})")
|
||||
logger.info(f" chrF (0-100) {chrf:>7.2f} (>= {CHRF_MIN:>7.2f} -> {verdict(chrf_pass)})")
|
||||
logger.info(f" CER {cer:>7.4f} (HF {case.hf_cer:.4f}, <= {case.cer_max:>7.4f} -> {verdict(cer_pass)})")
|
||||
logger.info(f" chrF (0-100) {chrf:>7.2f} (HF {case.hf_chrf:.2f}, >= {case.chrf_min:>7.2f} -> {verdict(chrf_pass)})")
|
||||
logger.info(f" Expected chars {len(expected):>7}")
|
||||
logger.info(f" Aligned chars {len(aligned):>7} (of {len(ocr_out)} OCR chars)")
|
||||
logger.info("")
|
||||
|
|
@ -142,12 +207,13 @@ def evaluate(expected: str, ocr_out: str) -> bool:
|
|||
|
||||
def argument_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the OCR parity test.

    Options:
      * ``--llama-bin``: path to the llama-mtmd-cli binary.
      * one model-path and one mmproj-path option per entry in ``MODELS``,
        using the flag names and defaults stored on each spec.
      * ``--verbose``: also log the expected, OCR, and aligned text.

    The per-model options are registered only through the ``MODELS`` loop.
    Registering ``--llama-model`` / ``--mmproj`` by hand as well would make
    argparse reject the duplicate option strings.
    """
    ap = argparse.ArgumentParser(description="Compare llama.cpp DeepSeek-OCR output with a ground-truth transcript")
    ap.add_argument("--llama-bin", default="build/bin/llama-mtmd-cli",
                    help="Path to llama-mtmd-cli binary (relative to repo root or absolute)")
    for spec in MODELS.values():
        ap.add_argument(spec.model_arg, default=spec.model_default,
                        help=f"Path to the {spec.label} GGUF model (relative to repo root or absolute)")
        ap.add_argument(spec.mmproj_arg, default=spec.mmproj_default,
                        help=f"Path to the {spec.label} mmproj GGUF file (relative to repo root or absolute)")
    ap.add_argument("--verbose", action="store_true",
                    help="Also log the expected, OCR, and aligned text")
    return ap
|
||||
|
|
@ -167,53 +233,60 @@ def main() -> int:
|
|||
args = argument_parser().parse_args()
|
||||
configure_logging(args.verbose)
|
||||
|
||||
tests_dir = Path(__file__).parent # tools/mtmd/tests
|
||||
mtmd_dir = tests_dir.parent # tools/mtmd
|
||||
repo_root = mtmd_dir.parent.parent # repo root
|
||||
repo_root = Path(__file__).resolve().parents[3] # tests -> mtmd -> tools -> repo root
|
||||
binary = resolve_path(args.llama_bin, repo_root)
|
||||
|
||||
inputs = [
|
||||
("image", resolve_path(DEFAULT_IMAGE, mtmd_dir)),
|
||||
("expected-text", resolve_path(DEFAULT_EXPECTED_TEXT, tests_dir)),
|
||||
("model", resolve_path(args.llama_model, repo_root)),
|
||||
("mmproj", resolve_path(args.mmproj, repo_root)),
|
||||
("binary", resolve_path(args.llama_bin, repo_root)),
|
||||
]
|
||||
for label, path in inputs:
|
||||
if not path.exists():
|
||||
logger.error(f"Error: {label} not found: {path}")
|
||||
return 1
|
||||
paths = dict(inputs)
|
||||
|
||||
logger.info("=" * 60)
|
||||
logger.info("DeepSeek-OCR: llama.cpp vs ground-truth comparison")
|
||||
logger.info("=" * 60)
|
||||
logger.info(f"HF baselines: CER {HF_REFERENCE_CER:.4f}, chrF {HF_REFERENCE_CHRF:.2f}")
|
||||
logger.info(f"Test thresholds: CER <= {CER_MAX:.4f}, chrF >= {CHRF_MIN:.2f}")
|
||||
|
||||
logger.debug("")
|
||||
logger.debug("Resolved test inputs:")
|
||||
for label, path in inputs:
|
||||
logger.debug(f" {label:<14} {path}")
|
||||
|
||||
logger.info("")
|
||||
logger.info("[1/3] Running llama.cpp 'Free OCR'")
|
||||
try:
|
||||
ocr_out = run_mtmd_cli(paths["model"], paths["mmproj"],
|
||||
paths["image"], paths["binary"])
|
||||
except RuntimeError as e:
|
||||
logger.error(f"Error: {e}")
|
||||
if not binary.exists():
|
||||
logger.error(f"Error: binary not found: {binary}")
|
||||
return 1
|
||||
|
||||
logger.info("")
|
||||
logger.info("[2/3] Reading expected output")
|
||||
expected = read_expected_text(paths["expected-text"])
|
||||
logger.info(f" expected: {len(expected)} chars")
|
||||
logger.info("=" * 60)
|
||||
logger.info("DeepSeek-OCR: llama.cpp vs HF parity check")
|
||||
logger.info("=" * 60)
|
||||
|
||||
results = {}
|
||||
for case in CASES:
|
||||
model_spec = MODELS[case.model_key]
|
||||
title = f"{model_spec.label} -- {case.label}"
|
||||
|
||||
logger.info("")
|
||||
logger.info(f"=== {title} ===")
|
||||
|
||||
model = resolve_path(getattr(args, arg_dest(model_spec.model_arg)), repo_root)
|
||||
mmproj = resolve_path(getattr(args, arg_dest(model_spec.mmproj_arg)), repo_root)
|
||||
image = resolve_path(case.image, repo_root)
|
||||
ground_truth = resolve_path(case.ground_truth, repo_root)
|
||||
|
||||
missing = [(lbl, p) for lbl, p in [("model", model), ("mmproj", mmproj),
|
||||
("image", image), ("ground-truth", ground_truth)]
|
||||
if not p.exists()]
|
||||
if missing:
|
||||
for lbl, p in missing:
|
||||
logger.error(f" Error: {lbl} not found: {p}")
|
||||
results[title] = False
|
||||
continue
|
||||
|
||||
expected = read_expected_text(ground_truth)
|
||||
logger.info(f" Image: {case.image}")
|
||||
logger.info(f" Expected text: {len(expected)} chars")
|
||||
logger.info(" Running llama.cpp 'Free OCR'")
|
||||
try:
|
||||
ocr_out = run_mtmd_cli(model, mmproj, image, binary)
|
||||
except RuntimeError as e:
|
||||
logger.error(f" Error: {e}")
|
||||
results[title] = False
|
||||
continue
|
||||
|
||||
results[title] = evaluate(case, expected, ocr_out)
|
||||
|
||||
logger.info("")
|
||||
logger.info("[3/3] Computing OCR metrics")
|
||||
ok = evaluate(expected, ocr_out)
|
||||
logger.info("=== Summary ===")
|
||||
for title, ok in results.items():
|
||||
logger.info(f" {title:<48} {verdict(ok)}")
|
||||
all_passed = all(results.values())
|
||||
logger.info(f"Overall: {verdict(all_passed)}")
|
||||
|
||||
return 0 if ok else 1
|
||||
return 0 if all_passed else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue