koboldcpp/tools/mtmd/models/deepseekocr2.cpp
Saba Fallah da3f990a47
mtmd: Add DeepSeekOCR 2 Support (#20975)
* mtmd: DeepSeek-OCR 2 support, with multi-tile dynamic resolution

* introduced clip_image_f32::add_viewsep

* address PR review

- drop redundant ggml_cpy ops in both deepseekocr versions build
- drop no-op ggml_cont in build_sam
- assert num_image_tokens deepseekocr2
- view_seperator as (1, n_embd) at conversion (for both versions)
- drop redundant ggml_reshape_2d

* Update tools/mtmd/models/deepseekocr2.cpp

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

---------

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
2026-05-29 16:13:51 +02:00

81 lines
2.7 KiB
C++

#include "models.h"
// Builds the vision graph for DeepSeek-OCR 2.
//
// Pipeline, as assembled below:
//   1. raw image input -> build_sam()
//   2. SAM output flattened to a token sequence, with a bank of resampler
//      query embeddings appended after the image tokens
//   3. build_vit() run as the Qwen2 encoder (RMS norm, SiLU FFN, NeoX RoPE,
//      externally supplied attention mask)
//   4. only the trailing query tokens are kept and projected through
//      mm_fc_w / mm_fc_b
//   5. optionally, model.view_seperator is appended when img.add_viewsep is set
//
// Returns the graph `gf` with the final tensor registered as "dsocr2_output".
ggml_cgraph * clip_graph_deepseekocr2::build() {
    GGML_ASSERT(hparams.n_head_kv > 0);
    GGML_ASSERT(n_head % hparams.n_head_kv == 0);

    // patch embedding + SAM stage
    ggml_tensor * raw_pixels   = build_inp_raw();
    ggml_tensor * sam_features = build_sam(raw_pixels);

    // ---- Qwen2 encoder ----

    // collapse the first two dims of the SAM output: (H*W, C) ...
    const auto n_spatial  = sam_features->ne[0] * sam_features->ne[1];
    const auto n_channels = sam_features->ne[2];
    ggml_tensor * tokens = ggml_reshape_2d(ctx0, sam_features, n_spatial, n_channels);
    // ... then swap the two axes and make the result contiguous
    tokens = ggml_cont(ctx0, ggml_permute(ctx0, tokens, 1, 0, 2, 3));

    const auto n_image_tokens = tokens->ne[1]; // H*W
    GGML_ASSERT(n_image_tokens == 144 || n_image_tokens == 256);

    // The query bank is chosen from the number of image tokens in the SAM output:
    //   16x16 -> query_1024 (1024x1024 images)
    //   12x12 -> query_768  (768x768 images)
    const bool          small_view = (n_image_tokens == 144);
    ggml_tensor *       queries    = small_view ? model.resample_query_768 : model.resample_query_1024;
    const int           n_queries  = small_view ? 144 : 256;

    // (B, num_image_tokens + num_queries, C): queries go after the image tokens
    ggml_tensor * queries_typed = ggml_cast(ctx0, queries, tokens->type);
    tokens = ggml_concat(ctx0, tokens, queries_typed, 1);
    const auto n_seq = tokens->ne[1];

    // square F32 attention mask for the Qwen2 encoder, declared as a named graph input
    ggml_tensor * mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_seq, n_seq);
    ggml_set_name(mask, "qwen2_attn_mask");
    ggml_set_input(mask);

    // sequential positions 0 .. n_seq-1, converted to I32 for the rope op
    ggml_tensor * pos_f32   = ggml_arange(ctx0, 0, n_seq, 1);
    ggml_tensor * positions = ggml_cast(ctx0, pos_f32, GGML_TYPE_I32);

    // per-layer hook handed to build_vit: NeoX-style rotary embedding over d_head dims
    auto apply_rope = [&](ggml_tensor * t, const clip_layer &) {
        return ggml_rope_ext(ctx0, t, positions, nullptr, d_head,
                             GGML_ROPE_TYPE_NEOX, 131072, 1000000, 1, 0, 1, 0, 0);
    };

    build_vit_opts encoder_opts;
    encoder_opts.attn_mask = mask;

    // build_vit applies model.post_ln_w internally; do not re-apply
    ggml_tensor * encoded = build_vit(tokens, n_seq, NORM_TYPE_RMS, FFN_SILU,
                                      /* learned_pos_embd */ nullptr, apply_rope, encoder_opts);

    // only the query tokens are used as output: view the last n_queries rows
    const auto row_bytes       = encoded->nb[1];
    const auto first_query_row = encoded->ne[1] - n_queries;
    ggml_tensor * query_rows = ggml_view_2d(ctx0, encoded, encoded->ne[0], n_queries,
                                            row_bytes, row_bytes * first_query_row);
    ggml_tensor * qwen2_out = ggml_cont(ctx0, query_rows);
    ggml_build_forward_expand(gf, qwen2_out);

    // ---- projector ----
    ggml_tensor * projected = ggml_mul_mat(ctx0, model.mm_fc_w, qwen2_out);
    projected = ggml_add(ctx0, projected, model.mm_fc_b);

    // view_seperator only after the global view; it is appended along dim 1
    if (img.add_viewsep) {
        projected = ggml_concat(ctx0, projected, model.view_seperator, 1);
    }

    cb(projected, "dsocr2_output", -1);
    ggml_build_forward_expand(gf, projected);
    return gf;
}