mirror of
https://github.com/LostRuins/koboldcpp.git
synced 2026-05-30 12:03:38 +00:00
* mtmd: DeepSeek-OCR 2 support, with multi-tile dynamic resolution
* introduced clip_image_f32::add_viewsep
* address PR review
  - drop redundant ggml_cpy ops in both deepseekocr versions build
  - drop no-op ggml_cont in build_sam
  - assert num_image_tokens deepseekocr2
  - view_seperator as (1, n_embd) at conversion (for both versions)
  - drop redundant ggml_reshape_2d
* Update tools/mtmd/models/deepseekocr2.cpp

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

---------

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
81 lines
2.7 KiB
C++
81 lines
2.7 KiB
C++
#include "models.h"
|
|
|
|
// Builds the DeepSeek-OCR 2 vision graph.
//
// Pipeline, in the order the nodes are created:
//   1. raw input -> build_sam()
//   2. SAM output flattened to a token sequence and extended with the learned
//      resampler queries (resample_query_768 / resample_query_1024)
//   3. Qwen2 encoder via build_vit() with RoPE and an external attention mask
//   4. the trailing query rows are kept and projected with mm_fc_w / mm_fc_b
//   5. optionally, model.view_seperator is concatenated along dim 1
//
// Returns the member graph `gf` after the final output has been expanded into it.
ggml_cgraph * clip_graph_deepseekocr2::build() {
    // the KV head count must be positive and divide n_head evenly
    GGML_ASSERT(hparams.n_head_kv > 0);
    GGML_ASSERT(n_head % hparams.n_head_kv == 0);

    // patch embedding
    ggml_tensor * raw_input    = build_inp_raw();
    ggml_tensor * sam_features = build_sam(raw_input);

    // ---- Qwen2 encoder ----

    // collapse the first two dims of the SAM output: (H*W, C) ...
    ggml_tensor * flat = ggml_reshape_2d(ctx0, sam_features,
                                         sam_features->ne[0] * sam_features->ne[1],
                                         sam_features->ne[2]);
    // ... then swap the two dims so that ne[1] counts image tokens
    ggml_tensor * tokens = ggml_permute(ctx0, flat, 1, 0, 2, 3);
    tokens = ggml_cont(ctx0, tokens);

    const auto n_img_tokens = tokens->ne[1]; // H*W
    GGML_ASSERT(n_img_tokens == 144 || n_img_tokens == 256);

    // the query set follows the number of image tokens in the SAM output:
    //   16x16 -> query_1024 (1024x1024 images)
    //   12x12 -> query_768  (768x768 images)
    const bool    use_768     = (n_img_tokens == 144);
    ggml_tensor * query_embed = use_768 ? model.resample_query_768 : model.resample_query_1024;
    const int     n_queries   = use_768 ? 144 : 256;

    // (B, num_image_tokens + num_queries, C)
    // queries are cast to the token tensor's type before being appended along dim 1
    ggml_tensor * queries  = ggml_cast(ctx0, query_embed, tokens->type);
    ggml_tensor * sequence = ggml_concat(ctx0, tokens, queries, 1);

    const auto seq_len = sequence->ne[1];

    // qwen2 encoder attention mask: a seq_len x seq_len F32 tensor registered as
    // a named graph input; its contents are not written in this function
    ggml_tensor * attn_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, seq_len, seq_len);
    ggml_set_name(attn_mask, "qwen2_attn_mask");
    ggml_set_input(attn_mask);

    // positions 0 .. seq_len-1, converted to I32 for use as the RoPE position tensor
    ggml_tensor * pos_range = ggml_arange(ctx0, 0, seq_len, 1);
    ggml_tensor * positions = ggml_cast(ctx0, pos_range, GGML_TYPE_I32);

    // RoPE callback handed to build_vit; the layer argument is unused
    auto apply_rope = [&](ggml_tensor * t, const clip_layer &) {
        return ggml_rope_ext(ctx0, t, positions, nullptr, d_head,
                             GGML_ROPE_TYPE_NEOX, 131072, 1000000, 1, 0, 1, 0, 0);
    };

    build_vit_opts opts;
    opts.attn_mask = attn_mask;

    // build_vit applies model.post_ln_w internally; do not re-apply
    ggml_tensor * encoded = build_vit(sequence, seq_len, NORM_TYPE_RMS, FFN_SILU,
                                      /* learned_pos_embd */ nullptr, apply_rope, opts);

    // only take query tokens for output: the view starts n_queries rows before
    // the end of dim 1 and is then made contiguous
    ggml_tensor * query_rows = ggml_view_2d(ctx0, encoded,
                                            encoded->ne[0], n_queries,
                                            encoded->nb[1],
                                            encoded->nb[1] * (encoded->ne[1] - n_queries));
    ggml_tensor * qwen2_out = ggml_cont(ctx0, query_rows);

    ggml_build_forward_expand(gf, qwen2_out);

    // ---- projection ----

    ggml_tensor * output = ggml_mul_mat(ctx0, model.mm_fc_w, qwen2_out);
    output = ggml_add(ctx0, output, model.mm_fc_b);

    // view_seperator only after the global view
    if (img.add_viewsep) {
        // (n_dim, 257) in the 256-query case
        output = ggml_concat(ctx0, output, model.view_seperator, 1);
    }

    cb(output, "dsocr2_output", -1);

    ggml_build_forward_expand(gf, output);
    return gf;
}
|