Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	.github/workflows/build-riscv.yml
#	.github/workflows/build.yml
#	ggml/src/ggml-hexagon/htp/argsort-ops.c
#	ggml/src/ggml-sycl/fattn-tile.hpp
#	tools/mtmd/CMakeLists.txt
This commit is contained in:
Concedo 2026-04-06 20:56:02 +08:00
commit a395af65db
16 changed files with 296 additions and 19 deletions

View file

@ -55,6 +55,7 @@
#include "models/conformer.cpp"
#include "models/gemma4v.cpp"
#include "models/glm4v.cpp"
#include "models/hunyuanocr.cpp"
#include "models/internvl.cpp"
#include "models/kimivl.cpp"
#include "models/kimik25.cpp"
@ -954,6 +955,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
{
builder = std::make_unique<clip_graph_hunyuanocr>(ctx, img);
} break;
case PROJECTOR_TYPE_MLP:
case PROJECTOR_TYPE_MLP_NORM:
case PROJECTOR_TYPE_LDP:
@ -1488,6 +1493,14 @@ struct clip_model_loader {
get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
{
hparams.n_merge = 2;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
hparams.set_warmup_n_tokens(28*28);
} break;
case PROJECTOR_TYPE_LFM2A:
{
// audio preprocessing params
@ -2120,6 +2133,22 @@ struct clip_model_loader {
model.mm_boi = get_tensor(TN_TOK_BOI);
model.mm_eoi = get_tensor(TN_TOK_EOI);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
{
// proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear)
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
model.mm_model_proj = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
model.mm_model_proj_b = get_tensor(string_format(TN_MM_PROJECTOR, "bias"));
model.mm_pre_norm_w = get_tensor(string_format(TN_MM_PRE_NORM, "weight"));
model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
model.mm_img_begin = get_tensor(TN_TOK_IMG_BEGIN);
model.mm_img_end = get_tensor(TN_TOK_IMG_END);
model.image_newline = get_tensor(TN_IMAGE_NEWLINE);
model.view_seperator = get_tensor(TN_IMAGE_SEPERATOR, false);
} break;
case PROJECTOR_TYPE_JANUS_PRO:
{
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
@ -2958,6 +2987,13 @@ void setup_init_vision_shim_kcpp(struct clip_ctx * ctx_v) {
img_end = "\n"; // prevent empty batch on llama-server
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
{
// note: these use fullwidth (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
img_beg = "<hy_place▁holder▁no▁100>";
img_end = "<hy_place▁holder▁no▁101>";
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
} break;
default:
throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
}
@ -3039,6 +3075,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_PADDLEOCR:
case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_YOUTUVL:
return (img->nx / params.patch_size) / 2;
default:
@ -3223,6 +3260,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
n_patches = h * (h + 1) + 1;
} break;
case PROJECTOR_TYPE_HUNYUANOCR:
{
int merge = ctx->model.hparams.n_merge;
int ow = (img->nx / patch_size) / merge;
int oh = (img->ny / patch_size) / merge;
n_patches = (ow + 1) * oh + 2;
} break;
case PROJECTOR_TYPE_LFM2A:
{
n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
@ -3630,6 +3674,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
case PROJECTOR_TYPE_JANUS_PRO:
case PROJECTOR_TYPE_PHI4:
case PROJECTOR_TYPE_COGVLM:
case PROJECTOR_TYPE_HUNYUANOCR:
{
// do nothing
} break;
@ -3998,6 +4043,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_PADDLEOCR:
case PROJECTOR_TYPE_KIMIK25:
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_HUNYUANOCR:
return ctx->model.mm_model_proj->ne[1];
case PROJECTOR_TYPE_COGVLM:
return ctx->model.mm_4h_to_h_w->ne[1];
case PROJECTOR_TYPE_DEEPSEEKOCR: