model: support GLM4V vision encoder (#18042)

* convert ok

* no deepstack

* less new tensors

* cgraph ok

* add mrope for text model

* faster patch merger

* add GGML_ROPE_TYPE_MRNORM

* add support for metal

* move glm4v do dedicated graph

* convert: add norm_embd

* clip: add debugging fn

* working correctly

* fix style

* use bicubic

* fix mrope metal

* improve cpu

* convert to neox ordering on conversion

* revert backend changes

* force stop if using old weight

* support moe variant

* fix conversion

* fix convert (2)

* Update tools/mtmd/clip-graph.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* process mrope_section on TextModel base class

* resolve conflict merge

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Xuan-Son Nguyen 2025-12-16 11:25:26 +01:00 committed by GitHub
parent 9963b81f63
commit 3d86c6c2b5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
17 changed files with 412 additions and 79 deletions

View file

@ -217,7 +217,7 @@ struct mtmd_context {
void init_vision() {
GGML_ASSERT(ctx_v != nullptr);
use_mrope = clip_is_qwen2vl(ctx_v);
use_mrope = clip_is_mrope(ctx_v);
projector_type proj = clip_get_projector_type(ctx_v);
int minicpmv_version = clip_is_minicpmv(ctx_v);
@ -309,6 +309,10 @@ struct mtmd_context {
img_beg = "<|image_start|>";
img_end = "<|image_end|>";
} else if (proj == PROJECTOR_TYPE_GLM4V) {
img_beg = "<|begin_of_image|>";
img_end = "<|end_of_image|>";
}
}