Merge branch 'upstream' into concedo_experimental

# Conflicts:
#	README.md
#	examples/llama-bench/README.md
#	examples/llama-bench/llama-bench.cpp
#	examples/llava/CMakeLists.txt
#	ggml/src/ggml-rpc/ggml-rpc.cpp
#	ggml/src/ggml-sycl/common.hpp
#	ggml/src/ggml-sycl/element_wise.cpp
#	ggml/src/ggml-sycl/element_wise.hpp
#	ggml/src/ggml-sycl/ggml-sycl.cpp
#	tests/test-chat-template.cpp
This commit is contained in:
Concedo 2025-04-29 21:05:16 +08:00
commit b2ecfa0f55
26 changed files with 724 additions and 499 deletions

View file

@ -184,8 +184,8 @@ struct clip_hparams {
std::vector<int32_t> image_grid_pinpoints;
int32_t image_crop_resolution;
std::unordered_set<int32_t> vision_feature_layer;
int32_t attn_window_size;
int32_t n_wa_pattern;
int32_t attn_window_size = 0;
int32_t n_wa_pattern = 0;
};
struct clip_layer {
@ -345,7 +345,6 @@ struct clip_ctx {
float image_std[3];
bool use_gelu = false;
bool use_silu = false;
int32_t ftype = 1;
gguf_context_ptr ctx_gguf;
ggml_context_ptr ctx_data;
@ -801,7 +800,6 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_
const int image_size_width = imgs.entries[0]->nx;
const int image_size_height = imgs.entries[0]->ny;
const bool use_mrope = ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL;
const bool use_window_attn = hparams.n_wa_pattern > 0;
const int n_wa_pattern = hparams.n_wa_pattern;
@ -810,10 +808,11 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_
const int patches_w = image_size_width / patch_size;
const int patches_h = image_size_height / patch_size;
const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
const int num_position_ids = use_mrope ? num_positions * 4 : num_positions;
const int num_position_ids = num_positions * 4; // m-rope requires 4 dim per position
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
const int n_layer = hparams.n_layer;
const float eps = hparams.eps;
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
@ -895,7 +894,7 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_
}
// loop over layers
for (int il = 0; il < ctx->max_feature_layer; il++) {
for (int il = 0; il < n_layer; il++) {
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
// rmsnorm1
@ -1140,15 +1139,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
int pos_w = image_size_width/patch_size;
int pos_h = image_size_height/patch_size;
if (ctx->minicpmv_version == 2) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
}
else if (ctx->minicpmv_version == 3) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
}
else if (ctx->minicpmv_version == 4) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
}
int n_output_dim = clip_n_mmproj_embd(ctx);
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, pos_w * pos_h, 1);
ggml_set_name(pos_embed, "pos_embed");
ggml_set_input(pos_embed);
}
@ -1486,23 +1478,17 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
}
{ // attention
int hidden_size = 4096;
int hidden_size = clip_n_mmproj_embd(ctx);
const int d_head = 128;
int n_head = hidden_size/d_head;
int num_query = 96;
if (ctx->minicpmv_version == 2) {
hidden_size = 4096;
n_head = hidden_size/d_head;
num_query = 96;
}
else if (ctx->minicpmv_version == 3) {
hidden_size = 3584;
n_head = hidden_size/d_head;
num_query = 64;
}
else if (ctx->minicpmv_version == 4) {
hidden_size = 3584;
n_head = hidden_size/d_head;
num_query = 64;
}
@ -1613,7 +1599,7 @@ struct clip_model_loader {
clip_ctx & ctx_clip;
std::string fname;
size_t model_size; // in bytes
size_t model_size = 0; // in bytes
// TODO @ngxson : we should not pass clip_ctx here, it should be clip_vision_model
clip_model_loader(const char * fname, clip_ctx & ctx_clip) : ctx_clip(ctx_clip), fname(fname) {
@ -1810,6 +1796,10 @@ struct clip_model_loader {
LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector);
LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version);
LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
LOG_INF("%s: use_silu: %d\n", __func__, ctx_clip.use_silu);
LOG_INF("%s: use_gelu: %d\n", __func__, ctx_clip.use_gelu);
LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
}
@ -2972,15 +2962,18 @@ void clip_free(clip_ctx * ctx) {
delete ctx;
}
// deprecated
size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
const int32_t nx = ctx->vision_model.hparams.image_size;
const int32_t ny = ctx->vision_model.hparams.image_size;
return clip_embd_nbytes_by_img(ctx, nx, ny);
}
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
clip_image_f32 img;
img.nx = img_w;
img.ny = img_h;
return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
}
int32_t clip_get_image_size(const struct clip_ctx * ctx) {
@ -3010,14 +3003,37 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
return ctx->vision_model.hparams.image_grid_pinpoints.size();
}
// deprecated
int clip_n_patches(const struct clip_ctx * ctx) {
clip_image_f32 img;
img.nx = ctx->vision_model.hparams.image_size;
img.ny = ctx->vision_model.hparams.image_size;
return clip_n_patches_by_img(ctx, &img);
return clip_n_output_tokens(ctx, &img);
}
// Deprecated: use clip_n_output_tokens() instead.
// This entry point only forwards its arguments to clip_n_output_tokens()
// and hands back whatever that call returns.
int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
    const int n_tokens = clip_n_output_tokens(ctx, img);
    return n_tokens;
}
// Number of output token positions along the X axis for `img`.
//
// - Qwen2-VL / Qwen2.5-VL projectors: derived from the image width (img->nx)
//   and the doubled patch size.
// - Every other projector: the total output token count reported by
//   clip_n_output_tokens().
int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
    const auto & hp = ctx->vision_model.hparams;

    // the total is queried before the projector check, whether or not it ends up being returned
    const int total_tokens = clip_n_output_tokens(ctx, img);

    const bool is_qwen_vl =
        ctx->proj_type == PROJECTOR_TYPE_QWEN2VL ||
        ctx->proj_type == PROJECTOR_TYPE_QWEN25VL;
    if (!is_qwen_vl) {
        return total_tokens;
    }

    // NOTE(review): the quotient divides by patch_size * 2, while the remainder
    // test uses patch_size alone — confirm that this mixed rounding is intended.
    const auto merged_patch = hp.patch_size * 2;
    const int  extra        = (img->nx % hp.patch_size > 0) ? 1 : 0;
    return img->nx / merged_patch + extra;
}
// Number of output token positions along the Y axis for `img`.
//
// - Qwen2-VL / Qwen2.5-VL projectors: derived from the image height (img->ny)
//   and the doubled patch size.
// - Every other projector: always 1.
int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
    const auto & hp = ctx->vision_model.hparams;

    const bool is_qwen_vl =
        ctx->proj_type == PROJECTOR_TYPE_QWEN2VL ||
        ctx->proj_type == PROJECTOR_TYPE_QWEN25VL;
    if (!is_qwen_vl) {
        return 1;
    }

    // NOTE(review): the quotient divides by patch_size * 2, while the remainder
    // test uses patch_size alone — confirm that this mixed rounding is intended.
    const auto merged_patch = hp.patch_size * 2;
    const int  extra        = (img->ny % hp.patch_size > 0) ? 1 : 0;
    return img->ny / merged_patch + extra;
}
int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
const auto & params = ctx->vision_model.hparams;
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
@ -3179,15 +3195,43 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
const int pos_w = ctx->load_image_size.width / patch_size;
const int pos_w = ctx->load_image_size.width / patch_size;
const int pos_h = ctx->load_image_size.height / patch_size;
const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
auto get_inp_tensor = [&gf](const char * name) {
struct ggml_tensor * inp = ggml_graph_get_tensor(gf, name);
if (inp == nullptr) {
GGML_ABORT("Failed to get tensor %s", name);
}
if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) {
GGML_ABORT("Tensor %s is not an input tensor", name);
}
return inp;
};
auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector<float> & values) {
ggml_tensor * cur = get_inp_tensor(name);
GGML_ASSERT(cur->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
};
auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector<int32_t> & values) {
ggml_tensor * cur = get_inp_tensor(name);
GGML_ASSERT(cur->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size());
ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur));
};
// set input pixel values
{
struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
std::vector<float> inp_data(ggml_nelements(inp_raw));
float * data = inp_data.data();
size_t nelem = 0;
for (const auto & img : imgs.entries) {
nelem += img->nx * img->ny * 3;
}
std::vector<float> inp_raw(nelem);
// layout of data (note: the channel dim is unrolled to better visualize the layout):
//
@ -3206,7 +3250,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const int n = nx * ny;
for (int b = 0; b < batch_size; b++) {
float * batch_entry = data + b * (3*n);
float * batch_entry = inp_raw.data() + b * (3*n);
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
size_t base_src = 3*(y * nx + x); // idx of the first channel
@ -3218,266 +3262,207 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
}
}
ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
set_input_f32("inp_raw", inp_raw);
}
if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) {
{
// inspired from siglip:
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
std::vector<int> pos_data(ggml_nelements(positions));
int * data = pos_data.data();
int bucket_coords_h[1024];
int bucket_coords_w[1024];
for (int i = 0; i < pos_h; i++){
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
}
for (int i = 0; i < pos_w; i++){
bucket_coords_w[i] = std::floor(70.0*i/pos_w);
}
for (int i = 0, id = 0; i < pos_h; i++){
for (int j = 0; j < pos_w; j++){
data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
// set input per projector
switch (ctx->proj_type) {
case PROJECTOR_TYPE_MINICPMV:
{
// inspired from siglip:
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
std::vector<int32_t> positions(pos_h * pos_w);
int bucket_coords_h[1024];
int bucket_coords_w[1024];
for (int i = 0; i < pos_h; i++){
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
}
}
ggml_backend_tensor_set(positions, data, 0, ggml_nbytes(positions));
}
{
// inspired from resampler of Qwen-VL:
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
int embed_dim = 4096;
if (ctx->minicpmv_version == 2) {
embed_dim = 4096;
}
else if (ctx->minicpmv_version == 3) {
embed_dim = 3584;
}
else if (ctx->minicpmv_version == 4) {
embed_dim = 3584;
}
else {
GGML_ABORT("Unknown minicpmv version");
}
// TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
std::vector<float> pos_data(ggml_nelements(pos_embed));
float * data = pos_data.data();
for(int i = 0; i < pos_w * pos_h; ++i){
for(int j = 0; j < embed_dim; ++j){
data[i * embed_dim + j] = pos_embed_t[i][j];
for (int i = 0; i < pos_w; i++){
bucket_coords_w[i] = std::floor(70.0*i/pos_w);
}
}
for (int i = 0, id = 0; i < pos_h; i++){
for (int j = 0; j < pos_w; j++){
positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
}
}
set_input_i32("positions", positions);
ggml_backend_tensor_set(pos_embed, data, 0, ggml_nbytes(pos_embed));
}
}
else {
// non-minicpmv models
// inspired from resampler of Qwen-VL:
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
int embed_dim = clip_n_mmproj_embd(ctx);
if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
// pw * ph = number of tokens output by ViT after apply patch merger
// ipw * ipw = number of vision token been processed inside ViT
const int merge_ratio = 2;
const int pw = image_size_width / patch_size / merge_ratio;
const int ph = image_size_height / patch_size / merge_ratio;
const int ipw = image_size_width / patch_size;
const int iph = image_size_height / patch_size;
// TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos?
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));
std::vector<int> idx (ph * pw);
std::vector<int> inv_idx(ph * pw);
std::vector<float> pos_embed(embed_dim * pos_w * pos_h);
for(int i = 0; i < pos_w * pos_h; ++i){
for(int j = 0; j < embed_dim; ++j){
pos_embed[i * embed_dim + j] = pos_embed_t[i][j];
}
}
if (use_window_attn) {
const int attn_window_size = 112;
struct ggml_tensor * window_idx = ggml_graph_get_tensor(gf, "window_idx");
struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor(gf, "inv_window_idx");
struct ggml_tensor * window_mask = ggml_graph_get_tensor(gf, "window_mask");
const int grid_window = attn_window_size / patch_size / merge_ratio;
int dst = 0;
// [num_vision_tokens, num_vision_tokens] attention mask tensor
std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest());
int mask_row = 0;
for (int y = 0; y < ph; y += grid_window)
{
for (int x = 0; x < pw; x += grid_window)
{
const int win_h = std::min(grid_window, ph - y);
const int win_w = std::min(grid_window, pw - x);
const int dst_0 = dst;
// group all tokens belong to the same window togather (to a continue range)
for (int dy = 0; dy < win_h; dy++) {
for (int dx = 0; dx < win_w; dx++) {
const int src = (y + dy) * pw + (x + dx);
assert(src < (int)idx.size());
assert(dst < (int)inv_idx.size());
idx [src] = dst;
inv_idx[dst] = src;
dst++;
set_input_f32("pos_embed", pos_embed);
} break;
case PROJECTOR_TYPE_QWEN2VL:
{
const int merge_ratio = 2;
const int pw = image_size_width / patch_size;
const int ph = image_size_height / patch_size;
std::vector<int> positions(num_positions * 4);
int ptr = 0;
for (int y = 0; y < ph; y += merge_ratio) {
for (int x = 0; x < pw; x += merge_ratio) {
for (int dy = 0; dy < 2; dy++) {
for (int dx = 0; dx < 2; dx++) {
positions[ ptr] = y + dy;
positions[ num_patches + ptr] = x + dx;
positions[2 * num_patches + ptr] = y + dy;
positions[3 * num_patches + ptr] = x + dx;
ptr++;
}
}
for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
int row_offset = mask_row * (ipw * iph);
std::fill(
mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
mask.begin() + row_offset + (dst * merge_ratio * merge_ratio),
0.0);
mask_row++;
}
}
}
ggml_backend_tensor_set(window_idx, idx.data(), 0, ggml_nbytes(window_idx));
ggml_backend_tensor_set(inv_window_idx, inv_idx.data(), 0, ggml_nbytes(inv_window_idx));
ggml_backend_tensor_set(window_mask, mask.data(), 0, ggml_nbytes(window_mask));
} else {
std::iota(idx.begin(), idx.end(), 0);
std::iota(inv_idx.begin(), inv_idx.end(), 0);
}
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
const int mpow = merge_ratio * merge_ratio;
std::vector<int> positions_data(ggml_nelements(positions));
int * data = positions_data.data();
int ptr = 0;
for (int y = 0; y < iph; y += merge_ratio)
set_input_i32("positions", positions);
} break;
case PROJECTOR_TYPE_QWEN25VL:
{
for (int x = 0; x < ipw; x += merge_ratio)
{
for (int dy = 0; dy < 2; dy++) {
for (int dx = 0; dx < 2; dx++) {
auto remap = idx[ptr / mpow];
remap = remap * mpow + (ptr % mpow);
// pw * ph = number of tokens output by the ViT after applying the patch merger
// ipw * iph = number of vision tokens processed inside the ViT
const int merge_ratio = 2;
const int pw = image_size_width / patch_size / merge_ratio;
const int ph = image_size_height / patch_size / merge_ratio;
const int ipw = image_size_width / patch_size;
const int iph = image_size_height / patch_size;
data[ remap] = y + dy;
data[ num_patches + remap] = x + dx;
data[2 * num_patches + remap] = y + dy;
data[3 * num_patches + remap] = x + dx;
ptr++;
std::vector<int> idx (ph * pw);
std::vector<int> inv_idx(ph * pw);
if (use_window_attn) {
const int attn_window_size = 112;
const int grid_window = attn_window_size / patch_size / merge_ratio;
int dst = 0;
// [num_vision_tokens, num_vision_tokens] attention mask tensor
std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest());
int mask_row = 0;
for (int y = 0; y < ph; y += grid_window) {
for (int x = 0; x < pw; x += grid_window) {
const int win_h = std::min(grid_window, ph - y);
const int win_w = std::min(grid_window, pw - x);
const int dst_0 = dst;
// group all tokens that belong to the same window together (into a contiguous range)
for (int dy = 0; dy < win_h; dy++) {
for (int dx = 0; dx < win_w; dx++) {
const int src = (y + dy) * pw + (x + dx);
GGML_ASSERT(src < (int)idx.size());
GGML_ASSERT(dst < (int)inv_idx.size());
idx [src] = dst;
inv_idx[dst] = src;
dst++;
}
}
for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
int row_offset = mask_row * (ipw * iph);
std::fill(
mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
mask.begin() + row_offset + (dst * merge_ratio * merge_ratio),
0.0);
mask_row++;
}
}
}
set_input_i32("window_idx", idx);
set_input_i32("inv_window_idx", inv_idx);
set_input_f32("window_mask", mask);
} else {
for (int i = 0; i < ph * pw; i++) {
idx[i] = i;
}
}
const int mpow = merge_ratio * merge_ratio;
std::vector<int> positions(num_positions * 4);
int ptr = 0;
for (int y = 0; y < iph; y += merge_ratio) {
for (int x = 0; x < ipw; x += merge_ratio) {
for (int dy = 0; dy < 2; dy++) {
for (int dx = 0; dx < 2; dx++) {
auto remap = idx[ptr / mpow];
remap = (remap * mpow) + (ptr % mpow);
positions[ remap] = y + dy;
positions[ num_patches + remap] = x + dx;
positions[2 * num_patches + remap] = y + dy;
positions[3 * num_patches + remap] = x + dx;
ptr++;
}
}
}
}
}
ggml_backend_tensor_set(positions, data, 0, ggml_nbytes(positions));
}
else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
// do nothing
}
else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) {
// do nothing
}
else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
// set the 2D positions
int n_patches_per_col = image_size_width / patch_size;
std::vector<int> pos_data(num_positions);
struct ggml_tensor * pos;
// dimension H
pos = ggml_graph_get_tensor(gf, "pos_h");
for (int i = 0; i < num_positions; i++) {
pos_data[i] = i / n_patches_per_col;
}
ggml_backend_tensor_set(pos, pos_data.data(), 0, ggml_nbytes(pos));
// dimension W
pos = ggml_graph_get_tensor(gf, "pos_w");
for (int i = 0; i < num_positions; i++) {
pos_data[i] = i % n_patches_per_col;
}
ggml_backend_tensor_set(pos, pos_data.data(), 0, ggml_nbytes(pos));
}
else {
set_input_i32("positions", positions);
} break;
case PROJECTOR_TYPE_PIXTRAL:
{
// set the 2D positions
int n_patches_per_col = image_size_width / patch_size;
std::vector<int> pos_data(num_positions);
// dimension H
for (int i = 0; i < num_positions; i++) {
pos_data[i] = i / n_patches_per_col;
}
set_input_i32("pos_h", pos_data);
// dimension W
for (int i = 0; i < num_positions; i++) {
pos_data[i] = i % n_patches_per_col;
}
set_input_i32("pos_w", pos_data);
} break;
case PROJECTOR_TYPE_GLM_EDGE:
{
// llava and other models
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
int* positions_data = (int*)malloc(ggml_nbytes(positions));
std::vector<int32_t> positions(num_positions);
for (int i = 0; i < num_positions; i++) {
positions_data[i] = i;
positions[i] = i;
}
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
set_input_i32("positions", positions);
} break;
case PROJECTOR_TYPE_MLP:
case PROJECTOR_TYPE_MLP_NORM:
case PROJECTOR_TYPE_LDP:
case PROJECTOR_TYPE_LDPV2:
{
// llava and other models
std::vector<int32_t> positions(num_positions);
for (int i = 0; i < num_positions; i++) {
positions[i] = i;
}
set_input_i32("positions", positions);
if (ctx->proj_type != PROJECTOR_TYPE_GLM_EDGE) {
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
// The patches vector is used to get rows to index into the embeds with;
// we should skip dim 0 only if we have CLS to avoid going out of bounds
// when retrieving the rows.
int patch_offset = model.class_embedding ? 1 : 0;
int* patches_data = (int*)malloc(ggml_nbytes(patches));
std::vector<int32_t> patches(num_patches);
for (int i = 0; i < num_patches; i++) {
patches_data[i] = i + patch_offset;
patches[i] = i + patch_offset;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);
}
}
}
if (use_window_attn && (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL)) {
struct ggml_tensor * window_idx = ggml_graph_get_tensor(gf, "window_idx");
struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor(gf, "inv_window_idx");
struct ggml_tensor * window_mask = ggml_graph_get_tensor(gf, "window_mask");
const int merge_ratio = 2;
const int attn_window_size = 112;
const int pw = image_size_width / patch_size / merge_ratio;
const int ph = image_size_height / patch_size / merge_ratio;
const int grid_window = attn_window_size / patch_size / merge_ratio;
const int ipw = image_size_width / patch_size;
const int iph = image_size_height / patch_size;
/*
pw * ph = number of tokens output by ViT after apply patch merger
ipw * ipw = number of vision token been processed inside ViT
*/
std::vector<int> idx(ph * pw);
std::vector<int> inv_idx(ph * pw);
int dst = 0;
// [num_vision_tokens, num_vision_tokens] attention mask tensor
std::vector<float> mask(pow(ipw * iph, 2), std::numeric_limits<float>::lowest());
int mask_row = 0;
for (int y = 0; y < ph; y+=grid_window)
{
for (int x = 0; x < pw; x+=grid_window)
set_input_i32("patches", patches);
} break;
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_IDEFICS3:
{
const int win_h = std::min(grid_window, ph - y);
const int win_w = std::min(grid_window, pw - x);
const int dst_0 = dst;
// group all tokens belong to the same window togather (to a continue range)
for (int dy = 0; dy < win_h; dy++) {
for (int dx = 0; dx < win_w; dx++) {
const int src = (y + dy) * pw + (x + dx);
assert(src < (int)idx.size());
assert(dst < (int)inv_idx.size());
idx[src] = dst;
inv_idx[dst] = src;
dst++;
}
}
for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) {
int row_offset = mask_row * (ipw * iph);
std::fill(
mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio),
mask.begin() + row_offset + (dst * merge_ratio * merge_ratio),
0.0);
mask_row++;
}
}
}
ggml_backend_tensor_set(window_idx, idx.data(), 0, ggml_nbytes(window_idx));
ggml_backend_tensor_set(inv_window_idx, inv_idx.data(), 0, ggml_nbytes(inv_window_idx));
ggml_backend_tensor_set(window_mask, mask.data(), 0, ggml_nbytes(window_mask));
// do nothing
} break;
default:
GGML_ABORT("Unknown projector type");
}
if (ggml_backend_is_cpu(ctx->backend)) {
@ -3695,7 +3680,7 @@ bool clip_is_glm(const struct clip_ctx * ctx) {
}
bool clip_is_qwen2vl(const struct clip_ctx * ctx) {
return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL;
return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL;
}
bool clip_is_llava(const struct clip_ctx * ctx) {

View file

@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par
CLIP_API void clip_free(struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
@ -59,9 +59,20 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx);
GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx),
"use clip_n_output_tokens instead");
GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img),
"use clip_n_output_tokens instead");
CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
// for M-RoPE, this will be the number of token positions in X and Y directions
// for other models, X will be the total number of tokens and Y will be 1
CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
// this should be equal to the embedding dimension of the text model
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);

View file

@ -112,7 +112,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
}
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
struct {
struct ggml_context * ctx;
} model;
@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
model.ctx = ggml_init(params);
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
// fill it with the image embeddings, ignoring the base
for (size_t i = 1; i < num_images; i++) {
@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
// append without newline tokens (default behavior in llava_arch when not using unpad ):
memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
*n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
*n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input));
// Debug: Test single segments
// Current findings: sending base image, sending a segment embedding all works similar to python
@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
image_embd_v[i],
clip_embd_nbytes_by_img(ctx_clip, nx, ny));
n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
}
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {
@ -352,8 +352,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
}
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding
*n_img_pos = clip_n_patches(ctx_clip);
clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
*n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
if (!encoded) {
LOG_ERR("Unable to encode image\n");
@ -391,7 +391,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
int n_img_pos_out;
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {

View file

@ -136,39 +136,6 @@ struct mtmd_cli_context {
}
};
struct decode_embd_batch {
std::vector<llama_pos> pos;
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id> seq_id_0;
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
logits .resize(n_tokens);
seq_id_0.resize(1);
seq_id_0[0] = seq_id;
seq_ids [n_tokens] = nullptr;
batch = {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = pos_0 + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
};
static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) {
llama_tokens generated_tokens;
for (int i = 0; i < n_predict; i++) {
@ -243,7 +210,7 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vect
return 1;
}
ctx.n_past += mtmd_helper_get_n_tokens(chunks);
ctx.n_past += mtmd_helper_get_n_pos(chunks);
return 0;
}
@ -371,6 +338,7 @@ int main(int argc, char ** argv) {
}
}
if (g_is_interrupted) LOG("\nInterrupted by user\n");
LOG("\n\n");
llama_perf_context_print(ctx.lctx);
return g_is_interrupted ? 130 : 0;
}

View file

@ -40,11 +40,14 @@ struct mtmd_context {
llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice
llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row
bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
// TODO @ngxson : add timings
mtmd_context(const char * mmproj_fname,
const llama_model * text_model,
const mtmd_context_params & ctx_params) :
text_model (text_model),
print_timings(ctx_params.print_timings),
n_threads (ctx_params.n_threads),
image_marker (ctx_params.image_marker)
@ -56,9 +59,8 @@ struct mtmd_context {
if (!ctx_clip) {
throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
}
this->text_model = text_model;
GGML_ASSERT(!clip_is_qwen2vl(ctx_clip) && "Qwen2VL model is not supported yet, use llama-qwen2vl-cli instead");
use_mrope = clip_is_qwen2vl(ctx_clip);
int minicpmv_version = clip_is_minicpmv(ctx_clip);
if (minicpmv_version == 2) {
@ -126,6 +128,7 @@ struct mtmd_image_tokens_data {
struct mtmd_image_tokens {
uint32_t nx; // number of tokens in x direction
uint32_t ny; // number of tokens in y direction
bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
uint32_t n_tokens() const { return nx * ny; }
clip_image_f32_batch batch_f32; // preprocessed image patches
std::string id; // optional user-defined ID, useful for KV cache tracking
@ -202,10 +205,14 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
}
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
// for glm-edge, we don't need to add because the tokens are already in the returned embeddings
else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>";
string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
// TODO @ngxson : glm-edge : remove BOI / EOI tokens embeddings, decode them as normal tokens
}
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
output.clear();
@ -229,7 +236,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
for (auto & entry : batch_f32.entries) {
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
image_tokens->nx = clip_n_patches_by_img(ctx->ctx_clip, entry.get());
image_tokens->nx = clip_n_output_tokens(ctx->ctx_clip, entry.get());
image_tokens->ny = 1;
image_tokens->batch_f32.entries.push_back(std::move(entry));
image_tokens->id = id;
@ -246,7 +253,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
};
for (const auto & part : parts) {
//printf("tokenizing part: %s\n", part.c_str());
// printf("tokenizing part: %s\n", part.c_str());
bool add_bos = &parts.front() == &part;
auto tokens = mtmd_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
if (tokens.empty()) {
@ -325,12 +332,20 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
} else {
size_t n_tokens = 0;
for (const auto & entry : batch_f32.entries) {
n_tokens += clip_n_patches_by_img(ctx->ctx_clip, entry.get());
n_tokens += clip_n_output_tokens(ctx->ctx_clip, entry.get());
}
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
image_tokens->nx = n_tokens;
image_tokens->ny = 1; // TODO
if (ctx->use_mrope) {
// for Qwen2VL, we need this information for M-RoPE decoding positions
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get());
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get());
image_tokens->use_mrope_pos = true;
} else {
// other models, we only need the total number of tokens
image_tokens->nx = n_tokens;
image_tokens->ny = 1;
}
image_tokens->batch_f32 = std::move(batch_f32);
image_tokens->id = bitmaps[i_img].id; // optional
@ -338,11 +353,6 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
if (clip_is_glm(ctx->ctx_clip)) {
// glm-edge
image_tokens->nx += 2; // add 2 for the begin_of_image and end_of_image token embeddings
}
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_IMAGE,
{},
@ -380,6 +390,13 @@ std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
return image_tokens->id;
}
// Number of positions an image chunk advances: 1 when the chunk uses M-RoPE position
// counting (the whole image is one temporal position), otherwise its token count.
llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
    if (!image_tokens->use_mrope_pos) {
        // regular counting: every image token takes one position
        return image_tokens->n_tokens();
    }
    return 1;
}
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
@ -397,7 +414,7 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
const auto & entries = image_tokens->batch_f32.entries;
for (size_t i = 0; i < entries.size(); i++) {
int n_tokens_per_image = clip_n_patches_by_img(ctx->ctx_clip, entries[i].get());
int n_tokens_per_image = clip_n_output_tokens(ctx->ctx_clip, entries[i].get());
ok = clip_image_encode(
ctx->ctx_clip,
ctx->n_threads,
@ -425,7 +442,7 @@ size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
n_tokens += chunk.tokens_text.size();
} else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
n_tokens += chunk.tokens_image->n_tokens();
n_tokens += mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get());
} else {
GGML_ASSERT(false && "chunk type not supported");
}
@ -433,22 +450,38 @@ size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) {
return n_tokens;
}
// Sums the positions taken by every chunk: text chunks contribute one position per token,
// image chunks contribute whatever mtmd_image_tokens_get_n_pos() reports for them.
// Any other chunk type aborts via GGML_ASSERT.
llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks) {
    llama_pos total = 0;
    for (const auto & cur : chunks) {
        switch (cur.type) {
            case MTMD_INPUT_CHUNK_TYPE_TEXT:
                total += cur.tokens_text.size();
                break;
            case MTMD_INPUT_CHUNK_TYPE_IMAGE:
                total += mtmd_image_tokens_get_n_pos(cur.tokens_image.get());
                break;
            default:
                GGML_ASSERT(false && "chunk type not supported");
        }
    }
    return total;
}
// helper struct to make working with embd batch easier
// note: this will be removed after llama_batch_ext refactoring
struct decode_embd_batch {
int n_pos_per_embd;
int n_mmproj_embd;
std::vector<llama_pos> pos;
std::vector<llama_pos> pos_view; // used by mrope
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id> seq_id_0;
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
pos .resize(n_tokens * n_pos_per_embd);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
logits .resize(n_tokens);
seq_id_0.resize(1);
seq_id_0[0] = seq_id;
seq_ids [n_tokens] = nullptr;
batch = {
/*n_tokens =*/ n_tokens,
@ -459,13 +492,64 @@ struct decode_embd_batch {
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
for (int i = 0; i < n_tokens; i++) {
}
// Assigns consecutive positions pos_0, pos_0 + 1, ... to every entry of the batch
// and puts all of them in the single sequence `seq_id`, with logits disabled.
void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) {
    seq_id_0[0] = seq_id;
    llama_seq_id * const shared_seq = seq_id_0.data();
    const auto n_entries = batch.n_tokens;
    for (int idx = 0; idx < n_entries; ++idx) {
        batch.logits  [idx] = false;
        batch.seq_id  [idx] = shared_seq;
        batch.n_seq_id[idx] = 1;
        batch.pos     [idx] = pos_0 + idx;
    }
}
// Fills the 4-dimensional M-RoPE positions for an nx-by-ny grid of embeddings.
// `pos` is laid out as four consecutive planes of batch.n_tokens entries each:
//   plane 0: pos_0 for every entry
//   plane 1: pos_0 + row index (y)
//   plane 2: pos_0 + column index (x)
//   plane 3: 0 (unused)
// Every entry is then put in the single sequence `seq_id`, with logits disabled.
void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
    GGML_ASSERT(n_pos_per_embd == 4);
    seq_id_0[0] = seq_id;

    const auto plane = batch.n_tokens; // distance between two position planes
    for (int row = 0; row < ny; ++row) {
        const int row_start = row * nx;
        for (int col = 0; col < nx; ++col) {
            const int cell = row_start + col;
            pos[cell            ] = pos_0;
            pos[cell + plane    ] = pos_0 + row;
            pos[cell + plane * 2] = pos_0 + col;
            pos[cell + plane * 3] = 0; // last pos dim is unused
        }
    }

    llama_seq_id * const shared_seq = seq_id_0.data();
    for (int idx = 0; idx < plane; ++idx) {
        batch.logits  [idx] = false;
        batch.seq_id  [idx] = shared_seq;
        batch.n_seq_id[idx] = 1;
    }
}
// Returns a llama_batch that views `n_tokens` consecutive entries of this batch, starting
// at entry `offset`.
//
// The embd / n_seq_id / seq_id / logits pointers of the result are this batch's pointers
// advanced by `offset`. When n_pos_per_embd > 1 the positions are copied into pos_view,
// which is rebuilt on every call, so a view is only usable until the next get_view() call.
// The caller must keep offset + n_tokens within batch.n_tokens.
llama_batch get_view(int offset, int n_tokens) {
    llama_pos * pos_ptr;
    pos_view.clear();
    // reserve, not resize: the loop below appends with insert(end()). Resizing first would
    // leave n_tokens * n_pos_per_embd zeros at the front of the buffer, and pos_view.data()
    // would hand those zeros to the decoder instead of the real positions.
    pos_view.reserve(n_tokens * n_pos_per_embd);
    if (n_pos_per_embd > 1) {
        // mrope
        // for example, with layout of src: 1234...1234...1234...1234...
        //       offset 2 will give us dst: 34...34...34...34...
        for (int i = 0; i < n_pos_per_embd; i++) {
            // each position dimension is a plane of batch.n_tokens entries in `pos`;
            // copy the [offset, offset + n_tokens) slice of every plane back to back
            auto src = pos.begin() + i * batch.n_tokens + offset;
            pos_view.insert(pos_view.end(), src, src + n_tokens);
        }
        pos_ptr = pos_view.data();
    } else {
        // normal
        pos_ptr = pos.data() + offset;
    }
    return {
        /*n_tokens       =*/ n_tokens,
        /*tokens         =*/ nullptr,
        /*embd           =*/ batch.embd     + offset * n_mmproj_embd,
        /*pos            =*/ pos_ptr,
        /*n_seq_id       =*/ batch.n_seq_id + offset,
        /*seq_id         =*/ batch.seq_id   + offset,
        /*logits         =*/ batch.logits   + offset,
    };
}
};
int32_t mtmd_helper_eval(mtmd_context * ctx,
@ -478,6 +562,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
llama_pos n_past = pos0;
llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 4 : 1;
for (auto & chunk : chunks) {
bool is_last = &chunk == &chunks.back();
@ -525,6 +610,16 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
int32_t i_batch = 0;
int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch;
float * embd = mtmd_get_output_embd(ctx);
decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get());
const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get());
if (mtmd_decode_use_mrope(ctx)) {
batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
} else {
batch_embd.set_position_normal(n_past, seq_id);
}
if (mtmd_decode_use_non_causal(ctx)) {
llama_set_causal_attn(lctx, false);
@ -532,15 +627,14 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
}
while (i_batch < n_img_batches) { // split into batches
int32_t pos_offset = i_batch*n_batch;
int32_t n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
float * embd_batch = embd + pos_offset*n_mmproj_embd;
decode_embd_batch batch_img(embd_batch, n_tokens_batch, n_past, 0);
int pos_offset = i_batch*n_batch;
int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset);
llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch);
printf("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch);
int64_t t1 = ggml_time_ms();
ret = llama_decode(lctx, batch_img.batch);
ret = llama_decode(lctx, batch_embd_view);
if (ret != 0) {
LOG_ERR("failed to decode image\n");
llama_set_causal_attn(lctx, true); // restore causal attn
@ -553,9 +647,11 @@ int32_t mtmd_helper_eval(mtmd_context * ctx,
}
i_batch++;
n_past += n_tokens_batch;
}
// for mrope, one image is one single **temporal** position
n_past += mtmd_decode_use_mrope(ctx) ? 1 : n_tokens;
if (mtmd_decode_use_non_causal(ctx)) {
llama_set_causal_attn(lctx, true);
}
@ -603,6 +699,10 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
return false;
}
// Reports the context's use_mrope flag, i.e. whether M-RoPE positions are needed for decoding.
bool mtmd_decode_use_mrope(mtmd_context * ctx) {
    const bool needs_mrope = ctx->use_mrope;
    return needs_mrope;
}
// Deleter functor: releases an mtmd_image_tokens object by forwarding to mtmd_image_tokens_free().
void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) {
mtmd_image_tokens_free(val);
}

View file

@ -102,6 +102,7 @@ MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * im
MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens);
MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens);
MTMD_API std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens);
MTMD_API llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens); // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise)
MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens);
// returns 0 on success
@ -114,15 +115,21 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
// whether we need to set non-causal mask before llama_decode
MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
// whether the current model uses M-RoPE for llama_decode
MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
//
// helper functions (can be implemented based on other functions)
//
// helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks);
// helper to count the total number of positions from a list of chunks, useful to keep track of n_past
MTMD_API llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks);
// helper function that automatically:
// 1. run llama_decode() on text chunks
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()

View file

@ -27,6 +27,8 @@
#include <cassert>
#include <cmath>
// THIS FILE IS ONLY USED FOR TESTING THE QWEN2VL MODEL
// IT IS NOT PRODUCTION CODE
static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed,
int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) {
@ -92,20 +94,12 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct lla
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past, int * st_pos_id) {
int N = (int) tokens.size();
std::vector<llama_pos> pos;
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
auto batch = llama_batch_get_one(&tokens[i], n_eval);
// TODO: add mrope pos ids somewhere else
pos.resize(batch.n_tokens * 4);
std::fill(pos.begin(), pos.end(), 0);
for (int j = 0; j < batch.n_tokens * 3; j ++) {
pos[j] = *st_pos_id + (j % batch.n_tokens);
}
batch.pos = pos.data();
if (llama_decode(ctx_llama, batch)) {
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);

View file

@ -54,8 +54,8 @@ add_test "llama-mtmd-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
add_test "llama-mtmd-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
add_test "llama-qwen2vl-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
add_test "llama-qwen2vl-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
add_test "llama-mtmd-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
# to test the big models, run: ./tests.sh big
add_test_big "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M"