From e971eaefe38264fd4010007f02eb7c94f134b628 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Mon, 1 Dec 2025 18:26:31 -0300 Subject: [PATCH] fix: qwenvl.hpp was renamed to llm.hpp --- otherarch/sdcpp/qwenvl.hpp | 1381 ------------------------------------ 1 file changed, 1381 deletions(-) delete mode 100644 otherarch/sdcpp/qwenvl.hpp diff --git a/otherarch/sdcpp/qwenvl.hpp b/otherarch/sdcpp/qwenvl.hpp deleted file mode 100644 index cd00997b9..000000000 --- a/otherarch/sdcpp/qwenvl.hpp +++ /dev/null @@ -1,1381 +0,0 @@ -#ifndef __QWENVL_HPP__ -#define __QWENVL_HPP__ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "clip.hpp" -#include "ggml_extend.hpp" -#include -#include "rope.hpp" -#include "tokenize_util.h" - -namespace Qwen { - constexpr int QWENVL_GRAPH_SIZE = 10240; - - class Qwen2Tokenizer { - private: - std::map byte_encoder; - std::map byte_decoder; - std::map encoder; - std::map decoder; - std::map, int> bpe_ranks; - std::regex pat; - int encoder_len; - int bpe_len; - - public: - const std::string UNK_TOKEN = "<|endoftext|>"; - const std::string EOS_TOKEN = "<|endoftext|>"; - const std::string PAD_TOKEN = "<|endoftext|>"; - - const int UNK_TOKEN_ID = 151643; - const int EOS_TOKEN_ID = 151643; - const int PAD_TOKEN_ID = 151643; - - std::vector special_tokens = { - "<|endoftext|>", - "<|im_start|>", - "<|im_end|>", - "<|object_ref_start|>", - "<|object_ref_end|>", - "<|box_start|>", - "<|box_end|>", - "<|quad_start|>", - "<|quad_end|>", - "<|vision_start|>", - "<|vision_end|>", - "<|vision_pad|>", - "<|image_pad|>", - "<|video_pad|>", - "", - "", - "<|fim_prefix|>", - "<|fim_middle|>", - "<|fim_suffix|>", - "<|fim_pad|>", - "<|repo_name|>", - "<|file_sep|>", - }; - - private: - static std::string strip(const std::string& str) { - std::string::size_type start = str.find_first_not_of(" \t\n\r\v\f"); - std::string::size_type end = str.find_last_not_of(" \t\n\r\v\f"); - - if (start == std::string::npos) { - // String contains only whitespace characters - return ""; - } - - return str.substr(start, end - start + 1); - } - - static std::string whitespace_clean(std::string text) { - text = std::regex_replace(text, std::regex(R"(\s+)"), " "); - text = strip(text); - return text; - } - - static std::set> get_pairs(const std::vector& subwords) { - std::set> pairs; - if (subwords.size() == 0) { - return pairs; - } - std::u32string prev_subword = subwords[0]; - for (int i = 1; i < subwords.size(); i++) { - std::u32string subword = subwords[i]; - std::pair pair(prev_subword, subword); - pairs.insert(pair); - prev_subword = subword; - } - return pairs; - } - - bool is_special_token(const std::string& token) { - for (auto& special_token : special_tokens) { - if (special_token == token) { - return true; - } - } - return false; - } - - public: - explicit Qwen2Tokenizer(const std::string& merges_utf8_str = "") { - if (merges_utf8_str.size() > 0) { - load_from_merges(merges_utf8_str); - } else { - load_from_merges(ModelLoader::load_qwen2_merges()); - } - } - - void load_from_merges(const std::string& merges_utf8_str) { - auto byte_unicode_pairs = bytes_to_unicode(); - // printf("byte_unicode_pairs have %lu pairs \n", byte_unicode_pairs.size()); - byte_encoder = std::map(byte_unicode_pairs.begin(), byte_unicode_pairs.end()); - for (auto& pair : byte_unicode_pairs) { - byte_decoder[pair.second] = pair.first; - } - // for (auto & pair: byte_unicode_pairs) { - // std::cout << pair.first << ": " << pair.second << std::endl; - // } - std::vector merges; - size_t start = 0; - size_t pos; - std::u32string merges_utf32_str = utf8_to_utf32(merges_utf8_str); - while ((pos = merges_utf32_str.find('\n', start)) != std::string::npos) { - merges.push_back(merges_utf32_str.substr(start, pos - start)); - start = pos + 1; - } - LOG_DEBUG("merges size %llu", merges.size()); - merges = std::vector(merges.begin(), merges.end()); - std::vector> merge_pairs; - for (const auto& merge : merges) { - size_t space_pos = merge.find(' '); - merge_pairs.emplace_back(merge.substr(0, space_pos), merge.substr(space_pos + 1)); - // LOG_DEBUG("%s", utf32_to_utf8(merge.substr(space_pos + 1)).c_str()); - // printf("%s :: %s | %s \n", utf32_to_utf8(merge).c_str(), utf32_to_utf8(merge.substr(0, space_pos)).c_str(), - // utf32_to_utf8(merge.substr(space_pos + 1)).c_str()); - } - - std::vector vocab; - for (const auto& pair : byte_unicode_pairs) { - vocab.push_back(pair.second); - } - for (const auto& merge : merge_pairs) { - vocab.push_back(merge.first + merge.second); - } - for (auto& special_token : special_tokens) { - vocab.push_back(utf8_to_utf32(special_token)); - } - - LOG_DEBUG("vocab size: %llu", vocab.size()); - int i = 0; - for (const auto& token : vocab) { - encoder[token] = i; - decoder[i] = token; - i++; - } - encoder_len = i; - - int rank = 0; - for (const auto& merge : merge_pairs) { - bpe_ranks[merge] = rank++; - } - bpe_len = rank; - }; - - std::u32string bpe(const std::u32string& token) { - std::vector word; - - for (int i = 0; i < token.size(); i++) { - word.emplace_back(1, token[i]); - } - - std::set> pairs = get_pairs(word); - - if (pairs.empty()) { - return token; - } - - while (true) { - auto min_pair_iter = std::min_element(pairs.begin(), - pairs.end(), - [&](const std::pair& a, - const std::pair& b) { - if (bpe_ranks.find(a) == bpe_ranks.end()) { - return false; - } else if (bpe_ranks.find(b) == bpe_ranks.end()) { - return true; - } - return bpe_ranks.at(a) < bpe_ranks.at(b); - }); - - const std::pair& bigram = *min_pair_iter; - - if (bpe_ranks.find(bigram) == bpe_ranks.end()) { - break; - } - - std::u32string first = bigram.first; - std::u32string second = bigram.second; - std::vector new_word; - int32_t i = 0; - - while (i < word.size()) { - auto it = std::find(word.begin() + i, word.end(), first); - if (it == word.end()) { - new_word.insert(new_word.end(), word.begin() + i, word.end()); - break; - } - new_word.insert(new_word.end(), word.begin() + i, it); - i = static_cast(std::distance(word.begin(), it)); - - if (word[i] == first && i < static_cast(word.size()) - 1 && word[i + 1] == second) { - new_word.push_back(first + second); - i += 2; - } else { - new_word.push_back(word[i]); - i += 1; - } - } - - word = new_word; - - if (word.size() == 1) { - break; - } - pairs = get_pairs(word); - } - - std::u32string result; - for (int i = 0; i < word.size(); i++) { - result += word[i]; - if (i != word.size() - 1) { - result += utf8_to_utf32(" "); - } - } - - return result; - } - - std::vector tokenize(std::string text, - on_new_token_cb_t on_new_token_cb = nullptr, - size_t max_length = 0, - bool padding = false) { - std::vector tokens = encode(text, on_new_token_cb); - - if (max_length > 0) { - if (tokens.size() < max_length) { - tokens.resize(max_length); - } else { - if (padding) { - tokens.insert(tokens.end(), max_length - tokens.size(), PAD_TOKEN_ID); - } - } - } - - return tokens; - } - - void pad_tokens(std::vector& tokens, - std::vector& weights, - size_t max_length = 0, - bool padding = false) { - if (max_length > 0 && padding) { - size_t n = std::ceil(tokens.size() * 1.0 / max_length); - if (n == 0) { - n = 1; - } - size_t length = max_length * n; - LOG_DEBUG("token length: %llu", length); - tokens.insert(tokens.end(), length - tokens.size(), PAD_TOKEN_ID); - weights.insert(weights.end(), length - weights.size(), 1.0); - } - } - - std::vector encode(std::string text, on_new_token_cb_t on_new_token_cb = nullptr) { - std::string original_text = text; - std::vector bpe_tokens; - std::vector token_strs; - - auto splited_texts = split_with_special_tokens(text, special_tokens); - - for (auto& splited_text : splited_texts) { - if (is_special_token(splited_text)) { - bpe_tokens.push_back(encoder[utf8_to_utf32(splited_text)]); - token_strs.push_back(splited_text); - continue; - } - auto tokens = token_split(splited_text); - for (auto& token : tokens) { - if (on_new_token_cb != nullptr) { - bool skip = on_new_token_cb(token, bpe_tokens); - if (skip) { - continue; - } - } - - std::string token_str = token; - std::u32string utf32_token; - for (int i = 0; i < token_str.length(); i++) { - unsigned char b = token_str[i]; - utf32_token += byte_encoder[b]; - } - auto bpe_strs = bpe(utf32_token); - size_t start = 0; - size_t pos; - while ((pos = bpe_strs.find(' ', start)) != std::u32string::npos) { - auto bpe_str = bpe_strs.substr(start, pos - start); - bpe_tokens.push_back(encoder[bpe_str]); - token_strs.push_back(utf32_to_utf8(bpe_str)); - - start = pos + 1; - } - auto bpe_str = bpe_strs.substr(start, bpe_strs.size() - start); - bpe_tokens.push_back(encoder[bpe_str]); - token_strs.push_back(utf32_to_utf8(bpe_str)); - } - } - - std::stringstream ss; - ss << "["; - for (auto token : token_strs) { - ss << "\"" << token << "\", "; - } - ss << "]"; - // LOG_DEBUG("split prompt \"%s\" to tokens %s", original_text.c_str(), ss.str().c_str()); - // printf("split prompt \"%s\" to tokens %s \n", original_text.c_str(), ss.str().c_str()); - return bpe_tokens; - } - }; - - struct Qwen2_5_VLMLP : public GGMLBlock { - public: - Qwen2_5_VLMLP(int64_t hidden_size, int64_t intermediate_size, bool bias = false) { - blocks["gate_proj"] = std::shared_ptr(new Linear(hidden_size, intermediate_size, bias)); - blocks["up_proj"] = std::shared_ptr(new Linear(hidden_size, intermediate_size, bias)); - blocks["down_proj"] = std::shared_ptr(new Linear(intermediate_size, hidden_size, bias)); - } - - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { - // x: [N, n_token, hidden_size] - auto gate_proj = std::dynamic_pointer_cast(blocks["gate_proj"]); - auto up_proj = std::dynamic_pointer_cast(blocks["up_proj"]); - auto down_proj = std::dynamic_pointer_cast(blocks["down_proj"]); - - auto h = gate_proj->forward(ctx, x); - h = ggml_silu_inplace(ctx->ggml_ctx, h); - h = ggml_mul_inplace(ctx->ggml_ctx, h, up_proj->forward(ctx, x)); - h = down_proj->forward(ctx, h); - return h; - } - }; - - struct Qwen2_5_VisionPatchEmbed : public GGMLBlock { - protected: - bool llama_cpp_style; - int patch_size; - int temporal_patch_size; - int64_t in_channels; - int64_t embed_dim; - - public: - Qwen2_5_VisionPatchEmbed(bool llama_cpp_style, - int patch_size = 14, - int temporal_patch_size = 2, - int64_t in_channels = 3, - int64_t embed_dim = 1152) - : llama_cpp_style(llama_cpp_style), - patch_size(patch_size), - temporal_patch_size(temporal_patch_size), - in_channels(in_channels), - embed_dim(embed_dim) { - if (llama_cpp_style) { - blocks["proj.0"] = std::shared_ptr(new Conv2d(in_channels, - embed_dim, - {patch_size, patch_size}, - {patch_size, patch_size}, // stride - {0, 0}, // padding - {1, 1}, // dilation - false)); - blocks["proj.1"] = std::shared_ptr(new Conv2d(in_channels, - embed_dim, - {patch_size, patch_size}, - {patch_size, patch_size}, // stride - {0, 0}, // padding - {1, 1}, // dilation - false)); - } else { - std::tuple kernel_size = {(int)temporal_patch_size, (int)patch_size, (int)patch_size}; - blocks["proj"] = std::shared_ptr(new Conv3d(in_channels, - embed_dim, - kernel_size, - kernel_size, // stride - {0, 0, 0}, // padding - {1, 1, 1}, // dilation - false)); - } - } - - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { - // x: [N*grid_t*grid_h*grid_w, in_channels, temporal_patch_size*patch_size*patch_size] - // return: [N*grid_t*grid_h*grid_w, embed_dim] - x = ggml_reshape_4d(ctx->ggml_ctx, - x, - patch_size, - patch_size, - temporal_patch_size, - ggml_nelements(x) / (temporal_patch_size * patch_size * patch_size)); - - if (llama_cpp_style) { - auto proj_0 = std::dynamic_pointer_cast(blocks["proj.0"]); - auto proj_1 = std::dynamic_pointer_cast(blocks["proj.1"]); - - auto x0 = ggml_ext_slice(ctx->ggml_ctx, x, 2, 0, 1); - x0 = ggml_reshape_4d(ctx->ggml_ctx, x0, x0->ne[0], x0->ne[1], in_channels, x0->ne[3] / in_channels); - x0 = proj_0->forward(ctx, x0); - - auto x1 = ggml_ext_slice(ctx->ggml_ctx, x, 2, 1, 2); - x1 = ggml_reshape_4d(ctx->ggml_ctx, x1, x1->ne[0], x1->ne[1], in_channels, x1->ne[3] / in_channels); - x1 = proj_1->forward(ctx, x1); - - x = ggml_add(ctx->ggml_ctx, x0, x1); - } else { - auto proj = std::dynamic_pointer_cast(blocks["proj"]); - - x = proj->forward(ctx, x); - } - - x = ggml_reshape_2d(ctx->ggml_ctx, x, embed_dim, ggml_nelements(x) / embed_dim); - return x; - } - }; - - struct Qwen2_5_VLPatchMerger : public GGMLBlock { - protected: - int64_t hidden_size; - - public: - Qwen2_5_VLPatchMerger(int64_t dim, - int64_t context_dim, - int64_t spatial_merge_size) { - hidden_size = context_dim * spatial_merge_size * spatial_merge_size; - blocks["ln_q"] = std::shared_ptr(new RMSNorm(context_dim, 1e-6f)); - blocks["mlp.0"] = std::shared_ptr(new Linear(hidden_size, hidden_size)); - // mlp.1 is nn.GELU() - blocks["mlp.2"] = std::shared_ptr(new Linear(hidden_size, dim)); - } - - struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { - auto ln_q = std::dynamic_pointer_cast(blocks["ln_q"]); - auto mlp_0 = std::dynamic_pointer_cast(blocks["mlp.0"]); - auto mlp_2 = std::dynamic_pointer_cast(blocks["mlp.2"]); - - x = ln_q->forward(ctx, x); - x = ggml_reshape_2d(ctx->ggml_ctx, x, hidden_size, ggml_nelements(x) / hidden_size); - x = mlp_0->forward(ctx, x); - x = ggml_gelu(ctx->ggml_ctx, x); - x = mlp_2->forward(ctx, x); - return x; - } - }; - - struct Qwen2_5_VLVisionAttention : public GGMLBlock { - protected: - bool llama_cpp_style; - int64_t head_dim; - int64_t num_heads; - - public: - Qwen2_5_VLVisionAttention(bool llama_cpp_style, - int64_t hidden_size, - int64_t num_heads) - : llama_cpp_style(llama_cpp_style), num_heads(num_heads) { - head_dim = hidden_size / num_heads; - GGML_ASSERT(num_heads * head_dim == hidden_size); - if (llama_cpp_style) { - blocks["q_proj"] = std::shared_ptr(new Linear(hidden_size, hidden_size)); - blocks["k_proj"] = std::shared_ptr(new Linear(hidden_size, hidden_size)); - blocks["v_proj"] = std::shared_ptr(new Linear(hidden_size, hidden_size)); - } else { - blocks["qkv"] = std::shared_ptr(new Linear(hidden_size, hidden_size * 3)); - } - blocks["proj"] = std::shared_ptr(new Linear(hidden_size, hidden_size)); - } - - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* pe, - struct ggml_tensor* mask = nullptr) { - // x: [N, n_token, hidden_size] - int64_t n_token = x->ne[1]; - int64_t N = x->ne[2]; - auto proj = std::dynamic_pointer_cast(blocks["proj"]); - - std::vector qkv_vec; - if (llama_cpp_style) { - auto q_proj = std::dynamic_pointer_cast(blocks["q_proj"]); - auto k_proj = std::dynamic_pointer_cast(blocks["k_proj"]); - auto v_proj = std::dynamic_pointer_cast(blocks["v_proj"]); - - auto q = q_proj->forward(ctx, x); - auto k = k_proj->forward(ctx, x); - auto v = v_proj->forward(ctx, x); - - qkv_vec = {q, k, v}; - } else { - auto qkv_proj = std::dynamic_pointer_cast(blocks["qkv"]); - auto qkv = qkv_proj->forward(ctx, x); - qkv_vec = split_qkv(ctx->ggml_ctx, qkv); - } - - auto q = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]); // [N, n_token, n_head, d_head] - auto k = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]); // [N, n_token, n_head, d_head] - auto v = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[2], head_dim, num_heads, qkv_vec[2]->ne[1], qkv_vec[2]->ne[2]); // [N, n_token, n_head, d_head] - - x = Rope::attention(ctx, q, k, v, pe, mask, 1.f, false); // [N, n_token, hidden_size] - - x = proj->forward(ctx, x); // [N, n_token, hidden_size] - return x; - } - }; - - struct Qwen2_5_VLVisionBlock : public GGMLBlock { - public: - Qwen2_5_VLVisionBlock(bool llama_cpp_style, - int64_t hidden_size, - int64_t intermediate_size, - int64_t num_heads, - float eps = 1e-6f) { - blocks["attn"] = std::shared_ptr(new Qwen2_5_VLVisionAttention(llama_cpp_style, hidden_size, num_heads)); - blocks["mlp"] = std::shared_ptr(new Qwen2_5_VLMLP(hidden_size, intermediate_size, true)); - blocks["norm1"] = std::shared_ptr(new RMSNorm(hidden_size, eps)); - blocks["norm2"] = std::shared_ptr(new RMSNorm(hidden_size, eps)); - } - - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* pe, - struct ggml_tensor* mask = nullptr) { - // x: [N, n_token, hidden_size] - auto attn = std::dynamic_pointer_cast(blocks["attn"]); - auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); - auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]); - auto norm2 = std::dynamic_pointer_cast(blocks["norm2"]); - - auto residual = x; - x = norm1->forward(ctx, x); - x = attn->forward(ctx, x, pe, mask); - x = ggml_add_inplace(ctx->ggml_ctx, x, residual); - - residual = x; - x = norm2->forward(ctx, x); - x = mlp->forward(ctx, x); - x = ggml_add_inplace(ctx->ggml_ctx, x, residual); - - return x; - } - }; - - struct Qwen2_5_VLVisionModel : public GGMLBlock { - protected: - int64_t num_layers; - int64_t spatial_merge_size; - std::set fullatt_block_indexes; - - public: - Qwen2_5_VLVisionModel(bool llama_cpp_style, - int64_t num_layers, - int64_t in_channels, - int64_t hidden_size, - int64_t out_hidden_size, - int64_t intermediate_size, - int64_t num_heads, - int64_t spatial_merge_size, - int64_t patch_size, - int64_t temporal_patch_size, - int64_t window_size, - std::set fullatt_block_indexes = {7, 15, 23, 31}, - float eps = 1e-6f) - : num_layers(num_layers), fullatt_block_indexes(std::move(fullatt_block_indexes)), spatial_merge_size(spatial_merge_size) { - blocks["patch_embed"] = std::shared_ptr(new Qwen2_5_VisionPatchEmbed(llama_cpp_style, - patch_size, - temporal_patch_size, - in_channels, - hidden_size)); - for (int i = 0; i < num_layers; i++) { - blocks["blocks." + std::to_string(i)] = std::shared_ptr(new Qwen2_5_VLVisionBlock(llama_cpp_style, - hidden_size, - intermediate_size, - num_heads, - eps)); - } - blocks["merger"] = std::shared_ptr(new Qwen2_5_VLPatchMerger(out_hidden_size, hidden_size, spatial_merge_size)); - } - - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* pixel_values, - struct ggml_tensor* pe, - struct ggml_tensor* window_index, - struct ggml_tensor* window_inverse_index, - struct ggml_tensor* window_mask) { - // pixel_values: [grid_t*(H/mh/ph)*(W/mw/pw)*mh*mw, C*pt*ph*pw] - // window_index: [grid_t*(H/mh/ph)*(W/mw/pw)] - // window_inverse_index: [grid_t*(H/mh/ph)*(W/mw/pw)] - // window_mask: [grid_h*grid_w, grid_h*grid_w] - auto patch_embed = std::dynamic_pointer_cast(blocks["patch_embed"]); - auto merger = std::dynamic_pointer_cast(blocks["merger"]); - - auto x = patch_embed->forward(ctx, pixel_values); - - x = ggml_reshape_4d(ctx->ggml_ctx, x, x->ne[0] * spatial_merge_size * spatial_merge_size, x->ne[1] / spatial_merge_size / spatial_merge_size, x->ne[2], x->ne[3]); - x = ggml_get_rows(ctx->ggml_ctx, x, window_index); - x = ggml_reshape_4d(ctx->ggml_ctx, x, x->ne[0] / spatial_merge_size / spatial_merge_size, x->ne[1] * spatial_merge_size * spatial_merge_size, x->ne[2], x->ne[3]); - - for (int i = 0; i < num_layers; i++) { - auto block = std::dynamic_pointer_cast(blocks["blocks." + std::to_string(i)]); - - auto mask = window_mask; - if (fullatt_block_indexes.find(i) != fullatt_block_indexes.end()) { - mask = nullptr; - } - x = block->forward(ctx, x, pe, mask); - } - - x = merger->forward(ctx, x); - - x = ggml_get_rows(ctx->ggml_ctx, x, window_inverse_index); - - return x; - } - }; - - struct Qwen2_5_VLAttention : public GGMLBlock { - protected: - int64_t head_dim; - int64_t num_heads; - int64_t num_kv_heads; - - public: - Qwen2_5_VLAttention(int64_t hidden_size, - int64_t num_heads, - int64_t num_kv_heads) - : num_heads(num_heads), num_kv_heads(num_kv_heads) { - head_dim = hidden_size / num_heads; - GGML_ASSERT(num_heads * head_dim == hidden_size); - blocks["q_proj"] = std::shared_ptr(new Linear(hidden_size, num_heads * head_dim)); - blocks["k_proj"] = std::shared_ptr(new Linear(hidden_size, num_kv_heads * head_dim)); - blocks["v_proj"] = std::shared_ptr(new Linear(hidden_size, num_kv_heads * head_dim)); - blocks["o_proj"] = std::shared_ptr(new Linear(num_heads * head_dim, hidden_size, false)); - } - - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* input_pos) { - // x: [N, n_token, hidden_size] - int64_t n_token = x->ne[1]; - int64_t N = x->ne[2]; - auto q_proj = std::dynamic_pointer_cast(blocks["q_proj"]); - auto k_proj = std::dynamic_pointer_cast(blocks["k_proj"]); - auto v_proj = std::dynamic_pointer_cast(blocks["v_proj"]); - auto out_proj = std::dynamic_pointer_cast(blocks["o_proj"]); - - auto q = q_proj->forward(ctx, x); // [N, n_token, num_heads*head_dim] - auto k = k_proj->forward(ctx, x); // [N, n_token, num_kv_heads*head_dim] - auto v = v_proj->forward(ctx, x); // [N, n_token, num_kv_heads*head_dim] - - q = ggml_reshape_4d(ctx->ggml_ctx, q, head_dim, num_heads, n_token, N); // [N, n_token, num_heads, head_dim] - k = ggml_reshape_4d(ctx->ggml_ctx, k, head_dim, num_kv_heads, n_token, N); // [N, n_token, num_kv_heads, head_dim] - v = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim, num_kv_heads, n_token, N); // [N, n_token, num_kv_heads, head_dim] - - int sections[4] = {16, 24, 24, 0}; - q = ggml_rope_multi(ctx->ggml_ctx, q, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_MROPE, 128000, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); - k = ggml_rope_multi(ctx->ggml_ctx, k, input_pos, nullptr, head_dim, sections, GGML_ROPE_TYPE_MROPE, 128000, 1000000.f, 1.f, 0.f, 1.f, 32.f, 1.f); - - q = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, q, 0, 2, 1, 3)); // [N, num_heads, n_token, head_dim] - q = ggml_reshape_3d(ctx->ggml_ctx, q, q->ne[0], q->ne[1], q->ne[2] * q->ne[3]); // [N*num_heads, n_token, head_dim] - - k = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, k, 0, 2, 1, 3)); // [N, num_kv_heads, n_token, head_dim] - k = ggml_reshape_3d(ctx->ggml_ctx, k, k->ne[0], k->ne[1], k->ne[2] * k->ne[3]); // [N*num_kv_heads, n_token, head_dim] - - x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, nullptr, true, true, false); // [N, n_token, hidden_size] - - x = out_proj->forward(ctx, x); // [N, n_token, hidden_size] - return x; - } - }; - - struct Qwen2_5_VLBlock : public GGMLBlock { - public: - Qwen2_5_VLBlock(int64_t hidden_size, - int64_t intermediate_size, - int64_t num_heads, - int64_t num_kv_heads, - float eps = 1e-6f) { - blocks["self_attn"] = std::shared_ptr(new Qwen2_5_VLAttention(hidden_size, num_heads, num_kv_heads)); - blocks["mlp"] = std::shared_ptr(new Qwen2_5_VLMLP(hidden_size, intermediate_size)); - blocks["input_layernorm"] = std::shared_ptr(new RMSNorm(hidden_size, eps)); - blocks["post_attention_layernorm"] = std::shared_ptr(new RMSNorm(hidden_size, eps)); - } - - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* x, - struct ggml_tensor* input_pos) { - // x: [N, n_token, hidden_size] - auto self_attn = std::dynamic_pointer_cast(blocks["self_attn"]); - auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); - auto input_layernorm = std::dynamic_pointer_cast(blocks["input_layernorm"]); - auto post_attention_layernorm = std::dynamic_pointer_cast(blocks["post_attention_layernorm"]); - - auto residual = x; - x = input_layernorm->forward(ctx, x); - x = self_attn->forward(ctx, x, input_pos); - x = ggml_add_inplace(ctx->ggml_ctx, x, residual); - - residual = x; - x = post_attention_layernorm->forward(ctx, x); - x = mlp->forward(ctx, x); - x = ggml_add_inplace(ctx->ggml_ctx, x, residual); - - return x; - } - }; - - struct Qwen2_5_VLTextModel : public GGMLBlock { - protected: - int64_t num_layers; - - public: - Qwen2_5_VLTextModel(int64_t num_layers, - int64_t vocab_size, - int64_t hidden_size, - int64_t intermediate_size, - int64_t num_heads, - int64_t num_kv_heads, - float eps = 1e-6f) - : num_layers(num_layers) { - blocks["embed_tokens"] = std::shared_ptr(new Embedding(vocab_size, hidden_size)); - for (int i = 0; i < num_layers; i++) { - blocks["layers." + std::to_string(i)] = std::shared_ptr(new Qwen2_5_VLBlock(hidden_size, - intermediate_size, - num_heads, - num_kv_heads)); - } - blocks["norm"] = std::shared_ptr(new RMSNorm(hidden_size, eps)); - } - - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* input_ids, - struct ggml_tensor* input_pos, - std::vector> image_embeds) { - // input_ids: [N, n_token] - // return: [N, n_token, hidden_size] - - auto embed_tokens = std::dynamic_pointer_cast(blocks["embed_tokens"]); - auto norm = std::dynamic_pointer_cast(blocks["norm"]); - - auto x = embed_tokens->forward(ctx, input_ids); - - if (image_embeds.size() > 0) { - GGML_ASSERT(x->ne[2] == 1); // N == 1 - - auto raw_x = ggml_cast(ctx->ggml_ctx, x, image_embeds[0].second->type); - int64_t txt_token_start = 0; - int64_t txt_token_end = 0; - - ggml_tensor* input_embed = nullptr; - - for (int i = 0; i < image_embeds.size(); i++) { - if (i == 0) { - txt_token_start = 0; - } else { - txt_token_start = image_embeds[i - 1].first + image_embeds[i - 1].second->ne[1]; - } - txt_token_end = image_embeds[i].first; - - auto txt_embed = ggml_ext_slice(ctx->ggml_ctx, raw_x, 1, txt_token_start, txt_token_end); - if (input_embed == nullptr) { - input_embed = txt_embed; - } else { - input_embed = ggml_concat(ctx->ggml_ctx, input_embed, txt_embed, 1); - } - - auto image_embed = image_embeds[i].second; - input_embed = ggml_concat(ctx->ggml_ctx, input_embed, image_embed, 1); - } - - txt_token_start = image_embeds[image_embeds.size() - 1].first + image_embeds[image_embeds.size() - 1].second->ne[1]; - txt_token_end = raw_x->ne[1]; - - auto final_txt_embed = ggml_ext_slice(ctx->ggml_ctx, raw_x, 1, txt_token_start, txt_token_end); - - input_embed = ggml_concat(ctx->ggml_ctx, input_embed, final_txt_embed, 1); - GGML_ASSERT(raw_x->ne[1] == input_embed->ne[1]); - - x = input_embed; - } - - for (int i = 0; i < num_layers; i++) { - auto block = std::dynamic_pointer_cast(blocks["layers." + std::to_string(i)]); - - x = block->forward(ctx, x, input_pos); - } - - x = norm->forward(ctx, x); - return x; - } - }; - - struct Qwen2_5_VLVisionParams { - int64_t num_layers = 32; - int64_t hidden_size = 1280; - int64_t intermediate_size = 3420; - int64_t num_heads = 16; - int64_t in_channels = 3; - int64_t out_hidden_size = 3584; - int64_t temporal_patch_size = 2; - int64_t patch_size = 14; - int64_t spatial_merge_size = 2; - int64_t window_size = 112; - std::set fullatt_block_indexes = {7, 15, 23, 31}; - }; - - struct Qwen2_5_VLParams { - int64_t num_layers = 28; - int64_t hidden_size = 3584; - int64_t intermediate_size = 18944; - int64_t num_heads = 28; - int64_t num_kv_heads = 4; - int64_t vocab_size = 152064; - float rms_norm_eps = 1e-06f; - Qwen2_5_VLVisionParams vision; - }; - - struct Qwen2_5_VL : public GGMLBlock { - bool enable_vision; - Qwen2_5_VLParams params; - - public: - Qwen2_5_VL() {} - Qwen2_5_VL(Qwen2_5_VLParams params, bool enable_vision = false, bool llama_cpp_style = false) - : enable_vision(enable_vision), params(params) { - blocks["model"] = std::shared_ptr(new Qwen2_5_VLTextModel(params.num_layers, - params.vocab_size, - params.hidden_size, - params.intermediate_size, - params.num_heads, - params.num_kv_heads, - params.rms_norm_eps)); - if (enable_vision) { - blocks["visual"] = std::shared_ptr(new Qwen2_5_VLVisionModel(llama_cpp_style, - params.vision.num_layers, - params.vision.in_channels, - params.vision.hidden_size, - params.vision.out_hidden_size, - params.vision.intermediate_size, - params.vision.num_heads, - params.vision.spatial_merge_size, - params.vision.patch_size, - params.vision.temporal_patch_size, - params.vision.window_size, - params.vision.fullatt_block_indexes)); - } - } - - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* input_ids, - struct ggml_tensor* input_pos, - std::vector> image_embeds) { - // input_ids: [N, n_token] - auto model = std::dynamic_pointer_cast(blocks["model"]); - - auto x = model->forward(ctx, input_ids, input_pos, image_embeds); - return x; - } - - struct ggml_tensor* vision_forward(GGMLRunnerContext* ctx, - struct ggml_tensor* pixel_values, - struct ggml_tensor* pe, - struct ggml_tensor* window_index, - struct ggml_tensor* window_inverse_index, - struct ggml_tensor* window_mask) { - GGML_ASSERT(enable_vision); - auto vision_model = std::dynamic_pointer_cast(blocks["visual"]); - return vision_model->forward(ctx, pixel_values, pe, window_index, window_inverse_index, window_mask); - } - }; - - struct Qwen2_5_VLRunner : public GGMLRunner { - Qwen2_5_VLParams params; - bool enable_vision; - Qwen2_5_VL model; - - std::vector input_pos_vec; - std::vector window_mask_vec; - std::vector window_index_vec; - std::vector window_inverse_index_vec; - std::vector pe_vec; - - Qwen2_5_VLRunner(ggml_backend_t backend, - bool offload_params_to_cpu, - const String2TensorStorage& tensor_storage_map, - const std::string prefix, - bool enable_vision_ = false) - : GGMLRunner(backend, offload_params_to_cpu), enable_vision(enable_vision_) { - bool have_vision_weight = false; - bool llama_cpp_style = false; - for (auto pair : tensor_storage_map) { - std::string tensor_name = pair.first; - if (tensor_name.find(prefix) == std::string::npos) - continue; - size_t pos = tensor_name.find("visual."); - if (pos != std::string::npos) { - have_vision_weight = true; - if (contains(tensor_name, "attn.q_proj")) { - llama_cpp_style = true; - break; - } - } - } - if (enable_vision && !have_vision_weight) { - LOG_WARN("no vision weights detected, vision disabled"); - enable_vision = false; - } - if (enable_vision) { - LOG_DEBUG("enable qwen2vl vision"); - if (llama_cpp_style) { - LOG_DEBUG("llama.cpp style vision weight"); - } - } - model = Qwen2_5_VL(params, enable_vision, llama_cpp_style); - model.init(params_ctx, tensor_storage_map, prefix); - } - - std::string get_desc() override { - return "qwenvl2.5"; - } - - void get_param_tensors(std::map& tensors, const std::string prefix) { - model.get_param_tensors(tensors, prefix); - } - - struct ggml_tensor* forward(GGMLRunnerContext* ctx, - struct ggml_tensor* input_ids, - struct ggml_tensor* input_pos, - std::vector> image_embeds) { - auto hidden_states = model.forward(ctx, input_ids, input_pos, image_embeds); // [N, n_token, hidden_size] - return hidden_states; - } - - struct ggml_tensor* vision_forward(GGMLRunnerContext* ctx, - struct ggml_tensor* pixel_values, - struct ggml_tensor* input_pos, - struct ggml_tensor* window_index, - struct ggml_tensor* window_inverse_index, - struct ggml_tensor* window_mask) { - auto hidden_states = model.vision_forward(ctx, pixel_values, input_pos, window_index, window_inverse_index, window_mask); - return hidden_states; - } - - struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids, std::vector> image_embeds) { - struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); - - input_ids = to_backend(input_ids); - - for (auto& image_embed : image_embeds) { - image_embed.second = to_backend(image_embed.second); - } - - int64_t n_tokens = input_ids->ne[0]; - input_pos_vec.resize(n_tokens * 4); - for (int i = 0; i < n_tokens; ++i) { - input_pos_vec[i] = i; - input_pos_vec[n_tokens + i] = i; - input_pos_vec[2 * n_tokens + i] = i; - input_pos_vec[3 * n_tokens + i] = 0; - } - - auto input_pos = ggml_new_tensor_1d(compute_ctx, - GGML_TYPE_I32, - n_tokens * 4); - set_backend_tensor_data(input_pos, input_pos_vec.data()); - - auto runner_ctx = get_context(); - - struct ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, input_pos, image_embeds); - - ggml_build_forward_expand(gf, hidden_states); - - return gf; - } - - void compute(const int n_threads, - struct ggml_tensor* input_ids, - std::vector> image_embeds, - ggml_tensor** output, - ggml_context* output_ctx = nullptr) { - auto get_graph = [&]() -> struct ggml_cgraph* { - return build_graph(input_ids, image_embeds); - }; - GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); - } - - int64_t get_num_image_tokens(int64_t t, int64_t h, int64_t w) { - int grid_t = 1; - int grid_h = h / params.vision.patch_size; - int grid_w = w / params.vision.patch_size; - int llm_grid_h = grid_h / params.vision.spatial_merge_size; - int llm_grid_w = grid_w / params.vision.spatial_merge_size; - return grid_t * grid_h * grid_w; - } - - struct ggml_tensor* process_image(struct ggml_context* ctx, struct ggml_tensor* image) { - // image: [C, H, W] - // return: [grid_t*(H/mh/ph)*(W/mw/pw)*mh*mw, C*pt*ph*pw], grid_t == 1 - int64_t C = image->ne[2]; - int64_t H = image->ne[1]; - int64_t W = image->ne[0]; - int64_t mh = params.vision.spatial_merge_size; - int64_t mw = params.vision.spatial_merge_size; - int64_t pt = params.vision.temporal_patch_size; - int64_t ph = params.vision.patch_size; - int64_t pw = params.vision.patch_size; - - image = ggml_reshape_4d(ctx, image, pw, mw, (W / mw / pw), H * C); // [C*H, (W/mw/pw), mw, pw] - image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 3, 1)); // [mw, C*H, (W/mw/pw), pw] - image = ggml_reshape_4d(ctx, image, pw * (W / mw / pw), H, C, mw); // [mw, C, H, (W/mw/pw)*pw] - image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 3, 1)); // [H, mw, C, (W/mw/pw)*pw] - image = ggml_reshape_4d(ctx, image, pw, (W / mw / pw) * C * mw, ph, mh * (H / mh / ph)); // [(H/mh/ph)*mh, ph, mw*C*(W/mw/pw), pw] - image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 1, 3)); // [(H/mh/ph)*mh, mw*C*(W/mw/pw), ph, pw] - image = ggml_reshape_4d(ctx, image, pw * ph, (W / mw / pw), C, mw * mh * (H / mh / ph)); // [(H/mh/ph)*mh*mw, C, (W/mw/pw), ph*pw] - image = ggml_concat(ctx, image, image, 0); // [(H/mh/ph)*mh*mw, C, (W/mw/pw), pt*ph*pw] - image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 1, 3)); // [(H/mh/ph)*mh*mw, (W/mw/pw), C, pt*ph*pw] - image = ggml_reshape_4d(ctx, image, pw * ph * pt * C, (W / mw / pw), mw * mh, (H / mh / ph)); // [(H/mh/ph), mh*mw, (W/mw/pw), C*pt*ph*pw] - image = ggml_cont(ctx, ggml_ext_torch_permute(ctx, image, 0, 2, 1, 3)); // [(H/mh/ph), (W/mw/pw), mh*mw, C*pt*ph*pw] - image = ggml_reshape_2d(ctx, image, pw * ph * pt * C, mw * mh * (W / mw / pw) * (H / mh / ph)); // [(H/mh/ph)*(W/mw/pw)*mh*mw, C*pt*ph*pw] - return image; - } - - struct ggml_cgraph* build_encode_image_graph(struct ggml_tensor* image) { - struct ggml_cgraph* gf = new_graph_custom(QWENVL_GRAPH_SIZE); - - GGML_ASSERT(image->ne[1] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0); - GGML_ASSERT(image->ne[0] % (params.vision.patch_size * params.vision.spatial_merge_size) == 0); - - int grid_t = 1; - int grid_h = image->ne[1] / params.vision.patch_size; - int grid_w = image->ne[0] / params.vision.patch_size; - int llm_grid_h = grid_h / params.vision.spatial_merge_size; - int llm_grid_w = grid_w / params.vision.spatial_merge_size; - int vit_merger_window_size = params.vision.window_size / params.vision.patch_size / params.vision.spatial_merge_size; - - image = to_backend(image); - - auto pixel_values = process_image(compute_ctx, image); - - // window index - int inverse_index = 0; - window_index_vec.resize(llm_grid_h * llm_grid_w); - window_inverse_index_vec.resize(llm_grid_h * llm_grid_w); - std::vector seqlens; - for (int ih = 0; ih < llm_grid_h; ih += vit_merger_window_size) { - for (int iw = 0; iw < llm_grid_w; iw += vit_merger_window_size) { - int win_h = std::min(vit_merger_window_size, llm_grid_h - ih); - int win_w = std::min(vit_merger_window_size, llm_grid_w - iw); - for (int iy = 0; iy < win_h; iy++) { - for (int ix = 0; ix < win_w; ix++) { - int index = (ih + iy) * llm_grid_w + iw + ix; - window_index_vec[inverse_index] = index; - window_inverse_index_vec[index] = inverse_index; - inverse_index++; - } - } - seqlens.push_back(win_h * win_w * params.vision.spatial_merge_size * params.vision.spatial_merge_size); - } - } - // printf("window_index: "); - // for (int i : window_index_vec) { - // printf("%d ", i); - // } - // printf("\n"); - // printf("window_inverse_index: "); - // for (int i : window_inverse_index_vec) { - // printf("%d ", i); - // } - // printf("\n"); - // printf("seqlens: "); - // for (int i : seqlens) { - // printf("%d ", i); - // } - // printf("\n"); - auto window_index = ggml_new_tensor_1d(compute_ctx, - GGML_TYPE_I32, - llm_grid_h * llm_grid_w); - auto window_inverse_index = ggml_new_tensor_1d(compute_ctx, - GGML_TYPE_I32, - llm_grid_h * llm_grid_w); - set_backend_tensor_data(window_index, window_index_vec.data()); - set_backend_tensor_data(window_inverse_index, window_inverse_index_vec.data()); - - // window mask - int seq_window_size = (vit_merger_window_size * params.vision.spatial_merge_size) * (vit_merger_window_size * params.vision.spatial_merge_size); - window_mask_vec.resize((grid_h * grid_w) * (grid_h * grid_w)); - int window_start_index = 0; - for (int seq_index = 0; seq_index < seqlens.size(); seq_index++) { - int window_end_index = window_start_index + seqlens[seq_index]; - // LOG_DEBUG("%d %d", window_start_index, window_end_index); - GGML_ASSERT(window_end_index <= grid_h * grid_w); - for (int i = window_start_index; i < window_end_index; i++) { - for (int j = 0; j < grid_h * grid_w; j++) { - float mask_value = -INFINITY; - if (j >= window_start_index && j < window_end_index) { - mask_value = 0; - } - GGML_ASSERT((i * (grid_h * grid_w) + j) < window_mask_vec.size()); - window_mask_vec[i * (grid_h * grid_w) + j] = mask_value; - } - } - window_start_index = window_end_index; - // printf("\n"); - } - // printf("window_mask: \n"); - // for (int i = 0; i < grid_h*grid_w; i++) { - // for (int j = 0; j < grid_h*grid_w; j++) { - // printf("%f ", window_mask_vec[i * (grid_h * grid_w) + j]); - // } - // printf("\n"); - // } - auto window_mask = ggml_new_tensor_2d(compute_ctx, - GGML_TYPE_F32, - grid_h * grid_w, - grid_h * grid_w); - set_backend_tensor_data(window_mask, window_mask_vec.data()); - - // pe - int head_dim = params.vision.hidden_size / params.vision.num_heads; - pe_vec = Rope::gen_qwen2vl_pe(grid_h, - grid_w, - params.vision.spatial_merge_size, - window_inverse_index_vec, - 10000.f, - {head_dim / 2, head_dim / 2}); - int pos_len = pe_vec.size() / head_dim / 2; - // LOG_DEBUG("pos_len %d", pos_len); - auto pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, head_dim / 2, pos_len); - // pe->data = pe_vec.data(); - // print_ggml_tensor(pe); - // pe->data = nullptr; - set_backend_tensor_data(pe, pe_vec.data()); - - auto runnter_ctx = get_context(); - struct ggml_tensor* hidden_states = vision_forward(&runnter_ctx, - pixel_values, - pe, - window_index, - window_inverse_index, - window_mask); - ggml_build_forward_expand(gf, hidden_states); - - return gf; - } - - void encode_image(const int n_threads, - struct ggml_tensor* image, - ggml_tensor** output, - ggml_context* output_ctx = nullptr) { - auto get_graph = [&]() -> struct ggml_cgraph* { - return build_encode_image_graph(image); - }; - GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); - } - }; - - struct Qwen2_5_VLEmbedder { - Qwen2Tokenizer tokenizer; - Qwen2_5_VLRunner model; - - Qwen2_5_VLEmbedder(ggml_backend_t backend, - bool offload_params_to_cpu, - const String2TensorStorage& tensor_storage_map = {}, - const std::string prefix = "", - bool enable_vision = false) - : model(backend, offload_params_to_cpu, tensor_storage_map, prefix, enable_vision) { - } - - void get_param_tensors(std::map& tensors, const std::string prefix) { - model.get_param_tensors(tensors, prefix); - } - - void alloc_params_buffer() { - model.alloc_params_buffer(); - } - - std::tuple, std::vector> tokenize(std::string text, - size_t max_length = 0, - bool padding = false) { - auto parsed_attention = parse_prompt_attention(text); - - { - std::stringstream ss; - ss << "["; - for (const auto& item : parsed_attention) { - ss << "['" << item.first << "', " << item.second << "], "; - } - ss << "]"; - LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str()); - } - - std::vector tokens; - std::vector weights; - for (const auto& item : parsed_attention) { - const std::string& curr_text = item.first; - float curr_weight = item.second; - std::vector curr_tokens = tokenizer.tokenize(curr_text, nullptr); - tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end()); - weights.insert(weights.end(), curr_tokens.size(), curr_weight); - } - - tokenizer.pad_tokens(tokens, weights, max_length, padding); - - // for (int i = 0; i < tokens.size(); i++) { - // std::cout << tokens[i] << ":" << weights[i] << ", "; - // } - // std::cout << std::endl; - - return {tokens, weights}; - } - - void test() { - struct ggml_init_params params; - params.mem_size = static_cast(1024 * 1024) * 1024; // 1GB - params.mem_buffer = nullptr; - params.no_alloc = false; - - struct ggml_context* work_ctx = ggml_init(params); - GGML_ASSERT(work_ctx != nullptr); - bool test_vit = true; - bool test_decoder_with_vit = true; - - if (test_decoder_with_vit) { - ggml_tensor* image_embed = nullptr; - { - auto image = load_tensor_from_file(work_ctx, "qwen2vl_normalized.bin"); - print_ggml_tensor(image, false, "image"); - struct ggml_tensor* out = nullptr; - - int t0 = ggml_time_ms(); - model.encode_image(8, image, &out, work_ctx); - int t1 = ggml_time_ms(); - - print_ggml_tensor(out, false, "image_embed"); - image_embed = out; - LOG_DEBUG("qwen2vl encode_image test done in %dms", t1 - t0); - } - - std::string placeholder = "<|image_pad|>"; - std::string img_prompt = "Picture 1: <|vision_start|>"; // [24669, 220, 16, 25, 220, 151652] - int64_t num_image_tokens = image_embed->ne[1]; - img_prompt.reserve(num_image_tokens * placeholder.size()); - for (int i = 0; i < num_image_tokens; i++) { - img_prompt += placeholder; - } - img_prompt += "<|vision_end|>"; - - std::vector> image_embeds; - image_embeds.emplace_back(64, image_embed); - - std::string text = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n"; - text += img_prompt; - text += "change 'flux.cpp' to 'edit.cpp'"; - text += "<|im_end|>\n<|im_start|>assistant\n"; - - auto tokens_and_weights = tokenize(text, 0, false); - std::vector& tokens = std::get<0>(tokens_and_weights); - std::vector& weights = std::get<1>(tokens_and_weights); - for (auto token : tokens) { - printf("%d ", token); - } - printf("\n"); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - struct ggml_tensor* out = nullptr; - - int t0 = ggml_time_ms(); - model.compute(8, input_ids, image_embeds, &out, work_ctx); - int t1 = ggml_time_ms(); - - print_ggml_tensor(out); - LOG_DEBUG("qwen2vl test done in %dms", t1 - t0); - } else if (test_vit) { - // auto image = ggml_new_tensor_3d(work_ctx, GGML_TYPE_F32, 280, 280, 3); - // ggml_set_f32(image, 0.f); - auto image = load_tensor_from_file(work_ctx, "qwen2vl_normalized.bin"); - print_ggml_tensor(image, false, "image"); - struct ggml_tensor* out = nullptr; - - int t0 = ggml_time_ms(); - model.encode_image(8, image, &out, work_ctx); - int t1 = ggml_time_ms(); - - print_ggml_tensor(out, false, "out"); - - // auto ref_out = load_tensor_from_file(work_ctx, "qwen2vl.bin"); - // ggml_ext_tensor_diff(ref_out, out, 0.01f); - - LOG_DEBUG("qwen2vl test done in %dms", t1 - t0); - } else { - std::string text("<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\na lovely cat<|im_end|>\n<|im_start|>assistant\n"); - auto tokens_and_weights = tokenize(text, 0, false); - std::vector& tokens = std::get<0>(tokens_and_weights); - std::vector& weights = std::get<1>(tokens_and_weights); - for (auto token : tokens) { - printf("%d ", token); - } - printf("\n"); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, tokens); - struct ggml_tensor* out = nullptr; - - int t0 = ggml_time_ms(); - model.compute(8, input_ids, {}, &out, work_ctx); - int t1 = ggml_time_ms(); - - print_ggml_tensor(out); - LOG_DEBUG("qwen2vl test done in %dms", t1 - t0); - } - } - - static void load_from_file_and_test(const std::string& file_path) { - // cpu f16: pass - // ggml_backend_t backend = ggml_backend_cuda_init(0); - ggml_backend_t backend = ggml_backend_cpu_init(); - ggml_type model_data_type = GGML_TYPE_F16; - - ModelLoader model_loader; - if (!model_loader.init_from_file_and_convert_name(file_path, "qwen2vl.")) { - LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); - return; - } - - auto& tensor_storage_map = model_loader.get_tensor_storage_map(); - for (auto& [name, tensor_storage] : tensor_storage_map) { - if (ends_with(name, "weight")) { - tensor_storage.expected_type = model_data_type; - } - } - - std::shared_ptr qwenvl = std::make_shared(backend, - false, - tensor_storage_map, - "qwen2vl", - true); - - qwenvl->alloc_params_buffer(); - std::map tensors; - qwenvl->get_param_tensors(tensors, "qwen2vl"); - - bool success = model_loader.load_tensors(tensors); - - if (!success) { - LOG_ERROR("load tensors from model loader failed"); - return; - } - - LOG_INFO("qwenvl model loaded"); - qwenvl->test(); - } - }; - -}; // Qwen - -#endif // __QWENVL_HPP__