diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index e6bfd339b..f4754d59a 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -542,9 +542,10 @@ struct kcpp_embd_batch { //duplcated from llava_embd_batch std::vector seq_ids; std::vector logits; llama_batch batch; - kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast) { + kcpp_embd_batch(float * embd, int32_t n_tokens, int32_t npast, bool use_mrope) { int32_t seq_id = 0; - pos.resize(n_tokens); + pos.resize(n_tokens * (use_mrope?4:1)); + std::fill(pos.begin(), pos.end(), 0); n_seq_id.resize(n_tokens); seq_ids.resize(n_tokens + 1); logits.resize(n_tokens); @@ -560,23 +561,39 @@ struct kcpp_embd_batch { //duplcated from llava_embd_batch /*seq_id =*/ seq_ids.data(), /*logits =*/ logits.data(), }; - for (int i = 0; i < n_tokens; i++) { - batch.pos [i] = npast + i; - batch.n_seq_id[i] = 1; - batch.seq_id [i] = seq_id_0.data(); - batch.logits [i] = false; + + if(!use_mrope) + { + for (int i = 0; i < n_tokens; i++) { + batch.pos [i] = npast + i; + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = false; + } + } + else + { + for (int i = 0; i < n_tokens; i++) { + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = false; + } + for (int j = 0; j < batch.n_tokens * 3; j++) { + batch.pos[j] = npast + (j % batch.n_tokens); + } } } - kcpp_embd_batch(std::vector & tokens, int32_t npast, bool return_all_logits) { + kcpp_embd_batch(std::vector & tokens, int32_t npast, bool use_mrope, bool return_all_logits) { int32_t seq_id = 0; int32_t n_tokens = tokens.size(); - pos.resize(n_tokens); + pos.resize(n_tokens * (use_mrope?4:1)); + std::fill(pos.begin(), pos.end(), 0); n_seq_id.resize(n_tokens); seq_ids.resize(n_tokens + 1); logits.resize(n_tokens); seq_id_0.resize(1); seq_id_0[0] = seq_id; - seq_ids [n_tokens] = nullptr; + seq_ids[n_tokens] = nullptr; batch = { /*n_tokens =*/ n_tokens, /*tokens =*/ tokens.data(), @@ -586,11 +603,26 @@ struct kcpp_embd_batch { //duplcated from llava_embd_batch /*seq_id =*/ seq_ids.data(), /*logits =*/ logits.data(), }; - for (int i = 0; i < n_tokens; i++) { - batch.pos [i] = npast + i; - batch.n_seq_id[i] = 1; - batch.seq_id [i] = seq_id_0.data(); - batch.logits [i] = (return_all_logits?true:false); + + if(!use_mrope) + { + for (int i = 0; i < n_tokens; i++) { + batch.pos [i] = npast + i; + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = (return_all_logits?true:false); + } + } + else + { + for (int i = 0; i < n_tokens; i++) { + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = (return_all_logits?true:false); + } + for (int j = 0; j < batch.n_tokens * 3; j++) { + batch.pos[j] = npast + (j % batch.n_tokens); + } } batch.logits[n_tokens - 1] = true; } @@ -687,7 +719,7 @@ static speculative_draft_result speculative_decoding_eval_chunk(llama_context * drafted_ids.push_back(embd[0]); for(int i=0;i real_embd = drafted_ids; real_embd.pop_back(); - kcpp_embd_batch batch2 = kcpp_embd_batch(real_embd, actual_npast, true); + bool use_mrope = (file_format==FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL); + kcpp_embd_batch batch2 = kcpp_embd_batch(real_embd, actual_npast, use_mrope, true); auto draftok = (llama_decode(main_ctx, batch2.batch)==0); //actual eval for big model if(!draftok) { @@ -1754,6 +1787,7 @@ static void load_grammar(const std::string & gammarstr) static bool kcpp_eval_image(llama_context * ctx_llama, float * img_embd, int num_img_tokens, int n_batch, int * n_past) { int n_embd = llama_n_embd(llama_get_model(ctx_llama)); + bool use_mrope = (file_format==FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL); for (int i = 0; i < num_img_tokens; i += n_batch) { int n_eval = num_img_tokens - i; @@ -1761,7 +1795,7 @@ static bool kcpp_eval_image(llama_context * ctx_llama, float * img_embd, int num n_eval = n_batch; } float * embd = img_embd+i*n_embd; - kcpp_embd_batch llava_batch = kcpp_embd_batch(embd, n_eval, *n_past); + kcpp_embd_batch llava_batch = kcpp_embd_batch(embd, n_eval, *n_past, use_mrope); if (llama_decode(ctx_llama, llava_batch.batch)) { fprintf(stderr, "\n%s : failed to eval image\n", __func__); return false; @@ -1770,6 +1804,70 @@ static bool kcpp_eval_image(llama_context * ctx_llama, float * img_embd, int num } return true; } +static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, float * image_embd, int num_img_tokens, + int n_batch, int * n_past) { + auto image_size = clip_get_load_image_size(clp_ctx); + int n_embd = llama_n_embd(llama_get_model(ctx_llama)); + const int patch_size = 14 * 2; + const int ph = image_size->height / patch_size + (image_size->height % patch_size > 0); + const int pw = image_size->width / patch_size + (image_size->width % patch_size > 0); + auto img_tokens = num_img_tokens; + // llama_pos mrope_pos[img_tokens * 4]; + std::vector mrope_pos; + mrope_pos.resize(img_tokens * 4); + + int st_pos_id = *n_past; + + for (int y = 0; y < ph; y++) + { + for (int x = 0; x < pw; x++) + { + int i = y * pw + x; + mrope_pos[i] = st_pos_id; + mrope_pos[i + img_tokens] = st_pos_id + y; + mrope_pos[i + img_tokens * 2] = st_pos_id + x; + mrope_pos[i + img_tokens * 3] = 0; + } + } + st_pos_id += std::max(pw, ph); + + int processed = 0; + std::vector batch_mrope_pos; + batch_mrope_pos.resize(img_tokens * 4); + + for (int i = 0; i < img_tokens; i += n_batch) { + int n_eval = img_tokens - i; + if (n_eval > n_batch) { + n_eval = n_batch; + } + + // llama_pos batch_mrope_pos[n_eval * 4]; + std::fill(batch_mrope_pos.begin(), batch_mrope_pos.end(), 0); + memcpy(batch_mrope_pos.data(), &mrope_pos[processed], n_eval * sizeof(llama_pos)); + memcpy(&batch_mrope_pos[n_eval * 1], &mrope_pos[img_tokens * 1 + processed], n_eval * sizeof(llama_pos)); + memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + processed], n_eval * sizeof(llama_pos)); + memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + processed], n_eval * sizeof(llama_pos)); + + llama_batch batch = { + int32_t(n_eval), // n_tokens + nullptr, // token + (image_embd+i*n_embd), // embed + batch_mrope_pos.data(), // pos + nullptr, // n_seq_id + nullptr, // seq_id + nullptr, // logits + }; + + if (llama_decode(ctx_llama, batch)) { + fprintf(stderr, "\n%s : failed to eval image\n", __func__); + return false; + } + *n_past += n_eval; + processed += n_eval; + } + return true; +} + //given an old GGUF context and a new context that has some middle portion removed, //find and remove the middle portion from the old context from the KV. Does not fast forward after this destructive action @@ -2160,11 +2258,15 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in printf("CUBLAS: Set main device to %d\n",cu_parseinfo_maindevice); } ggml_cuda_set_mul_mat_q(inputs.use_mmq); - if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2 && !kcpp_data->flash_attn) - { - printf("CUBLAS: Warning, you are running Qwen2 without Flash Attention and may observe incoherent output.\n"); - } #endif + if((file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2 || file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL) && !kcpp_data->flash_attn) + { + printf("Warning, you are running Qwen2 without Flash Attention. If you observe incoherent output, try enabling it.\n"); + } + if(file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL) + { + printf("Qwen2VL detected! Mrope will be used!\n"); + } model_params.main_gpu = cu_parseinfo_maindevice; #if defined(GGML_USE_CUDA) @@ -3423,7 +3525,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs) if(embd.size()!=1 || draft_ctx==nullptr || remaining_tokens<=speculative_chunk_amt || grammar!=nullptr || startedsampling==false) //for large batch, or if no draft model, PP/TG as usual { draft_used = false; - kcpp_embd_batch batch = kcpp_embd_batch(embd, n_past, false); + bool use_mrope = (file_format==FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL); + kcpp_embd_batch batch = kcpp_embd_batch(embd, n_past, use_mrope, false); evalres = (llama_decode(llama_ctx_v4, batch.batch)==0); if(draft_ctx) { diff --git a/model_adapter.cpp b/model_adapter.cpp index 42c647998..102e41b02 100644 --- a/model_adapter.cpp +++ b/model_adapter.cpp @@ -315,6 +315,10 @@ void print_tok_vec(std::vector &embd) { fileformatmeta->model_architecture = GGUFArch::ARCH_QWEN2; } + else if(modelarch=="qwen2vl") + { + fileformatmeta->model_architecture = GGUFArch::ARCH_QWEN2VL; + } else if(modelarch=="rwkv6") { fileformatmeta->model_architecture = GGUFArch::ARCH_RWKV; diff --git a/model_adapter.h b/model_adapter.h index 48195a26d..a0e921cb3 100644 --- a/model_adapter.h +++ b/model_adapter.h @@ -59,6 +59,7 @@ enum GGUFArch ARCH_SOLAR = 4, ARCH_QWEN2 = 5, ARCH_RWKV = 6, + ARCH_QWEN2VL = 7, }; struct FileFormatExtraMeta diff --git a/msvcp140.dll b/msvcp140.dll index 130f84aaf..0a9b13d75 100644 Binary files a/msvcp140.dll and b/msvcp140.dll differ diff --git a/msvcp140_codecvt_ids.dll b/msvcp140_codecvt_ids.dll index 2fb1e6197..df9573151 100644 Binary files a/msvcp140_codecvt_ids.dll and b/msvcp140_codecvt_ids.dll differ diff --git a/vcruntime140.dll b/vcruntime140.dll index edba5485f..99e0fbdef 100644 Binary files a/vcruntime140.dll and b/vcruntime140.dll differ diff --git a/vcruntime140_1.dll b/vcruntime140_1.dll index 7bf05d3b1..8ad4c3b72 100644 Binary files a/vcruntime140_1.dll and b/vcruntime140_1.dll differ