diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index ff614b052..5bb1e7a11 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -112,6 +112,7 @@ static std::vector last_media_mem; //for storing dummy tokens that will be
 static std::string media_composite_image_signature = ""; //for identifying when the llava images change, we need to invalidate the cache
 static int current_media_identifier = MEDIA_TOKEN_IDENTIFIER_A;
 static int vision_max_res = 2048;
+static bool use_mrope = false;
 static kcpp_params * kcpp_data = nullptr;
 static int max_context_limit_at_load = 0;
@@ -693,7 +694,7 @@ static speculative_draft_result speculative_decoding_eval_chunk(llama_context *
     std::vector real_embd = drafted_ids;
     real_embd.pop_back();
-    bool use_mrope = (file_format==FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
+
     kcpp_embd_batch batch2 = kcpp_embd_batch(real_embd, actual_npast, use_mrope, true);
     auto draftok = (llama_decode(main_ctx, batch2.batch)==0); //actual eval for big model
     if(!draftok)
@@ -1801,7 +1802,6 @@ static void load_grammar(const std::string & gammarstr)
 static bool kcpp_eval_image(llama_context * ctx_llama, float * img_embd, int num_img_tokens, int n_batch, int * n_past)
 {
     int n_embd = llama_n_embd(llama_get_model(ctx_llama));
-    bool use_mrope = (file_format==FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
     for (int i = 0; i < num_img_tokens; i += n_batch)
     {
         int n_eval = num_img_tokens - i;
@@ -1987,6 +1987,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     guidance_ctx = nullptr;
     audio_multimodal_supported = false;
     vision_multimodal_supported = false;
+    use_mrope = false;
 
     auto clamped_max_context_length = inputs.max_context_length;
@@ -2338,6 +2339,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         {
            printf("\nMRope is used, context shift will be disabled!\n");
            kcpp_data->use_contextshift = false;
+           use_mrope = true;
         }
 
         if(overwriteRope)
@@ -3321,8 +3323,25 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 media_object lv;
                 lv.b64data = item;
                 lv.is_audio = true;
-                TokenizeString("\n\n", lv.chunk_end_seq, file_format, false);
+                std::string aud_start = "\n\n";
+                std::string aud_end = "\n\n";
+                if(clp_ctx_a)
+                {
+                    int ptype = clip_get_projector_type_ext(clp_ctx_a);
+                    if(ptype==14) //qwen omni
+                    {
+                        aud_start = "<|audio_bos|>";
+                        aud_end = "<|audio_eos|>\n";
+                    }
+                    else if(ptype==16) //voxtral
+                    {
+                        aud_start = "[INST][BEGIN_AUDIO]";
+                        aud_end = "[/INST]\n";
+                    }
+                }
+
+                TokenizeString(aud_start, lv.chunk_start_seq, file_format, false);
+                TokenizeString(aud_end, lv.chunk_end_seq, file_format, false);
                 media_objects.push_back(lv);
                 new_media_composite += item;
             }
@@ -3502,7 +3521,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     int32_t nctx = kcpp_data->n_ctx;
     TokenizeString(kcpp_data->prompt, embd_inp, file_format, add_bos_token);
-    bool use_mrope = (file_format == FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
     TokenizeString("\nAttached Media:\n", media_intro, file_format, false);
 
     if(media_composite_image_signature=="")
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 1ff914253..27355a9d9 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -4552,6 +4552,11 @@ projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
     return ctx->proj_type();
 }
 
+int clip_get_projector_type_ext(clip_ctx * ctx) {
+    return ctx->proj_type();
+}
+
+
 void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) {
     clip_image_f32 * audio = new clip_image_f32;
     audio->nx = n_frames;
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 22ca53bba..0c6e778fc 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -102,6 +102,7 @@ bool clip_is_llava(const struct clip_ctx * ctx);
 bool clip_is_gemma3(const struct clip_ctx * ctx);
 bool clip_is_pixtral(const struct clip_ctx * ctx);
 void set_clip_uses_gpu(bool usegpu);
+int clip_get_projector_type_ext(clip_ctx * ctx);
 
 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);