voxstral mini is really bad

Concedo 2025-07-29 21:22:17 +08:00
parent abf527a207
commit 3284757b56
3 changed files with 29 additions and 5 deletions


@@ -112,6 +112,7 @@ static std::vector<int> last_media_mem; //for storing dummy tokens that will be
static std::string media_composite_image_signature = ""; //for identifying when the llava images change, we need to invalidate the cache
static int current_media_identifier = MEDIA_TOKEN_IDENTIFIER_A;
static int vision_max_res = 2048;
static bool use_mrope = false;
static kcpp_params * kcpp_data = nullptr;
static int max_context_limit_at_load = 0;
@@ -693,7 +694,7 @@ static speculative_draft_result speculative_decoding_eval_chunk(llama_context *
std::vector<int> real_embd = drafted_ids;
real_embd.pop_back();
bool use_mrope = (file_format==FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
kcpp_embd_batch batch2 = kcpp_embd_batch(real_embd, actual_npast, use_mrope, true);
auto draftok = (llama_decode(main_ctx, batch2.batch)==0); //actual eval for big model
if(!draftok)
@@ -1801,7 +1802,6 @@ static void load_grammar(const std::string & gammarstr)
static bool kcpp_eval_image(llama_context * ctx_llama, float * img_embd, int num_img_tokens, int n_batch, int * n_past) {
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
bool use_mrope = (file_format==FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
for (int i = 0; i < num_img_tokens; i += n_batch) {
int n_eval = num_img_tokens - i;
@@ -1987,6 +1987,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
guidance_ctx = nullptr;
audio_multimodal_supported = false;
vision_multimodal_supported = false;
use_mrope = false;
auto clamped_max_context_length = inputs.max_context_length;
@@ -2338,6 +2339,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
{
printf("\nMRope is used, context shift will be disabled!\n");
kcpp_data->use_contextshift = false;
use_mrope = true;
}
if(overwriteRope)
@@ -3321,8 +3323,25 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
media_object lv;
lv.b64data = item;
lv.is_audio = true;
TokenizeString("<audio>", lv.chunk_start_seq, file_format, false);
TokenizeString("</audio>\n\n", lv.chunk_end_seq, file_format, false);
std::string aud_start = "<audio>";
std::string aud_end = "</audio>\n\n";
if(clp_ctx_a)
{
int ptype = clip_get_projector_type_ext(clp_ctx_a);
if(ptype==14) //qwen omni
{
aud_start = "<|audio_bos|>";
aud_end = "<|audio_eos|>\n";
}
else if(ptype==16) //voxtral
{
aud_start = "[INST][BEGIN_AUDIO]";
aud_end = "[/INST]\n";
}
}
TokenizeString(aud_start, lv.chunk_start_seq, file_format, false);
TokenizeString(aud_end, lv.chunk_end_seq, file_format, false);
media_objects.push_back(lv);
new_media_composite += item;
}
@@ -3502,7 +3521,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
int32_t nctx = kcpp_data->n_ctx;
TokenizeString(kcpp_data->prompt, embd_inp, file_format, add_bos_token);
bool use_mrope = (file_format == FileFormat::GGUF_GENERIC && file_format_meta.model_architecture == GGUFArch::ARCH_QWEN2VL);
TokenizeString("\nAttached Media:\n", media_intro, file_format, false);
if(media_composite_image_signature=="")