fixed another tts bug, clblast selection and quiet mode

2026-05-22 03:10:03 +00:00 · 2025-01-22 21:36:13 +08:00 · 2025-01-22 21:36:13 +08:00 · 0e74db7fd4
commit 0e74db7fd4
parent 1cb9805024
4 changed files with 49 additions and 45 deletions
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -106,6 +106,7 @@ static kcpp_params * kcpp_data = nullptr;
 static int max_context_limit_at_load = 0;
 static int n_past = 0;
 static int debugmode = 0; //-1 = hide all, 0 = normal, 1 = showall
+static bool quiet = false;
 static std::vector<gpt_vocab::id> last_n_tokens;
 static std::vector<gpt_vocab::id> current_context_tokens;
 static size_t mem_per_token = 0;
@@ -930,12 +931,12 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float

    if(last_idx>1) //if there are 2 or more viable candidates
    {
-        if (debugmode==1) {
+        if (debugmode==1 && !quiet) {
            printf("XTC penalties [");
        }
        // then remove all other tokens above threshold EXCEPT the least likely one
        for (size_t i = 0; i < last_idx - 1; ++i) {
-            if (debugmode==1)
+            if (debugmode==1 && !quiet)
            {
                gpt_vocab::id token = candidates->data[i].id;
                std::string tokenizedstr = FileFormatTokenizeID(token, file_format);
@@ -944,7 +945,7 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float
            }
            candidates->data[i].logit -= 999.0f; //infinity gets wonky results downstream, this hack works well enough
        }
-        if (debugmode==1) {
+        if (debugmode==1 && !quiet) {
            printf("]\n");
        }
        candidates->sorted = false;
@@ -1133,7 +1134,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
        max_exponent = FLOAT_MAX_LOG / std::log(penalty_base);
    }

-    if (debugmode==1 && !dry_max_token_repeat.empty()) {
+    if (debugmode==1 && !quiet && !dry_max_token_repeat.empty()) {
        printf("DRY penalties [");
    }
    size_t count = 0;
@@ -1144,7 +1145,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
            repeat_exp = max_exponent;
        }
        float penalty = penalty_multiplier * pow(penalty_base, repeat_exp);
-        if (debugmode==1)
+        if (debugmode==1 && !quiet)
        {
            std::string tokenizedstr = FileFormatTokenizeID(token, file_format);
            ::utreplace(tokenizedstr, "\n", "\\n");
@@ -1157,7 +1158,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
    {
        candidates->sorted = false;
    }
-    if (debugmode==1 && !dry_max_token_repeat.empty()) {
+    if (debugmode==1 && !quiet && !dry_max_token_repeat.empty()) {
        printf("]\n");
    }
 }
@@ -1688,7 +1689,7 @@ static void load_grammar(const std::string & gammarstr)
            printf("\nIgnored invalid grammar sampler.");
            return;
        }
-        if(debugmode==1)
+        if(debugmode==1 && !quiet)
        {
            parsed_grammar.print(stderr);
        }
@@ -1831,7 +1832,7 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai
        float chi_ctx_value = (n_ctx_desired * ctx_multiplier) / 6.28318;
        float gradient_ai_rope_freq_base_value = powf(original_rope_base, log10f(chi_ctx_value) / log10f(chi_ctx_train_value));

-        if(debugmode==1)
+        if(debugmode==1 && !quiet)
        {
            printf("Trained max context length (value:%.d).\n", n_ctx_train);
            printf("Desired context length (value:%.d).\n", n_ctx_desired);
@@ -1848,7 +1849,7 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai
        {
            float extended_rope_positive_offset_value = 1 + ((log10f(chi_ctx_value) - log10f(chi_ctx_train_value)) / ((log10f(chi_ctx_value) * log10f(chi_ctx_train_value)) - (log10f(chi_ctx_value) + log10f(chi_ctx_train_value))));
            float rope_freq_base_with_positive_offset = gradient_ai_rope_freq_base_value * extended_rope_positive_offset_value;
-            if(debugmode==1)
+            if(debugmode==1 && !quiet)
            {
                printf("Extended RoPE Positive Offset (multiplicator) for Solar based models. (value:%.3f).\n", extended_rope_positive_offset_value);
                printf("RoPE base calculated via Gradient AI formula for Solar based models. (value:%.1f).\n", rope_freq_base_with_positive_offset);
@@ -2679,13 +2680,13 @@ std::vector<int> gpttype_get_token_arr(const std::string & input, bool addbos)
        printf("\nWarning: KCPP text generation not initialized!\n");
        return toks;
    }
-    if(debugmode==1)
+    if(debugmode==1 && !quiet)
    {
        printf("\nFileFormat: %d, Tokenizing: %s",file_format ,input.c_str());
    }
    TokenizeString(input, toks, file_format,addbos);
    int tokcount = toks.size();
-    if(debugmode==1)
+    if(debugmode==1 && !quiet)
    {
        printf("\nTokens Counted: %d\n",tokcount);
    }
@@ -2770,6 +2771,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
        llama_perf_context_reset(llama_ctx_v4);
    }

+    quiet = inputs.quiet;
    generation_finished = false; // Set current generation status
    generated_tokens.clear(); // New Generation, new tokens
    delayed_generated_tokens.clear();
@@ -2848,7 +2850,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
    banned_token_ids.clear();
    if(banned_tokens.size()>0)
    {
-        if(debugmode==1)
+        if(debugmode==1 && !quiet)
        {
            printf("\nBanning %zu single character sequences...",banned_tokens.size());
        }
@@ -2865,13 +2867,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                }
            }
        }
-        if(debugmode==1)
+        if(debugmode==1 && !quiet)
        {
            printf("\nBanned a total of %zu individual tokens.\n",banned_token_ids.size());
        }
    }

-    if(debugmode==1 && banned_phrases.size()>0)
+    if(debugmode==1 && !quiet && banned_phrases.size()>0)
    {
        printf("\nBanned a total of %zu phrases, with max token count of %d.\n",banned_phrases.size(),delayed_generated_tokens_limit);
    }
@@ -2916,7 +2918,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
        //images have changed. swap identifiers to force reprocessing
        current_llava_identifier = (current_llava_identifier==LLAVA_TOKEN_IDENTIFIER_A?LLAVA_TOKEN_IDENTIFIER_B:LLAVA_TOKEN_IDENTIFIER_A);
        llava_composite_image_signature = new_llava_composite;
-        if(debugmode==1)
+        if(debugmode==1 && !quiet)
        {
            printf("\nLLAVA images changed, existing cache invalidated");
        }
@@ -2972,7 +2974,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
            const int MAX_CHAR_LEN = 40;
            const int MAX_SEQ_LEN = 20;

-            if (debugmode == 1)
+            if (debugmode == 1 && !quiet)
            {
                printf("\nProcessing %zu dry break strings...", kcpp_data->dry_sequence_breakers.size());
            }
@@ -2984,7 +2986,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                }
                GetOverlappingTokenSequences(sequence_break, dry_sequence_breakers, MAX_SEQ_LEN);
            }
-            if (debugmode == 1)
+            if (debugmode == 1 && !quiet)
            {
                int trivial = 0, non_trivial = 0;
                for (const auto &seq : dry_sequence_breakers)
@@ -3004,9 +3006,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
    }

    bool stream_sse = inputs.stream_sse;
-
-    bool allow_regular_prints = (debugmode!=-1 && !inputs.quiet) || debugmode >= 1;
-
+    bool allow_regular_prints = (!quiet && debugmode!=-1);

    std::string grammarstr = inputs.grammar;
    bool grammar_retain_state = inputs.grammar_retain_state;
@@ -3039,7 +3039,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
    if (kcpp_data->seed <= 0 || kcpp_data->seed==0xFFFFFFFF)
    {
        kcpp_data->seed = (((uint32_t)time(NULL)) % 1000000u);
-        if(debugmode==1)
+        if(debugmode==1 && !quiet)
        {
            printf("\nUsing Seed: %d",kcpp_data->seed);
        }
@@ -3071,7 +3071,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
            }
            else
            {
-                if(debugmode==1)
+                if(debugmode==1 && !quiet)
                {
                    printf("\nCreating clip image embed...");
                }
@@ -3079,7 +3079,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                if (!llava_image_embed_make_with_clip_img(clp_ctx, kcpp_data->n_threads, clp_img_data, &llava_images[i].clp_img_embd, &llava_images[i].clp_image_tokens)) {
                    printf("\nError: Clip image %d failed to create embd!",i);
                }
-                if(debugmode==1)
+                if(debugmode==1 && !quiet)
                {
                    printf("\nLLAVA Clip Embed %i used Tokens: %d",i,llava_images[i].clp_image_tokens);
                }
@@ -3202,7 +3202,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
    n_past = 0;

-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
    {
        std::string outstr = "";
        printf("\n\n[Debug: Dump Raw Input Tokens, format: %d]\n", file_format);
@@ -3347,7 +3347,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
        printf("\n");
    }

-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
    {
        std::string outstr = "";
        printf("\n[Debug: Dump Forwarded Input Tokens, format: %d]\n", file_format);
@@ -3396,7 +3396,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                    draft_used = true;
                    draft_results = speculative_decoding_eval_chunk(draft_ctx, llama_ctx_v4, embd, n_vocab, n_past);
                    evalres = draft_results.draft_success;
-                    if(debugmode==1)
+                    if(debugmode==1 && !quiet)
                    {
                        std::string draftedtoks = get_tok_vec_str(draft_results.draftids);
                        printf("\nDrafted %d Tokens: [%s]\n",speculative_chunk_amt,draftedtoks.c_str());
@@ -3599,7 +3599,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                if(draft_used)
                {
                    int32_t draftedid = draft_results.draftids[logits_sampled];
-                    if(debugmode==1)
+                    if(debugmode==1 && !quiet)
                    {
                        std::string drafttok = FileFormatTokenizeID(draftedid, file_format, true);
                        std::string realtok = FileFormatTokenizeID(id, file_format, true);
@@ -3652,7 +3652,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                {
                    printf("\rGenerating (%d / %d tokens)", (kcpp_data->n_predict - remaining_tokens), kcpp_data->n_predict);
                }
-                if(debugmode==1 && top_picks_history.size()>0)
+                if(debugmode==1 && !quiet && top_picks_history.size()>0)
                {
                    printf(" [");
                    bool firstloop = true;
@@ -3904,7 +3904,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
        delayed_generated_tokens.pop_front();
    }

-    if(debugmode==1 && file_format == FileFormat::GGUF_GENERIC)
+    if(debugmode==1 && !quiet && file_format == FileFormat::GGUF_GENERIC)
    {
        printf("\n");
        llama_perf_context_print(llama_ctx_v4);