diff --git a/expose.h b/expose.h
index 029a2f0d2..326053c35 100644
--- a/expose.h
+++ b/expose.h
@@ -54,7 +54,6 @@ struct load_model_inputs
     const int cublas_info = 0;
     const char * vulkan_info = nullptr;
     const int blasbatchsize = 512;
-    const int debugmode = 0;
     const int forceversion = 0;
     const int gpulayers = 0;
     const float rope_freq_scale = 1.0f;
@@ -64,6 +63,8 @@ struct load_model_inputs
     const float tensor_split[tensor_split_max] = {};
     const int quant_k = 0;
     const int quant_v = 0;
+    const bool quiet = false;
+    const int debugmode = 0;
 };
 struct generation_inputs
 {
@@ -97,7 +98,6 @@ struct generation_inputs
     const bool stream_sse = false;
     const char * grammar = nullptr;
     const bool grammar_retain_state = false;
-    const bool quiet = false;
     const float dynatemp_range = 0.0f;
     const float dynatemp_exponent = 1.0f;
     const float smoothing_factor = 0.0f;
@@ -157,6 +157,7 @@ struct sd_load_model_inputs
     const char * vae_filename = nullptr;
     const char * lora_filename = nullptr;
     const float lora_multiplier = 1.0f;
+    const bool quiet = false;
     const int debugmode = 0;
 };
 struct sd_generation_inputs
@@ -172,7 +173,6 @@ struct sd_generation_inputs
     const int seed = 0;
     const char * sample_method = nullptr;
     const int clip_skip = -1;
-    const bool quiet = false;
 };
 struct sd_generation_outputs
 {
@@ -187,6 +187,7 @@ struct whisper_load_model_inputs
     const int clblast_info = 0;
     const int cublas_info = 0;
     const char * vulkan_info = nullptr;
+    const bool quiet = false;
     const int debugmode = 0;
 };
 struct whisper_generation_inputs
@@ -195,7 +196,6 @@ struct whisper_generation_inputs
     const char * audio_data = nullptr;
     const bool suppress_non_speech = false;
     const char * langcode = nullptr;
-    const bool quiet = false;
 };
 struct whisper_generation_outputs
 {
@@ -214,6 +214,7 @@ struct tts_load_model_inputs
     const char * vulkan_info = nullptr;
     const int gpulayers = 0;
     const bool flash_attention = false;
+    const bool quiet = false;
     const int debugmode = 0;
 };
 struct tts_generation_inputs
@@ -221,7 +222,6 @@ struct tts_generation_inputs
     const char * prompt = nullptr;
     const int speaker_seed = 0;
     const int audio_seed = 0;
-    const bool quiet = false;
     const bool nocache = false;
 };
 struct tts_generation_outputs
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 03c1c4e50..811b2f20e 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -106,7 +106,7 @@ static kcpp_params * kcpp_data = nullptr;
 static int max_context_limit_at_load = 0;
 static int n_past = 0;
 static int debugmode = 0; //-1 = hide all, 0 = normal, 1 = showall
-static bool quiet = false;
+static bool is_quiet = false;
 static std::vector<int> last_n_tokens;
 static std::vector<int> current_context_tokens;
 static size_t mem_per_token = 0;
@@ -939,12 +939,12 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float
 
     if(last_idx>1) //if there are 2 or more viable candidates
     {
-        if (debugmode==1 && !quiet) {
+        if (debugmode==1 && !is_quiet) {
             printf("XTC penalties [");
         }
         // then remove all other tokens above threshold EXCEPT the least likely one
         for (size_t i = 0; i < last_idx - 1; ++i) {
-            if (debugmode==1 && !quiet)
+            if (debugmode==1 && !is_quiet)
             {
                 gpt_vocab::id token = candidates->data[i].id;
                 std::string tokenizedstr = FileFormatTokenizeID(token, file_format);
@@ -953,7 +953,7 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float
             }
             candidates->data[i].logit -= 999.0f; //infinity gets wonky results downstream, this hack works well enough
         }
-        if (debugmode==1 && !quiet) {
+        if (debugmode==1 && !is_quiet) {
             printf("]\n");
         }
         candidates->sorted = false;
@@ -1142,7 +1142,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
         max_exponent = FLOAT_MAX_LOG / std::log(penalty_base);
     }
 
-    if (debugmode==1 && !quiet && !dry_max_token_repeat.empty()) {
+    if (debugmode==1 && !is_quiet && !dry_max_token_repeat.empty()) {
         printf("DRY penalties [");
     }
     size_t count = 0;
@@ -1153,7 +1153,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
             repeat_exp = max_exponent;
         }
         float penalty = penalty_multiplier * pow(penalty_base, repeat_exp);
-        if (debugmode==1 && !quiet)
+        if (debugmode==1 && !is_quiet)
         {
             std::string tokenizedstr = FileFormatTokenizeID(token, file_format);
             ::utreplace(tokenizedstr, "\n", "\\n");
@@ -1166,7 +1166,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
     {
         candidates->sorted = false;
     }
-    if (debugmode==1 && !quiet && !dry_max_token_repeat.empty()) {
+    if (debugmode==1 && !is_quiet && !dry_max_token_repeat.empty()) {
         printf("]\n");
     }
 }
@@ -1697,7 +1697,7 @@ static void load_grammar(const std::string & gammarstr)
         printf("\nIgnored invalid grammar sampler.");
         return;
     }
-    if(debugmode==1 && !quiet)
+    if(debugmode==1 && !is_quiet)
     {
         parsed_grammar.print(stderr);
     }
@@ -1840,7 +1840,7 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai
     float chi_ctx_value = (n_ctx_desired * ctx_multiplier) / 6.28318;
     float gradient_ai_rope_freq_base_value = powf(original_rope_base, log10f(chi_ctx_value) / log10f(chi_ctx_train_value));
 
-    if(debugmode==1 && !quiet)
+    if(debugmode==1 && !is_quiet)
     {
         printf("Trained max context length (value:%.d).\n", n_ctx_train);
         printf("Desired context length (value:%.d).\n", n_ctx_desired);
@@ -1857,7 +1857,7 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai
     {
         float extended_rope_positive_offset_value = 1 + ((log10f(chi_ctx_value) - log10f(chi_ctx_train_value)) / ((log10f(chi_ctx_value) * log10f(chi_ctx_train_value)) - (log10f(chi_ctx_value) + log10f(chi_ctx_train_value))));
         float rope_freq_base_with_positive_offset = gradient_ai_rope_freq_base_value * extended_rope_positive_offset_value;
-        if(debugmode==1 && !quiet)
+        if(debugmode==1 && !is_quiet)
         {
             printf("Extended RoPE Positive Offset (multiplicator) for Solar based models. (value:%.3f).\n", extended_rope_positive_offset_value);
             printf("RoPE base calculated via Gradient AI formula for Solar based models. (value:%.1f).\n", rope_freq_base_with_positive_offset);
@@ -1873,6 +1873,7 @@
 ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in_file_format, FileFormatExtraMeta in_file_format_meta)
 {
+    is_quiet = inputs.quiet;
     ggml_time_init();
     kcpp_data = new kcpp_params(); //allocate on heap to avoid linux segfault. yes this leaks memory.
@@ -2688,13 +2689,13 @@ std::vector<int> gpttype_get_token_arr(const std::string & input, bool addbos)
         printf("\nWarning: KCPP text generation not initialized!\n");
         return toks;
     }
-    if(debugmode==1 && !quiet)
+    if(debugmode==1 && !is_quiet)
     {
         printf("\nFileFormat: %d, Tokenizing: %s",file_format ,input.c_str());
     }
     TokenizeString(input, toks, file_format,addbos);
     int tokcount = toks.size();
-    if(debugmode==1 && !quiet)
+    if(debugmode==1 && !is_quiet)
     {
         printf("\nTokens Counted: %d\n",tokcount);
     }
@@ -2779,7 +2780,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         llama_perf_context_reset(llama_ctx_v4);
     }
 
-    quiet = inputs.quiet;
     generation_finished = false; // Set current generation status
     generated_tokens.clear(); // New Generation, new tokens
     delayed_generated_tokens.clear();
@@ -2858,7 +2858,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     banned_token_ids.clear();
     if(banned_tokens.size()>0)
     {
-        if(debugmode==1 && !quiet)
+        if(debugmode==1 && !is_quiet)
         {
             printf("\nBanning %zu single character sequences...",banned_tokens.size());
         }
@@ -2875,13 +2875,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 }
             }
         }
-        if(debugmode==1 && !quiet)
+        if(debugmode==1 && !is_quiet)
         {
             printf("\nBanned a total of %zu individual tokens.\n",banned_token_ids.size());
         }
     }
 
-    if(debugmode==1 && !quiet && banned_phrases.size()>0)
+    if(debugmode==1 && !is_quiet && banned_phrases.size()>0)
     {
         printf("\nBanned a total of %zu phrases, with max token count of %d.\n",banned_phrases.size(),delayed_generated_tokens_limit);
     }
@@ -2926,7 +2926,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             //images have changed. swap identifiers to force reprocessing
             current_llava_identifier = (current_llava_identifier==LLAVA_TOKEN_IDENTIFIER_A?LLAVA_TOKEN_IDENTIFIER_B:LLAVA_TOKEN_IDENTIFIER_A);
             llava_composite_image_signature = new_llava_composite;
-            if(debugmode==1 && !quiet)
+            if(debugmode==1 && !is_quiet)
             {
                 printf("\nLLAVA images changed, existing cache invalidated");
             }
@@ -2982,7 +2982,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
        const int MAX_CHAR_LEN = 40;
        const int MAX_SEQ_LEN = 20;
 
-        if (debugmode == 1 && !quiet)
+        if (debugmode == 1 && !is_quiet)
        {
            printf("\nProcessing %zu dry break strings...", kcpp_data->dry_sequence_breakers.size());
        }
@@ -2994,7 +2994,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
            }
            GetOverlappingTokenSequences(sequence_break, dry_sequence_breakers, MAX_SEQ_LEN);
        }
-        if (debugmode == 1 && !quiet)
+        if (debugmode == 1 && !is_quiet)
        {
            int trivial = 0, non_trivial = 0;
            for (const auto &seq : dry_sequence_breakers)
@@ -3014,7 +3014,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     }
 
     bool stream_sse = inputs.stream_sse;
-    bool allow_regular_prints = (!quiet && debugmode!=-1);
+    bool allow_regular_prints = (!is_quiet && debugmode!=-1);
 
     std::string grammarstr = inputs.grammar;
     bool grammar_retain_state = inputs.grammar_retain_state;
@@ -3047,7 +3047,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     if (kcpp_data->seed <= 0 || kcpp_data->seed==0xFFFFFFFF)
     {
         kcpp_data->seed = (((uint32_t)time(NULL)) % 1000000u);
-        if(debugmode==1 && !quiet)
+        if(debugmode==1 && !is_quiet)
         {
             printf("\nUsing Seed: %d",kcpp_data->seed);
         }
@@ -3079,7 +3079,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             }
             else
             {
-                if(debugmode==1 && !quiet)
+                if(debugmode==1 && !is_quiet)
                 {
                     printf("\nCreating clip image embed...");
                 }
@@ -3087,7 +3087,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 if (!llava_image_embed_make_with_clip_img(clp_ctx, kcpp_data->n_threads, clp_img_data, &llava_images[i].clp_img_embd, &llava_images[i].clp_image_tokens))
                 {
                     printf("\nError: Clip image %d failed to create embd!",i);
                 }
-                if(debugmode==1 && !quiet)
+                if(debugmode==1 && !is_quiet)
                 {
                     printf("\nLLAVA Clip Embed %i used Tokens: %d",i,llava_images[i].clp_image_tokens);
                 }
@@ -3210,7 +3210,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
     n_past = 0;
 
-    if (debugmode==1 && !quiet)
+    if (debugmode==1 && !is_quiet)
     {
         std::string outstr = "";
         printf("\n\n[Debug: Dump Raw Input Tokens, format: %d]\n", file_format);
@@ -3355,7 +3355,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         printf("\n");
     }
 
-    if (debugmode==1 && !quiet)
+    if (debugmode==1 && !is_quiet)
     {
         std::string outstr = "";
         printf("\n[Debug: Dump Forwarded Input Tokens, format: %d]\n", file_format);
@@ -3404,7 +3404,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 draft_used = true;
                 draft_results = speculative_decoding_eval_chunk(draft_ctx, llama_ctx_v4, embd, n_vocab, n_past);
                 evalres = draft_results.draft_success;
-                if(debugmode==1 && !quiet)
+                if(debugmode==1 && !is_quiet)
                 {
                     std::string draftedtoks = get_tok_vec_str(draft_results.draftids);
                     printf("\nDrafted %d Tokens: [%s]\n",speculative_chunk_amt,draftedtoks.c_str());
@@ -3607,7 +3607,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             if(draft_used)
             {
                 int32_t draftedid = draft_results.draftids[logits_sampled];
-                if(debugmode==1 && !quiet)
+                if(debugmode==1 && !is_quiet)
                 {
                     std::string drafttok = FileFormatTokenizeID(draftedid, file_format, true);
                     std::string realtok = FileFormatTokenizeID(id, file_format, true);
@@ -3660,7 +3660,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 {
                     printf("\rGenerating (%d / %d tokens)", (kcpp_data->n_predict - remaining_tokens), kcpp_data->n_predict);
                 }
-                if(debugmode==1 && !quiet && top_picks_history.size()>0)
+                if(debugmode==1 && !is_quiet && top_picks_history.size()>0)
                 {
                     printf(" [");
                     bool firstloop = true;
@@ -3912,7 +3912,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         delayed_generated_tokens.pop_front();
     }
 
-    if(debugmode==1 && !quiet && file_format == FileFormat::GGUF_GENERIC)
+    if(debugmode==1 && !is_quiet && file_format == FileFormat::GGUF_GENERIC)
     {
         printf("\n");
         llama_perf_context_print(llama_ctx_v4);
diff --git a/klite.embd b/klite.embd
index a583f5a0d..f6c6b0b5d 100644
--- a/klite.embd
+++ b/klite.embd
@@ -12,7 +12,7 @@
 Current version indicated by LITEVER below.
 -->
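
Note on the change above: the `quiet` flag now travels with the *load* inputs of every modality (text, image, whisper, tts) instead of each generation request, and gpttype_adapter.cpp latches it once into the file-scope `is_quiet` (renamed from `quiet` so the cached state is not confused with a per-request field). The sketch below is a minimal, self-contained illustration of that latch-at-load pattern; everything named `_sketch` is a hypothetical stand-in, not part of this patch.

    // latch_at_load_sketch.cpp -- illustrative only, mirrors the pattern of this patch
    #include <cstdio>

    struct load_model_inputs_sketch {
        // As in expose.h, the new member sits next to debugmode. These structs
        // are mirrored field-for-field across an FFI boundary, so field order
        // is effectively ABI and mirrors must be updated in the same position.
        bool quiet = false;
        int debugmode = 0;
    };

    static bool is_quiet = false; // set once at model load, read by all later calls

    static void load_model_sketch(const load_model_inputs_sketch & inputs) {
        is_quiet = inputs.quiet; // analogous to `is_quiet = inputs.quiet;` in gpttype_load_model
    }

    static void generate_sketch(int debugmode) {
        if (debugmode == 1 && !is_quiet) { // same guard shape as the rewritten printf sites
            std::printf("debug output (suppressed when quiet)\n");
        }
    }

    int main() {
        load_model_sketch({ /*quiet=*/ true, /*debugmode=*/ 1 });
        generate_sketch(1); // prints nothing: quiet was latched at load time
        return 0;
    }

One practical consequence for any embedder: because `quiet` is inserted before `debugmode` in `load_model_inputs` rather than appended at the end, code that mirrors these structs over the FFI boundary must add the field in the same position, or every member after it will be misaligned.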