diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 9c52feeb4..7bba001cc 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -2083,10 +2083,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } time2 = timer_check(); float pt1 = (time1*1000.0/(embd_inp.size()==0?1:embd_inp.size())); + float ts1 = (1000.0/pt1); int realnpredict = kcpp_params->n_predict-stopper_unused_tokens; float pt2 = (time2*1000.0/(realnpredict==0?1:realnpredict)); + float ts2 = (1000.0/pt2); float tokens_per_second = (realnpredict == 0 ? 0 : realnpredict / (time1 + time2)); - printf("\nContextLimit: %d/%d, Processing:%.2fs (%.1fms/T), Generation:%.2fs (%.1fms/T), Total:%.2fs (%.1fms/T = %.2fT/s)",current_context_tokens.size(),nctx, time1, pt1, time2, pt2, (time1 + time2), (1000.0f/tokens_per_second) , tokens_per_second); + printf("\nContextLimit: %d/%d, Processing:%.2fs (%.1fms/T = %.2fT/s), Generation:%.2fs (%.1fms/T = %.2fT/s), Total:%.2fs (%.1fms/T = %.2fT/s)",current_context_tokens.size(),nctx, time1, pt1, ts1, time2, pt2, ts2, (time1 + time2), (1000.0f/tokens_per_second) , tokens_per_second); fflush(stdout); output.status = 1; generation_finished = true; diff --git a/koboldcpp.py b/koboldcpp.py index 8f72fbe2c..dd02ba239 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -334,9 +334,9 @@ def generate(prompt, memory="", max_length=32, max_context_length=512, temperatu outputs = ctypes.create_unicode_buffer(ctypes.sizeof(generation_outputs)) inputs.prompt = prompt.encode("UTF-8") inputs.memory = memory.encode("UTF-8") - if max_length >= max_context_length: + if max_length >= (max_context_length-1): max_length = max_context_length-1 - print("\nWARNING: You are trying to generate with max_length near or exceeding max_context_length. Most of the context will be gone and your outputs will not be very coherent.") + print("\nWarning: You are trying to generate with max_length near or exceeding max_context_length. Most of the context will be removed, and your outputs will not be very coherent.") global showmaxctxwarning if max_context_length > maxctx: if showmaxctxwarning: