Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 17:14:36 +00:00)
show warning if genamt >= ctxsize, show t/s values

parent 71cc19e76d
commit 340fbbbb04

2 changed files with 5 additions and 3 deletions
@@ -2083,10 +2083,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     }
     time2 = timer_check();
     float pt1 = (time1*1000.0/(embd_inp.size()==0?1:embd_inp.size()));
+    float ts1 = (1000.0/pt1);
     int realnpredict = kcpp_params->n_predict-stopper_unused_tokens;
     float pt2 = (time2*1000.0/(realnpredict==0?1:realnpredict));
+    float ts2 = (1000.0/pt2);
     float tokens_per_second = (realnpredict == 0 ? 0 : realnpredict / (time1 + time2));
-    printf("\nContextLimit: %d/%d, Processing:%.2fs (%.1fms/T), Generation:%.2fs (%.1fms/T), Total:%.2fs (%.1fms/T = %.2fT/s)",current_context_tokens.size(),nctx, time1, pt1, time2, pt2, (time1 + time2), (1000.0f/tokens_per_second) , tokens_per_second);
+    printf("\nContextLimit: %d/%d, Processing:%.2fs (%.1fms/T = %.2fT/s), Generation:%.2fs (%.1fms/T = %.2fT/s), Total:%.2fs (%.1fms/T = %.2fT/s)",current_context_tokens.size(),nctx, time1, pt1, ts1, time2, pt2, ts2, (time1 + time2), (1000.0f/tokens_per_second) , tokens_per_second);
     fflush(stdout);
     output.status = 1;
     generation_finished = true;
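The arithmetic behind the new ts1/ts2 values: pt1 and pt2 are milliseconds per token for the prompt-processing and generation phases, and the patch simply inverts them into tokens per second so both units appear in the log line. A minimal standalone sketch of that math (hypothetical phase_stats helper, not koboldcpp code; assumes the phase took a nonzero amount of time):

def phase_stats(seconds, tokens):
    # Return (ms per token, tokens per second) for one phase, guarding
    # against a zero token count the same way the C++ code does.
    pt = seconds * 1000.0 / (tokens if tokens != 0 else 1)  # ms/T
    ts = 1000.0 / pt                                        # T/s
    return pt, ts

# Example: 2.5s processing 100 prompt tokens, 6.0s generating 80 tokens.
pt1, ts1 = phase_stats(2.5, 100)   # 25.0 ms/T, 40.00 T/s
pt2, ts2 = phase_stats(6.0, 80)    # 75.0 ms/T, 13.33 T/s
print("Processing:%.2fs (%.1fms/T = %.2fT/s), Generation:%.2fs (%.1fms/T = %.2fT/s)"
      % (2.5, pt1, ts1, 6.0, pt2, ts2))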
@@ -334,9 +334,9 @@ def generate(prompt, memory="", max_length=32, max_context_length=512, temperatu
     outputs = ctypes.create_unicode_buffer(ctypes.sizeof(generation_outputs))
     inputs.prompt = prompt.encode("UTF-8")
     inputs.memory = memory.encode("UTF-8")
-    if max_length >= max_context_length:
+    if max_length >= (max_context_length-1):
        max_length = max_context_length-1
-        print("\nWARNING: You are trying to generate with max_length near or exceeding max_context_length. Most of the context will be gone and your outputs will not be very coherent.")
+        print("\nWarning: You are trying to generate with max_length near or exceeding max_context_length. Most of the context will be removed, and your outputs will not be very coherent.")
     global showmaxctxwarning
     if max_context_length > maxctx:
         if showmaxctxwarning:
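The Python-side change tightens the warning condition: the old test only fired once max_length reached max_context_length, while the new >= (max_context_length-1) test also catches the boundary value the clamp would assign anyway, so the warning prints whenever generation is about to consume essentially the whole context. A self-contained sketch of the before/after behavior (hypothetical clamp_max_length helper, not the real generate()):

def clamp_max_length(max_length, max_context_length):
    # Clamp the requested generation amount so at least one token of
    # context survives, warning whenever the clamp is (nearly) hit.
    if max_length >= (max_context_length - 1):
        max_length = max_context_length - 1
        print("Warning: max_length near or exceeding max_context_length.")
    return max_length

print(clamp_max_length(600, 512))  # 511, warns (old >= test also caught this)
print(clamp_max_length(511, 512))  # 511, warns (only the new test catches this)
print(clamp_max_length(256, 512))  # 256, no warning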