From 645b09ea20ab7b0d2a51129ce843a0983d578b5f Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sat, 30 Aug 2025 00:26:05 +0800 Subject: [PATCH] renamed promptlimit to genlimit, now applies to API requests as well, can be set in the ui. hide API info display if running in CLI mode. --- klite.embd | 14 ++++++++++++++ koboldcpp.py | 16 +++++++++++++--- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/klite.embd b/klite.embd index 7dbeee0ca..c6a94113a 100644 --- a/klite.embd +++ b/klite.embd @@ -3725,6 +3725,15 @@ Current version indicated by LITEVER below. "name":"OpenAI Harmony", "user":"<|start|>user<|message|>", "user_end":"<|end|>", + "assistant":"<|start|>assistant", + "assistant_end":"<|end|>", + "system":"<|start|>developer<|message|>", + "system_end":"<|end|>", + }, + { + "name":"OpenAI Harmony Non-Thinking", + "user":"<|start|>user<|message|>", + "user_end":"<|end|>", "assistant":"<|start|>assistant<|channel|>final<|message|>", "assistant_end":"<|end|>", "system":"<|start|>developer<|message|>", @@ -17063,6 +17072,8 @@ Current version indicated by LITEVER below. } } } + //special case for GPT-OSS, never use it as a stop sequence or stuff gets messed + seqs = seqs.filter(itm => !itm.includes("<|start|>assistant")); if(!localsettings.includedefaultstops) { seqs = []; @@ -19012,6 +19023,9 @@ Current version indicated by LITEVER below. } } + //special case for GPT-OSS, never use it as a stop sequence or stuff gets messed + stripping_arr = stripping_arr.filter(itm => !itm.includes("<|start|>assistant")); + //sometimes the OAI type endpoints get confused and repeat the instruct tag, so trim it for(let i=0;i 0 and max_length > args.genlimit: + max_length = args.genlimit if max_length >= (max_context_length-min_remain_softlimit): print(f"\n!!! ====== !!!\nWarning: You are trying to generate text with max_length ({max_length}) near or exceeding max_context_length limit ({max_context_length}).\nMost of the context will be removed, and your outputs will not be very coherent.\nConsider launching with increased --contextsize to avoid issues.\n!!! ====== !!!") if max_length >= (max_context_length-min_remain_hardlimit): @@ -4555,6 +4557,7 @@ def show_gui(): moeexperts_var = ctk.StringVar(value=str(-1)) moecpu_var = ctk.StringVar(value=str(0)) defaultgenamt_var = ctk.StringVar(value=str(768)) + genlimit_var = ctk.StringVar(value=str(0)) nobostoken_var = ctk.IntVar(value=0) override_kv_var = ctk.StringVar(value="") override_tensors_var = ctk.StringVar(value="") @@ -5223,6 +5226,7 @@ def show_gui(): makeslider(tokens_tab, "Context Size:",contextsize_text, context_var, 0, len(contextsize_text)-1, 18, width=280, set=7,tooltip="What is the maximum context size to support. Model specific. You cannot exceed it.\nLarger contexts require more memory, and not all models support it.") context_var.trace_add("write", changed_gpulayers_estimate) makelabelentry(tokens_tab, "Default Gen Amt:", defaultgenamt_var, row=20, padx=120, singleline=True, tooltip="How many tokens to generate by default, if not specified. Must be smaller than context size. Usually, your frontend GUI will override this.") + makelabelentry(tokens_tab, "Prompt Limit:", genlimit_var, row=20, padx=300, singleline=True, tooltip="If set, restricts max output tokens to this limit regardless of API request. Set to 0 to disable.",labelpadx=210) nativectx_entry, nativectx_label = makelabelentry(tokens_tab, "Override Native Context:", customrope_nativectx, row=23, padx=146, singleline=True, tooltip="Overrides the native trained context of the loaded model with a custom value to be used for Rope scaling.") customrope_scale_entry, customrope_scale_label = makelabelentry(tokens_tab, "RoPE Scale:", customrope_scale, row=23, padx=100, singleline=True, tooltip="For Linear RoPE scaling. RoPE frequency scale.") @@ -5559,6 +5563,7 @@ def show_gui(): args.moeexperts = int(moeexperts_var.get()) if moeexperts_var.get()!="" else -1 args.moecpu = int(moecpu_var.get()) if moecpu_var.get()!="" else 0 args.defaultgenamt = int(defaultgenamt_var.get()) if defaultgenamt_var.get()!="" else 768 + args.genlimit = int(genlimit_var.get()) if genlimit_var.get()!="" else 0 args.nobostoken = (nobostoken_var.get()==1) args.enableguidance = (enableguidance_var.get()==1) args.overridekv = None if override_kv_var.get() == "" else override_kv_var.get() @@ -5784,6 +5789,10 @@ def show_gui(): moecpu_var.set(dict["moecpu"]) if "defaultgenamt" in dict and dict["defaultgenamt"]: defaultgenamt_var.set(dict["defaultgenamt"]) + if "genlimit" in dict and dict["genlimit"]: + genlimit_var.set(dict["genlimit"]) + else: + genlimit_var.set(str(0)) nobostoken_var.set(dict["nobostoken"] if ("nobostoken" in dict) else 0) enableguidance_var.set(dict["enableguidance"] if ("enableguidance" in dict) else 0) if "overridekv" in dict and dict["overridekv"]: @@ -7362,7 +7371,8 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False): print(f"======\nActive Modules: {' '.join(enabledmlist)}") print(f"Inactive Modules: {' '.join(disabledmlist)}") - print(f"Enabled APIs: {' '.join(apimlist)}") + if not args.cli: + print(f"Enabled APIs: {' '.join(apimlist)}") global sslvalid if args.ssl: @@ -7447,7 +7457,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False): else: save_to_file = (args.benchmark and args.benchmark!="stdout" and args.benchmark!="") benchmaxctx = maxctx - benchlen = args.promptlimit + benchlen = args.genlimit if args.genlimit > 0 else 100 benchtemp = 0.1 benchtopk = 1 benchreppen = 1 @@ -7593,7 +7603,7 @@ if __name__ == '__main__': advparser.add_argument("--benchmark", help="Do not start server, instead run benchmarks. If filename is provided, appends results to provided file.", metavar=('[filename]'), nargs='?', const="stdout", type=str, default=None) advparser.add_argument("--prompt", metavar=('[prompt]'), help="Passing a prompt string triggers a direct inference, loading the model, outputs the response to stdout and exits. Can be used alone or with benchmark.", type=str, default="") advparser.add_argument("--cli", help="Does not launch KoboldCpp HTTP server. Instead, enables KoboldCpp from the command line, accepting interactive console input and displaying responses to the terminal.", action='store_true') - advparser.add_argument("--promptlimit", help="Sets the maximum number of generated tokens, usable only with --prompt or --benchmark",metavar=('[token limit]'), type=int, default=100) + advparser.add_argument("--genlimit","--promptlimit", help="Sets the maximum number of generated tokens, it will restrict all generations to this or lower. Also usable with --prompt or --benchmark.",metavar=('[token limit]'), type=int, default=0) advparser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", metavar=('limit'), nargs='?', const=1, type=int, default=1) advparser.add_argument("--multiplayer", help="Hosts a shared multiplayer session that others can join.", action='store_true') advparser.add_argument("--websearch", help="Enable the local search engine proxy so Web Searches can be done.", action='store_true')