Renamed promptlimit to genlimit; it now applies to API requests as well and can be set in the UI. Hide the API info display when running in CLI mode.

This commit is contained in:
Concedo 2025-08-30 00:26:05 +08:00
parent 3060dfb99f
commit 645b09ea20
2 changed files with 27 additions and 3 deletions

View file

@ -3725,6 +3725,15 @@ Current version indicated by LITEVER below.
"name":"OpenAI Harmony",
"user":"<|start|>user<|message|>",
"user_end":"<|end|>",
"assistant":"<|start|>assistant",
"assistant_end":"<|end|>",
"system":"<|start|>developer<|message|>",
"system_end":"<|end|>",
},
{
"name":"OpenAI Harmony Non-Thinking",
"user":"<|start|>user<|message|>",
"user_end":"<|end|>",
"assistant":"<|start|>assistant<|channel|>final<|message|>",
"assistant_end":"<|end|>",
"system":"<|start|>developer<|message|>",
@ -17063,6 +17072,8 @@ Current version indicated by LITEVER below.
}
}
}
//special case for GPT-OSS, never use it as a stop sequence or stuff gets messed
seqs = seqs.filter(itm => !itm.includes("<|start|>assistant"));
if(!localsettings.includedefaultstops)
{
seqs = [];
@ -19012,6 +19023,9 @@ Current version indicated by LITEVER below.
}
}
//special case for GPT-OSS, never use it as a stop sequence or stuff gets messed
stripping_arr = stripping_arr.filter(itm => !itm.includes("<|start|>assistant"));
//sometimes the OAI type endpoints get confused and repeat the instruct tag, so trim it
for(let i=0;i<stripping_arr.length;++i)
{

View file

@ -1510,6 +1510,8 @@ def generate(genparams, stream_flag=False):
max_context_length = maxctx
min_remain_hardlimit = max(min(max_context_length-4, 16),int(max_context_length*0.2))
min_remain_softlimit = max(min(max_context_length-4, 16),int(max_context_length*0.4))
if args.genlimit > 0 and max_length > args.genlimit:
max_length = args.genlimit
if max_length >= (max_context_length-min_remain_softlimit):
print(f"\n!!! ====== !!!\nWarning: You are trying to generate text with max_length ({max_length}) near or exceeding max_context_length limit ({max_context_length}).\nMost of the context will be removed, and your outputs will not be very coherent.\nConsider launching with increased --contextsize to avoid issues.\n!!! ====== !!!")
if max_length >= (max_context_length-min_remain_hardlimit):
@ -4555,6 +4557,7 @@ def show_gui():
moeexperts_var = ctk.StringVar(value=str(-1))
moecpu_var = ctk.StringVar(value=str(0))
defaultgenamt_var = ctk.StringVar(value=str(768))
genlimit_var = ctk.StringVar(value=str(0))
nobostoken_var = ctk.IntVar(value=0)
override_kv_var = ctk.StringVar(value="")
override_tensors_var = ctk.StringVar(value="")
@ -5223,6 +5226,7 @@ def show_gui():
makeslider(tokens_tab, "Context Size:",contextsize_text, context_var, 0, len(contextsize_text)-1, 18, width=280, set=7,tooltip="What is the maximum context size to support. Model specific. You cannot exceed it.\nLarger contexts require more memory, and not all models support it.")
context_var.trace_add("write", changed_gpulayers_estimate)
makelabelentry(tokens_tab, "Default Gen Amt:", defaultgenamt_var, row=20, padx=120, singleline=True, tooltip="How many tokens to generate by default, if not specified. Must be smaller than context size. Usually, your frontend GUI will override this.")
makelabelentry(tokens_tab, "Prompt Limit:", genlimit_var, row=20, padx=300, singleline=True, tooltip="If set, restricts max output tokens to this limit regardless of API request. Set to 0 to disable.",labelpadx=210)
nativectx_entry, nativectx_label = makelabelentry(tokens_tab, "Override Native Context:", customrope_nativectx, row=23, padx=146, singleline=True, tooltip="Overrides the native trained context of the loaded model with a custom value to be used for Rope scaling.")
customrope_scale_entry, customrope_scale_label = makelabelentry(tokens_tab, "RoPE Scale:", customrope_scale, row=23, padx=100, singleline=True, tooltip="For Linear RoPE scaling. RoPE frequency scale.")
@ -5559,6 +5563,7 @@ def show_gui():
args.moeexperts = int(moeexperts_var.get()) if moeexperts_var.get()!="" else -1
args.moecpu = int(moecpu_var.get()) if moecpu_var.get()!="" else 0
args.defaultgenamt = int(defaultgenamt_var.get()) if defaultgenamt_var.get()!="" else 768
args.genlimit = int(genlimit_var.get()) if genlimit_var.get()!="" else 0
args.nobostoken = (nobostoken_var.get()==1)
args.enableguidance = (enableguidance_var.get()==1)
args.overridekv = None if override_kv_var.get() == "" else override_kv_var.get()
@ -5784,6 +5789,10 @@ def show_gui():
moecpu_var.set(dict["moecpu"])
if "defaultgenamt" in dict and dict["defaultgenamt"]:
defaultgenamt_var.set(dict["defaultgenamt"])
if "genlimit" in dict and dict["genlimit"]:
genlimit_var.set(dict["genlimit"])
else:
genlimit_var.set(str(0))
nobostoken_var.set(dict["nobostoken"] if ("nobostoken" in dict) else 0)
enableguidance_var.set(dict["enableguidance"] if ("enableguidance" in dict) else 0)
if "overridekv" in dict and dict["overridekv"]:
@ -7362,7 +7371,8 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
print(f"======\nActive Modules: {' '.join(enabledmlist)}")
print(f"Inactive Modules: {' '.join(disabledmlist)}")
print(f"Enabled APIs: {' '.join(apimlist)}")
if not args.cli:
print(f"Enabled APIs: {' '.join(apimlist)}")
global sslvalid
if args.ssl:
@ -7447,7 +7457,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
else:
save_to_file = (args.benchmark and args.benchmark!="stdout" and args.benchmark!="")
benchmaxctx = maxctx
benchlen = args.promptlimit
benchlen = args.genlimit if args.genlimit > 0 else 100
benchtemp = 0.1
benchtopk = 1
benchreppen = 1
@ -7593,7 +7603,7 @@ if __name__ == '__main__':
advparser.add_argument("--benchmark", help="Do not start server, instead run benchmarks. If filename is provided, appends results to provided file.", metavar=('[filename]'), nargs='?', const="stdout", type=str, default=None)
advparser.add_argument("--prompt", metavar=('[prompt]'), help="Passing a prompt string triggers a direct inference, loading the model, outputs the response to stdout and exits. Can be used alone or with benchmark.", type=str, default="")
advparser.add_argument("--cli", help="Does not launch KoboldCpp HTTP server. Instead, enables KoboldCpp from the command line, accepting interactive console input and displaying responses to the terminal.", action='store_true')
advparser.add_argument("--promptlimit", help="Sets the maximum number of generated tokens, usable only with --prompt or --benchmark",metavar=('[token limit]'), type=int, default=100)
advparser.add_argument("--genlimit","--promptlimit", help="Sets the maximum number of generated tokens, it will restrict all generations to this or lower. Also usable with --prompt or --benchmark.",metavar=('[token limit]'), type=int, default=0)
advparser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", metavar=('limit'), nargs='?', const=1, type=int, default=1)
advparser.add_argument("--multiplayer", help="Hosts a shared multiplayer session that others can join.", action='store_true')
advparser.add_argument("--websearch", help="Enable the local search engine proxy so Web Searches can be done.", action='store_true')