Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2026-05-08 18:30:50 +00:00)
renamed promptlimit to genlimit; it now applies to API requests as well and can be set in the UI. Also hide the API info display when running in CLI mode.
parent 3060dfb99f
commit 645b09ea20
2 changed files with 27 additions and 3 deletions

klite.embd (14 changes)
@@ -3725,6 +3725,15 @@ Current version indicated by LITEVER below.
"name":"OpenAI Harmony",
"user":"<|start|>user<|message|>",
"user_end":"<|end|>",
"assistant":"<|start|>assistant",
"assistant_end":"<|end|>",
"system":"<|start|>developer<|message|>",
"system_end":"<|end|>",
},
{
"name":"OpenAI Harmony Non-Thinking",
"user":"<|start|>user<|message|>",
"user_end":"<|end|>",
"assistant":"<|start|>assistant<|channel|>final<|message|>",
"assistant_end":"<|end|>",
"system":"<|start|>developer<|message|>",
@@ -17063,6 +17072,8 @@ Current version indicated by LITEVER below.
}
}
}
//special case for GPT-OSS, never use it as a stop sequence or stuff gets messed
seqs = seqs.filter(itm => !itm.includes("<|start|>assistant"));
if(!localsettings.includedefaultstops)
{
    seqs = [];
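The special case presumably exists because GPT-OSS re-emits <|start|>assistant... when it switches to its final answer channel, so keeping it as a stop sequence would truncate the reply. A plain-Python rendering of the filter, for illustration only (the real code above is JavaScript); the seqs list here is a stand-in for the stop sequences Lite builds from the instruct tags.

# Illustration: never keep the Harmony assistant tag as a stop sequence.
seqs = ["<|start|>user<|message|>",
        "<|start|>assistant<|channel|>final<|message|>",
        "<|end|>"]  # stand-in values, not the real list
seqs = [s for s in seqs if "<|start|>assistant" not in s]
print(seqs)  # ['<|start|>user<|message|>', '<|end|>']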
@@ -19012,6 +19023,9 @@ Current version indicated by LITEVER below.
}
}

//special case for GPT-OSS, never use it as a stop sequence or stuff gets messed
stripping_arr = stripping_arr.filter(itm => !itm.includes("<|start|>assistant"));

//sometimes the OAI type endpoints get confused and repeat the instruct tag, so trim it
for(let i=0;i<stripping_arr.length;++i)
{
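The body of that loop is outside this hunk, but the comment states the intent: OpenAI-style endpoints occasionally echo an instruct tag at the start of the reply, and those tags get trimmed off. The Python sketch below is only a guess at what such trimming looks like; trim_repeated_tags is a hypothetical helper, not a function from koboldcpp or Lite.

# Hypothetical sketch: strip instruct tags that an OAI-style endpoint
# echoed back at the start of the generated text. Illustrative only.
def trim_repeated_tags(text, tags):
    changed = True
    while changed:
        changed = False
        for tag in tags:
            if tag and text.startswith(tag):
                text = text[len(tag):]
                changed = True
    return text

print(trim_repeated_tags("<|start|>assistant<|channel|>final<|message|>Hello!",
                         ["<|start|>assistant<|channel|>final<|message|>"]))
# -> "Hello!"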

koboldcpp.py (16 changes)
@@ -1510,6 +1510,8 @@ def generate(genparams, stream_flag=False):
max_context_length = maxctx
min_remain_hardlimit = max(min(max_context_length-4, 16),int(max_context_length*0.2))
min_remain_softlimit = max(min(max_context_length-4, 16),int(max_context_length*0.4))
if args.genlimit > 0 and max_length > args.genlimit:
    max_length = args.genlimit
if max_length >= (max_context_length-min_remain_softlimit):
    print(f"\n!!! ====== !!!\nWarning: You are trying to generate text with max_length ({max_length}) near or exceeding max_context_length limit ({max_context_length}).\nMost of the context will be removed, and your outputs will not be very coherent.\nConsider launching with increased --contextsize to avoid issues.\n!!! ====== !!!")
if max_length >= (max_context_length-min_remain_hardlimit):
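This is the part that makes the renamed flag apply to API requests: whatever max_length a request asks for is clamped to args.genlimit whenever the flag is non-zero, before the existing context-size sanity checks run. A worked sketch of the arithmetic, assuming a 4096-token context and a --genlimit of 512 (example values only):

# Worked example of the limits above, assuming a 4096-token context.
max_context_length = 4096
genlimit = 512               # example --genlimit value; 0 would disable the cap
requested_max_length = 2048  # what the API request asked for

min_remain_hardlimit = max(min(max_context_length - 4, 16), int(max_context_length * 0.2))  # 819
min_remain_softlimit = max(min(max_context_length - 4, 16), int(max_context_length * 0.4))  # 1638

max_length = requested_max_length
if genlimit > 0 and max_length > genlimit:
    max_length = genlimit  # request capped to 512

# With these numbers the soft warning fires at max_length >= 2458 and the
# hard-limit branch at max_length >= 3277, so the capped request stays safe.
print(max_length, max_context_length - min_remain_softlimit, max_context_length - min_remain_hardlimit)
# -> 512 2458 3277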
@@ -4555,6 +4557,7 @@ def show_gui():
moeexperts_var = ctk.StringVar(value=str(-1))
moecpu_var = ctk.StringVar(value=str(0))
defaultgenamt_var = ctk.StringVar(value=str(768))
genlimit_var = ctk.StringVar(value=str(0))
nobostoken_var = ctk.IntVar(value=0)
override_kv_var = ctk.StringVar(value="")
override_tensors_var = ctk.StringVar(value="")
@@ -5223,6 +5226,7 @@ def show_gui():
makeslider(tokens_tab, "Context Size:",contextsize_text, context_var, 0, len(contextsize_text)-1, 18, width=280, set=7,tooltip="What is the maximum context size to support. Model specific. You cannot exceed it.\nLarger contexts require more memory, and not all models support it.")
context_var.trace_add("write", changed_gpulayers_estimate)
makelabelentry(tokens_tab, "Default Gen Amt:", defaultgenamt_var, row=20, padx=120, singleline=True, tooltip="How many tokens to generate by default, if not specified. Must be smaller than context size. Usually, your frontend GUI will override this.")
makelabelentry(tokens_tab, "Prompt Limit:", genlimit_var, row=20, padx=300, singleline=True, tooltip="If set, restricts max output tokens to this limit regardless of API request. Set to 0 to disable.",labelpadx=210)

nativectx_entry, nativectx_label = makelabelentry(tokens_tab, "Override Native Context:", customrope_nativectx, row=23, padx=146, singleline=True, tooltip="Overrides the native trained context of the loaded model with a custom value to be used for Rope scaling.")
customrope_scale_entry, customrope_scale_label = makelabelentry(tokens_tab, "RoPE Scale:", customrope_scale, row=23, padx=100, singleline=True, tooltip="For Linear RoPE scaling. RoPE frequency scale.")
@@ -5559,6 +5563,7 @@ def show_gui():
args.moeexperts = int(moeexperts_var.get()) if moeexperts_var.get()!="" else -1
args.moecpu = int(moecpu_var.get()) if moecpu_var.get()!="" else 0
args.defaultgenamt = int(defaultgenamt_var.get()) if defaultgenamt_var.get()!="" else 768
args.genlimit = int(genlimit_var.get()) if genlimit_var.get()!="" else 0
args.nobostoken = (nobostoken_var.get()==1)
args.enableguidance = (enableguidance_var.get()==1)
args.overridekv = None if override_kv_var.get() == "" else override_kv_var.get()
@@ -5784,6 +5789,10 @@ def show_gui():
    moecpu_var.set(dict["moecpu"])
if "defaultgenamt" in dict and dict["defaultgenamt"]:
    defaultgenamt_var.set(dict["defaultgenamt"])
if "genlimit" in dict and dict["genlimit"]:
    genlimit_var.set(dict["genlimit"])
else:
    genlimit_var.set(str(0))
nobostoken_var.set(dict["nobostoken"] if ("nobostoken" in dict) else 0)
enableguidance_var.set(dict["enableguidance"] if ("enableguidance" in dict) else 0)
if "overridekv" in dict and dict["overridekv"]:
@@ -7362,7 +7371,8 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):

print(f"======\nActive Modules: {' '.join(enabledmlist)}")
print(f"Inactive Modules: {' '.join(disabledmlist)}")
print(f"Enabled APIs: {' '.join(apimlist)}")
if not args.cli:
    print(f"Enabled APIs: {' '.join(apimlist)}")

global sslvalid
if args.ssl:
@@ -7447,7 +7457,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
else:
    save_to_file = (args.benchmark and args.benchmark!="stdout" and args.benchmark!="")
benchmaxctx = maxctx
benchlen = args.promptlimit
benchlen = args.genlimit if args.genlimit > 0 else 100
benchtemp = 0.1
benchtopk = 1
benchreppen = 1
@@ -7593,7 +7603,7 @@ if __name__ == '__main__':
advparser.add_argument("--benchmark", help="Do not start server, instead run benchmarks. If filename is provided, appends results to provided file.", metavar=('[filename]'), nargs='?', const="stdout", type=str, default=None)
advparser.add_argument("--prompt", metavar=('[prompt]'), help="Passing a prompt string triggers a direct inference, loading the model, outputs the response to stdout and exits. Can be used alone or with benchmark.", type=str, default="")
advparser.add_argument("--cli", help="Does not launch KoboldCpp HTTP server. Instead, enables KoboldCpp from the command line, accepting interactive console input and displaying responses to the terminal.", action='store_true')
advparser.add_argument("--promptlimit", help="Sets the maximum number of generated tokens, usable only with --prompt or --benchmark",metavar=('[token limit]'), type=int, default=100)
advparser.add_argument("--genlimit","--promptlimit", help="Sets the maximum number of generated tokens, it will restrict all generations to this or lower. Also usable with --prompt or --benchmark.",metavar=('[token limit]'), type=int, default=0)
advparser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", metavar=('limit'), nargs='?', const=1, type=int, default=1)
advparser.add_argument("--multiplayer", help="Hosts a shared multiplayer session that others can join.", action='store_true')
advparser.add_argument("--websearch", help="Enable the local search engine proxy so Web Searches can be done.", action='store_true')
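Listing both option strings keeps --promptlimit usable as an alias of the new flag: argparse derives the destination from the first long option, so either spelling ends up in args.genlimit. A self-contained sketch (example values only):

import argparse

# Sketch of the aliasing used above: both flags feed the same destination.
parser = argparse.ArgumentParser()
parser.add_argument("--genlimit", "--promptlimit", metavar='[token limit]',
                    type=int, default=0)

print(parser.parse_args(["--promptlimit", "256"]).genlimit)  # 256
print(parser.parse_args(["--genlimit", "512"]).genlimit)     # 512
print(parser.parse_args([]).genlimit)                        # 0 (cap disabled)

Note that the default also changes from 100 to 0, which is why the benchmark hunk above now supplies the old 100-token fallback explicitly (benchlen = args.genlimit if args.genlimit > 0 else 100).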