Added CLI chat mode

Minor CLI fixes (+1 squashed commit)

Squashed commits:

[60af39a9] Added CLI chat mode
Concedo 2025-03-26 19:05:38 +08:00
parent 75e7902789
commit b4a8a5a278

@@ -3529,6 +3529,7 @@ def show_gui():
usemlock = ctk.IntVar()
debugmode = ctk.IntVar()
keepforeground = ctk.IntVar()
terminalonly = ctk.IntVar()
quietmode = ctk.IntVar(value=0)
nocertifymode = ctk.IntVar(value=0)
@@ -4020,7 +4021,8 @@ def show_gui():
"Use MMAP": [usemmap, "Use mmap to load models if enabled, model will not be unloadable"],
"Use mlock": [usemlock, "Enables mlock, preventing the RAM used to load the model from being paged out."],
"Debug Mode": [debugmode, "Enables debug mode, with extra info printed to the terminal."],
"Keep Foreground": [keepforeground, "Bring KoboldCpp to the foreground every time there is a new generation."]
"Keep Foreground": [keepforeground, "Bring KoboldCpp to the foreground every time there is a new generation."],
"CLI Terminal Only": [terminalonly, "Does not launch KoboldCpp HTTP server. Instead, enables KoboldCpp from the command line, accepting interactive console input and displaying responses to the terminal."]
}
for idx, (name, properties) in enumerate(hardware_boxes.items()):
@@ -4267,6 +4269,7 @@ def show_gui():
args.nofastforward = fastforward.get()==0
args.remotetunnel = remotetunnel.get()==1
args.foreground = keepforeground.get()==1
args.cli = terminalonly.get()==1
args.quiet = quietmode.get()==1
args.nocertify = nocertifymode.get()==1
args.nomodel = nomodel.get()==1
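
Note: the new checkbox follows the launcher's existing pattern, a customtkinter IntVar holding 0/1 that is collapsed to a boolean when the argument namespace is built. A minimal sketch of that pattern, with illustrative names rather than the actual KoboldCpp widget layout:

import customtkinter as ctk

root = ctk.CTk()  # a root window must exist before creating variables
terminalonly = ctk.IntVar()  # 0 = unchecked, 1 = checked
ctk.CTkCheckBox(root, text="CLI Terminal Only", variable=terminalonly).pack()

class Args:
    pass

args = Args()
args.cli = terminalonly.get() == 1  # same 0/1-to-bool collapse as args.cli above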
@@ -4425,7 +4428,7 @@ def show_gui():
args.ttsgpu = (ttsgpu_var.get()==1)
args.ttsmaxlen = int(ttsmaxlen_var.get())
args.admin = (admin_var.get()==1)
args.admin = (admin_var.get()==1 and not args.cli)
args.admindir = admin_dir_var.get()
args.adminpassword = admin_password_var.get()
@@ -4448,6 +4451,7 @@ def show_gui():
fastforward.set(0 if "nofastforward" in dict and dict["nofastforward"] else 1)
remotetunnel.set(1 if "remotetunnel" in dict and dict["remotetunnel"] else 0)
keepforeground.set(1 if "foreground" in dict and dict["foreground"] else 0)
terminalonly.set(1 if "cli" in dict and dict["cli"] else 0)
quietmode.set(1 if "quiet" in dict and dict["quiet"] else 0)
nocertifymode.set(1 if "nocertify" in dict and dict["nocertify"] else 0)
nomodel.set(1 if "nomodel" in dict and dict["nomodel"] else 0)
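
Note: launcher settings round-trip through a plain dict, so the flag only needs the symmetric pair shown above: args.cli written on save, dict["cli"] read back on load. A sketch of that round-trip, assuming the saved config is JSON (the filename here is a placeholder):

import json

settings = {"cli": True, "quiet": False}  # subset of a saved launcher config
with open("example.kcpps", "w") as f:
    json.dump(settings, f)

with open("example.kcpps") as f:
    loaded = json.load(f)
terminalonly_value = 1 if "cli" in loaded and loaded["cli"] else 0  # mirrors the .set() call above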
@@ -5289,9 +5293,13 @@ def main(launch_args, default_args):
print(f"{KcppVersion}") # just print version and exit
return
#prevent disallowed combos
if (args.nomodel or args.benchmark or args.launch or args.admin) and args.cli:
exit_with_error(1, "Error: --cli cannot be combined with --launch, --nomodel, --admin or --benchmark")
args = convert_outdated_args(args)
temp_hide_print = (args.model_param and args.prompt and not args.benchmark and not (args.debugmode >= 1))
temp_hide_print = (args.model_param and (args.prompt and not args.cli) and not args.benchmark and not (args.debugmode >= 1))
if not temp_hide_print:
print(f"***\nWelcome to KoboldCpp - Version {KcppVersion}")
@@ -5368,7 +5376,7 @@ def main(launch_args, default_args):
print("\nWARNING: Admin was set without selecting an admin directory. Admin cannot be used.\n")
if not args.admin: #run in single process mode
if args.remotetunnel and not args.prompt and not args.benchmark:
if args.remotetunnel and not args.prompt and not args.benchmark and not args.cli:
setuptunnel(global_memory, True if args.sdmodel else False)
kcpp_main_process(args,global_memory,using_gui_launcher)
if global_memory["input_to_exit"]:
@@ -5379,7 +5387,7 @@ def main(launch_args, default_args):
with multiprocessing.Manager() as mp_manager:
global_memory = mp_manager.dict({"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False})
if args.remotetunnel and not args.prompt and not args.benchmark:
if args.remotetunnel and not args.prompt and not args.benchmark and not args.cli:
setuptunnel(global_memory, True if args.sdmodel else False)
# invoke the main koboldcpp process
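
Note: both the single-process and multiprocessing branches now skip tunnel setup for CLI sessions, since a remote tunnel only makes sense when an HTTP endpoint will be exposed. The shared condition, extracted as a sketch:

def wants_tunnel(args):
    # No tunnel for one-shot prompts, benchmarks, or interactive CLI chat.
    return args.remotetunnel and not args.prompt and not args.benchmark and not args.cli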
@@ -5459,10 +5467,10 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
using_gui_launcher = gui_launcher
start_time = time.time()
if args.model_param and args.prompt and not args.benchmark and not (args.debugmode >= 1):
if args.model_param and (args.prompt and not args.cli) and not args.benchmark and not (args.debugmode >= 1):
suppress_stdout()
if args.model_param and (args.benchmark or args.prompt):
if args.model_param and (args.benchmark or args.prompt or args.cli):
start_server = False
#try to read story if provided
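
Note: --cli joins --benchmark and --prompt as a mode that loads the model but never starts the HTTP server. A sketch of the resulting dispatch (the mode names are illustrative):

def select_mode(args):
    if args.model_param and (args.benchmark or args.prompt or args.cli):
        # corresponds to start_server = False in the code above
        return "benchmark" if args.benchmark else ("cli-chat" if args.cli else "one-shot")
    return "server"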
@@ -5985,34 +5993,36 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
endpoint_url = f"{httpsaffix}://localhost:{args.port}"
else:
endpoint_url = f"{httpsaffix}://{args.host}:{args.port}"
if not args.remotetunnel:
print(f"Starting Kobold API on port {args.port} at {endpoint_url}/api/")
print(f"Starting OpenAI Compatible API on port {args.port} at {endpoint_url}/v1/")
if args.sdmodel:
print(f"StableUI is available at {endpoint_url}/sdui/")
elif global_memory:
val = global_memory["tunnel_url"]
if val:
endpoint_url = val
remote_url = val
print(f"Your remote Kobold API can be found at {endpoint_url}/api")
print(f"Your remote OpenAI Compatible API can be found at {endpoint_url}/v1")
if start_server:
if not args.remotetunnel:
print(f"Starting Kobold API on port {args.port} at {endpoint_url}/api/")
print(f"Starting OpenAI Compatible API on port {args.port} at {endpoint_url}/v1/")
if args.sdmodel:
print(f"StableUI is available at {endpoint_url}/sdui/")
global_memory["load_complete"] = True
if args.launch:
def launch_browser_thread():
LaunchWebbrowser(endpoint_url,"--launch was set, but could not launch web browser automatically.")
browser_thread = threading.Timer(2, launch_browser_thread) #2 second delay
browser_thread.start()
elif global_memory:
val = global_memory["tunnel_url"]
if val:
endpoint_url = val
remote_url = val
print(f"Your remote Kobold API can be found at {endpoint_url}/api")
print(f"Your remote OpenAI Compatible API can be found at {endpoint_url}/v1")
if args.sdmodel:
print(f"StableUI is available at {endpoint_url}/sdui/")
global_memory["load_complete"] = True
if args.launch:
def launch_browser_thread():
LaunchWebbrowser(endpoint_url,"--launch was set, but could not launch web browser automatically.")
browser_thread = threading.Timer(2, launch_browser_thread) #2 second delay
browser_thread.start()
if args.hordekey and args.hordekey!="":
if args.hordeworkername and args.hordeworkername!="":
horde_thread = threading.Thread(target=run_horde_worker,args=(args,args.hordekey,args.hordeworkername))
horde_thread.daemon = True
horde_thread.start()
else:
print("Horde worker could not start. You need to specify a horde worker name with --hordeworkername")
if args.hordekey and args.hordekey!="":
if args.hordeworkername and args.hordeworkername!="":
horde_thread = threading.Thread(target=run_horde_worker,args=(args,args.hordekey,args.hordeworkername))
horde_thread.daemon = True
horde_thread.start()
else:
print("Horde worker could not start. You need to specify a horde worker name with --hordeworkername")
#if post-ready script specified, execute it
if args.onready:
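
Note: this hunk is mostly re-indentation. The endpoint banners, the --launch browser timer, and the Horde worker startup were previously unconditional; they now hang off start_server, so a CLI session prints no API URLs and spawns no worker. Condensed as a sketch with a stub helper (the helper name is hypothetical):

def announce_endpoints(url):
    print(f"Kobold API at {url}/api/, OpenAI-compatible API at {url}/v1/")

start_server = False  # e.g. when --cli, --prompt, or --benchmark is active
if start_server:
    announce_endpoints("http://localhost:5001")
    # ...the browser auto-launch and Horde worker would start here too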
@@ -6024,82 +6034,105 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
timer_thread.start()
if not start_server:
save_to_file = (args.benchmark and args.benchmark!="stdout" and args.benchmark!="")
benchmaxctx = maxctx
benchlen = args.promptlimit
benchtemp = 0.1
benchtopk = 1
benchreppen = 1
benchbaneos = True
benchmodel = sanitize_string(os.path.splitext(os.path.basename(modelname))[0])
benchprompt = ""
if args.prompt:
benchprompt = args.prompt
benchtopk = 100
benchreppen = 1.07
benchtemp = 0.8
if not args.benchmark:
benchbaneos = False
if args.benchmark:
if os.path.exists(args.benchmark) and os.path.getsize(args.benchmark) > 1000000:
print("\nWarning: The benchmark CSV output file you selected exceeds 1MB. This is probably not what you want, did you select the wrong CSV file?\nFor safety, benchmark output will not be saved.")
save_to_file = False
if save_to_file:
print(f"\nRunning benchmark (Save to File: {args.benchmark})...")
else:
print("\nRunning benchmark (Not Saved)...")
if benchprompt=="":
benchprompt = " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1"
for i in range(0,14): #generate massive prompt
benchprompt += benchprompt
genp = {
"prompt":benchprompt,
"max_length":benchlen,
"max_context_length":benchmaxctx,
"temperature":benchtemp,
"top_k":benchtopk,
"rep_pen":benchreppen,
"ban_eos_token":benchbaneos
}
genout = generate(genparams=genp)
result = genout['text']
if args.prompt and not args.benchmark:
restore_stdout()
print(result)
if args.benchmark:
result = (result[:8] if len(result)>8 else "") if not args.prompt else result
t_pp = float(handle.get_last_process_time())*float(benchmaxctx-benchlen)*0.001
t_gen = float(handle.get_last_eval_time())*float(benchlen)*0.001
s_pp = float(benchmaxctx-benchlen)/t_pp
s_gen = float(benchlen)/t_gen
datetimestamp = datetime.now(timezone.utc)
benchflagstr = f"NoAVX2={args.noavx2} Threads={args.threads} HighPriority={args.highpriority} Cublas_Args={args.usecublas} Tensor_Split={args.tensor_split} BlasThreads={args.blasthreads} BlasBatchSize={args.blasbatchsize} FlashAttention={args.flashattention} KvCache={args.quantkv}"
print(f"\nBenchmark Completed - v{KcppVersion} Results:\n======")
print(f"Flags: {benchflagstr}")
print(f"Timestamp: {datetimestamp}")
print(f"Backend: {libname}")
print(f"Layers: {args.gpulayers}")
print(f"Model: {benchmodel}")
print(f"MaxCtx: {benchmaxctx}")
print(f"GenAmount: {benchlen}\n-----")
print(f"ProcessingTime: {t_pp:.3f}s")
print(f"ProcessingSpeed: {s_pp:.2f}T/s")
print(f"GenerationTime: {t_gen:.3f}s")
print(f"GenerationSpeed: {s_gen:.2f}T/s")
print(f"TotalTime: {(t_pp+t_gen):.3f}s")
print(f"Output: {result}\n-----")
if save_to_file:
try:
with open(args.benchmark, "a") as file:
file.seek(0, 2)
if file.tell() == 0: #empty file
file.write("Timestamp,Backend,Layers,Model,MaxCtx,GenAmount,ProcessingTime,ProcessingSpeed,GenerationTime,GenerationSpeed,TotalTime,Output,Flags")
file.write(f"\n{datetimestamp},{libname},{args.gpulayers},{benchmodel},{benchmaxctx},{benchlen},{t_pp:.2f},{s_pp:.2f},{t_gen:.2f},{s_gen:.2f},{(t_pp+t_gen):.2f},{result},{benchflagstr}")
except Exception as e:
print(f"Error writing benchmark to file: {e}")
if global_memory and using_gui_launcher and not save_to_file:
global_memory["input_to_exit"] = True
time.sleep(1)
if args.cli:
print("\n===\nNow running KoboldCpp in Interactive Terminal Chat mode.\nType /quit or /exit to end session.\n")
lastturns = []
if args.prompt and args.prompt!="":
lastturns.append({"role":"system","content":args.prompt})
print(f"System Prompt:\n{args.prompt}\n")
while True:
lastuserinput = input("> ")
if lastuserinput=="/quit" or lastuserinput=="/exit":
break
if not lastuserinput:
continue
lastturns.append({"role":"user","content":lastuserinput})
payload = {"messages":lastturns,"rep_pen":1.07,"temperature":0.8}
payload = transform_genparams(payload, 4) #to chat completions
suppress_stdout()
genout = generate(genparams=payload)
restore_stdout()
result = genout["text"]
if result:
lastturns.append({"role":"assistant","content":result})
print(result.strip() + "\n", flush=True)
else:
save_to_file = (args.benchmark and args.benchmark!="stdout" and args.benchmark!="")
benchmaxctx = maxctx
benchlen = args.promptlimit
benchtemp = 0.1
benchtopk = 1
benchreppen = 1
benchbaneos = True
benchmodel = sanitize_string(os.path.splitext(os.path.basename(modelname))[0])
benchprompt = ""
if args.prompt:
benchprompt = args.prompt
benchtopk = 100
benchreppen = 1.07
benchtemp = 0.8
if not args.benchmark:
benchbaneos = False
if args.benchmark:
if os.path.exists(args.benchmark) and os.path.getsize(args.benchmark) > 1000000:
print("\nWarning: The benchmark CSV output file you selected exceeds 1MB. This is probably not what you want, did you select the wrong CSV file?\nFor safety, benchmark output will not be saved.")
save_to_file = False
if save_to_file:
print(f"\nRunning benchmark (Save to File: {args.benchmark})...")
else:
print("\nRunning benchmark (Not Saved)...")
if benchprompt=="":
benchprompt = " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1"
for i in range(0,14): #generate massive prompt
benchprompt += benchprompt
genp = {
"prompt":benchprompt,
"max_length":benchlen,
"max_context_length":benchmaxctx,
"temperature":benchtemp,
"top_k":benchtopk,
"rep_pen":benchreppen,
"ban_eos_token":benchbaneos
}
genout = generate(genparams=genp)
result = genout['text']
if args.prompt and not args.benchmark:
restore_stdout()
print(result)
if args.benchmark:
result = (result[:8] if len(result)>8 else "") if not args.prompt else result
t_pp = float(handle.get_last_process_time())*float(benchmaxctx-benchlen)*0.001
t_gen = float(handle.get_last_eval_time())*float(benchlen)*0.001
s_pp = float(benchmaxctx-benchlen)/t_pp
s_gen = float(benchlen)/t_gen
datetimestamp = datetime.now(timezone.utc)
benchflagstr = f"NoAVX2={args.noavx2} Threads={args.threads} HighPriority={args.highpriority} Cublas_Args={args.usecublas} Tensor_Split={args.tensor_split} BlasThreads={args.blasthreads} BlasBatchSize={args.blasbatchsize} FlashAttention={args.flashattention} KvCache={args.quantkv}"
print(f"\nBenchmark Completed - v{KcppVersion} Results:\n======")
print(f"Flags: {benchflagstr}")
print(f"Timestamp: {datetimestamp}")
print(f"Backend: {libname}")
print(f"Layers: {args.gpulayers}")
print(f"Model: {benchmodel}")
print(f"MaxCtx: {benchmaxctx}")
print(f"GenAmount: {benchlen}\n-----")
print(f"ProcessingTime: {t_pp:.3f}s")
print(f"ProcessingSpeed: {s_pp:.2f}T/s")
print(f"GenerationTime: {t_gen:.3f}s")
print(f"GenerationSpeed: {s_gen:.2f}T/s")
print(f"TotalTime: {(t_pp+t_gen):.3f}s")
print(f"Output: {result}\n-----")
if save_to_file:
try:
with open(args.benchmark, "a") as file:
file.seek(0, 2)
if file.tell() == 0: #empty file
file.write("Timestamp,Backend,Layers,Model,MaxCtx,GenAmount,ProcessingTime,ProcessingSpeed,GenerationTime,GenerationSpeed,TotalTime,Output,Flags")
file.write(f"\n{datetimestamp},{libname},{args.gpulayers},{benchmodel},{benchmaxctx},{benchlen},{t_pp:.2f},{s_pp:.2f},{t_gen:.2f},{s_gen:.2f},{(t_pp+t_gen):.2f},{result},{benchflagstr}")
except Exception as e:
print(f"Error writing benchmark to file: {e}")
if global_memory and using_gui_launcher and not save_to_file:
global_memory["input_to_exit"] = True
time.sleep(1)
if start_server:
if args.remotetunnel:
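
Note: the heart of the feature is the interactive loop at the top of this hunk. It accumulates OpenAI-style role/content turns, converts them with transform_genparams(payload, 4) into a chat-completions-shaped request, and suppresses the backend's own token stream so only the final reply is printed. A self-contained sketch of the same loop against a stub generator (echo_generate stands in for KoboldCpp's generate pipeline):

def echo_generate(messages):
    # Stub: the real code builds a payload and calls generate(genparams=...).
    return {"text": f"(model reply to: {messages[-1]['content']})"}

def cli_chat(system_prompt=""):
    turns = []
    if system_prompt:
        turns.append({"role": "system", "content": system_prompt})
    while True:
        user = input("> ")
        if user in ("/quit", "/exit"):
            break
        if not user:
            continue
        turns.append({"role": "user", "content": user})
        reply = echo_generate(turns)["text"]
        if reply:
            turns.append({"role": "assistant", "content": reply})
            print(reply.strip() + "\n", flush=True)

One caveat visible in the diff: the loop appends turns without ever trimming, so long sessions rely on the backend's own context handling.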
@@ -6111,7 +6144,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
asyncio.run(RunServerMultiThreaded(args.host, args.port, KcppServerRequestHandler))
else:
# Flush stdout for previous win32 issue so the client can see output.
if not args.prompt or args.benchmark:
if not args.prompt or args.benchmark or args.cli:
print("Server was not started, main function complete. Idling.", flush=True)
if __name__ == '__main__':
@@ -6169,6 +6202,7 @@ if __name__ == '__main__':
advparser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", metavar=('[shell command]'), type=str, default="",nargs=1)
advparser.add_argument("--benchmark", help="Do not start server, instead run benchmarks. If filename is provided, appends results to provided file.", metavar=('[filename]'), nargs='?', const="stdout", type=str, default=None)
advparser.add_argument("--prompt", metavar=('[prompt]'), help="Passing a prompt string triggers a direct inference, loading the model, outputs the response to stdout and exits. Can be used alone or with benchmark.", type=str, default="")
advparser.add_argument("--cli", help="Does not launch KoboldCpp HTTP server. Instead, enables KoboldCpp from the command line, accepting interactive console input and displaying responses to the terminal.", action='store_true')
advparser.add_argument("--promptlimit", help="Sets the maximum number of generated tokens, usable only with --prompt or --benchmark",metavar=('[token limit]'), type=int, default=100)
advparser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", metavar=('limit'), nargs='?', const=1, type=int, default=1)
advparser.add_argument("--multiplayer", help="Hosts a shared multiplayer session that others can join.", action='store_true')