Added CLI chat mode

minor cli fixes (+1 squashed commit)

Squashed commits:

[60af39a9] Added CLI chat mode
Concedo 2025-03-26 19:05:38 +08:00
parent 75e7902789
commit b4a8a5a278


@@ -3529,6 +3529,7 @@ def show_gui():
     usemlock = ctk.IntVar()
     debugmode = ctk.IntVar()
     keepforeground = ctk.IntVar()
+    terminalonly = ctk.IntVar()
     quietmode = ctk.IntVar(value=0)
     nocertifymode = ctk.IntVar(value=0)
@@ -4020,7 +4021,8 @@ def show_gui():
     "Use MMAP": [usemmap, "Use mmap to load models if enabled, model will not be unloadable"],
     "Use mlock": [usemlock, "Enables mlock, preventing the RAM used to load the model from being paged out."],
     "Debug Mode": [debugmode, "Enables debug mode, with extra info printed to the terminal."],
-    "Keep Foreground": [keepforeground, "Bring KoboldCpp to the foreground every time there is a new generation."]
+    "Keep Foreground": [keepforeground, "Bring KoboldCpp to the foreground every time there is a new generation."],
+    "CLI Terminal Only": [terminalonly, "Does not launch KoboldCpp HTTP server. Instead, enables KoboldCpp from the command line, accepting interactive console input and displaying responses to the terminal."]
     }
     for idx, (name, properties) in enumerate(hardware_boxes.items()):
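
Note: the GUI derives each of these checkboxes from a name → [IntVar, tooltip] mapping, so the new mode only needs one dict entry. A minimal standalone sketch of that pattern, assuming customtkinter is installed (the box name and tooltip below are illustrative stand-ins, not the full real table):

    # Sketch: dict-driven checkboxes, mirroring the hardware_boxes pattern above.
    import customtkinter as ctk

    root = ctk.CTk()
    terminalonly = ctk.IntVar()
    boxes = {
        "CLI Terminal Only": [terminalonly, "Run in the terminal instead of the HTTP server."],
    }
    for idx, (name, (var, tooltip)) in enumerate(boxes.items()):
        # The real GUI also wires `tooltip` up as hover text; omitted here.
        ctk.CTkCheckBox(root, text=name, variable=var).grid(row=idx, column=0, sticky="w")
    root.mainloop()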
@@ -4267,6 +4269,7 @@ def show_gui():
         args.nofastforward = fastforward.get()==0
         args.remotetunnel = remotetunnel.get()==1
         args.foreground = keepforeground.get()==1
+        args.cli = terminalonly.get()==1
         args.quiet = quietmode.get()==1
         args.nocertify = nocertifymode.get()==1
         args.nomodel = nomodel.get()==1
@@ -4425,7 +4428,7 @@ def show_gui():
         args.ttsgpu = (ttsgpu_var.get()==1)
         args.ttsmaxlen = int(ttsmaxlen_var.get())
-        args.admin = (admin_var.get()==1)
+        args.admin = (admin_var.get()==1 and not args.cli)
         args.admindir = admin_dir_var.get()
         args.adminpassword = admin_password_var.get()
@@ -4448,6 +4451,7 @@ def show_gui():
         fastforward.set(0 if "nofastforward" in dict and dict["nofastforward"] else 1)
         remotetunnel.set(1 if "remotetunnel" in dict and dict["remotetunnel"] else 0)
         keepforeground.set(1 if "foreground" in dict and dict["foreground"] else 0)
+        terminalonly.set(1 if "cli" in dict and dict["cli"] else 0)
         quietmode.set(1 if "quiet" in dict and dict["quiet"] else 0)
         nocertifymode.set(1 if "nocertify" in dict and dict["nocertify"] else 0)
         nomodel.set(1 if "nomodel" in dict and dict["nomodel"] else 0)
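
The load path mirrors the save path: each GUI variable is rehydrated from the saved settings dict with a guarded lookup, so configs written by older versions that lack the "cli" key still load cleanly. A dependency-free sketch of that lookup (the helper name is hypothetical; the key names are the real ones from this diff):

    # Sketch: guarded boolean lookup, as used when loading a saved config dict.
    def load_flag(cfg: dict, key: str) -> int:
        # Missing or falsy keys default the checkbox to off, never KeyError.
        return 1 if key in cfg and cfg[key] else 0

    cfg = {"cli": True, "foreground": False}
    assert load_flag(cfg, "cli") == 1
    assert load_flag(cfg, "foreground") == 0
    assert load_flag(cfg, "quiet") == 0  # absent key -> off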
@@ -5289,9 +5293,13 @@ def main(launch_args, default_args):
         print(f"{KcppVersion}") # just print version and exit
         return

+    #prevent disallowed combos
+    if (args.nomodel or args.benchmark or args.launch or args.admin) and args.cli:
+        exit_with_error(1, "Error: --cli cannot be combined with --launch, --nomodel, --admin or --benchmark")
+
     args = convert_outdated_args(args)

-    temp_hide_print = (args.model_param and args.prompt and not args.benchmark and not (args.debugmode >= 1))
+    temp_hide_print = (args.model_param and (args.prompt and not args.cli) and not args.benchmark and not (args.debugmode >= 1))

     if not temp_hide_print:
         print(f"***\nWelcome to KoboldCpp - Version {KcppVersion}")
@@ -5368,7 +5376,7 @@ def main(launch_args, default_args):
         print("\nWARNING: Admin was set without selecting an admin directory. Admin cannot be used.\n")

     if not args.admin: #run in single process mode
-        if args.remotetunnel and not args.prompt and not args.benchmark:
+        if args.remotetunnel and not args.prompt and not args.benchmark and not args.cli:
             setuptunnel(global_memory, True if args.sdmodel else False)
         kcpp_main_process(args,global_memory,using_gui_launcher)
         if global_memory["input_to_exit"]:
@@ -5379,7 +5387,7 @@ def main(launch_args, default_args):
         with multiprocessing.Manager() as mp_manager:
             global_memory = mp_manager.dict({"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False})
-            if args.remotetunnel and not args.prompt and not args.benchmark:
+            if args.remotetunnel and not args.prompt and not args.benchmark and not args.cli:
                 setuptunnel(global_memory, True if args.sdmodel else False)

             # invoke the main koboldcpp process
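
This is the second copy of the same tunnel-gating condition (the first is in the single-process path above). A hypothetical consolidation, not part of this commit, just to make the intent explicit:

    # Hypothetical helper (not in the diff): a remote tunnel is only useful
    # when an HTTP server will actually run, i.e. not in one-shot prompt,
    # benchmark, or interactive CLI mode.
    def should_start_tunnel(args) -> bool:
        return bool(args.remotetunnel) and not (args.prompt or args.benchmark or args.cli)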
@@ -5459,10 +5467,10 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
     using_gui_launcher = gui_launcher
     start_time = time.time()

-    if args.model_param and args.prompt and not args.benchmark and not (args.debugmode >= 1):
+    if args.model_param and (args.prompt and not args.cli) and not args.benchmark and not (args.debugmode >= 1):
         suppress_stdout()

-    if args.model_param and (args.benchmark or args.prompt):
+    if args.model_param and (args.benchmark or args.prompt or args.cli):
         start_server = False

     #try to read story if provided
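
These two conditions shape the whole run: the first suppresses stdout so a one-shot --prompt run emits only the completion, and the second decides whether the HTTP server starts at all; --cli now joins --benchmark and --prompt as a no-server mode. An illustrative summary of the dispatch (the flag names mirror the diff; the function itself is hypothetical):

    # Sketch: how the mode flags select the run mode under the new rules.
    def run_mode(args) -> str:
        if args.model_param and (args.benchmark or args.prompt or args.cli):
            if args.cli:
                return "interactive terminal chat"
            return "benchmark" if args.benchmark else "one-shot prompt"
        return "http server"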
@@ -5985,34 +5993,36 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
         endpoint_url = f"{httpsaffix}://localhost:{args.port}"
     else:
         endpoint_url = f"{httpsaffix}://{args.host}:{args.port}"
-    if not args.remotetunnel:
-        print(f"Starting Kobold API on port {args.port} at {endpoint_url}/api/")
-        print(f"Starting OpenAI Compatible API on port {args.port} at {endpoint_url}/v1/")
-        if args.sdmodel:
-            print(f"StableUI is available at {endpoint_url}/sdui/")
-    elif global_memory:
-        val = global_memory["tunnel_url"]
-        if val:
-            endpoint_url = val
-            remote_url = val
-            print(f"Your remote Kobold API can be found at {endpoint_url}/api")
-            print(f"Your remote OpenAI Compatible API can be found at {endpoint_url}/v1")
-            if args.sdmodel:
-                print(f"StableUI is available at {endpoint_url}/sdui/")
-            global_memory["load_complete"] = True
-    if args.launch:
-        def launch_browser_thread():
-            LaunchWebbrowser(endpoint_url,"--launch was set, but could not launch web browser automatically.")
-        browser_thread = threading.Timer(2, launch_browser_thread) #2 second delay
-        browser_thread.start()
+    if start_server:
+        if not args.remotetunnel:
+            print(f"Starting Kobold API on port {args.port} at {endpoint_url}/api/")
+            print(f"Starting OpenAI Compatible API on port {args.port} at {endpoint_url}/v1/")
+            if args.sdmodel:
+                print(f"StableUI is available at {endpoint_url}/sdui/")
+        elif global_memory:
+            val = global_memory["tunnel_url"]
+            if val:
+                endpoint_url = val
+                remote_url = val
+                print(f"Your remote Kobold API can be found at {endpoint_url}/api")
+                print(f"Your remote OpenAI Compatible API can be found at {endpoint_url}/v1")
+                if args.sdmodel:
+                    print(f"StableUI is available at {endpoint_url}/sdui/")
+                global_memory["load_complete"] = True
+        if args.launch:
+            def launch_browser_thread():
+                LaunchWebbrowser(endpoint_url,"--launch was set, but could not launch web browser automatically.")
+            browser_thread = threading.Timer(2, launch_browser_thread) #2 second delay
+            browser_thread.start()

     if args.hordekey and args.hordekey!="":
         if args.hordeworkername and args.hordeworkername!="":
             horde_thread = threading.Thread(target=run_horde_worker,args=(args,args.hordekey,args.hordeworkername))
             horde_thread.daemon = True
             horde_thread.start()
         else:
             print("Horde worker could not start. You need to specify a horde worker name with --hordeworkername")

     #if post-ready script specified, execute it
     if args.onready:
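
Aside from the indentation change above, the browser auto-launch is worth noting: threading.Timer gives the server roughly two seconds to come up before the page opens, without blocking the main thread. A self-contained sketch of that pattern (the URL is a placeholder; the real code uses its own LaunchWebbrowser wrapper):

    # Sketch: delayed, non-blocking browser launch via threading.Timer.
    import threading
    import webbrowser

    endpoint_url = "http://localhost:5001"  # placeholder

    def launch_browser_thread():
        if not webbrowser.open(endpoint_url):
            print("--launch was set, but could not launch web browser automatically.")

    browser_thread = threading.Timer(2, launch_browser_thread)  # 2 second delay
    browser_thread.start()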
@@ -6024,82 +6034,105 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
             timer_thread.start()

     if not start_server:
-        save_to_file = (args.benchmark and args.benchmark!="stdout" and args.benchmark!="")
-        benchmaxctx = maxctx
-        benchlen = args.promptlimit
-        benchtemp = 0.1
-        benchtopk = 1
-        benchreppen = 1
-        benchbaneos = True
-        benchmodel = sanitize_string(os.path.splitext(os.path.basename(modelname))[0])
-        benchprompt = ""
-        if args.prompt:
-            benchprompt = args.prompt
-            benchtopk = 100
-            benchreppen = 1.07
-            benchtemp = 0.8
-            if not args.benchmark:
-                benchbaneos = False
-        if args.benchmark:
-            if os.path.exists(args.benchmark) and os.path.getsize(args.benchmark) > 1000000:
-                print("\nWarning: The benchmark CSV output file you selected exceeds 1MB. This is probably not what you want, did you select the wrong CSV file?\nFor safety, benchmark output will not be saved.")
-                save_to_file = False
-            if save_to_file:
-                print(f"\nRunning benchmark (Save to File: {args.benchmark})...")
-            else:
-                print("\nRunning benchmark (Not Saved)...")
-            if benchprompt=="":
-                benchprompt = " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1"
-                for i in range(0,14): #generate massive prompt
-                    benchprompt += benchprompt
-        genp = {
-            "prompt":benchprompt,
-            "max_length":benchlen,
-            "max_context_length":benchmaxctx,
-            "temperature":benchtemp,
-            "top_k":benchtopk,
-            "rep_pen":benchreppen,
-            "ban_eos_token":benchbaneos
-        }
-        genout = generate(genparams=genp)
-        result = genout['text']
-        if args.prompt and not args.benchmark:
-            restore_stdout()
-            print(result)
-        if args.benchmark:
-            result = (result[:8] if len(result)>8 else "") if not args.prompt else result
-            t_pp = float(handle.get_last_process_time())*float(benchmaxctx-benchlen)*0.001
-            t_gen = float(handle.get_last_eval_time())*float(benchlen)*0.001
-            s_pp = float(benchmaxctx-benchlen)/t_pp
-            s_gen = float(benchlen)/t_gen
-            datetimestamp = datetime.now(timezone.utc)
-            benchflagstr = f"NoAVX2={args.noavx2} Threads={args.threads} HighPriority={args.highpriority} Cublas_Args={args.usecublas} Tensor_Split={args.tensor_split} BlasThreads={args.blasthreads} BlasBatchSize={args.blasbatchsize} FlashAttention={args.flashattention} KvCache={args.quantkv}"
-            print(f"\nBenchmark Completed - v{KcppVersion} Results:\n======")
-            print(f"Flags: {benchflagstr}")
-            print(f"Timestamp: {datetimestamp}")
-            print(f"Backend: {libname}")
-            print(f"Layers: {args.gpulayers}")
-            print(f"Model: {benchmodel}")
-            print(f"MaxCtx: {benchmaxctx}")
-            print(f"GenAmount: {benchlen}\n-----")
-            print(f"ProcessingTime: {t_pp:.3f}s")
-            print(f"ProcessingSpeed: {s_pp:.2f}T/s")
-            print(f"GenerationTime: {t_gen:.3f}s")
-            print(f"GenerationSpeed: {s_gen:.2f}T/s")
-            print(f"TotalTime: {(t_pp+t_gen):.3f}s")
-            print(f"Output: {result}\n-----")
-            if save_to_file:
-                try:
-                    with open(args.benchmark, "a") as file:
-                        file.seek(0, 2)
-                        if file.tell() == 0: #empty file
-                            file.write("Timestamp,Backend,Layers,Model,MaxCtx,GenAmount,ProcessingTime,ProcessingSpeed,GenerationTime,GenerationSpeed,TotalTime,Output,Flags")
-                        file.write(f"\n{datetimestamp},{libname},{args.gpulayers},{benchmodel},{benchmaxctx},{benchlen},{t_pp:.2f},{s_pp:.2f},{t_gen:.2f},{s_gen:.2f},{(t_pp+t_gen):.2f},{result},{benchflagstr}")
-                except Exception as e:
-                    print(f"Error writing benchmark to file: {e}")
-        if global_memory and using_gui_launcher and not save_to_file:
-            global_memory["input_to_exit"] = True
-            time.sleep(1)
+        if args.cli:
+            print("\n===\nNow running KoboldCpp in Interactive Terminal Chat mode.\nType /quit or /exit to end session.\n")
+            lastturns = []
+            if args.prompt and args.prompt!="":
+                lastturns.append({"role":"system","content":args.prompt})
+                print(f"System Prompt:\n{args.prompt}\n")
+            while True:
+                lastuserinput = input("> ")
+                if lastuserinput=="/quit" or lastuserinput=="/exit":
+                    break
+                if not lastuserinput:
+                    continue
+                lastturns.append({"role":"user","content":lastuserinput})
+                payload = {"messages":lastturns,"rep_pen":1.07,"temperature":0.8}
+                payload = transform_genparams(payload, 4) #to chat completions
+                suppress_stdout()
+                genout = generate(genparams=payload)
+                restore_stdout()
+                result = genout["text"]
+                if result:
+                    lastturns.append({"role":"assistant","content":result})
+                    print(result.strip() + "\n", flush=True)
+        else:
+            save_to_file = (args.benchmark and args.benchmark!="stdout" and args.benchmark!="")
+            benchmaxctx = maxctx
+            benchlen = args.promptlimit
+            benchtemp = 0.1
+            benchtopk = 1
+            benchreppen = 1
+            benchbaneos = True
+            benchmodel = sanitize_string(os.path.splitext(os.path.basename(modelname))[0])
+            benchprompt = ""
+            if args.prompt:
+                benchprompt = args.prompt
+                benchtopk = 100
+                benchreppen = 1.07
+                benchtemp = 0.8
+                if not args.benchmark:
+                    benchbaneos = False
+            if args.benchmark:
+                if os.path.exists(args.benchmark) and os.path.getsize(args.benchmark) > 1000000:
+                    print("\nWarning: The benchmark CSV output file you selected exceeds 1MB. This is probably not what you want, did you select the wrong CSV file?\nFor safety, benchmark output will not be saved.")
+                    save_to_file = False
+                if save_to_file:
+                    print(f"\nRunning benchmark (Save to File: {args.benchmark})...")
+                else:
+                    print("\nRunning benchmark (Not Saved)...")
+                if benchprompt=="":
+                    benchprompt = " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1"
+                    for i in range(0,14): #generate massive prompt
+                        benchprompt += benchprompt
+            genp = {
+                "prompt":benchprompt,
+                "max_length":benchlen,
+                "max_context_length":benchmaxctx,
+                "temperature":benchtemp,
+                "top_k":benchtopk,
+                "rep_pen":benchreppen,
+                "ban_eos_token":benchbaneos
+            }
+            genout = generate(genparams=genp)
+            result = genout['text']
+            if args.prompt and not args.benchmark:
+                restore_stdout()
+                print(result)
+            if args.benchmark:
+                result = (result[:8] if len(result)>8 else "") if not args.prompt else result
+                t_pp = float(handle.get_last_process_time())*float(benchmaxctx-benchlen)*0.001
+                t_gen = float(handle.get_last_eval_time())*float(benchlen)*0.001
+                s_pp = float(benchmaxctx-benchlen)/t_pp
+                s_gen = float(benchlen)/t_gen
+                datetimestamp = datetime.now(timezone.utc)
+                benchflagstr = f"NoAVX2={args.noavx2} Threads={args.threads} HighPriority={args.highpriority} Cublas_Args={args.usecublas} Tensor_Split={args.tensor_split} BlasThreads={args.blasthreads} BlasBatchSize={args.blasbatchsize} FlashAttention={args.flashattention} KvCache={args.quantkv}"
+                print(f"\nBenchmark Completed - v{KcppVersion} Results:\n======")
+                print(f"Flags: {benchflagstr}")
+                print(f"Timestamp: {datetimestamp}")
+                print(f"Backend: {libname}")
+                print(f"Layers: {args.gpulayers}")
+                print(f"Model: {benchmodel}")
+                print(f"MaxCtx: {benchmaxctx}")
+                print(f"GenAmount: {benchlen}\n-----")
+                print(f"ProcessingTime: {t_pp:.3f}s")
+                print(f"ProcessingSpeed: {s_pp:.2f}T/s")
+                print(f"GenerationTime: {t_gen:.3f}s")
+                print(f"GenerationSpeed: {s_gen:.2f}T/s")
+                print(f"TotalTime: {(t_pp+t_gen):.3f}s")
+                print(f"Output: {result}\n-----")
+                if save_to_file:
+                    try:
+                        with open(args.benchmark, "a") as file:
+                            file.seek(0, 2)
+                            if file.tell() == 0: #empty file
+                                file.write("Timestamp,Backend,Layers,Model,MaxCtx,GenAmount,ProcessingTime,ProcessingSpeed,GenerationTime,GenerationSpeed,TotalTime,Output,Flags")
+                            file.write(f"\n{datetimestamp},{libname},{args.gpulayers},{benchmodel},{benchmaxctx},{benchlen},{t_pp:.2f},{s_pp:.2f},{t_gen:.2f},{s_gen:.2f},{(t_pp+t_gen):.2f},{result},{benchflagstr}")
                    except Exception as e:
+                        print(f"Error writing benchmark to file: {e}")
+            if global_memory and using_gui_launcher and not save_to_file:
+                global_memory["input_to_exit"] = True
+                time.sleep(1)

     if start_server:
         if args.remotetunnel:
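
The heart of the commit is the turn-accumulating REPL above: user and assistant messages pile up in lastturns, so every generate() call sees the full conversation, and --prompt doubles as the system prompt. A self-contained sketch of the loop with a stand-in generator (transform_genparams, suppress_stdout/restore_stdout, and generate are KoboldCpp internals; the stub below just echoes):

    # Sketch of the --cli chat loop, with a stub in place of real inference.
    def generate_stub(messages):
        # A real backend would run chat-completions inference over `messages`.
        return {"text": f"(echo) {messages[-1]['content']}"}

    def chat_repl(system_prompt=""):
        print("\n===\nInteractive Terminal Chat mode. Type /quit or /exit to end session.\n")
        lastturns = []
        if system_prompt:
            lastturns.append({"role": "system", "content": system_prompt})
        while True:
            userinput = input("> ")
            if userinput in ("/quit", "/exit"):
                break
            if not userinput:
                continue  # ignore empty lines rather than wasting a generation
            lastturns.append({"role": "user", "content": userinput})
            result = generate_stub(lastturns)["text"]
            if result:
                lastturns.append({"role": "assistant", "content": result})
                print(result.strip() + "\n", flush=True)

    if __name__ == "__main__":
        chat_repl("You are a helpful assistant.")

One design note: the loop never trims lastturns, so a long session eventually outgrows the model's context window; presumably the same context handling as any other API request applies at that point.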
@@ -6111,7 +6144,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
             asyncio.run(RunServerMultiThreaded(args.host, args.port, KcppServerRequestHandler))
     else:
         # Flush stdout for previous win32 issue so the client can see output.
-        if not args.prompt or args.benchmark:
+        if not args.prompt or args.benchmark or args.cli:
             print("Server was not started, main function complete. Idling.", flush=True)

 if __name__ == '__main__':
@ -6169,6 +6202,7 @@ if __name__ == '__main__':
advparser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", metavar=('[shell command]'), type=str, default="",nargs=1) advparser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", metavar=('[shell command]'), type=str, default="",nargs=1)
advparser.add_argument("--benchmark", help="Do not start server, instead run benchmarks. If filename is provided, appends results to provided file.", metavar=('[filename]'), nargs='?', const="stdout", type=str, default=None) advparser.add_argument("--benchmark", help="Do not start server, instead run benchmarks. If filename is provided, appends results to provided file.", metavar=('[filename]'), nargs='?', const="stdout", type=str, default=None)
advparser.add_argument("--prompt", metavar=('[prompt]'), help="Passing a prompt string triggers a direct inference, loading the model, outputs the response to stdout and exits. Can be used alone or with benchmark.", type=str, default="") advparser.add_argument("--prompt", metavar=('[prompt]'), help="Passing a prompt string triggers a direct inference, loading the model, outputs the response to stdout and exits. Can be used alone or with benchmark.", type=str, default="")
advparser.add_argument("--cli", help="Does not launch KoboldCpp HTTP server. Instead, enables KoboldCpp from the command line, accepting interactive console input and displaying responses to the terminal.", action='store_true')
advparser.add_argument("--promptlimit", help="Sets the maximum number of generated tokens, usable only with --prompt or --benchmark",metavar=('[token limit]'), type=int, default=100) advparser.add_argument("--promptlimit", help="Sets the maximum number of generated tokens, usable only with --prompt or --benchmark",metavar=('[token limit]'), type=int, default=100)
advparser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", metavar=('limit'), nargs='?', const=1, type=int, default=1) advparser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", metavar=('limit'), nargs='?', const=1, type=int, default=1)
advparser.add_argument("--multiplayer", help="Hosts a shared multiplayer session that others can join.", action='store_true') advparser.add_argument("--multiplayer", help="Hosts a shared multiplayer session that others can join.", action='store_true')