Added CLI chat mode

minor cli fixes (+1 squashed commits) Squashed commits: [60af39a9] Added CLI chat mode
2025-09-10 17:14:36 +00:00 · 2025-03-26 19:05:38 +08:00 · 2025-03-26 19:05:38 +08:00 · b4a8a5a278
commit b4a8a5a278
parent 75e7902789
1 changed files with 143 additions and 109 deletions
--- a/koboldcpp.py
+++ b/koboldcpp.py
@ -3529,6 +3529,7 @@ def show_gui():
    usemlock = ctk.IntVar()
    debugmode = ctk.IntVar()
    keepforeground = ctk.IntVar()
+    terminalonly = ctk.IntVar()
    quietmode = ctk.IntVar(value=0)
    nocertifymode = ctk.IntVar(value=0)

@ -4020,7 +4021,8 @@ def show_gui():
        "Use MMAP": [usemmap, "Use mmap to load models if enabled, model will not be unloadable"],
        "Use mlock": [usemlock, "Enables mlock, preventing the RAM used to load the model from being paged out."],
        "Debug Mode": [debugmode, "Enables debug mode, with extra info printed to the terminal."],
-        "Keep Foreground": [keepforeground, "Bring KoboldCpp to the foreground every time there is a new generation."]
+        "Keep Foreground": [keepforeground, "Bring KoboldCpp to the foreground every time there is a new generation."],
+        "CLI Terminal Only": [terminalonly, "Does not launch KoboldCpp HTTP server. Instead, enables KoboldCpp from the command line, accepting interactive console input and displaying responses to the terminal."]
    }

    for idx, (name, properties) in enumerate(hardware_boxes.items()):
@ -4267,6 +4269,7 @@ def show_gui():
        args.nofastforward = fastforward.get()==0
        args.remotetunnel = remotetunnel.get()==1
        args.foreground = keepforeground.get()==1
+        args.cli = terminalonly.get()==1
        args.quiet = quietmode.get()==1
        args.nocertify = nocertifymode.get()==1
        args.nomodel = nomodel.get()==1
@ -4425,7 +4428,7 @@ def show_gui():
            args.ttsgpu = (ttsgpu_var.get()==1)
            args.ttsmaxlen = int(ttsmaxlen_var.get())

-        args.admin = (admin_var.get()==1)
+        args.admin = (admin_var.get()==1 and not args.cli)
        args.admindir = admin_dir_var.get()
        args.adminpassword = admin_password_var.get()

@ -4448,6 +4451,7 @@ def show_gui():
        fastforward.set(0 if "nofastforward" in dict and dict["nofastforward"] else 1)
        remotetunnel.set(1 if "remotetunnel" in dict and dict["remotetunnel"] else 0)
        keepforeground.set(1 if "foreground" in dict and dict["foreground"] else 0)
+        terminalonly.set(1 if "cli" in dict and dict["cli"] else 0)
        quietmode.set(1 if "quiet" in dict and dict["quiet"] else 0)
        nocertifymode.set(1 if "nocertify" in dict and dict["nocertify"] else 0)
        nomodel.set(1 if "nomodel" in dict and dict["nomodel"] else 0)
@ -5289,9 +5293,13 @@ def main(launch_args, default_args):
        print(f"{KcppVersion}") # just print version and exit
        return

+    #prevent disallowed combos
+    if (args.nomodel or args.benchmark or args.launch or args.admin) and args.cli:
+        exit_with_error(1, "Error: --cli cannot be combined with --launch, --nomodel, --admin or --benchmark")
+
    args = convert_outdated_args(args)

-    temp_hide_print = (args.model_param and args.prompt and not args.benchmark and not (args.debugmode >= 1))
+    temp_hide_print = (args.model_param and (args.prompt and not args.cli) and not args.benchmark and not (args.debugmode >= 1))

    if not temp_hide_print:
        print(f"***\nWelcome to KoboldCpp - Version {KcppVersion}")
@ -5368,7 +5376,7 @@ def main(launch_args, default_args):
        print("\nWARNING: Admin was set without selecting an admin directory. Admin cannot be used.\n")

    if not args.admin: #run in single process mode
-        if args.remotetunnel and not args.prompt and not args.benchmark:
+        if args.remotetunnel and not args.prompt and not args.benchmark and not args.cli:
            setuptunnel(global_memory, True if args.sdmodel else False)
        kcpp_main_process(args,global_memory,using_gui_launcher)
        if global_memory["input_to_exit"]:
@ -5379,7 +5387,7 @@ def main(launch_args, default_args):
        with multiprocessing.Manager() as mp_manager:
            global_memory = mp_manager.dict({"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False})

-            if args.remotetunnel and not args.prompt and not args.benchmark:
+            if args.remotetunnel and not args.prompt and not args.benchmark and not args.cli:
                setuptunnel(global_memory, True if args.sdmodel else False)

            # invoke the main koboldcpp process
@ -5459,10 +5467,10 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
    using_gui_launcher = gui_launcher
    start_time = time.time()

-    if args.model_param and args.prompt and not args.benchmark and not (args.debugmode >= 1):
+    if args.model_param and (args.prompt and not args.cli) and not args.benchmark and not (args.debugmode >= 1):
        suppress_stdout()

-    if args.model_param and (args.benchmark or args.prompt):
+    if args.model_param and (args.benchmark or args.prompt or args.cli):
        start_server = False

    #try to read story if provided
@ -5985,6 +5993,8 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
        endpoint_url = f"{httpsaffix}://localhost:{args.port}"
    else:
        endpoint_url = f"{httpsaffix}://{args.host}:{args.port}"
+
+    if start_server:
        if not args.remotetunnel:
            print(f"Starting Kobold API on port {args.port} at {endpoint_url}/api/")
            print(f"Starting OpenAI Compatible API on port {args.port} at {endpoint_url}/v1/")
@ -6024,6 +6034,29 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
        timer_thread.start()

    if not start_server:
+        if args.cli:
+            print("\n===\nNow running KoboldCpp in Interactive Terminal Chat mode.\nType /quit or /exit to end session.\n")
+            lastturns = []
+            if args.prompt and args.prompt!="":
+                lastturns.append({"role":"system","content":args.prompt})
+                print(f"System Prompt:\n{args.prompt}\n")
+            while True:
+                lastuserinput = input("> ")
+                if lastuserinput=="/quit" or lastuserinput=="/exit":
+                    break
+                if not lastuserinput:
+                    continue
+                lastturns.append({"role":"user","content":lastuserinput})
+                payload = {"messages":lastturns,"rep_pen":1.07,"temperature":0.8}
+                payload = transform_genparams(payload, 4) #to chat completions
+                suppress_stdout()
+                genout = generate(genparams=payload)
+                restore_stdout()
+                result = genout["text"]
+                if result:
+                    lastturns.append({"role":"assistant","content":result})
+                print(result.strip() + "\n", flush=True)
+        else:
            save_to_file = (args.benchmark and args.benchmark!="stdout" and args.benchmark!="")
            benchmaxctx = maxctx
            benchlen = args.promptlimit
@ -6111,7 +6144,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
        asyncio.run(RunServerMultiThreaded(args.host, args.port, KcppServerRequestHandler))
    else:
        # Flush stdout for previous win32 issue so the client can see output.
-        if not args.prompt or args.benchmark:
+        if not args.prompt or args.benchmark or args.cli:
            print("Server was not started, main function complete. Idling.", flush=True)

 if __name__ == '__main__':
@ -6169,6 +6202,7 @@ if __name__ == '__main__':
    advparser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", metavar=('[shell command]'), type=str, default="",nargs=1)
    advparser.add_argument("--benchmark", help="Do not start server, instead run benchmarks. If filename is provided, appends results to provided file.", metavar=('[filename]'), nargs='?', const="stdout", type=str, default=None)
    advparser.add_argument("--prompt", metavar=('[prompt]'), help="Passing a prompt string triggers a direct inference, loading the model, outputs the response to stdout and exits. Can be used alone or with benchmark.", type=str, default="")
+    advparser.add_argument("--cli", help="Does not launch KoboldCpp HTTP server. Instead, enables KoboldCpp from the command line, accepting interactive console input and displaying responses to the terminal.", action='store_true')
    advparser.add_argument("--promptlimit", help="Sets the maximum number of generated tokens, usable only with --prompt or --benchmark",metavar=('[token limit]'), type=int, default=100)
    advparser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", metavar=('limit'), nargs='?', const=1, type=int, default=1)
    advparser.add_argument("--multiplayer", help="Hosts a shared multiplayer session that others can join.", action='store_true')