Added CLI chat mode

Minor CLI fixes (+1 squashed commit)

Squashed commits:

[60af39a9] Added CLI chat mode
Concedo 2025-03-26 19:05:38 +08:00
parent 75e7902789
commit b4a8a5a278

@@ -3529,6 +3529,7 @@ def show_gui():
usemlock = ctk.IntVar()
debugmode = ctk.IntVar()
keepforeground = ctk.IntVar()
terminalonly = ctk.IntVar()
quietmode = ctk.IntVar(value=0)
nocertifymode = ctk.IntVar(value=0)
@@ -4020,7 +4021,8 @@ def show_gui():
"Use MMAP": [usemmap, "Use mmap to load models if enabled, model will not be unloadable"],
"Use mlock": [usemlock, "Enables mlock, preventing the RAM used to load the model from being paged out."],
"Debug Mode": [debugmode, "Enables debug mode, with extra info printed to the terminal."],
"Keep Foreground": [keepforeground, "Bring KoboldCpp to the foreground every time there is a new generation."]
"Keep Foreground": [keepforeground, "Bring KoboldCpp to the foreground every time there is a new generation."],
"CLI Terminal Only": [terminalonly, "Does not launch KoboldCpp HTTP server. Instead, enables KoboldCpp from the command line, accepting interactive console input and displaying responses to the terminal."]
}
for idx, (name, properties) in enumerate(hardware_boxes.items()):
@@ -4267,6 +4269,7 @@ def show_gui():
args.nofastforward = fastforward.get()==0
args.remotetunnel = remotetunnel.get()==1
args.foreground = keepforeground.get()==1
args.cli = terminalonly.get()==1
args.quiet = quietmode.get()==1
args.nocertify = nocertifymode.get()==1
args.nomodel = nomodel.get()==1
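
Note: the new checkbox follows the launcher's existing pattern, a customtkinter IntVar holding 0/1 that is collapsed to a boolean when the argument namespace is built. A minimal sketch of that pattern, with illustrative names rather than the actual KoboldCpp widget layout:

import customtkinter as ctk

root = ctk.CTk()  # a root window must exist before creating variables
terminalonly = ctk.IntVar()  # 0 = unchecked, 1 = checked
ctk.CTkCheckBox(root, text="CLI Terminal Only", variable=terminalonly).pack()

class Args:
    pass

args = Args()
args.cli = terminalonly.get() == 1  # same 0/1-to-bool collapse as args.cli above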
@@ -4425,7 +4428,7 @@ def show_gui():
args.ttsgpu = (ttsgpu_var.get()==1)
args.ttsmaxlen = int(ttsmaxlen_var.get())
args.admin = (admin_var.get()==1)
args.admin = (admin_var.get()==1 and not args.cli)
args.admindir = admin_dir_var.get()
args.adminpassword = admin_password_var.get()
@@ -4448,6 +4451,7 @@ def show_gui():
fastforward.set(0 if "nofastforward" in dict and dict["nofastforward"] else 1)
remotetunnel.set(1 if "remotetunnel" in dict and dict["remotetunnel"] else 0)
keepforeground.set(1 if "foreground" in dict and dict["foreground"] else 0)
terminalonly.set(1 if "cli" in dict and dict["cli"] else 0)
quietmode.set(1 if "quiet" in dict and dict["quiet"] else 0)
nocertifymode.set(1 if "nocertify" in dict and dict["nocertify"] else 0)
nomodel.set(1 if "nomodel" in dict and dict["nomodel"] else 0)
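
Note: launcher settings round-trip through a plain dict, so the flag only needs the symmetric pair shown above: args.cli written on save, dict["cli"] read back on load. A sketch of that round-trip, assuming the saved config is JSON (the filename here is a placeholder):

import json

settings = {"cli": True, "quiet": False}  # subset of a saved launcher config
with open("example.kcpps", "w") as f:
    json.dump(settings, f)

with open("example.kcpps") as f:
    loaded = json.load(f)
terminalonly_value = 1 if "cli" in loaded and loaded["cli"] else 0  # mirrors the .set() call above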
@@ -5289,9 +5293,13 @@ def main(launch_args, default_args):
print(f"{KcppVersion}") # just print version and exit
return
#prevent disallowed combos
if (args.nomodel or args.benchmark or args.launch or args.admin) and args.cli:
exit_with_error(1, "Error: --cli cannot be combined with --launch, --nomodel, --admin or --benchmark")
args = convert_outdated_args(args)
temp_hide_print = (args.model_param and args.prompt and not args.benchmark and not (args.debugmode >= 1))
temp_hide_print = (args.model_param and (args.prompt and not args.cli) and not args.benchmark and not (args.debugmode >= 1))
if not temp_hide_print:
print(f"***\nWelcome to KoboldCpp - Version {KcppVersion}")
@@ -5368,7 +5376,7 @@ def main(launch_args, default_args):
print("\nWARNING: Admin was set without selecting an admin directory. Admin cannot be used.\n")
if not args.admin: #run in single process mode
if args.remotetunnel and not args.prompt and not args.benchmark:
if args.remotetunnel and not args.prompt and not args.benchmark and not args.cli:
setuptunnel(global_memory, True if args.sdmodel else False)
kcpp_main_process(args,global_memory,using_gui_launcher)
if global_memory["input_to_exit"]:
@@ -5379,7 +5387,7 @@ def main(launch_args, default_args):
with multiprocessing.Manager() as mp_manager:
global_memory = mp_manager.dict({"tunnel_url": "", "restart_target":"", "input_to_exit":False, "load_complete":False})
if args.remotetunnel and not args.prompt and not args.benchmark:
if args.remotetunnel and not args.prompt and not args.benchmark and not args.cli:
setuptunnel(global_memory, True if args.sdmodel else False)
# invoke the main koboldcpp process
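
Note: both the single-process and multiprocessing branches now skip tunnel setup for CLI sessions, since a remote tunnel only makes sense when an HTTP endpoint will be exposed. The shared condition, extracted as a sketch:

def wants_tunnel(args):
    # No tunnel for one-shot prompts, benchmarks, or interactive CLI chat.
    return args.remotetunnel and not args.prompt and not args.benchmark and not args.cli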
@@ -5459,10 +5467,10 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
using_gui_launcher = gui_launcher
start_time = time.time()
if args.model_param and args.prompt and not args.benchmark and not (args.debugmode >= 1):
if args.model_param and (args.prompt and not args.cli) and not args.benchmark and not (args.debugmode >= 1):
suppress_stdout()
if args.model_param and (args.benchmark or args.prompt):
if args.model_param and (args.benchmark or args.prompt or args.cli):
start_server = False
#try to read story if provided
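
Note: --cli joins --benchmark and --prompt as a mode that loads the model but never starts the HTTP server. A sketch of the resulting dispatch (the mode names are illustrative):

def select_mode(args):
    if args.model_param and (args.benchmark or args.prompt or args.cli):
        # corresponds to start_server = False in the code above
        return "benchmark" if args.benchmark else ("cli-chat" if args.cli else "one-shot")
    return "server"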
@@ -5985,34 +5993,36 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
endpoint_url = f"{httpsaffix}://localhost:{args.port}"
else:
endpoint_url = f"{httpsaffix}://{args.host}:{args.port}"
if not args.remotetunnel:
print(f"Starting Kobold API on port {args.port} at {endpoint_url}/api/")
print(f"Starting OpenAI Compatible API on port {args.port} at {endpoint_url}/v1/")
if args.sdmodel:
print(f"StableUI is available at {endpoint_url}/sdui/")
elif global_memory:
val = global_memory["tunnel_url"]
if val:
endpoint_url = val
remote_url = val
print(f"Your remote Kobold API can be found at {endpoint_url}/api")
print(f"Your remote OpenAI Compatible API can be found at {endpoint_url}/v1")
if start_server:
if not args.remotetunnel:
print(f"Starting Kobold API on port {args.port} at {endpoint_url}/api/")
print(f"Starting OpenAI Compatible API on port {args.port} at {endpoint_url}/v1/")
if args.sdmodel:
print(f"StableUI is available at {endpoint_url}/sdui/")
global_memory["load_complete"] = True
if args.launch:
def launch_browser_thread():
LaunchWebbrowser(endpoint_url,"--launch was set, but could not launch web browser automatically.")
browser_thread = threading.Timer(2, launch_browser_thread) #2 second delay
browser_thread.start()
elif global_memory:
val = global_memory["tunnel_url"]
if val:
endpoint_url = val
remote_url = val
print(f"Your remote Kobold API can be found at {endpoint_url}/api")
print(f"Your remote OpenAI Compatible API can be found at {endpoint_url}/v1")
if args.sdmodel:
print(f"StableUI is available at {endpoint_url}/sdui/")
global_memory["load_complete"] = True
if args.launch:
def launch_browser_thread():
LaunchWebbrowser(endpoint_url,"--launch was set, but could not launch web browser automatically.")
browser_thread = threading.Timer(2, launch_browser_thread) #2 second delay
browser_thread.start()
if args.hordekey and args.hordekey!="":
if args.hordeworkername and args.hordeworkername!="":
horde_thread = threading.Thread(target=run_horde_worker,args=(args,args.hordekey,args.hordeworkername))
horde_thread.daemon = True
horde_thread.start()
else:
print("Horde worker could not start. You need to specify a horde worker name with --hordeworkername")
if args.hordekey and args.hordekey!="":
if args.hordeworkername and args.hordeworkername!="":
horde_thread = threading.Thread(target=run_horde_worker,args=(args,args.hordekey,args.hordeworkername))
horde_thread.daemon = True
horde_thread.start()
else:
print("Horde worker could not start. You need to specify a horde worker name with --hordeworkername")
#if post-ready script specified, execute it
if args.onready:
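
Note: this hunk is mostly re-indentation. The endpoint banners, the --launch browser timer, and the Horde worker startup were previously unconditional; they now hang off start_server, so a CLI session prints no API URLs and spawns no worker. Condensed as a sketch with a stub helper (the helper name is hypothetical):

def announce_endpoints(url):
    print(f"Kobold API at {url}/api/, OpenAI-compatible API at {url}/v1/")

start_server = False  # e.g. when --cli, --prompt, or --benchmark is active
if start_server:
    announce_endpoints("http://localhost:5001")
    # ...the browser auto-launch and Horde worker would start here too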
@@ -6024,82 +6034,105 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
timer_thread.start()
if not start_server:
save_to_file = (args.benchmark and args.benchmark!="stdout" and args.benchmark!="")
benchmaxctx = maxctx
benchlen = args.promptlimit
benchtemp = 0.1
benchtopk = 1
benchreppen = 1
benchbaneos = True
benchmodel = sanitize_string(os.path.splitext(os.path.basename(modelname))[0])
benchprompt = ""
if args.prompt:
benchprompt = args.prompt
benchtopk = 100
benchreppen = 1.07
benchtemp = 0.8
if not args.benchmark:
benchbaneos = False
if args.benchmark:
if os.path.exists(args.benchmark) and os.path.getsize(args.benchmark) > 1000000:
print("\nWarning: The benchmark CSV output file you selected exceeds 1MB. This is probably not what you want, did you select the wrong CSV file?\nFor safety, benchmark output will not be saved.")
save_to_file = False
if save_to_file:
print(f"\nRunning benchmark (Save to File: {args.benchmark})...")
else:
print("\nRunning benchmark (Not Saved)...")
if benchprompt=="":
benchprompt = " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1"
for i in range(0,14): #generate massive prompt
benchprompt += benchprompt
genp = {
"prompt":benchprompt,
"max_length":benchlen,
"max_context_length":benchmaxctx,
"temperature":benchtemp,
"top_k":benchtopk,
"rep_pen":benchreppen,
"ban_eos_token":benchbaneos
}
genout = generate(genparams=genp)
result = genout['text']
if args.prompt and not args.benchmark:
restore_stdout()
print(result)
if args.benchmark:
result = (result[:8] if len(result)>8 else "") if not args.prompt else result
t_pp = float(handle.get_last_process_time())*float(benchmaxctx-benchlen)*0.001
t_gen = float(handle.get_last_eval_time())*float(benchlen)*0.001
s_pp = float(benchmaxctx-benchlen)/t_pp
s_gen = float(benchlen)/t_gen
datetimestamp = datetime.now(timezone.utc)
benchflagstr = f"NoAVX2={args.noavx2} Threads={args.threads} HighPriority={args.highpriority} Cublas_Args={args.usecublas} Tensor_Split={args.tensor_split} BlasThreads={args.blasthreads} BlasBatchSize={args.blasbatchsize} FlashAttention={args.flashattention} KvCache={args.quantkv}"
print(f"\nBenchmark Completed - v{KcppVersion} Results:\n======")
print(f"Flags: {benchflagstr}")
print(f"Timestamp: {datetimestamp}")
print(f"Backend: {libname}")
print(f"Layers: {args.gpulayers}")
print(f"Model: {benchmodel}")
print(f"MaxCtx: {benchmaxctx}")
print(f"GenAmount: {benchlen}\n-----")
print(f"ProcessingTime: {t_pp:.3f}s")
print(f"ProcessingSpeed: {s_pp:.2f}T/s")
print(f"GenerationTime: {t_gen:.3f}s")
print(f"GenerationSpeed: {s_gen:.2f}T/s")
print(f"TotalTime: {(t_pp+t_gen):.3f}s")
print(f"Output: {result}\n-----")
if save_to_file:
try:
with open(args.benchmark, "a") as file:
file.seek(0, 2)
if file.tell() == 0: #empty file
file.write("Timestamp,Backend,Layers,Model,MaxCtx,GenAmount,ProcessingTime,ProcessingSpeed,GenerationTime,GenerationSpeed,TotalTime,Output,Flags")
file.write(f"\n{datetimestamp},{libname},{args.gpulayers},{benchmodel},{benchmaxctx},{benchlen},{t_pp:.2f},{s_pp:.2f},{t_gen:.2f},{s_gen:.2f},{(t_pp+t_gen):.2f},{result},{benchflagstr}")
except Exception as e:
print(f"Error writing benchmark to file: {e}")
if global_memory and using_gui_launcher and not save_to_file:
global_memory["input_to_exit"] = True
time.sleep(1)
if args.cli:
print("\n===\nNow running KoboldCpp in Interactive Terminal Chat mode.\nType /quit or /exit to end session.\n")
lastturns = []
if args.prompt and args.prompt!="":
lastturns.append({"role":"system","content":args.prompt})
print(f"System Prompt:\n{args.prompt}\n")
while True:
lastuserinput = input("> ")
if lastuserinput=="/quit" or lastuserinput=="/exit":
break
if not lastuserinput:
continue
lastturns.append({"role":"user","content":lastuserinput})
payload = {"messages":lastturns,"rep_pen":1.07,"temperature":0.8}
payload = transform_genparams(payload, 4) #to chat completions
suppress_stdout()
genout = generate(genparams=payload)
restore_stdout()
result = genout["text"]
if result:
lastturns.append({"role":"assistant","content":result})
print(result.strip() + "\n", flush=True)
else:
save_to_file = (args.benchmark and args.benchmark!="stdout" and args.benchmark!="")
benchmaxctx = maxctx
benchlen = args.promptlimit
benchtemp = 0.1
benchtopk = 1
benchreppen = 1
benchbaneos = True
benchmodel = sanitize_string(os.path.splitext(os.path.basename(modelname))[0])
benchprompt = ""
if args.prompt:
benchprompt = args.prompt
benchtopk = 100
benchreppen = 1.07
benchtemp = 0.8
if not args.benchmark:
benchbaneos = False
if args.benchmark:
if os.path.exists(args.benchmark) and os.path.getsize(args.benchmark) > 1000000:
print("\nWarning: The benchmark CSV output file you selected exceeds 1MB. This is probably not what you want, did you select the wrong CSV file?\nFor safety, benchmark output will not be saved.")
save_to_file = False
if save_to_file:
print(f"\nRunning benchmark (Save to File: {args.benchmark})...")
else:
print("\nRunning benchmark (Not Saved)...")
if benchprompt=="":
benchprompt = " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1"
for i in range(0,14): #generate massive prompt
benchprompt += benchprompt
genp = {
"prompt":benchprompt,
"max_length":benchlen,
"max_context_length":benchmaxctx,
"temperature":benchtemp,
"top_k":benchtopk,
"rep_pen":benchreppen,
"ban_eos_token":benchbaneos
}
genout = generate(genparams=genp)
result = genout['text']
if args.prompt and not args.benchmark:
restore_stdout()
print(result)
if args.benchmark:
result = (result[:8] if len(result)>8 else "") if not args.prompt else result
t_pp = float(handle.get_last_process_time())*float(benchmaxctx-benchlen)*0.001
t_gen = float(handle.get_last_eval_time())*float(benchlen)*0.001
s_pp = float(benchmaxctx-benchlen)/t_pp
s_gen = float(benchlen)/t_gen
datetimestamp = datetime.now(timezone.utc)
benchflagstr = f"NoAVX2={args.noavx2} Threads={args.threads} HighPriority={args.highpriority} Cublas_Args={args.usecublas} Tensor_Split={args.tensor_split} BlasThreads={args.blasthreads} BlasBatchSize={args.blasbatchsize} FlashAttention={args.flashattention} KvCache={args.quantkv}"
print(f"\nBenchmark Completed - v{KcppVersion} Results:\n======")
print(f"Flags: {benchflagstr}")
print(f"Timestamp: {datetimestamp}")
print(f"Backend: {libname}")
print(f"Layers: {args.gpulayers}")
print(f"Model: {benchmodel}")
print(f"MaxCtx: {benchmaxctx}")
print(f"GenAmount: {benchlen}\n-----")
print(f"ProcessingTime: {t_pp:.3f}s")
print(f"ProcessingSpeed: {s_pp:.2f}T/s")
print(f"GenerationTime: {t_gen:.3f}s")
print(f"GenerationSpeed: {s_gen:.2f}T/s")
print(f"TotalTime: {(t_pp+t_gen):.3f}s")
print(f"Output: {result}\n-----")
if save_to_file:
try:
with open(args.benchmark, "a") as file:
file.seek(0, 2)
if file.tell() == 0: #empty file
file.write("Timestamp,Backend,Layers,Model,MaxCtx,GenAmount,ProcessingTime,ProcessingSpeed,GenerationTime,GenerationSpeed,TotalTime,Output,Flags")
file.write(f"\n{datetimestamp},{libname},{args.gpulayers},{benchmodel},{benchmaxctx},{benchlen},{t_pp:.2f},{s_pp:.2f},{t_gen:.2f},{s_gen:.2f},{(t_pp+t_gen):.2f},{result},{benchflagstr}")
except Exception as e:
print(f"Error writing benchmark to file: {e}")
if global_memory and using_gui_launcher and not save_to_file:
global_memory["input_to_exit"] = True
time.sleep(1)
if start_server:
if args.remotetunnel:
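
Note: the heart of the feature is the interactive loop at the top of this hunk. It accumulates OpenAI-style role/content turns, converts them with transform_genparams(payload, 4) into a chat-completions-shaped request, and suppresses the backend's own token stream so only the final reply is printed. A self-contained sketch of the same loop against a stub generator (echo_generate stands in for KoboldCpp's generate pipeline):

def echo_generate(messages):
    # Stub: the real code builds a payload and calls generate(genparams=...).
    return {"text": f"(model reply to: {messages[-1]['content']})"}

def cli_chat(system_prompt=""):
    turns = []
    if system_prompt:
        turns.append({"role": "system", "content": system_prompt})
    while True:
        user = input("> ")
        if user in ("/quit", "/exit"):
            break
        if not user:
            continue
        turns.append({"role": "user", "content": user})
        reply = echo_generate(turns)["text"]
        if reply:
            turns.append({"role": "assistant", "content": reply})
            print(reply.strip() + "\n", flush=True)

One caveat visible in the diff: the loop appends turns without ever trimming, so long sessions rely on the backend's own context handling.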
@@ -6111,7 +6144,7 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
asyncio.run(RunServerMultiThreaded(args.host, args.port, KcppServerRequestHandler))
else:
# Flush stdout for previous win32 issue so the client can see output.
if not args.prompt or args.benchmark:
if not args.prompt or args.benchmark or args.cli:
print("Server was not started, main function complete. Idling.", flush=True)
if __name__ == '__main__':
@@ -6169,6 +6202,7 @@ if __name__ == '__main__':
advparser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", metavar=('[shell command]'), type=str, default="",nargs=1)
advparser.add_argument("--benchmark", help="Do not start server, instead run benchmarks. If filename is provided, appends results to provided file.", metavar=('[filename]'), nargs='?', const="stdout", type=str, default=None)
advparser.add_argument("--prompt", metavar=('[prompt]'), help="Passing a prompt string triggers a direct inference, loading the model, outputs the response to stdout and exits. Can be used alone or with benchmark.", type=str, default="")
advparser.add_argument("--cli", help="Does not launch KoboldCpp HTTP server. Instead, enables KoboldCpp from the command line, accepting interactive console input and displaying responses to the terminal.", action='store_true')
advparser.add_argument("--promptlimit", help="Sets the maximum number of generated tokens, usable only with --prompt or --benchmark",metavar=('[token limit]'), type=int, default=100)
advparser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", metavar=('limit'), nargs='?', const=1, type=int, default=1)
advparser.add_argument("--multiplayer", help="Hosts a shared multiplayer session that others can join.", action='store_true')