From 853d57c53c92f1ed39fc75758a3ea3780e9a7c99 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Tue, 6 Aug 2024 21:54:08 +0800
Subject: [PATCH] wip prompt

---
 koboldcpp.py | 153 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 101 insertions(+), 52 deletions(-)

diff --git a/koboldcpp.py b/koboldcpp.py
index 4fb1156a5..e96b51056 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -74,6 +74,13 @@ currfinishreason = "null"
 using_gui_launcher = False
 using_outdated_flags = False
 
+saved_stdout = None
+saved_stderr = None
+saved_stdout_py = None
+saved_stderr_py = None
+stdout_nullfile = None
+stdout_nullfile_py = None
+
 CLDevices = ["1","2","3","4"]
 CUDevices = ["1","2","3","4","All"]
 CLDevicesNames = ["","","",""]
@@ -224,6 +231,34 @@ def getabspath():
 def file_exists(filename):
     return os.path.exists(os.path.join(getdirpath(), filename))
 
+def suppress_stdout():  # silence stdout/stderr at both the descriptor level and the Python level; undone by restore_stdout()
+    global saved_stdout, saved_stderr, saved_stdout_py, saved_stderr_py, stdout_nullfile, stdout_nullfile_py
+    if all(v is None for v in (saved_stdout, saved_stderr, saved_stdout_py, saved_stderr_py, stdout_nullfile, stdout_nullfile_py)):  # compare against None: a valid descriptor can be 0, which is falsy
+        sys.stdout.flush()  # push out anything still buffered before the descriptors are swapped
+        sys.stderr.flush()
+        saved_stdout = os.dup(sys.stdout.fileno())  # keep duplicates of the original descriptors so they can be put back later
+        saved_stderr = os.dup(sys.stderr.fileno())
+        saved_stderr_py = sys.stderr
+        saved_stdout_py = sys.stdout
+        stdout_nullfile = os.open(os.devnull, os.O_WRONLY)
+        stdout_nullfile_py = open(os.devnull, 'w')
+        os.dup2(stdout_nullfile, sys.stdout.fileno())  # redirect the underlying descriptors, so writes that bypass the Python file objects are silenced as well
+        os.dup2(stdout_nullfile, sys.stderr.fileno())
+        sys.stderr = sys.stdout = stdout_nullfile_py  # redirect the Python-level streams too
+
+def restore_stdout():  # undo suppress_stdout(): reinstate the saved streams/descriptors and release the temporary ones; no-op when nothing is suppressed
+    global saved_stdout, saved_stderr, saved_stdout_py, saved_stderr_py, stdout_nullfile, stdout_nullfile_py
+    if all(v is not None for v in (saved_stdout, saved_stderr, saved_stdout_py, saved_stderr_py, stdout_nullfile, stdout_nullfile_py)):  # compare against None: a saved descriptor of 0 is valid but falsy
+        sys.stdout = saved_stdout_py  # put the Python-level streams back first so fileno() below refers to the original objects
+        sys.stderr = saved_stderr_py
+        os.dup2(saved_stdout, sys.stdout.fileno())  # point the original descriptors back at their saved targets
+        os.dup2(saved_stderr, sys.stderr.fileno())
+        os.close(stdout_nullfile)  # release the devnull handles and the saved duplicates
+        stdout_nullfile_py.close()
+        os.close(saved_stdout)
+        os.close(saved_stderr)
+        saved_stdout = saved_stderr = saved_stdout_py = saved_stderr_py = stdout_nullfile = stdout_nullfile_py = None  # reset state so suppress_stdout() can be called again
+
 def get_default_threads():
     physical_core_limit = 1
     if os.cpu_count()!=None and os.cpu_count()>1:
@@ -3724,13 +3759,19 @@ def main(launch_args,start_server=True):
     global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui
     global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath
 
+    args = launch_args
+    if (args.model_param or args.model) and args.prompt and not args.benchmark:
+        suppress_stdout()
+
+    print(f"***\nWelcome to KoboldCpp - Version {KcppVersion}") # just update version manually
+    # print("Python version: " + sys.version)
+
     #perform some basic cleanup of old temporary directories
     try:
         delete_old_pyinstaller()
     except Exception as e:
         print(f"Error cleaning up orphaned pyinstaller dirs: {e}")
 
-    args = launch_args
     if args.unpack:
         unpack_to_dir(args.unpack)
         return
@@ -4162,61 +4203,69 @@ def main(launch_args,start_server=True):
         timer_thread = threading.Timer(1, onready_subprocess) #1 second delay
         timer_thread.start()
 
-    if args.model_param and args.benchmark:
+    if args.model_param and (args.benchmark or args.prompt):
         from datetime import datetime, timezone
         start_server = False
-        save_to_file = (args.benchmark!="stdout" and args.benchmark!="")
+        save_to_file = (args.benchmark and args.benchmark!="stdout" and args.benchmark!="")
         benchmaxctx = maxctx
         benchlen = 100
         benchmodel = sanitize_string(os.path.splitext(os.path.basename(modelname))[0])
-        if os.path.exists(args.benchmark) and os.path.getsize(args.benchmark) > 1000000:
-            print(f"\nWarning: The benchmark CSV output file you selected exceeds 1MB. This is probably not what you want, did you select the wrong CSV file?\nFor safety, benchmark output will not be saved.")
-            save_to_file = False
-        if save_to_file:
-            print(f"\nRunning benchmark (Save to File: {args.benchmark})...")
-        else:
-            print(f"\nRunning benchmark (Not Saved)...")
-
-        benchprompt = "1111111111111111"
-        for i in range(0,14): #generate massive prompt
-            benchprompt += benchprompt
+        benchprompt = ""
+        if args.prompt:
+            benchprompt = args.prompt
+        if args.benchmark:
+            if os.path.exists(args.benchmark) and os.path.getsize(args.benchmark) > 1000000:
+                print(f"\nWarning: The benchmark CSV output file you selected exceeds 1MB. This is probably not what you want, did you select the wrong CSV file?\nFor safety, benchmark output will not be saved.")
+                save_to_file = False
+            if save_to_file:
+                print(f"\nRunning benchmark (Save to File: {args.benchmark})...")
+            else:
+                print(f"\nRunning benchmark (Not Saved)...")
+            if benchprompt=="":
+                benchprompt = "1111111111111111"
+                for i in range(0,14): #generate massive prompt
+                    benchprompt += benchprompt
         genout = generate(benchprompt,memory="",images=[],max_length=benchlen,max_context_length=benchmaxctx,temperature=0.1,top_k=1,rep_pen=1,ban_eos_token=True)
         result = genout['text']
-        result = (result[:5] if len(result)>5 else "")
-        t_pp = float(handle.get_last_process_time())*float(benchmaxctx-benchlen)*0.001
-        t_gen = float(handle.get_last_eval_time())*float(benchlen)*0.001
-        s_pp = float(benchmaxctx-benchlen)/t_pp
-        s_gen = float(benchlen)/t_gen
-        datetimestamp = datetime.now(timezone.utc)
-        benchflagstr = f"NoAVX2={args.noavx2} Threads={args.threads} HighPriority={args.highpriority} NoBlas={args.noblas} Cublas_Args={args.usecublas} Tensor_Split={args.tensor_split} BlasThreads={args.blasthreads} BlasBatchSize={args.blasbatchsize} FlashAttention={args.flashattention} KvCache={args.quantkv}"
-        print(f"\nBenchmark Completed - v{KcppVersion} Results:\n======")
-        print(f"Flags: {benchflagstr}")
-        print(f"Timestamp: {datetimestamp}")
-        print(f"Backend: {libname}")
-        print(f"Layers: {args.gpulayers}")
-        print(f"Model: {benchmodel}")
-        print(f"MaxCtx: {benchmaxctx}")
-        print(f"GenAmount: {benchlen}\n-----")
-        print(f"ProcessingTime: {t_pp:.3f}s")
-        print(f"ProcessingSpeed: {s_pp:.2f}T/s")
-        print(f"GenerationTime: {t_gen:.3f}s")
-        print(f"GenerationSpeed: {s_gen:.2f}T/s")
-        print(f"TotalTime: {(t_pp+t_gen):.3f}s")
-        print(f"Output: {result}\n-----")
-        if save_to_file:
-            try:
-                with open(args.benchmark, "a") as file:
-                    file.seek(0, 2)
-                    if file.tell() == 0: #empty file
-                        file.write(f"Timestamp,Backend,Layers,Model,MaxCtx,GenAmount,ProcessingTime,ProcessingSpeed,GenerationTime,GenerationSpeed,TotalTime,Output,Flags")
-                    file.write(f"\n{datetimestamp},{libname},{args.gpulayers},{benchmodel},{benchmaxctx},{benchlen},{t_pp:.2f},{s_pp:.2f},{t_gen:.2f},{s_gen:.2f},{(t_pp+t_gen):.2f},{result},{benchflagstr}")
-            except Exception as e:
-                print(f"Error writing benchmark to file: {e}")
-        global using_gui_launcher
-        if using_gui_launcher and not save_to_file:
-            print("===")
-            print("Press ENTER key to exit.", flush=True)
-            input()
+        if args.prompt and not args.benchmark:
+            restore_stdout()
+            print(result)
+        if args.benchmark:
+            result = (result[:8] if len(result)>8 else "") if not args.prompt else result
+            t_pp = float(handle.get_last_process_time())*float(benchmaxctx-benchlen)*0.001
+            t_gen = float(handle.get_last_eval_time())*float(benchlen)*0.001
+            s_pp = float(benchmaxctx-benchlen)/t_pp
+            s_gen = float(benchlen)/t_gen
+            datetimestamp = datetime.now(timezone.utc)
+            benchflagstr = f"NoAVX2={args.noavx2} Threads={args.threads} HighPriority={args.highpriority} NoBlas={args.noblas} Cublas_Args={args.usecublas} Tensor_Split={args.tensor_split} BlasThreads={args.blasthreads} BlasBatchSize={args.blasbatchsize} FlashAttention={args.flashattention} KvCache={args.quantkv}"
+            print(f"\nBenchmark Completed - v{KcppVersion} Results:\n======")
+            print(f"Flags: {benchflagstr}")
+            print(f"Timestamp: {datetimestamp}")
+            print(f"Backend: {libname}")
+            print(f"Layers: {args.gpulayers}")
+            print(f"Model: {benchmodel}")
+            print(f"MaxCtx: {benchmaxctx}")
+            print(f"GenAmount: {benchlen}\n-----")
+            print(f"ProcessingTime: {t_pp:.3f}s")
+            print(f"ProcessingSpeed: {s_pp:.2f}T/s")
+            print(f"GenerationTime: {t_gen:.3f}s")
+            print(f"GenerationSpeed: {s_gen:.2f}T/s")
+            print(f"TotalTime: {(t_pp+t_gen):.3f}s")
+            print(f"Output: {result}\n-----")
+            if save_to_file:
+                try:
+                    with open(args.benchmark, "a") as file:
+                        file.seek(0, 2)
+                        if file.tell() == 0: #empty file
+                            file.write(f"Timestamp,Backend,Layers,Model,MaxCtx,GenAmount,ProcessingTime,ProcessingSpeed,GenerationTime,GenerationSpeed,TotalTime,Output,Flags")
+                        file.write(f"\n{datetimestamp},{libname},{args.gpulayers},{benchmodel},{benchmaxctx},{benchlen},{t_pp:.2f},{s_pp:.2f},{t_gen:.2f},{s_gen:.2f},{(t_pp+t_gen):.2f},{result},{benchflagstr}")
+                except Exception as e:
+                    print(f"Error writing benchmark to file: {e}")
+            global using_gui_launcher
+            if using_gui_launcher and not save_to_file:
+                print("===")
+                print("Press ENTER key to exit.", flush=True)
+                input()
 
     check_deprecation_warning()
     if start_server:
@@ -4228,7 +4277,8 @@ def main(launch_args,start_server=True):
         asyncio.run(RunServerMultiThreaded(args.host, args.port))
     else:
         # Flush stdout for previous win32 issue so the client can see output.
-        print(f"Server was not started, main function complete. Idling.", flush=True)
+        if not args.prompt or args.benchmark:
+            print(f"Server was not started, main function complete. Idling.", flush=True)
 
 def run_in_queue(launch_args, input_queue, output_queue):
     main(launch_args, start_server=False)
@@ -4265,8 +4315,6 @@ if __name__ == '__main__':
             return f
         return range_checker
 
-    print(f"***\nWelcome to KoboldCpp - Version {KcppVersion}") # just update version manually
-    # print("Python version: " + sys.version)
     parser = argparse.ArgumentParser(description='KoboldCpp Server')
     modelgroup = parser.add_mutually_exclusive_group() #we want to be backwards compatible with the unnamed positional args
     modelgroup.add_argument("--model", metavar=('[filename]'), help="Model file to load", type=str, default="")
@@ -4302,6 +4350,7 @@ if __name__ == '__main__':
     advparser.add_argument("--skiplauncher", help="Doesn't display or use the GUI launcher.", action='store_true')
     advparser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", metavar=('[shell command]'), type=str, default="",nargs=1)
     advparser.add_argument("--benchmark", help="Do not start server, instead run benchmarks. If filename is provided, appends results to provided file.", metavar=('[filename]'), nargs='?', const="stdout", type=str, default=None)
+    advparser.add_argument("--prompt", metavar=('[prompt]'), help="Passing a prompt string triggers a direct inference, loading the model, outputs the response to stdout and exits. Can be used alone or with benchmark.", type=str, default="")
     advparser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", metavar=('limit'), nargs='?', const=1, type=int, default=1)
     advparser.add_argument("--remotetunnel", help="Uses Cloudflare to create a remote tunnel, allowing you to access koboldcpp remotely over the internet even behind a firewall.", action='store_true')
     advparser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')