From 853d57c53c92f1ed39fc75758a3ea3780e9a7c99 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Tue, 6 Aug 2024 21:54:08 +0800 Subject: [PATCH] wip prompt --- koboldcpp.py | 153 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 101 insertions(+), 52 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index 4fb1156a5..e96b51056 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -74,6 +74,13 @@ currfinishreason = "null" using_gui_launcher = False using_outdated_flags = False +saved_stdout = None +saved_stderr = None +saved_stdout_py = None +saved_stderr_py = None +stdout_nullfile = None +stdout_nullfile_py = None + CLDevices = ["1","2","3","4"] CUDevices = ["1","2","3","4","All"] CLDevicesNames = ["","","",""] @@ -224,6 +231,34 @@ def getabspath(): def file_exists(filename): return os.path.exists(os.path.join(getdirpath(), filename)) +def suppress_stdout(): + global saved_stdout, saved_stderr, saved_stdout_py, saved_stderr_py, stdout_nullfile, stdout_nullfile_py + if not saved_stdout and not saved_stderr and not saved_stdout_py and not saved_stderr_py and not stdout_nullfile and not stdout_nullfile_py: + sys.stdout.flush() + sys.stderr.flush() + saved_stdout = os.dup(sys.stdout.fileno()) + saved_stderr = os.dup(sys.stderr.fileno()) + saved_stderr_py = sys.stderr + saved_stdout_py = sys.stdout + stdout_nullfile = os.open(os.devnull, os.O_WRONLY) + stdout_nullfile_py = open(os.devnull, 'w') + os.dup2(stdout_nullfile, sys.stdout.fileno()) + os.dup2(stdout_nullfile, sys.stderr.fileno()) + sys.stderr = sys.stdout = stdout_nullfile_py + +def restore_stdout(): + global saved_stdout, saved_stderr, saved_stdout_py, saved_stderr_py, stdout_nullfile, stdout_nullfile_py + if saved_stdout and saved_stderr and saved_stdout_py and saved_stderr_py and stdout_nullfile and stdout_nullfile_py: + sys.stdout = saved_stdout_py + sys.stderr = saved_stderr_py + os.dup2(saved_stdout, sys.stdout.fileno()) + os.dup2(saved_stderr, sys.stderr.fileno()) + os.close(stdout_nullfile) + stdout_nullfile_py.close() + os.close(saved_stdout) + os.close(saved_stderr) + saved_stdout = saved_stderr = saved_stdout_py = saved_stderr_py = stdout_nullfile = stdout_nullfile_py = None + def get_default_threads(): physical_core_limit = 1 if os.cpu_count()!=None and os.cpu_count()>1: @@ -3724,13 +3759,19 @@ def main(launch_args,start_server=True): global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath + args = launch_args + if (args.model_param or args.model) and args.prompt and not args.benchmark: + suppress_stdout() + + print(f"***\nWelcome to KoboldCpp - Version {KcppVersion}") # just update version manually + # print("Python version: " + sys.version) + #perform some basic cleanup of old temporary directories try: delete_old_pyinstaller() except Exception as e: print(f"Error cleaning up orphaned pyinstaller dirs: {e}") - args = launch_args if args.unpack: unpack_to_dir(args.unpack) return @@ -4162,61 +4203,69 @@ def main(launch_args,start_server=True): timer_thread = threading.Timer(1, onready_subprocess) #1 second delay timer_thread.start() - if args.model_param and args.benchmark: + if args.model_param and (args.benchmark or args.prompt): from datetime import datetime, timezone start_server = False - save_to_file = (args.benchmark!="stdout" and args.benchmark!="") + save_to_file = (args.benchmark and args.benchmark!="stdout" and args.benchmark!="") benchmaxctx = maxctx benchlen = 100 benchmodel = sanitize_string(os.path.splitext(os.path.basename(modelname))[0]) - if os.path.exists(args.benchmark) and os.path.getsize(args.benchmark) > 1000000: - print(f"\nWarning: The benchmark CSV output file you selected exceeds 1MB. This is probably not what you want, did you select the wrong CSV file?\nFor safety, benchmark output will not be saved.") - save_to_file = False - if save_to_file: - print(f"\nRunning benchmark (Save to File: {args.benchmark})...") - else: - print(f"\nRunning benchmark (Not Saved)...") - - benchprompt = "1111111111111111" - for i in range(0,14): #generate massive prompt - benchprompt += benchprompt + benchprompt = "" + if args.prompt: + benchprompt = args.prompt + if args.benchmark: + if os.path.exists(args.benchmark) and os.path.getsize(args.benchmark) > 1000000: + print(f"\nWarning: The benchmark CSV output file you selected exceeds 1MB. This is probably not what you want, did you select the wrong CSV file?\nFor safety, benchmark output will not be saved.") + save_to_file = False + if save_to_file: + print(f"\nRunning benchmark (Save to File: {args.benchmark})...") + else: + print(f"\nRunning benchmark (Not Saved)...") + if benchprompt=="": + benchprompt = "1111111111111111" + for i in range(0,14): #generate massive prompt + benchprompt += benchprompt genout = generate(benchprompt,memory="",images=[],max_length=benchlen,max_context_length=benchmaxctx,temperature=0.1,top_k=1,rep_pen=1,ban_eos_token=True) result = genout['text'] - result = (result[:5] if len(result)>5 else "") - t_pp = float(handle.get_last_process_time())*float(benchmaxctx-benchlen)*0.001 - t_gen = float(handle.get_last_eval_time())*float(benchlen)*0.001 - s_pp = float(benchmaxctx-benchlen)/t_pp - s_gen = float(benchlen)/t_gen - datetimestamp = datetime.now(timezone.utc) - benchflagstr = f"NoAVX2={args.noavx2} Threads={args.threads} HighPriority={args.highpriority} NoBlas={args.noblas} Cublas_Args={args.usecublas} Tensor_Split={args.tensor_split} BlasThreads={args.blasthreads} BlasBatchSize={args.blasbatchsize} FlashAttention={args.flashattention} KvCache={args.quantkv}" - print(f"\nBenchmark Completed - v{KcppVersion} Results:\n======") - print(f"Flags: {benchflagstr}") - print(f"Timestamp: {datetimestamp}") - print(f"Backend: {libname}") - print(f"Layers: {args.gpulayers}") - print(f"Model: {benchmodel}") - print(f"MaxCtx: {benchmaxctx}") - print(f"GenAmount: {benchlen}\n-----") - print(f"ProcessingTime: {t_pp:.3f}s") - print(f"ProcessingSpeed: {s_pp:.2f}T/s") - print(f"GenerationTime: {t_gen:.3f}s") - print(f"GenerationSpeed: {s_gen:.2f}T/s") - print(f"TotalTime: {(t_pp+t_gen):.3f}s") - print(f"Output: {result}\n-----") - if save_to_file: - try: - with open(args.benchmark, "a") as file: - file.seek(0, 2) - if file.tell() == 0: #empty file - file.write(f"Timestamp,Backend,Layers,Model,MaxCtx,GenAmount,ProcessingTime,ProcessingSpeed,GenerationTime,GenerationSpeed,TotalTime,Output,Flags") - file.write(f"\n{datetimestamp},{libname},{args.gpulayers},{benchmodel},{benchmaxctx},{benchlen},{t_pp:.2f},{s_pp:.2f},{t_gen:.2f},{s_gen:.2f},{(t_pp+t_gen):.2f},{result},{benchflagstr}") - except Exception as e: - print(f"Error writing benchmark to file: {e}") - global using_gui_launcher - if using_gui_launcher and not save_to_file: - print("===") - print("Press ENTER key to exit.", flush=True) - input() + if args.prompt and not args.benchmark: + restore_stdout() + print(result) + if args.benchmark: + result = (result[:8] if len(result)>8 else "") if not args.prompt else result + t_pp = float(handle.get_last_process_time())*float(benchmaxctx-benchlen)*0.001 + t_gen = float(handle.get_last_eval_time())*float(benchlen)*0.001 + s_pp = float(benchmaxctx-benchlen)/t_pp + s_gen = float(benchlen)/t_gen + datetimestamp = datetime.now(timezone.utc) + benchflagstr = f"NoAVX2={args.noavx2} Threads={args.threads} HighPriority={args.highpriority} NoBlas={args.noblas} Cublas_Args={args.usecublas} Tensor_Split={args.tensor_split} BlasThreads={args.blasthreads} BlasBatchSize={args.blasbatchsize} FlashAttention={args.flashattention} KvCache={args.quantkv}" + print(f"\nBenchmark Completed - v{KcppVersion} Results:\n======") + print(f"Flags: {benchflagstr}") + print(f"Timestamp: {datetimestamp}") + print(f"Backend: {libname}") + print(f"Layers: {args.gpulayers}") + print(f"Model: {benchmodel}") + print(f"MaxCtx: {benchmaxctx}") + print(f"GenAmount: {benchlen}\n-----") + print(f"ProcessingTime: {t_pp:.3f}s") + print(f"ProcessingSpeed: {s_pp:.2f}T/s") + print(f"GenerationTime: {t_gen:.3f}s") + print(f"GenerationSpeed: {s_gen:.2f}T/s") + print(f"TotalTime: {(t_pp+t_gen):.3f}s") + print(f"Output: {result}\n-----") + if save_to_file: + try: + with open(args.benchmark, "a") as file: + file.seek(0, 2) + if file.tell() == 0: #empty file + file.write(f"Timestamp,Backend,Layers,Model,MaxCtx,GenAmount,ProcessingTime,ProcessingSpeed,GenerationTime,GenerationSpeed,TotalTime,Output,Flags") + file.write(f"\n{datetimestamp},{libname},{args.gpulayers},{benchmodel},{benchmaxctx},{benchlen},{t_pp:.2f},{s_pp:.2f},{t_gen:.2f},{s_gen:.2f},{(t_pp+t_gen):.2f},{result},{benchflagstr}") + except Exception as e: + print(f"Error writing benchmark to file: {e}") + global using_gui_launcher + if using_gui_launcher and not save_to_file: + print("===") + print("Press ENTER key to exit.", flush=True) + input() check_deprecation_warning() if start_server: @@ -4228,7 +4277,8 @@ def main(launch_args,start_server=True): asyncio.run(RunServerMultiThreaded(args.host, args.port)) else: # Flush stdout for previous win32 issue so the client can see output. - print(f"Server was not started, main function complete. Idling.", flush=True) + if not args.prompt or args.benchmark: + print(f"Server was not started, main function complete. Idling.", flush=True) def run_in_queue(launch_args, input_queue, output_queue): main(launch_args, start_server=False) @@ -4265,8 +4315,6 @@ if __name__ == '__main__': return f return range_checker - print(f"***\nWelcome to KoboldCpp - Version {KcppVersion}") # just update version manually - # print("Python version: " + sys.version) parser = argparse.ArgumentParser(description='KoboldCpp Server') modelgroup = parser.add_mutually_exclusive_group() #we want to be backwards compatible with the unnamed positional args modelgroup.add_argument("--model", metavar=('[filename]'), help="Model file to load", type=str, default="") @@ -4302,6 +4350,7 @@ if __name__ == '__main__': advparser.add_argument("--skiplauncher", help="Doesn't display or use the GUI launcher.", action='store_true') advparser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", metavar=('[shell command]'), type=str, default="",nargs=1) advparser.add_argument("--benchmark", help="Do not start server, instead run benchmarks. If filename is provided, appends results to provided file.", metavar=('[filename]'), nargs='?', const="stdout", type=str, default=None) + advparser.add_argument("--prompt", metavar=('[prompt]'), help="Passing a prompt string triggers a direct inference, loading the model, outputs the response to stdout and exits. Can be used alone or with benchmark.", type=str, default="") advparser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", metavar=('limit'), nargs='?', const=1, type=int, default=1) advparser.add_argument("--remotetunnel", help="Uses Cloudflare to create a remote tunnel, allowing you to access koboldcpp remotely over the internet even behind a firewall.", action='store_true') advparser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')