mirror of https://github.com/LostRuins/koboldcpp.git

commit 853d57c53c
parent c23d91987a

wip prompt

1 changed file with 101 additions and 52 deletions: koboldcpp.py (153 lines changed)
@@ -74,6 +74,13 @@ currfinishreason = "null"
 using_gui_launcher = False
 using_outdated_flags = False
 
+saved_stdout = None
+saved_stderr = None
+saved_stdout_py = None
+saved_stderr_py = None
+stdout_nullfile = None
+stdout_nullfile_py = None
+
 CLDevices = ["1","2","3","4"]
 CUDevices = ["1","2","3","4","All"]
 CLDevicesNames = ["","","",""]
@@ -224,6 +231,34 @@ def getabspath():
 def file_exists(filename):
     return os.path.exists(os.path.join(getdirpath(), filename))
 
+def suppress_stdout():
+    global saved_stdout, saved_stderr, saved_stdout_py, saved_stderr_py, stdout_nullfile, stdout_nullfile_py
+    if not saved_stdout and not saved_stderr and not saved_stdout_py and not saved_stderr_py and not stdout_nullfile and not stdout_nullfile_py:
+        sys.stdout.flush()
+        sys.stderr.flush()
+        saved_stdout = os.dup(sys.stdout.fileno())
+        saved_stderr = os.dup(sys.stderr.fileno())
+        saved_stderr_py = sys.stderr
+        saved_stdout_py = sys.stdout
+        stdout_nullfile = os.open(os.devnull, os.O_WRONLY)
+        stdout_nullfile_py = open(os.devnull, 'w')
+        os.dup2(stdout_nullfile, sys.stdout.fileno())
+        os.dup2(stdout_nullfile, sys.stderr.fileno())
+        sys.stderr = sys.stdout = stdout_nullfile_py
+
+def restore_stdout():
+    global saved_stdout, saved_stderr, saved_stdout_py, saved_stderr_py, stdout_nullfile, stdout_nullfile_py
+    if saved_stdout and saved_stderr and saved_stdout_py and saved_stderr_py and stdout_nullfile and stdout_nullfile_py:
+        sys.stdout = saved_stdout_py
+        sys.stderr = saved_stderr_py
+        os.dup2(saved_stdout, sys.stdout.fileno())
+        os.dup2(saved_stderr, sys.stderr.fileno())
+        os.close(stdout_nullfile)
+        stdout_nullfile_py.close()
+        os.close(saved_stdout)
+        os.close(saved_stderr)
+        saved_stdout = saved_stderr = saved_stdout_py = saved_stderr_py = stdout_nullfile = stdout_nullfile_py = None
+
 def get_default_threads():
     physical_core_limit = 1
     if os.cpu_count()!=None and os.cpu_count()>1:
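The suppress_stdout()/restore_stdout() pair added above silences output at two levels at once: os.dup()/os.dup2() rewire the process-level file descriptors 1 and 2 (so output written by native code, such as the llama.cpp backend, also lands in /dev/null), while saving and swapping sys.stdout/sys.stderr handles the Python-level stream objects. A minimal standalone sketch of the same descriptor-swapping technique (illustrative code, not part of the commit):

import os, sys

def fd_silence_demo():
    sys.stdout.flush()                         # drain Python's buffer first
    saved_fd = os.dup(sys.stdout.fileno())     # keep a handle to the real stdout
    null_fd = os.open(os.devnull, os.O_WRONLY)
    os.dup2(null_fd, sys.stdout.fileno())      # fd 1 now points at /dev/null
    print("this line is swallowed")            # Python and C-level writes both vanish
    os.dup2(saved_fd, sys.stdout.fileno())     # restore the original fd 1
    os.close(null_fd)
    os.close(saved_fd)
    print("this line is visible")

fd_silence_demo()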
@@ -3724,13 +3759,19 @@ def main(launch_args,start_server=True):
     global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui
     global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, mmprojpath, password, fullwhispermodelpath
 
+    args = launch_args
+    if (args.model_param or args.model) and args.prompt and not args.benchmark:
+        suppress_stdout()
+
+    print(f"***\nWelcome to KoboldCpp - Version {KcppVersion}") # just update version manually
+    # print("Python version: " + sys.version)
+
     #perform some basic cleanup of old temporary directories
     try:
         delete_old_pyinstaller()
     except Exception as e:
         print(f"Error cleaning up orphaned pyinstaller dirs: {e}")
 
-    args = launch_args
     if args.unpack:
         unpack_to_dir(args.unpack)
         return
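(The welcome banner and the commented-out Python-version print move here from the __main__ block, see the second-to-last hunk below, so that suppress_stdout() can silence them when --prompt triggers a direct inference. args = launch_args likewise moves up, ahead of the new check, and its old occurrence is removed.)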
@@ -4162,61 +4203,69 @@ def main(launch_args,start_server=True):
         timer_thread = threading.Timer(1, onready_subprocess) #1 second delay
         timer_thread.start()
 
-    if args.model_param and args.benchmark:
+    if args.model_param and (args.benchmark or args.prompt):
         from datetime import datetime, timezone
         start_server = False
-        save_to_file = (args.benchmark!="stdout" and args.benchmark!="")
+        save_to_file = (args.benchmark and args.benchmark!="stdout" and args.benchmark!="")
         benchmaxctx = maxctx
         benchlen = 100
         benchmodel = sanitize_string(os.path.splitext(os.path.basename(modelname))[0])
-        if os.path.exists(args.benchmark) and os.path.getsize(args.benchmark) > 1000000:
-            print(f"\nWarning: The benchmark CSV output file you selected exceeds 1MB. This is probably not what you want, did you select the wrong CSV file?\nFor safety, benchmark output will not be saved.")
-            save_to_file = False
-        if save_to_file:
-            print(f"\nRunning benchmark (Save to File: {args.benchmark})...")
-        else:
-            print(f"\nRunning benchmark (Not Saved)...")
-
-        benchprompt = "1111111111111111"
-        for i in range(0,14): #generate massive prompt
-            benchprompt += benchprompt
+        benchprompt = ""
+        if args.prompt:
+            benchprompt = args.prompt
+        if args.benchmark:
+            if os.path.exists(args.benchmark) and os.path.getsize(args.benchmark) > 1000000:
+                print(f"\nWarning: The benchmark CSV output file you selected exceeds 1MB. This is probably not what you want, did you select the wrong CSV file?\nFor safety, benchmark output will not be saved.")
+                save_to_file = False
+            if save_to_file:
+                print(f"\nRunning benchmark (Save to File: {args.benchmark})...")
+            else:
+                print(f"\nRunning benchmark (Not Saved)...")
+            if benchprompt=="":
+                benchprompt = "1111111111111111"
+                for i in range(0,14): #generate massive prompt
+                    benchprompt += benchprompt
         genout = generate(benchprompt,memory="",images=[],max_length=benchlen,max_context_length=benchmaxctx,temperature=0.1,top_k=1,rep_pen=1,ban_eos_token=True)
         result = genout['text']
-        result = (result[:5] if len(result)>5 else "")
-        t_pp = float(handle.get_last_process_time())*float(benchmaxctx-benchlen)*0.001
-        t_gen = float(handle.get_last_eval_time())*float(benchlen)*0.001
-        s_pp = float(benchmaxctx-benchlen)/t_pp
-        s_gen = float(benchlen)/t_gen
-        datetimestamp = datetime.now(timezone.utc)
-        benchflagstr = f"NoAVX2={args.noavx2} Threads={args.threads} HighPriority={args.highpriority} NoBlas={args.noblas} Cublas_Args={args.usecublas} Tensor_Split={args.tensor_split} BlasThreads={args.blasthreads} BlasBatchSize={args.blasbatchsize} FlashAttention={args.flashattention} KvCache={args.quantkv}"
-        print(f"\nBenchmark Completed - v{KcppVersion} Results:\n======")
-        print(f"Flags: {benchflagstr}")
-        print(f"Timestamp: {datetimestamp}")
-        print(f"Backend: {libname}")
-        print(f"Layers: {args.gpulayers}")
-        print(f"Model: {benchmodel}")
-        print(f"MaxCtx: {benchmaxctx}")
-        print(f"GenAmount: {benchlen}\n-----")
-        print(f"ProcessingTime: {t_pp:.3f}s")
-        print(f"ProcessingSpeed: {s_pp:.2f}T/s")
-        print(f"GenerationTime: {t_gen:.3f}s")
-        print(f"GenerationSpeed: {s_gen:.2f}T/s")
-        print(f"TotalTime: {(t_pp+t_gen):.3f}s")
-        print(f"Output: {result}\n-----")
-        if save_to_file:
-            try:
-                with open(args.benchmark, "a") as file:
-                    file.seek(0, 2)
-                    if file.tell() == 0: #empty file
-                        file.write(f"Timestamp,Backend,Layers,Model,MaxCtx,GenAmount,ProcessingTime,ProcessingSpeed,GenerationTime,GenerationSpeed,TotalTime,Output,Flags")
-                    file.write(f"\n{datetimestamp},{libname},{args.gpulayers},{benchmodel},{benchmaxctx},{benchlen},{t_pp:.2f},{s_pp:.2f},{t_gen:.2f},{s_gen:.2f},{(t_pp+t_gen):.2f},{result},{benchflagstr}")
-            except Exception as e:
-                print(f"Error writing benchmark to file: {e}")
-        global using_gui_launcher
-        if using_gui_launcher and not save_to_file:
-            print("===")
-            print("Press ENTER key to exit.", flush=True)
-            input()
+        if args.prompt and not args.benchmark:
+            restore_stdout()
+            print(result)
+        if args.benchmark:
+            result = (result[:8] if len(result)>8 else "") if not args.prompt else result
+            t_pp = float(handle.get_last_process_time())*float(benchmaxctx-benchlen)*0.001
+            t_gen = float(handle.get_last_eval_time())*float(benchlen)*0.001
+            s_pp = float(benchmaxctx-benchlen)/t_pp
+            s_gen = float(benchlen)/t_gen
+            datetimestamp = datetime.now(timezone.utc)
+            benchflagstr = f"NoAVX2={args.noavx2} Threads={args.threads} HighPriority={args.highpriority} NoBlas={args.noblas} Cublas_Args={args.usecublas} Tensor_Split={args.tensor_split} BlasThreads={args.blasthreads} BlasBatchSize={args.blasbatchsize} FlashAttention={args.flashattention} KvCache={args.quantkv}"
+            print(f"\nBenchmark Completed - v{KcppVersion} Results:\n======")
+            print(f"Flags: {benchflagstr}")
+            print(f"Timestamp: {datetimestamp}")
+            print(f"Backend: {libname}")
+            print(f"Layers: {args.gpulayers}")
+            print(f"Model: {benchmodel}")
+            print(f"MaxCtx: {benchmaxctx}")
+            print(f"GenAmount: {benchlen}\n-----")
+            print(f"ProcessingTime: {t_pp:.3f}s")
+            print(f"ProcessingSpeed: {s_pp:.2f}T/s")
+            print(f"GenerationTime: {t_gen:.3f}s")
+            print(f"GenerationSpeed: {s_gen:.2f}T/s")
+            print(f"TotalTime: {(t_pp+t_gen):.3f}s")
+            print(f"Output: {result}\n-----")
+            if save_to_file:
+                try:
+                    with open(args.benchmark, "a") as file:
+                        file.seek(0, 2)
+                        if file.tell() == 0: #empty file
+                            file.write(f"Timestamp,Backend,Layers,Model,MaxCtx,GenAmount,ProcessingTime,ProcessingSpeed,GenerationTime,GenerationSpeed,TotalTime,Output,Flags")
+                        file.write(f"\n{datetimestamp},{libname},{args.gpulayers},{benchmodel},{benchmaxctx},{benchlen},{t_pp:.2f},{s_pp:.2f},{t_gen:.2f},{s_gen:.2f},{(t_pp+t_gen):.2f},{result},{benchflagstr}")
+                except Exception as e:
+                    print(f"Error writing benchmark to file: {e}")
+            global using_gui_launcher
+            if using_gui_launcher and not save_to_file:
+                print("===")
+                print("Press ENTER key to exit.", flush=True)
+                input()
 
     check_deprecation_warning()
     if start_server:
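Two numeric details in this hunk are easy to miss. First, the fallback benchmark prompt doubles a 16-character seed 14 times, giving 16 * 2^14 = 262,144 characters, enough to saturate any supported context window. Second, the timing formulas only make sense if handle.get_last_process_time() and handle.get_last_eval_time() report milliseconds per token, so multiplying by the token count and by 0.001 yields total seconds. A worked sketch of the same arithmetic (the per-token timings here are assumed, illustrative numbers, not measured results):

# Illustrative only: mirrors the diff's formulas with made-up timings.
benchmaxctx, benchlen = 4096, 100
ms_per_prompt_token = 1.5    # hypothetical value for handle.get_last_process_time()
ms_per_gen_token = 30.0      # hypothetical value for handle.get_last_eval_time()

t_pp = ms_per_prompt_token * (benchmaxctx - benchlen) * 0.001   # prompt processing time, seconds
t_gen = ms_per_gen_token * benchlen * 0.001                     # generation time, seconds
s_pp = (benchmaxctx - benchlen) / t_pp                          # prompt tokens per second
s_gen = benchlen / t_gen                                        # generated tokens per second
print(f"ProcessingTime: {t_pp:.3f}s ProcessingSpeed: {s_pp:.2f}T/s")
print(f"GenerationTime: {t_gen:.3f}s GenerationSpeed: {s_gen:.2f}T/s")

# The fallback benchmark prompt: a 16-char seed doubled 14 times.
benchprompt = "1111111111111111"
for _ in range(14):
    benchprompt += benchprompt
assert len(benchprompt) == 16 * 2**14   # 262,144 characters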
@@ -4228,7 +4277,8 @@ def main(launch_args,start_server=True):
         asyncio.run(RunServerMultiThreaded(args.host, args.port))
     else:
         # Flush stdout for previous win32 issue so the client can see output.
-        print(f"Server was not started, main function complete. Idling.", flush=True)
+        if not args.prompt or args.benchmark:
+            print(f"Server was not started, main function complete. Idling.", flush=True)
 
 def run_in_queue(launch_args, input_queue, output_queue):
     main(launch_args, start_server=False)
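(The new guard keeps the "Idling" message off stdout when a bare --prompt run finishes, so the only thing the caller sees on stdout is the generated text; benchmark runs still print it.)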
@@ -4265,8 +4315,6 @@ if __name__ == '__main__':
             return f
         return range_checker
 
-    print(f"***\nWelcome to KoboldCpp - Version {KcppVersion}") # just update version manually
-    # print("Python version: " + sys.version)
     parser = argparse.ArgumentParser(description='KoboldCpp Server')
     modelgroup = parser.add_mutually_exclusive_group() #we want to be backwards compatible with the unnamed positional args
     modelgroup.add_argument("--model", metavar=('[filename]'), help="Model file to load", type=str, default="")
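The `return f` / `return range_checker` context lines above are the tail of a small validator factory for argparse: an outer function captures numeric bounds and returns a checker that argparse invokes per argument. The full function is not shown in this diff; the following is a plausible sketch of the pattern those two lines imply (an assumption, reconstructed for illustration):

import argparse

# Hypothetical reconstruction; only the last two lines appear in the diff.
def check_range(value_type, min_value, max_value):
    def range_checker(arg: str):
        try:
            f = value_type(arg)
        except ValueError:
            raise argparse.ArgumentTypeError(f"must be a valid {value_type.__name__}")
        if f < min_value or f > max_value:
            raise argparse.ArgumentTypeError(f"must be in range [{min_value}, {max_value}]")
        return f
    return range_checker

# Typical usage with a parser:
# parser.add_argument("--somevalue", type=check_range(int, 1, 100))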
@@ -4302,6 +4350,7 @@ if __name__ == '__main__':
     advparser.add_argument("--skiplauncher", help="Doesn't display or use the GUI launcher.", action='store_true')
     advparser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", metavar=('[shell command]'), type=str, default="",nargs=1)
     advparser.add_argument("--benchmark", help="Do not start server, instead run benchmarks. If filename is provided, appends results to provided file.", metavar=('[filename]'), nargs='?', const="stdout", type=str, default=None)
+    advparser.add_argument("--prompt", metavar=('[prompt]'), help="Passing a prompt string triggers a direct inference, loading the model, outputs the response to stdout and exits. Can be used alone or with benchmark.", type=str, default="")
     advparser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", metavar=('limit'), nargs='?', const=1, type=int, default=1)
     advparser.add_argument("--remotetunnel", help="Uses Cloudflare to create a remote tunnel, allowing you to access koboldcpp remotely over the internet even behind a firewall.", action='store_true')
     advparser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')
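Note how nargs='?' with const and default gives --benchmark three modes, which is what the save_to_file logic in the earlier hunk relies on: omitted entirely it stays None (normal server start), passed bare it becomes "stdout" (run the benchmark, save nothing), and passed a filename it appends CSV rows to that file. A quick sketch of that argparse behavior:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--benchmark", nargs='?', const="stdout", type=str, default=None)

print(parser.parse_args([]).benchmark)                          # None: flag absent, start server normally
print(parser.parse_args(["--benchmark"]).benchmark)             # "stdout": run benchmark, don't save
print(parser.parse_args(["--benchmark", "out.csv"]).benchmark)  # "out.csv": append results to file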