From 66134bb36ee72bfdf31f4433ca317a469c9bc85f Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Thu, 29 Feb 2024 17:08:22 +0800
Subject: [PATCH] ui for loading SD models done

---
 expose.h                       |   1 +
 koboldcpp.py                   | 176 ++++++++++++++++++++++-----------
 otherarch/sdcpp/sd_adapter.cpp |  22 +++++
 3 files changed, 142 insertions(+), 57 deletions(-)

diff --git a/expose.h b/expose.h
index 872003ff5..6ee13d00e 100644
--- a/expose.h
+++ b/expose.h
@@ -102,6 +102,7 @@ struct token_count_outputs
 struct load_sd_model_inputs
 {
     const char * model_filename;
+    const int debugmode = 0;
 };
 struct sd_generation_inputs
 {
diff --git a/koboldcpp.py b/koboldcpp.py
index e9d7d317d..86c0dc2d1 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -89,6 +89,22 @@ class generation_outputs(ctypes.Structure):
     _fields_ = [("status", ctypes.c_int),
                 ("text", ctypes.c_char_p)]
 
+class sd_load_model_inputs(ctypes.Structure):
+    _fields_ = [("model_filename", ctypes.c_char_p),
+                ("debugmode", ctypes.c_int)]
+
+class sd_generation_inputs(ctypes.Structure):
+    _fields_ = [("prompt", ctypes.c_char_p),
+                ("negative_prompt", ctypes.c_char_p),
+                ("cfg_scale", ctypes.c_float),
+                ("sample_steps", ctypes.c_int),
+                ("seed", ctypes.c_int),
+                ("sample_method", ctypes.c_char_p)]
+
+class sd_generation_outputs(ctypes.Structure):
+    _fields_ = [("status", ctypes.c_int),
+                ("data", ctypes.c_char_p)]
+
 class token_count_outputs(ctypes.Structure):
     _fields_ = [("count", ctypes.c_int),
                 ("ids", ctypes.POINTER(ctypes.c_int))]
@@ -257,6 +273,10 @@ def init_library():
     handle.abort_generate.restype = ctypes.c_bool
     handle.token_count.restype = token_count_outputs
     handle.get_pending_output.restype = ctypes.c_char_p
+    handle.load_model_sd.argtypes = [sd_load_model_inputs]
+    handle.load_model_sd.restype = ctypes.c_bool
+    handle.generate_sd.argtypes = [sd_generation_inputs]
+    handle.generate_sd.restype = sd_generation_outputs
 
 def load_model(model_filename):
     global args
@@ -448,6 +468,15 @@ def generate(prompt, memory="", max_length=32, max_context_length=512, temperatu
                 outstr = outstr[:sindex]
     return outstr
 
+
+def load_model_sd(model_filename):
+    global args
+    inputs = sd_load_model_inputs()
+    inputs.debugmode = args.debugmode
+    inputs.model_filename = model_filename.encode("UTF-8")
+    ret = handle.load_model_sd(inputs)
+    return ret
+
 def utfprint(str):
     try:
         print(str)
@@ -1146,7 +1175,7 @@ def show_new_gui():
     root.destroy()
     if args.model_param and args.model_param!="" and args.model_param.lower().endswith('.kcpps'):
         loadconfigfile(args.model_param)
-    if not args.model_param:
+    if not args.model_param and not args.sdconfig:
        global exitcounter
        exitcounter = 999
        print("\nNo ggml model or kcpps file was selected. Exiting.")
@@ -1192,7 +1221,7 @@ def show_new_gui():
     tabs = ctk.CTkFrame(root, corner_radius = 0, width=windowwidth, height=windowheight-50)
     tabs.grid(row=0, stick="nsew")
 
-    tabnames= ["Quick Launch", "Hardware", "Tokens", "Model", "Network"]
+    tabnames= ["Quick Launch", "Hardware", "Tokens", "Model", "Network","Image Gen"]
     navbuttons = {}
     navbuttonframe = ctk.CTkFrame(tabs, width=100, height=int(tabs.cget("height")))
     navbuttonframe.grid(row=0, column=0, padx=2,pady=2)
@@ -1283,6 +1312,9 @@ def show_new_gui():
     ssl_cert_var = ctk.StringVar()
     ssl_key_var = ctk.StringVar()
 
+    sd_model_var = ctk.StringVar()
+    sd_quick_var = ctk.IntVar(value=0)
+
     def tabbuttonaction(name):
         for t in tabcontent:
             if name == t:
@@ -1755,9 +1787,14 @@ def show_new_gui():
     makecheckbox(network_tab, "Configure for Horde", usehorde_var, 19, command=togglehorde,tooltiptxt="Enable the embedded AI Horde worker.")
     togglehorde(1,1,1)
 
+    # Image Gen Tab
+    images_tab = tabcontent["Image Gen"]
+    makefileentry(images_tab, "Stable Diffusion Model (f16):", "Select Stable Diffusion Model File", sd_model_var, 1, filetypes=[("*.safetensors","*.safetensors")], tooltiptxt="Select a .safetensors Stable Diffusion model file on disk to be loaded.")
+    makecheckbox(images_tab, "Quick Mode (Low Quality)", sd_quick_var, 4,tooltiptxt="Force optimal generation settings for speed.")
+
     # launch
     def guilaunch():
-        if model_var.get() == "":
+        if model_var.get() == "" and sd_model_var.get() == "":
             tmp = askopenfilename(title="Select ggml model .bin or .gguf file")
             model_var.set(tmp)
         nonlocal nextstate
@@ -1841,6 +1878,8 @@ def show_new_gui():
         else:
             args.hordeconfig = None if usehorde_var.get() == 0 else [horde_name_var.get(), horde_gen_var.get(), horde_context_var.get(), horde_apikey_var.get(), horde_workername_var.get()]
 
+        args.sdconfig = None if sd_model_var.get() == "" else [sd_model_var.get(), ("quick" if sd_quick_var.get()==1 else "normal")]
+
     def import_vars(dict):
         if "threads" in dict:
             threads_var.set(dict["threads"])
@@ -1968,6 +2007,11 @@ def show_new_gui():
             horde_workername_var.set(dict["hordeconfig"][4])
             usehorde_var.set("1")
 
+        if "sdconfig" in dict and dict["sdconfig"] and len(dict["sdconfig"]) > 0:
+            sd_model_var.set(dict["sdconfig"][0])
+            if len(dict["sdconfig"]) > 1:
+                sd_quick_var.set(1 if dict["sdconfig"][1]=="quick" else 0)
+
     def save_config():
         file_type = [("KoboldCpp Settings", "*.kcpps")]
         filename = asksaveasfile(filetypes=file_type, defaultextension=file_type)
@@ -2026,9 +2070,9 @@ def show_new_gui():
     # processing vars
     export_vars()
 
-    if not args.model_param:
+    if not args.model_param and not args.sdconfig:
         exitcounter = 999
-        print("\nNo ggml model file was selected. Exiting.")
+        print("\nNo text or image model file was selected. Exiting.")
         time.sleep(3)
         sys.exit(2)
@@ -2394,7 +2438,7 @@ def main(launch_args,start_server=True):
     if not args.model_param:
         args.model_param = args.model
 
-    if not args.model_param:
+    if not args.model_param and not args.sdconfig:
         #give them a chance to pick a file
         print("For command line arguments, please refer to --help")
         print("***")
@@ -2419,7 +2463,7 @@ def main(launch_args,start_server=True):
            print(f"Warning: Saved story file {args.preloadstory} invalid or not found. No story will be preloaded into server.")
 
     # sanitize and replace the default vanity name. remember me....
-    if args.model_param!="":
+    if args.model_param and args.model_param!="":
         newmdldisplayname = os.path.basename(args.model_param)
         newmdldisplayname = os.path.splitext(newmdldisplayname)[0]
         friendlymodelname = "koboldcpp/" + sanitize_string(newmdldisplayname)
@@ -2471,45 +2515,67 @@ def main(launch_args,start_server=True):
     init_library() # Note: if blas does not exist and is enabled, program will crash.
     print("==========")
     time.sleep(1)
-    if not os.path.exists(args.model_param):
-        exitcounter = 999
-        print(f"Cannot find model file: {args.model_param}")
-        time.sleep(3)
-        sys.exit(2)
 
-    if args.lora and args.lora[0]!="":
-        if not os.path.exists(args.lora[0]):
+    #handle loading text model
+    if args.model_param:
+        if not os.path.exists(args.model_param):
             exitcounter = 999
-            print(f"Cannot find lora file: {args.lora[0]}")
+            print(f"Cannot find text model file: {args.model_param}")
             time.sleep(3)
             sys.exit(2)
-        else:
-            args.lora[0] = os.path.abspath(args.lora[0])
-            if len(args.lora) > 1:
-                if not os.path.exists(args.lora[1]):
-                    exitcounter = 999
-                    print(f"Cannot find lora base: {args.lora[1]}")
-                    time.sleep(3)
-                    sys.exit(2)
-                else:
-                    args.lora[1] = os.path.abspath(args.lora[1])
 
-    if not args.blasthreads or args.blasthreads <= 0:
-        args.blasthreads = args.threads
+        if args.lora and args.lora[0]!="":
+            if not os.path.exists(args.lora[0]):
+                exitcounter = 999
+                print(f"Cannot find lora file: {args.lora[0]}")
+                time.sleep(3)
+                sys.exit(2)
+            else:
+                args.lora[0] = os.path.abspath(args.lora[0])
+                if len(args.lora) > 1:
+                    if not os.path.exists(args.lora[1]):
+                        exitcounter = 999
+                        print(f"Cannot find lora base: {args.lora[1]}")
+                        time.sleep(3)
+                        sys.exit(2)
+                    else:
+                        args.lora[1] = os.path.abspath(args.lora[1])
 
-    modelname = os.path.abspath(args.model_param)
-    print(args)
-    # Flush stdout for win32 issue with regards to piping in terminals,
-    # especially before handing over to C++ context.
-    print(f"==========\nLoading model: {modelname} \n[Threads: {args.threads}, BlasThreads: {args.blasthreads}, SmartContext: {args.smartcontext}, ContextShift: {not (args.noshift)}]", flush=True)
-    loadok = load_model(modelname)
-    print("Load Model OK: " + str(loadok))
+        if not args.blasthreads or args.blasthreads <= 0:
+            args.blasthreads = args.threads
 
-    if not loadok:
-        exitcounter = 999
-        print("Could not load model: " + modelname)
-        time.sleep(3)
-        sys.exit(3)
+        modelname = os.path.abspath(args.model_param)
+        print(args)
+        # Flush stdout for win32 issue with regards to piping in terminals,
+        # especially before handing over to C++ context.
+ print(f"==========\nLoading model: {modelname} \n[Threads: {args.threads}, BlasThreads: {args.blasthreads}, SmartContext: {args.smartcontext}, ContextShift: {not (args.noshift)}]", flush=True) + loadok = load_model(modelname) + print("Load Text Model OK: " + str(loadok)) + + if not loadok: + exitcounter = 999 + print("Could not load text model: " + modelname) + time.sleep(3) + sys.exit(3) + + #handle loading image model + if args.sdconfig: + imgmodel = args.sdconfig[0] + if not imgmodel or not os.path.exists(imgmodel): + exitcounter = 999 + print(f"Cannot find image model file: {imgmodel}") + time.sleep(3) + sys.exit(2) + imgmodel = os.path.abspath(imgmodel) + loadok = load_model_sd(imgmodel) + print("Load Image Model OK: " + str(loadok)) + if not loadok: + exitcounter = 999 + print("Could not load image model: " + imgmodel) + time.sleep(3) + sys.exit(3) + + #load embedded lite try: basepath = os.path.abspath(os.path.dirname(__file__)) with open(os.path.join(basepath, "klite.embd"), mode='rb') as f: @@ -2572,7 +2638,7 @@ def main(launch_args,start_server=True): timer_thread = threading.Timer(1, onready_subprocess) #1 second delay timer_thread.start() - if args.benchmark is not None: + if args.model_param and args.benchmark is not None: from datetime import datetime, timezone global libname start_server = False @@ -2678,18 +2744,25 @@ if __name__ == '__main__': portgroup.add_argument("port_param", help="Port to listen on (positional)", default=defaultport, nargs="?", type=int, action='store') parser.add_argument("--host", help="Host IP to listen on. If empty, all routable interfaces are accepted.", default="") parser.add_argument("--launch", help="Launches a web browser when load is completed.", action='store_true') - parser.add_argument("--lora", help="LLAMA models only, applies a lora file on top of model. Experimental.", metavar=('[lora_filename]', '[lora_base]'), nargs='+') parser.add_argument("--config", help="Load settings from a .kcpps file. Other arguments will be ignored", type=str, nargs=1) physical_core_limit = 1 if os.cpu_count()!=None and os.cpu_count()>1: physical_core_limit = int(os.cpu_count()/2) default_threads = (physical_core_limit if physical_core_limit<=3 else max(3,physical_core_limit-1)) parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads) - parser.add_argument("--blasthreads", help="Use a different number of threads during BLAS if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0) - parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true') + compatgroup = parser.add_mutually_exclusive_group() + compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'mmq', 'rowsplit']) + compatgroup.add_argument("--usevulkan", help="Use Vulkan for GPU Acceleration. Can optionally specify GPU Device ID (e.g. 
--usevulkan 0).", metavar=('[Device ID]'), nargs='*', type=int, default=None) + compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2) + compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true') + parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), nargs='?', const=1, type=int, default=0) + parser.add_argument("--tensor_split", help="For CUDA and Vulkan only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+') parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048). Supported values are [256,512,1024,2048,3072,4096,6144,8192,12288,16384,24576,32768,49152,65536]. IF YOU USE ANYTHING ELSE YOU ARE ON YOUR OWN.",metavar=('[256,512,1024,2048,3072,4096,6144,8192,12288,16384,24576,32768,49152,65536]'), type=check_range(int,256,262144), default=2048) - parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024,2048], default=512) parser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+') + #more advanced params + parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024,2048], default=512) + parser.add_argument("--blasthreads", help="Use a different number of threads during BLAS if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0) + parser.add_argument("--lora", help="LLAMA models only, applies a lora file on top of model. Experimental.", metavar=('[lora_filename]', '[lora_base]'), nargs='+') parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true') parser.add_argument("--noshift", help="If set, do not attempt to Trim and Shift the GGUF context.", action='store_true') parser.add_argument("--bantokens", help="You can manually specify a list of token SUBSTRINGS that the AI cannot use. This bans ALL instances of that substring.", metavar=('[token_substrings]'), nargs='+') @@ -2700,27 +2773,16 @@ if __name__ == '__main__': parser.add_argument("--debugmode", help="Shows additional debug info in the terminal.", nargs='?', const=1, type=int, default=0) parser.add_argument("--skiplauncher", help="Doesn't display or use the GUI launcher.", action='store_true') parser.add_argument("--hordeconfig", help="Sets the display model name to something else, for easy use on AI Horde. 
Optional additional parameters set the horde max genlength, max ctxlen, API key and worker name.",metavar=('[hordemodelname]', '[hordegenlength] [hordemaxctx] [hordeapikey] [hordeworkername]'), nargs='+') - compatgroup = parser.add_mutually_exclusive_group() - compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true') - compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2) - compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'mmq', 'rowsplit']) - compatgroup.add_argument("--usevulkan", help="Use Vulkan for GPU Acceleration. Can optionally specify GPU Device ID (e.g. --usevulkan 0).", metavar=('[Device ID]'), nargs='*', type=int, default=None) - parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), nargs='?', const=1, type=int, default=0) - parser.add_argument("--tensor_split", help="For CUDA and Vulkan only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+') parser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", metavar=('[shell command]'), type=str, default="",nargs=1) parser.add_argument("--benchmark", help="Do not start server, instead run benchmarks. If filename is provided, appends results to provided file.", metavar=('[filename]'), nargs='?', const="stdout", type=str, default=None) parser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", metavar=('limit'), nargs='?', const=1, type=int, default=0) parser.add_argument("--remotetunnel", help="Uses Cloudflare to create a remote tunnel, allowing you to access koboldcpp remotely over the internet even behind a firewall.", action='store_true') + parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true') parser.add_argument("--foreground", help="Windows only. Sends the terminal to the foreground every time a new prompt is generated. This helps avoid some idle slowdown issues.", action='store_true') parser.add_argument("--preloadstory", help="Configures a prepared story json save file to be hosted on the server, which frontends (such as Kobold Lite) can access over the API.", default="") parser.add_argument("--quiet", help="Enable quiet mode, which hides generation inputs and outputs in the terminal. Quiet mode is automatically enabled when running --hordeconfig.", action='store_true') parser.add_argument("--ssl", help="Allows all content to be served over SSL instead. A valid UNENCRYPTED SSL cert and key .pem files must be provided", metavar=('[cert_pem]', '[key_pem]'), nargs='+') parser.add_argument("--nocertify", help="Allows insecure SSL connections. 
Use this if you have cert errors and need to bypass certificate restrictions.", action='store_true') - - # #deprecated hidden args. they do nothing. do not use - # parser.add_argument("--psutil_set_threads", action='store_true', help=argparse.SUPPRESS) - # parser.add_argument("--stream", action='store_true', help=argparse.SUPPRESS) - # parser.add_argument("--unbantokens", action='store_true', help=argparse.SUPPRESS) - # parser.add_argument("--usemirostat", action='store_true', help=argparse.SUPPRESS) + parser.add_argument("--sdconfig", help="Specify a stable diffusion safetensors model to enable image generation. If quick is specified, force optimal generation settings for speed.",metavar=('[sd_filename]', '[normal|quick]'), nargs='+') main(parser.parse_args(),start_server=True) diff --git a/otherarch/sdcpp/sd_adapter.cpp b/otherarch/sdcpp/sd_adapter.cpp index 619702e8a..629db9f62 100644 --- a/otherarch/sdcpp/sd_adapter.cpp +++ b/otherarch/sdcpp/sd_adapter.cpp @@ -111,13 +111,35 @@ struct SDParams { static SDParams * sd_params = nullptr; static sd_ctx_t * sd_ctx = nullptr; +static void sd_logger_callback(enum sd_log_level_t level, const char* log, void* data) { + SDParams* params = (SDParams*)data; + if (!params->verbose && level <= SD_LOG_DEBUG) { + return; + } + if (level <= SD_LOG_INFO) { + fputs(log, stdout); + fflush(stdout); + } else { + fputs(log, stderr); + fflush(stderr); + } +} + bool sdtype_load_model(const load_sd_model_inputs inputs) { + + printf("\nSelected Image Model: %s\n",inputs.model_filename); + sd_params = new SDParams(); sd_params->model_path = inputs.model_filename; sd_params->wtype = SD_TYPE_F16; sd_params->n_threads = -1; //use physical cores sd_params->input_path = ""; //unused + if(inputs.debugmode==1) + { + sd_set_log_callback(sd_logger_callback, (void*)sd_params); + } + bool vae_decode_only = false; sd_ctx = new_sd_ctx(sd_params->model_path.c_str(),