From 66134bb36ee72bfdf31f4433ca317a469c9bc85f Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Thu, 29 Feb 2024 17:08:22 +0800
Subject: [PATCH] ui for loading SD models done

---
 expose.h                       |   1 +
 koboldcpp.py                   | 176 ++++++++++++++++++++++-----------
 otherarch/sdcpp/sd_adapter.cpp |  22 +++++
 3 files changed, 142 insertions(+), 57 deletions(-)

diff --git a/expose.h b/expose.h
index 872003ff5..6ee13d00e 100644
--- a/expose.h
+++ b/expose.h
@@ -102,6 +102,7 @@ struct token_count_outputs
 struct load_sd_model_inputs
 {
     const char * model_filename;
+    const int debugmode = 0;
 };
 struct sd_generation_inputs
 {
diff --git a/koboldcpp.py b/koboldcpp.py
index e9d7d317d..86c0dc2d1 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -89,6 +89,22 @@ class generation_outputs(ctypes.Structure):
     _fields_ = [("status", ctypes.c_int),
                 ("text", ctypes.c_char_p)]
 
+class sd_load_model_inputs(ctypes.Structure):
+    _fields_ = [("model_filename", ctypes.c_char_p),
+                ("debugmode", ctypes.c_int)]
+
+class sd_generation_inputs(ctypes.Structure):
+    _fields_ = [("prompt", ctypes.c_char_p),
+                ("negative_prompt", ctypes.c_char_p),
+                ("cfg_scale", ctypes.c_float),
+                ("sample_steps", ctypes.c_int),
+                ("seed", ctypes.c_int),
+                ("sample_method", ctypes.c_char_p)]
+
+class sd_generation_outputs(ctypes.Structure):
+    _fields_ = [("status", ctypes.c_int),
+                ("data", ctypes.c_char_p)]
+
 class token_count_outputs(ctypes.Structure):
     _fields_ = [("count", ctypes.c_int),
                 ("ids", ctypes.POINTER(ctypes.c_int))]
@@ -257,6 +273,10 @@ def init_library():
     handle.abort_generate.restype = ctypes.c_bool
     handle.token_count.restype = token_count_outputs
     handle.get_pending_output.restype = ctypes.c_char_p
+    handle.load_model_sd.argtypes = [sd_load_model_inputs]
+    handle.load_model_sd.restype = ctypes.c_bool
+    handle.generate_sd.argtypes = [sd_generation_inputs]
+    handle.generate_sd.restype = sd_generation_outputs
 
 def load_model(model_filename):
     global args
@@ -448,6 +468,15 @@ def generate(prompt, memory="", max_length=32, max_context_length=512, temperatu
                 outstr = outstr[:sindex]
     return outstr
 
+
+def load_model_sd(model_filename):
+    global args
+    inputs = sd_load_model_inputs()
+    inputs.debugmode = args.debugmode
+    inputs.model_filename = model_filename.encode("UTF-8")
+    ret = handle.load_model_sd(inputs)
+    return ret
+
 def utfprint(str):
     try:
         print(str)
@@ -1146,7 +1175,7 @@ def show_new_gui():
     root.destroy()
     if args.model_param and args.model_param!="" and args.model_param.lower().endswith('.kcpps'):
         loadconfigfile(args.model_param)
-    if not args.model_param:
+    if not args.model_param and not args.sdconfig:
        global exitcounter
        exitcounter = 999
        print("\nNo ggml model or kcpps file was selected. Exiting.")
@@ -1192,7 +1221,7 @@ def show_new_gui():
     tabs = ctk.CTkFrame(root, corner_radius = 0, width=windowwidth, height=windowheight-50)
     tabs.grid(row=0, stick="nsew")
 
-    tabnames= ["Quick Launch", "Hardware", "Tokens", "Model", "Network"]
+    tabnames= ["Quick Launch", "Hardware", "Tokens", "Model", "Network","Image Gen"]
     navbuttons = {}
     navbuttonframe = ctk.CTkFrame(tabs, width=100, height=int(tabs.cget("height")))
     navbuttonframe.grid(row=0, column=0, padx=2,pady=2)
@@ -1283,6 +1312,9 @@ def show_new_gui():
     ssl_cert_var = ctk.StringVar()
     ssl_key_var = ctk.StringVar()
 
+    sd_model_var = ctk.StringVar()
+    sd_quick_var = ctk.IntVar(value=0)
+
     def tabbuttonaction(name):
         for t in tabcontent:
             if name == t:
@@ -1755,9 +1787,14 @@ def show_new_gui():
     makecheckbox(network_tab, "Configure for Horde", usehorde_var, 19, command=togglehorde,tooltiptxt="Enable the embedded AI Horde worker.")
     togglehorde(1,1,1)
 
+    # Image Gen Tab
+    images_tab = tabcontent["Image Gen"]
+    makefileentry(images_tab, "Stable Diffusion Model (f16):", "Select Stable Diffusion Model File", sd_model_var, 1, filetypes=[("*.safetensors","*.safetensors")], tooltiptxt="Select a .safetensors Stable Diffusion model file on disk to be loaded.")
+    makecheckbox(images_tab, "Quick Mode (Low Quality)", sd_quick_var, 4,tooltiptxt="Force optimal generation settings for speed.")
+
     # launch
     def guilaunch():
-        if model_var.get() == "":
+        if model_var.get() == "" and sd_model_var.get() == "":
             tmp = askopenfilename(title="Select ggml model .bin or .gguf file")
             model_var.set(tmp)
         nonlocal nextstate
@@ -1841,6 +1878,8 @@ def show_new_gui():
         else:
             args.hordeconfig = None if usehorde_var.get() == 0 else [horde_name_var.get(), horde_gen_var.get(), horde_context_var.get(), horde_apikey_var.get(), horde_workername_var.get()]
 
+        args.sdconfig = None if sd_model_var.get() == "" else [sd_model_var.get(), ("quick" if sd_quick_var.get()==1 else "normal")]
+
     def import_vars(dict):
         if "threads" in dict:
             threads_var.set(dict["threads"])
@@ -1968,6 +2007,11 @@ def show_new_gui():
             horde_workername_var.set(dict["hordeconfig"][4])
             usehorde_var.set("1")
 
+        if "sdconfig" in dict and dict["sdconfig"] and len(dict["sdconfig"]) > 0:
+            sd_model_var.set(dict["sdconfig"][0])
+            if len(dict["sdconfig"]) > 1:
+                sd_quick_var.set(1 if dict["sdconfig"][1]=="quick" else 0)
+
     def save_config():
         file_type = [("KoboldCpp Settings", "*.kcpps")]
         filename = asksaveasfile(filetypes=file_type, defaultextension=file_type)
@@ -2026,9 +2070,9 @@ def show_new_gui():
     # processing vars
     export_vars()
 
-    if not args.model_param:
+    if not args.model_param and not args.sdconfig:
         exitcounter = 999
-        print("\nNo ggml model file was selected. Exiting.")
+        print("\nNo text or image model file was selected. Exiting.")
         time.sleep(3)
         sys.exit(2)
@@ -2394,7 +2438,7 @@ def main(launch_args,start_server=True):
     if not args.model_param:
         args.model_param = args.model
 
-    if not args.model_param:
+    if not args.model_param and not args.sdconfig:
         #give them a chance to pick a file
         print("For command line arguments, please refer to --help")
         print("***")
@@ -2419,7 +2463,7 @@ def main(launch_args,start_server=True):
            print(f"Warning: Saved story file {args.preloadstory} invalid or not found. No story will be preloaded into server.")
 
     # sanitize and replace the default vanity name. remember me....
-    if args.model_param!="":
+    if args.model_param and args.model_param!="":
         newmdldisplayname = os.path.basename(args.model_param)
         newmdldisplayname = os.path.splitext(newmdldisplayname)[0]
         friendlymodelname = "koboldcpp/" + sanitize_string(newmdldisplayname)
@@ -2471,45 +2515,67 @@ def main(launch_args,start_server=True):
     init_library() # Note: if blas does not exist and is enabled, program will crash.
     print("==========")
     time.sleep(1)
-    if not os.path.exists(args.model_param):
-        exitcounter = 999
-        print(f"Cannot find model file: {args.model_param}")
-        time.sleep(3)
-        sys.exit(2)
 
-    if args.lora and args.lora[0]!="":
-        if not os.path.exists(args.lora[0]):
+    #handle loading text model
+    if args.model_param:
+        if not os.path.exists(args.model_param):
             exitcounter = 999
-            print(f"Cannot find lora file: {args.lora[0]}")
+            print(f"Cannot find text model file: {args.model_param}")
             time.sleep(3)
             sys.exit(2)
-        else:
-            args.lora[0] = os.path.abspath(args.lora[0])
-            if len(args.lora) > 1:
-                if not os.path.exists(args.lora[1]):
-                    exitcounter = 999
-                    print(f"Cannot find lora base: {args.lora[1]}")
-                    time.sleep(3)
-                    sys.exit(2)
-                else:
-                    args.lora[1] = os.path.abspath(args.lora[1])
 
-    if not args.blasthreads or args.blasthreads <= 0:
-        args.blasthreads = args.threads
+        if args.lora and args.lora[0]!="":
+            if not os.path.exists(args.lora[0]):
+                exitcounter = 999
+                print(f"Cannot find lora file: {args.lora[0]}")
+                time.sleep(3)
+                sys.exit(2)
+            else:
+                args.lora[0] = os.path.abspath(args.lora[0])
+                if len(args.lora) > 1:
+                    if not os.path.exists(args.lora[1]):
+                        exitcounter = 999
+                        print(f"Cannot find lora base: {args.lora[1]}")
+                        time.sleep(3)
+                        sys.exit(2)
+                    else:
+                        args.lora[1] = os.path.abspath(args.lora[1])
 
-    modelname = os.path.abspath(args.model_param)
-    print(args)
-    # Flush stdout for win32 issue with regards to piping in terminals,
-    # especially before handing over to C++ context.
-    print(f"==========\nLoading model: {modelname} \n[Threads: {args.threads}, BlasThreads: {args.blasthreads}, SmartContext: {args.smartcontext}, ContextShift: {not (args.noshift)}]", flush=True)
-    loadok = load_model(modelname)
-    print("Load Model OK: " + str(loadok))
+        if not args.blasthreads or args.blasthreads <= 0:
+            args.blasthreads = args.threads
 
-    if not loadok:
-        exitcounter = 999
-        print("Could not load model: " + modelname)
-        time.sleep(3)
-        sys.exit(3)
+        modelname = os.path.abspath(args.model_param)
+        print(args)
+        # Flush stdout for win32 issue with regards to piping in terminals,
+        # especially before handing over to C++ context.
+ print(f"==========\nLoading model: {modelname} \n[Threads: {args.threads}, BlasThreads: {args.blasthreads}, SmartContext: {args.smartcontext}, ContextShift: {not (args.noshift)}]", flush=True) + loadok = load_model(modelname) + print("Load Text Model OK: " + str(loadok)) + + if not loadok: + exitcounter = 999 + print("Could not load text model: " + modelname) + time.sleep(3) + sys.exit(3) + + #handle loading image model + if args.sdconfig: + imgmodel = args.sdconfig[0] + if not imgmodel or not os.path.exists(imgmodel): + exitcounter = 999 + print(f"Cannot find image model file: {imgmodel}") + time.sleep(3) + sys.exit(2) + imgmodel = os.path.abspath(imgmodel) + loadok = load_model_sd(imgmodel) + print("Load Image Model OK: " + str(loadok)) + if not loadok: + exitcounter = 999 + print("Could not load image model: " + imgmodel) + time.sleep(3) + sys.exit(3) + + #load embedded lite try: basepath = os.path.abspath(os.path.dirname(__file__)) with open(os.path.join(basepath, "klite.embd"), mode='rb') as f: @@ -2572,7 +2638,7 @@ def main(launch_args,start_server=True): timer_thread = threading.Timer(1, onready_subprocess) #1 second delay timer_thread.start() - if args.benchmark is not None: + if args.model_param and args.benchmark is not None: from datetime import datetime, timezone global libname start_server = False @@ -2678,18 +2744,25 @@ if __name__ == '__main__': portgroup.add_argument("port_param", help="Port to listen on (positional)", default=defaultport, nargs="?", type=int, action='store') parser.add_argument("--host", help="Host IP to listen on. If empty, all routable interfaces are accepted.", default="") parser.add_argument("--launch", help="Launches a web browser when load is completed.", action='store_true') - parser.add_argument("--lora", help="LLAMA models only, applies a lora file on top of model. Experimental.", metavar=('[lora_filename]', '[lora_base]'), nargs='+') parser.add_argument("--config", help="Load settings from a .kcpps file. Other arguments will be ignored", type=str, nargs=1) physical_core_limit = 1 if os.cpu_count()!=None and os.cpu_count()>1: physical_core_limit = int(os.cpu_count()/2) default_threads = (physical_core_limit if physical_core_limit<=3 else max(3,physical_core_limit-1)) parser.add_argument("--threads", help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=default_threads) - parser.add_argument("--blasthreads", help="Use a different number of threads during BLAS if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0) - parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true') + compatgroup = parser.add_mutually_exclusive_group() + compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'mmq', 'rowsplit']) + compatgroup.add_argument("--usevulkan", help="Use Vulkan for GPU Acceleration. Can optionally specify GPU Device ID (e.g. 
--usevulkan 0).", metavar=('[Device ID]'), nargs='*', type=int, default=None) + compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2) + compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true') + parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), nargs='?', const=1, type=int, default=0) + parser.add_argument("--tensor_split", help="For CUDA and Vulkan only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+') parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048). Supported values are [256,512,1024,2048,3072,4096,6144,8192,12288,16384,24576,32768,49152,65536]. IF YOU USE ANYTHING ELSE YOU ARE ON YOUR OWN.",metavar=('[256,512,1024,2048,3072,4096,6144,8192,12288,16384,24576,32768,49152,65536]'), type=check_range(int,256,262144), default=2048) - parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024,2048], default=512) parser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+') + #more advanced params + parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024,2048], default=512) + parser.add_argument("--blasthreads", help="Use a different number of threads during BLAS if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0) + parser.add_argument("--lora", help="LLAMA models only, applies a lora file on top of model. Experimental.", metavar=('[lora_filename]', '[lora_base]'), nargs='+') parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true') parser.add_argument("--noshift", help="If set, do not attempt to Trim and Shift the GGUF context.", action='store_true') parser.add_argument("--bantokens", help="You can manually specify a list of token SUBSTRINGS that the AI cannot use. This bans ALL instances of that substring.", metavar=('[token_substrings]'), nargs='+') @@ -2700,27 +2773,16 @@ if __name__ == '__main__': parser.add_argument("--debugmode", help="Shows additional debug info in the terminal.", nargs='?', const=1, type=int, default=0) parser.add_argument("--skiplauncher", help="Doesn't display or use the GUI launcher.", action='store_true') parser.add_argument("--hordeconfig", help="Sets the display model name to something else, for easy use on AI Horde. 
Optional additional parameters set the horde max genlength, max ctxlen, API key and worker name.",metavar=('[hordemodelname]', '[hordegenlength] [hordemaxctx] [hordeapikey] [hordeworkername]'), nargs='+') - compatgroup = parser.add_mutually_exclusive_group() - compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true') - compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2) - compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'mmq', 'rowsplit']) - compatgroup.add_argument("--usevulkan", help="Use Vulkan for GPU Acceleration. Can optionally specify GPU Device ID (e.g. --usevulkan 0).", metavar=('[Device ID]'), nargs='*', type=int, default=None) - parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), nargs='?', const=1, type=int, default=0) - parser.add_argument("--tensor_split", help="For CUDA and Vulkan only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+') parser.add_argument("--onready", help="An optional shell command to execute after the model has been loaded.", metavar=('[shell command]'), type=str, default="",nargs=1) parser.add_argument("--benchmark", help="Do not start server, instead run benchmarks. If filename is provided, appends results to provided file.", metavar=('[filename]'), nargs='?', const="stdout", type=str, default=None) parser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", metavar=('limit'), nargs='?', const=1, type=int, default=0) parser.add_argument("--remotetunnel", help="Uses Cloudflare to create a remote tunnel, allowing you to access koboldcpp remotely over the internet even behind a firewall.", action='store_true') + parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true') parser.add_argument("--foreground", help="Windows only. Sends the terminal to the foreground every time a new prompt is generated. This helps avoid some idle slowdown issues.", action='store_true') parser.add_argument("--preloadstory", help="Configures a prepared story json save file to be hosted on the server, which frontends (such as Kobold Lite) can access over the API.", default="") parser.add_argument("--quiet", help="Enable quiet mode, which hides generation inputs and outputs in the terminal. Quiet mode is automatically enabled when running --hordeconfig.", action='store_true') parser.add_argument("--ssl", help="Allows all content to be served over SSL instead. A valid UNENCRYPTED SSL cert and key .pem files must be provided", metavar=('[cert_pem]', '[key_pem]'), nargs='+') parser.add_argument("--nocertify", help="Allows insecure SSL connections. 
Use this if you have cert errors and need to bypass certificate restrictions.", action='store_true') - - # #deprecated hidden args. they do nothing. do not use - # parser.add_argument("--psutil_set_threads", action='store_true', help=argparse.SUPPRESS) - # parser.add_argument("--stream", action='store_true', help=argparse.SUPPRESS) - # parser.add_argument("--unbantokens", action='store_true', help=argparse.SUPPRESS) - # parser.add_argument("--usemirostat", action='store_true', help=argparse.SUPPRESS) + parser.add_argument("--sdconfig", help="Specify a stable diffusion safetensors model to enable image generation. If quick is specified, force optimal generation settings for speed.",metavar=('[sd_filename]', '[normal|quick]'), nargs='+') main(parser.parse_args(),start_server=True) diff --git a/otherarch/sdcpp/sd_adapter.cpp b/otherarch/sdcpp/sd_adapter.cpp index 619702e8a..629db9f62 100644 --- a/otherarch/sdcpp/sd_adapter.cpp +++ b/otherarch/sdcpp/sd_adapter.cpp @@ -111,13 +111,35 @@ struct SDParams { static SDParams * sd_params = nullptr; static sd_ctx_t * sd_ctx = nullptr; +static void sd_logger_callback(enum sd_log_level_t level, const char* log, void* data) { + SDParams* params = (SDParams*)data; + if (!params->verbose && level <= SD_LOG_DEBUG) { + return; + } + if (level <= SD_LOG_INFO) { + fputs(log, stdout); + fflush(stdout); + } else { + fputs(log, stderr); + fflush(stderr); + } +} + bool sdtype_load_model(const load_sd_model_inputs inputs) { + + printf("\nSelected Image Model: %s\n",inputs.model_filename); + sd_params = new SDParams(); sd_params->model_path = inputs.model_filename; sd_params->wtype = SD_TYPE_F16; sd_params->n_threads = -1; //use physical cores sd_params->input_path = ""; //unused + if(inputs.debugmode==1) + { + sd_set_log_callback(sd_logger_callback, (void*)sd_params); + } + bool vae_decode_only = false; sd_ctx = new_sd_ctx(sd_params->model_path.c_str(),