diff --git a/koboldcpp.py b/koboldcpp.py index a69011cdb..9870648ed 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -1136,10 +1136,11 @@ def autoset_gpu_layers(ctxsize, sdquanted, bbs, qkv_level): #shitty algo to dete showmultigpuwarning = False print("Multi-Part GGUF detected. Layer estimates may not be very accurate - recommend setting layers manually.") fsize *= total_parts + sdquantsavings = sdquanted if modelfile_extracted_meta[3] > 1024*1024*1024*5: #sdxl tax - mem -= 1024*1024*1024*(6 if sdquanted else 9) + mem -= 1024*1024*1024*(9 - sdquantsavings * 1.5) # 9, 7.5, 6 elif modelfile_extracted_meta[3] > 1024*1024*512: #normal sd tax - mem -= 1024*1024*1024*(3.25 if sdquanted else 4.25) + mem -= 1024*1024*1024*(4.25 - sdquantsavings * 0.5) # 4.25, 3.75, 3.25 if modelfile_extracted_meta[4] > 1024*1024*10: #whisper tax mem -= max(350*1024*1024,modelfile_extracted_meta[4]*1.5) if modelfile_extracted_meta[5] > 1024*1024*10: #mmproj tax @@ -1656,22 +1657,25 @@ def sd_convdirect_option(value): return 'full' raise argparse.ArgumentTypeError(f"Invalid sdconvdirect option \"{value}\". Must be one of {sd_convdirect_choices}.") +sd_quant_choices = ['0 (off)', '1 (q8_0)', '2 (q4_0)'] + +def sd_quant_option(value): + return int((value or '0')[0]) + def sd_load_model(model_filename,vae_filename,lora_filename,t5xxl_filename,clipl_filename,clipg_filename,photomaker_filename): global args inputs = sd_load_model_inputs() inputs.model_filename = model_filename.encode("UTF-8") thds = args.threads - quant = 0 if args.sdthreads and args.sdthreads > 0: sdt = int(args.sdthreads) if sdt > 0: thds = sdt - if args.sdquant: - quant = 1 inputs.threads = thds - inputs.quant = quant + sd_quant_types = {0: -1, 1: 8, 2: 2} # enum sd_type_t + inputs.quant = sd_quant_types[args.sdquant] inputs.flash_attention = args.sdflashattention sdconvdirect = sd_convdirect_option(args.sdconvdirect) inputs.diffusion_conv_direct = sdconvdirect == 'full' @@ -4596,7 +4600,7 @@ def show_gui(): sd_clamped_var = ctk.StringVar(value="0") sd_clamped_soft_var = ctk.StringVar(value="0") sd_threads_var = ctk.StringVar(value=str(default_threads)) - sd_quant_var = ctk.IntVar(value=0) + sd_quant_var = ctk.StringVar(value=sd_quant_choices[0]) whisper_model_var = ctk.StringVar() tts_model_var = ctk.StringVar() @@ -4939,7 +4943,7 @@ def show_gui(): pass def changed_gpulayers_estimate(*args): - predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),(sd_quant_var.get()==1),int(blasbatchsize_values[int(blas_size_var.get())]),(quantkv_var.get() if flashattention_var.get()==1 else 0)) + predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),sd_quant_option(sd_quant_var.get() or 0),int(blasbatchsize_values[int(blas_size_var.get())]),(quantkv_var.get() if flashattention_var.get()==1 else 0)) max_gpu_layers = (f"/{modelfile_extracted_meta[1][0]+3}" if (modelfile_extracted_meta and modelfile_extracted_meta[1] and modelfile_extracted_meta[1][0]!=0) else "") index = runopts_var.get() gpu_be = (index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CLBlast (Older CPU)" or index == "Use CUDA" or index == "Use hipBLAS (ROCm)") @@ -5341,7 +5345,7 @@ def show_gui(): makelabelentry(images_tab, "(Soft):", sd_clamped_soft_var, 4, 50, padx=290,singleline=True,tooltip="Square image size restriction, to protect the server against memory crashes.\nAllows width-height tradeoffs, eg. 640 allows 640x640 and 512x768\nLeave at 0 for the default value: 832 for SD1.5/SD2, 1024 otherwise.",labelpadx=250) makelabelentry(images_tab, "Image Threads:" , sd_threads_var, 8, 50,padx=290,singleline=True,tooltip="How many threads to use during image generation.\nIf left blank, uses same value as threads.") sd_model_var.trace_add("write", gui_changed_modelfile) - makecheckbox(images_tab, "Compress Weights (Saves Memory)", sd_quant_var, 10,tooltiptxt="Quantizes the SD model weights to save memory. May degrade quality.") + makelabelcombobox(images_tab, "Compress Weights (Saves Memory)", sd_quant_var, 10, width=50, labelpadx=65, tooltiptxt="Quantizes the SD model weights to save memory.\nHigher levels save more memory, and cause more quality degradation.", values=sd_quant_choices) sd_quant_var.trace_add("write", changed_gpulayers_estimate) makefileentry(images_tab, "Image LoRA (safetensors/gguf):", "Select SD lora file",sd_lora_var, 20, width=280, singlecol=True, filetypes=[("*.safetensors *.gguf", "*.safetensors *.gguf")],tooltiptxt="Select a .safetensors or .gguf SD LoRA model file to be loaded. Should be unquantized!") @@ -5626,8 +5630,7 @@ def show_gui(): args.sdclipg = sd_clipg_var.get() if sd_photomaker_var.get() != "": args.sdphotomaker = sd_photomaker_var.get() - if sd_quant_var.get()==1: - args.sdquant = True + args.sdquant = sd_quant_option(sd_quant_var.get()) if sd_lora_var.get() != "": args.sdlora = sd_lora_var.get() args.sdloramult = float(sd_loramult_var.get()) @@ -5837,7 +5840,7 @@ def show_gui(): sd_clamped_var.set(int(dict["sdclamped"]) if ("sdclamped" in dict and dict["sdclamped"]) else 0) sd_clamped_soft_var.set(int(dict["sdclampedsoft"]) if ("sdclampedsoft" in dict and dict["sdclampedsoft"]) else 0) sd_threads_var.set(str(dict["sdthreads"]) if ("sdthreads" in dict and dict["sdthreads"]) else str(default_threads)) - sd_quant_var.set(1 if ("sdquant" in dict and dict["sdquant"]) else 0) + sd_quant_var.set(sd_quant_choices[(dict["sdquant"] if "sdquant" in dict else 0)]) sd_flash_attention_var.set(1 if ("sdflashattention" in dict and dict["sdflashattention"]) else 0) sd_convdirect_var.set(sd_convdirect_option(dict.get("sdconvdirect"))) sd_vae_var.set(dict["sdvae"] if ("sdvae" in dict and dict["sdvae"]) else "") @@ -6190,7 +6193,7 @@ def convert_invalid_args(args): if dict["sdconfig"] and len(dict["sdconfig"]) > 2: dict["sdthreads"] = int(dict["sdconfig"][2]) if dict["sdconfig"] and len(dict["sdconfig"]) > 3: - dict["sdquant"] = (True if dict["sdconfig"][3]=="quant" else False) + dict["sdquant"] = (2 if dict["sdconfig"][3]=="quant" else 0) if "hordeconfig" in dict and dict["hordeconfig"] and dict["hordeconfig"][0]!="": dict["hordemodelname"] = dict["hordeconfig"][0] if len(dict["hordeconfig"]) > 1: @@ -6216,6 +6219,8 @@ def convert_invalid_args(args): dict["model_param"] = model_value[0] # Take the first file in the list if "sdnotile" in dict and "sdtiledvae" not in dict: dict["sdtiledvae"] = (0 if (dict["sdnotile"]) else default_vae_tile_threshold) # convert legacy option + if 'sdquant' in dict and type(dict['sdquant']) is bool: + dict['sdquant'] = 2 if dict['sdquant'] else 0 return args def setuptunnel(global_memory, has_sd): @@ -7648,7 +7653,7 @@ if __name__ == '__main__': sdparsergroupvae.add_argument("--sdvae", metavar=('[filename]'), help="Specify an image generation safetensors VAE which replaces the one in the model.", default="") sdparsergroupvae.add_argument("--sdvaeauto", help="Uses a built-in VAE via TAE SD, which is very fast, and fixed bad VAEs.", action='store_true') sdparsergrouplora = sdparsergroup.add_mutually_exclusive_group() - sdparsergrouplora.add_argument("--sdquant", help="If specified, loads the model quantized to save memory.", action='store_true') + sdparsergrouplora.add_argument("--sdquant", metavar=('[quantization level 0/1/2]'), help="If specified, loads the model quantized to save memory. 0=off, 1=q8, 2=q4", type=int, choices=[0,1,2], nargs="?", const=2, default=0) sdparsergrouplora.add_argument("--sdlora", metavar=('[filename]'), help="Specify an image generation LORA safetensors model to be applied.", default="") sdparsergroup.add_argument("--sdloramult", metavar=('[amount]'), help="Multiplier for the image LORA model to be applied.", type=float, default=1.0) sdparsergroup.add_argument("--sdtiledvae", metavar=('[maxres]'), help="Adjust the automatic VAE tiling trigger for images above this size. 0 disables vae tiling.", type=int, default=default_vae_tile_threshold) diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp index ba10e881a..c43d03adf 100644 --- a/otherarch/sdcpp/sdtype_adapter.cpp +++ b/otherarch/sdcpp/sdtype_adapter.cpp @@ -221,7 +221,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { { printf("Conv2D Direct for VAE model is enabled\n"); } - if(inputs.quant) + if(inputs.quant > 0) { printf("Note: Loading a pre-quantized model is always faster than using compress weights!\n"); } @@ -253,7 +253,11 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { sd_params = new SDParams(); sd_params->model_path = inputs.model_filename; - sd_params->wtype = (inputs.quant==0?SD_TYPE_COUNT:SD_TYPE_Q4_0); + sd_params->wtype = SD_TYPE_COUNT; + if (inputs.quant > 0 && inputs.quant < SD_TYPE_COUNT) { + sd_params->wtype = (sd_type_t) inputs.quant; + printf("\nDiffusion Model quantized to %s", sd_type_name(sd_params->wtype)); + } sd_params->n_threads = inputs.threads; //if -1 use physical cores sd_params->diffusion_flash_attn = inputs.flash_attention; sd_params->diffusion_conv_direct = inputs.diffusion_conv_direct;