Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 09:04:36 +00:00)
Support q8_0 quantization for image model loading (#1692)
* Support q8_0 quantization for image model loading

  q4_0 may degrade quality significantly, especially for smaller models like SD 1.5 and SDXL. q8_0 provides a middle ground, giving half the memory savings of q4_0 but loading faster and with less quality loss.

* Accept --sdquant with no parameters
* Use numerical values for the sdquant option
parent 7fef0bc949
commit 2f8b0ec538

2 changed files with 25 additions and 16 deletions

koboldcpp.py (33 changed lines):
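For a sense of scale behind the commit message's claim: ggml's q8_0 format stores blocks of 32 weights at 8 bits plus one fp16 scale (about 8.5 bits/weight), while q4_0 stores 4-bit weights plus a scale (about 4.5 bits/weight). A minimal sketch of the resulting sizes, using a made-up 2 GB fp16 checkpoint:

# Rough size arithmetic; the 2 GB fp16 checkpoint is hypothetical.
fp16_gb = 2.0
for name, bpw in [("fp16", 16.0), ("q8_0", 8.5), ("q4_0", 4.5)]:
    print(f"{name}: ~{fp16_gb * bpw / 16.0:.2f} GB")
# fp16: ~2.00 GB, q8_0: ~1.06 GB, q4_0: ~0.56 GB -- q8_0 sits between
# the unquantized and q4_0 sizes, the "middle ground" the message describes.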
@@ -1136,10 +1136,11 @@ def autoset_gpu_layers(ctxsize, sdquanted, bbs, qkv_level): #shitty algo to dete
             showmultigpuwarning = False
             print("Multi-Part GGUF detected. Layer estimates may not be very accurate - recommend setting layers manually.")
             fsize *= total_parts
+        sdquantsavings = sdquanted
         if modelfile_extracted_meta[3] > 1024*1024*1024*5: #sdxl tax
-            mem -= 1024*1024*1024*(6 if sdquanted else 9)
+            mem -= 1024*1024*1024*(9 - sdquantsavings * 1.5) # 9, 7.5, 6
         elif modelfile_extracted_meta[3] > 1024*1024*512: #normal sd tax
-            mem -= 1024*1024*1024*(3.25 if sdquanted else 4.25)
+            mem -= 1024*1024*1024*(4.25 - sdquantsavings * 0.5) # 4.25, 3.75, 3.25
         if modelfile_extracted_meta[4] > 1024*1024*10: #whisper tax
             mem -= max(350*1024*1024,modelfile_extracted_meta[4]*1.5)
         if modelfile_extracted_meta[5] > 1024*1024*10: #mmproj tax
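With `sdquanted` now carrying the 0/1/2 quant level instead of a boolean, the updated taxes interpolate linearly between the old unquantized and q4_0 endpoints. A minimal sketch reproducing the figures from the hunk's comments:

# Estimated VRAM (GiB) reserved for the image model per --sdquant level,
# mirroring the two updated lines above.
def sd_tax_gib(level, sdxl):
    return (9 - level * 1.5) if sdxl else (4.25 - level * 0.5)

for level in (0, 1, 2):
    print(level, sd_tax_gib(level, sdxl=True), sd_tax_gib(level, sdxl=False))
# level 0 -> 9.0 / 4.25, 1 -> 7.5 / 3.75, 2 -> 6.0 / 3.25, matching the
# old unquantized (9, 4.25) and quantized (6, 3.25) endpoints.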
@@ -1656,22 +1657,25 @@ def sd_convdirect_option(value):
         return 'full'
     raise argparse.ArgumentTypeError(f"Invalid sdconvdirect option \"{value}\". Must be one of {sd_convdirect_choices}.")

+sd_quant_choices = ['0 (off)', '1 (q8_0)', '2 (q4_0)']
+
+def sd_quant_option(value):
+    return int((value or '0')[0])
+
 def sd_load_model(model_filename,vae_filename,lora_filename,t5xxl_filename,clipl_filename,clipg_filename,photomaker_filename):
     global args
     inputs = sd_load_model_inputs()
     inputs.model_filename = model_filename.encode("UTF-8")
     thds = args.threads
-    quant = 0

     if args.sdthreads and args.sdthreads > 0:
         sdt = int(args.sdthreads)
         if sdt > 0:
             thds = sdt
-    if args.sdquant:
-        quant = 1

     inputs.threads = thds
-    inputs.quant = quant
+    sd_quant_types = {0: -1, 1: 8, 2: 2} # enum sd_type_t
+    inputs.quant = sd_quant_types[args.sdquant]
     inputs.flash_attention = args.sdflashattention
     sdconvdirect = sd_convdirect_option(args.sdconvdirect)
     inputs.diffusion_conv_direct = sdconvdirect == 'full'
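`sd_quant_option` keys off the leading digit of the combobox string, and `sd_quant_types` maps each level directly onto a `sd_type_t` value (2 and 8 are Q4_0 and Q8_0 in ggml's type enum, which stable-diffusion.cpp mirrors; -1 signals "no requantization" and fails the `quant > 0` checks on the C side). A quick round-trip check, reusing the definitions from this hunk:

sd_quant_choices = ['0 (off)', '1 (q8_0)', '2 (q4_0)']

def sd_quant_option(value):
    return int((value or '0')[0])

for level in (0, 1, 2):
    assert sd_quant_option(sd_quant_choices[level]) == level  # GUI string -> level
assert sd_quant_option(None) == 0  # empty/missing value falls back to off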
@@ -4596,7 +4600,7 @@ def show_gui():
     sd_clamped_var = ctk.StringVar(value="0")
     sd_clamped_soft_var = ctk.StringVar(value="0")
     sd_threads_var = ctk.StringVar(value=str(default_threads))
-    sd_quant_var = ctk.IntVar(value=0)
+    sd_quant_var = ctk.StringVar(value=sd_quant_choices[0])

     whisper_model_var = ctk.StringVar()
     tts_model_var = ctk.StringVar()
@@ -4939,7 +4943,7 @@ def show_gui():
             pass

     def changed_gpulayers_estimate(*args):
-        predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),(sd_quant_var.get()==1),int(blasbatchsize_values[int(blas_size_var.get())]),(quantkv_var.get() if flashattention_var.get()==1 else 0))
+        predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),sd_quant_option(sd_quant_var.get() or 0),int(blasbatchsize_values[int(blas_size_var.get())]),(quantkv_var.get() if flashattention_var.get()==1 else 0))
         max_gpu_layers = (f"/{modelfile_extracted_meta[1][0]+3}" if (modelfile_extracted_meta and modelfile_extracted_meta[1] and modelfile_extracted_meta[1][0]!=0) else "")
         index = runopts_var.get()
         gpu_be = (index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CLBlast (Older CPU)" or index == "Use CUDA" or index == "Use hipBLAS (ROCm)")
@@ -5341,7 +5345,7 @@ def show_gui():
         makelabelentry(images_tab, "(Soft):", sd_clamped_soft_var, 4, 50, padx=290,singleline=True,tooltip="Square image size restriction, to protect the server against memory crashes.\nAllows width-height tradeoffs, eg. 640 allows 640x640 and 512x768\nLeave at 0 for the default value: 832 for SD1.5/SD2, 1024 otherwise.",labelpadx=250)
         makelabelentry(images_tab, "Image Threads:" , sd_threads_var, 8, 50,padx=290,singleline=True,tooltip="How many threads to use during image generation.\nIf left blank, uses same value as threads.")
         sd_model_var.trace_add("write", gui_changed_modelfile)
-        makecheckbox(images_tab, "Compress Weights (Saves Memory)", sd_quant_var, 10,tooltiptxt="Quantizes the SD model weights to save memory. May degrade quality.")
+        makelabelcombobox(images_tab, "Compress Weights (Saves Memory)", sd_quant_var, 10, width=50, labelpadx=65, tooltiptxt="Quantizes the SD model weights to save memory.\nHigher levels save more memory, and cause more quality degradation.", values=sd_quant_choices)
         sd_quant_var.trace_add("write", changed_gpulayers_estimate)

         makefileentry(images_tab, "Image LoRA (safetensors/gguf):", "Select SD lora file",sd_lora_var, 20, width=280, singlecol=True, filetypes=[("*.safetensors *.gguf", "*.safetensors *.gguf")],tooltiptxt="Select a .safetensors or .gguf SD LoRA model file to be loaded. Should be unquantized!")
@@ -5626,8 +5630,7 @@ def show_gui():
         args.sdclipg = sd_clipg_var.get()
         if sd_photomaker_var.get() != "":
             args.sdphotomaker = sd_photomaker_var.get()
-        if sd_quant_var.get()==1:
-            args.sdquant = True
+        args.sdquant = sd_quant_option(sd_quant_var.get())
         if sd_lora_var.get() != "":
             args.sdlora = sd_lora_var.get()
         args.sdloramult = float(sd_loramult_var.get())
@@ -5837,7 +5840,7 @@ def show_gui():
         sd_clamped_var.set(int(dict["sdclamped"]) if ("sdclamped" in dict and dict["sdclamped"]) else 0)
         sd_clamped_soft_var.set(int(dict["sdclampedsoft"]) if ("sdclampedsoft" in dict and dict["sdclampedsoft"]) else 0)
         sd_threads_var.set(str(dict["sdthreads"]) if ("sdthreads" in dict and dict["sdthreads"]) else str(default_threads))
-        sd_quant_var.set(1 if ("sdquant" in dict and dict["sdquant"]) else 0)
+        sd_quant_var.set(sd_quant_choices[(dict["sdquant"] if "sdquant" in dict else 0)])
         sd_flash_attention_var.set(1 if ("sdflashattention" in dict and dict["sdflashattention"]) else 0)
         sd_convdirect_var.set(sd_convdirect_option(dict.get("sdconvdirect")))
         sd_vae_var.set(dict["sdvae"] if ("sdvae" in dict and dict["sdvae"]) else "")
@@ -6190,7 +6193,7 @@ def convert_invalid_args(args):
         if dict["sdconfig"] and len(dict["sdconfig"]) > 2:
             dict["sdthreads"] = int(dict["sdconfig"][2])
         if dict["sdconfig"] and len(dict["sdconfig"]) > 3:
-            dict["sdquant"] = (True if dict["sdconfig"][3]=="quant" else False)
+            dict["sdquant"] = (2 if dict["sdconfig"][3]=="quant" else 0)
     if "hordeconfig" in dict and dict["hordeconfig"] and dict["hordeconfig"][0]!="":
         dict["hordemodelname"] = dict["hordeconfig"][0]
         if len(dict["hordeconfig"]) > 1:
@@ -6216,6 +6219,8 @@ def convert_invalid_args(args):
             dict["model_param"] = model_value[0] # Take the first file in the list
     if "sdnotile" in dict and "sdtiledvae" not in dict:
         dict["sdtiledvae"] = (0 if (dict["sdnotile"]) else default_vae_tile_threshold) # convert legacy option
+    if 'sdquant' in dict and type(dict['sdquant']) is bool:
+        dict['sdquant'] = 2 if dict['sdquant'] else 0
     return args

 def setuptunnel(global_memory, has_sd):
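This shim keeps previously saved settings working: files written before this change stored `sdquant` as a boolean, and the old `True` meant q4_0 compression, hence level 2. A minimal illustration with made-up legacy dicts:

# Hypothetical settings dicts as loaded from older save files.
for legacy in ({'sdquant': True}, {'sdquant': False}, {'sdquant': 1}):
    d = dict(legacy)
    if 'sdquant' in d and type(d['sdquant']) is bool:
        d['sdquant'] = 2 if d['sdquant'] else 0
    print(legacy, '->', d)
# True -> 2, False -> 0; an already-numeric value passes through untouched.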
@@ -7648,7 +7653,7 @@ if __name__ == '__main__':
     sdparsergroupvae.add_argument("--sdvae", metavar=('[filename]'), help="Specify an image generation safetensors VAE which replaces the one in the model.", default="")
     sdparsergroupvae.add_argument("--sdvaeauto", help="Uses a built-in VAE via TAE SD, which is very fast, and fixed bad VAEs.", action='store_true')
     sdparsergrouplora = sdparsergroup.add_mutually_exclusive_group()
-    sdparsergrouplora.add_argument("--sdquant", help="If specified, loads the model quantized to save memory.", action='store_true')
+    sdparsergrouplora.add_argument("--sdquant", metavar=('[quantization level 0/1/2]'), help="If specified, loads the model quantized to save memory. 0=off, 1=q8, 2=q4", type=int, choices=[0,1,2], nargs="?", const=2, default=0)
     sdparsergrouplora.add_argument("--sdlora", metavar=('[filename]'), help="Specify an image generation LORA safetensors model to be applied.", default="")
     sdparsergroup.add_argument("--sdloramult", metavar=('[amount]'), help="Multiplier for the image LORA model to be applied.", type=float, default=1.0)
     sdparsergroup.add_argument("--sdtiledvae", metavar=('[maxres]'), help="Adjust the automatic VAE tiling trigger for images above this size. 0 disables vae tiling.", type=int, default=default_vae_tile_threshold)
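The `nargs="?"`/`const=2`/`default=0` combination is what keeps a bare `--sdquant` backward compatible (it still means full q4_0 compression) while allowing an explicit level. A stripped-down sketch of just that argument:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--sdquant", type=int, choices=[0, 1, 2],
                    nargs="?", const=2, default=0)

assert parser.parse_args([]).sdquant == 0                  # flag absent: off
assert parser.parse_args(["--sdquant"]).sdquant == 2       # bare flag: q4_0
assert parser.parse_args(["--sdquant", "1"]).sdquant == 1  # explicit q8_0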
@@ -221,7 +221,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
     {
         printf("Conv2D Direct for VAE model is enabled\n");
     }
-    if(inputs.quant)
+    if(inputs.quant > 0)
     {
         printf("Note: Loading a pre-quantized model is always faster than using compress weights!\n");
     }
@@ -253,7 +253,11 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {

     sd_params = new SDParams();
     sd_params->model_path = inputs.model_filename;
-    sd_params->wtype = (inputs.quant==0?SD_TYPE_COUNT:SD_TYPE_Q4_0);
+    sd_params->wtype = SD_TYPE_COUNT;
+    if (inputs.quant > 0 && inputs.quant < SD_TYPE_COUNT) {
+        sd_params->wtype = (sd_type_t) inputs.quant;
+        printf("\nDiffusion Model quantized to %s", sd_type_name(sd_params->wtype));
+    }
     sd_params->n_threads = inputs.threads; //if -1 use physical cores
     sd_params->diffusion_flash_attn = inputs.flash_attention;
     sd_params->diffusion_conv_direct = inputs.diffusion_conv_direct;