Minor adjustments for sdquant: let the backend translate the quantization level into the weight type more defensively, and adjust the UI dropdown for clarity.

2025-09-10 17:14:36 +00:00 · 2025-08-22 23:23:32 +08:00 · 2025-08-22 23:23:32 +08:00 · 80dabbb689
commit 80dabbb689
parent 2f8b0ec538
2 changed files with 13 additions and 10 deletions
--- a/koboldcpp.py
+++ b/koboldcpp.py
@ -1657,10 +1657,14 @@ def sd_convdirect_option(value):
        return 'full'
    raise argparse.ArgumentTypeError(f"Invalid sdconvdirect option \"{value}\". Must be one of {sd_convdirect_choices}.")

-sd_quant_choices = ['0   (off)', '1   (q8_0)', '2   (q4_0)']
+sd_quant_choices = ['off','q8','q4']

 def sd_quant_option(value):
-    return int((value or '0')[0])
+    try:
+        lvl = sd_quant_choices.index(value)
+        return lvl
+    except Exception:
+        return 0

 def sd_load_model(model_filename,vae_filename,lora_filename,t5xxl_filename,clipl_filename,clipg_filename,photomaker_filename):
    global args
@ -1674,8 +1678,7 @@ def sd_load_model(model_filename,vae_filename,lora_filename,t5xxl_filename,clipl
            thds = sdt

    inputs.threads = thds
-    sd_quant_types = {0: -1, 1: 8, 2: 2} # enum sd_type_t
-    inputs.quant = sd_quant_types[args.sdquant]
+    inputs.quant = args.sdquant
    inputs.flash_attention = args.sdflashattention
    sdconvdirect = sd_convdirect_option(args.sdconvdirect)
    inputs.diffusion_conv_direct = sdconvdirect == 'full'
@ -4943,7 +4946,7 @@ def show_gui():
        pass

    def changed_gpulayers_estimate(*args):
-        predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),sd_quant_option(sd_quant_var.get() or 0),int(blasbatchsize_values[int(blas_size_var.get())]),(quantkv_var.get() if flashattention_var.get()==1 else 0))
+        predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),sd_quant_option(sd_quant_var.get()),int(blasbatchsize_values[int(blas_size_var.get())]),(quantkv_var.get() if flashattention_var.get()==1 else 0))
        max_gpu_layers = (f"/{modelfile_extracted_meta[1][0]+3}" if (modelfile_extracted_meta and modelfile_extracted_meta[1] and modelfile_extracted_meta[1][0]!=0) else "")
        index = runopts_var.get()
        gpu_be = (index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CLBlast (Older CPU)" or index == "Use CUDA" or index == "Use hipBLAS (ROCm)")
@ -5345,7 +5348,7 @@ def show_gui():
    makelabelentry(images_tab, "(Soft):", sd_clamped_soft_var, 4, 50, padx=290,singleline=True,tooltip="Square image size restriction, to protect the server against memory crashes.\nAllows width-height tradeoffs, eg. 640 allows 640x640 and 512x768\nLeave at 0 for the default value: 832 for SD1.5/SD2, 1024 otherwise.",labelpadx=250)
    makelabelentry(images_tab, "Image Threads:" , sd_threads_var, 8, 50,padx=290,singleline=True,tooltip="How many threads to use during image generation.\nIf left blank, uses same value as threads.")
    sd_model_var.trace_add("write", gui_changed_modelfile)
-    makelabelcombobox(images_tab, "Compress Weights (Saves Memory)", sd_quant_var, 10, width=50, labelpadx=65, tooltiptxt="Quantizes the SD model weights to save memory.\nHigher levels save more memory, and cause more quality degradation.", values=sd_quant_choices)
+    makelabelcombobox(images_tab, "Compress Weights (Saves Memory): ", sd_quant_var, 10, width=60, padx=220, labelpadx=8, tooltiptxt="Quantizes the SD model weights to save memory.\nHigher levels save more memory, and cause more quality degradation.", values=sd_quant_choices)
    sd_quant_var.trace_add("write", changed_gpulayers_estimate)

    makefileentry(images_tab, "Image LoRA (safetensors/gguf):", "Select SD lora file",sd_lora_var, 20, width=280, singlecol=True, filetypes=[("*.safetensors *.gguf", "*.safetensors *.gguf")],tooltiptxt="Select a .safetensors or .gguf SD LoRA model file to be loaded. Should be unquantized!")
@ -5840,7 +5843,7 @@ def show_gui():
        sd_clamped_var.set(int(dict["sdclamped"]) if ("sdclamped" in dict and dict["sdclamped"]) else 0)
        sd_clamped_soft_var.set(int(dict["sdclampedsoft"]) if ("sdclampedsoft" in dict and dict["sdclampedsoft"]) else 0)
        sd_threads_var.set(str(dict["sdthreads"]) if ("sdthreads" in dict and dict["sdthreads"]) else str(default_threads))
-        sd_quant_var.set(sd_quant_choices[(dict["sdquant"] if "sdquant" in dict else 0)])
+        sd_quant_var.set(sd_quant_choices[(dict["sdquant"] if ("sdquant" in dict and dict["sdquant"]>=0 and dict["sdquant"]<len(sd_quant_choices)) else 0)])
        sd_flash_attention_var.set(1 if ("sdflashattention" in dict and dict["sdflashattention"]) else 0)
        sd_convdirect_var.set(sd_convdirect_option(dict.get("sdconvdirect")))
        sd_vae_var.set(dict["sdvae"] if ("sdvae" in dict and dict["sdvae"]) else "")
--- a/otherarch/sdcpp/sdtype_adapter.cpp
+++ b/otherarch/sdcpp/sdtype_adapter.cpp
@ -254,9 +254,9 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
    sd_params = new SDParams();
    sd_params->model_path = inputs.model_filename;
    sd_params->wtype = SD_TYPE_COUNT;
-    if (inputs.quant > 0 && inputs.quant < SD_TYPE_COUNT) {
-        sd_params->wtype = (sd_type_t) inputs.quant;
-        printf("\nDiffusion Model quantized to %s", sd_type_name(sd_params->wtype));
+    if (inputs.quant > 0) {
+        sd_params->wtype = (inputs.quant==1?SD_TYPE_Q8_0:SD_TYPE_Q4_0);
+        printf("Diffusion Model quantized to %s\n", sd_type_name(sd_params->wtype));
    }
    sd_params->n_threads = inputs.threads; //if -1 use physical cores
    sd_params->diffusion_flash_attn = inputs.flash_attention;