Support q8_0 quantization for image model loading (#1692)

* Support q8_0 quantization for image model loading

q4_0 may degrade quality significantly, especially for smaller
models like SD 1.5 and SDXL. q8_0 provides a middle-ground,
giving half the memory savings of q4_0 but loading faster and
with less quality loss.

* Accept --sdquant with no parameters

* Use numerical values for the sdquant option
This commit is contained in:
Wagner Bruna 2025-08-22 11:17:15 -03:00 committed by GitHub
parent 7fef0bc949
commit 2f8b0ec538
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 25 additions and 16 deletions

View file

@ -1136,10 +1136,11 @@ def autoset_gpu_layers(ctxsize, sdquanted, bbs, qkv_level): #shitty algo to dete
showmultigpuwarning = False
print("Multi-Part GGUF detected. Layer estimates may not be very accurate - recommend setting layers manually.")
fsize *= total_parts
sdquantsavings = sdquanted
if modelfile_extracted_meta[3] > 1024*1024*1024*5: #sdxl tax
mem -= 1024*1024*1024*(6 if sdquanted else 9)
mem -= 1024*1024*1024*(9 - sdquantsavings * 1.5) # 9, 7.5, 6
elif modelfile_extracted_meta[3] > 1024*1024*512: #normal sd tax
mem -= 1024*1024*1024*(3.25 if sdquanted else 4.25)
mem -= 1024*1024*1024*(4.25 - sdquantsavings * 0.5) # 4.25, 3.75, 3.25
if modelfile_extracted_meta[4] > 1024*1024*10: #whisper tax
mem -= max(350*1024*1024,modelfile_extracted_meta[4]*1.5)
if modelfile_extracted_meta[5] > 1024*1024*10: #mmproj tax
@ -1656,22 +1657,25 @@ def sd_convdirect_option(value):
return 'full'
raise argparse.ArgumentTypeError(f"Invalid sdconvdirect option \"{value}\". Must be one of {sd_convdirect_choices}.")
sd_quant_choices = ['0 (off)', '1 (q8_0)', '2 (q4_0)']
def sd_quant_option(value):
return int((value or '0')[0])
def sd_load_model(model_filename,vae_filename,lora_filename,t5xxl_filename,clipl_filename,clipg_filename,photomaker_filename):
global args
inputs = sd_load_model_inputs()
inputs.model_filename = model_filename.encode("UTF-8")
thds = args.threads
quant = 0
if args.sdthreads and args.sdthreads > 0:
sdt = int(args.sdthreads)
if sdt > 0:
thds = sdt
if args.sdquant:
quant = 1
inputs.threads = thds
inputs.quant = quant
sd_quant_types = {0: -1, 1: 8, 2: 2} # enum sd_type_t
inputs.quant = sd_quant_types[args.sdquant]
inputs.flash_attention = args.sdflashattention
sdconvdirect = sd_convdirect_option(args.sdconvdirect)
inputs.diffusion_conv_direct = sdconvdirect == 'full'
@ -4596,7 +4600,7 @@ def show_gui():
sd_clamped_var = ctk.StringVar(value="0")
sd_clamped_soft_var = ctk.StringVar(value="0")
sd_threads_var = ctk.StringVar(value=str(default_threads))
sd_quant_var = ctk.IntVar(value=0)
sd_quant_var = ctk.StringVar(value=sd_quant_choices[0])
whisper_model_var = ctk.StringVar()
tts_model_var = ctk.StringVar()
@ -4939,7 +4943,7 @@ def show_gui():
pass
def changed_gpulayers_estimate(*args):
predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),(sd_quant_var.get()==1),int(blasbatchsize_values[int(blas_size_var.get())]),(quantkv_var.get() if flashattention_var.get()==1 else 0))
predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),sd_quant_option(sd_quant_var.get() or 0),int(blasbatchsize_values[int(blas_size_var.get())]),(quantkv_var.get() if flashattention_var.get()==1 else 0))
max_gpu_layers = (f"/{modelfile_extracted_meta[1][0]+3}" if (modelfile_extracted_meta and modelfile_extracted_meta[1] and modelfile_extracted_meta[1][0]!=0) else "")
index = runopts_var.get()
gpu_be = (index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CLBlast (Older CPU)" or index == "Use CUDA" or index == "Use hipBLAS (ROCm)")
@ -5341,7 +5345,7 @@ def show_gui():
makelabelentry(images_tab, "(Soft):", sd_clamped_soft_var, 4, 50, padx=290,singleline=True,tooltip="Square image size restriction, to protect the server against memory crashes.\nAllows width-height tradeoffs, eg. 640 allows 640x640 and 512x768\nLeave at 0 for the default value: 832 for SD1.5/SD2, 1024 otherwise.",labelpadx=250)
makelabelentry(images_tab, "Image Threads:" , sd_threads_var, 8, 50,padx=290,singleline=True,tooltip="How many threads to use during image generation.\nIf left blank, uses same value as threads.")
sd_model_var.trace_add("write", gui_changed_modelfile)
makecheckbox(images_tab, "Compress Weights (Saves Memory)", sd_quant_var, 10,tooltiptxt="Quantizes the SD model weights to save memory. May degrade quality.")
makelabelcombobox(images_tab, "Compress Weights (Saves Memory)", sd_quant_var, 10, width=50, labelpadx=65, tooltiptxt="Quantizes the SD model weights to save memory.\nHigher levels save more memory, and cause more quality degradation.", values=sd_quant_choices)
sd_quant_var.trace_add("write", changed_gpulayers_estimate)
makefileentry(images_tab, "Image LoRA (safetensors/gguf):", "Select SD lora file",sd_lora_var, 20, width=280, singlecol=True, filetypes=[("*.safetensors *.gguf", "*.safetensors *.gguf")],tooltiptxt="Select a .safetensors or .gguf SD LoRA model file to be loaded. Should be unquantized!")
@ -5626,8 +5630,7 @@ def show_gui():
args.sdclipg = sd_clipg_var.get()
if sd_photomaker_var.get() != "":
args.sdphotomaker = sd_photomaker_var.get()
if sd_quant_var.get()==1:
args.sdquant = True
args.sdquant = sd_quant_option(sd_quant_var.get())
if sd_lora_var.get() != "":
args.sdlora = sd_lora_var.get()
args.sdloramult = float(sd_loramult_var.get())
@ -5837,7 +5840,7 @@ def show_gui():
sd_clamped_var.set(int(dict["sdclamped"]) if ("sdclamped" in dict and dict["sdclamped"]) else 0)
sd_clamped_soft_var.set(int(dict["sdclampedsoft"]) if ("sdclampedsoft" in dict and dict["sdclampedsoft"]) else 0)
sd_threads_var.set(str(dict["sdthreads"]) if ("sdthreads" in dict and dict["sdthreads"]) else str(default_threads))
sd_quant_var.set(1 if ("sdquant" in dict and dict["sdquant"]) else 0)
sd_quant_var.set(sd_quant_choices[(dict["sdquant"] if "sdquant" in dict else 0)])
sd_flash_attention_var.set(1 if ("sdflashattention" in dict and dict["sdflashattention"]) else 0)
sd_convdirect_var.set(sd_convdirect_option(dict.get("sdconvdirect")))
sd_vae_var.set(dict["sdvae"] if ("sdvae" in dict and dict["sdvae"]) else "")
@ -6190,7 +6193,7 @@ def convert_invalid_args(args):
if dict["sdconfig"] and len(dict["sdconfig"]) > 2:
dict["sdthreads"] = int(dict["sdconfig"][2])
if dict["sdconfig"] and len(dict["sdconfig"]) > 3:
dict["sdquant"] = (True if dict["sdconfig"][3]=="quant" else False)
dict["sdquant"] = (2 if dict["sdconfig"][3]=="quant" else 0)
if "hordeconfig" in dict and dict["hordeconfig"] and dict["hordeconfig"][0]!="":
dict["hordemodelname"] = dict["hordeconfig"][0]
if len(dict["hordeconfig"]) > 1:
@ -6216,6 +6219,8 @@ def convert_invalid_args(args):
dict["model_param"] = model_value[0] # Take the first file in the list
if "sdnotile" in dict and "sdtiledvae" not in dict:
dict["sdtiledvae"] = (0 if (dict["sdnotile"]) else default_vae_tile_threshold) # convert legacy option
if 'sdquant' in dict and type(dict['sdquant']) is bool:
dict['sdquant'] = 2 if dict['sdquant'] else 0
return args
def setuptunnel(global_memory, has_sd):
@ -7648,7 +7653,7 @@ if __name__ == '__main__':
sdparsergroupvae.add_argument("--sdvae", metavar=('[filename]'), help="Specify an image generation safetensors VAE which replaces the one in the model.", default="")
sdparsergroupvae.add_argument("--sdvaeauto", help="Uses a built-in VAE via TAE SD, which is very fast, and fixed bad VAEs.", action='store_true')
sdparsergrouplora = sdparsergroup.add_mutually_exclusive_group()
sdparsergrouplora.add_argument("--sdquant", help="If specified, loads the model quantized to save memory.", action='store_true')
sdparsergrouplora.add_argument("--sdquant", metavar=('[quantization level 0/1/2]'), help="If specified, loads the model quantized to save memory. 0=off, 1=q8, 2=q4", type=int, choices=[0,1,2], nargs="?", const=2, default=0)
sdparsergrouplora.add_argument("--sdlora", metavar=('[filename]'), help="Specify an image generation LORA safetensors model to be applied.", default="")
sdparsergroup.add_argument("--sdloramult", metavar=('[amount]'), help="Multiplier for the image LORA model to be applied.", type=float, default=1.0)
sdparsergroup.add_argument("--sdtiledvae", metavar=('[maxres]'), help="Adjust the automatic VAE tiling trigger for images above this size. 0 disables vae tiling.", type=int, default=default_vae_tile_threshold)