Mirror of https://github.com/LostRuins/koboldcpp.git (synced 2025-09-10 09:04:36 +00:00)
Support q8_0 quantization for image model loading (#1692)
* Support q8_0 quantization for image model loading

  q4_0 may degrade quality significantly, especially for smaller models like SD 1.5 and SDXL. q8_0 provides a middle ground, giving half the memory savings of q4_0 but loading faster and with less quality loss.

* Accept --sdquant with no parameters
* Use numerical values for the sdquant option
parent 7fef0bc949
commit 2f8b0ec538

2 changed files with 25 additions and 16 deletions

koboldcpp.py (33 changed lines):
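For a sense of scale behind the commit message's claim: ggml's q8_0 format stores blocks of 32 weights at 8 bits plus one fp16 scale (about 8.5 bits/weight), while q4_0 stores 4-bit weights plus a scale (about 4.5 bits/weight). A minimal sketch of the resulting sizes, using a made-up 2 GB fp16 checkpoint:

# Rough size arithmetic; the 2 GB fp16 checkpoint is hypothetical.
fp16_gb = 2.0
for name, bpw in [("fp16", 16.0), ("q8_0", 8.5), ("q4_0", 4.5)]:
    print(f"{name}: ~{fp16_gb * bpw / 16.0:.2f} GB")
# fp16: ~2.00 GB, q8_0: ~1.06 GB, q4_0: ~0.56 GB -- q8_0 sits between
# the unquantized and q4_0 sizes, the "middle ground" the message describes.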
@@ -1136,10 +1136,11 @@ def autoset_gpu_layers(ctxsize, sdquanted, bbs, qkv_level): #shitty algo to dete
             showmultigpuwarning = False
             print("Multi-Part GGUF detected. Layer estimates may not be very accurate - recommend setting layers manually.")
             fsize *= total_parts
+        sdquantsavings = sdquanted
         if modelfile_extracted_meta[3] > 1024*1024*1024*5: #sdxl tax
-            mem -= 1024*1024*1024*(6 if sdquanted else 9)
+            mem -= 1024*1024*1024*(9 - sdquantsavings * 1.5) # 9, 7.5, 6
         elif modelfile_extracted_meta[3] > 1024*1024*512: #normal sd tax
-            mem -= 1024*1024*1024*(3.25 if sdquanted else 4.25)
+            mem -= 1024*1024*1024*(4.25 - sdquantsavings * 0.5) # 4.25, 3.75, 3.25
         if modelfile_extracted_meta[4] > 1024*1024*10: #whisper tax
             mem -= max(350*1024*1024,modelfile_extracted_meta[4]*1.5)
         if modelfile_extracted_meta[5] > 1024*1024*10: #mmproj tax
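With `sdquanted` now carrying the 0/1/2 quant level instead of a boolean, the updated taxes interpolate linearly between the old unquantized and q4_0 endpoints. A minimal sketch reproducing the figures from the hunk's comments:

# Estimated VRAM (GiB) reserved for the image model per --sdquant level,
# mirroring the two updated lines above.
def sd_tax_gib(level, sdxl):
    return (9 - level * 1.5) if sdxl else (4.25 - level * 0.5)

for level in (0, 1, 2):
    print(level, sd_tax_gib(level, sdxl=True), sd_tax_gib(level, sdxl=False))
# level 0 -> 9.0 / 4.25, 1 -> 7.5 / 3.75, 2 -> 6.0 / 3.25, matching the
# old unquantized (9, 4.25) and quantized (6, 3.25) endpoints.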
@@ -1656,22 +1657,25 @@ def sd_convdirect_option(value):
         return 'full'
     raise argparse.ArgumentTypeError(f"Invalid sdconvdirect option \"{value}\". Must be one of {sd_convdirect_choices}.")

+sd_quant_choices = ['0 (off)', '1 (q8_0)', '2 (q4_0)']
+
+def sd_quant_option(value):
+    return int((value or '0')[0])
+
 def sd_load_model(model_filename,vae_filename,lora_filename,t5xxl_filename,clipl_filename,clipg_filename,photomaker_filename):
     global args
     inputs = sd_load_model_inputs()
     inputs.model_filename = model_filename.encode("UTF-8")
     thds = args.threads
-    quant = 0

     if args.sdthreads and args.sdthreads > 0:
         sdt = int(args.sdthreads)
         if sdt > 0:
             thds = sdt
-    if args.sdquant:
-        quant = 1

     inputs.threads = thds
-    inputs.quant = quant
+    sd_quant_types = {0: -1, 1: 8, 2: 2} # enum sd_type_t
+    inputs.quant = sd_quant_types[args.sdquant]
     inputs.flash_attention = args.sdflashattention
     sdconvdirect = sd_convdirect_option(args.sdconvdirect)
     inputs.diffusion_conv_direct = sdconvdirect == 'full'
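`sd_quant_option` keys off the leading digit of the combobox string, and `sd_quant_types` maps each level directly onto a `sd_type_t` value (2 and 8 are Q4_0 and Q8_0 in ggml's type enum, which stable-diffusion.cpp mirrors; -1 signals "no requantization" and fails the `quant > 0` checks on the C side). A quick round-trip check, reusing the definitions from this hunk:

sd_quant_choices = ['0 (off)', '1 (q8_0)', '2 (q4_0)']

def sd_quant_option(value):
    return int((value or '0')[0])

for level in (0, 1, 2):
    assert sd_quant_option(sd_quant_choices[level]) == level  # GUI string -> level
assert sd_quant_option(None) == 0  # empty/missing value falls back to off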
@@ -4596,7 +4600,7 @@ def show_gui():
     sd_clamped_var = ctk.StringVar(value="0")
     sd_clamped_soft_var = ctk.StringVar(value="0")
     sd_threads_var = ctk.StringVar(value=str(default_threads))
-    sd_quant_var = ctk.IntVar(value=0)
+    sd_quant_var = ctk.StringVar(value=sd_quant_choices[0])

     whisper_model_var = ctk.StringVar()
     tts_model_var = ctk.StringVar()
@@ -4939,7 +4943,7 @@ def show_gui():
             pass

     def changed_gpulayers_estimate(*args):
-        predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),(sd_quant_var.get()==1),int(blasbatchsize_values[int(blas_size_var.get())]),(quantkv_var.get() if flashattention_var.get()==1 else 0))
+        predicted_gpu_layers = autoset_gpu_layers(int(contextsize_text[context_var.get()]),sd_quant_option(sd_quant_var.get() or 0),int(blasbatchsize_values[int(blas_size_var.get())]),(quantkv_var.get() if flashattention_var.get()==1 else 0))
         max_gpu_layers = (f"/{modelfile_extracted_meta[1][0]+3}" if (modelfile_extracted_meta and modelfile_extracted_meta[1] and modelfile_extracted_meta[1][0]!=0) else "")
         index = runopts_var.get()
         gpu_be = (index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CLBlast" or index == "Use CLBlast (Old CPU)" or index == "Use CLBlast (Older CPU)" or index == "Use CUDA" or index == "Use hipBLAS (ROCm)")
@@ -5341,7 +5345,7 @@ def show_gui():
         makelabelentry(images_tab, "(Soft):", sd_clamped_soft_var, 4, 50, padx=290,singleline=True,tooltip="Square image size restriction, to protect the server against memory crashes.\nAllows width-height tradeoffs, eg. 640 allows 640x640 and 512x768\nLeave at 0 for the default value: 832 for SD1.5/SD2, 1024 otherwise.",labelpadx=250)
         makelabelentry(images_tab, "Image Threads:" , sd_threads_var, 8, 50,padx=290,singleline=True,tooltip="How many threads to use during image generation.\nIf left blank, uses same value as threads.")
         sd_model_var.trace_add("write", gui_changed_modelfile)
-        makecheckbox(images_tab, "Compress Weights (Saves Memory)", sd_quant_var, 10,tooltiptxt="Quantizes the SD model weights to save memory. May degrade quality.")
+        makelabelcombobox(images_tab, "Compress Weights (Saves Memory)", sd_quant_var, 10, width=50, labelpadx=65, tooltiptxt="Quantizes the SD model weights to save memory.\nHigher levels save more memory, and cause more quality degradation.", values=sd_quant_choices)
         sd_quant_var.trace_add("write", changed_gpulayers_estimate)

         makefileentry(images_tab, "Image LoRA (safetensors/gguf):", "Select SD lora file",sd_lora_var, 20, width=280, singlecol=True, filetypes=[("*.safetensors *.gguf", "*.safetensors *.gguf")],tooltiptxt="Select a .safetensors or .gguf SD LoRA model file to be loaded. Should be unquantized!")
@@ -5626,8 +5630,7 @@ def show_gui():
         args.sdclipg = sd_clipg_var.get()
         if sd_photomaker_var.get() != "":
             args.sdphotomaker = sd_photomaker_var.get()
-        if sd_quant_var.get()==1:
-            args.sdquant = True
+        args.sdquant = sd_quant_option(sd_quant_var.get())
         if sd_lora_var.get() != "":
             args.sdlora = sd_lora_var.get()
         args.sdloramult = float(sd_loramult_var.get())
@@ -5837,7 +5840,7 @@ def show_gui():
         sd_clamped_var.set(int(dict["sdclamped"]) if ("sdclamped" in dict and dict["sdclamped"]) else 0)
         sd_clamped_soft_var.set(int(dict["sdclampedsoft"]) if ("sdclampedsoft" in dict and dict["sdclampedsoft"]) else 0)
         sd_threads_var.set(str(dict["sdthreads"]) if ("sdthreads" in dict and dict["sdthreads"]) else str(default_threads))
-        sd_quant_var.set(1 if ("sdquant" in dict and dict["sdquant"]) else 0)
+        sd_quant_var.set(sd_quant_choices[(dict["sdquant"] if "sdquant" in dict else 0)])
         sd_flash_attention_var.set(1 if ("sdflashattention" in dict and dict["sdflashattention"]) else 0)
         sd_convdirect_var.set(sd_convdirect_option(dict.get("sdconvdirect")))
         sd_vae_var.set(dict["sdvae"] if ("sdvae" in dict and dict["sdvae"]) else "")
@@ -6190,7 +6193,7 @@ def convert_invalid_args(args):
         if dict["sdconfig"] and len(dict["sdconfig"]) > 2:
             dict["sdthreads"] = int(dict["sdconfig"][2])
         if dict["sdconfig"] and len(dict["sdconfig"]) > 3:
-            dict["sdquant"] = (True if dict["sdconfig"][3]=="quant" else False)
+            dict["sdquant"] = (2 if dict["sdconfig"][3]=="quant" else 0)
     if "hordeconfig" in dict and dict["hordeconfig"] and dict["hordeconfig"][0]!="":
         dict["hordemodelname"] = dict["hordeconfig"][0]
         if len(dict["hordeconfig"]) > 1:
@@ -6216,6 +6219,8 @@ def convert_invalid_args(args):
             dict["model_param"] = model_value[0] # Take the first file in the list
     if "sdnotile" in dict and "sdtiledvae" not in dict:
         dict["sdtiledvae"] = (0 if (dict["sdnotile"]) else default_vae_tile_threshold) # convert legacy option
+    if 'sdquant' in dict and type(dict['sdquant']) is bool:
+        dict['sdquant'] = 2 if dict['sdquant'] else 0
     return args

 def setuptunnel(global_memory, has_sd):
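This shim keeps previously saved settings working: files written before this change stored `sdquant` as a boolean, and the old `True` meant q4_0 compression, hence level 2. A minimal illustration with made-up legacy dicts:

# Hypothetical settings dicts as loaded from older save files.
for legacy in ({'sdquant': True}, {'sdquant': False}, {'sdquant': 1}):
    d = dict(legacy)
    if 'sdquant' in d and type(d['sdquant']) is bool:
        d['sdquant'] = 2 if d['sdquant'] else 0
    print(legacy, '->', d)
# True -> 2, False -> 0; an already-numeric value passes through untouched.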
@@ -7648,7 +7653,7 @@ if __name__ == '__main__':
     sdparsergroupvae.add_argument("--sdvae", metavar=('[filename]'), help="Specify an image generation safetensors VAE which replaces the one in the model.", default="")
     sdparsergroupvae.add_argument("--sdvaeauto", help="Uses a built-in VAE via TAE SD, which is very fast, and fixed bad VAEs.", action='store_true')
     sdparsergrouplora = sdparsergroup.add_mutually_exclusive_group()
-    sdparsergrouplora.add_argument("--sdquant", help="If specified, loads the model quantized to save memory.", action='store_true')
+    sdparsergrouplora.add_argument("--sdquant", metavar=('[quantization level 0/1/2]'), help="If specified, loads the model quantized to save memory. 0=off, 1=q8, 2=q4", type=int, choices=[0,1,2], nargs="?", const=2, default=0)
     sdparsergrouplora.add_argument("--sdlora", metavar=('[filename]'), help="Specify an image generation LORA safetensors model to be applied.", default="")
     sdparsergroup.add_argument("--sdloramult", metavar=('[amount]'), help="Multiplier for the image LORA model to be applied.", type=float, default=1.0)
     sdparsergroup.add_argument("--sdtiledvae", metavar=('[maxres]'), help="Adjust the automatic VAE tiling trigger for images above this size. 0 disables vae tiling.", type=int, default=default_vae_tile_threshold)
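The `nargs="?"`/`const=2`/`default=0` combination is what keeps a bare `--sdquant` backward compatible (it still means full q4_0 compression) while allowing an explicit level. A stripped-down sketch of just that argument:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--sdquant", type=int, choices=[0, 1, 2],
                    nargs="?", const=2, default=0)

assert parser.parse_args([]).sdquant == 0                  # flag absent: off
assert parser.parse_args(["--sdquant"]).sdquant == 2       # bare flag: q4_0
assert parser.parse_args(["--sdquant", "1"]).sdquant == 1  # explicit q8_0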
@@ -221,7 +221,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
     {
         printf("Conv2D Direct for VAE model is enabled\n");
     }
-    if(inputs.quant)
+    if(inputs.quant > 0)
     {
         printf("Note: Loading a pre-quantized model is always faster than using compress weights!\n");
     }
@@ -253,7 +253,11 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {

     sd_params = new SDParams();
     sd_params->model_path = inputs.model_filename;
-    sd_params->wtype = (inputs.quant==0?SD_TYPE_COUNT:SD_TYPE_Q4_0);
+    sd_params->wtype = SD_TYPE_COUNT;
+    if (inputs.quant > 0 && inputs.quant < SD_TYPE_COUNT) {
+        sd_params->wtype = (sd_type_t) inputs.quant;
+        printf("\nDiffusion Model quantized to %s", sd_type_name(sd_params->wtype));
+    }
     sd_params->n_threads = inputs.threads; //if -1 use physical cores
     sd_params->diffusion_flash_attn = inputs.flash_attention;
     sd_params->diffusion_conv_direct = inputs.diffusion_conv_direct;