draft model sets gpu split instead of id, made mmq default for cli

Concedo 2024-12-14 23:58:45 +08:00
parent 102281cfbb
commit 60cd68a39d
3 changed files with 50 additions and 22 deletions


@@ -43,7 +43,7 @@ struct load_model_inputs
     const char * draftmodel_filename = nullptr;
     const int draft_amount = 8;
     const int draft_gpulayers = 999;
-    const int draft_gpuid = -1;
+    const float draft_gpusplit[tensor_split_max] = {};
     const char * mmproj_filename = nullptr;
     const bool use_mmap = false;
     const bool use_mlock = false;


@@ -597,7 +597,7 @@ struct kcpp_embd_batch { //duplcated from llava_embd_batch
 };
 //loads a model for speculative decoding.
-static void speculative_decoding_setup(std::string spec_model_filename, const llama_model_params & base_model_params, const llama_context_params & base_ctx_params, int base_n_vocab, int draftgpuid, int draftgpulayers)
+static void speculative_decoding_setup(std::string spec_model_filename, const llama_model_params & base_model_params, const llama_context_params & base_ctx_params, int base_n_vocab, const float * draft_gpusplit, int draftgpulayers)
 {
     llama_model_params draft_model_params = llama_model_default_params();
     llama_context_params draft_ctx_params = llama_context_default_params();
@@ -608,8 +608,22 @@ static void speculative_decoding_setup(std::string spec_model_filename, const ll
     draft_ctx_params.n_ctx = base_ctx_params.n_ctx;
     draft_ctx_params.logits_all = false;
     draft_ctx_params.offload_kqv = base_ctx_params.offload_kqv;
-    draft_model_params.main_gpu = (draftgpuid>=0?draftgpuid:base_model_params.main_gpu);
+    draft_model_params.main_gpu = base_model_params.main_gpu;
     draft_model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
+    #if defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN)
+    bool ts_all_zero = true;
+    for (int i = 0; i < tensor_split_max; ++i) {
+        if (draft_gpusplit[i] != 0.0f) {
+            ts_all_zero = false;
+            break;
+        }
+    }
+    if(!ts_all_zero)
+    {
+        printf("\nApplying Draft GPU Split...\n");
+        draft_model_params.tensor_split = draft_gpusplit;
+    }
+    #endif
     draft_ctx_params.n_batch = base_ctx_params.n_batch;
     draft_ctx_params.n_ubatch = base_ctx_params.n_ubatch;
     draft_ctx_params.n_threads = base_ctx_params.n_threads;
@@ -2063,7 +2077,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     if(!ts_all_zero)
     {
-        printf("\nApplying Tensor Split...");
+        printf("\nApplying Tensor Split...\n");
         llama_ctx_params.tensor_split = inputs.tensor_split;
     }
     #endif
@@ -2174,7 +2188,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     if(!ts_all_zero)
     {
-        printf("\nApplying Tensor Split...");
+        printf("\nApplying Tensor Split...\n");
         model_params.tensor_split = inputs.tensor_split;
     }
     #endif
@@ -2294,7 +2308,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     {
         printf("\nAttempting to load draft model for speculative decoding. It will be fully offloaded if possible. Vocab must match the main model.\n");
         speculative_chunk_amt = inputs.draft_amount;
-        speculative_decoding_setup(draftmodel_filename, model_params, llama_ctx_params, n_vocab, inputs.draft_gpuid, inputs.draft_gpulayers);
+        speculative_decoding_setup(draftmodel_filename, model_params, llama_ctx_params, n_vocab, inputs.draft_gpusplit, inputs.draft_gpulayers);
     }
 }


@@ -144,7 +144,7 @@ class load_model_inputs(ctypes.Structure):
                 ("draftmodel_filename", ctypes.c_char_p),
                 ("draft_amount", ctypes.c_int),
                 ("draft_gpulayers", ctypes.c_int),
-                ("draft_gpuid", ctypes.c_int),
+                ("draft_gpusplit", ctypes.c_float * tensor_split_max),
                 ("mmproj_filename", ctypes.c_char_p),
                 ("use_mmap", ctypes.c_bool),
                 ("use_mlock", ctypes.c_bool),
@@ -840,7 +840,7 @@ def load_model(model_filename):
     inputs.max_context_length = maxctx #initial value to use for ctx, can be overwritten
     inputs.threads = args.threads
     inputs.low_vram = (True if (args.usecublas and "lowvram" in args.usecublas) else False)
-    inputs.use_mmq = (True if (args.usecublas and "mmq" in args.usecublas) else False)
+    inputs.use_mmq = (True if (args.usecublas and "nommq" not in args.usecublas) else False)
     inputs.use_rowsplit = (True if (args.usecublas and "rowsplit" in args.usecublas) else False)
     inputs.vulkan_info = "0".encode("UTF-8")
     inputs.blasthreads = args.blasthreads
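The change above flips the CLI default: with CuBLAS selected, MMQ kernels are now on unless the user explicitly passes nommq. A minimal standalone sketch of the new predicate (the helper name is illustrative, not part of koboldcpp.py):

    # Hypothetical helper mirroring the new use_mmq line: MMQ defaults to on for CuBLAS,
    # and only an explicit "nommq" option disables it.
    def mmq_enabled(usecublas):
        return (True if (usecublas and "nommq" not in usecublas) else False)

    print(mmq_enabled(["normal", "0"]))           # True  -> MMQ on by default
    print(mmq_enabled(["normal", "0", "nommq"]))  # False -> explicitly disabled
    print(mmq_enabled(["normal", "0", "mmq"]))    # True  -> "mmq" is still accepted
    print(mmq_enabled(None))                      # False -> CuBLAS not selected at all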
@@ -857,7 +857,11 @@ def load_model(model_filename):
     inputs.draftmodel_filename = args.draftmodel.encode("UTF-8") if args.draftmodel else "".encode("UTF-8")
     inputs.draft_amount = args.draftamount
     inputs.draft_gpulayers = args.draftgpulayers
-    inputs.draft_gpuid = args.draftgpuid
+    for n in range(tensor_split_max):
+        if args.draftgpusplit and n < len(args.draftgpusplit):
+            inputs.draft_gpusplit[n] = float(args.draftgpusplit[n])
+        else:
+            inputs.draft_gpusplit[n] = 0
     inputs.mmproj_filename = args.mmproj.encode("UTF-8") if args.mmproj else "".encode("UTF-8")
     inputs.use_smartcontext = args.smartcontext
     inputs.use_contextshift = (0 if args.noshift else 1)
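Because draft_gpusplit is now a fixed-size C float array rather than a single GPU id, the loop above copies however many ratios were supplied and zero-fills the remaining slots; an all-zero array later tells the C++ side to skip applying a draft split. A self-contained ctypes sketch of the same copy (tensor_split_max is assumed to be 16 here purely for illustration; the real constant is defined in koboldcpp.py):

    import ctypes

    tensor_split_max = 16  # assumed value for this sketch only

    class demo_inputs(ctypes.Structure):
        _fields_ = [("draft_gpusplit", ctypes.c_float * tensor_split_max)]

    inputs = demo_inputs()
    draftgpusplit = [3.0, 1.0]  # e.g. parsed from --draftgpusplit 3 1
    for n in range(tensor_split_max):
        if draftgpusplit and n < len(draftgpusplit):
            inputs.draft_gpusplit[n] = float(draftgpusplit[n])
        else:
            inputs.draft_gpusplit[n] = 0

    print(list(inputs.draft_gpusplit)[:4])  # [3.0, 1.0, 0.0, 0.0] -- unused slots stay zero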
@@ -2757,7 +2761,7 @@ def show_gui():
    draftmodel_var = ctk.StringVar()
    draftamount_var = ctk.StringVar(value=str(default_draft_amount))
    draftgpulayers_var = ctk.StringVar(value=str(999))
-   draftgpuid_var = ctk.StringVar(value=str(-1))
+   draftgpusplit_str_vars = ctk.StringVar(value="")
    nomodel = ctk.IntVar(value=0)
    port_var = ctk.StringVar(value=defaultport)
@@ -3245,7 +3249,7 @@ def show_gui():
    makefileentry(model_tab, "Vision mmproj:", "Select Vision mmproj File", mmproj_var, 7,width=280,singlerow=True,tooltiptxt="Select a mmproj file to use for vision models like LLaVA.\nLeave blank to skip.")
    makefileentry(model_tab, "Draft Model:", "Select Speculative Text Model File", draftmodel_var, 9,width=280,singlerow=True,tooltiptxt="Select a draft text model file to use for speculative decoding.\nLeave blank to skip.")
    makelabelentry(model_tab, "Draft Amount: ", draftamount_var, 11, 50,padx=100,singleline=True,tooltip="How many tokens to draft per chunk before verifying results")
-   makelabelentry(model_tab, "GPU ID: ", draftgpuid_var, 11, 50,padx=210,singleline=True,tooltip="Which GPU to use for draft model. Only works if multi-gpu (All) selected in main model.", labelpadx=160)
+   makelabelentry(model_tab, "Splits: ", draftgpusplit_str_vars, 11, 50,padx=210,singleline=True,tooltip="Distribution of draft model layers. Leave blank to follow main model's gpu split. Only works if multi-gpu (All) selected in main model.", labelpadx=160)
    makelabelentry(model_tab, "Layers: ", draftgpulayers_var, 11, 50,padx=320,singleline=True,tooltip="How many layers to GPU offload for the draft model", labelpadx=270)
    makefileentry(model_tab, "Preload Story:", "Select Preloaded Story File", preloadstory_var, 15,width=280,singlerow=True,tooltiptxt="Select an optional KoboldAI JSON savefile \nto be served on launch to any client.")
    makefileentry(model_tab, "ChatCompletions Adapter:", "Select ChatCompletions Adapter File", chatcompletionsadapter_var, 24, width=250, filetypes=[("JSON Adapter", "*.json")], tooltiptxt="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.")
@@ -3378,6 +3382,7 @@ def show_gui():
        savdict["usecublas"] = None
        savdict["usevulkan"] = None
        savdict["tensor_split"] = None
+       savdict["draftgpusplit"] = None
        savdict["config"] = None
        filename = asksaveasfile(filetypes=file_type, defaultextension=file_type)
        if filename is None:
@@ -3447,6 +3452,8 @@ def show_gui():
            args.usecublas = ["lowvram",str(gpuchoiceidx)] if lowvram_var.get() == 1 else ["normal",str(gpuchoiceidx)]
        if mmq_var.get()==1:
            args.usecublas.append("mmq")
+       else:
+           args.usecublas.append("nommq")
        if rowsplit_var.get()==1:
            args.usecublas.append("rowsplit")
        if runopts_var.get() == "Use Vulkan" or runopts_var.get() == "Use Vulkan (Old CPU)":
@@ -3473,6 +3480,13 @@ def show_gui():
                args.tensor_split = [float(x) for x in tssv.split(",")]
            else:
                args.tensor_split = [float(x) for x in tssv.split(" ")]
+       if draftgpusplit_str_vars.get()!="":
+           tssv = draftgpusplit_str_vars.get()
+           if "," in tssv:
+               args.draftgpusplit = [float(x) for x in tssv.split(",")]
+           else:
+               args.draftgpusplit = [float(x) for x in tssv.split(" ")]
        args.blasthreads = None if blas_threads_var.get()=="" else int(blas_threads_var.get())
        args.blasbatchsize = int(blasbatchsize_values[int(blas_size_var.get())])
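The new GUI field follows the same ratio syntax as the existing Tensor Split box: comma- or space-separated numbers, with a blank field meaning the flag is simply not set. A small sketch of that parsing rule (the function is illustrative, not taken from the codebase):

    def parse_split_field(text):
        """Return a list of float ratios, or None when the field is left blank."""
        text = text.strip()
        if text == "":
            return None
        if "," in text:
            return [float(x) for x in text.split(",")]
        return [float(x) for x in text.split(" ")]

    print(parse_split_field("60,40"))  # [60.0, 40.0]
    print(parse_split_field("3 1"))    # [3.0, 1.0]
    print(parse_split_field(""))       # None -> --draftgpusplit stays unset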
@@ -3504,7 +3518,6 @@ def show_gui():
        args.draftmodel = None if draftmodel_var.get() == "" else draftmodel_var.get()
        args.draftamount = int(draftamount_var.get()) if draftamount_var.get()!="" else default_draft_amount
        args.draftgpulayers = int(draftgpulayers_var.get()) if draftgpulayers_var.get()!="" else 999
-       args.draftgpuid = int(draftgpuid_var.get()) if draftgpuid_var.get()!="" else -1
        args.ssl = None if (ssl_cert_var.get() == "" or ssl_key_var.get() == "") else ([ssl_cert_var.get(), ssl_key_var.get()])
        args.password = None if (password_var.get() == "") else (password_var.get())
@@ -3635,6 +3648,9 @@ def show_gui():
        if "tensor_split" in dict and dict["tensor_split"]:
            tssep = ','.join(map(str, dict["tensor_split"]))
            tensor_split_str_vars.set(tssep)
+       if "draftgpusplit" in dict and dict["draftgpusplit"]:
+           tssep = ','.join(map(str, dict["draftgpusplit"]))
+           draftgpusplit_str_vars.set(tssep)
        if "blasthreads" in dict and dict["blasthreads"]:
            blas_threads_var.set(str(dict["blasthreads"]))
        else:
@@ -3672,8 +3688,6 @@ def show_gui():
            draftamount_var.set(dict["draftamount"])
        if "draftgpulayers" in dict:
            draftgpulayers_var.set(dict["draftgpulayers"])
-       if "draftgpuid" in dict:
-           draftgpuid_var.set(dict["draftgpuid"])
        ssl_cert_var.set("")
        ssl_key_var.set("")
@@ -4927,7 +4941,7 @@ if __name__ == '__main__':
    parser.add_argument("--threads", metavar=('[threads]'), help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=get_default_threads())
    compatgroup = parser.add_mutually_exclusive_group()
-   compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'all', 'mmq', 'rowsplit'])
+   compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq|nommq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'all', 'mmq', 'nommq', 'rowsplit'])
    compatgroup.add_argument("--usevulkan", help="Use Vulkan for GPU Acceleration. Can optionally specify one or more GPU Device ID (e.g. --usevulkan 0), leave blank to autodetect.", metavar=('[Device IDs]'), nargs='*', type=int, default=None)
    compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
    compatgroup.add_argument("--usecpu", help="Do not use any GPU acceleration (CPU Only)", action='store_true')
@@ -4957,18 +4971,18 @@ if __name__ == '__main__':
    advparser.add_argument("--remotetunnel", help="Uses Cloudflare to create a remote tunnel, allowing you to access koboldcpp remotely over the internet even behind a firewall.", action='store_true')
    advparser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')
    advparser.add_argument("--foreground", help="Windows only. Sends the terminal to the foreground every time a new prompt is generated. This helps avoid some idle slowdown issues.", action='store_true')
-   advparser.add_argument("--preloadstory", help="Configures a prepared story json save file to be hosted on the server, which frontends (such as KoboldAI Lite) can access over the API.", default="")
+   advparser.add_argument("--preloadstory", metavar=('[savefile]'), help="Configures a prepared story json save file to be hosted on the server, which frontends (such as KoboldAI Lite) can access over the API.", default="")
    advparser.add_argument("--quiet", help="Enable quiet mode, which hides generation inputs and outputs in the terminal. Quiet mode is automatically enabled when running a horde worker.", action='store_true')
    advparser.add_argument("--ssl", help="Allows all content to be served over SSL instead. A valid UNENCRYPTED SSL cert and key .pem files must be provided", metavar=('[cert_pem]', '[key_pem]'), nargs='+')
    advparser.add_argument("--nocertify", help="Allows insecure SSL connections. Use this if you have cert errors and need to bypass certificate restrictions.", action='store_true')
-   advparser.add_argument("--mmproj", help="Select a multimodal projector file for vision models like LLaVA.", default="")
+   advparser.add_argument("--mmproj", metavar=('[filename]'), help="Select a multimodal projector file for vision models like LLaVA.", default="")
-   advparser.add_argument("--draftmodel", help="Load a small draft model for speculative decoding. It will be fully offloaded. Vocab must match the main model.", default="")
+   advparser.add_argument("--draftmodel", metavar=('[filename]'), help="Load a small draft model for speculative decoding. It will be fully offloaded. Vocab must match the main model.", default="")
    advparser.add_argument("--draftamount", metavar=('[tokens]'), help="How many tokens to draft per chunk before verifying results", type=int, default=default_draft_amount)
    advparser.add_argument("--draftgpulayers", metavar=('[layers]'), help="How many layers to offload to GPU for the draft model (default=full offload)", type=int, default=999)
-   advparser.add_argument("--draftgpuid", metavar=('[gpu id]'), help="Which GPU to use for draft model (default=same as main). Only works if multi-GPUs selected for MAIN model!", type=int, default=-1)
+   advparser.add_argument("--draftgpusplit", help="GPU layer distribution ratio for draft model (default=same as main). Only works if multi-GPUs selected for MAIN model and tensor_split is set!", metavar=('[Ratios]'), type=float, nargs='+')
-   advparser.add_argument("--password", help="Enter a password required to use this instance. This key will be required for all text endpoints. Image endpoints are not secured.", default=None)
+   advparser.add_argument("--password", metavar=('[API key]'), help="Enter a password required to use this instance. This key will be required for all text endpoints. Image endpoints are not secured.", default=None)
    advparser.add_argument("--ignoremissing", help="Ignores all missing non-essential files, just skipping them instead.", action='store_true')
-   advparser.add_argument("--chatcompletionsadapter", help="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.", default="")
+   advparser.add_argument("--chatcompletionsadapter", metavar=('[filename]'), help="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.", default="")
    advparser.add_argument("--flashattention", help="Enables flash attention.", action='store_true')
    advparser.add_argument("--quantkv", help="Sets the KV cache data type quantization, 0=f16, 1=q8, 2=q4. Requires Flash Attention, and disables context shifting.",metavar=('[quantization level 0/1/2]'), type=int, choices=[0,1,2], default=0)
    advparser.add_argument("--forceversion", help="If the model file format detection fails (e.g. rogue modified model) you can set this to override the detected format (enter desired version, e.g. 401 for GPTNeoX-Type2).",metavar=('[version]'), type=int, default=0)
@@ -4990,7 +5004,7 @@ if __name__ == '__main__':
    sdparsergroup = parser.add_argument_group('Image Generation Commands')
    sdparsergroup.add_argument("--sdmodel", metavar=('[filename]'), help="Specify a stable diffusion safetensors or gguf model to enable image generation.", default="")
    sdparsergroup.add_argument("--sdthreads", metavar=('[threads]'), help="Use a different number of threads for image generation if specified. Otherwise, has the same value as --threads.", type=int, default=0)
-   sdparsergroup.add_argument("--sdclamped", help="If specified, limit generation steps and resolution settings for shared use. Accepts an extra optional parameter that indicates maximum resolution (eg. 768 clamps to 768x768, min 512px, disabled if 0).", nargs='?', const=512, type=int, default=0)
+   sdparsergroup.add_argument("--sdclamped", metavar=('[maxres]'), help="If specified, limit generation steps and resolution settings for shared use. Accepts an extra optional parameter that indicates maximum resolution (eg. 768 clamps to 768x768, min 512px, disabled if 0).", nargs='?', const=512, type=int, default=0)
    sdparsergroup.add_argument("--sdt5xxl", metavar=('[filename]'), help="Specify a T5-XXL safetensors model for use in SD3 or Flux. Leave blank if prebaked or unused.", default="")
    sdparsergroup.add_argument("--sdclipl", metavar=('[filename]'), help="Specify a Clip-L safetensors model for use in SD3 or Flux. Leave blank if prebaked or unused.", default="")
    sdparsergroup.add_argument("--sdclipg", metavar=('[filename]'), help="Specify a Clip-G safetensors model for use in SD3. Leave blank if prebaked or unused.", default="")