From 60cd68a39df9a4f00c2b83fe90004c35e6373f3e Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sat, 14 Dec 2024 23:58:45 +0800
Subject: [PATCH] draft model sets gpu split instead of id, made mmq default for cli

---
 expose.h            |  2 +-
 gpttype_adapter.cpp | 24 +++++++++++++++++++-----
 koboldcpp.py        | 46 ++++++++++++++++++++++++++++++----------------
 3 files changed, 50 insertions(+), 22 deletions(-)

diff --git a/expose.h b/expose.h
index 85826db1a..37ee8535e 100644
--- a/expose.h
+++ b/expose.h
@@ -43,7 +43,7 @@ struct load_model_inputs
     const char * draftmodel_filename = nullptr;
     const int draft_amount = 8;
     const int draft_gpulayers = 999;
-    const int draft_gpuid = -1;
+    const float draft_gpusplit[tensor_split_max] = {};
     const char * mmproj_filename = nullptr;
     const bool use_mmap = false;
     const bool use_mlock = false;
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index f7d2f5ecd..e6bfd339b 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -597,7 +597,7 @@ struct kcpp_embd_batch { //duplcated from llava_embd_batch
 };
 
 //loads a model for speculative decoding.
-static void speculative_decoding_setup(std::string spec_model_filename, const llama_model_params & base_model_params, const llama_context_params & base_ctx_params, int base_n_vocab, int draftgpuid, int draftgpulayers)
+static void speculative_decoding_setup(std::string spec_model_filename, const llama_model_params & base_model_params, const llama_context_params & base_ctx_params, int base_n_vocab, const float * draft_gpusplit, int draftgpulayers)
 {
     llama_model_params draft_model_params = llama_model_default_params();
     llama_context_params draft_ctx_params = llama_context_default_params();
@@ -608,8 +608,22 @@ static void speculative_decoding_setup(std::string spec_model_filename, const ll
     draft_ctx_params.n_ctx = base_ctx_params.n_ctx;
     draft_ctx_params.logits_all = false;
     draft_ctx_params.offload_kqv = base_ctx_params.offload_kqv;
-    draft_model_params.main_gpu = (draftgpuid>=0?draftgpuid:base_model_params.main_gpu);
+    draft_model_params.main_gpu = base_model_params.main_gpu;
     draft_model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER;
+    #if defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN)
+    bool ts_all_zero = true;
+    for (int i = 0; i < tensor_split_max; ++i) {
+        if (draft_gpusplit[i] != 0.0f) {
+            ts_all_zero = false;
+            break;
+        }
+    }
+    if(!ts_all_zero)
+    {
+        printf("\nApplying Draft GPU Split...\n");
+        draft_model_params.tensor_split = draft_gpusplit;
+    }
+    #endif
     draft_ctx_params.n_batch = base_ctx_params.n_batch;
     draft_ctx_params.n_ubatch = base_ctx_params.n_ubatch;
     draft_ctx_params.n_threads = base_ctx_params.n_threads;
@@ -2063,7 +2077,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         }
         if(!ts_all_zero)
         {
-            printf("\nApplying Tensor Split...");
+            printf("\nApplying Tensor Split...\n");
             llama_ctx_params.tensor_split = inputs.tensor_split;
         }
         #endif
@@ -2174,7 +2188,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         }
         if(!ts_all_zero)
         {
-            printf("\nApplying Tensor Split...");
+            printf("\nApplying Tensor Split...\n");
             model_params.tensor_split = inputs.tensor_split;
         }
         #endif
@@ -2294,7 +2308,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         {
             printf("\nAttempting to load draft model for speculative decoding. It will be fully offloaded if possible. Vocab must match the main model.\n");
             speculative_chunk_amt = inputs.draft_amount;
-            speculative_decoding_setup(draftmodel_filename, model_params, llama_ctx_params, n_vocab, inputs.draft_gpuid, inputs.draft_gpulayers);
+            speculative_decoding_setup(draftmodel_filename, model_params, llama_ctx_params, n_vocab, inputs.draft_gpusplit, inputs.draft_gpulayers);
         }
     }
 
diff --git a/koboldcpp.py b/koboldcpp.py
index d9a6fcaaf..bcb6a67e0 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -144,7 +144,7 @@ class load_model_inputs(ctypes.Structure):
                 ("draftmodel_filename", ctypes.c_char_p),
                 ("draft_amount", ctypes.c_int),
                 ("draft_gpulayers", ctypes.c_int),
-                ("draft_gpuid", ctypes.c_int),
+                ("draft_gpusplit", ctypes.c_float * tensor_split_max),
                 ("mmproj_filename", ctypes.c_char_p),
                 ("use_mmap", ctypes.c_bool),
                 ("use_mlock", ctypes.c_bool),
@@ -840,7 +840,7 @@ def load_model(model_filename):
     inputs.max_context_length = maxctx #initial value to use for ctx, can be overwritten
     inputs.threads = args.threads
     inputs.low_vram = (True if (args.usecublas and "lowvram" in args.usecublas) else False)
-    inputs.use_mmq = (True if (args.usecublas and "mmq" in args.usecublas) else False)
+    inputs.use_mmq = (True if (args.usecublas and "nommq" not in args.usecublas) else False)
     inputs.use_rowsplit = (True if (args.usecublas and "rowsplit" in args.usecublas) else False)
     inputs.vulkan_info = "0".encode("UTF-8")
     inputs.blasthreads = args.blasthreads
@@ -857,7 +857,11 @@ def load_model(model_filename):
     inputs.draftmodel_filename = args.draftmodel.encode("UTF-8") if args.draftmodel else "".encode("UTF-8")
     inputs.draft_amount = args.draftamount
     inputs.draft_gpulayers = args.draftgpulayers
-    inputs.draft_gpuid = args.draftgpuid
+    for n in range(tensor_split_max):
+        if args.draftgpusplit and n < len(args.draftgpusplit):
+            inputs.draft_gpusplit[n] = float(args.draftgpusplit[n])
+        else:
+            inputs.draft_gpusplit[n] = 0
     inputs.mmproj_filename = args.mmproj.encode("UTF-8") if args.mmproj else "".encode("UTF-8")
     inputs.use_smartcontext = args.smartcontext
     inputs.use_contextshift = (0 if args.noshift else 1)
@@ -2757,7 +2761,7 @@ def show_gui():
     draftmodel_var = ctk.StringVar()
     draftamount_var = ctk.StringVar(value=str(default_draft_amount))
     draftgpulayers_var = ctk.StringVar(value=str(999))
-    draftgpuid_var = ctk.StringVar(value=str(-1))
+    draftgpusplit_str_vars = ctk.StringVar(value="")
     nomodel = ctk.IntVar(value=0)
 
     port_var = ctk.StringVar(value=defaultport)
@@ -3245,7 +3249,7 @@ def show_gui():
     makefileentry(model_tab, "Vision mmproj:", "Select Vision mmproj File", mmproj_var, 7,width=280,singlerow=True,tooltiptxt="Select a mmproj file to use for vision models like LLaVA.\nLeave blank to skip.")
     makefileentry(model_tab, "Draft Model:", "Select Speculative Text Model File", draftmodel_var, 9,width=280,singlerow=True,tooltiptxt="Select a draft text model file to use for speculative decoding.\nLeave blank to skip.")
     makelabelentry(model_tab, "Draft Amount: ", draftamount_var, 11, 50,padx=100,singleline=True,tooltip="How many tokens to draft per chunk before verifying results")
-    makelabelentry(model_tab, "GPU ID: ", draftgpuid_var, 11, 50,padx=210,singleline=True,tooltip="Which GPU to use for draft model. Only works if multi-gpu (All) selected in main model.", labelpadx=160)
+    makelabelentry(model_tab, "Splits: ", draftgpusplit_str_vars, 11, 50,padx=210,singleline=True,tooltip="Distribution of draft model layers. Leave blank to follow main model's gpu split. Only works if multi-gpu (All) selected in main model.", labelpadx=160)
     makelabelentry(model_tab, "Layers: ", draftgpulayers_var, 11, 50,padx=320,singleline=True,tooltip="How many layers to GPU offload for the draft model", labelpadx=270)
     makefileentry(model_tab, "Preload Story:", "Select Preloaded Story File", preloadstory_var, 15,width=280,singlerow=True,tooltiptxt="Select an optional KoboldAI JSON savefile \nto be served on launch to any client.")
     makefileentry(model_tab, "ChatCompletions Adapter:", "Select ChatCompletions Adapter File", chatcompletionsadapter_var, 24, width=250, filetypes=[("JSON Adapter", "*.json")], tooltiptxt="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.")
@@ -3378,6 +3382,7 @@ def show_gui():
         savdict["usecublas"] = None
         savdict["usevulkan"] = None
         savdict["tensor_split"] = None
+        savdict["draftgpusplit"] = None
         savdict["config"] = None
         filename = asksaveasfile(filetypes=file_type, defaultextension=file_type)
         if filename is None:
@@ -3447,6 +3452,8 @@ def show_gui():
             args.usecublas = ["lowvram",str(gpuchoiceidx)] if lowvram_var.get() == 1 else ["normal",str(gpuchoiceidx)]
             if mmq_var.get()==1:
                 args.usecublas.append("mmq")
+            else:
+                args.usecublas.append("nommq")
             if rowsplit_var.get()==1:
                 args.usecublas.append("rowsplit")
         if runopts_var.get() == "Use Vulkan" or runopts_var.get() == "Use Vulkan (Old CPU)":
@@ -3473,6 +3480,13 @@ def show_gui():
                 args.tensor_split = [float(x) for x in tssv.split(",")]
             else:
                 args.tensor_split = [float(x) for x in tssv.split(" ")]
+        if draftgpusplit_str_vars.get()!="":
+            tssv = draftgpusplit_str_vars.get()
+            if "," in tssv:
+                args.draftgpusplit = [float(x) for x in tssv.split(",")]
+            else:
+                args.draftgpusplit = [float(x) for x in tssv.split(" ")]
+
         args.blasthreads = None if blas_threads_var.get()=="" else int(blas_threads_var.get())
 
         args.blasbatchsize = int(blasbatchsize_values[int(blas_size_var.get())])
@@ -3504,7 +3518,6 @@ def show_gui():
         args.draftmodel = None if draftmodel_var.get() == "" else draftmodel_var.get()
         args.draftamount = int(draftamount_var.get()) if draftamount_var.get()!="" else default_draft_amount
         args.draftgpulayers = int(draftgpulayers_var.get()) if draftgpulayers_var.get()!="" else 999
-        args.draftgpuid = int(draftgpuid_var.get()) if draftgpuid_var.get()!="" else -1
 
         args.ssl = None if (ssl_cert_var.get() == "" or ssl_key_var.get() == "") else ([ssl_cert_var.get(), ssl_key_var.get()])
         args.password = None if (password_var.get() == "") else (password_var.get())
@@ -3635,6 +3648,9 @@ def show_gui():
         if "tensor_split" in dict and dict["tensor_split"]:
             tssep = ','.join(map(str, dict["tensor_split"]))
             tensor_split_str_vars.set(tssep)
+        if "draftgpusplit" in dict and dict["draftgpusplit"]:
+            tssep = ','.join(map(str, dict["draftgpusplit"]))
+            draftgpusplit_str_vars.set(tssep)
         if "blasthreads" in dict and dict["blasthreads"]:
             blas_threads_var.set(str(dict["blasthreads"]))
         else:
@@ -3672,8 +3688,6 @@ def show_gui():
             draftamount_var.set(dict["draftamount"])
         if "draftgpulayers" in dict:
             draftgpulayers_var.set(dict["draftgpulayers"])
-        if "draftgpuid" in dict:
-            draftgpuid_var.set(dict["draftgpuid"])
 
         ssl_cert_var.set("")
         ssl_key_var.set("")
@@ -4927,7 +4941,7 @@ if __name__ == '__main__':
     parser.add_argument("--threads", metavar=('[threads]'), help="Use a custom number of threads if specified. Otherwise, uses an amount based on CPU cores", type=int, default=get_default_threads())
 
     compatgroup = parser.add_mutually_exclusive_group()
-    compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'all', 'mmq', 'rowsplit'])
+    compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq|nommq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'all', 'mmq', 'nommq', 'rowsplit'])
     compatgroup.add_argument("--usevulkan", help="Use Vulkan for GPU Acceleration. Can optionally specify one or more GPU Device ID (e.g. --usevulkan 0), leave blank to autodetect.", metavar=('[Device IDs]'), nargs='*', type=int, default=None)
     compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
     compatgroup.add_argument("--usecpu", help="Do not use any GPU acceleration (CPU Only)", action='store_true')
@@ -4957,18 +4971,18 @@ if __name__ == '__main__':
     advparser.add_argument("--remotetunnel", help="Uses Cloudflare to create a remote tunnel, allowing you to access koboldcpp remotely over the internet even behind a firewall.", action='store_true')
     advparser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')
     advparser.add_argument("--foreground", help="Windows only. Sends the terminal to the foreground every time a new prompt is generated. This helps avoid some idle slowdown issues.", action='store_true')
-    advparser.add_argument("--preloadstory", help="Configures a prepared story json save file to be hosted on the server, which frontends (such as KoboldAI Lite) can access over the API.", default="")
+    advparser.add_argument("--preloadstory", metavar=('[savefile]'), help="Configures a prepared story json save file to be hosted on the server, which frontends (such as KoboldAI Lite) can access over the API.", default="")
     advparser.add_argument("--quiet", help="Enable quiet mode, which hides generation inputs and outputs in the terminal. Quiet mode is automatically enabled when running a horde worker.", action='store_true')
     advparser.add_argument("--ssl", help="Allows all content to be served over SSL instead. A valid UNENCRYPTED SSL cert and key .pem files must be provided", metavar=('[cert_pem]', '[key_pem]'), nargs='+')
     advparser.add_argument("--nocertify", help="Allows insecure SSL connections. Use this if you have cert errors and need to bypass certificate restrictions.", action='store_true')
-    advparser.add_argument("--mmproj", help="Select a multimodal projector file for vision models like LLaVA.", default="")
-    advparser.add_argument("--draftmodel", help="Load a small draft model for speculative decoding. It will be fully offloaded. Vocab must match the main model.", default="")
+    advparser.add_argument("--mmproj", metavar=('[filename]'), help="Select a multimodal projector file for vision models like LLaVA.", default="")
+    advparser.add_argument("--draftmodel", metavar=('[filename]'), help="Load a small draft model for speculative decoding. It will be fully offloaded. Vocab must match the main model.", default="")
     advparser.add_argument("--draftamount", metavar=('[tokens]'), help="How many tokens to draft per chunk before verifying results", type=int, default=default_draft_amount)
     advparser.add_argument("--draftgpulayers", metavar=('[layers]'), help="How many layers to offload to GPU for the draft model (default=full offload)", type=int, default=999)
-    advparser.add_argument("--draftgpuid", metavar=('[gpu id]'), help="Which GPU to use for draft model (default=same as main). Only works if multi-GPUs selected for MAIN model!", type=int, default=-1)
-    advparser.add_argument("--password", help="Enter a password required to use this instance. This key will be required for all text endpoints. Image endpoints are not secured.", default=None)
+    advparser.add_argument("--draftgpusplit", help="GPU layer distribution ratio for draft model (default=same as main). Only works if multi-GPUs selected for MAIN model and tensor_split is set!", metavar=('[Ratios]'), type=float, nargs='+')
+    advparser.add_argument("--password", metavar=('[API key]'), help="Enter a password required to use this instance. This key will be required for all text endpoints. Image endpoints are not secured.", default=None)
     advparser.add_argument("--ignoremissing", help="Ignores all missing non-essential files, just skipping them instead.", action='store_true')
-    advparser.add_argument("--chatcompletionsadapter", help="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.", default="")
+    advparser.add_argument("--chatcompletionsadapter", metavar=('[filename]'), help="Select an optional ChatCompletions Adapter JSON file to force custom instruct tags.", default="")
     advparser.add_argument("--flashattention", help="Enables flash attention.", action='store_true')
     advparser.add_argument("--quantkv", help="Sets the KV cache data type quantization, 0=f16, 1=q8, 2=q4. Requires Flash Attention, and disables context shifting.",metavar=('[quantization level 0/1/2]'), type=int, choices=[0,1,2], default=0)
     advparser.add_argument("--forceversion", help="If the model file format detection fails (e.g. rogue modified model) you can set this to override the detected format (enter desired version, e.g. 401 for GPTNeoX-Type2).",metavar=('[version]'), type=int, default=0)
@@ -4990,7 +5004,7 @@ if __name__ == '__main__':
     sdparsergroup = parser.add_argument_group('Image Generation Commands')
     sdparsergroup.add_argument("--sdmodel", metavar=('[filename]'), help="Specify a stable diffusion safetensors or gguf model to enable image generation.", default="")
     sdparsergroup.add_argument("--sdthreads", metavar=('[threads]'), help="Use a different number of threads for image generation if specified. Otherwise, has the same value as --threads.", type=int, default=0)
-    sdparsergroup.add_argument("--sdclamped", help="If specified, limit generation steps and resolution settings for shared use. Accepts an extra optional parameter that indicates maximum resolution (eg. 768 clamps to 768x768, min 512px, disabled if 0).", nargs='?', const=512, type=int, default=0)
+    sdparsergroup.add_argument("--sdclamped", metavar=('[maxres]'), help="If specified, limit generation steps and resolution settings for shared use. Accepts an extra optional parameter that indicates maximum resolution (eg. 768 clamps to 768x768, min 512px, disabled if 0).", nargs='?', const=512, type=int, default=0)
     sdparsergroup.add_argument("--sdt5xxl", metavar=('[filename]'), help="Specify a T5-XXL safetensors model for use in SD3 or Flux. Leave blank if prebaked or unused.", default="")
     sdparsergroup.add_argument("--sdclipl", metavar=('[filename]'), help="Specify a Clip-L safetensors model for use in SD3 or Flux. Leave blank if prebaked or unused.", default="")
     sdparsergroup.add_argument("--sdclipg", metavar=('[filename]'), help="Specify a Clip-G safetensors model for use in SD3. Leave blank if prebaked or unused.", default="")
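
For reference, a minimal standalone Python sketch (not part of the patch) of the intended draft_gpusplit behaviour on the koboldcpp.py side, assuming tensor_split_max is 16 to match the constant referenced in expose.h: ratios passed via --draftgpusplit fill the fixed-size ctypes array, missing entries are zero-padded, and an all-zero array tells the C++ side to fall back to the main model's split.

import ctypes

tensor_split_max = 16  # assumed value of the shared constant

class demo_inputs(ctypes.Structure):
    # only the newly added field is mirrored here for illustration
    _fields_ = [("draft_gpusplit", ctypes.c_float * tensor_split_max)]

def fill_draft_gpusplit(inputs, draftgpusplit):
    # same zero-padding as the loop added to load_model(): entries beyond the
    # given ratios stay 0, and an all-zero array means "follow the main split"
    for n in range(tensor_split_max):
        if draftgpusplit and n < len(draftgpusplit):
            inputs.draft_gpusplit[n] = float(draftgpusplit[n])
        else:
            inputs.draft_gpusplit[n] = 0
    return inputs

demo = fill_draft_gpusplit(demo_inputs(), [6, 4])  # e.g. --draftgpusplit 6 4
print(list(demo.draft_gpusplit)[:4])               # [6.0, 4.0, 0.0, 0.0]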