From fdfb713d91a7814d62beaaec0ce9013cbb505af7 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sat, 21 Mar 2026 17:34:12 +0800
Subject: [PATCH] added `--sdmaingpu`, allowing image models to be
 independently placed on any GPU

---
 expose.h                             | 12 ++++++------
 ggml/src/ggml-vulkan/ggml-vulkan.cpp |  2 +-
 koboldcpp.py                         | 21 ++++++++++++++++++---
 otherarch/sdcpp/sdtype_adapter.cpp   | 13 +++++++++++++
 otherarch/sdcpp/stable-diffusion.cpp | 24 +++++++++++++++++++++++-
 5 files changed, 61 insertions(+), 11 deletions(-)

diff --git a/expose.h b/expose.h
index 70e68543c..266ca09a4 100644
--- a/expose.h
+++ b/expose.h
@@ -54,7 +54,7 @@ struct load_model_inputs
     const bool use_smartcontext = false;
     const bool use_contextshift = false;
     const bool use_fastforward = false;
-    const int kcpp_main_gpu = 0;
+    const int kcpp_main_gpu = -1;
     const char * vulkan_info = nullptr;
     const int batchsize = 512;
     const bool autofit = false;
@@ -172,7 +172,7 @@ struct sd_load_model_inputs
 {
     const char * model_filename = nullptr;
     const char * executable_path = nullptr;
-    const int kcpp_main_gpu = 0;
+    const int kcpp_main_gpu = -1;
     const char * vulkan_info = nullptr;
     const int threads = 0;
     const int quant = 0;
@@ -255,7 +255,7 @@ struct whisper_load_model_inputs
 {
     const char * model_filename = nullptr;
     const char * executable_path = nullptr;
-    const int kcpp_main_gpu = 0;
+    const int kcpp_main_gpu = -1;
     const char * vulkan_info = nullptr;
     const char * devices_override = nullptr;
     const bool quiet = false;
@@ -280,7 +280,7 @@ struct tts_load_model_inputs
     const char * ttc_model_filename = nullptr;
     const char * cts_model_filename = nullptr;
     const char * executable_path = nullptr;
-    const int kcpp_main_gpu = 0;
+    const int kcpp_main_gpu = -1;
     const char * vulkan_info = nullptr;
     const int gpulayers = 0;
     const bool flash_attention = false;
@@ -310,7 +310,7 @@ struct embeddings_load_model_inputs
     const int threads = 4;
     const char * model_filename = nullptr;
     const char * executable_path = nullptr;
-    const int kcpp_main_gpu = 0;
+    const int kcpp_main_gpu = -1;
     const char * vulkan_info = nullptr;
     const int gpulayers = 0;
     const bool flash_attention = false;
@@ -340,7 +340,7 @@ struct music_load_model_inputs
     const char * musicvae_filename = nullptr;
     const bool lowvram = false;
     const char * executable_path = nullptr;
-    const int kcpp_main_gpu = 0;
+    const int kcpp_main_gpu = -1;
     const char * vulkan_info = nullptr;
     const char * devices_override = nullptr;
     const bool quiet = false;
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 74abcc02d..bdd0cec1b 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -4814,7 +4814,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
         vk::PhysicalDeviceProperties tmpprops = device->physical_device.getProperties();
         if(tmpprops.vendorID == VK_VENDOR_ID_NVIDIA)
         {
-            printf("Apply NVIDIA Anti-BSOD Fix for KCPP\n");
+            printf("\nApply NVIDIA Anti-BSOD Fix for KCPP\n");
             device->disable_host_visible_vidmem = true; //kcpp requested fix for vulkan BSOD on Nvidia
         }
diff --git a/koboldcpp.py b/koboldcpp.py
index ce4a306d2..f5a4ba2af 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -909,7 +909,7 @@ def init_library():
 def set_backend_props(inputs):
     # we must force an explicit tensor split
     # otherwise the default will divide equally and multigpu crap will slow it down badly
-    inputs.kcpp_main_gpu = 0
+    inputs.kcpp_main_gpu = -1
     if(args.maingpu is not None and args.maingpu>=0):
         inputs.kcpp_main_gpu = args.maingpu
@@ -2135,6 +2135,7 @@ def sd_load_model(model_filename,vae_filename,t5xxl_filename,clip1_filename,clip
     inputs.img_hard_limit = args.sdclamped
     inputs.img_soft_limit = args.sdclampedsoft
     inputs = set_backend_props(inputs)
+    inputs.kcpp_main_gpu = args.sdmaingpu
     ret = handle.sd_load_model(inputs)
     return ret
@@ -6361,6 +6362,8 @@ def show_gui():
     sd_clamped_soft_var = ctk.StringVar(value="0")
     sd_threads_var = ctk.StringVar(value=str(default_threads))
     sd_quant_var = ctk.StringVar(value=sd_quant_choices[0])
+    sd_main_gpu_var = ctk.StringVar(value="-1")
+
     gen_defaults_var = ctk.StringVar()
     gen_defaults_overwrite_var = ctk.IntVar(value=0)
@@ -6953,7 +6956,7 @@ def show_gui():
     layercounter_label.grid(row=6, column=0, padx=230, sticky="W")
     layercounter_label.configure(text_color="#ffff00")
     tensor_split_entry,tensor_split_label = makelabelentry(hardware_tab, "Tensor Split:", tensor_split_str_vars, 8, 80, padx=160, singleline=True, tooltip='When using multiple GPUs this option controls how large tensors should be split across all GPUs.\nUses a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order.\nFor example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1.')
-    maingpu_entry,maingpu_label = makelabelentry(hardware_tab, "Main GPU:" , maingpu_var, 8, 50,padx=340,singleline=True,tooltip="Only for multi-gpu, which GPU to set as main?\nIf left blank or -1, uses default value.",labelpadx=270)
+    maingpu_entry,maingpu_label = makelabelentry(hardware_tab, "Main GPU:" , maingpu_var, 8, 50,padx=340,singleline=True,tooltip="Only for multi-gpu, which GPU ID to set as main?\nIf left blank or -1, uses default value.",labelpadx=270)

     # threads
     makelabelentry(hardware_tab, "Threads:" , threads_var, 11, 50, padx=160, singleline=True,tooltip="How many threads to use.\nRecommended value is your CPU core count, defaults are usually OK.")
@@ -7140,7 +7143,9 @@ def show_gui():
     makefileentry(images_tab, "Image Gen. Model (safetensors/gguf):", "Select Image Gen Model File", sd_model_var, 1, width=280, singlecol=True, filetypes=[("*.safetensors *.gguf","*.safetensors *.gguf")], tooltiptxt="Select a .safetensors or .gguf Image Generation model file on disk to be loaded.")
     makelabelentry(images_tab, "Clamp Resolution Limit (Hard):", sd_clamped_var, 4, 50, padx=(190),singleline=True,tooltip="Limit generation steps and output image size for shared use.\nSet to 0 to disable, otherwise value is clamped to the max size limit (min 512px).")
     makelabelentry(images_tab, "(Soft):", sd_clamped_soft_var, 4, 50, padx=(290),singleline=True,tooltip="Square image size restriction, to protect the server against memory crashes.\nAllows width-height tradeoffs, eg. 640 allows 640x640 and 512x768\nLeave at 0 for the default value: 832 for SD1.5/SD2, 1024 otherwise.",labelpadx=(250))
-    makelabelentry(images_tab, "ImgThreads:" , sd_threads_var, 8, 50,padx=(290),singleline=True,tooltip="How many threads to use during image generation.\nIf left blank, uses same value as threads.",labelpadx=(210))
+    makelabelentry(images_tab, "ImgThreads:" , sd_threads_var, 8, 40,padx=(280),singleline=True,tooltip="How many threads to use during image generation.\nIf left blank, uses same value as threads.",labelpadx=(200))
+    makelabelentry(images_tab, "ImgGPU:" , sd_main_gpu_var, 8, 40,padx=394,singleline=True,tooltip="Which GPU ID to use for Image Gen?\nIf left blank or -1, uses default value.",labelpadx=340)
+
     sd_model_var.trace_add("write", gui_changed_modelfile)
     makelabelcombobox(images_tab, "Compress Weights: ", sd_quant_var, 8, width=(60), padx=(126), labelpadx=8, tooltiptxt="Quantizes the SD model weights to save memory.\nHigher levels save more memory, and cause more quality degradation.", values=sd_quant_choices)
     sd_quant_var.trace_add("write", changed_gpulayers_estimate)
@@ -7493,6 +7498,7 @@ def show_gui():
         args.sdlora = [item.strip() for item in sd_lora_var.get().split("|") if item] # XXX the user may have used '|' since it's used for the LoRAs
         args.sdloramult = sanitize_lora_multipliers(re.split(r"[ |]+", sd_loramult_var.get()))
+        args.sdmaingpu = (-1 if sd_main_gpu_var.get()=="" else int(sd_main_gpu_var.get()))

         if gen_defaults_var.get() != "":
             args.gendefaults = gen_defaults_var.get()
@@ -7748,6 +7754,11 @@ def show_gui():
         sd_tiled_vae_var.set(str(dict["sdtiledvae"]) if ("sdtiledvae" in dict and dict["sdtiledvae"]) else str(default_vae_tile_threshold))
         sd_lora_var.set("|".join(sanitize_lora_list(dict.get('sdlora'))))
         sd_loramult_var.set(" ".join(f"{n:.3f}".rstrip('0').rstrip('.') for n in dict.get("sdloramult", [])))
+        if "sdmaingpu" in dict:
+            sd_main_gpu_var.set(str(dict["sdmaingpu"]))
+        else:
+            sd_main_gpu_var.set("-1")
+
         gendefaults = (dict["gendefaults"] if ("gendefaults" in dict and dict["gendefaults"]) else "")
         if isinstance(gendefaults, type({})):
             gendefaults = json.dumps(gendefaults)
@@ -8370,6 +8381,8 @@ def convert_args_to_template(savdict):
     savdict["hordekey"] = ""
     savdict["hordeworkername"] = ""
     savdict["sdthreads"] = 0
+    savdict["maingpu"] = -1
+    savdict["sdmaingpu"] = -1
     savdict["password"] = None
     savdict["adminpassword"] = None
     savdict["usemmap"] = False
@@ -10093,6 +10106,8 @@ if __name__ == '__main__':
     sdparsergrouplora.add_argument("--sdlora", metavar=('[filename]'), help="Specify image generation LoRAs safetensors models to be applied. Multiple LoRAs are accepted.", nargs='+')
     sdparsergroup.add_argument("--sdloramult", metavar=('[amounts]'), help="Multipliers for the image LoRA model to be applied.", type=float, nargs='+', default=[1.0])
     sdparsergroup.add_argument("--sdtiledvae", metavar=('[maxres]'), help="Adjust the automatic VAE tiling trigger for images above this size. 0 disables vae tiling.", type=int, default=default_vae_tile_threshold)
+    sdparsergroup.add_argument("--sdmaingpu", metavar=('[Device ID]'), help="If specified, Image Generation weights will be placed on the selected GPU index.", type=int, default=-1)
+
     whisperparsergroup = parser.add_argument_group('Whisper Transcription Commands')
     whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper .bin model to enable Speech-To-Text transcription.", default="")
diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp
index 94ef675d5..94d3ef1a8 100644
--- a/otherarch/sdcpp/sdtype_adapter.cpp
+++ b/otherarch/sdcpp/sdtype_adapter.cpp
@@ -159,6 +159,7 @@ static std::vector input_extraimage_buffers;
 const int max_extra_images = 4;

 static std::string sdvulkandeviceenv;
+static std::string sdmaingpuenv;
 static int cfg_tiled_vae_threshold = 0;
 static int cfg_square_limit = 0;
 static int cfg_side_limit = 0;
@@ -284,6 +285,18 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
     cfg_square_limit = inputs.img_soft_limit;

     printf("\nImageGen Init - Load Model: %s\n",inputs.model_filename);
+
+    {
+        //kcpp allow gpu id override
+        std::string sdmaingpu = std::to_string(inputs.kcpp_main_gpu);
+        const char* existingenv = getenv("SD_VK_DEVICE");
+        int kcpp_parseinfo_maindevice = inputs.kcpp_main_gpu<=0?0:inputs.kcpp_main_gpu;
+        if(kcpp_parseinfo_maindevice>0 && !existingenv && sdmaingpu!="")
+        {
+            sdmaingpuenv = "SD_VK_DEVICE="+sdmaingpu;
+            putenv((char*)sdmaingpuenv.c_str());
+        }
+    }
+
     int lora_apply_mode = LORA_APPLY_AT_RUNTIME;
     bool lora_dynamic = false;
     bool lora_cache = false;
diff --git a/otherarch/sdcpp/stable-diffusion.cpp b/otherarch/sdcpp/stable-diffusion.cpp
index 2d15d1eac..66cbadfd5 100644
--- a/otherarch/sdcpp/stable-diffusion.cpp
+++ b/otherarch/sdcpp/stable-diffusion.cpp
@@ -199,7 +199,29 @@ public:
     void init_backend() {
 #ifdef SD_USE_CUDA
         LOG_DEBUG("Using CUDA backend");
-        backend = ggml_backend_cuda_init(0);
+        size_t device = 0; //kcpp: ported device selection from vulkan
+        const int device_count = ggml_backend_cuda_get_device_count();
+        if (device_count) {
+            const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE");
+            if (SD_VK_DEVICE != nullptr) {
+                std::string sd_vk_device_str = SD_VK_DEVICE;
+                try {
+                    device = std::stoull(sd_vk_device_str);
+                } catch (const std::invalid_argument&) {
+                    LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to device 0.", SD_VK_DEVICE);
+                    device = 0;
+                } catch (const std::out_of_range&) {
+                    LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to device 0.", SD_VK_DEVICE);
+                    device = 0;
+                }
+                if (device >= (size_t)device_count) {
+                    LOG_WARN("Cannot find targeted CUDA device (%llu). Falling back to device 0.", device);
+                    device = 0;
+                }
+            }
+            LOG_INFO("CUDA: Using device %llu", device);
+        }
+        backend = ggml_backend_cuda_init(device);
 #endif
 #ifdef SD_USE_METAL
         LOG_DEBUG("Using Metal backend");
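
Usage note (a sketch, not part of the patch itself): `--sdmaingpu` reaches the sdcpp backend through the `SD_VK_DEVICE` environment variable, which the CUDA path above now honors in addition to Vulkan. The Python below condenses the override logic added in sdtype_adapter.cpp; the helper name is hypothetical and does not appear in the diff:

    import os

    def apply_sd_gpu_override(sd_main_gpu: int) -> None:
        # Hypothetical helper mirroring the C++ block in sdtype_load_model:
        # export SD_VK_DEVICE only when a specific device (> 0) was requested
        # and no external override is already present. -1 (unset) and 0 (the
        # default device) leave the environment untouched.
        if sd_main_gpu > 0 and os.environ.get("SD_VK_DEVICE") is None:
            os.environ["SD_VK_DEVICE"] = str(sd_main_gpu)

    # Example: text model on GPU 0, image model on GPU 1, as produced by a
    # command line such as `--maingpu 0 --sdmaingpu 1`.
    apply_sd_gpu_override(1)

Note that a request for device 0 is treated the same as the default, so the environment variable is only exported for device indices above zero, and an externally set SD_VK_DEVICE always wins.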