From 1cbe716e4503ef97c0ddc19a3f88a2c5cf19d4a3 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Thu, 12 Jun 2025 17:53:43 +0800
Subject: [PATCH] allow setting maingpu

---
 expose.h            | 10 ++++-----
 gpttype_adapter.cpp | 16 +++++++-------
 koboldcpp.py        | 51 ++++++++++++++++++++++++++++++++-------------
 3 files changed, 49 insertions(+), 28 deletions(-)

diff --git a/expose.h b/expose.h
index 9d03b30c4..57fb9df6c 100644
--- a/expose.h
+++ b/expose.h
@@ -52,7 +52,7 @@ struct load_model_inputs
     const bool use_contextshift = false;
     const bool use_fastforward = false;
     const int clblast_info = 0;
-    const int cublas_info = 0;
+    const int kcpp_main_gpu = 0;
     const char * vulkan_info = nullptr;
     const int blasbatchsize = 512;
     const int forceversion = 0;
@@ -157,7 +157,7 @@ struct sd_load_model_inputs
     const char * model_filename = nullptr;
     const char * executable_path = nullptr;
     const int clblast_info = 0;
-    const int cublas_info = 0;
+    const int kcpp_main_gpu = 0;
     const char * vulkan_info = nullptr;
     const int threads = 0;
     const int quant = 0;
@@ -199,7 +199,7 @@ struct whisper_load_model_inputs
     const char * model_filename = nullptr;
     const char * executable_path = nullptr;
     const int clblast_info = 0;
-    const int cublas_info = 0;
+    const int kcpp_main_gpu = 0;
     const char * vulkan_info = nullptr;
     const bool quiet = false;
     const int debugmode = 0;
@@ -224,7 +224,7 @@ struct tts_load_model_inputs
     const char * cts_model_filename = nullptr;
     const char * executable_path = nullptr;
     const int clblast_info = 0;
-    const int cublas_info = 0;
+    const int kcpp_main_gpu = 0;
     const char * vulkan_info = nullptr;
     const int gpulayers = 0;
     const bool flash_attention = false;
@@ -252,7 +252,7 @@ struct embeddings_load_model_inputs
     const char * model_filename = nullptr;
     const char * executable_path = nullptr;
     const int clblast_info = 0;
-    const int cublas_info = 0;
+    const int kcpp_main_gpu = 0;
     const char * vulkan_info = nullptr;
     const int gpulayers = 0;
     const bool flash_attention = false;
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index ccab246c2..1b9c9b5ca 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -2020,16 +2020,16 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     //this is used for the mem_per_token eval, blas needs more RAM
     bool v3_use_scratch = ggml_v3_cpu_has_gpublas();
 
-    int cu_parseinfo_maindevice = inputs.cublas_info<=0?0:inputs.cublas_info;
+    int kcpp_parseinfo_maindevice = inputs.kcpp_main_gpu<=0?0:inputs.kcpp_main_gpu;
 
     printf("System Info: %s\n", kcpp_print_system_info());
     #if defined(GGML_USE_CUDA)
     if(file_format!=FileFormat::GGUF_GENERIC)
     {
-        if(ggml_v3_cpu_has_gpublas() && cu_parseinfo_maindevice>0)
+        if(ggml_v3_cpu_has_gpublas() && kcpp_parseinfo_maindevice>0)
         {
-            printf("CUBLAS v3: Set main device to %d\n",cu_parseinfo_maindevice);
-            ggml_v3_cuda_set_main_device(cu_parseinfo_maindevice);
+            printf("CUBLAS v3: Set main device to %d\n",kcpp_parseinfo_maindevice);
+            ggml_v3_cuda_set_main_device(kcpp_parseinfo_maindevice);
         }
     }
 
@@ -2092,7 +2092,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         llama_ctx_params.use_mmap = inputs.use_mmap;
         llama_ctx_params.use_mlock = inputs.use_mlock;
         llama_ctx_params.n_gpu_layers = inputs.gpulayers;
-        llama_ctx_params.main_gpu = cu_parseinfo_maindevice;
+        llama_ctx_params.main_gpu = kcpp_parseinfo_maindevice;
         llama_ctx_params.rope_freq_base = rope_freq_base;
         llama_ctx_params.rope_freq_scale = rope_freq_scale;
         llama_ctx_params.n_batch = kcpp_data->n_batch;
@@ -2178,9 +2178,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     #endif
     #if defined(GGML_USE_CUDA)
-    if(cu_parseinfo_maindevice>0)
+    if(kcpp_parseinfo_maindevice>0)
     {
-        printf("CUDA: Set main device to %d\n",cu_parseinfo_maindevice);
+        printf("CUDA: Set main device to %d\n",kcpp_parseinfo_maindevice);
     }
     printf("CUDA MMQ: %s\n",(inputs.use_mmq?"True":"False"));
     ggml_cuda_set_mul_mat_q(inputs.use_mmq);
@@ -2194,7 +2194,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
             printf("Qwen2VL detected! Mrope will be used, and context shift will be disabled!\n");
             kcpp_data->use_contextshift = false;
         }
-        model_params.main_gpu = cu_parseinfo_maindevice;
+        model_params.main_gpu = kcpp_parseinfo_maindevice;
         #if defined(GGML_USE_CUDA)
         model_params.split_mode = (inputs.use_rowsplit?llama_split_mode::LLAMA_SPLIT_MODE_ROW:llama_split_mode::LLAMA_SPLIT_MODE_LAYER);
 
diff --git a/koboldcpp.py b/koboldcpp.py
index 665b4a130..e928deb48 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -178,7 +178,7 @@ class load_model_inputs(ctypes.Structure):
                 ("use_contextshift", ctypes.c_bool),
                 ("use_fastforward", ctypes.c_bool),
                 ("clblast_info", ctypes.c_int),
-                ("cublas_info", ctypes.c_int),
+                ("kcpp_main_gpu", ctypes.c_int),
                 ("vulkan_info", ctypes.c_char_p),
                 ("blasbatchsize", ctypes.c_int),
                 ("forceversion", ctypes.c_int),
@@ -262,7 +262,7 @@ class sd_load_model_inputs(ctypes.Structure):
    _fields_ = [("model_filename", ctypes.c_char_p),
                 ("executable_path", ctypes.c_char_p),
                 ("clblast_info", ctypes.c_int),
-                ("cublas_info", ctypes.c_int),
+                ("kcpp_main_gpu", ctypes.c_int),
                 ("vulkan_info", ctypes.c_char_p),
                 ("threads", ctypes.c_int),
                 ("quant", ctypes.c_int),
@@ -300,7 +300,7 @@ class whisper_load_model_inputs(ctypes.Structure):
    _fields_ = [("model_filename", ctypes.c_char_p),
                 ("executable_path", ctypes.c_char_p),
                 ("clblast_info", ctypes.c_int),
-                ("cublas_info", ctypes.c_int),
+                ("kcpp_main_gpu", ctypes.c_int),
                 ("vulkan_info", ctypes.c_char_p),
                 ("quiet", ctypes.c_bool),
                 ("debugmode", ctypes.c_int)]
@@ -321,7 +321,7 @@ class tts_load_model_inputs(ctypes.Structure):
                 ("cts_model_filename", ctypes.c_char_p),
                 ("executable_path", ctypes.c_char_p),
                 ("clblast_info", ctypes.c_int),
-                ("cublas_info", ctypes.c_int),
+                ("kcpp_main_gpu", ctypes.c_int),
                 ("vulkan_info", ctypes.c_char_p),
                 ("gpulayers", ctypes.c_int),
                 ("flash_attention", ctypes.c_bool),
@@ -345,7 +345,7 @@ class embeddings_load_model_inputs(ctypes.Structure):
                 ("model_filename", ctypes.c_char_p),
                 ("executable_path", ctypes.c_char_p),
                 ("clblast_info", ctypes.c_int),
-                ("cublas_info", ctypes.c_int),
+                ("kcpp_main_gpu", ctypes.c_int),
                 ("vulkan_info", ctypes.c_char_p),
                 ("gpulayers", ctypes.c_int),
                 ("flash_attention", ctypes.c_bool),
@@ -566,7 +566,9 @@ def set_backend_props(inputs):
 
     # we must force an explicit tensor split
     # otherwise the default will divide equally and multigpu crap will slow it down badly
-    inputs.cublas_info = 0
+    inputs.kcpp_main_gpu = 0
+    if(args.maingpu is not None and args.maingpu>=0):
+        inputs.kcpp_main_gpu = args.maingpu
 
     if args.usecublas:
         os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
@@ -574,24 +576,29 @@ def set_backend_props(inputs):
         if (args.usecublas and "0" in args.usecublas):
             os.environ["CUDA_VISIBLE_DEVICES"] = "0"
             os.environ["HIP_VISIBLE_DEVICES"] = "0"
+            inputs.kcpp_main_gpu = 0
         elif (args.usecublas and "1" in args.usecublas):
             os.environ["CUDA_VISIBLE_DEVICES"] = "1"
             os.environ["HIP_VISIBLE_DEVICES"] = "1"
+            inputs.kcpp_main_gpu = 0
         elif (args.usecublas and "2" in args.usecublas):
             os.environ["CUDA_VISIBLE_DEVICES"] = "2"
             os.environ["HIP_VISIBLE_DEVICES"] = "2"
+            inputs.kcpp_main_gpu = 0
         elif (args.usecublas and "3" in args.usecublas):
             os.environ["CUDA_VISIBLE_DEVICES"] = "3"
             os.environ["HIP_VISIBLE_DEVICES"] = "3"
+            inputs.kcpp_main_gpu = 0
     else:
-        if (args.usecublas and "0" in args.usecublas):
-            inputs.cublas_info = 0
-        elif (args.usecublas and "1" in args.usecublas):
-            inputs.cublas_info = 1
-        elif (args.usecublas and "2" in args.usecublas):
-            inputs.cublas_info = 2
-        elif (args.usecublas and "3" in args.usecublas):
-            inputs.cublas_info = 3
+        if(args.maingpu is None or args.maingpu<0):
+            if (args.usecublas and "0" in args.usecublas):
+                inputs.kcpp_main_gpu = 0
+            elif (args.usecublas and "1" in args.usecublas):
+                inputs.kcpp_main_gpu = 1
+            elif (args.usecublas and "2" in args.usecublas):
+                inputs.kcpp_main_gpu = 2
+            elif (args.usecublas and "3" in args.usecublas):
+                inputs.kcpp_main_gpu = 3
 
     if args.usevulkan: #is an empty array if using vulkan without defined gpu
         s = ""
@@ -4201,6 +4208,7 @@ def show_gui():
     version_var = ctk.StringVar(value="0")
     tensor_split_str_vars = ctk.StringVar(value="")
     rowsplit_var = ctk.IntVar()
+    maingpu_var = ctk.StringVar(value="")
 
     contextshift_var = ctk.IntVar(value=1)
     fastforward_var = ctk.IntVar(value=1)
@@ -4676,6 +4684,8 @@ def show_gui():
             quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
             CUDA_gpu_selector_box.grid_remove()
             CUDA_quick_gpu_selector_box.grid_remove()
+            maingpu_label.grid_remove()
+            maingpu_entry.grid_remove()
             if gpu_choice_var.get()=="All":
                 gpu_choice_var.set("1")
         elif index == "Use Vulkan" or index == "Use Vulkan (Old CPU)" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
@@ -4683,6 +4693,8 @@ def show_gui():
             quick_gpu_selector_box.grid_remove()
             CUDA_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
             CUDA_quick_gpu_selector_box.grid(row=3, column=1, padx=8, pady=1, stick="nw")
+            maingpu_label.grid(row=10, column=0, padx = 8, pady=1, stick="nw")
+            maingpu_entry.grid(row=10, column=1, padx = 8, pady=1, stick="nw")
         else:
             quick_gpuname_label.grid_remove()
             gpuname_label.grid_remove()
@@ -4692,6 +4704,8 @@ def show_gui():
             quick_gpu_selector_label.grid_remove()
             quick_gpu_selector_box.grid_remove()
             CUDA_quick_gpu_selector_box.grid_remove()
+            maingpu_label.grid_remove()
+            maingpu_entry.grid_remove()
 
         if index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
             lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
@@ -4803,6 +4817,8 @@ def show_gui():
     mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1, tooltiptxt="Enable MMQ mode to use finetuned kernels instead of default CuBLAS/HipBLAS for prompt processing.\nRead the wiki. Speed may vary.")
     splitmode_box = makecheckbox(hardware_tab, "Row-Split", rowsplit_var, 5,0, tooltiptxt="Split rows across GPUs instead of splitting layers and KV across GPUs.\nUses the main GPU for small tensors and intermediate results. Speed may vary.")
+    maingpu_entry,maingpu_label = makelabelentry(hardware_tab, "Main GPU:" , maingpu_var, 10, 50,tooltip="Only for multi-gpu, which GPU to set as main?\nIf left blank, uses default value.")
+
     # threads
     makelabelentry(hardware_tab, "Threads:" , threads_var, 11, 50,tooltip="How many threads to use.\nRecommended value is your CPU core count, defaults are usually OK.")
 
@@ -5140,7 +5156,7 @@ def show_gui():
             else:
                 args.draftgpusplit = [float(x) for x in tssv.split(" ")]
 
-
+        args.maingpu = -1 if maingpu_var.get()=="" else int(maingpu_var.get())
         args.blasthreads = None if blas_threads_var.get()=="" else int(blas_threads_var.get())
         args.blasbatchsize = int(blasbatchsize_values[int(blas_size_var.get())])
         args.forceversion = 0 if version_var.get()=="" else int(version_var.get())
@@ -5330,6 +5346,10 @@ def show_gui():
             gpulayers_var.set(dict["gpulayers"])
         else:
             gpulayers_var.set("0")
+        if "maingpu" in dict:
+            maingpu_var.set(dict["maingpu"])
+        else:
+            maingpu_var.set("")
         if "tensor_split" in dict and dict["tensor_split"]:
             tssep = ','.join(map(str, dict["tensor_split"]))
             tensor_split_str_vars.set(tssep)
@@ -7076,6 +7096,7 @@ if __name__ == '__main__':
     advparser = parser.add_argument_group('Advanced Commands')
     advparser.add_argument("--version", help="Prints version and exits.", action='store_true')
     advparser.add_argument("--analyze", metavar=('[filename]'), help="Reads the metadata, weight types and tensor names in any GGUF file.", default="")
+    advparser.add_argument("--maingpu", help="Only used in a multi-gpu setup. Sets the index of the main GPU that will be used.",metavar=('[Device ID]'), type=int, default=-1)
    advparser.add_argument("--ropeconfig", help="If set, uses customized RoPE scaling from configured frequency scale and frequency base (e.g. --ropeconfig 0.25 10000). Otherwise, uses NTK-Aware scaling set automatically based on context size. For linear rope, simply set the freq-scale and ignore the freq-base",metavar=('[rope-freq-scale]', '[rope-freq-base]'), default=[0.0, 10000.0], type=float, nargs='+')
     advparser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,16,32,64,128,256,512,1024,2048], default=512)
     advparser.add_argument("--blasthreads", help="Use a different number of threads during BLAS if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0)
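
Usage sketch (illustrative; the model path and split values are placeholders, and only --maingpu is introduced by this patch):

    python koboldcpp.py --usecublas --gpulayers 99 --tensor_split 1 1 --maingpu 1 --model /path/to/model.gguf

Leaving --maingpu at its default of -1 (or leaving the GUI "Main GPU" field blank) keeps the previous behaviour: the main device falls back to the GPU selected via --usecublas 0/1/2/3, and pinning a single visible device that way still resolves to main GPU index 0, since only one device remains visible.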