From 25fab4113ebdf8fcf8b697be2984f8af8db884a9 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sat, 2 May 2026 12:10:29 -0300 Subject: [PATCH] refactor: handle GGML_VK_VISIBLE_DEVICES at the Python level (#2179) All C++ handling code currently: - build a comma-separated list from the vulkan_info array - if GGML_VK_VISIBLE_DEVICES isn't set - set GGML_VK_VISIBLE_DEVICES to the list Once set, GGML_VK_VISIBLE_DEVICES affects the whole process. So this can be done in the same way at the Python level, before all loading functions. Caveat: load_model had the default `inputs.vulkan_info = "0"`, so the default GPU would be "0" only when loading a text model. --- expose.cpp | 17 ----------------- expose.h | 6 ------ koboldcpp.py | 17 +++-------------- otherarch/acestep/music_adapter.cpp | 16 ---------------- otherarch/embeddings_adapter.cpp | 16 ---------------- otherarch/sdcpp/sdtype_adapter.cpp | 17 ----------------- otherarch/tts_adapter.cpp | 16 ---------------- otherarch/whispercpp/whisper_adapter.cpp | 18 ------------------ 8 files changed, 3 insertions(+), 120 deletions(-) diff --git a/expose.cpp b/expose.cpp index 2f855b368..c58ad8fa8 100644 --- a/expose.cpp +++ b/expose.cpp @@ -23,8 +23,6 @@ extern "C" { - std::string vulkandeviceenv; - //return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt) static FileFormat file_format = FileFormat::BADFORMAT; static FileFormatExtraMeta file_format_meta; @@ -38,21 +36,6 @@ extern "C" file_format = check_file_format(model.c_str(),&file_format_meta); - std::string vulkan_info_raw = inputs.vulkan_info; - std::string vulkan_info_str = ""; - for (size_t i = 0; i < vulkan_info_raw.length(); ++i) { - vulkan_info_str += vulkan_info_raw[i]; - if (i < vulkan_info_raw.length() - 1) { - vulkan_info_str += ","; - } - } - const char* existingenv = getenv("GGML_VK_VISIBLE_DEVICES"); - if(!existingenv && vulkan_info_str!="") - { - vulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str; - putenv((char*)vulkandeviceenv.c_str()); 
- } - executable_path = inputs.executable_path; if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4 || file_format==FileFormat::GPTJ_5) diff --git a/expose.h b/expose.h index 8babf7344..f44c327ee 100644 --- a/expose.h +++ b/expose.h @@ -55,7 +55,6 @@ struct load_model_inputs const bool use_contextshift = false; const bool use_fastforward = false; const int kcpp_main_gpu = -1; - const char * vulkan_info = nullptr; const int batchsize = 512; const bool autofit = false; const int autofit_tax_mb = 0; @@ -178,7 +177,6 @@ struct sd_load_model_inputs const char * model_filename = nullptr; const char * executable_path = nullptr; const int kcpp_main_gpu = -1; - const char * vulkan_info = nullptr; const int threads = 0; const int quant = 0; const bool flash_attention = false; @@ -263,7 +261,6 @@ struct whisper_load_model_inputs const char * model_filename = nullptr; const char * executable_path = nullptr; const int kcpp_main_gpu = -1; - const char * vulkan_info = nullptr; const char * devices_override = nullptr; const bool quiet = false; const int debugmode = 0; @@ -288,7 +285,6 @@ struct tts_load_model_inputs const char * cts_model_filename = nullptr; const char * executable_path = nullptr; const int kcpp_main_gpu = -1; - const char * vulkan_info = nullptr; const int gpulayers = 0; const bool flash_attention = false; const int ttsmaxlen = 4096; @@ -319,7 +315,6 @@ struct embeddings_load_model_inputs const char * model_filename = nullptr; const char * executable_path = nullptr; const int kcpp_main_gpu = -1; - const char * vulkan_info = nullptr; const int gpulayers = 0; const bool flash_attention = false; const bool use_mmap = false; @@ -349,7 +344,6 @@ struct music_load_model_inputs const bool lowvram = false; const char * executable_path = nullptr; const int kcpp_main_gpu = -1; - const char * vulkan_info = nullptr; const char * devices_override = nullptr; const bool quiet = false; const 
int debugmode = 0; diff --git a/koboldcpp.py b/koboldcpp.py index 342e98be1..17c8dc5e5 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -256,7 +256,6 @@ class load_model_inputs(ctypes.Structure): ("use_contextshift", ctypes.c_bool), ("use_fastforward", ctypes.c_bool), ("kcpp_main_gpu", ctypes.c_int), - ("vulkan_info", ctypes.c_char_p), ("batchsize", ctypes.c_int), ("autofit", ctypes.c_bool), ("autofit_tax_mb", ctypes.c_int), @@ -356,7 +355,6 @@ class sd_load_model_inputs(ctypes.Structure): _fields_ = [("model_filename", ctypes.c_char_p), ("executable_path", ctypes.c_char_p), ("kcpp_main_gpu", ctypes.c_int), - ("vulkan_info", ctypes.c_char_p), ("threads", ctypes.c_int), ("quant", ctypes.c_int), ("flash_attention", ctypes.c_bool), @@ -435,7 +433,6 @@ class whisper_load_model_inputs(ctypes.Structure): _fields_ = [("model_filename", ctypes.c_char_p), ("executable_path", ctypes.c_char_p), ("kcpp_main_gpu", ctypes.c_int), - ("vulkan_info", ctypes.c_char_p), ("devices_override", ctypes.c_char_p), ("quiet", ctypes.c_bool), ("debugmode", ctypes.c_int)] @@ -456,7 +453,6 @@ class tts_load_model_inputs(ctypes.Structure): ("cts_model_filename", ctypes.c_char_p), ("executable_path", ctypes.c_char_p), ("kcpp_main_gpu", ctypes.c_int), - ("vulkan_info", ctypes.c_char_p), ("gpulayers", ctypes.c_int), ("flash_attention", ctypes.c_bool), ("ttsmaxlen", ctypes.c_int), @@ -483,7 +479,6 @@ class embeddings_load_model_inputs(ctypes.Structure): ("model_filename", ctypes.c_char_p), ("executable_path", ctypes.c_char_p), ("kcpp_main_gpu", ctypes.c_int), - ("vulkan_info", ctypes.c_char_p), ("gpulayers", ctypes.c_int), ("flash_attention", ctypes.c_bool), ("use_mmap", ctypes.c_bool), @@ -509,7 +504,6 @@ class music_load_model_inputs(ctypes.Structure): ("lowvram", ctypes.c_bool), ("executable_path", ctypes.c_char_p), ("kcpp_main_gpu", ctypes.c_int), - ("vulkan_info", ctypes.c_char_p), ("devices_override", ctypes.c_char_p), ("quiet", ctypes.c_bool), ("debugmode", ctypes.c_int)] @@ -993,13 +987,9 @@ 
def set_backend_props(inputs): elif (args.usecuda and "3" in args.usecuda): inputs.kcpp_main_gpu = 3 - if args.usevulkan: #is an empty array if using vulkan without defined gpu - s = "" - for it in range(0,len(args.usevulkan)): - s += str(args.usevulkan[it]) - inputs.vulkan_info = s.encode("UTF-8") - else: - inputs.vulkan_info = "".encode("UTF-8") + if "GGML_VK_VISIBLE_DEVICES" not in os.environ: + if args.usevulkan: # is an empty array if using vulkan without defined gpu + os.environ["GGML_VK_VISIBLE_DEVICES"] = ','.join([str(g) for g in args.usevulkan]) # set universal flags inputs.devices_override = (args.device if args.device else "").encode("UTF-8") @@ -1890,7 +1880,6 @@ def load_model(model_filename): inputs.low_vram = True if args.lowvram else False inputs.use_mmq = False if args.nommq else True inputs.splitmode = splitmode_choices_to_int(args.splitmode) #layer=1, row=2, tensor=3 - inputs.vulkan_info = "0".encode("UTF-8") inputs.blasthreads = args.blasthreads inputs.use_mmap = args.usemmap inputs.use_mlock = args.usemlock diff --git a/otherarch/acestep/music_adapter.cpp b/otherarch/acestep/music_adapter.cpp index 1547d3677..9b6f97a09 100644 --- a/otherarch/acestep/music_adapter.cpp +++ b/otherarch/acestep/music_adapter.cpp @@ -33,22 +33,6 @@ bool musictype_load_model(const music_load_model_inputs inputs) { music_is_quiet = inputs.quiet; - //duplicated from expose.cpp - std::string vulkan_info_raw = inputs.vulkan_info; - std::string vulkan_info_str = ""; - for (size_t i = 0; i < vulkan_info_raw.length(); ++i) { - vulkan_info_str += vulkan_info_raw[i]; - if (i < vulkan_info_raw.length() - 1) { - vulkan_info_str += ","; - } - } - const char* existingenv = getenv("GGML_VK_VISIBLE_DEVICES"); - if(!existingenv && vulkan_info_str!="") - { - musicvulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str; - putenv((char*)musicvulkandeviceenv.c_str()); - } - std::string musicllm_filename = inputs.musicllm_filename; std::string musicembedding_filename = 
inputs.musicembedding_filename; std::string musicdiffusion_filename = inputs.musicdiffusion_filename; diff --git a/otherarch/embeddings_adapter.cpp b/otherarch/embeddings_adapter.cpp index f04703d0a..0613256e2 100644 --- a/otherarch/embeddings_adapter.cpp +++ b/otherarch/embeddings_adapter.cpp @@ -22,7 +22,6 @@ #endif static llama_context * embeddings_ctx = nullptr; //text to codes ctx -static std::string ttsvulkandeviceenv; bool embeddings_debug = false; static int max_batchsize = 512; static std::string last_output = ""; @@ -82,27 +81,12 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu bool embeddingstype_load_model(const embeddings_load_model_inputs inputs) { - //duplicated from expose.cpp - std::string vulkan_info_raw = inputs.vulkan_info; - std::string vulkan_info_str = ""; - for (size_t i = 0; i < vulkan_info_raw.length(); ++i) { - vulkan_info_str += vulkan_info_raw[i]; - if (i < vulkan_info_raw.length() - 1) { - vulkan_info_str += ","; - } - } - const char* existingenv = getenv("GGML_VK_VISIBLE_DEVICES"); std::vector devices_override; std::string dev_override_str = inputs.devices_override; if(dev_override_str!="") { devices_override = kcpp_parse_device_list(dev_override_str); } - if(!existingenv && vulkan_info_str!="") - { - ttsvulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str; - putenv((char*)ttsvulkandeviceenv.c_str()); - } llama_backend_init(); diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp index e583cd6bd..41aaa82f2 100644 --- a/otherarch/sdcpp/sdtype_adapter.cpp +++ b/otherarch/sdcpp/sdtype_adapter.cpp @@ -187,7 +187,6 @@ static uint8_t * upscale_src_buffer = NULL; static std::vector input_extraimage_buffers; const int max_extra_images = 4; -static std::string sdvulkandeviceenv; static std::string sdmaingpuenv; static int cfg_tiled_vae_threshold = 0; static int cfg_square_limit = 0; @@ -430,22 +429,6 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { 
printf("Note: Loading a pre-quantized model is always faster than using compress weights!\n"); } - //duplicated from expose.cpp - std::string vulkan_info_raw = inputs.vulkan_info; - std::string vulkan_info_str = ""; - for (size_t i = 0; i < vulkan_info_raw.length(); ++i) { - vulkan_info_str += vulkan_info_raw[i]; - if (i < vulkan_info_raw.length() - 1) { - vulkan_info_str += ","; - } - } - const char* existingenv = getenv("GGML_VK_VISIBLE_DEVICES"); - if(!existingenv && vulkan_info_str!="") - { - sdvulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str; - putenv((char*)sdvulkandeviceenv.c_str()); - } - sd_params = new SDParams(); sd_params->model_path = inputs.model_filename; sd_params->wtype = SD_TYPE_COUNT; diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp index 21d5ef163..408c4da81 100644 --- a/otherarch/tts_adapter.cpp +++ b/otherarch/tts_adapter.cpp @@ -478,7 +478,6 @@ static llama_context * cts_ctx = nullptr; //codes to speech static TTS_VER ttsver = TTS_VER_2; static int ttsdebugmode = 0; static bool tts_is_quiet = false; -static std::string ttsvulkandeviceenv; static std::string last_generated_audio = ""; static std::string last_generation_settings_prompt = ""; //for caching purposes to fix ST bug static int last_generation_settings_speaker_seed; @@ -511,27 +510,12 @@ bool ttstype_load_model(const tts_load_model_inputs inputs) tts_is_quiet = inputs.quiet; tts_executable_path = inputs.executable_path; - //duplicated from expose.cpp - std::string vulkan_info_raw = inputs.vulkan_info; - std::string vulkan_info_str = ""; - for (size_t i = 0; i < vulkan_info_raw.length(); ++i) { - vulkan_info_str += vulkan_info_raw[i]; - if (i < vulkan_info_raw.length() - 1) { - vulkan_info_str += ","; - } - } - const char* existingenv = getenv("GGML_VK_VISIBLE_DEVICES"); std::vector devices_override; std::string dev_override_str = inputs.devices_override; if(dev_override_str!="") { devices_override = kcpp_parse_device_list(dev_override_str); } - 
if(!existingenv && vulkan_info_str!="") - { - ttsvulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str; - putenv((char*)ttsvulkandeviceenv.c_str()); - } llama_backend_init(); diff --git a/otherarch/whispercpp/whisper_adapter.cpp b/otherarch/whispercpp/whisper_adapter.cpp index ee26a8e5b..0d54a3f2f 100644 --- a/otherarch/whispercpp/whisper_adapter.cpp +++ b/otherarch/whispercpp/whisper_adapter.cpp @@ -59,28 +59,10 @@ static std::string output_txt(struct whisper_context * ctx) { void cb_log_disable(enum ggml_log_level , const char * , void * ) { } -static std::string whispervulkandeviceenv; bool whispertype_load_model(const whisper_load_model_inputs inputs) { whisper_is_quiet = inputs.quiet; - //duplicated from expose.cpp - std::string vulkan_info_raw = inputs.vulkan_info; - std::string vulkan_info_str = ""; - for (size_t i = 0; i < vulkan_info_raw.length(); ++i) { - vulkan_info_str += vulkan_info_raw[i]; - if (i < vulkan_info_raw.length() - 1) { - vulkan_info_str += ","; - } - } - const char* existingenv = getenv("GGML_VK_VISIBLE_DEVICES"); - if(!existingenv && vulkan_info_str!="") - { - whispervulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str; - putenv((char*)whispervulkandeviceenv.c_str()); - } - - std::string modelfile = inputs.model_filename; printf("\nLoading Whisper Model: %s",modelfile.c_str());