From 25fab4113ebdf8fcf8b697be2984f8af8db884a9 Mon Sep 17 00:00:00 2001 From: Wagner Bruna Date: Sat, 2 May 2026 12:10:29 -0300 Subject: [PATCH] refactor: handle GGML_VK_VISIBLE_DEVICES at the Python level (#2179) All C++ handling code currently: - build a comma-separated list from the vulkan_info array - if GGML_VK_VISIBLE_DEVICES isn't set - set GGML_VK_VISIBLE_DEVICES to the list Once set, GGML_VK_VISIBLE_DEVICES affects the whole process. So this can be done in the same way at the Python level, before all loading functions. Caveat: load_model had the default `inputs.vulkan_info = "0"`, so the default GPU would be "0" only when loading a text model. --- expose.cpp | 17 ----------------- expose.h | 6 ------ koboldcpp.py | 17 +++-------------- otherarch/acestep/music_adapter.cpp | 16 ---------------- otherarch/embeddings_adapter.cpp | 16 ---------------- otherarch/sdcpp/sdtype_adapter.cpp | 17 ----------------- otherarch/tts_adapter.cpp | 16 ---------------- otherarch/whispercpp/whisper_adapter.cpp | 18 ------------------ 8 files changed, 3 insertions(+), 120 deletions(-) diff --git a/expose.cpp b/expose.cpp index 2f855b368..c58ad8fa8 100644 --- a/expose.cpp +++ b/expose.cpp @@ -23,8 +23,6 @@ extern "C" { - std::string vulkandeviceenv; - //return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt) static FileFormat file_format = FileFormat::BADFORMAT; static FileFormatExtraMeta file_format_meta; @@ -38,21 +36,6 @@ extern "C" file_format = check_file_format(model.c_str(),&file_format_meta); - std::string vulkan_info_raw = inputs.vulkan_info; - std::string vulkan_info_str = ""; - for (size_t i = 0; i < vulkan_info_raw.length(); ++i) { - vulkan_info_str += vulkan_info_raw[i]; - if (i < vulkan_info_raw.length() - 1) { - vulkan_info_str += ","; - } - } - const char* existingenv = getenv("GGML_VK_VISIBLE_DEVICES"); - if(!existingenv && vulkan_info_str!="") - { - vulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str; - putenv((char*)vulkandeviceenv.c_str()); 
- } - executable_path = inputs.executable_path; if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4 || file_format==FileFormat::GPTJ_5) diff --git a/expose.h b/expose.h index 8babf7344..f44c327ee 100644 --- a/expose.h +++ b/expose.h @@ -55,7 +55,6 @@ struct load_model_inputs const bool use_contextshift = false; const bool use_fastforward = false; const int kcpp_main_gpu = -1; - const char * vulkan_info = nullptr; const int batchsize = 512; const bool autofit = false; const int autofit_tax_mb = 0; @@ -178,7 +177,6 @@ struct sd_load_model_inputs const char * model_filename = nullptr; const char * executable_path = nullptr; const int kcpp_main_gpu = -1; - const char * vulkan_info = nullptr; const int threads = 0; const int quant = 0; const bool flash_attention = false; @@ -263,7 +261,6 @@ struct whisper_load_model_inputs const char * model_filename = nullptr; const char * executable_path = nullptr; const int kcpp_main_gpu = -1; - const char * vulkan_info = nullptr; const char * devices_override = nullptr; const bool quiet = false; const int debugmode = 0; @@ -288,7 +285,6 @@ struct tts_load_model_inputs const char * cts_model_filename = nullptr; const char * executable_path = nullptr; const int kcpp_main_gpu = -1; - const char * vulkan_info = nullptr; const int gpulayers = 0; const bool flash_attention = false; const int ttsmaxlen = 4096; @@ -319,7 +315,6 @@ struct embeddings_load_model_inputs const char * model_filename = nullptr; const char * executable_path = nullptr; const int kcpp_main_gpu = -1; - const char * vulkan_info = nullptr; const int gpulayers = 0; const bool flash_attention = false; const bool use_mmap = false; @@ -349,7 +344,6 @@ struct music_load_model_inputs const bool lowvram = false; const char * executable_path = nullptr; const int kcpp_main_gpu = -1; - const char * vulkan_info = nullptr; const char * devices_override = nullptr; const bool quiet = false; const 
int debugmode = 0; diff --git a/koboldcpp.py b/koboldcpp.py index 342e98be1..17c8dc5e5 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -256,7 +256,6 @@ class load_model_inputs(ctypes.Structure): ("use_contextshift", ctypes.c_bool), ("use_fastforward", ctypes.c_bool), ("kcpp_main_gpu", ctypes.c_int), - ("vulkan_info", ctypes.c_char_p), ("batchsize", ctypes.c_int), ("autofit", ctypes.c_bool), ("autofit_tax_mb", ctypes.c_int), @@ -356,7 +355,6 @@ class sd_load_model_inputs(ctypes.Structure): _fields_ = [("model_filename", ctypes.c_char_p), ("executable_path", ctypes.c_char_p), ("kcpp_main_gpu", ctypes.c_int), - ("vulkan_info", ctypes.c_char_p), ("threads", ctypes.c_int), ("quant", ctypes.c_int), ("flash_attention", ctypes.c_bool), @@ -435,7 +433,6 @@ class whisper_load_model_inputs(ctypes.Structure): _fields_ = [("model_filename", ctypes.c_char_p), ("executable_path", ctypes.c_char_p), ("kcpp_main_gpu", ctypes.c_int), - ("vulkan_info", ctypes.c_char_p), ("devices_override", ctypes.c_char_p), ("quiet", ctypes.c_bool), ("debugmode", ctypes.c_int)] @@ -456,7 +453,6 @@ class tts_load_model_inputs(ctypes.Structure): ("cts_model_filename", ctypes.c_char_p), ("executable_path", ctypes.c_char_p), ("kcpp_main_gpu", ctypes.c_int), - ("vulkan_info", ctypes.c_char_p), ("gpulayers", ctypes.c_int), ("flash_attention", ctypes.c_bool), ("ttsmaxlen", ctypes.c_int), @@ -483,7 +479,6 @@ class embeddings_load_model_inputs(ctypes.Structure): ("model_filename", ctypes.c_char_p), ("executable_path", ctypes.c_char_p), ("kcpp_main_gpu", ctypes.c_int), - ("vulkan_info", ctypes.c_char_p), ("gpulayers", ctypes.c_int), ("flash_attention", ctypes.c_bool), ("use_mmap", ctypes.c_bool), @@ -509,7 +504,6 @@ class music_load_model_inputs(ctypes.Structure): ("lowvram", ctypes.c_bool), ("executable_path", ctypes.c_char_p), ("kcpp_main_gpu", ctypes.c_int), - ("vulkan_info", ctypes.c_char_p), ("devices_override", ctypes.c_char_p), ("quiet", ctypes.c_bool), ("debugmode", ctypes.c_int)] @@ -993,13 +987,9 @@ 
def set_backend_props(inputs): elif (args.usecuda and "3" in args.usecuda): inputs.kcpp_main_gpu = 3 - if args.usevulkan: #is an empty array if using vulkan without defined gpu - s = "" - for it in range(0,len(args.usevulkan)): - s += str(args.usevulkan[it]) - inputs.vulkan_info = s.encode("UTF-8") - else: - inputs.vulkan_info = "".encode("UTF-8") + if "GGML_VK_VISIBLE_DEVICES" not in os.environ: + if args.usevulkan: # is an empty array if using vulkan without defined gpu + os.environ["GGML_VK_VISIBLE_DEVICES"] = ','.join([str(g) for g in args.usevulkan]) # set universal flags inputs.devices_override = (args.device if args.device else "").encode("UTF-8") @@ -1890,7 +1880,6 @@ def load_model(model_filename): inputs.low_vram = True if args.lowvram else False inputs.use_mmq = False if args.nommq else True inputs.splitmode = splitmode_choices_to_int(args.splitmode) #layer=1, row=2, tensor=3 - inputs.vulkan_info = "0".encode("UTF-8") inputs.blasthreads = args.blasthreads inputs.use_mmap = args.usemmap inputs.use_mlock = args.usemlock diff --git a/otherarch/acestep/music_adapter.cpp b/otherarch/acestep/music_adapter.cpp index 1547d3677..9b6f97a09 100644 --- a/otherarch/acestep/music_adapter.cpp +++ b/otherarch/acestep/music_adapter.cpp @@ -33,22 +33,6 @@ bool musictype_load_model(const music_load_model_inputs inputs) { music_is_quiet = inputs.quiet; - //duplicated from expose.cpp - std::string vulkan_info_raw = inputs.vulkan_info; - std::string vulkan_info_str = ""; - for (size_t i = 0; i < vulkan_info_raw.length(); ++i) { - vulkan_info_str += vulkan_info_raw[i]; - if (i < vulkan_info_raw.length() - 1) { - vulkan_info_str += ","; - } - } - const char* existingenv = getenv("GGML_VK_VISIBLE_DEVICES"); - if(!existingenv && vulkan_info_str!="") - { - musicvulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str; - putenv((char*)musicvulkandeviceenv.c_str()); - } - std::string musicllm_filename = inputs.musicllm_filename; std::string musicembedding_filename = 
inputs.musicembedding_filename; std::string musicdiffusion_filename = inputs.musicdiffusion_filename; diff --git a/otherarch/embeddings_adapter.cpp b/otherarch/embeddings_adapter.cpp index f04703d0a..0613256e2 100644 --- a/otherarch/embeddings_adapter.cpp +++ b/otherarch/embeddings_adapter.cpp @@ -22,7 +22,6 @@ #endif static llama_context * embeddings_ctx = nullptr; //text to codes ctx -static std::string ttsvulkandeviceenv; bool embeddings_debug = false; static int max_batchsize = 512; static std::string last_output = ""; @@ -82,27 +81,12 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu bool embeddingstype_load_model(const embeddings_load_model_inputs inputs) { - //duplicated from expose.cpp - std::string vulkan_info_raw = inputs.vulkan_info; - std::string vulkan_info_str = ""; - for (size_t i = 0; i < vulkan_info_raw.length(); ++i) { - vulkan_info_str += vulkan_info_raw[i]; - if (i < vulkan_info_raw.length() - 1) { - vulkan_info_str += ","; - } - } - const char* existingenv = getenv("GGML_VK_VISIBLE_DEVICES"); std::vector devices_override; std::string dev_override_str = inputs.devices_override; if(dev_override_str!="") { devices_override = kcpp_parse_device_list(dev_override_str); } - if(!existingenv && vulkan_info_str!="") - { - ttsvulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str; - putenv((char*)ttsvulkandeviceenv.c_str()); - } llama_backend_init(); diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp index e583cd6bd..41aaa82f2 100644 --- a/otherarch/sdcpp/sdtype_adapter.cpp +++ b/otherarch/sdcpp/sdtype_adapter.cpp @@ -187,7 +187,6 @@ static uint8_t * upscale_src_buffer = NULL; static std::vector input_extraimage_buffers; const int max_extra_images = 4; -static std::string sdvulkandeviceenv; static std::string sdmaingpuenv; static int cfg_tiled_vae_threshold = 0; static int cfg_square_limit = 0; @@ -430,22 +429,6 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { 
printf("Note: Loading a pre-quantized model is always faster than using compress weights!\n"); } - //duplicated from expose.cpp - std::string vulkan_info_raw = inputs.vulkan_info; - std::string vulkan_info_str = ""; - for (size_t i = 0; i < vulkan_info_raw.length(); ++i) { - vulkan_info_str += vulkan_info_raw[i]; - if (i < vulkan_info_raw.length() - 1) { - vulkan_info_str += ","; - } - } - const char* existingenv = getenv("GGML_VK_VISIBLE_DEVICES"); - if(!existingenv && vulkan_info_str!="") - { - sdvulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str; - putenv((char*)sdvulkandeviceenv.c_str()); - } - sd_params = new SDParams(); sd_params->model_path = inputs.model_filename; sd_params->wtype = SD_TYPE_COUNT; diff --git a/otherarch/tts_adapter.cpp b/otherarch/tts_adapter.cpp index 21d5ef163..408c4da81 100644 --- a/otherarch/tts_adapter.cpp +++ b/otherarch/tts_adapter.cpp @@ -478,7 +478,6 @@ static llama_context * cts_ctx = nullptr; //codes to speech static TTS_VER ttsver = TTS_VER_2; static int ttsdebugmode = 0; static bool tts_is_quiet = false; -static std::string ttsvulkandeviceenv; static std::string last_generated_audio = ""; static std::string last_generation_settings_prompt = ""; //for caching purposes to fix ST bug static int last_generation_settings_speaker_seed; @@ -511,27 +510,12 @@ bool ttstype_load_model(const tts_load_model_inputs inputs) tts_is_quiet = inputs.quiet; tts_executable_path = inputs.executable_path; - //duplicated from expose.cpp - std::string vulkan_info_raw = inputs.vulkan_info; - std::string vulkan_info_str = ""; - for (size_t i = 0; i < vulkan_info_raw.length(); ++i) { - vulkan_info_str += vulkan_info_raw[i]; - if (i < vulkan_info_raw.length() - 1) { - vulkan_info_str += ","; - } - } - const char* existingenv = getenv("GGML_VK_VISIBLE_DEVICES"); std::vector devices_override; std::string dev_override_str = inputs.devices_override; if(dev_override_str!="") { devices_override = kcpp_parse_device_list(dev_override_str); } - 
if(!existingenv && vulkan_info_str!="") - { - ttsvulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str; - putenv((char*)ttsvulkandeviceenv.c_str()); - } llama_backend_init(); diff --git a/otherarch/whispercpp/whisper_adapter.cpp b/otherarch/whispercpp/whisper_adapter.cpp index ee26a8e5b..0d54a3f2f 100644 --- a/otherarch/whispercpp/whisper_adapter.cpp +++ b/otherarch/whispercpp/whisper_adapter.cpp @@ -59,28 +59,10 @@ static std::string output_txt(struct whisper_context * ctx) { void cb_log_disable(enum ggml_log_level , const char * , void * ) { } -static std::string whispervulkandeviceenv; bool whispertype_load_model(const whisper_load_model_inputs inputs) { whisper_is_quiet = inputs.quiet; - //duplicated from expose.cpp - std::string vulkan_info_raw = inputs.vulkan_info; - std::string vulkan_info_str = ""; - for (size_t i = 0; i < vulkan_info_raw.length(); ++i) { - vulkan_info_str += vulkan_info_raw[i]; - if (i < vulkan_info_raw.length() - 1) { - vulkan_info_str += ","; - } - } - const char* existingenv = getenv("GGML_VK_VISIBLE_DEVICES"); - if(!existingenv && vulkan_info_str!="") - { - whispervulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str; - putenv((char*)whispervulkandeviceenv.c_str()); - } - - std::string modelfile = inputs.model_filename; printf("\nLoading Whisper Model: %s",modelfile.c_str());