refactor: handle GGML_VK_VISIBLE_DEVICES at the Python level (#2179)

All the C++ handling code currently does is:
- build a comma-separated device list from the vulkan_info string
- if GGML_VK_VISIBLE_DEVICES isn't set
  - set GGML_VK_VISIBLE_DEVICES to that list

Once set, GGML_VK_VISIBLE_DEVICES affects the whole process, so the same
thing can be done at the Python level, before any of the loading
functions run.
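
As a rough sketch, the Python-side handling amounts to the following (the
helper name is illustrative; the actual change lives in set_backend_props,
shown in the diff below):

```python
import os

def apply_vulkan_visible_devices(usevulkan):
    # Never override a value the user already exported; once set, the
    # variable applies to every backend loaded later in this process.
    if "GGML_VK_VISIBLE_DEVICES" not in os.environ:
        # usevulkan is an empty list when Vulkan is requested without an
        # explicit GPU selection, in which case nothing is set.
        if usevulkan:
            os.environ["GGML_VK_VISIBLE_DEVICES"] = ",".join(str(g) for g in usevulkan)
```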

Caveat: load_model used to set `inputs.vulkan_info = "0"` by default,
so the default GPU would be "0" only when loading a text model.
Wagner Bruna 2026-05-02 12:10:29 -03:00 committed by GitHub
parent 42ce63fd3b
commit 25fab4113e
8 changed files with 3 additions and 120 deletions

View file

@@ -23,8 +23,6 @@
extern "C"
{
std::string vulkandeviceenv;
//return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
static FileFormat file_format = FileFormat::BADFORMAT;
static FileFormatExtraMeta file_format_meta;
@@ -38,21 +36,6 @@ extern "C"
file_format = check_file_format(model.c_str(),&file_format_meta);
std::string vulkan_info_raw = inputs.vulkan_info;
std::string vulkan_info_str = "";
for (size_t i = 0; i < vulkan_info_raw.length(); ++i) {
vulkan_info_str += vulkan_info_raw[i];
if (i < vulkan_info_raw.length() - 1) {
vulkan_info_str += ",";
}
}
const char* existingenv = getenv("GGML_VK_VISIBLE_DEVICES");
if(!existingenv && vulkan_info_str!="")
{
vulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str;
putenv((char*)vulkandeviceenv.c_str());
}
executable_path = inputs.executable_path;
if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4 || file_format==FileFormat::GPTJ_5)

View file

@@ -55,7 +55,6 @@ struct load_model_inputs
const bool use_contextshift = false;
const bool use_fastforward = false;
const int kcpp_main_gpu = -1;
const char * vulkan_info = nullptr;
const int batchsize = 512;
const bool autofit = false;
const int autofit_tax_mb = 0;
@@ -178,7 +177,6 @@ struct sd_load_model_inputs
const char * model_filename = nullptr;
const char * executable_path = nullptr;
const int kcpp_main_gpu = -1;
const char * vulkan_info = nullptr;
const int threads = 0;
const int quant = 0;
const bool flash_attention = false;
@@ -263,7 +261,6 @@ struct whisper_load_model_inputs
const char * model_filename = nullptr;
const char * executable_path = nullptr;
const int kcpp_main_gpu = -1;
const char * vulkan_info = nullptr;
const char * devices_override = nullptr;
const bool quiet = false;
const int debugmode = 0;
@@ -288,7 +285,6 @@ struct tts_load_model_inputs
const char * cts_model_filename = nullptr;
const char * executable_path = nullptr;
const int kcpp_main_gpu = -1;
const char * vulkan_info = nullptr;
const int gpulayers = 0;
const bool flash_attention = false;
const int ttsmaxlen = 4096;
@@ -319,7 +315,6 @@ struct embeddings_load_model_inputs
const char * model_filename = nullptr;
const char * executable_path = nullptr;
const int kcpp_main_gpu = -1;
const char * vulkan_info = nullptr;
const int gpulayers = 0;
const bool flash_attention = false;
const bool use_mmap = false;
@@ -349,7 +344,6 @@ struct music_load_model_inputs
const bool lowvram = false;
const char * executable_path = nullptr;
const int kcpp_main_gpu = -1;
const char * vulkan_info = nullptr;
const char * devices_override = nullptr;
const bool quiet = false;
const int debugmode = 0;

View file

@@ -256,7 +256,6 @@ class load_model_inputs(ctypes.Structure):
("use_contextshift", ctypes.c_bool),
("use_fastforward", ctypes.c_bool),
("kcpp_main_gpu", ctypes.c_int),
("vulkan_info", ctypes.c_char_p),
("batchsize", ctypes.c_int),
("autofit", ctypes.c_bool),
("autofit_tax_mb", ctypes.c_int),
@@ -356,7 +355,6 @@ class sd_load_model_inputs(ctypes.Structure):
_fields_ = [("model_filename", ctypes.c_char_p),
("executable_path", ctypes.c_char_p),
("kcpp_main_gpu", ctypes.c_int),
("vulkan_info", ctypes.c_char_p),
("threads", ctypes.c_int),
("quant", ctypes.c_int),
("flash_attention", ctypes.c_bool),
@@ -435,7 +433,6 @@ class whisper_load_model_inputs(ctypes.Structure):
_fields_ = [("model_filename", ctypes.c_char_p),
("executable_path", ctypes.c_char_p),
("kcpp_main_gpu", ctypes.c_int),
("vulkan_info", ctypes.c_char_p),
("devices_override", ctypes.c_char_p),
("quiet", ctypes.c_bool),
("debugmode", ctypes.c_int)]
@@ -456,7 +453,6 @@ class tts_load_model_inputs(ctypes.Structure):
("cts_model_filename", ctypes.c_char_p),
("executable_path", ctypes.c_char_p),
("kcpp_main_gpu", ctypes.c_int),
("vulkan_info", ctypes.c_char_p),
("gpulayers", ctypes.c_int),
("flash_attention", ctypes.c_bool),
("ttsmaxlen", ctypes.c_int),
@@ -483,7 +479,6 @@ class embeddings_load_model_inputs(ctypes.Structure):
("model_filename", ctypes.c_char_p),
("executable_path", ctypes.c_char_p),
("kcpp_main_gpu", ctypes.c_int),
("vulkan_info", ctypes.c_char_p),
("gpulayers", ctypes.c_int),
("flash_attention", ctypes.c_bool),
("use_mmap", ctypes.c_bool),
@@ -509,7 +504,6 @@ class music_load_model_inputs(ctypes.Structure):
("lowvram", ctypes.c_bool),
("executable_path", ctypes.c_char_p),
("kcpp_main_gpu", ctypes.c_int),
("vulkan_info", ctypes.c_char_p),
("devices_override", ctypes.c_char_p),
("quiet", ctypes.c_bool),
("debugmode", ctypes.c_int)]
@@ -993,13 +987,9 @@ def set_backend_props(inputs):
elif (args.usecuda and "3" in args.usecuda):
inputs.kcpp_main_gpu = 3
if args.usevulkan: #is an empty array if using vulkan without defined gpu
s = ""
for it in range(0,len(args.usevulkan)):
s += str(args.usevulkan[it])
inputs.vulkan_info = s.encode("UTF-8")
else:
inputs.vulkan_info = "".encode("UTF-8")
if "GGML_VK_VISIBLE_DEVICES" not in os.environ:
if args.usevulkan: # is an empty array if using vulkan without defined gpu
os.environ["GGML_VK_VISIBLE_DEVICES"] = ','.join([str(g) for g in args.usevulkan])
# set universal flags
inputs.devices_override = (args.device if args.device else "").encode("UTF-8")
@@ -1890,7 +1880,6 @@ def load_model(model_filename):
inputs.low_vram = True if args.lowvram else False
inputs.use_mmq = False if args.nommq else True
inputs.splitmode = splitmode_choices_to_int(args.splitmode) #layer=1, row=2, tensor=3
inputs.vulkan_info = "0".encode("UTF-8")
inputs.blasthreads = args.blasthreads
inputs.use_mmap = args.usemmap
inputs.use_mlock = args.usemlock

View file

@@ -33,22 +33,6 @@ bool musictype_load_model(const music_load_model_inputs inputs)
{
music_is_quiet = inputs.quiet;
//duplicated from expose.cpp
std::string vulkan_info_raw = inputs.vulkan_info;
std::string vulkan_info_str = "";
for (size_t i = 0; i < vulkan_info_raw.length(); ++i) {
vulkan_info_str += vulkan_info_raw[i];
if (i < vulkan_info_raw.length() - 1) {
vulkan_info_str += ",";
}
}
const char* existingenv = getenv("GGML_VK_VISIBLE_DEVICES");
if(!existingenv && vulkan_info_str!="")
{
musicvulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str;
putenv((char*)musicvulkandeviceenv.c_str());
}
std::string musicllm_filename = inputs.musicllm_filename;
std::string musicembedding_filename = inputs.musicembedding_filename;
std::string musicdiffusion_filename = inputs.musicdiffusion_filename;

View file

@@ -22,7 +22,6 @@
#endif
static llama_context * embeddings_ctx = nullptr; //text to codes ctx
static std::string ttsvulkandeviceenv;
bool embeddings_debug = false;
static int max_batchsize = 512;
static std::string last_output = "";
@@ -82,27 +81,12 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
bool embeddingstype_load_model(const embeddings_load_model_inputs inputs)
{
//duplicated from expose.cpp
std::string vulkan_info_raw = inputs.vulkan_info;
std::string vulkan_info_str = "";
for (size_t i = 0; i < vulkan_info_raw.length(); ++i) {
vulkan_info_str += vulkan_info_raw[i];
if (i < vulkan_info_raw.length() - 1) {
vulkan_info_str += ",";
}
}
const char* existingenv = getenv("GGML_VK_VISIBLE_DEVICES");
std::vector<ggml_backend_dev_t> devices_override;
std::string dev_override_str = inputs.devices_override;
if(dev_override_str!="")
{
devices_override = kcpp_parse_device_list(dev_override_str);
}
if(!existingenv && vulkan_info_str!="")
{
ttsvulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str;
putenv((char*)ttsvulkandeviceenv.c_str());
}
llama_backend_init();

View file

@@ -187,7 +187,6 @@ static uint8_t * upscale_src_buffer = NULL;
static std::vector<uint8_t *> input_extraimage_buffers;
const int max_extra_images = 4;
static std::string sdvulkandeviceenv;
static std::string sdmaingpuenv;
static int cfg_tiled_vae_threshold = 0;
static int cfg_square_limit = 0;
@@ -430,22 +429,6 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
printf("Note: Loading a pre-quantized model is always faster than using compress weights!\n");
}
//duplicated from expose.cpp
std::string vulkan_info_raw = inputs.vulkan_info;
std::string vulkan_info_str = "";
for (size_t i = 0; i < vulkan_info_raw.length(); ++i) {
vulkan_info_str += vulkan_info_raw[i];
if (i < vulkan_info_raw.length() - 1) {
vulkan_info_str += ",";
}
}
const char* existingenv = getenv("GGML_VK_VISIBLE_DEVICES");
if(!existingenv && vulkan_info_str!="")
{
sdvulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str;
putenv((char*)sdvulkandeviceenv.c_str());
}
sd_params = new SDParams();
sd_params->model_path = inputs.model_filename;
sd_params->wtype = SD_TYPE_COUNT;

View file

@@ -478,7 +478,6 @@ static llama_context * cts_ctx = nullptr; //codes to speech
static TTS_VER ttsver = TTS_VER_2;
static int ttsdebugmode = 0;
static bool tts_is_quiet = false;
static std::string ttsvulkandeviceenv;
static std::string last_generated_audio = "";
static std::string last_generation_settings_prompt = ""; //for caching purposes to fix ST bug
static int last_generation_settings_speaker_seed;
@@ -511,27 +510,12 @@ bool ttstype_load_model(const tts_load_model_inputs inputs)
tts_is_quiet = inputs.quiet;
tts_executable_path = inputs.executable_path;
//duplicated from expose.cpp
std::string vulkan_info_raw = inputs.vulkan_info;
std::string vulkan_info_str = "";
for (size_t i = 0; i < vulkan_info_raw.length(); ++i) {
vulkan_info_str += vulkan_info_raw[i];
if (i < vulkan_info_raw.length() - 1) {
vulkan_info_str += ",";
}
}
const char* existingenv = getenv("GGML_VK_VISIBLE_DEVICES");
std::vector<ggml_backend_dev_t> devices_override;
std::string dev_override_str = inputs.devices_override;
if(dev_override_str!="")
{
devices_override = kcpp_parse_device_list(dev_override_str);
}
if(!existingenv && vulkan_info_str!="")
{
ttsvulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str;
putenv((char*)ttsvulkandeviceenv.c_str());
}
llama_backend_init();

View file

@@ -59,28 +59,10 @@ static std::string output_txt(struct whisper_context * ctx) {
void cb_log_disable(enum ggml_log_level , const char * , void * ) { }
static std::string whispervulkandeviceenv;
bool whispertype_load_model(const whisper_load_model_inputs inputs)
{
whisper_is_quiet = inputs.quiet;
//duplicated from expose.cpp
std::string vulkan_info_raw = inputs.vulkan_info;
std::string vulkan_info_str = "";
for (size_t i = 0; i < vulkan_info_raw.length(); ++i) {
vulkan_info_str += vulkan_info_raw[i];
if (i < vulkan_info_raw.length() - 1) {
vulkan_info_str += ",";
}
}
const char* existingenv = getenv("GGML_VK_VISIBLE_DEVICES");
if(!existingenv && vulkan_info_str!="")
{
whispervulkandeviceenv = "GGML_VK_VISIBLE_DEVICES="+vulkan_info_str;
putenv((char*)whispervulkandeviceenv.c_str());
}
std::string modelfile = inputs.model_filename;
printf("\nLoading Whisper Model: %s",modelfile.c_str());