sd: add backend support for max_vram (#2221)

2026-05-22 11:16:08 +00:00 · 2026-05-21 00:51:00 -03:00 · 2026-05-21 00:51:00 -03:00 · f85a747dc0
commit f85a747dc0
parent 095bf63b58
3 changed files with 6 additions and 0 deletions
--- a/expose.h
+++ b/expose.h
@@ -207,6 +207,7 @@ struct sd_load_model_inputs
    const char * upscaler_filename = nullptr;
    const int img_hard_limit = 0;
    const int img_soft_limit = 0;
+    float max_vram = 0.f;
    const char * devices_override = nullptr;
    const bool quiet = false;
    const int debugmode = 0;
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -389,6 +389,7 @@ class sd_load_model_inputs(ctypes.Structure):
                ("upscaler_filename", ctypes.c_char_p),
                ("img_hard_limit", ctypes.c_int),
                ("img_soft_limit", ctypes.c_int),
+                ("max_vram", ctypes.c_float),
                ("devices_override", ctypes.c_char_p),
                ("quiet", ctypes.c_bool),
                ("debugmode", ctypes.c_int)]
--- a/otherarch/sdcpp/sdtype_adapter.cpp
+++ b/otherarch/sdcpp/sdtype_adapter.cpp
@@ -397,6 +397,9 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
    } else if (inputs.use_mmap) {
        printf("Using mmap for I/O\n");
    }
+    if(inputs.max_vram != 0.f) {
+        printf("Using max VRAM = %0.2f\n", inputs.max_vram);
+    }
    if(inputs.quant > 0)
    {
        printf("Note: Loading a pre-quantized model is always faster than using compress weights!\n");
@@ -460,6 +463,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
    params.diffusion_conv_direct = sd_params->diffusion_conv_direct;
    params.vae_conv_direct = sd_params->vae_conv_direct;
    params.chroma_use_dit_mask = sd_params->chroma_use_dit_mask;
+    params.max_vram = inputs.max_vram;
    params.enable_mmap = inputs.use_mmap;
    // the _cpu flags are only used if the backend string is empty, but
    // we always set both for consistency