diff --git a/expose.h b/expose.h
index 9bcff6d91..60bebee4f 100644
--- a/expose.h
+++ b/expose.h
@@ -207,6 +207,7 @@ struct sd_load_model_inputs
     const char * upscaler_filename = nullptr;
     const int img_hard_limit = 0;
     const int img_soft_limit = 0;
+    const float max_vram = 0.f; // VRAM limit handed to the SD loader; const like the sibling fields (units: TODO confirm)
     const char * devices_override = nullptr;
     const bool quiet = false;
     const int debugmode = 0;
diff --git a/koboldcpp.py b/koboldcpp.py
index abb0f6142..000f125ab 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -389,6 +389,7 @@ class sd_load_model_inputs(ctypes.Structure):
                 ("upscaler_filename", ctypes.c_char_p),
                 ("img_hard_limit", ctypes.c_int),
                 ("img_soft_limit", ctypes.c_int),
+                ("max_vram", ctypes.c_float),  # position and type must mirror `float max_vram` in expose.h
                 ("devices_override", ctypes.c_char_p),
                 ("quiet", ctypes.c_bool),
                 ("debugmode", ctypes.c_int)]
diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp
index e8a989a5d..8fdc8428b 100644
--- a/otherarch/sdcpp/sdtype_adapter.cpp
+++ b/otherarch/sdcpp/sdtype_adapter.cpp
@@ -397,6 +397,9 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
     } else if (inputs.use_mmap) {
         printf("Using mmap for I/O\n");
     }
+    if(inputs.max_vram != 0.f) { // log only when a non-zero value was supplied
+        printf("Using max VRAM = %0.2f\n", inputs.max_vram);
+    }
     if(inputs.quant > 0)
     {
         printf("Note: Loading a pre-quantized model is always faster than using compress weights!\n");
@@ -460,6 +463,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
     params.diffusion_conv_direct = sd_params->diffusion_conv_direct;
     params.vae_conv_direct = sd_params->vae_conv_direct;
     params.chroma_use_dit_mask = sd_params->chroma_use_dit_mask;
+    params.max_vram = inputs.max_vram; // forwarded unchanged, including the 0 default
     params.enable_mmap = inputs.use_mmap;
     // the _cpu flags are only used if the backend string is empty, but
     // we always set both for consistency