diff --git a/expose.h b/expose.h index 9bcff6d91..60bebee4f 100644 --- a/expose.h +++ b/expose.h @@ -207,6 +207,7 @@ struct sd_load_model_inputs const char * upscaler_filename = nullptr; const int img_hard_limit = 0; const int img_soft_limit = 0; + const float max_vram = 0.f; /* copied to params.max_vram at load; only logged when non-zero */ const char * devices_override = nullptr; const bool quiet = false; const int debugmode = 0; diff --git a/koboldcpp.py b/koboldcpp.py index abb0f6142..000f125ab 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -389,6 +389,7 @@ class sd_load_model_inputs(ctypes.Structure): ("upscaler_filename", ctypes.c_char_p), ("img_hard_limit", ctypes.c_int), ("img_soft_limit", ctypes.c_int), + ("max_vram", ctypes.c_float), ("devices_override", ctypes.c_char_p), ("quiet", ctypes.c_bool), ("debugmode", ctypes.c_int)] diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp index e8a989a5d..8fdc8428b 100644 --- a/otherarch/sdcpp/sdtype_adapter.cpp +++ b/otherarch/sdcpp/sdtype_adapter.cpp @@ -397,6 +397,9 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { } else if (inputs.use_mmap) { printf("Using mmap for I/O\n"); } + if(inputs.max_vram != 0.f) { + printf("Using max VRAM = %0.2f\n", inputs.max_vram); + } if(inputs.quant > 0) { printf("Note: Loading a pre-quantized model is always faster than using compress weights!\n"); @@ -460,6 +463,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { params.diffusion_conv_direct = sd_params->diffusion_conv_direct; params.vae_conv_direct = sd_params->vae_conv_direct; params.chroma_use_dit_mask = sd_params->chroma_use_dit_mask; + params.max_vram = inputs.max_vram; params.enable_mmap = inputs.use_mmap; // the _cpu flags are only used if the backend string is empty, but // we always set both for consistency