diff --git a/expose.h b/expose.h index 37ee8535e..af06f201c 100644 --- a/expose.h +++ b/expose.h @@ -149,6 +149,7 @@ struct sd_load_model_inputs const int threads = 0; const int quant = 0; const bool taesd = false; + const bool notile = false; /* request to disable VAE tiling: koboldcpp.py fills it from args.sdnotile, sdtype_load_model copies it into notiling, and the ctypes mirror in koboldcpp.py adds a matching c_bool field at the same position */ const char * t5xxl_filename = nullptr; const char * clipl_filename = nullptr; const char * clipg_filename = nullptr; diff --git a/koboldcpp.py b/koboldcpp.py index 566299fb7..d2af29ca9 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -234,6 +234,7 @@ class sd_load_model_inputs(ctypes.Structure): ("threads", ctypes.c_int), ("quant", ctypes.c_int), ("taesd", ctypes.c_bool), + ("notile", ctypes.c_bool), ("t5xxl_filename", ctypes.c_char_p), ("clipl_filename", ctypes.c_char_p), ("clipg_filename", ctypes.c_char_p), @@ -1121,6 +1122,7 @@ def sd_load_model(model_filename,vae_filename,lora_filename,t5xxl_filename,clipl inputs.threads = thds inputs.quant = quant inputs.taesd = True if args.sdvaeauto else False + inputs.notile = True if args.sdnotile else False inputs.vae_filename = vae_filename.encode("UTF-8") inputs.lora_filename = lora_filename.encode("UTF-8") inputs.lora_multiplier = args.sdloramult @@ -2980,6 +2982,7 @@ def show_gui(): sd_clipl_var = ctk.StringVar() sd_clipg_var = ctk.StringVar() sd_vaeauto_var = ctk.IntVar(value=0) + sd_notile_var = ctk.IntVar(value=0) sd_clamped_var = ctk.StringVar(value="0") sd_threads_var = ctk.StringVar(value=str(default_threads)) sd_quant_var = ctk.IntVar(value=0) @@ -3548,6 +3551,7 @@ def show_gui(): sdvaeitem2.grid() sdvaeitem3.grid() makecheckbox(images_tab, "Use TAE SD (AutoFix Broken VAE)", sd_vaeauto_var, 22,command=toggletaesd,tooltiptxt="Replace VAE with TAESD. 
May fix bad VAE.") + makecheckbox(images_tab, "No VAE Tiling", sd_notile_var, 24,tooltiptxt="Disables VAE tiling, may not work for large images.") # audio tab audio_tab = tabcontent["Audio"] @@ -3738,6 +3742,7 @@ def show_gui(): args.sdthreads = (0 if sd_threads_var.get()=="" else int(sd_threads_var.get())) args.sdclamped = (0 if int(sd_clamped_var.get())<=0 else int(sd_clamped_var.get())) + args.sdnotile = (True if sd_notile_var.get()==1 else False) if sd_vaeauto_var.get()==1: args.sdvaeauto = True args.sdvae = "" @@ -3919,6 +3924,7 @@ def show_gui(): sd_clipl_var.set(dict["sdclipl"] if ("sdclipl" in dict and dict["sdclipl"]) else "") sd_clipg_var.set(dict["sdclipg"] if ("sdclipg" in dict and dict["sdclipg"]) else "") sd_vaeauto_var.set(1 if ("sdvaeauto" in dict and dict["sdvaeauto"]) else 0) + sd_notile_var.set(1 if ("sdnotile" in dict and dict["sdnotile"]) else 0) sd_lora_var.set(dict["sdlora"] if ("sdlora" in dict and dict["sdlora"]) else "") sd_loramult_var.set(str(dict["sdloramult"]) if ("sdloramult" in dict and dict["sdloramult"]) else "1.0") @@ -5237,6 +5243,7 @@ if __name__ == '__main__': sdparsergrouplora.add_argument("--sdquant", help="If specified, loads the model quantized to save memory.", action='store_true') sdparsergrouplora.add_argument("--sdlora", metavar=('[filename]'), help="Specify a stable diffusion LORA safetensors model to be applied. 
Cannot be used with quant models.", default="") sdparsergroup.add_argument("--sdloramult", metavar=('[amount]'), help="Multiplier for the LORA model to be applied.", type=float, default=1.0) + sdparsergroup.add_argument("--sdnotile", help="Disables VAE tiling, may not work for large images.", action='store_true') whisperparsergroup = parser.add_argument_group('Whisper Transcription Commands') whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper bin model to enable Speech-To-Text transcription.", default="") diff --git a/otherarch/sdcpp/ggml_extend.hpp b/otherarch/sdcpp/ggml_extend.hpp index a306a46ef..1546d0447 100644 --- a/otherarch/sdcpp/ggml_extend.hpp +++ b/otherarch/sdcpp/ggml_extend.hpp @@ -52,6 +52,25 @@ #define __STATIC_INLINE__ static inline #endif +__STATIC_INLINE__ void* sd_aligned_malloc(size_t required_bytes, size_t alignment) /* Returns a zero-initialized block of at least required_bytes whose address is rounded down to a multiple of alignment, or NULL if calloc fails. The mask arithmetic assumes alignment is a power of two. The pointer returned by calloc is stored in the slot just before the returned address so that sd_aligned_free can release it; never pass the result to plain free. */ +{ + void* p1; // original block + void** p2; // aligned block + int offset = alignment - 1 + sizeof(void*); + if ((p1 = (void*)calloc(1, required_bytes + offset)) == NULL) + { + return NULL; + } + p2 = (void**)(((size_t)(p1) + offset) & ~(alignment - 1)); + p2[-1] = p1; + return p2; +} + +__STATIC_INLINE__ void sd_aligned_free(void *p) /* Releases a block obtained from sd_aligned_malloc by freeing the original calloc pointer stored just before p. p is dereferenced unconditionally, so callers must not pass NULL. */ +{ + free(((void**)p)[-1]); +} + __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const char* text, void* user_data) { (void)level; (void)user_data; @@ -507,15 +526,23 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const params.mem_size += tile_size * tile_size * input->ne[2] * sizeof(float); // input chunk params.mem_size += (tile_size * scale) * (tile_size * scale) * output->ne[2] * sizeof(float); // output chunk params.mem_size += 3 * ggml_tensor_overhead(); + params.mem_size += 512; //add 512 spare bytes; we allocate and manage this buffer ourselves + params.mem_size = GGML_PAD(params.mem_size, GGML_MEM_ALIGN); params.mem_buffer = NULL; params.no_alloc = false; LOG_DEBUG("tile work buffer size: %.2f 
MB", params.mem_size / 1024.f / 1024.f); + params.mem_buffer = sd_aligned_malloc(params.mem_size,64); /* work buffer aligned to 64 bytes, allocated here and handed to ggml_init through params; it is released with sd_aligned_free, when non-NULL, both on the ggml_init failure path and after ggml_free */ + // draft context struct ggml_context* tiles_ctx = ggml_init(params); if (!tiles_ctx) { LOG_ERROR("ggml_init() failed"); + if(params.mem_buffer!=NULL) + { + sd_aligned_free(params.mem_buffer); + } return; } @@ -554,6 +581,10 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const pretty_progress(num_tiles, num_tiles, last_time); } ggml_free(tiles_ctx); + if(params.mem_buffer!=NULL) + { + sd_aligned_free(params.mem_buffer); + } } __STATIC_INLINE__ struct ggml_tensor* ggml_group_norm_32(struct ggml_context* ctx, diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp index 8106f0cd7..5b7ab7605 100644 --- a/otherarch/sdcpp/sdtype_adapter.cpp +++ b/otherarch/sdcpp/sdtype_adapter.cpp @@ -135,6 +135,7 @@ std::string base64_encode(const unsigned char* data, unsigned int data_length) { } static std::string sdplatformenv, sddeviceenv, sdvulkandeviceenv; +static bool notiling = false; /* set from inputs.notile in sdtype_load_model; while true, sdtype_generate passes dotile=false to set_sd_vae_tiling regardless of image size */ bool sdtype_load_model(const sd_load_model_inputs inputs) { executable_path = inputs.executable_path; @@ -144,6 +145,7 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { std::string t5xxl_filename = inputs.t5xxl_filename; std::string clipl_filename = inputs.clipl_filename; std::string clipg_filename = inputs.clipg_filename; + notiling = inputs.notile; printf("\nImageGen Init - Load Model: %s\n",inputs.model_filename); if(lorafilename!="") { @@ -352,7 +354,7 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs) sd_params->width = newwidth; sd_params->height = newheight; } - bool dotile = (sd_params->width>768 || sd_params->height>768); + bool dotile = (sd_params->width>768 || sd_params->height>768) && !notiling; set_sd_vae_tiling(sd_ctx,dotile); //changes vae tiling, prevents memory related crash/oom //for img2img diff --git a/otherarch/sdcpp/stable-diffusion.cpp b/otherarch/sdcpp/stable-diffusion.cpp 
index 897c4f021..738e11c66 100644 --- a/otherarch/sdcpp/stable-diffusion.cpp +++ b/otherarch/sdcpp/stable-diffusion.cpp @@ -1084,7 +1084,8 @@ public: ggml_tensor_scale_output(result); } } else { - if (vae_tiling && decode) { // TODO: support tiling vae encode + //koboldcpp never uses tiling with taesd + if (false && vae_tiling && decode) { // TODO: support tiling vae encode // split latent in 64x64 tiles and compute in several steps auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { tae_first_stage->compute(n_threads, in, decode, &out);